commit 5432fc846a60bf6b2567cd5d1f886de052e34940 (patch)
Author:    Yann Herklotz <git@yannherklotz.com>  2022-04-18 16:23:37 +0100
Committer: Yann Herklotz <git@yannherklotz.com>  2022-04-18 16:23:37 +0100
Tree:      a61db8522fae4e60ece814619c7e15c629bb5b06
Parent:    dc7d1d5802348356a1c493330f5e290b7bdf7673

    More work on all chapters
 chapters/background.tex   | 108
 chapters/hls.tex          | 755
 chapters/introduction.tex | 112
 chapters/schedule.tex     |   2
 chapters/scheduling.tex   |  24
 figures/timing-1.tex      |  27
 figures/timing-2.tex      |  24
 lsr_env.tex               | 127
 main.tex                  |   2
 9 files changed, 666 insertions(+), 515 deletions(-)
diff --git a/chapters/background.tex b/chapters/background.tex
index 6ca5563..7208050 100644
--- a/chapters/background.tex
+++ b/chapters/background.tex
@@ -5,41 +5,19 @@
\chapter[sec:background]{Background}
-%\begin{table}
-% \centering
-% \begin{tabular}{lccccc}\toprule
-% \textbf{Tool} & \textbf{Implementation} & \textbf{Mainstream} & \textbf{Proof} & \textbf{Mechanised} & \textbf{HLS}\\
-% \midrule
-% Catapult C~\cite{mentor20_catap_high_level_synth} & \cmark{} & \cmark{} & \cmark{} & \xmark{} & \cmark{}\\
-% Chapman \textit{et al.}~\cite{chapman92_verif_bedroc} & \cmark{} & \xmark{} & \cmark{} & \cmark{} & \cmark{}\\
-% Clarke \textit{et al.}~\cite{clarke03_behav_c_veril} & \cmark{} & \cmark{} & \cmark{} & \xmark{} & \cmark{}\\
-% Ellis~\cite{ellis08_correc} & \xmark{} & \xmark{} & \cmark{} & \cmark{} & \cmark{}\\
-% Karfa \textit{et al.}~\cite{karfa06_formal_verif_method_sched_high_synth} & \cmark{} & \xmark{} & \cmark{} & \xmark{} & \cmark{}\\
-% Kundu \textit{et al.}~\cite{kundu08_valid_high_level_synth} & \cmark{} & \xmark{} & \cmark{} & \xmark{} & \cmark\\
-% LegUp~\cite{canis13_legup} & \cmark{} & \cmark{} & \xmark{} & \xmark{} & \cmark{}\\
-% Lööw \textit{et al.}~\cite{loow19_verif_compil_verif_proces} & \cmark{} & \xmark{} & \cmark{} & \cmark{} & \xmark{}\\
-% OpenCL SDK~\cite{intel20_sdk_openc_applic} & \cmark{} & \xmark{} & \xmark{} & \xmark{} & \cmark{}\\
-% Perna \textit{et al.}~\cite{perna12_mechan_wire_wise_verif_handel_c_synth} & \cmark{} & \xmark{} & \cmark{} & \cmark{} & \xmark{}\\
-% Vivado HLS~\cite{xilinx20_vivad_high_synth} & \cmark{} & \cmark{} & \xmark{} & \xmark{} & \cmark{}\\
-% \bottomrule
-% \end{tabular}
-% \caption{Table showing the various related works and comparing them.}\label{tab:existing_tools}
-%\end{table}
-
-This section describes the relevant literature around high-level synthesis, verified high-level
-synthesis, and finally mechanised proofs of HLS. Table~\ref{tab:existing_tools} classifies
-existing tools in the are of formal verification and high-level synthesis. The next section covers
-the main different categories that the tools in Table~\ref{tab:existing_tools} show, which include
-tools that cover \emph{implementation}, \emph{proof} and finally tools that are \emph{mechanically
- verified}.
+\startsynopsis
+  This chapter describes the state-of-the-art optimisations implemented in high-level synthesis
+  tools, focusing in particular on static scheduling and loop pipelining, and finally describing
+  dynamic scheduling, which is becoming a popular alternative.
+\stopsynopsis
\section{Implementation: High-level Synthesis}
-High-level synthesis (HLS) is the transformation of software directly into hardware. There are many
-different types of HLS, which can vary greatly with what languages they accept, however, they often
-share similar steps in how the translation is performed, as they all go from a higher-level,
-behavioural description of the algorithm to a timed hardware description. The main steps performed
-in the translation are the following~\cite{coussy09_introd_to_high_level_synth,canis13_legup}:
+\HLS\ is the transformation of software directly into hardware. There are many different types of
+\HLS, which can vary greatly in which languages they accept; however, they often share similar
+steps in how the translation is performed, as they all go from a higher-level, behavioural
+description of the algorithm to a timed hardware description. The main steps performed in the
+translation are the following~\cite{coussy09_introd_to_high_level_synth,canis13_legup}:
\desc{Compilation of specification} First, the input specification has to be compiled into a
representation that can be used to apply all the necessary optimisations. This can be an
@@ -58,7 +36,7 @@ the input specification language is normally an untimed behavioural representati
the spatial parallelism of the hardware can be taken advantage of, meaning operations that are not
dependent on each other can run in parallel. There are various possible scheduling methods that
could be used, such as static or dynamic scheduling, which are described further in
-sections~\ref{sec:static_scheduling} and~\ref{sec:dynamic_scheduling}.
+\in{Section}[sec:static_scheduling] and in \in{Section}[sec:dynamic_scheduling].
\desc{Operation and variable binding} After the operations have been scheduled, the operations and
the variables need to be bound to hardware units and registers respectively. It is often not that
@@ -79,6 +57,39 @@ tools take in languages that were designed for hardware such as
Handel-C~\cite{aubury96_handel_c_languag_refer_guide}, where time is encoded as part of the
semantics.
+\subsection[sec:language-blocks]{Intermediate Language}
+
+This section describes some possible characteristics of a language that could be used as an input
+to an \HLS\ tool. These languages normally require some structure to allow for easier optimisation
+and analysis passes. In particular, it is often useful to group contiguous instructions that do not
+contain any control-flow into a single list. These instructions can then safely be rearranged using
+only local information about the block itself, and in particular the control-flow can be removed
+entirely, as only the data-flow is important within the block.
+
+\subsubsection{Basic blocks}
+
+%TODO: Finish the basic blocks section
+
+Basic blocks are the simplest form of structure, as they consist only of lists of instructions that
+do not include any control-flow.
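+
+As a minimal illustrative sketch (the function and its values are hypothetical, not taken from any
+tool), the body of the following C function forms a single basic block, since it contains no
+branches or labels:
+
+\starthlC
+/* A straight-line sequence of instructions with a single entry at the
+   top and a single exit at the return: one basic block. */
+int basic_block_example(int a, int b) {
+  int t1 = a + b;
+  int t2 = t1 * 2;
+  int t3 = t2 - a;
+  return t3;
+}
+\stophlC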
+
+\subsubsection{Superblocks}
+
+Superblocks extend the notion of basic blocks to contiguous regions without any incoming
+control-flow; however, there can be multiple exits out of the block. The main benefit of this
+definition is that the extra flexibility of allowing multiple exits means the blocks can be made
+larger, which often improves the optimisations that operate on basic blocks. The downside is that
+the representation of the blocks becomes more complex due to the introduction of the extra
+control-flow: any analysis pass has to take into account the possibility of control-flow being
+present, and many simplifications are no longer possible.
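+
+The following C sketch (hypothetical and purely illustrative) shows a single-entry region with
+several exits; a superblock could cover all of these instructions even though control may leave the
+block at either of the early returns:
+
+\starthlC
+/* Single entry at the top, multiple exits: the early returns are side
+   exits out of the block, but no control-flow enters mid-block. */
+int superblock_example(int a, int b) {
+  int t1 = a + b;
+  if (t1 < 0) return 0;    /* side exit 1 */
+  int t2 = t1 * b;
+  if (t2 > 100) return 1;  /* side exit 2 */
+  return t2;               /* final exit   */
+}
+\stophlC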
+
+\subsubsection{Hyperblocks}
+
+Hyperblocks are another extension of basic blocks, similar to superblocks, but instead of
+introducing special control-flow instructions into the block, every instruction is predicated.
+This leads to possibly more complex control-flow than in both of the previous cases; however, it
+can be reasoned about using a \SAT\ or \SMT\ solver.
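+
+The following C sketch (hypothetical; a real predicated intermediate representation would encode
+the predicates explicitly rather than relying on C operators) illustrates if-conversion, where a
+branch is replaced by a predicate so that the region contains no internal control-flow:
+
+\starthlC
+/* Branching version: two basic blocks joined by a conditional branch. */
+int with_branch(int a, int b) {
+  int t;
+  if (a < b) t = a + 1;
+  else       t = b - 1;
+  return t;
+}
+
+/* Predicated version: the predicate p selects the result, so a
+   hyperblock can hold both operations, each guarded by p or !p. */
+int predicated(int a, int b) {
+  int p = (a < b);
+  int t = p ? (a + 1) : (b - 1);
+  return t;
+}
+\stophlC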
+
\subsection[sec:static_scheduling]{Static Scheduling}
Static scheduling is used by the majority of synthesis
@@ -102,9 +113,8 @@ met. This is done until all the resources for the current clock cycle are used
Static scheduling can normally produce extremely small circuits, however, it is often not possible
to guarantee that the circuits will have the best
throughput~\cite{cheng20_combin_dynam_static_sched_high_level_synth}, as this requires extensive
-control-flow analysis and complex optimisations. Especially for loops, finding the optimal
-initiation interval (II) can be tricky if there are loads and stores in the loop or if the loops are
-nested.
+control-flow analysis and complex optimisations. Especially for loops, finding the optimal \II\ can
+be tricky if there are loads and stores in the loop or if the loops are nested.
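+
+As an illustrative sketch (the loop is hypothetical and not taken from any benchmark), the
+following C loop has a loop-carried dependency through memory, since the store in one iteration
+feeds the load in the next; this chain limits how often a new iteration can be started and
+therefore bounds the achievable \II{}:
+
+\starthlC
+/* x[i] is computed from x[i-1], so iteration i+1 cannot issue its load
+   until the store of iteration i has completed. */
+void prefix_increment(int *x, int n) {
+  for (int i = 1; i < n; i++)
+    x[i] = x[i - 1] + 1;
+}
+\stophlC
+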
\subsection[sec:dynamic_scheduling]{Dynamic Scheduling}
@@ -225,9 +235,31 @@ inlining~\cite{tristan08_formal_verif_trans_valid}. After RTL, each intermediat
to get closer to the assembly language of the architecture, performing operations such as register
allocation and making sure that the stack variables are correctly aligned.
-\subsection{Isabelle HLS}
+\subsubsection{CompCertSSA}
+
+CompCertSSA is an extension of CompCert with an additional \SSA\ intermediate language. This
+language enforces \SSA\ properties and therefore allows for many convenient proofs about
+optimisations performed on this intermediate language, as many assumptions can be made about
+variables when they are encountered. The main interesting property that arises from using \SSA\ is
+the \emph{equational lemma}, which states that if a variable was assigned by an operation that does
+not depend on memory, then loading the value of that variable is the same as recomputing it from
+the operation's arguments in the current context.
+
+Given a well-formed SSA program $p$, a reachable state $\Sigma\ s\ f\ \sigma\ R\ M$ and a
+memory-independent operation defined at a node $d$ as $\mono{Iop}\ \mathit{op}\ \vec{a}\ x\ n$,
+and assuming that $\sigma$ is dominated by $d$ ($p \le_{d} d$), the following equation holds:
+
+\placeformula[eq:equational-lemma]\startformula
+ \left(\mathit{op}, \vec{a}\right) \Downarrow (R, M) = \left\lfloor R[x] \right\rfloor
+\stopformula
+
+This is an important lemma, as it essentially allows one to know the value of a register as long as
+one knows that its assignment dominates the current node and one knows which expression it was
+assigned.
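+
+As a small illustrative sketch (written in C rather than CompCertSSA's actual syntax), each
+variable below is assigned exactly once, so the equational lemma applies to the definition of
+\mono{x3}:
+
+\starthlC
+/* x3 is defined by a memory-independent operation, so at any point
+   dominated by its definition, reading x3 is the same as recomputing
+   x1 + x2 in the current register state. */
+int ssa_example(int x1, int x2) {
+  int x3 = x1 + x2;
+  int x4 = x3 * 2;   /* here R[x3] = x1 + x2 by the equational lemma */
+  return x4;
+}
+\stophlC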
+
+\subsection{HLS Formalised in Isabelle}
-Martin Ellis' work on correct synthesis~\cite{ellis08} is the first example of a mechanically
+Martin Ellis' work on correct synthesis~\cite{ellis08_correc} is the first example of a mechanically
verified high-level synthesis tool, also called hardware/software compilers, as they produce
software as well as hardware for special functional units that should be hardware accelerated. The
main goal of the thesis is to provide a framework to prove hardware/software compilers correct, by
diff --git a/chapters/hls.tex b/chapters/hls.tex
index 18227e6..fafb815 100644
--- a/chapters/hls.tex
+++ b/chapters/hls.tex
@@ -9,9 +9,11 @@
\chapter[sec:hls]{High-Level Synthesis}
-This chapter outlines the base implementation of a formally verified \HLS\ tool called
-Vericert~\cite{herklotz21_veric}. This chapter is based on a paper describing the initial
-implementation in detail~\citef{herklotz21_formal_verif_high_level_synth}.
+\startsynopsis
+ This chapter outlines the base implementation of a formally verified high-level synthesis tool
+  called Vericert~\citef{herklotz21_veric}. It is based on a paper describing the initial
+  implementation in detail~\citef{herklotz21_formal_verif_high_level_synth}.
+\stopsynopsis
We have designed a new HLS tool in the Coq theorem prover and proved that any output design it
produces always has the same behaviour as its input program. Our tool, called Vericert, is
@@ -55,17 +57,6 @@ function calls, non-32-bit integers, floats, and global variables.
that its correctness theorem is watertight.
\stopitemize
-\paragraph{Companion material.} Vericert is fully open source and available on GitHub at
-
-\blank[big]
-\startalignment[center]
- \goto{\hyphenatedurl{https://github.com/ymherklotz/vericert}}[url(https://github.com/ymherklotz/vericert)].
-\stopalignment
-\blank[big]
-
-A snapshot of the Vericert development is also available in a Zenodo
-repository~\citef{herklotz21_veric}.
-
\section[sec:hls:design]{Designing a Verified HLS Tool}
This section describes the main architecture of the HLS tool, and the way in which the Verilog back
@@ -99,7 +90,7 @@ K\^{o}ika~\cite{bourgeat20_essen_blues}.
\paragraph{Choice of implementation language.} We chose Coq as the implementation language because
of its mature support for code extraction; that is, its ability to generate OCaml programs directly
from the definitions used in the theorems. We note that other authors have had some success
-reasoning about the HLS process using other theorem provers such as Isabelle~\cite{ellis08}.
+reasoning about the HLS process using other theorem provers such as Isabelle~\cite{ellis08_correc}.
CompCert~\cite{leroy09_formal_verif_realis_compil} was chosen as the front end because it has a well
established framework for simulation proofs about intermediate languages, and it already provides a
validated C parser~\cite{jourdan12_valid_lr_parser}. The Vellvm
@@ -196,11 +187,10 @@ Vericert, this event is either a positive (rising) or a negative (falling) clock
always-blocks triggering on the same event are executed in parallel. Always-blocks can also express
control-flow using if-statements and case-statements.
-\startplacefigure[reference={fig:tutorial:state_machine},location=top]
- \setupinterlinespace[line=2.8ex]
+\startplacemarginfigure[reference={fig:tutorial:state_machine},location=top,
+  title={Example of a simple state machine implemented in Verilog.}]
\startfloatcombination [nx=2, ny=1]
\startplacesubfigure
- \startframedtext[width={0.55\textwidth},frame=off,offset=none,bodyfont=11pt]
+ \startframedtext[width={0.9\textwidth},frame=off,offset=none,loffset=3cm,bodyfont=11pt]
\starthlverilog
module main(input rst, input y,
input clk, output reg z);
@@ -243,7 +233,7 @@ control-flow using if-statements and case-statements.
\stoptikzpicture}
\stopplacesubfigure
\stopfloatcombination
-\stopplacefigure
+\stopplacemarginfigure
A simple state machine can be implemented as shown in Fig.~\ref{fig:tutorial:state_machine}. At
every positive edge of the clock (\mono{clk}), both of the always-blocks will trigger
@@ -258,106 +248,98 @@ will reset the value of \mono{tmp} and then set \mono{z} to the original value o
nonblocking assignment does not change its value until the end of the clock cycle. Finally, the
last always-block will set the state to 0 again.
-\startplacefigure[location=here,reference={hello}]
- \startfloatcombination[nx=2,ny=2]
- \startplacesubfigure[title={1}]
- \externalfigure[hacker.jpg]
+\startplacemarginfigure[location=top,reference={fig:accumulator_c_rtl},title={Translating a simple
+ program from C to Verilog.}]
+ \startfloatcombination[nx=2,ny=1]
+ \startplacefigure[number=no]
+ \startfloatcombination[nx=1,ny=2]
+ \startplacesubfigure[title={Example C code passed to Vericert.}]
+ \startframedtext[width={0.4\textwidth},frame=off,offset=none,bodyfont=11pt]
+ \starthlC
+ int main() {
+ int x[2] = {3, 6};
+ int i = 1;
+ return x[i];
+ }
+ \stophlC
+ \stopframedtext
\stopplacesubfigure
- \startplacesubfigure[title={2}]
- \externalfigure[hacker.jpg]
+ \startplacesubfigure[title={3AC produced by the CompCert front-end without any optimisations.}]
+ \startframedtext[width={0.4\textwidth},frame=off,offset=none,bodyfont=11pt]
+ \starthlC
+ main() {
+ x5 = 3
+ int32[stack(0)] = x5
+ x4 = 6
+ int32[stack(4)] = x4
+ x1 = 1
+ x3 = stack(0) (int)
+ x2 = int32[x3 + x1
+ * 4 + 0]
+ return x2
+ }
+ \stophlC
+ \stopframedtext
\stopplacesubfigure
- \startplacesubfigure[title={3}]
- \externalfigure[hacker.jpg]
- \stopplacesubfigure
- \startplacesubfigure[]
- \externalfigure[hacker.jpg]
+ \stopfloatcombination
+ \stopplacefigure
+ \startplacesubfigure[title={Verilog produced by Vericert. It demonstrates the instantiation of
+ the RAM (lines 9--15), the data-path (lines 16--32) and the control logic (lines 33--42).}]
+ \startframedtext[width={\textwidth},frame=off,offset=none,bodyfont=10pt]
+ \starthlverilog
+module main(reset, clk, finish, return_val);
+ input [0:0] reset, clk;
+ output reg [0:0] finish = 0;
+ output reg [31:0] return_val = 0;
+ reg [31:0] reg_3 = 0, addr = 0, d_in = 0,
+ reg_5 = 0, wr_en = 0;
+ reg [0:0] en = 0, u_en = 0;
+ reg [31:0] state = 0, reg_2 = 0, reg_4 = 0,
+ d_out = 0, reg_1 = 0;
+ reg [31:0] stack [1:0];
+ // RAM interface
+ always @(negedge clk)
+ if ({u_en != en}) begin
+ if (wr_en) stack[addr] <= d_in;
+ else d_out <= stack[addr];
+ en <= u_en;
+ end
+ // Data-path
+ always @(posedge clk)
+ case (state)
+ 32'd11: reg_2 <= d_out;
+ 32'd8: reg_5 <= 32'd3;
+ 32'd7: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
+ d_in <= reg_5; addr <= 32'd0; end
+ 32'd6: reg_4 <= 32'd6;
+ 32'd5: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
+ d_in <= reg_4; addr <= 32'd1; end
+ 32'd4: reg_1 <= 32'd1;
+ 32'd3: reg_3 <= 32'd0;
+ 32'd2: begin u_en <= ( ~ u_en); wr_en <= 32'd0;
+ addr <= {{{reg_3 + 32'd0}
+ + {reg_1 * 32'd4}}
+ / 32'd4};
+ end
+ 32'd1: begin finish = 32'd1; return_val = reg_2; end
+ default: ;
+ endcase
+ // Control logic
+ always @(posedge clk)
+ if ({reset == 32'd1}) state <= 32'd8;
+ else case (state)
+ 32'd11: state <= 32'd1; 32'd4: state <= 32'd3;
+ 32'd8: state <= 32'd7; 32'd3: state <= 32'd2;
+ 32'd7: state <= 32'd6; 32'd2: state <= 32'd11;
+ 32'd6: state <= 32'd5; 32'd1: ;
+ 32'd5: state <= 32'd4; default: ;
+ endcase
+endmodule
+ \stophlverilog
+ \stopframedtext
\stopplacesubfigure
\stopfloatcombination
-\stopplacefigure
-
-\in{Figure}{b}[hello]
-
-%\begin{figure}
-% \centering
-% \begin{subfigure}[b]{0.3\linewidth}
-% \begin{subfigure}[t]{1\linewidth}
-%\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
-%int main() {
-% int x[2] = {3, 6};
-% int i = 1;
-% return x[i];
-%}
-%\end{minted}
-% \caption{Example C code passed to Vericert.}\label{fig:accumulator_c}
-% \end{subfigure}\\\vspace{3em}
-% \begin{subfigure}[b]{1\linewidth}
-%\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
-%main() {
-% x5 = 3
-% int32[stack(0)] = x5
-% x4 = 6
-% int32[stack(4)] = x4
-% x1 = 1
-% x3 = stack(0) (int)
-% x2 = int32[x3 + x1
-% * 4 + 0]
-% return x2
-%}
-%\end{minted}
-% \caption{3AC produced by the CompCert front-end without any optimisations.}\label{fig:accumulator_rtl}
-% \end{subfigure}
-% \end{subfigure}\hfill%
-% \begin{subfigure}[b]{0.65\linewidth}
-%\begin{minted}[fontsize=\tiny,linenos,xleftmargin=20pt]{verilog}
-%module main(reset, clk, finish, return_val);
-% input [0:0] reset, clk;
-% output reg [0:0] finish = 0;
-% output reg [31:0] return_val = 0;
-% reg [31:0] reg_3 = 0, addr = 0, d_in = 0, reg_5 = 0, wr_en = 0;
-% reg [0:0] en = 0, u_en = 0;
-% reg [31:0] state = 0, reg_2 = 0, reg_4 = 0, d_out = 0, reg_1 = 0;
-% reg [31:0] stack [1:0];
-% // RAM interface
-% always @(negedge clk)
-% if ({u_en != en}) begin
-% if (wr_en) stack[addr] <= d_in;
-% else d_out <= stack[addr];
-% en <= u_en;
-% end
-% // Data-path
-% always @(posedge clk)
-% case (state)
-% 32'd11: reg_2 <= d_out;
-% 32'd8: reg_5 <= 32'd3;
-% 32'd7: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
-% d_in <= reg_5; addr <= 32'd0; end
-% 32'd6: reg_4 <= 32'd6;
-% 32'd5: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
-% d_in <= reg_4; addr <= 32'd1; end
-% 32'd4: reg_1 <= 32'd1;
-% 32'd3: reg_3 <= 32'd0;
-% 32'd2: begin u_en <= ( ~ u_en); wr_en <= 32'd0;
-% addr <= {{{reg_3 + 32'd0} + {reg_1 * 32'd4}} / 32'd4}; end
-% 32'd1: begin finish = 32'd1; return_val = reg_2; end
-% default: ;
-% endcase
-% // Control logic
-% always @(posedge clk)
-% if ({reset == 32'd1}) state <= 32'd8;
-% else case (state)
-% 32'd11: state <= 32'd1; 32'd4: state <= 32'd3;
-% 32'd8: state <= 32'd7; 32'd3: state <= 32'd2;
-% 32'd7: state <= 32'd6; 32'd2: state <= 32'd11;
-% 32'd6: state <= 32'd5; 32'd1: ;
-% 32'd5: state <= 32'd4; default: ;
-% endcase
-%endmodule
-%\end{minted}
-%\caption{Verilog produced by Vericert. It demonstrates the instantiation of the RAM (lines 9--15), the data-path (lines 16--32) and the control logic (lines 33--42).}\label{fig:accumulator_v}
-%\end{subfigure}
-%\caption{Translating a simple program from C to Verilog.}\label{fig:accumulator_c_rtl}
-%% \NR{Is the default in line 41 of (c) supposed to be empty?}\YH{Yep, that's how it's generated.}
-%\end{figure}
+\stopplacemarginfigure
\subsection{Translating C to Verilog by Example}
Fig.~\ref{fig:accumulator_c_rtl} illustrates the translation of a simple program that stores and retrieves values from an array.
@@ -393,112 +375,113 @@ similar semantics to Verilog at the lower level by already using Verilog stateme
abstract instructions. Compared to plain Verilog, HTL is simpler to manipulate and analyse, thereby
making it easier to prove optimisations like proper RAM insertion.
-%\begin{figure*}
-% \centering
-%\definecolor{control}{HTML}{b3e2cd}
-%\definecolor{data}{HTML}{fdcdac}
-%\begin{tikzpicture}
-% \begin{scope}[scale=1.15]
-% \fill[control,fill opacity=1] (6.5,0) rectangle (12,5);
-% \fill[data,fill opacity=1] (0,0) rectangle (5.5,5);
-% \node at (1,4.7) {Data-path};
-% \node at (7.5,4.7) {Control Logic};
-%
-% \fill[white,rounded corners=10pt] (7,0.5) rectangle (11.5,2.2);
-% \node at (8,2) {\footnotesize Next State FSM};
-% \foreach \x in {8,...,2}
-% {\pgfmathtruncatemacro{\y}{8-\x}%
-% \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.35) {\tiny \x};}
-% \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s1c) at (11,1.35) {\tiny 1};
-% \node[draw,circle,inner sep=0,minimum size=13,scale=0.8] (s1) at (s1c) {};
-% \foreach \x in {8,...,3}
-% {\pgfmathtruncatemacro{\y}{\x-1}\draw[-{Latex[length=1mm,width=0.7mm]}] (s\x) -- (s\y);}
-% \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s11) at (10.5,0.9) {\tiny 11};
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (s2) -- (s11);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (s11) -- (s1);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (7.2,1.7) to [out=0,in=100] (s8);
-%
-% \node[draw,fill=white] (nextstate) at (9.25,3) {\tiny Current State};
-% \draw[-{Latex[length=1mm,width=0.7mm]}] let \p1 = (nextstate) in
-% (11.5,1.25) -| (11.75,\y1) -- (nextstate);
-% \draw let \p1 = (nextstate) in (nextstate) -- (6,\y1) |- (6,1.5);
-% \node[scale=0.5,rotate=60] at (7.5,0.75) {\mono{clk}};
-% \node[scale=0.5,rotate=60] at (7.7,0.75) {\mono{rst}};
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (7.65,-0.5) -- (7.65,0.5);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (7.45,-0.5) -- (7.45,0.5);
-%
-% \fill[white,rounded corners=10pt] (2,0.5) rectangle (5,3);
-% \filldraw[fill=white] (0.25,0.5) rectangle (1.5,2.75);
-% \node at (2.6,2.8) {\footnotesize Update};
-% \node[align=center] at (0.875,2.55) {\footnotesize \mono{RAM}};
-% \node[scale=0.5] at (4.7,1.5) {\mono{state}};
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (5,1.5);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (7,1.5);
-% \node[scale=0.5,rotate=60] at (4.1,0.9) {\mono{finished}};
-% \node[scale=0.5,rotate=60] at (3.9,0.95) {\mono{return\_val}};
-% \node[scale=0.5,rotate=60] at (2.5,0.75) {\mono{clk}};
-% \node[scale=0.5,rotate=60] at (2.7,0.75) {\mono{rst}};
-%
-% \node[scale=0.5,right,inner sep=5pt] (ram1) at (2,2.1) {\mono{u\_en}};
-% \node[scale=0.5,right,inner sep=5pt] (ram2) at (2,1.9) {\mono{wr\_en}};
-% \node[scale=0.5,right,inner sep=5pt] (ram3) at (2,1.7) {\mono{addr}};
-% \node[scale=0.5,right,inner sep=5pt] (ram4) at (2,1.5) {\mono{d\_in}};
-% \node[scale=0.5,right,inner sep=5pt] (ram5) at (2,1.3) {\mono{d\_out}};
-%
-% \node[scale=0.5,left,inner sep=5pt] (r1) at (1.5,2.1) {\mono{u\_en}};
-% \node[scale=0.5,left,inner sep=5pt] (r2) at (1.5,1.9) {\mono{wr\_en}};
-% \node[scale=0.5,left,inner sep=5pt] (r3) at (1.5,1.7) {\mono{addr}};
-% \node[scale=0.5,left,inner sep=5pt] (r4) at (1.5,1.5) {\mono{d\_in}};
-% \node[scale=0.5,left,inner sep=5pt] (r5) at (1.5,1.3) {\mono{d\_out}};
-%
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (ram1) -- (r1);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (ram2) -- (r2);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (ram3) -- (r3);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (ram4) -- (r4);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (r5) -- (ram5);
-%
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (4,0.5) -- (4,-0.5);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (3.75,0.5) -- (3.75,-0.5);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (2.45,-0.5) -- (2.45,0.5);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (2.65,-0.5) -- (2.65,0.5);
-%
-% \foreach \x in {0,...,1}
-% {\draw (0.25,1-0.25*\x) -- (1.5,1-0.25*\x); \node at (0.875,0.88-0.25*\x) {\tiny \x};}
-%
-% %\node[scale=0.5] at (1.2,2.2) {\mono{wr\_en}};
-% %\node[scale=0.5] at (1.2,2) {\mono{wr\_addr}};
-% %\node[scale=0.5] at (1.2,1.8) {\mono{wr\_data}};
-% %\node[scale=0.5] at (1.2,1.4) {\mono{r\_addr}};
-% %\node[scale=0.5] at (1.2,1.2) {\mono{r\_data}};
-% %
-% %\node[scale=0.5] at (2.3,2.2) {\mono{wr\_en}};
-% %\node[scale=0.5] at (2.3,2) {\mono{wr\_addr}};
-% %\node[scale=0.5] at (2.3,1.8) {\mono{wr\_data}};
-% %\node[scale=0.5] at (2.3,1.4) {\mono{r\_addr}};
-% %\node[scale=0.5] at (2.3,1.2) {\mono{r\_data}};
-% %
-% %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.2) -- (1.5,2.2);
-% %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2) -- (1.5,2);
-% %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.8) -- (1.5,1.8);
-% %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.4) -- (1.5,1.4);
-% %\draw[-{Latex[length=1mm,width=0.7mm]}] (1.5,1.2) -- (2,1.2);
-%
-% \filldraw[fill=white] (2.8,3.25) rectangle (4.2,4.75);
-% \node at (3.5,4.55) {\footnotesize \mono{Registers}};
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.4) -| (1.75,4) -- (2.8,4);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (4.2,4) -- (5.25,4) |- (5,2.4);
-% \draw[-{Latex[length=1mm,width=0.7mm]}] (5.25,2.4) -- (6.2,2.4) |- (7,1.8);
-%
-% \node[scale=0.5] at (3.5,4.2) {\mono{reg\_1}};
-% \node[scale=0.5] at (3.5,4) {\mono{reg\_2}};
-% \node[scale=0.5] at (3.5,3.8) {\mono{reg\_3}};
-% \node[scale=0.5] at (3.5,3.6) {\mono{reg\_4}};
-% \node[scale=0.5] at (3.5,3.4) {\mono{reg\_5}};
-%\end{scope}
-%\end{tikzpicture}
-% \Description{Diagram displaying the data-path and its internal modules, as well as the control logic and its state machine.}
-% \caption{The FSMD for the example shown in Fig.~\ref{fig:accumulator_c_rtl}, split into a data-path and control logic for the next state calculation. The Update block takes the current state, current values of all registers and at most one value stored in the RAM, and calculates a new value that can either be stored back in the RAM or in a register.}\label{fig:accumulator_diagram}
-%\end{figure*}
+\startplacemarginfigure[title={The FSMD for the example shown in Fig.~\ref{fig:accumulator_c_rtl},
+ split into a data-path and control logic for the next state calculation. The Update block takes
+ the current state, current values of all registers and at most one value stored in the RAM, and
+ calculates a new value that can either be stored back in the RAM or in a
+ register.},reference={fig:accumulator_diagram},location=top]
+\definecolor[control][x=B3E2CD]
+\definecolor[data][x=FDCDAC]
+\starttikzpicture
+ \startscope[scale=1.15]
+ \fill[control,fill opacity=1] (6.5,0) rectangle (12,5);
+ \fill[data,fill opacity=1] (0,0) rectangle (5.5,5);
+ \node at (1,4.7) {Data-path};
+ \node at (7.5,4.7) {Control Logic};
+
+ \fill[white,rounded corners=10pt] (7,0.5) rectangle (11.5,2.2);
+ \node at (8,2) {\rmxx Next State FSM};
+ \foreach \x in {8,...,2}
+ {\pgfmathtruncatemacro{\y}{8-\x}%
+ \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.35) {\rmxx \x};}
+ \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s1c) at (11,1.35) {\rmxx 1};
+ \node[draw,circle,inner sep=0,minimum size=13,scale=0.8] (s1) at (s1c) {};
+ \foreach \x in {8,...,3}
+ {\pgfmathtruncatemacro{\y}{\x-1}\draw[-{Latex[length=1mm,width=0.7mm]}] (s\x) -- (s\y);}
+ \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s11) at (10.5,0.9) {\rmxx 11};
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (s2) -- (s11);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (s11) -- (s1);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (7.2,1.7) to [out=0,in=100] (s8);
+
+ \node[draw,fill=white] (nextstate) at (9.25,3) {\rmxx Current State};
+ \draw[-{Latex[length=1mm,width=0.7mm]}] let \p1 = (nextstate) in
+ (11.5,1.25) -| (11.75,\y1) -- (nextstate);
+ \draw let \p1 = (nextstate) in (nextstate) -- (6,\y1) |- (6,1.5);
+ \node[scale=0.5,rotate=60] at (7.5,0.75) {\mono{clk}};
+ \node[scale=0.5,rotate=60] at (7.7,0.75) {\mono{rst}};
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (7.65,-0.5) -- (7.65,0.5);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (7.45,-0.5) -- (7.45,0.5);
+
+ \fill[white,rounded corners=10pt] (2,0.5) rectangle (5,3);
+ \filldraw[fill=white] (0.25,0.5) rectangle (1.5,2.75);
+ \node at (2.6,2.8) {\rmxx Update};
+ \node[align=center] at (0.875,2.55) {\rmxx \mono{RAM}};
+ \node[scale=0.5] at (4.7,1.5) {\mono{state}};
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (5,1.5);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (7,1.5);
+ \node[scale=0.5,rotate=60] at (4.1,0.9) {\mono{finished}};
+ \node[scale=0.5,rotate=60] at (3.9,0.95) {\mono{return\_val}};
+ \node[scale=0.5,rotate=60] at (2.5,0.75) {\mono{clk}};
+ \node[scale=0.5,rotate=60] at (2.7,0.75) {\mono{rst}};
+
+ \node[scale=0.5,right,inner sep=5pt] (ram1) at (2,2.1) {\mono{u\_en}};
+ \node[scale=0.5,right,inner sep=5pt] (ram2) at (2,1.9) {\mono{wr\_en}};
+ \node[scale=0.5,right,inner sep=5pt] (ram3) at (2,1.7) {\mono{addr}};
+ \node[scale=0.5,right,inner sep=5pt] (ram4) at (2,1.5) {\mono{d\_in}};
+ \node[scale=0.5,right,inner sep=5pt] (ram5) at (2,1.3) {\mono{d\_out}};
+
+ \node[scale=0.5,left,inner sep=5pt] (r1) at (1.5,2.1) {\mono{u\_en}};
+ \node[scale=0.5,left,inner sep=5pt] (r2) at (1.5,1.9) {\mono{wr\_en}};
+ \node[scale=0.5,left,inner sep=5pt] (r3) at (1.5,1.7) {\mono{addr}};
+ \node[scale=0.5,left,inner sep=5pt] (r4) at (1.5,1.5) {\mono{d\_in}};
+ \node[scale=0.5,left,inner sep=5pt] (r5) at (1.5,1.3) {\mono{d\_out}};
+
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram1) -- (r1);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram2) -- (r2);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram3) -- (r3);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram4) -- (r4);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (r5) -- (ram5);
+
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (4,0.5) -- (4,-0.5);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (3.75,0.5) -- (3.75,-0.5);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (2.45,-0.5) -- (2.45,0.5);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (2.65,-0.5) -- (2.65,0.5);
+
+ \foreach \x in {0,...,1}
+ {\draw (0.25,1-0.25*\x) -- (1.5,1-0.25*\x); \node at (0.875,0.88-0.25*\x) {\rmxx \x};}
+
+ %\node[scale=0.5] at (1.2,2.2) {\mono{wr\_en}};
+ %\node[scale=0.5] at (1.2,2) {\mono{wr\_addr}};
+ %\node[scale=0.5] at (1.2,1.8) {\mono{wr\_data}};
+ %\node[scale=0.5] at (1.2,1.4) {\mono{r\_addr}};
+ %\node[scale=0.5] at (1.2,1.2) {\mono{r\_data}};
+ %
+ %\node[scale=0.5] at (2.3,2.2) {\mono{wr\_en}};
+ %\node[scale=0.5] at (2.3,2) {\mono{wr\_addr}};
+ %\node[scale=0.5] at (2.3,1.8) {\mono{wr\_data}};
+ %\node[scale=0.5] at (2.3,1.4) {\mono{r\_addr}};
+ %\node[scale=0.5] at (2.3,1.2) {\mono{r\_data}};
+ %
+ %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.2) -- (1.5,2.2);
+ %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2) -- (1.5,2);
+ %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.8) -- (1.5,1.8);
+ %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.4) -- (1.5,1.4);
+ %\draw[-{Latex[length=1mm,width=0.7mm]}] (1.5,1.2) -- (2,1.2);
+
+ \filldraw[fill=white] (2.8,3.25) rectangle (4.2,4.75);
+ \node at (3.5,4.55) {\rmxx \mono{Registers}};
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.4) -| (1.75,4) -- (2.8,4);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (4.2,4) -- (5.25,4) |- (5,2.4);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (5.25,2.4) -- (6.2,2.4) |- (7,1.8);
+
+ \node[scale=0.5] at (3.5,4.2) {\mono{reg\_1}};
+ \node[scale=0.5] at (3.5,4) {\mono{reg\_2}};
+ \node[scale=0.5] at (3.5,3.8) {\mono{reg\_3}};
+ \node[scale=0.5] at (3.5,3.6) {\mono{reg\_4}};
+ \node[scale=0.5] at (3.5,3.4) {\mono{reg\_5}};
+\stopscope
+\stoptikzpicture
+\stopplacemarginfigure
\paragraph{Translating memory} Typically, HLS-generated hardware consists of a sea of registers and
RAMs. This memory view is very different from the C memory model, so we perform the following
@@ -588,8 +571,8 @@ which performs the requested load or store. By factoring all the memory accesses
interface, we ensure that the underlying array is only accessed from a single program point in the
Verilog code, and thus ensure that the synthesis tool will correctly infer a RAM
block.\footnote{Interestingly, the Verilog code shown for the RAM interface must not be modified,
- because the synthesis tool will only generate a RAM when the code matches a small set of specific
- patterns.}
+ because the synthesis tool will only generate a RAM when the code matches a small set of
+ specific patterns.}
Therefore, an extra compiler pass is added from HTL to HTL to extract all the direct accesses to the
Verilog array and replace them by signals that access the RAM interface in a separate
@@ -624,48 +607,21 @@ current value \mono{u\_en} set by the data-path. When the values are different,
enabled, and then \mono{en} is set to the value of \mono{u\_en}. This ensures that the RAM will
always be disabled straight after it was used, without having to insert or modify any other states.
-%\begin{figure}
-% \centering
-% \begin{subfigure}[b]{0.48\linewidth}
-% \begin{tikztimingtable}[timing/d/background/.style={fill=white}]
-% \small clk & 2L 3{6C} \\
-% \small u\_en & 2D{u\_en} 18D{$\overline{\text{u\_en}}$}\\
-% \small addr & 2U 18D{3} \\
-% \small wr\_en & 2U 18L \\
-% \small en & 8D{u\_en} 12D{$\overline{\text{u\_en}}$}\\
-% \small d\_out & 8U 12D{0xDEADBEEF} \\
-% \small r & 14U 6D{0xDEADBEEF} \\
-% \extracode
-% \node[help lines] at (2,2.25) {\tiny 1};
-% \node[help lines] at (8,2.25) {\tiny 2};
-% \node[help lines] at (14,2.25) {\tiny 3};
-% \begin{pgfonlayer}{background}
-% \vertlines[help lines]{2,8,14}
-% \end{pgfonlayer}
-% \end{tikztimingtable}
-% \caption{Timing diagram for loads. At time 1, the \mono{u\_en} signal is toggled to enable the RAM. At time 2, \mono{d\_out} is set to the value stored at the address in the RAM, which is finally assigned to the register \mono{r} at time 3.}\label{fig:ram_load}
-% \end{subfigure}\hfill%
-% \begin{subfigure}[b]{0.48\linewidth}
-% \begin{tikztimingtable}[timing/d/background/.style={fill=white}]
-% \small clk & 2L 2{7C} \\
-% \small u\_en & 2D{u\_en} 14D{$\overline{\text{u\_en}}$}\\
-% \small addr & 2U 14D{3} \\
-% \small wr\_en & 2U 14H \\
-% \small d\_in & 2U 14D{0xDEADBEEF} \\
-% \small en & 9D{u\_en} 7D{$\overline{\text{u\_en}}$}\\
-% \small stack[addr] & 9U 7D{0xDEADBEEF} \\
-% \extracode
-% \node[help lines] at (2,2.25) {\tiny 1};
-% \node[help lines] at (9,2.25) {\tiny 2};
-% \begin{pgfonlayer}{background}
-% \vertlines[help lines]{2,9}
-% \end{pgfonlayer}
-% \end{tikztimingtable}
-% \caption{Timing diagram for stores. At time 1, the \mono{u\_en} signal is toggled to enable the RAM, and the address \mono{addr} and the data to store \mono{d\_in} are set. On the negative edge at time 2, the data is stored into the RAM.}\label{fig:ram_store}
-% \end{subfigure}
-% \Description{Timing diagrams of loads and stores, showing which signals are modified at which time step.}
-% \caption{Timing diagrams showing the execution of loads and stores over multiple clock cycles.}\label{fig:ram_load_store}
-%\end{figure}
+\startplacemarginfigure[reference={fig:ram_load_store},title={Timing diagrams showing the execution
+ of loads and stores over multiple clock cycles.}]
+ \startfloatcombination[nx=2]
+ \startplacesubfigure[title={Timing diagram for loads. At time 1, the \mono{u\_en} signal is
+ toggled to enable the RAM. At time 2, \mono{d\_out} is set to the value stored at the
+ address in the RAM, which is finally assigned to the register \mono{r} at time 3.}]
+ \externalfigure[figures/timing-1.pdf]
+ \stopplacesubfigure
+ \startplacesubfigure[title={Timing diagram for stores. At time 1, the \mono{u\_en} signal is
+ toggled to enable the RAM, and the address \mono{addr} and the data to store \mono{d\_in} are
+ set. On the negative edge at time 2, the data is stored into the RAM.}]
+ \externalfigure[figures/timing-2.pdf]
+ \stopplacesubfigure
+ \stopfloatcombination
+\stopplacemarginfigure
Fig.~\ref{fig:ram_load_store} gives an example of how the RAM interface behaves when values are
loaded and stored.
@@ -718,7 +674,10 @@ big-step semantics is used to execute all the statements. An example of a rule f
always-block that is triggered at the positive edge of the clock is shown below, where $\Sigma$ is
the state of the registers in the module and $s$ is the statement inside the always-block:
-%\startformula \inferrule[Always]{(\Sigma, s)\downarrow_{\text{stmnt}} \Sigma'}{(\Sigma, \yhkeyword{always @(posedge clk) } s) \downarrow_{\text{always}^{+}} \Sigma'} \stopformula
+\placeformula\startformula
+ \text{\sc Always }\ \dfrac{(\Sigma, s)\downarrow_{\text{stmnt}} \Sigma'}{(\Sigma, \text{\mono{always
+ @(posedge clk) }} s) \downarrow_{\text{always}^{+}} \Sigma'}
+\stopformula
This rule says that assuming the statement $s$ in the always-block runs with state $\Sigma$ and
produces the new state $\Sigma'$, the always-block will result in the same final state.
@@ -732,7 +691,7 @@ contains the values that will be assigned at the end of the clock cycle. $\Sigma
defined as follows: $\Sigma = (\Gamma, \Delta)$. Nonblocking assignment can therefore be expressed
as follows:
-%\startformula \inferrule[Nonblocking Reg]{\yhkeyword{name}\ d = \yhkeyword{OK}\ n \\ (\Gamma, e) \downarrow_{\text{expr}} v}{((\Gamma, \Delta), d\ \yhkeyword{ <= } e) \downarrow_{\text{stmnt}} (\Gamma, \Delta [n \mapsto v])}\\ \stopformula
+\placeformula\startformula \text{\sc Nonblocking Reg }\ \frac{\text{\tt name}\ d = \text{\tt OK}\ n
+  \qquad (\Gamma, e) \downarrow_{\text{expr}} v}{((\Gamma, \Delta), d\ \text{\tt <= } e)
+  \downarrow_{\text{stmnt}} (\Gamma, \Delta [n \mapsto v])} \stopformula
where assuming that $\downarrow_{\text{expr}}$ evaluates an expression $e$ to a value $v$, the
nonblocking assignment $d\ \mono{ <= } e$ updates the future state of the variable $d$ with value
@@ -740,8 +699,11 @@ $v$.
Finally, the following rule dictates how the whole module runs in one clock cycle:
-%\startformula \inferrule[Module]{(\Gamma, \epsilon, \vec{m})\ \downarrow_{\text{module}} (\Gamma', \Delta')}{(\Gamma, \yhkeyword{module } \yhconstant{main} \yhkeyword{(...);}\ \vec{m}\ \yhkeyword{endmodule}) \downarrow_{\text{program}} (\Gamma'\ //\ \Delta')} \stopformula
-%
+\placeformula\startformula \text{\sc Module }\ \frac{(\Gamma, \epsilon, \vec{m})\
+ \downarrow_{\text{module}} (\Gamma', \Delta')}{(\Gamma, \text{\tt module } \text{\tt main}
+ \text{\tt (...);}\ \vec{m}\ \text{\tt endmodule}) \downarrow_{\text{program}} (\Gamma'\ //\
+ \Delta')} \stopformula
+
where $\Gamma$ is the initial state of all the variables, $\epsilon$ is the empty map because the
$\Delta$ map is assumed to be empty at the start of the clock cycle, and $\vec{m}$ is a list of
variable declarations and always-blocks that $\downarrow_{\text{module}}$ evaluates sequentially to
@@ -750,11 +712,11 @@ operator, which gives priority to the right-hand operand in a conflict. This rul
nonblocking assignments overwrite at the end of the clock cycle any blocking assignments made during
the cycle.
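+
+As a small worked example of the merge operator (the values are chosen purely for illustration), if
+$\Gamma = \{ x \mapsto 1, y \mapsto 2 \}$ and $\Delta = \{ y \mapsto 3 \}$, then
+
+\startformula
+  \Gamma\ //\ \Delta = \{ x \mapsto 1, y \mapsto 3 \},
+\stopformula
+
+since the right-hand operand takes priority for the conflicting assignment to $y$.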
-\subsection[title={Changes to the Semantics},reference={changes-to-the-semantics}]
+\subsection[changes-to-the-semantics]{Changes to the Semantics}
Five changes were made to the semantics proposed by to make it suitable as an HLS target.
-\subsubsection[title={Adding array support},reference={adding-array-support}]
+\subsubsection[adding-array-support]{Adding array support}
The main change is the addition of support for arrays, which are needed to model RAM in Verilog. RAM
is needed to model the stack in C efficiently, without having to declare a variable for each
@@ -785,7 +747,11 @@ described in Section~\goto{{[}sec:algorithm:optimisation:ram{]}}[sec:algorithm:o
support for negative-edge-triggered always-blocks was added to the semantics. This is shown in the
modifications of the {\sc Module} rule shown below:
-%\startformula \inferrule[Module]{(\Gamma, \epsilon, \vec{m})\ \downarrow_{\text{module}^{+}} (\Gamma', \Delta') \\ (\Gamma'\ //\ \Delta', \epsilon, \vec{m}) \downarrow_{\text{module}^{-}} (\Gamma'', \Delta'')}{(\Gamma, \yhkeyword{module}\ \yhconstant{main} \yhkeyword{(...);}\ \vec{m}\ \yhkeyword{endmodule}) \downarrow_{\text{program}} (\Gamma''\ //\ \Delta'')} \stopformula
+\placeformula\startformula \text{\sc Module }\ \frac{(\Gamma, \epsilon, \vec{m})\
+ \downarrow_{\text{module}^{+}} (\Gamma', \Delta') \\ (\Gamma'\ //\ \Delta', \epsilon, \vec{m})
+ \downarrow_{\text{module}^{-}} (\Gamma'', \Delta'')}{(\Gamma, \text{\tt module}\ \text{\tt main}
+ \text{\tt (...);}\ \vec{m}\ \text{\tt endmodule}) \downarrow_{\text{program}} (\Gamma''\ //\
+ \Delta'')} \stopformula
The main execution of the module $\downarrow_{\text{module}}$ is split into
$\downarrow_{\text{module}^{+}}$ and $\downarrow_{\text{module}^{-}}$, which are rules that only
@@ -817,15 +783,24 @@ because (currently) only supports types represented by 32 bits.
\subsection[sec:verilog:integrating]{Integrating the Verilog Semantics into 's Model}
-%\startformula \begin{gathered}
-% \inferrule[Step]{\Gamma_r[\mathit{rst}] = 0 \\ \Gamma_r[\mathit{fin}] = 0 \\ (m, (\Gamma_r, \Gamma_a))\ \downarrow_{\text{program}} (\Gamma_r', \Gamma_a')}{\yhconstant{State}\ \mathit{sf}\ m\ \ \Gamma_r[\sigma]\ \ \Gamma_r\ \Gamma_a \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ \ \Gamma_r'[\sigma]\ \ \Gamma_r'\ \Gamma_a'}\\
-% %
-% \inferrule[Finish]{\Gamma_r[\mathit{fin}] = 1}{\yhconstant{State}\ \mathit{sf}\ m\ \sigma\ \Gamma_r\ \Gamma_a \longrightarrow \yhconstant{Returnstate}\ \mathit{sf}\ \Gamma_r[ \mathit{ret}]}\\
-% %
-% \inferrule[Call]{ }{\yhconstant{Callstate}\ \mathit{sf}\ m\ \vec{r} \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ n\ ((\yhfunction{init\_params}\ \vec{r}\ a)[\sigma \mapsto n, \mathit{fin} \mapsto 0, \mathit{rst} \mapsto 0])\ \epsilon}\\
-% %
-% \inferrule[Return]{ }{\yhconstant{Returnstate}\ (\yhconstant{Stackframe}\ r\ m\ \mathit{pc}\ \Gamma_r\ \Gamma_a :: \mathit{sf})\ v \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ \mathit{pc}\ (\Gamma_{r} [ \sigma \mapsto \mathit{pc}, r \mapsto v ])\ \Gamma_{a}}
-% \end{gathered} \stopformula
+\startplacemarginfigure[location=top]
+\startformula \startalign[n=1]
+ \NC\text{\sc Step }\ \frac{\startmatrix[n=1]
+ \NC \Gamma_r[\mathit{rst}] = 0 \qquad \Gamma_r[\mathit{fin}] = 0 \NR
+ \NC(m, (\Gamma_r, \Gamma_a))\ \downarrow_{\text{program}} (\Gamma_r', \Gamma_a') \NR
+ \stopmatrix}{\text{\tt State}\ \mathit{sf}\ m\ \ \Gamma_r[\sigma]\ \ \Gamma_r\ \Gamma_a
+ \longrightarrow \text{\tt State}\ \mathit{sf}\ m\ \ \Gamma_r'[\sigma]\ \ \Gamma_r'\ \Gamma_a'}\NR
+%
+ \NC\text{\sc Finish }\ \frac{\Gamma_r[\mathit{fin}] = 1}{\text{\tt State}\ \mathit{sf}\ m\ \sigma\
+ \Gamma_r\ \Gamma_a \longrightarrow \text{\tt Returnstate}\ \mathit{sf}\ \Gamma_r[ \mathit{ret}]}\NR
+%
+ \NC\text{\sc Call }\ \frac{}{\text{\tt Callstate}\ \mathit{sf}\ m\ \vec{r} \longrightarrow
+ \text{\tt State}\ \mathit{sf}\ m\ n\ ((\text{\tt init\_params}\ \vec{r}\ a)[\sigma \mapsto n, \mathit{fin} \mapsto 0, \mathit{rst} \mapsto 0])\ \epsilon}\NR
+%
+ \NC\text{\sc Return }\ \frac{}{\text{\tt Returnstate}\ (\text{\tt Stackframe}\ r\ m\ \mathit{pc}\
+ \Gamma_r\ \Gamma_a :: \mathit{sf})\ v \longrightarrow \text{\tt State}\ \mathit{sf}\ m\ \mathit{pc}\ (\Gamma_{r} [ \sigma \mapsto \mathit{pc}, r \mapsto v ])\ \Gamma_{a}}\NR
+\stopalign \stopformula
+\stopplacemarginfigure
The computation model defines a set of states through which execution passes. In this subsection, we
explain how we extend our Verilog semantics with four special-purpose registers in order to
@@ -833,49 +808,32 @@ integrate it into .
executions pass through three main states:
-%\startdescription{\type{State} $\mathit{sf}$ $m$ $v$ $\Gamma_{r}$
-%$\Gamma_{a}$}
-% The main state when executing a function, with stack frame
-% $\mathit{sf}$, current module $m$, current state $v$ and variable
-% states $\Gamma_{r}$ and $\Gamma_{a}$.
-%\stopdescription
-%
-%\startdescription{\type{Callstate} $\mathit{sf}$ $m$ $\vec{r}$}
-% The state that is reached when a function is called, with the current
-% stack frame $\mathit{sf}$, current module $m$ and arguments $\vec{r}$.
-%\stopdescription
-%
-%\startdescription{\type{Returnstate} $\mathit{sf}$ $v$}
-% The state that is reached when a function returns back to the caller,
-% with stack frame $\mathit{sf}$ and return value $v$.
-%\stopdescription
-%
-%To support this computational model, we extend the Verilog module we
-%generate with the following four registers and a RAM block:
-%
-%\startdescription{program counter}
-% The program counter can be modelled using a register that keeps track
-% of the state, denoted as $\sigma$.
-%\stopdescription
-%
-%\startdescription{function entry point}
-% When a function is called, the entry point denotes the first
-% instruction that will be executed. This can be modelled using a reset
-% signal that sets the state accordingly, denoted as $\mathit{rst}$.
-%\stopdescription
-%
-%\startdescription{return value}
-% The return value can be modelled by setting a finished flag to 1 when
-% the result is ready, and putting the result into a 32-bit output
-% register. These are denoted as $\mathit{fin}$ and $\mathit{ret}$
-% respectively.
-%\stopdescription
-%
-%\startdescription{stack}
-% The function stack can be modelled as a RAM block, which is
-% implemented using an array in the module, and denoted as
-% $\mathit{stk}$.
-%\stopdescription
+\desc{\type{State} $\mathit{sf}$ $m$ $v$ $\Gamma_{r}$ $\Gamma_{a}$} The main state when executing a
+function, with stack frame $\mathit{sf}$, current module $m$, current state $v$ and variable states
+$\Gamma_{r}$ and $\Gamma_{a}$.
+
+\desc{\type{Callstate} $\mathit{sf}$ $m$ $\vec{r}$} The state that is reached when a function is
+called, with the current stack frame $\mathit{sf}$, current module $m$ and arguments $\vec{r}$.
+
+\desc{\type{Returnstate} $\mathit{sf}$ $v$} The state that is reached when a function returns to the
+caller, with stack frame $\mathit{sf}$ and return value $v$.
+
+To support this computational model, we extend the Verilog module we generate with the following
+four registers and a RAM block:
+
+\desc{program counter} The program counter can be modelled using a register that keeps track of the
+state, denoted as $\sigma$.
+
+\desc{function entry point} When a function is called, the entry point denotes the first instruction
+that will be executed. This can be modelled using a reset signal that sets the state accordingly,
+denoted as $\mathit{rst}$.
+
+\desc{return value} The return value can be modelled by setting a finished flag to 1 when the result
+is ready, and putting the result into a 32-bit output register. These are denoted as $\mathit{fin}$
+and $\mathit{ret}$ respectively.
+
+\desc{stack} The function stack can be modelled as a RAM block, which is implemented using an array
+in the module, and denoted as $\mathit{stk}$.
Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module] shows the inference rules for moving
between the computational states. The first, {\sc Step}, is the normal rule of execution. It defines
@@ -893,16 +851,18 @@ function. Therefore, in addition to the rules shown in
Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module], an initial state and final state need
to be defined:
-%\startformula \begin{gathered}
-% \inferrule[Initial]{\yhfunction{is\_internal}\ P.\mono{main}}{\yhfunction{initial\_state}\ (\yhconstant{Callstate}\ []\ P.\mono{main}\ [])}\qquad
-% \inferrule[Final]{ }{\yhfunction{final\_state}\ (\yhconstant{Returnstate}\ []\ n)\ n}\end{gathered} \stopformula
+\placeformula\startformula \startalign[n=1]
+ \NC\text{\sc Initial }\ \frac{\text{\tt is\_internal}\ P.\text{\tt main}}{\text{\tt initial\_state}\
+   (\text{\tt Callstate}\ []\ P.\text{\tt main}\ [])}\NR
+  \NC\text{\sc Final }\ \frac{}{\text{\tt final\_state}\ (\text{\tt Returnstate}\ []\ n)\ n}\NR
+\stopalign \stopformula
where the initial state is the \type{Callstate} with an empty stack frame and no arguments for the
\type{main} function of program $P$, where this \type{main} function needs to be in the current
translation unit. The final state results in the program output of value $n$ when reaching a
\type{Returnstate} with an empty stack frame.
-\subsection[title={Memory Model},reference={sec:verilog:memory}]
+\subsection[sec:verilog:memory]{Memory Model}
The Verilog semantics do not define a memory model for Verilog, as this is not needed for a hardware
description language. There is no preexisting architecture that Verilog will produce; it can
@@ -981,7 +941,8 @@ supported in , this trace will always be empty.
Whenever the translation from $C$ succeeds and produces Verilog $V$, and all observable behaviours
of $C$ are safe, then $V$ has behaviour $B$ only if $C$ has behaviour $B$.
-%\startformula \forall C, V, B,\quad \yhfunction{HLS} (C) = \yhconstant{OK} (V) \land \mathit{Safe}(C) \implies (V \Downarrow B \implies C \Downarrow B). \stopformula
+\placeformula\startformula \forall C, V, B,\quad \text{\tt HLS} (C) = \text{\tt OK} (V) \land
+\mathit{Safe}(C) \implies (V \Downarrow B \implies C \Downarrow B). \stopformula
Why is this correctness theorem also the right one for HLS? It could be argued that hardware
inherently runs forever and therefore does not produce a definitive final result. This would mean
@@ -1010,11 +971,10 @@ forward simulation, it suffices to prove forward simulations between each pair o
intermediate languages, as these results can be composed to prove the correctness of the whole HLS
tool. The forward simulation from 3AC to HTL is stated in \in{Lemma}[lemma:htl]
(\in{Section}[sec:proof:3ac_htl]), the forward simulation for the RAM insertion is shown in
-Lemma~\goto{{[}lemma:htl_ram{]}}[lemma:htl_ram] (\in{Section}[sec:proof:ram_insertion]), then the
-forward simulation between HTL and Verilog is shown in
-Lemma~\goto{{[}lemma:verilog{]}}[lemma:verilog] (\in{Section}[sec:proof:htl_verilog]), and finally,
-the proof that Verilog is deterministic is given in
-Lemma~\goto{{[}lemma:deterministic{]}}[lemma:deterministic] (\in{Section}[sec:proof:deterministic]).
+\in{Lemma}[lemma:htl_ram] (\in{Section}[sec:proof:ram_insertion]), then the forward simulation
+between HTL and Verilog is shown in \in{Lemma}[lemma:verilog] (\in{Section}[sec:proof:htl_verilog]),
+and finally, the proof that Verilog is deterministic is given in \in{Lemma}[lemma:deterministic]
+(\in{Section}[sec:proof:deterministic]).
\subsection[sec:proof:3ac_htl]{Forward Simulation from 3AC to HTL}
@@ -1023,20 +983,19 @@ requires a larger proof, because the translation from 3AC instructions to Verilo
to be proven correct in this step. In addition to that, the semantics of HTL are also quite
different to the 3AC semantics. Instead of defining small-step semantics for each construct in
Verilog, the semantics are defined over one clock cycle and mirror the semantics defined for
-Verilog. Lemma~\goto{{[}lemma:htl{]}}[lemma:htl] shows the result that needs to be proven in this
-subsection.
+Verilog. \in{Lemma}[lemma:htl] shows the result that needs to be proven in this subsection.
\startlemma[lemma:htl]
Writing \type{tr_htl} for the translation from 3AC to HTL, we have:
\startformula
-\forall c, h, B \in \mono{Safe},\quad \mono{tr\_htl} (c) = \mono{OK} (h) \land c
+\forall c, h, B \in \text{\tt Safe},\quad \text{\tt tr\_htl} (c) = \text{\tt OK} (h) \land c
\Downarrow B \implies h \Downarrow B.
\stopformula
\stoplemma
\startproof
We prove this lemma by first establishing a specification of the translation
-function $\mono{tr\_htl}$ that captures its important properties, and then splitting the proof into
+function $\text{\tt tr\_htl}$ that captures its important properties, and then splitting the proof into
two parts: one to show that the translation function does indeed meet its specification, and one to
show that the specification implies the desired simulation result. This strategy is in keeping with
standard practice.
@@ -1047,27 +1006,33 @@ standard practice.
The specification for the translation of 3AC instructions into HTL data-path and control logic can
be defined by the following predicate:
-%\startformula \yhfunction{spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ i\ \mathit{data}\ \mathit{control} \stopformula
+\startformula \text{\tt spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ i\
+ \mathit{data}\ \mathit{control} \stopformula
Here, the $\mathit{control}$ and $\mathit{data}$ parameters are the statements that the current 3AC
instruction $i$ should translate to. The other parameters are the special registers defined in
Section~\goto{{[}sec:verilog:integrating{]}}[sec:verilog:integrating]. An example of a rule
describing the translation of an arithmetic/logical operation from 3AC is the following:
-%\startformula \inferrule[Iop]{\yhfunction{tr\_op}\ \mathit{op}\ \vec{a} = \yhconstant{OK}\ e}{\yhfunction{spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ (\yhconstant{Iop}\ \mathit{op}\ \vec{a}\ d\ n)\ (d\ \yhkeyword{<=}\ e)\ (\sigma\ \yhkeyword{<=}\ n)} \stopformula
+\startformula \text{\sc Iop }\ \frac{\text{\tt tr\_op}\ \mathit{op}\ \vec{a} = \text{\tt OK}\
+  e}{\text{\tt spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ (\text{\tt Iop}\
+  \mathit{op}\ \vec{a}\ d\ n)\ (d\ \text{\tt <=}\ e)\ (\sigma\ \text{\tt <=}\ n)} \stopformula
Assuming that the translation of the operator $\mathit{op}$ with operands $\vec{a}$ is successful
and results in expression $e$, the rule describes how the destination register $d$ is updated to $e$
via a non-blocking assignment in the data path, and how the program counter $\sigma$ is updated to
point to the next CFG node $n$ via another non-blocking assignment in the control logic.
-In the following lemma, $\mono{spec\_htl}$ is the top-level specification predicate, which is built
-using $\mono{spec\_instr}$ at the level of instructions.
+In the following lemma, $\text{\tt spec\_htl}$ is the top-level specification predicate, which is built
+using $\text{\tt spec\_instr}$ at the level of instructions.
-\reference[lemma:specification]{} If a 3AC program $c$ is translated correctly to an HTL program
-$h$, then the specification of the translation holds.
+\startlemma[lemma:specification]
+ If a 3AC program $c$ is translated correctly to an HTL program $h$, then the specification of the
+ translation holds.
-%\startformula \forall c, h,\quad \yhfunction{tr\_htl} (c) = \yhconstant{OK}(h) \implies \yhfunction{spec\_htl}\ c\ h. \stopformula
+\startformula \forall c, h,\quad \text{\tt tr\_htl} (c) = \text{\tt OK}(h) \implies \text{\tt
+ spec\_htl}\ c\ h. \stopformula
+\stoplemma
\subsubsection[from-specification-to-simulation]{From Specification to Simulation}
@@ -1112,15 +1077,19 @@ HTL state can be represented by the pair $(\Gamma_{r}, \Gamma_{a})$, which captu
all the registers and arrays in the module. Finally, $\mathcal{I}$ stands for the other invariants
that need to hold for the states to match.
-\reference[lemma:simulation_diagram]{} Given the 3AC state $(R,M,\mathit{pc})$ and the matching HTL
-state $(\Gamma_{r}, \Gamma_{a})$, assuming one step in the 3AC semantics produces state
-$(R',M',\mathit{pc}')$, there exist one or more steps in the HTL semantics that result in matching
-states $(\Gamma_{r}', \Gamma_{a}')$. This is all under the assumption that the specification
-$\mono{spec\_{htl}}$ holds for the translation.
+\startlemma[lemma:simulation_diagram]
+ Given the 3AC state $(R,M,\mathit{pc})$ and the matching HTL state $(\Gamma_{r}, \Gamma_{a})$,
+ assuming one step in the 3AC semantics produces state $(R',M',\mathit{pc}')$, there exist one or
+ more steps in the HTL semantics that result in matching states $(\Gamma_{r}', \Gamma_{a}')$. This
+ is all under the assumption that the specification $\text{\tt spec\_{htl}}$ holds for the
+ translation.
+\stoplemma
-{\em Proof sketch.} This simulation diagram is proven by induction over the operational semantics of
+\startproof
+This simulation diagram is proven by induction over the operational semantics of
3AC, which allows us to find one or more steps in the HTL semantics that will produce the same final
-matching state.~◻
+matching state.
+\stopproof
\subsection[sec:proof:ram_insertion]{Forward Simulation of RAM Insertion}
@@ -1158,10 +1127,10 @@ translation proves that the specification holds.
\subsubsection[from-specification-to-simulation-1]{From Specification to Simulation}
Another simulation proof is performed to prove that the insertion of the RAM is semantics
-preserving. As in Lemma~\goto{{[}lemma:simulation_diagram{]}}[lemma:simulation_diagram], we require
-some invariants that always hold at the start and end of the simulation. The invariants needed for
-the simulation of the RAM insertion are quite different to the previous ones, so we can define these
-invariants $\mathcal{I}_{r}$ to be the following:
+preserving. As in \in{Lemma}[lemma:simulation_diagram], we require some invariants that always hold
+at the start and end of the simulation. The invariants needed for the simulation of the RAM
+insertion are quite different to the previous ones, so we can define these invariants
+$\mathcal{I}_{r}$ to be the following:
\startitemize
\item
@@ -1173,17 +1142,19 @@ invariants $\mathcal{I}_{r}$ to be the following:
\stopitemize
The other invariants and assumptions for defining two matching states in HTL are quite similar to
-the simulation performed in Lemma~\goto{{[}lemma:simulation_diagram{]}}[lemma:simulation_diagram],
-such as ensuring that the states have the same value, and that the values in the registers are less
-defined. In particular, the less defined relation matches up all the registers, except for the new
-registers introduced by the RAM.
+the simulation performed in \in{Lemma}[lemma:simulation_diagram], such as ensuring that the states
+have the same value, and that the values in the registers are less defined. In particular, the less
+defined relation matches up all the registers, except for the new registers introduced by the RAM.
-\reference[lemma:htl_ram]{} Given an HTL program, the forward-simulation relation should hold after
-inserting the RAM and wiring the load, store, and control signals.
+\startlemma[lemma:htl_ram]
+ Given an HTL program, the forward-simulation relation should hold after inserting the RAM and
+ wiring the load, store, and control signals.
-%\startformula \begin{aligned}
-% \forall h, h', B \in \mono{Safe},\quad \yhfunction{tr\_ram\_ins}(h) = h' \land h \Downarrow B \implies h' \Downarrow B.
-% \end{aligned} \stopformula
+\startformula
+ \forall h, h', B \in \text{\tt Safe},\quad \text{\tt tr\_ram\_ins}(h) = h' \land h \Downarrow B
+ \implies h' \Downarrow B.
+\stopformula
+\stoplemma
\subsection[sec:proof:htl_verilog]{Forward Simulation from HTL to Verilog}
@@ -1192,41 +1163,53 @@ representation of the code to the case-statement representation. The proof is mo
semantics of a map structure is quite different to that of the case-statement to which it is
converted.
-\reference[lemma:verilog]{} In the following, we write $\mono{tr\_verilog}$ for the translation from
-HTL to Verilog. (Note that this translation cannot fail, so we do not need the constructor here.)
+\startlemma[lemma:verilog]
+  In the following, we write $\text{\tt tr\_verilog}$ for the translation from HTL to Verilog.
+  (Note that this translation cannot fail, so we do not need the $\text{\tt OK}$ constructor here.)
-%\startformula \begin{aligned}
- %\forall h, V, B \in \mono{Safe},\quad \yhfunction{tr\_verilog} (h) = V \land h \Downarrow B \implies V \Downarrow B.
- %\end{aligned} \stopformula
+ \startformula
+ \forall h, V, B \in \text{\tt Safe},\quad \text{\tt tr\_verilog} (h) = V \land h \Downarrow B
+ \implies V \Downarrow B.
-{\em Proof sketch.} The translation from maps to case-statements is done by turning each node of the
-tree into a case-expression containing the same statements. The main difficulty is that a
-random-access structure is being transformed into an inductive structure where a certain number of
-constructors need to be called to get to the correct case.~◻
+ \stopformula
+\stoplemma
+
+\startproof
+ The translation from maps to case-statements is done by turning each node of the
+ tree into a case-expression containing the same statements. The main difficulty is that a
+ random-access structure is being transformed into an inductive structure where a certain number of
+ constructors need to be called to get to the correct case.
+\stopproof
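+
+As a small illustrative example of this translation (the statements $s_{0}$ and $s_{1}$ are
+placeholders, not taken from the actual development), a two-entry map indexed by the state register
+$\sigma$ would be turned into a case-statement as follows:
+\startformula
+  \{\, 0 \mapsto s_{0},\ 1 \mapsto s_{1} \,\} \quad\longrightarrow\quad
+  \text{\tt case (}\sigma\text{\tt )}\ 0: s_{0};\ 1: s_{1};\ \text{\tt endcase}
+\stopformula
+Looking up an entry in the map is a random access, whereas evaluating the case-statement means
+descending past the earlier branches, which is what makes the proof more involved.
+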
\subsection[sec:proof:deterministic]{Deterministic Verilog Semantics}
The final lemma we need is that the Verilog semantics is deterministic. This result allows us to
replace the forward simulation we have proved with the backward simulation we desire.
-\reference[lemma:deterministic]{} If a Verilog program $V$ admits behaviours $B_1$ and $B_2$, then
+\startlemma[lemma:deterministic]
+If a Verilog program $V$ admits behaviours $B_1$ and $B_2$, then
$B_1$ and $B_2$ must be the same.
-%\startformula \forall V, B_{1}, B_{2},\quad V \Downarrow B_{1} \land V \Downarrow B_{2} \implies B_{1} = B_{2}. \stopformula
+ \startformula
+ \forall V, B_{1}, B_{2},\quad V \Downarrow B_{1} \land V \Downarrow B_{2} \implies B_{1} =
+ B_{2}.
+ \stopformula
+\stoplemma
-{\em Proof sketch.} The Verilog semantics is deterministic because the
-order of operation of all the constructs is defined, so there is only
-one way to evaluate the module, and hence only one possible behaviour.
-This was proven for the small-step semantics shown in
-Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module].~◻
+\startproof
+  The Verilog semantics is deterministic because the order of evaluation of all the constructs is
+  defined, so there is only one way to evaluate the module, and hence only one possible behaviour.
+  This was proven for the small-step semantics shown in
+  \in{Figure}[fig:inference_module].
+\stopproof
\subsection[coq-mechanisation]{Coq Mechanisation}
\startplacetable[reference=tab:proof-statistics]
\starttabulate[|l|r|r|r|r|r|]
- \HL
- \NC \NC {\bf Coq code} \NC \NC {\bf Spec} \NC \NC {\bf Total} \NC \NR
- \HL
+ \FL
+  \NC \NC {\bf Coq code} \NC {\bf OCaml code} \NC {\bf Spec} \NC {\bf Proofs} \NC {\bf Total} \NC \NR
+ \ML
\NC Data structures and libraries \NC 280 \NC --- \NC --- \NC 500 \NC 780 \NC \NR
\NC Integers and values \NC 98 \NC --- \NC 15 \NC 968 \NC 1081 \NC \NR
\NC HTL semantics \NC 50 \NC --- \NC 181 \NC 65 \NC 296 \NC \NR
@@ -1235,9 +1218,9 @@ Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module].~◻
\NC Verilog semantics \NC 78 \NC --- \NC 431 \NC 235 \NC 744 \NC \NR
\NC Verilog generation \NC 104 \NC --- \NC --- \NC 486 \NC 590 \NC \NR
\NC Top-level driver, pretty printers \NC 318 \NC 775 \NC --- \NC 189 \NC 1282 \NC \NR
- \HL
+ \LL
\NC {\bf Total} \NC 1721 \NC 775 \NC 693 \NC 8355 \NC 11544 \NC \NR
- \HL
+ \BL
\stoptabulate
\stopplacetable
diff --git a/chapters/introduction.tex b/chapters/introduction.tex
index 8588d3f..7ab9d47 100644
--- a/chapters/introduction.tex
+++ b/chapters/introduction.tex
@@ -1,52 +1,57 @@
+\environment fonts_env
+\environment lsr_env
+
\startcomponent introduction
\chapter{Introduction}
-\startthesis
- An optimising \infull{HLS} tool can be proven correct using an interactive theorem prover while
- also remaining practical.
-\stopthesis
-
-\noindent As latency, throughput, and energy efficiency become increasingly important, custom
-hardware accelerators are being designed for numerous applications. Alas, designing these
-accelerators can be a tedious and error-prone process using a hardware description language (HDL)
-such as Verilog. An attractive alternative is \emph{\infull{HLS}} (\HLS), in which hardware designs
-are automatically compiled from software written in a high-level language like C. Modern \HLS\
-tools such as LegUp~\cite{canis11_legup}, Vivado HLS~\cite{xilinx20_vivad_high_synth}, Intel
-i++~\cite{intel20_high_synth_compil}, and Bambu HLS~\cite{pilato13_bambu} promise designs with
-comparable performance and energy-efficiency to those hand-written in an
-HDL~\cite{homsirikamol14_can, gauthier20_high_level_synth, pelcat16_desig_hdl}, while offering the
-convenient abstractions and rich ecosystems of software development.
+As latency, throughput, and energy efficiency become increasingly important, custom hardware
+accelerators are being designed for numerous applications. Alas, designing these accelerators can
+be a tedious and error-prone process using a \HDL\ such as Verilog. An attractive alternative is
+\HLS, in which hardware designs are automatically compiled from software written in a high-level
+language like C. Modern \HLS\ tools such as LegUp~\cite{canis11_legup}, Vivado
+HLS~\cite{xilinx20_vivad_high_synth}, Intel i++~\cite{intel20_high_synth_compil}, and Bambu
+HLS~\cite{pilato13_bambu} promise designs with comparable performance and energy-efficiency to those
+hand-written in an HDL~\cite{homsirikamol14_can, gauthier20_high_level_synth, pelcat16_desig_hdl},
+while offering the convenient abstractions and rich ecosystems of software development.
There are two main use-cases of \HLS. Firstly, it is a very convenient tool to use for prototyping
designs, as this means that the software model can be reused for the initial design of the hardware.
Secondly, \HLS\ is also often used to design hardware that has quite regular control-flow and is
-compute intensive, one example being DSP hardware design.
-
-But existing HLS tools cannot always guarantee that the hardware designs they produce are equivalent
-to the software they were given, and this undermines any reasoning conducted at the software level.
+compute intensive, one example being \DSP\ hardware design~\cite[coussy08_gaut]. Ideally, \HLS\
+should also be well suited to projects that require detailed functional verification, as manual
+\HDL\ designs often cannot be fully verified because of their size. \HLS\ would move the functional
+verification problem to a higher level, allowing for a much deeper understanding of the functional
+behaviour of the design. A recent survey by \cite[authoryear][lahti19_are_we_there_yet] found that
+verification remains a time-consuming part of the design process even with the use of \HLS,
+although in general \HLS\ still reduced the verification effort by half. However, most of the
+surveyed papers did not mention the verification of designs at all, so verification remains an
+underexplored area of \HLS\ research.
-Indeed, there are reasons to doubt that HLS tools actually \emph{do} always preserve equivalence.
-For instance, Vivado HLS has been shown to apply pipelining optimisations
+This is especially important because existing \HLS\ tools cannot always guarantee that the hardware
+designs they produce are equivalent to the software they were given, and this undermines any
+reasoning conducted at the software level. Indeed, there are reasons to doubt that \HLS\ tools
+actually \emph{do} always preserve equivalence. For instance, Vivado \HLS\ has been shown to apply
+pipelining optimisations
incorrectly\footnote{\goto{bit.ly/vivado-hls-pipeline-bug}[url(https://bit.ly/vivado-hls-pipeline-bug)]}
or to silently generate wrong code should the programmer stray outside the fragment of C that it
supports.\footnote{\goto{bit.ly/vivado-hls-pointer-bug}[url(https://bit.ly/vivado-hls-pointer-bug)]}
Meanwhile, \cite{lidbury15_many_core_compil_fuzzin} had to abandon their attempt to fuzz-test
Altera's (now Intel's) OpenCL compiler since it \quotation{either crashed or emitted an internal
compiler error} on so many of their test inputs. More recently,
-\citef{herklotz21_empir_study_reliab_high_level_synth_tools} fuzz-tested three commercial HLS tools
-using Csmith~\cite{yang11_findin_under_bugs_c_compil}, and despite restricting the generated
+\citef{herklotz21_empir_study_reliab_high_level_synth_tools} fuzz-tested three commercial \HLS\
+tools using Csmith~\cite{yang11_findin_under_bugs_c_compil}, and despite restricting the generated
programs to the C fragment explicitly supported by all the tools, they still found that on average
2.5\% of test-cases were compiled to designs that behaved incorrectly.
-\paragraph{Existing workarounds.} Aware of the reliability shortcomings of HLS tools, hardware
-designers routinely check the generated hardware for functional correctness. This is commonly done
-by simulating the generated design against a large test-bench. But unless the test-bench covers all
-inputs exhaustively -- which is often infeasible -- there is a risk that bugs remain.
+Aware of the reliability shortcomings of \HLS\ tools, hardware designers routinely check the
+generated hardware for functional correctness. This is commonly done by simulating the generated
+design against a large test-bench. But unless the test-bench covers all inputs exhaustively -- which
+is often infeasible -- there is a risk that bugs remain.
One alternative is to use \emph{translation validation}~\cite{pnueli98_trans} to prove equivalence
between the input program and the output design. Translation validation has been successfully
-applied to several HLS
+applied to several \HLS\
optimisations~\cite{kim04_autom_fsmd,karfa06_formal_verif_method_sched_high_synth,chouksey20_verif_sched_condit_behav_high_level_synth,banerjee14_verif_code_motion_techn_using_value_propag,chouksey19_trans_valid_code_motion_trans_invol_loops}.
Nevertheless, it is an expensive task, especially for large designs, and it must be repeated every
time the compiler is invoked. For example, the translation validation for Catapult
@@ -54,9 +59,54 @@ C~\cite{mentor20_catap_high_level_synth} may require several rounds of expert
`adjustments'~\cite[alternative=authoryear,right={, p.~3)}][chauhan20_formal_ensur_equiv_c_rtl] to
the input C program before validation succeeds. And even when it succeeds, translation validation
does not provide watertight guarantees unless the validator itself has been mechanically proven
-correct~\cite[tristan08_formal_verif_trans_valid], which has not been the case in HLS tools to date.
+correct~\cite[tristan08_formal_verif_trans_valid], which has not been the case in \HLS\ tools to
+date.
+
+The main thesis of this dissertation is therefore the following.
+
+\startthesis
+ An optimising high-level synthesis tool can be proven correct using an interactive theorem prover
+ while also remaining practical.
+\stopthesis
+
+Our position is that none of these workarounds are necessary if the \HLS\ tool can be
+trusted, and is proven to produce correct designs. This dissertation will describe the
+implementation of a mechanically verified \HLS\ tool called Vericert, including two critical \HLS\
+optimisations to make Vericert practical. Vericert is fully open source and available on GitHub at
+
+\blank[big]
+\startalignment[center]
+ \goto{\hyphenatedurl{https://github.com/ymherklotz/vericert}}[url(https://github.com/ymherklotz/vericert)].
+\stopalignment
+\blank[big]
+
+A snapshot of the Vericert development is also available in a Zenodo
+repository~\citef{herklotz21_veric}.
+
+\section{Outline and Contributions}
+
+The dissertation is organised into the following chapters:
+
+\desc{\in{Chapter}[sec:background]} covers background information about high-level synthesis, static
+scheduling, loop pipelining (modulo scheduling) and dynamic scheduling.
+
+\desc{\in{Chapter}[sec:hls]} describes the initial implementation of Vericert without any
+optimisations. It introduces a new intermediate language called HTL, a state-machine language whose
+structure is closer to the final hardware implementation, but which does not yet have to be valid
+Verilog syntax.\footnote{In fact, HTL implements a \FSMD\ to simplify the state-machine
+  representation and reduce the number of states.} This chapter also describes the Verilog
+semantics that were chosen as a target.
+
+\desc{\in{Chapter}[sec:scheduling]} describes the implementation of static hyperblock scheduling,
+making use of validation to prove it correct.
+
+\desc{\in{Chapter}[sec:pipelining]} describes the current plan to implement loop pipelining in
+Vericert by also using a validator.
+
+\desc{\in{Chapter}[sec:dynamic]} describes a possible extension to Vericert by implementing dynamic
+scheduling instead of the current static scheduling implementation. This relies on ongoing work in
+a fork of CompCert which implements \SSA\ as an intermediate language.
-Our position is that none of the above workarounds are necessary if the HLS tool can simply be
-trusted to work correctly.
+\desc{\in{Chapter}[sec:schedule]} describes the current implementation timeline.
\stopcomponent
diff --git a/chapters/schedule.tex b/chapters/schedule.tex
index 49d5251..bf62c59 100644
--- a/chapters/schedule.tex
+++ b/chapters/schedule.tex
@@ -42,7 +42,7 @@ proof.
Vericert to pass through \SSA, as well as GSA, and finally generate dynamic HTL (or some other
similar hardware language). During my visit at Inria Rennes, I worked with Sandrine Blazy and
Delphine Demange to extend CompCertSSA~\cite[barthe14_formal_verif_ssa_based_middl_end_compc] with
-\infull{GSA} (\GSA)~\cite[ottenstein90_progr_depen_web]. Not only should this allow for global
+\GSA~\cite[ottenstein90_progr_depen_web]. Not only should this allow for global
optimisations of \SSA\ code, but it should also allow for a more straightforward translation to
hardware as the GSA code can just be viewed as a \DFG. Instead of working on modulo scheduling, it
could therefore be interesting to work on a formally verified dynamic scheduling translation.
diff --git a/chapters/scheduling.tex b/chapters/scheduling.tex
index 662d877..15850ce 100644
--- a/chapters/scheduling.tex
+++ b/chapters/scheduling.tex
@@ -51,12 +51,10 @@ flexible scheduling operations that could be beneficial to various different bac
the key points of this paper are the following:
\startitemize
-\item
-Description of how hyperblocks are introduced for improved scheduling of instructions, and how these
-can be built and translated back to instructions without predicates.
-\item
-Description of the implementation and correctness proof of a \infull{SDC} (\SDC) scheduling
-algorithm, which can be adapted to various different scheduling strategies.
+\item Description of how hyperblocks are introduced for improved scheduling of instructions, and how
+ these can be built and translated back to instructions without predicates.
+\item Description of the implementation and correctness proof of a \SDC\ scheduling algorithm, which
+  can be adapted to various scheduling strategies.
\stopitemize
\section[sec:scheduling]{Implementation of Hyperblocks in CompCert}
@@ -322,8 +320,8 @@ proven correct in Coq to prove the scheduling translation correct. The SAT solv
be verified, because it could otherwise not be used in the proof of correctness.
The DPLL algorithm is used to implement the SAT solver. This means that the SAT solver takes in a
-\infull{CNF} (\CNF) formula. The conversion from a recursive predicate type into a \CNF formula is
-done by recursively expanding the predicate in equivalent \CNF formulas.
+\CNF\ formula. The conversion from a recursive predicate type into a \CNF\ formula is done by
+recursively expanding the predicate into equivalent \CNF\ formulas.
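+To make this concrete, the following is a minimal, self-contained Coq sketch of such an expansion.
+The predicate type, the list-of-clauses \CNF\ representation and all of the names are illustrative
+placeholders, not the actual Vericert definitions.
+
+\starthlcoq
+Require Import Coq.Lists.List.
+Import ListNotations.
+
+(* A cut-down predicate type: literals carry a polarity and a variable
+   number; conjunction and disjunction are the only connectives. *)
+Inductive pred : Type :=
+| Plit (polarity : bool) (var : nat)
+| Pand (a b : pred)
+| Por  (a b : pred).
+
+Definition lit    := (bool * nat)%type.
+Definition clause := list lit.    (* disjunction of literals *)
+Definition cnf    := list clause. (* conjunction of clauses  *)
+
+(* Distribute a disjunction over two CNF formulas:
+   (c1 /\ ... /\ cn) \/ B  =  (c1 \/ B) /\ ... /\ (cn \/ B). *)
+Fixpoint cnf_or (a b : cnf) : cnf :=
+  match a with
+  | [] => []
+  | c :: a' => map (fun c' => c ++ c') b ++ cnf_or a' b
+  end.
+
+(* Recursively expand a predicate into an equivalent CNF formula. *)
+Fixpoint to_cnf (p : pred) : cnf :=
+  match p with
+  | Plit pol v => [[(pol, v)]]
+  | Pand a b   => to_cnf a ++ to_cnf b
+  | Por a b    => cnf_or (to_cnf a) (to_cnf b)
+  end.
+\stophlcoq
+
+The key step is \type{cnf_or}, which distributes a disjunction over the clauses of both operands;
+this naive expansion can grow exponentially, so it only illustrates the idea of the recursive
+expansion rather than an efficient implementation.
+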
The SAT solver is implemented in Coq and proven to be sound and complete. The soundness and
completeness of an algorithm to check whether a predicate is satisfiable or not also directly
@@ -413,8 +411,7 @@ Section~\goto{3}[abstr_interp], the formally verified SAT solver described in
Section~\goto{4.2}[verified_sat] can be used to prove that if the two predicated expression lists are
equivalent, they will always evaluate to the same value.
-\section[implementation-details-of-static-sdc-scheduling]{Implementation Details of Static \SDC
- Scheduling}
+\section[implementation-details-of-static-sdc-scheduling]{\SDC\ Scheduling Implementation Details}
This section describes the implementation details of the static \SDC\ scheduling algorithm used, and
also describes some of the details of the basic block generation as well as the if-conversion pass
@@ -437,9 +434,10 @@ that do not support predicates, which are most the default back ends included in
\subsection[static-sdc-scheduling]{Static \SDC\ Scheduling}
-\Word{\infull{SDC}} (\SDC) scheduling~ is a flexible algorithm to perform scheduling of
-instructions, especially if the target is hardware directly. It allows for flexible definitions of
-optimisation functions in terms of latencies and dependencies of the operations in the basic block.
+\Word{\infull{SDC}} (\SDC) scheduling~\cite[cong06_sdc] is a flexible algorithm for scheduling
+instructions, especially when directly targeting hardware. It allows the optimisation function to
+be defined flexibly in terms of the latencies and dependencies of the operations in the basic
+block.
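+
+As a small worked example of the kind of constraint involved (the variable names and the latency
+are illustrative), if an operation $j$ uses the result of an operation $i$ that takes $l_{i}$
+cycles, the scheduler adds the difference constraint
+\startformula
+  s_{i} - s_{j} \leq -l_{i},
+\stopformula
+where $s_{i}$ and $s_{j}$ are the start cycles assigned to the two operations; the chosen objective
+function is then optimised subject to the conjunction of all such constraints.
+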
\section[performance-comparison]{Performance Comparison}
diff --git a/figures/timing-1.tex b/figures/timing-1.tex
new file mode 100644
index 0000000..914576d
--- /dev/null
+++ b/figures/timing-1.tex
@@ -0,0 +1,27 @@
+\documentclass{standalone}
+
+\usepackage{tikz}
+\usepackage{tikz-timing}
+\usepackage{newpxtext}
+\usepackage{newpxmath}
+
+\begin{document}
+
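+% Timing diagram of a load through the memory interface: addr is set to 3 with
+% wr_en low, u_en is toggled to signal the request, d_out returns 0xDEADBEEF one
+% cycle later, and r registers the value the cycle after that.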
+\begin{tikztimingtable}[timing/d/background/.style={fill=white}]
+ \small clk & 2L 3{6C} \\
+ \small u\_en & 2D{u\_en} 18D{$\overline{\texttt{u\_en}}$}\\
+ \small addr & 2U 18D{3} \\
+ \small wr\_en & 2U 18L \\
+ \small en & 8D{u\_en} 12D{$\overline{\texttt{u\_en}}$}\\
+ \small d\_out & 8U 12D{0xDEADBEEF} \\
+ \small r & 14U 6D{0xDEADBEEF} \\
+ \extracode
+ \node[help lines] at (2,2.25) {\tiny 1};
+ \node[help lines] at (8,2.25) {\tiny 2};
+ \node[help lines] at (14,2.25) {\tiny 3};
+ \begin{pgfonlayer}{background}
+ \vertlines[help lines]{2,8,14}
+ \end{pgfonlayer}
+\end{tikztimingtable}
+
+\end{document}
diff --git a/figures/timing-2.tex b/figures/timing-2.tex
new file mode 100644
index 0000000..dea9b14
--- /dev/null
+++ b/figures/timing-2.tex
@@ -0,0 +1,24 @@
+\documentclass{standalone}
+
+\usepackage{tikz}
+\usepackage{tikz-timing}
+\usepackage{newpxtext}
+\usepackage{newpxmath}
+
+\begin{document}
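+% Timing diagram of a store through the memory interface: wr_en is held high with
+% addr = 3 and d_in = 0xDEADBEEF; once en follows u_en a cycle later, stack[addr]
+% is updated with the value.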
+\begin{tikztimingtable}[timing/d/background/.style={fill=white}]
+ \small clk & 2L 2{7C} \\
+ \small u\_en & 2D{u\_en} 14D{$\overline{\texttt{u\_en}}$}\\
+ \small addr & 2U 14D{3} \\
+ \small wr\_en & 2U 14H \\
+ \small d\_in & 2U 14D{0xDEADBEEF} \\
+ \small en & 9D{u\_en} 7D{$\overline{\texttt{u\_en}}$}\\
+ \small stack[addr] & 9U 7D{0xDEADBEEF} \\
+ \extracode
+ \node[help lines] at (2,2.25) {\tiny 1};
+ \node[help lines] at (9,2.25) {\tiny 2};
+ \begin{pgfonlayer}{background}
+ \vertlines[help lines]{2,9}
+ \end{pgfonlayer}
+\end{tikztimingtable}
+\end{document}
diff --git a/lsr_env.tex b/lsr_env.tex
index 86984e3..547f786 100644
--- a/lsr_env.tex
+++ b/lsr_env.tex
@@ -29,8 +29,8 @@
\setuppapersize[A4][A4]
\setupbodyfont[ebgaramondlato,11pt]
%\setupalign[hz,hanging,nothyphenated]
-\setupalign[hz,hanging]
-\setuptolerance[stretch,tolerant]
+\setupalign[hz,hanging,lesshyphenation,verytolerant]
+\setupinterlinespace[big]
\setuppagenumbering[location=none]
@@ -62,32 +62,31 @@
]
\startsectionblockenvironment [backpart]
-
-\setuplayout[wide]
-\setupheads[chapter][
- after={\blank[2*big]},
- before={\blank[big,force]},
- style={\bfd},
- header=empty,
- align=flushleft,
-]
-
-\setupfootertexts[pagenumber]
-
+ \setuplayout[wide]
+ \setupheads[chapter][
+ after={\blank[2*big]},
+ before={\blank[big,force]},
+ style={\bfd},
+ header=empty,
+ align=flushleft,
+ ]
+ \setupheadertexts[margin][][]
+ \setupheadertexts[text][][]
+ \setupfootertexts[pagenumber]
\stopsectionblockenvironment
\startsectionblockenvironment [frontpart]
-\setuplayout[wide]
-\setupheads[chapter][
- after={\blank[2*big]},
- before={\blank[big,force]},
- style={\bfd},
- header=empty,
- align=flushleft,
-]
-
-\setupfootertexts[pagenumber]
-
+ \setuplayout[wide]
+ \setupheads[chapter][
+ after={\blank[2*big]},
+ before={\blank[big,force]},
+ style={\bfd},
+ header=empty,
+ align=flushleft,
+ ]
+ \setupheadertexts[margin][][]
+ \setupheadertexts[text][][]
+ \setupfootertexts[pagenumber]
\stopsectionblockenvironment
% @see http://wiki.contextgarden.net/Command/setuppagenumbering
@@ -157,12 +156,13 @@
% before={\blank[4*big,force]},
%]
-\startsectionblockenvironment [bodypart]
-\setuplayout[reset]
-
\setupheadertexts[margin][][{{\it \getmarking[chapter]}\wordright{\pagenumber}}]
\setupheadertexts[text][][{\someheadnumber[chapter][current]}]
+\startsectionblockenvironment [bodypart]
+\setuplayout[reset]
+%\setupinterlinespace[3.8ex]
+
\setcounter[userpage][1]
\stopsectionblockenvironment
@@ -173,7 +173,7 @@
% @see http://wiki.contextgarden.net/Command/setupindenting
% @see http://wiki.contextgarden.net/Indentation
% Indent all paragraph after all section headers.
-\setupindenting[yes,medium,next]
+\setupindenting[yes,medium]
%\setupheads[section, subsection][
% indentnext=yes,
% numberwidth=1.27cm,
@@ -244,6 +244,7 @@
% Abbreviation
% ==========================================================================
% @see https://tex.stackexchange.com/a/389791/141902
+
\definehspace[abbrwidth][1em]
\definesynonyms[abbreviation][abbreviations][\infull]
\setupsynonyms[abbreviation][
@@ -254,6 +255,26 @@
style={--\hspace[abbrwidth]},
character=normal,
]
+\setupsynonyms[abbreviation][alternative=last]
+
+\abbreviation{SSA}{static single assignment}
+\abbreviation{GSA}{gated-SSA}
+\abbreviation{HDL}{hardware description language}
+\abbreviation{HLS}{high-level synthesis}
+\abbreviation{FPGA}{field-programmable gate array}
+\abbreviation{DFG}{data-flow graph}
+\abbreviation{CFG}{control-flow graph}
+\abbreviation{CDFG}{control- and data-flow graph}
+\abbreviation{SDC}{system of difference constraints}
+\abbreviation{CNF}{conjunctive normal form}
+\abbreviation{SAT}{satisfiability}
+\abbreviation{SMT}{satisfiability modulo theories}
+\abbreviation{DSP}{digital signal processing}
+\abbreviation{ITP}{interactive theorem prover}
+\abbreviation{FSMD}{finite-state machine with data-path}
+\abbreviation{II}{initiation interval}
+\abbreviation{ASAP}{as soon as possible}
+\abbreviation{ALAP}{as late as possible}
% ==========================================================================
% Appendices
@@ -313,6 +334,7 @@
\definevimtyping [hlcoq] [syntax=coq]
\definevimtyping [hlocaml] [syntax=ocaml]
\definevimtyping [hlverilog] [syntax=verilog]
+\definevimtyping [hlC] [syntax=C]
% ==========================================================================
% Tikz
@@ -339,24 +361,25 @@
\hyphenation{de-riche}
\definedescription[desc][
- headstyle=bold, style=normal, align={yes,tolerant}, alternative=hanging,
- width=broad, margin=1cm]
+ headstyle=bold,
+ style=normal,
+ align={yes,tolerant},
+ alternative=hanging,
+ width=broad,
+ margin=0.5cm,
+ distance=0.25cm,
+ stretch=0.05,
+ shrink=0.05]
-\definesynonyms[abbreviation][abbreviations][\infull]
+\setupmargindata[stack=yes]
-\abbreviation{SSA}{single static assignment}
-\abbreviation{GSA}{gated-SSA}
-\abbreviation{HLS}{high-level synthesis}
-\abbreviation{FPGA}{field-programmable gate array}
-\abbreviation{DFG}{data-flow graph}
-\abbreviation{CFG}{control-flow graph}
-\abbreviation{CDFG}{control- and data-flow graph}
-\abbreviation{SDC}{system of difference constraints}
-\abbreviation{CNF}{conjunctive normal form}
-\abbreviation{SAT}{satisfiability}
-\abbreviation{SMT}{satisfiability modulo scheduling}
+\definemargindata[inrightm][right][
+ align={yes,verytolerant,stretch},
+ style={\rmx\setupinterlinespace[reset]},
+ width=\rightmarginwidth,
+]
-\define[1]\citef{\cite[#1]\footnote{\cite[entry][#1]}}
+\define[1]\citef{\cite[#1]\inrightm{\cite[entry][#1]\blank}}
\usemodule[gantt-s-tikz]
@@ -369,14 +392,17 @@
headstyle=bold,
titlestyle=bold,
way=bychapter,
- conversion=numbers]
+ conversion=numbers,
+ stretch=0,
+ shrink=0]
\defineenumeration
[thesis]
[ text=Thesis,
number=no,
+ before={\blank[big]\startnarrower[middle]},
+ after={\stopnarrower\blank[big]},
headalign=center,
- alternative=top,
style=italic]
\defineenumeration
@@ -420,4 +446,13 @@
after={\stopnarrower
\blank[big]}]
+\setupnarrower[indentnext=no]
+
+\definestartstop
+ [synopsis]
+ [before={\blank[big]\startnarrower\startstyle[it]},
+ after={\stopstyle\stopnarrower\blank[big]\indenting[next]}]
+
+\setupfootnotes[align={stretch,verytolerant,hz,hanging,hyphenated}]
+
\stopenvironment
diff --git a/main.tex b/main.tex
index c9ee69b..3d3a974 100644
--- a/main.tex
+++ b/main.tex
@@ -14,11 +14,13 @@
% \showframe
% \showbodyfontenvironment
%\showbodyfont
+%\showjustification
\component title
\startfrontmatter
\completecontent
+ \completelistofabbreviations
\stopfrontmatter
\startbodymatter