1 files changed, 369 insertions, 386 deletions
diff --git a/chapters/hls.tex b/chapters/hls.tex
index 18227e6..fafb815 100644
--- a/chapters/hls.tex
+++ b/chapters/hls.tex
@@ -9,9 +9,11 @@
 
 \chapter[sec:hls]{High-Level Synthesis}
 
-This chapter outlines the base implementation of a formally verified \HLS\ tool called
-Vericert~\cite{herklotz21_veric}.  This chapter is based on a paper describing the initial
-implementation in detail~\citef{herklotz21_formal_verif_high_level_synth}.
+\startsynopsis
+  This chapter outlines the base implementation of a formally verified high-level synthesis tool
+  called Vericert~\citef{herklotz21_veric}.  This chapter is based on a paper describing the initial
+  implementation in detail~\citef{herklotz21_formal_verif_high_level_synth}.
+\stopsynopsis
 
 We have designed a new HLS tool in the Coq theorem prover and proved that any output design it
 produces always has the same behaviour as its input program. Our tool, called Vericert, is
@@ -55,17 +57,6 @@ function calls, non-32-bit integers, floats, and global variables.
   that its correctness theorem is watertight.
 \stopitemize
 
-\paragraph{Companion material.} Vericert is fully open source and available on GitHub at
-
-\blank[big]
-\startalignment[center]
-  \goto{\hyphenatedurl{https://github.com/ymherklotz/vericert}}[url(https://github.com/ymherklotz/vericert)].
-\stopalignment
-\blank[big]
-
-A snapshot of the Vericert development is also available in a Zenodo
-repository~\citef{herklotz21_veric}.
-
 \section[sec:hls:design]{Designing a Verified HLS Tool}
 
 This section describes the main architecture of the HLS tool, and the way in which the Verilog back
@@ -99,7 +90,7 @@ K\^{o}ika~\cite{bourgeat20_essen_blues}.
 \paragraph{Choice of implementation language.} We chose Coq as the implementation language because
 of its mature support for code extraction; that is, its ability to generate OCaml programs directly
 from the definitions used in the theorems.  We note that other authors have had some success
-reasoning about the HLS process using other theorem provers such as Isabelle~\cite{ellis08}.
+reasoning about the HLS process using other theorem provers such as Isabelle~\cite{ellis08_correc}.
 CompCert~\cite{leroy09_formal_verif_realis_compil} was chosen as the front end because it has a well
 established framework for simulation proofs about intermediate languages, and it already provides a
 validated C parser~\cite{jourdan12_valid_lr_parser}.  The Vellvm
@@ -196,11 +187,10 @@ Vericert, this event is either a positive (rising) or a negative (falling) clock
 always-blocks triggering on the same event are executed in parallel. Always-blocks can also express
 control-flow using if-statements and case-statements.
 
-\startplacefigure[reference={fig:tutorial:state_machine},location=top]
-  \setupinterlinespace[line=2.8ex]
+\startplacemarginfigure[reference={fig:tutorial:state_machine},location=top,title={Hello}]
   \startfloatcombination [nx=2, ny=1]
     \startplacesubfigure
-      \startframedtext[width={0.55\textwidth},frame=off,offset=none,bodyfont=11pt]
+      \startframedtext[width={0.9\textwidth},frame=off,offset=none,loffset=3cm,bodyfont=11pt]
       \starthlverilog
          module main(input rst, input y,
                      input clk, output reg z);
@@ -243,7 +233,7 @@ control-flow using if-statements and case-statements.
       \stoptikzpicture}
     \stopplacesubfigure
   \stopfloatcombination
-\stopplacefigure
+\stopplacemarginfigure
 
 A simple state machine can be implemented as shown in Fig.~\ref{fig:tutorial:state_machine}.  At
 every positive edge of the clock (\mono{clk}), both of the always-blocks will trigger
@@ -258,106 +248,98 @@ will reset the value of \mono{tmp} and then set \mono{z} to the original value o
 nonblocking assignment does not change its value until the end of the clock cycle.  Finally, the
 last always-block will set the state to 0 again.
 
-\startplacefigure[location=here,reference={hello}]
-  \startfloatcombination[nx=2,ny=2]
-    \startplacesubfigure[title={1}]
-      \externalfigure[hacker.jpg]
+\startplacemarginfigure[location=top,reference={fig:accumulator_c_rtl},title={Translating a simple
+    program from C to Verilog.}]
+  \startfloatcombination[nx=2,ny=1]
+    \startplacefigure[number=no]
+    \startfloatcombination[nx=1,ny=2]
+    \startplacesubfigure[title={Example C code passed to Vericert.}]
+      \startframedtext[width={0.4\textwidth},frame=off,offset=none,bodyfont=11pt]
+        \starthlC
+          int main() {
+              int x[2] = {3, 6};
+              int i = 1;
+              return x[i];
+          }
+        \stophlC
+      \stopframedtext
     \stopplacesubfigure
-    \startplacesubfigure[title={2}]
-      \externalfigure[hacker.jpg]
+    \startplacesubfigure[title={3AC produced by the CompCert front-end without any optimisations.}]
+      \startframedtext[width={0.4\textwidth},frame=off,offset=none,bodyfont=11pt]
+      \starthlC
+      main() {
+        x5 = 3
+        int32[stack(0)] = x5
+        x4 = 6
+        int32[stack(4)] = x4
+        x1 = 1
+        x3 = stack(0) (int)
+        x2 = int32[x3 + x1
+                   * 4 + 0]
+        return x2
+      }
+      \stophlC
+      \stopframedtext
     \stopplacesubfigure
-    \startplacesubfigure[title={3}]
-      \externalfigure[hacker.jpg]
-    \stopplacesubfigure
-    \startplacesubfigure[]
-      \externalfigure[hacker.jpg]
+    \stopfloatcombination
+    \stopplacefigure
+    \startplacesubfigure[title={Verilog produced by Vericert. It demonstrates the instantiation of
+        the RAM (lines 9--15), the data-path (lines 16--32) and the control logic (lines 33--42).}]
+      \startframedtext[width={\textwidth},frame=off,offset=none,bodyfont=10pt]
+      \starthlverilog
+module main(reset, clk, finish, return_val);
+  input [0:0] reset, clk;
+  output reg [0:0] finish = 0;
+  output reg [31:0] return_val = 0;
+  reg [31:0] reg_3 = 0, addr = 0, d_in = 0,
+             reg_5 = 0, wr_en = 0;
+  reg [0:0] en = 0, u_en = 0;
+  reg [31:0] state = 0, reg_2 = 0, reg_4 = 0,
+             d_out = 0, reg_1 = 0;
+  reg [31:0] stack [1:0];
+  // RAM interface
+  always @(negedge clk)
+    if ({u_en != en}) begin
+      if (wr_en) stack[addr] <= d_in;
+      else d_out <= stack[addr];
+      en <= u_en;
+    end
+  // Data-path
+  always @(posedge clk)
+    case (state)
+      32'd11: reg_2 <= d_out;
+      32'd8: reg_5 <= 32'd3;
+      32'd7: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
+                   d_in <= reg_5; addr <= 32'd0; end
+      32'd6: reg_4 <= 32'd6;
+      32'd5: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
+                   d_in <= reg_4; addr <= 32'd1; end
+      32'd4: reg_1 <= 32'd1;
+      32'd3: reg_3 <= 32'd0;
+      32'd2: begin u_en <= ( ~ u_en); wr_en <= 32'd0;
+                   addr <= {{{reg_3 + 32'd0}
+                           + {reg_1 * 32'd4}}
+                           / 32'd4};
+             end
+      32'd1: begin finish = 32'd1; return_val = reg_2; end
+      default: ;
+    endcase
+  // Control logic
+  always @(posedge clk)
+    if ({reset == 32'd1}) state <= 32'd8;
+    else case (state)
+           32'd11: state <= 32'd1;      32'd4: state <= 32'd3;
+           32'd8: state <= 32'd7;       32'd3: state <= 32'd2;
+           32'd7: state <= 32'd6;       32'd2: state <= 32'd11;
+           32'd6: state <= 32'd5;       32'd1: ;
+           32'd5: state <= 32'd4;       default: ;
+         endcase
+endmodule
+      \stophlverilog
+      \stopframedtext
     \stopplacesubfigure
   \stopfloatcombination
-\stopplacefigure
-
-\in{Figure}{b}[hello]
-
-%\begin{figure}
-%  \centering
-%  \begin{subfigure}[b]{0.3\linewidth}
-%    \begin{subfigure}[t]{1\linewidth}
-%\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
-%int main() {
-%    int x[2] = {3, 6};
-%    int i = 1;
-%    return x[i];
-%}
-%\end{minted}
-%      \caption{Example C code passed to Vericert.}\label{fig:accumulator_c}
-%    \end{subfigure}\\\vspace{3em}
-%    \begin{subfigure}[b]{1\linewidth}
-%\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
-%main() {
-%    x5 = 3
-%    int32[stack(0)] = x5
-%    x4 = 6
-%    int32[stack(4)] = x4
-%    x1 = 1
-%    x3 = stack(0) (int)
-%    x2 = int32[x3 + x1
-%               * 4 + 0]
-%    return x2
-%}
-%\end{minted}
-%      \caption{3AC produced by the CompCert front-end without any optimisations.}\label{fig:accumulator_rtl}
-%    \end{subfigure}
-%  \end{subfigure}\hfill%
-%  \begin{subfigure}[b]{0.65\linewidth}
-%\begin{minted}[fontsize=\tiny,linenos,xleftmargin=20pt]{verilog}
-%module main(reset, clk, finish, return_val);
-%  input [0:0] reset, clk;
-%  output reg [0:0] finish = 0;
-%  output reg [31:0] return_val = 0;
-%  reg [31:0] reg_3 = 0, addr = 0, d_in = 0, reg_5 = 0, wr_en = 0;
-%  reg [0:0] en = 0, u_en = 0;
-%  reg [31:0] state = 0, reg_2 = 0, reg_4 = 0, d_out = 0, reg_1 = 0;
-%  reg [31:0] stack [1:0];
-%  // RAM interface
-%  always @(negedge clk)
-%    if ({u_en != en}) begin
-%      if (wr_en) stack[addr] <= d_in;
-%      else d_out <= stack[addr];
-%      en <= u_en;
-%    end
-%  // Data-path
-%  always @(posedge clk)
-%    case (state)
-%      32'd11: reg_2 <= d_out;
-%      32'd8: reg_5 <= 32'd3;
-%      32'd7: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
-%                   d_in <= reg_5; addr <= 32'd0; end
-%      32'd6: reg_4 <= 32'd6;
-%      32'd5: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
-%                   d_in <= reg_4; addr <= 32'd1; end
-%      32'd4: reg_1 <= 32'd1;
-%      32'd3: reg_3 <= 32'd0;
-%      32'd2: begin u_en <= ( ~ u_en); wr_en <= 32'd0;
-%                   addr <= {{{reg_3 + 32'd0} + {reg_1 * 32'd4}} / 32'd4}; end
-%      32'd1: begin finish = 32'd1; return_val = reg_2; end
-%      default: ;
-%    endcase
-%  // Control logic
-%  always @(posedge clk)
-%    if ({reset == 32'd1}) state <= 32'd8;
-%    else case (state)
-%           32'd11: state <= 32'd1;        32'd4: state <= 32'd3;
-%           32'd8: state <= 32'd7;         32'd3: state <= 32'd2;
-%           32'd7: state <= 32'd6;         32'd2: state <= 32'd11;
-%           32'd6: state <= 32'd5;         32'd1: ;
-%           32'd5: state <= 32'd4;         default: ;
-%         endcase
-%endmodule
-%\end{minted}
-%\caption{Verilog produced by Vericert. It demonstrates the instantiation of the RAM (lines 9--15), the data-path (lines 16--32) and the control logic (lines 33--42).}\label{fig:accumulator_v}
-%\end{subfigure}
-%\caption{Translating a simple program from C to Verilog.}\label{fig:accumulator_c_rtl}
-%% \NR{Is the default in line 41 of (c) supposed to be empty?}\YH{Yep, that's how it's generated.}
-%\end{figure}
+\stopplacemarginfigure
 
 \subsection{Translating C to Verilog by Example}
 Fig.~\ref{fig:accumulator_c_rtl} illustrates the translation of a simple program that stores and retrieves values from an array.
@@ -393,112 +375,113 @@ similar semantics to Verilog at the lower level by already using Verilog stateme
 abstract instructions.  Compared to plain Verilog, HTL is simpler to manipulate and analyse, thereby
 making it easier to prove optimisations like proper RAM insertion.
 
-%\begin{figure*}
-%  \centering
-%\definecolor{control}{HTML}{b3e2cd}
-%\definecolor{data}{HTML}{fdcdac}
-%\begin{tikzpicture}
-%  \begin{scope}[scale=1.15]
-%  \fill[control,fill opacity=1] (6.5,0) rectangle (12,5);
-%  \fill[data,fill opacity=1] (0,0) rectangle (5.5,5);
-%  \node at (1,4.7) {Data-path};
-%  \node at (7.5,4.7) {Control Logic};
-%
-%  \fill[white,rounded corners=10pt] (7,0.5) rectangle (11.5,2.2);
-%  \node at (8,2) {\footnotesize Next State FSM};
-%  \foreach \x in {8,...,2}
-%    {\pgfmathtruncatemacro{\y}{8-\x}%
-%      \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.35) {\tiny \x};}
-%  \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s1c) at (11,1.35) {\tiny 1};
-%  \node[draw,circle,inner sep=0,minimum size=13,scale=0.8] (s1) at (s1c) {};
-%  \foreach \x in {8,...,3}
-%    {\pgfmathtruncatemacro{\y}{\x-1}\draw[-{Latex[length=1mm,width=0.7mm]}] (s\x) -- (s\y);}
-%  \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s11) at (10.5,0.9) {\tiny 11};
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (s2) -- (s11);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (s11) -- (s1);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.2,1.7) to [out=0,in=100] (s8);
-%
-%  \node[draw,fill=white] (nextstate) at (9.25,3) {\tiny Current State};
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] let \p1 = (nextstate) in
-%    (11.5,1.25) -| (11.75,\y1) -- (nextstate);
-%  \draw let \p1 = (nextstate) in (nextstate) -- (6,\y1) |- (6,1.5);
-%  \node[scale=0.5,rotate=60] at (7.5,0.75) {\mono{clk}};
-%  \node[scale=0.5,rotate=60] at (7.7,0.75) {\mono{rst}};
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.65,-0.5) -- (7.65,0.5);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.45,-0.5) -- (7.45,0.5);
-%
-%  \fill[white,rounded corners=10pt] (2,0.5) rectangle (5,3);
-%  \filldraw[fill=white] (0.25,0.5) rectangle (1.5,2.75);
-%  \node at (2.6,2.8) {\footnotesize Update};
-%  \node[align=center] at (0.875,2.55) {\footnotesize \mono{RAM}};
-%  \node[scale=0.5] at (4.7,1.5) {\mono{state}};
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (5,1.5);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (7,1.5);
-%  \node[scale=0.5,rotate=60] at (4.1,0.9) {\mono{finished}};
-%  \node[scale=0.5,rotate=60] at (3.9,0.95) {\mono{return\_val}};
-%  \node[scale=0.5,rotate=60] at (2.5,0.75) {\mono{clk}};
-%  \node[scale=0.5,rotate=60] at (2.7,0.75) {\mono{rst}};
-%
-%  \node[scale=0.5,right,inner sep=5pt] (ram1) at (2,2.1) {\mono{u\_en}};
-%  \node[scale=0.5,right,inner sep=5pt] (ram2) at (2,1.9) {\mono{wr\_en}};
-%  \node[scale=0.5,right,inner sep=5pt] (ram3) at (2,1.7) {\mono{addr}};
-%  \node[scale=0.5,right,inner sep=5pt] (ram4) at (2,1.5) {\mono{d\_in}};
-%  \node[scale=0.5,right,inner sep=5pt] (ram5) at (2,1.3) {\mono{d\_out}};
-%
-%  \node[scale=0.5,left,inner sep=5pt] (r1) at (1.5,2.1) {\mono{u\_en}};
-%  \node[scale=0.5,left,inner sep=5pt] (r2) at (1.5,1.9) {\mono{wr\_en}};
-%  \node[scale=0.5,left,inner sep=5pt] (r3) at (1.5,1.7) {\mono{addr}};
-%  \node[scale=0.5,left,inner sep=5pt] (r4) at (1.5,1.5) {\mono{d\_in}};
-%  \node[scale=0.5,left,inner sep=5pt] (r5) at (1.5,1.3) {\mono{d\_out}};
-%
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram1) -- (r1);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram2) -- (r2);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram3) -- (r3);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram4) -- (r4);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (r5) -- (ram5);
-%
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (4,0.5) -- (4,-0.5);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (3.75,0.5) -- (3.75,-0.5);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (2.45,-0.5) -- (2.45,0.5);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (2.65,-0.5) -- (2.65,0.5);
-%
-%  \foreach \x in {0,...,1}
-%  {\draw (0.25,1-0.25*\x) -- (1.5,1-0.25*\x); \node at (0.875,0.88-0.25*\x) {\tiny \x};}
-%
-%  %\node[scale=0.5] at (1.2,2.2) {\mono{wr\_en}};
-%  %\node[scale=0.5] at (1.2,2) {\mono{wr\_addr}};
-%  %\node[scale=0.5] at (1.2,1.8) {\mono{wr\_data}};
-%  %\node[scale=0.5] at (1.2,1.4) {\mono{r\_addr}};
-%  %\node[scale=0.5] at (1.2,1.2) {\mono{r\_data}};
-%  %
-%  %\node[scale=0.5] at (2.3,2.2) {\mono{wr\_en}};
-%  %\node[scale=0.5] at (2.3,2) {\mono{wr\_addr}};
-%  %\node[scale=0.5] at (2.3,1.8) {\mono{wr\_data}};
-%  %\node[scale=0.5] at (2.3,1.4) {\mono{r\_addr}};
-%  %\node[scale=0.5] at (2.3,1.2) {\mono{r\_data}};
-%  %
-%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.2) -- (1.5,2.2);
-%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2) -- (1.5,2);
-%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.8) -- (1.5,1.8);
-%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.4) -- (1.5,1.4);
-%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (1.5,1.2) -- (2,1.2);
-%
-%  \filldraw[fill=white] (2.8,3.25) rectangle (4.2,4.75);
-%  \node at (3.5,4.55) {\footnotesize \mono{Registers}};
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.4) -| (1.75,4) -- (2.8,4);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (4.2,4) -- (5.25,4) |- (5,2.4);
-%  \draw[-{Latex[length=1mm,width=0.7mm]}] (5.25,2.4) -- (6.2,2.4) |- (7,1.8);
-%
-%  \node[scale=0.5] at (3.5,4.2) {\mono{reg\_1}};
-%  \node[scale=0.5] at (3.5,4) {\mono{reg\_2}};
-%  \node[scale=0.5] at (3.5,3.8) {\mono{reg\_3}};
-%  \node[scale=0.5] at (3.5,3.6) {\mono{reg\_4}};
-%  \node[scale=0.5] at (3.5,3.4) {\mono{reg\_5}};
-%\end{scope}
-%\end{tikzpicture}
-%  \Description{Diagram displaying the data-path and its internal modules, as well as the control logic and its state machine.}
-%  \caption{The FSMD for the example shown in Fig.~\ref{fig:accumulator_c_rtl}, split into a data-path and control logic for the next state calculation.  The Update block takes the current state, current values of all registers and at most one value stored in the RAM, and calculates a new value that can either be stored back in the RAM or in a register.}\label{fig:accumulator_diagram}
-%\end{figure*}
+\startplacemarginfigure[title={The FSMD for the example shown in Fig.~\ref{fig:accumulator_c_rtl},
+    split into a data-path and control logic for the next state calculation.  The Update block takes
+    the current state, current values of all registers and at most one value stored in the RAM, and
+    calculates a new value that can either be stored back in the RAM or in a
+    register.},reference={fig:accumulator_diagram},location=top]
+\definecolor[control][x=B3E2CD]
+\definecolor[data][x=FDCDAC]
+\starttikzpicture
+  \startscope[scale=1.15]
+  \fill[control,fill opacity=1] (6.5,0) rectangle (12,5);
+  \fill[data,fill opacity=1] (0,0) rectangle (5.5,5);
+  \node at (1,4.7) {Data-path};
+  \node at (7.5,4.7) {Control Logic};
+
+  \fill[white,rounded corners=10pt] (7,0.5) rectangle (11.5,2.2);
+  \node at (8,2) {\rmxx Next State FSM};
+  \foreach \x in {8,...,2}
+    {\pgfmathtruncatemacro{\y}{8-\x}%
+      \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.35) {\rmxx \x};}
+  \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s1c) at (11,1.35) {\rmxx 1};
+  \node[draw,circle,inner sep=0,minimum size=13,scale=0.8] (s1) at (s1c) {};
+  \foreach \x in {8,...,3}
+    {\pgfmathtruncatemacro{\y}{\x-1}\draw[-{Latex[length=1mm,width=0.7mm]}] (s\x) -- (s\y);}
+  \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s11) at (10.5,0.9) {\rmxx 11};
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (s2) -- (s11);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (s11) -- (s1);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.2,1.7) to [out=0,in=100] (s8);
+
+  \node[draw,fill=white] (nextstate) at (9.25,3) {\rmxx Current State};
+  \draw[-{Latex[length=1mm,width=0.7mm]}] let \p1 = (nextstate) in
+    (11.5,1.25) -| (11.75,\y1) -- (nextstate);
+  \draw let \p1 = (nextstate) in (nextstate) -- (6,\y1) |- (6,1.5);
+  \node[scale=0.5,rotate=60] at (7.5,0.75) {\mono{clk}};
+  \node[scale=0.5,rotate=60] at (7.7,0.75) {\mono{rst}};
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.65,-0.5) -- (7.65,0.5);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.45,-0.5) -- (7.45,0.5);
+
+  \fill[white,rounded corners=10pt] (2,0.5) rectangle (5,3);
+  \filldraw[fill=white] (0.25,0.5) rectangle (1.5,2.75);
+  \node at (2.6,2.8) {\rmxx Update};
+  \node[align=center] at (0.875,2.55) {\rmxx \mono{RAM}};
+  \node[scale=0.5] at (4.7,1.5) {\mono{state}};
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (5,1.5);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (7,1.5);
+  \node[scale=0.5,rotate=60] at (4.1,0.9) {\mono{finished}};
+  \node[scale=0.5,rotate=60] at (3.9,0.95) {\mono{return\_val}};
+  \node[scale=0.5,rotate=60] at (2.5,0.75) {\mono{clk}};
+  \node[scale=0.5,rotate=60] at (2.7,0.75) {\mono{rst}};
+
+  \node[scale=0.5,right,inner sep=5pt] (ram1) at (2,2.1) {\mono{u\_en}};
+  \node[scale=0.5,right,inner sep=5pt] (ram2) at (2,1.9) {\mono{wr\_en}};
+  \node[scale=0.5,right,inner sep=5pt] (ram3) at (2,1.7) {\mono{addr}};
+  \node[scale=0.5,right,inner sep=5pt] (ram4) at (2,1.5) {\mono{d\_in}};
+  \node[scale=0.5,right,inner sep=5pt] (ram5) at (2,1.3) {\mono{d\_out}};
+
+  \node[scale=0.5,left,inner sep=5pt] (r1) at (1.5,2.1) {\mono{u\_en}};
+  \node[scale=0.5,left,inner sep=5pt] (r2) at (1.5,1.9) {\mono{wr\_en}};
+  \node[scale=0.5,left,inner sep=5pt] (r3) at (1.5,1.7) {\mono{addr}};
+  \node[scale=0.5,left,inner sep=5pt] (r4) at (1.5,1.5) {\mono{d\_in}};
+  \node[scale=0.5,left,inner sep=5pt] (r5) at (1.5,1.3) {\mono{d\_out}};
+
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram1) -- (r1);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram2) -- (r2);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram3) -- (r3);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram4) -- (r4);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (r5) -- (ram5);
+
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (4,0.5) -- (4,-0.5);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (3.75,0.5) -- (3.75,-0.5);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (2.45,-0.5) -- (2.45,0.5);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (2.65,-0.5) -- (2.65,0.5);
+
+  \foreach \x in {0,...,1}
+  {\draw (0.25,1-0.25*\x) -- (1.5,1-0.25*\x); \node at (0.875,0.88-0.25*\x) {\rmxx \x};}
+
+  %\node[scale=0.5] at (1.2,2.2) {\mono{wr\_en}};
+  %\node[scale=0.5] at (1.2,2) {\mono{wr\_addr}};
+  %\node[scale=0.5] at (1.2,1.8) {\mono{wr\_data}};
+  %\node[scale=0.5] at (1.2,1.4) {\mono{r\_addr}};
+  %\node[scale=0.5] at (1.2,1.2) {\mono{r\_data}};
+  %
+  %\node[scale=0.5] at (2.3,2.2) {\mono{wr\_en}};
+  %\node[scale=0.5] at (2.3,2) {\mono{wr\_addr}};
+  %\node[scale=0.5] at (2.3,1.8) {\mono{wr\_data}};
+  %\node[scale=0.5] at (2.3,1.4) {\mono{r\_addr}};
+  %\node[scale=0.5] at (2.3,1.2) {\mono{r\_data}};
+  %
+  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.2) -- (1.5,2.2);
+  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2) -- (1.5,2);
+  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.8) -- (1.5,1.8);
+  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.4) -- (1.5,1.4);
+  %\draw[-{Latex[length=1mm,width=0.7mm]}] (1.5,1.2) -- (2,1.2);
+
+  \filldraw[fill=white] (2.8,3.25) rectangle (4.2,4.75);
+  \node at (3.5,4.55) {\rmxx \mono{Registers}};
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.4) -| (1.75,4) -- (2.8,4);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (4.2,4) -- (5.25,4) |- (5,2.4);
+  \draw[-{Latex[length=1mm,width=0.7mm]}] (5.25,2.4) -- (6.2,2.4) |- (7,1.8);
+
+  \node[scale=0.5] at (3.5,4.2) {\mono{reg\_1}};
+  \node[scale=0.5] at (3.5,4) {\mono{reg\_2}};
+  \node[scale=0.5] at (3.5,3.8) {\mono{reg\_3}};
+  \node[scale=0.5] at (3.5,3.6) {\mono{reg\_4}};
+  \node[scale=0.5] at (3.5,3.4) {\mono{reg\_5}};
+\stopscope
+\stoptikzpicture
+\stopplacemarginfigure
 
 \paragraph{Translating memory} Typically, HLS-generated hardware consists of a sea of registers and
 RAMs.  This memory view is very different from the C memory model, so we perform the following
@@ -588,8 +571,8 @@ which performs the requested load or store. By factoring all the memory accesses
 interface, we ensure that the underlying array is only accessed from a single program point in the
 Verilog code, and thus ensure that the synthesis tool will correctly infer a RAM
 block.\footnote{Interestingly, the Verilog code shown for the RAM interface must not be modified,
-  because the synthesis tool will only generate a RAM when the code matches a small set of specific
-  patterns.}
+    because the synthesis tool will only generate a RAM when the code matches a small set of
+    specific patterns.}
 
 Therefore, an extra compiler pass is added from HTL to HTL to extract all the direct accesses to the
 Verilog array and replace them by signals that access the RAM interface in a separate
@@ -624,48 +607,21 @@ current value \mono{u\_en} set by the data-path.  When the values are different,
 enabled, and then \mono{en} is set to the value of \mono{u\_en}. This ensures that the RAM will
 always be disabled straight after it was used, without having to insert or modify any other states.
 
-%\begin{figure}
-%  \centering
-%  \begin{subfigure}[b]{0.48\linewidth}
-%    \begin{tikztimingtable}[timing/d/background/.style={fill=white}]
-%      \small clk & 2L 3{6C} \\
-%      \small u\_en & 2D{u\_en} 18D{$\overline{\text{u\_en}}$}\\
-%      \small addr & 2U 18D{3} \\
-%      \small wr\_en & 2U 18L \\
-%      \small en & 8D{u\_en} 12D{$\overline{\text{u\_en}}$}\\
-%      \small d\_out & 8U 12D{0xDEADBEEF} \\
-%      \small r & 14U 6D{0xDEADBEEF} \\
-%      \extracode
-%      \node[help lines] at (2,2.25) {\tiny 1};
-%      \node[help lines] at (8,2.25) {\tiny 2};
-%      \node[help lines] at (14,2.25) {\tiny 3};
-%      \begin{pgfonlayer}{background}
-%        \vertlines[help lines]{2,8,14}
-%      \end{pgfonlayer}
-%    \end{tikztimingtable}
-%    \caption{Timing diagram for loads. At time 1, the \mono{u\_en} signal is toggled to enable the RAM. At time 2, \mono{d\_out} is set to the value stored at the address in the RAM, which is finally assigned to the register \mono{r} at time 3.}\label{fig:ram_load}
-%  \end{subfigure}\hfill%
-%  \begin{subfigure}[b]{0.48\linewidth}
-%    \begin{tikztimingtable}[timing/d/background/.style={fill=white}]
-%      \small clk & 2L 2{7C} \\
-%      \small u\_en & 2D{u\_en} 14D{$\overline{\text{u\_en}}$}\\
-%      \small addr & 2U 14D{3} \\
-%      \small wr\_en & 2U 14H \\
-%      \small d\_in & 2U 14D{0xDEADBEEF} \\
-%      \small en & 9D{u\_en} 7D{$\overline{\text{u\_en}}$}\\
-%      \small stack[addr] & 9U 7D{0xDEADBEEF} \\
-%      \extracode
-%      \node[help lines] at (2,2.25) {\tiny 1};
-%      \node[help lines] at (9,2.25) {\tiny 2};
-%      \begin{pgfonlayer}{background}
-%        \vertlines[help lines]{2,9}
-%      \end{pgfonlayer}
-%    \end{tikztimingtable}
-%    \caption{Timing diagram for stores. At time 1, the \mono{u\_en} signal is toggled to enable the RAM, and the address \mono{addr} and the data to store \mono{d\_in} are set. On the negative edge at time 2, the data is stored into the RAM.}\label{fig:ram_store}
-%  \end{subfigure}
-%  \Description{Timing diagrams of loads and stores, showing which signals are modified at which time step.}
-%  \caption{Timing diagrams showing the execution of loads and stores over multiple clock cycles.}\label{fig:ram_load_store}
-%\end{figure}
+\startplacemarginfigure[reference={fig:ram_load_store},title={Timing diagrams showing the execution
+    of loads and stores over multiple clock cycles.}]
+  \startfloatcombination[nx=2]
+    \startplacesubfigure[title={Timing diagram for loads. At time 1, the \mono{u\_en} signal is
+        toggled to enable the RAM. At time 2, \mono{d\_out} is set to the value stored at the
+        address in the RAM, which is finally assigned to the register \mono{r} at time 3.}]
+      \externalfigure[figures/timing-1.pdf]
+    \stopplacesubfigure
+  \startplacesubfigure[title={Timing diagram for stores. At time 1, the \mono{u\_en} signal is
+      toggled to enable the RAM, and the address \mono{addr} and the data to store \mono{d\_in} are
+      set. On the negative edge at time 2, the data is stored into the RAM.}]
+    \externalfigure[figures/timing-2.pdf]
+  \stopplacesubfigure
+  \stopfloatcombination
+\stopplacemarginfigure
 
 Fig.~\ref{fig:ram_load_store} gives an example of how the RAM interface behaves when values are
 loaded and stored.
@@ -718,7 +674,10 @@ big-step semantics is used to execute all the statements. An example of a rule f
 always-block that is triggered at the positive edge of the clock is shown below, where $\Sigma$ is
 the state of the registers in the module and $s$ is the statement inside the always-block:
 
-%\startformula \inferrule[Always]{(\Sigma, s)\downarrow_{\text{stmnt}} \Sigma'}{(\Sigma, \yhkeyword{always @(posedge clk) } s) \downarrow_{\text{always}^{+}} \Sigma'} \stopformula
+\placeformula\startformula
+  \text{\sc Always }\ \dfrac{(\Sigma, s)\downarrow_{\text{stmnt}} \Sigma'}{(\Sigma, \text{\mono{always
+        @(posedge clk) }} s) \downarrow_{\text{always}^{+}} \Sigma'}
+\stopformula
 
 This rule says that assuming the statement $s$ in the always-block runs with state $\Sigma$ and
 produces the new state $\Sigma'$, the always-block will result in the same final state.
@@ -732,7 +691,7 @@ contains the values that will be assigned at the end of the clock cycle. $\Sigma
 defined as follows: $\Sigma = (\Gamma, \Delta)$. Nonblocking assignment can therefore be expressed
 as follows:
 
-%\startformula \inferrule[Nonblocking Reg]{\yhkeyword{name}\ d = \yhkeyword{OK}\ n \\ (\Gamma, e) \downarrow_{\text{expr}} v}{((\Gamma, \Delta), d\ \yhkeyword{ <= } e) \downarrow_{\text{stmnt}} (\Gamma, \Delta [n \mapsto v])}\\ \stopformula
+\placeformula\startformula \text{\sc Nonblocking Reg }\ \frac{\text{\tt name}\ d = \text{\tt OK}\ n \\ (\Gamma, e) \downarrow_{\text{expr}} v}{((\Gamma, \Delta), d\ \text{\tt <= } e) \downarrow_{\text{stmnt}} (\Gamma, \Delta [n \mapsto v])}\\ \stopformula
 
 where assuming that $\downarrow_{\text{expr}}$ evaluates an expression $e$ to a value $v$, the
 nonblocking assignment $d\ \mono{ <= } e$ updates the future state of the variable $d$ with value
@@ -740,8 +699,11 @@ $v$.
 
 Finally, the following rule dictates how the whole module runs in one clock cycle:
 
-%\startformula \inferrule[Module]{(\Gamma, \epsilon, \vec{m})\ \downarrow_{\text{module}} (\Gamma', \Delta')}{(\Gamma, \yhkeyword{module } \yhconstant{main} \yhkeyword{(...);}\ \vec{m}\ \yhkeyword{endmodule}) \downarrow_{\text{program}} (\Gamma'\ //\ \Delta')} \stopformula
-%
+\placeformula\startformula \text{\sc Module }\ \frac{(\Gamma, \epsilon, \vec{m})\
+  \downarrow_{\text{module}} (\Gamma', \Delta')}{(\Gamma, \text{\tt module } \text{\tt main}
+  \text{\tt (...);}\ \vec{m}\ \text{\tt endmodule}) \downarrow_{\text{program}} (\Gamma'\ //\
+  \Delta')} \stopformula
+
 where $\Gamma$ is the initial state of all the variables, $\epsilon$ is the empty map because the
 $\Delta$ map is assumed to be empty at the start of the clock cycle, and $\vec{m}$ is a list of
 variable declarations and always-blocks that $\downarrow_{\text{module}}$ evaluates sequentially to
@@ -750,11 +712,11 @@ operator, which gives priority to the right-hand operand in a conflict. This rul
 nonblocking assignments overwrite at the end of the clock cycle any blocking assignments made during
 the cycle.
 
-\subsection[title={Changes to the Semantics},reference={changes-to-the-semantics}]
+\subsection[changes-to-the-semantics]{Changes to the Semantics}
 
 Five changes were made to the semantics proposed by to make it suitable as an HLS target.
 
-\subsubsection[title={Adding array support},reference={adding-array-support}]
+\subsubsection[adding-array-support]{Adding array support}
 
 The main change is the addition of support for arrays, which are needed to model RAM in Verilog. RAM
 is needed to model the stack in C efficiently, without having to declare a variable for each
@@ -785,7 +747,11 @@ described in Section~\goto{{[}sec:algorithm:optimisation:ram{]}}[sec:algorithm:o
 support for negative-edge-triggered always-blocks was added to the semantics. This is shown in the
 modifications of the {\sc Module} rule shown below:
 
-%\startformula \inferrule[Module]{(\Gamma, \epsilon, \vec{m})\ \downarrow_{\text{module}^{+}} (\Gamma', \Delta') \\ (\Gamma'\ //\ \Delta', \epsilon, \vec{m}) \downarrow_{\text{module}^{-}} (\Gamma'', \Delta'')}{(\Gamma, \yhkeyword{module}\ \yhconstant{main} \yhkeyword{(...);}\ \vec{m}\ \yhkeyword{endmodule}) \downarrow_{\text{program}} (\Gamma''\ //\ \Delta'')} \stopformula
+\placeformula\startformula \text{\sc Module }\ \frac{(\Gamma, \epsilon, \vec{m})\
+  \downarrow_{\text{module}^{+}} (\Gamma', \Delta') \\ (\Gamma'\ //\ \Delta', \epsilon, \vec{m})
+  \downarrow_{\text{module}^{-}} (\Gamma'', \Delta'')}{(\Gamma, \text{\tt module}\ \text{\tt main}
+  \text{\tt (...);}\ \vec{m}\ \text{\tt endmodule}) \downarrow_{\text{program}} (\Gamma''\ //\
+  \Delta'')} \stopformula
 
 The main execution of the module $\downarrow_{\text{module}}$ is split into
 $\downarrow_{\text{module}^{+}}$ and $\downarrow_{\text{module}^{-}}$, which are rules that only
@@ -817,15 +783,24 @@ because (currently) only supports types represented by 32 bits.
 
 \subsection[sec:verilog:integrating]{Integrating the Verilog Semantics into 's Model}
 
-%\startformula \begin{gathered}
-%      \inferrule[Step]{\Gamma_r[\mathit{rst}] = 0 \\ \Gamma_r[\mathit{fin}] = 0 \\ (m, (\Gamma_r, \Gamma_a))\ \downarrow_{\text{program}} (\Gamma_r', \Gamma_a')}{\yhconstant{State}\ \mathit{sf}\ m\ \ \Gamma_r[\sigma]\ \ \Gamma_r\ \Gamma_a \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ \ \Gamma_r'[\sigma]\ \ \Gamma_r'\ \Gamma_a'}\\
-%      %
-%      \inferrule[Finish]{\Gamma_r[\mathit{fin}] = 1}{\yhconstant{State}\ \mathit{sf}\ m\ \sigma\ \Gamma_r\ \Gamma_a \longrightarrow \yhconstant{Returnstate}\ \mathit{sf}\ \Gamma_r[ \mathit{ret}]}\\
-%      %
-%      \inferrule[Call]{ }{\yhconstant{Callstate}\ \mathit{sf}\ m\ \vec{r} \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ n\ ((\yhfunction{init\_params}\ \vec{r}\ a)[\sigma \mapsto n, \mathit{fin} \mapsto 0, \mathit{rst} \mapsto 0])\ \epsilon}\\
-%      %
-%      \inferrule[Return]{ }{\yhconstant{Returnstate}\ (\yhconstant{Stackframe}\ r\ m\ \mathit{pc}\ \Gamma_r\ \Gamma_a :: \mathit{sf})\ v \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ \mathit{pc}\ (\Gamma_{r} [ \sigma \mapsto \mathit{pc}, r \mapsto v ])\ \Gamma_{a}}
-%    \end{gathered} \stopformula
+\startplacemarginfigure[location=top]
+\startformula \startalign[n=1]
+  \NC\text{\sc Step }\ \frac{\startmatrix[n=1]
+    \NC \Gamma_r[\mathit{rst}] = 0 \qquad \Gamma_r[\mathit{fin}] = 0 \NR
+    \NC(m, (\Gamma_r, \Gamma_a))\ \downarrow_{\text{program}} (\Gamma_r', \Gamma_a') \NR
+  \stopmatrix}{\text{\tt State}\ \mathit{sf}\ m\ \ \Gamma_r[\sigma]\ \ \Gamma_r\ \Gamma_a
+    \longrightarrow \text{\tt State}\ \mathit{sf}\ m\ \ \Gamma_r'[\sigma]\ \ \Gamma_r'\ \Gamma_a'}\NR
+%
+  \NC\text{\sc Finish }\ \frac{\Gamma_r[\mathit{fin}] = 1}{\text{\tt State}\ \mathit{sf}\ m\ \sigma\
+    \Gamma_r\ \Gamma_a \longrightarrow \text{\tt Returnstate}\ \mathit{sf}\ \Gamma_r[ \mathit{ret}]}\NR
+%
+  \NC\text{\sc Call }\ \frac{}{\text{\tt Callstate}\ \mathit{sf}\ m\ \vec{r} \longrightarrow
+    \text{\tt State}\ \mathit{sf}\ m\ n\ ((\text{\tt init\_params}\ \vec{r}\ a)[\sigma \mapsto n, \mathit{fin} \mapsto 0, \mathit{rst} \mapsto 0])\ \epsilon}\NR
+%
+  \NC\text{\sc Return }\ \frac{}{\text{\tt Returnstate}\ (\text{\tt Stackframe}\ r\ m\ \mathit{pc}\
+    \Gamma_r\ \Gamma_a :: \mathit{sf})\ v \longrightarrow \text{\tt State}\ \mathit{sf}\ m\ \mathit{pc}\ (\Gamma_{r} [ \sigma \mapsto \mathit{pc}, r \mapsto v ])\ \Gamma_{a}}\NR
+\stopalign \stopformula
+\stopplacemarginfigure
 
 The computation model defines a set of states through which execution passes. In this subsection, we
 explain how we extend our Verilog semantics with four special-purpose registers in order to
@@ -833,49 +808,32 @@ integrate it into .
 
 executions pass through three main states:
 
-%\startdescription{\type{State} $\mathit{sf}$ $m$ $v$ $\Gamma_{r}$
-%$\Gamma_{a}$}
-%  The main state when executing a function, with stack frame
-%  $\mathit{sf}$, current module $m$, current state $v$ and variable
-%  states $\Gamma_{r}$ and $\Gamma_{a}$.
-%\stopdescription
-%
-%\startdescription{\type{Callstate} $\mathit{sf}$ $m$ $\vec{r}$}
-%  The state that is reached when a function is called, with the current
-%  stack frame $\mathit{sf}$, current module $m$ and arguments $\vec{r}$.
-%\stopdescription
-%
-%\startdescription{\type{Returnstate} $\mathit{sf}$ $v$}
-%  The state that is reached when a function returns back to the caller,
-%  with stack frame $\mathit{sf}$ and return value $v$.
-%\stopdescription
-%
-%To support this computational model, we extend the Verilog module we
-%generate with the following four registers and a RAM block:
-%
-%\startdescription{program counter}
-%  The program counter can be modelled using a register that keeps track
-%  of the state, denoted as $\sigma$.
-%\stopdescription
-%
-%\startdescription{function entry point}
-%  When a function is called, the entry point denotes the first
-%  instruction that will be executed. This can be modelled using a reset
-%  signal that sets the state accordingly, denoted as $\mathit{rst}$.
-%\stopdescription
-%
-%\startdescription{return value}
-%  The return value can be modelled by setting a finished flag to 1 when
-%  the result is ready, and putting the result into a 32-bit output
-%  register. These are denoted as $\mathit{fin}$ and $\mathit{ret}$
-%  respectively.
-%\stopdescription
-%
-%\startdescription{stack}
-%  The function stack can be modelled as a RAM block, which is
-%  implemented using an array in the module, and denoted as
-%  $\mathit{stk}$.
-%\stopdescription
+\desc{\type{State} $\mathit{sf}$ $m$ $v$ $\Gamma_{r}$ $\Gamma_{a}$} The main state when executing a
+function, with stack frame $\mathit{sf}$, current module $m$, current state $v$ and variable states
+$\Gamma_{r}$ and $\Gamma_{a}$.
+
+\desc{\type{Callstate} $\mathit{sf}$ $m$ $\vec{r}$} The state that is reached when a function is
+called, with the current stack frame $\mathit{sf}$, current module $m$ and arguments $\vec{r}$.
+
+\desc{\type{Returnstate} $\mathit{sf}$ $v$} The state that is reached when a function returns back
+to the caller, with stack frame $\mathit{sf}$ and return value $v$.
+
+To support this computational model, we extend the Verilog module we generate with the following
+four registers and a RAM block:
+
+\desc{program counter} The program counter can be modelled using a register that keeps track of the
+state, denoted as $\sigma$.
+
+\desc{function entry point} When a function is called, the entry point denotes the first instruction
+that will be executed. This can be modelled using a reset signal that sets the state accordingly,
+denoted as $\mathit{rst}$.
+
+\desc{return value} The return value can be modelled by setting a finished flag to 1 when the result
+is ready, and putting the result into a 32-bit output register. These are denoted as $\mathit{fin}$
+and $\mathit{ret}$ respectively.
+
+\desc{stack} The function stack can be modelled as a RAM block, which is implemented using an array
+in the module, and denoted as $\mathit{stk}$.
 
 Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module] shows the inference rules for moving
 between the computational states. The first, {\sc Step}, is the normal rule of execution. It defines
@@ -893,16 +851,18 @@ function. Therefore, in addition to the rules shown in
 Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module], an initial state and final state need
 to be defined:
 
-%\startformula \begin{gathered}
-%  \inferrule[Initial]{\yhfunction{is\_internal}\ P.\mono{main}}{\yhfunction{initial\_state}\ (\yhconstant{Callstate}\ []\ P.\mono{main}\ [])}\qquad
-%  \inferrule[Final]{ }{\yhfunction{final\_state}\ (\yhconstant{Returnstate}\ []\ n)\ n}\end{gathered} \stopformula
+\placeformula\startformula \startalign[n=1]
+  \NC\text{\sc Initial }\ \frac{\text{\tt is\_internal}\ P.\text{\tt main}}{\text{\tt initial\_state}\
+    (\text{\tt Callstate}\ []\ P.\mono{\tt main}\ [])}\NR
+  \NC\text{\sc Final }\ \frac{}{\text{\sc final\_state}\ (\text{\sc Returnstate}\ []\ n)\ n}\NR
+\stopalign \stopformula
 
 where the initial state is the \type{Callstate} with an empty stack frame and no arguments for the
 \type{main} function of program $P$, where this \type{main} function needs to be in the current
 translation unit. The final state results in the program output of value $n$ when reaching a
 \type{Returnstate} with an empty stack frame.
 
-\subsection[title={Memory Model},reference={sec:verilog:memory}]
+\subsection[sec:verilog:memory]{Memory Model}
 
 The Verilog semantics do not define a memory model for Verilog, as this is not needed for a hardware
 description language. There is no preexisting architecture that Verilog will produce; it can
@@ -981,7 +941,8 @@ supported in , this trace will always be empty.
 Whenever the translation from $C$ succeeds and produces Verilog $V$, and all observable behaviours
 of $C$ are safe, then $V$ has behaviour $B$ only if $C$ has behaviour $B$.
 
-%\startformula \forall C, V, B,\quad \yhfunction{HLS} (C) = \yhconstant{OK} (V) \land \mathit{Safe}(C) \implies (V \Downarrow B \implies C \Downarrow B). \stopformula
+\placeformula\startformula \forall C, V, B,\quad \text{\tt HLS} (C) = \text{\tt OK} (V) \land
+\mathit{Safe}(C) \implies (V \Downarrow B \implies C \Downarrow B). \stopformula
 
 Why is this correctness theorem also the right one for HLS? It could be argued that hardware
 inherently runs forever and therefore does not produce a definitive final result. This would mean
@@ -1010,11 +971,10 @@ forward simulation, it suffices to prove forward simulations between each pair o
 intermediate languages, as these results can be composed to prove the correctness of the whole HLS
 tool. The forward simulation from 3AC to HTL is stated in \in{Lemma}[lemma:htl]
 (\in{Section}[sec:proof:3ac_htl]), the forward simulation for the RAM insertion is shown in
-Lemma~\goto{{[}lemma:htl_ram{]}}[lemma:htl_ram] (\in{Section}[sec:proof:ram_insertion]), then the
-forward simulation between HTL and Verilog is shown in
-Lemma~\goto{{[}lemma:verilog{]}}[lemma:verilog] (\in{Section}[sec:proof:htl_verilog]), and finally,
-the proof that Verilog is deterministic is given in
-Lemma~\goto{{[}lemma:deterministic{]}}[lemma:deterministic] (\in{Section}[sec:proof:deterministic]).
+\in{Lemma}[lemma:htl_ram] (\in{Section}[sec:proof:ram_insertion]), then the forward simulation
+between HTL and Verilog is shown in \in{Lemma}[lemma:verilog] (\in{Section}[sec:proof:htl_verilog]),
+and finally, the proof that Verilog is deterministic is given in \in{Lemma}[lemma:deterministic]
+(\in{Section}[sec:proof:deterministic]).
 
 \subsection[sec:proof:3ac_htl]{Forward Simulation from 3AC to HTL}
 
@@ -1023,20 +983,19 @@ requires a larger proof, because the translation from 3AC instructions to Verilo
 to be proven correct in this step. In addition to that, the semantics of HTL are also quite
 different to the 3AC semantics. Instead of defining small-step semantics for each construct in
 Verilog, the semantics are defined over one clock cycle and mirror the semantics defined for
-Verilog.  Lemma~\goto{{[}lemma:htl{]}}[lemma:htl] shows the result that needs to be proven in this
-subsection.
+Verilog.  \in{Lemma}[lemma:htl] shows the result that needs to be proven in this subsection.
 
 \startlemma[lemma:htl]
 Writing \type{tr_htl} for the translation from 3AC to HTL, we have:
 \startformula
-\forall c, h, B \in \mono{Safe},\quad \mono{tr\_htl} (c) = \mono{OK} (h) \land c
+\forall c, h, B \in \text{\tt Safe},\quad \text{\tt tr\_htl} (c) = \text{\tt OK} (h) \land c
 \Downarrow B \implies h \Downarrow B.
 \stopformula
 \stoplemma
 
 \startproof
 We prove this lemma by first establishing a specification of the translation
-function $\mono{tr\_htl}$ that captures its important properties, and then splitting the proof into
+function $\text{\tt tr\_htl}$ that captures its important properties, and then splitting the proof into
 two parts: one to show that the translation function does indeed meet its specification, and one to
 show that the specification implies the desired simulation result. This strategy is in keeping with
 standard practice.
@@ -1047,27 +1006,33 @@ standard practice.
 The specification for the translation of 3AC instructions into HTL data-path and control logic can
 be defined by the following predicate:
 
-%\startformula \yhfunction{spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ i\ \mathit{data}\ \mathit{control} \stopformula
+\startformula \text{\tt spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ i\
+  \mathit{data}\ \mathit{control} \stopformula
 
 Here, the $\mathit{control}$ and $\mathit{data}$ parameters are the statements that the current 3AC
 instruction $i$ should translate to. The other parameters are the special registers defined in
 Section~\goto{{[}sec:verilog:integrating{]}}[sec:verilog:integrating].  An example of a rule
 describing the translation of an arithmetic/logical operation from 3AC is the following:
 
-%\startformula \inferrule[Iop]{\yhfunction{tr\_op}\ \mathit{op}\ \vec{a} = \yhconstant{OK}\ e}{\yhfunction{spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ (\yhconstant{Iop}\ \mathit{op}\ \vec{a}\ d\ n)\ (d\ \yhkeyword{<=}\ e)\ (\sigma\ \yhkeyword{<=}\ n)} \stopformula
+\startformula \text{\sc Iop}{\text{\tt tr\_op}\ \mathit{op}\ \vec{a} = \text{\tt OK}\ e}{\text{\tt
+    spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ (\text{\tt Iop}\ \mathit{op}\
+  \vec{a}\ d\ n)\ (d\ \text{\tt <=}\ e)\ (\sigma\ \text{\tt <=}\ n)} \stopformula
 
 Assuming that the translation of the operator $\mathit{op}$ with operands $\vec{a}$ is successful
 and results in expression $e$, the rule describes how the destination register $d$ is updated to $e$
 via a non-blocking assignment in the data path, and how the program counter $\sigma$ is updated to
 point to the next CFG node $n$ via another non-blocking assignment in the control logic.
 
-In the following lemma, $\mono{spec\_htl}$ is the top-level specification predicate, which is built
-using $\mono{spec\_instr}$ at the level of instructions.
+In the following lemma, $\text{\tt spec\_htl}$ is the top-level specification predicate, which is built
+using $\text{\tt spec\_instr}$ at the level of instructions.
 
-\reference[lemma:specification]{} If a 3AC program $c$ is translated correctly to an HTL program
-$h$, then the specification of the translation holds.
+\startlemma[lemma:specification]
+  If a 3AC program $c$ is translated correctly to an HTL program $h$, then the specification of the
+  translation holds.
 
-%\startformula \forall c, h,\quad \yhfunction{tr\_htl} (c) = \yhconstant{OK}(h) \implies \yhfunction{spec\_htl}\ c\ h. \stopformula
+\startformula \forall c, h,\quad \text{\tt tr\_htl} (c) = \text{\tt OK}(h) \implies \text{\tt
+  spec\_htl}\ c\ h. \stopformula
+\stoplemma
 
 \subsubsection[from-specification-to-simulation]{From Specification to Simulation}
 
@@ -1112,15 +1077,19 @@ HTL state can be represented by the pair $(\Gamma_{r}, \Gamma_{a})$, which captu
 all the registers and arrays in the module.  Finally, $\mathcal{I}$ stands for the other invariants
 that need to hold for the states to match.
 
-\reference[lemma:simulation_diagram]{} Given the 3AC state $(R,M,\mathit{pc})$ and the matching HTL
-state $(\Gamma_{r}, \Gamma_{a})$, assuming one step in the 3AC semantics produces state
-$(R',M',\mathit{pc}')$, there exist one or more steps in the HTL semantics that result in matching
-states $(\Gamma_{r}', \Gamma_{a}')$. This is all under the assumption that the specification
-$\mono{spec\_{htl}}$ holds for the translation.
+\startlemma[lemma:simulation_diagram]
+  Given the 3AC state $(R,M,\mathit{pc})$ and the matching HTL state $(\Gamma_{r}, \Gamma_{a})$,
+  assuming one step in the 3AC semantics produces state $(R',M',\mathit{pc}')$, there exist one or
+  more steps in the HTL semantics that result in matching states $(\Gamma_{r}', \Gamma_{a}')$. This
+  is all under the assumption that the specification $\text{\tt spec\_{htl}}$ holds for the
+  translation.
+\stoplemma
 
-{\em Proof sketch.} This simulation diagram is proven by induction over the operational semantics of
+\startproof
+This simulation diagram is proven by induction over the operational semantics of
 3AC, which allows us to find one or more steps in the HTL semantics that will produce the same final
-matching state.~◻
+matching state.
+\stopproof
 
 \subsection[sec:proof:ram_insertion]{Forward Simulation of RAM Insertion}
 
@@ -1158,10 +1127,10 @@ translation proves that the specification holds.
 \subsubsection[from-specification-to-simulation-1]{From Specification to Simulation}
 
 Another simulation proof is performed to prove that the insertion of the RAM is semantics
-preserving. As in Lemma~\goto{{[}lemma:simulation_diagram{]}}[lemma:simulation_diagram], we require
-some invariants that always hold at the start and end of the simulation. The invariants needed for
-the simulation of the RAM insertion are quite different to the previous ones, so we can define these
-invariants $\mathcal{I}_{r}$ to be the following:
+preserving. As in \in{Lemma}[lemma:simulation_diagram], we require some invariants that always hold
+at the start and end of the simulation. The invariants needed for the simulation of the RAM
+insertion are quite different to the previous ones, so we can define these invariants
+$\mathcal{I}_{r}$ to be the following:
 
 \startitemize
 \item
@@ -1173,17 +1142,19 @@ invariants $\mathcal{I}_{r}$ to be the following:
 \stopitemize
 
 The other invariants and assumptions for defining two matching states in HTL are quite similar to
-the simulation performed in Lemma~\goto{{[}lemma:simulation_diagram{]}}[lemma:simulation_diagram],
-such as ensuring that the states have the same value, and that the values in the registers are less
-defined. In particular, the less defined relation matches up all the registers, except for the new
-registers introduced by the RAM.
+the simulation performed in \in{Lemma}[lemma:simulation_diagram], such as ensuring that the states
+have the same value, and that the values in the registers are less defined. In particular, the less
+defined relation matches up all the registers, except for the new registers introduced by the RAM.
 
-\reference[lemma:htl_ram]{} Given an HTL program, the forward-simulation relation should hold after
-inserting the RAM and wiring the load, store, and control signals.
+\startlemma[lemma:htl_ram]
+  Given an HTL program, the forward-simulation relation should hold after inserting the RAM and
+  wiring the load, store, and control signals.
 
-%\startformula \begin{aligned}
-%    \forall h, h', B \in \mono{Safe},\quad \yhfunction{tr\_ram\_ins}(h) = h' \land h \Downarrow B \implies h' \Downarrow B.
-%  \end{aligned} \stopformula
+\startformula
+  \forall h, h', B \in \text{\tt Safe},\quad \text{\tt tr\_ram\_ins}(h) = h' \land h \Downarrow B
+  \implies h' \Downarrow B.
+\stopformula
+\stoplemma
 
 \subsection[sec:proof:htl_verilog]{Forward Simulation from HTL to Verilog}
 
@@ -1192,41 +1163,53 @@ representation of the code to the case-statement representation. The proof is mo
 semantics of a map structure is quite different to that of the case-statement to which it is
 converted.
 
-\reference[lemma:verilog]{} In the following, we write $\mono{tr\_verilog}$ for the translation from
-HTL to Verilog.  (Note that this translation cannot fail, so we do not need the constructor here.)
+\startlemma[lemma:verilog]
+  In the following, we write $\text{\tt tr\_verilog}$ for the translation from HTL to Verilog.
+  (Note that this translation cannot fail, so we do not need the constructor here.)
 
-%\startformula \begin{aligned}
-    %\forall h, V, B \in \mono{Safe},\quad \yhfunction{tr\_verilog} (h) = V \land h \Downarrow B \implies V \Downarrow B.
-  %\end{aligned} \stopformula
+  \startformula
+    \forall h, V, B \in \text{\tt Safe},\quad \text{\tt tr\_verilog} (h) = V \land h \Downarrow B
+    \implies V \Downarrow B.
 
-{\em Proof sketch.} The translation from maps to case-statements is done by turning each node of the
-tree into a case-expression containing the same statements. The main difficulty is that a
-random-access structure is being transformed into an inductive structure where a certain number of
-constructors need to be called to get to the correct case.~◻
+  \stopformula
+\stoplemma
+
+\startproof
+  The translation from maps to case-statements is done by turning each node of the
+  tree into a case-expression containing the same statements. The main difficulty is that a
+  random-access structure is being transformed into an inductive structure where a certain number of
+  constructors need to be called to get to the correct case.
+\stopproof
 
 \subsection[sec:proof:deterministic]{Deterministic Verilog Semantics}
 
 The final lemma we need is that the Verilog semantics is deterministic.  This result allows us to
 replace the forwards simulation we have proved with the backwards simulation we desire.
 
-\reference[lemma:deterministic]{} If a Verilog program $V$ admits behaviours $B_1$ and $B_2$, then
+\startlemma[lemma:deterministic]
+If a Verilog program $V$ admits behaviours $B_1$ and $B_2$, then
 $B_1$ and $B_2$ must be the same.
 
-%\startformula \forall V, B_{1}, B_{2},\quad V \Downarrow B_{1} \land V \Downarrow B_{2} \implies B_{1} = B_{2}. \stopformula
+  \startformula
+    \forall V, B_{1}, B_{2},\quad V \Downarrow B_{1} \land V \Downarrow B_{2} \implies B_{1} =
+    B_{2}.
+  \stopformula
+\stoplemma
 
-{\em Proof sketch.} The Verilog semantics is deterministic because the
-order of operation of all the constructs is defined, so there is only
-one way to evaluate the module, and hence only one possible behaviour.
-This was proven for the small-step semantics shown in
-Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module].~◻
+\startproof
+ The Verilog semantics is deterministic because the order of operation of all the constructs is
+ defined, so there is only one way to evaluate the module, and hence only one possible behaviour.
+ This was proven for the small-step semantics shown in
+ Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module].
+\stopproof
 
 \subsection[coq-mechanisation]{Coq Mechanisation}
 
 \startplacetable[reference=tab:proof-statistics]
   \starttabulate[|l|r|r|r|r|r|]
-    \HL
-    \NC \NC {\bf Coq code} \NC \NC {\bf Spec} \NC \NC {\bf Total} \NC \NR
-    \HL
+    \FL
+    \NC \NS[1][c] {\bf Coq code} \NS[1][c] {\bf Spec} \NC {\bf Total} \NC \NR
+    \ML
     \NC Data structures and libraries \NC 280 \NC --- \NC --- \NC 500 \NC 780 \NC \NR
     \NC Integers and values \NC 98 \NC --- \NC 15 \NC 968 \NC 1081 \NC \NR
     \NC HTL semantics \NC 50 \NC --- \NC 181 \NC 65 \NC 296 \NC \NR
@@ -1235,9 +1218,9 @@ Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module].~◻
     \NC Verilog semantics \NC 78 \NC --- \NC 431 \NC 235 \NC 744 \NC \NR
     \NC Verilog generation \NC 104 \NC --- \NC --- \NC 486 \NC 590 \NC \NR
     \NC Top-level driver, pretty printers \NC 318 \NC 775 \NC --- \NC 189 \NC 1282 \NC \NR
-    \HL
+    \LL
     \NC {\bf Total} \NC 1721 \NC 775 \NC 693 \NC 8355 \NC 11544 \NC \NR
-    \HL
+    \BL
   \stoptabulate
 \stopplacetable