\documentclass[conference]{IEEEtran}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{calc}  
\usepackage{enumitem} 
\usepackage{tabu}
\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage[sort, numbers]{natbib}

\usepackage[textsize=small,shadow]{todonotes}%
\usepackage{soul}
\usepackage{subcaption}
\usepackage{tikz-timing}
\usetikzlibrary{fit}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{shapes,calc,arrows.meta}
\usetikzlibrary{pgfplots.groupplots}
\pgfplotsset{compat=1.16}

\usepackage{listings}
\lstset{
basicstyle=\tt\small,
columns=fixed, %basewidth=0.5em, %tip from https://tex.stackexchange.com/a/179072
escapeinside=||,
}
\setlength{\fboxsep}{1.5pt} % to reduce \colorbox padding when highlighting

%\usepackage{minted}
%\setminted{
%fontsize=\small,
%escapeinside=||,
%}
%\usemintedstyle{manni}

\definecolor{highlight0}{HTML}{ffffff}
\definecolor{highlight1}{HTML}{b4d6d0} % 8dd3c7
\definecolor{highlight2}{HTML}{fcfcdc} % ffffb3
\definecolor{highlight3}{HTML}{c9c7d6} % bebada
\definecolor{highlight4}{HTML}{fcc1bb} % fb8072
\definecolor{highlight5}{HTML}{a9c2d4} % 80b1d3

\newcommand\HL[2]{\colorbox{highlight#1}{\vphantom{A}\smash{#2}}}

% Leave review comments using
% \jwcomment{...} (for John) or \yhcomment{...} (for Yann)
% Using either directly leaves a margin note, using it as,
% e.g. \jwcomment[inline]{...} leaves an inline comment
\newcommand{\jwcomment}[2][]{\todo[author={John}, color=ACMLightBlue, #1]{#2}}
\newcommand{\yhcomment}[2][]{\todo[author={Yann}, color=ACMGreen, #1]{#2}}
\newcommand{\mpcomment}[2][]{\todo[author={Michalis}, color=ACMMagenta, #1]{#2}}


\newif\ifCOMMENTS
\COMMENTStrue
\newcommand{\Comment}[3]{\ifCOMMENTS\textcolor{#1}{{\bf [\![#2:} #3{\bf ]\!]}}\fi}
\newcommand\JW[2][]{\st{#1}\Comment{red!75!black}{JW}{#2}}
\newcommand\YH[2][]{\st{#1}\Comment{blue!50!green}{YH}{#2}}
\newcommand\MP[2][]{\st{#1}\Comment{blue!50!magenta}{MP}{#2}}

\newcommand\vericert{Vericert}
\newcommand\vericertfun{Vericert-Fun}
\newcommand\bambu{Bambu}

\title{Resource Sharing for Verified High-Level Synthesis \\ (Work in Progress)}

\begin{document}

%\author{Michail Pardalos}
%\email{michail.pardalos17@imperial.ac.uk}
%\affiliation{\institution{Imperial College London}}

%\author{Yann Herklotz}
%\email{yann.herklotz15@imperial.ac.uk}
%\affiliation{\institution{Imperial College London}}

%\author{John Wickerson}
%\email{j.wickerson@imperial.ac.uk}
%\affiliation{\institution{Imperial College London}}

\maketitle

\begin{abstract}
  High-level synthesis (HLS) is playing an ever-increasing role in hardware design, but concerns have been raised about its reliability. Seeking to address these concerns, Herklotz et al. have recently developed an HLS compiler, called \vericert{}, that has been mechanically proven (using the Coq proof assistant) to output Verilog designs that are behaviourally equivalent to the input C program. Unfortunately, \vericert{}'s output cannot compete performance-wise with that of established HLS tools such as LegUp. A major reason for this is \vericert{}'s complete lack of support for resource sharing. 
  
  In this paper, we present \vericertfun: \vericert{} extended with function-level resource sharing. Where original \vericert{} creates one block of hardware per function \emph{call}, %\MP{Is this an over-simplification? We don't really generate a `block of hardware' for function calls}
  \vericertfun{} creates one block of hardware per function \emph{definition}. The enabling innovation is to extend \vericert{}'s intermediate language HTL with the ability to represent \emph{multiple} state machines, and then to add support for function calls that transfer control between these state machines. We are working to extend \vericert{}'s formal correctness proof to cover the translation from C source into this extended HTL language, and thence on to Verilog.
  
We have benchmarked \vericertfun's performance on the PolyBench/C suite, and our results show the generated hardware having a resource usage of 61\% of \vericert{}'s on average and 46\% in the best case, for only a 3\% average decrease in max frequency and 1\% average increase in cycle count.
\end{abstract}

\section{Introduction}
\label{sec:introduction}

The drive for faster, more energy-efficient computation has led to a surge in demand for custom hardware accelerators. These devices are traditionally designed using hardware description languages such as Verilog or VHDL, but the complexities of designing in such a language, as well as the abundance of engineers trained in software rather than hardware development, has meant that \emph{high-level synthesis} (HLS) tools have become an enticing option. 

These tools are useful, but doubts have been raised about their reliability. For instance, \citet{Herklotz2021_empiricalstudy} found numerous miscompilation bugs in commercial HLS tools including Xilinx Vivado HLS~\cite{xilinx_vitis}, Intel i++~\cite{intel_hls}, and LegUp~\cite{legup_CanisAndrew2011}. This unreliability can be a significant hindrance in the development process, especially given the long iteration times of hardware design compared to software. It also undermines the usefulness of HLS in safety- or security-critical settings. It is therefore essential to ensure that HLS tools are as reliable as possible.

\vericert{}~\cite{Herklotz2020} is a new HLS tool that aims to address this issue. Its correctness has been checked to the highest possible standard: machine-checked proof. It provides a proof, checked using the Coq proof assistant~\cite{coq}, that every step of its translation from C to Verilog preserves the semantics of (i.e.\ behaves the same way as) its input program. This proof means that we can always trust any Verilog design produced by \vericert{} to behave the same way as the C program given as input. %It is based on the CompCert~\cite{compcert_Leroy2009} verified C compiler.

Clearly, however, it is not enough for an HLS tool simply to be \emph{correct}. The generated hardware must also meet several other desiderata, including high throughput, low latency, and good \emph{area efficiency}, which is the topic of this paper. A common optimisation employed by HLS tools to improve area efficiency is \emph{resource sharing}; that is, re-using hardware for more than one purpose. Accordingly, our work adds resource sharing to \vericert{}. Keeping with the aims of the \vericert{} project, work is ongoing to extend the correctness proof.

\section{Background}

\paragraph{The Coq proof assistant} \vericert{} is implemented using the Coq proof assistant~\cite{coq}. This means that it consists of a collection of functions that define the compilation process, together with the proof of a theorem stating that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, and then automatically translates the function definitions into OCaml code that can be compiled and executed. 

Engineering a software system within a proof assistant like Coq is widely held to be the gold standard for correctness. Recent years have shown that it is feasible to design substantial pieces of software in this way, such as database management systems~\cite{malecha+10}, web servers~\cite{chlipala15}, and operating system kernels~\cite{gu+16}. Coq has also been successfully deployed in the hardware design process, both in academia~\cite{braibant+13, bourgeat+20} and in industry~\cite{silveroak}. It has even been applied specifically to the HLS process: Faissole et al.~\cite{faissole+19} used it to verify that HLS optimisations respect dependencies present in the source code.

\paragraph{The CompCert verified C compiler} Among the most celebrated applications of Coq is the CompCert project~\cite{compcert}. CompCert is a lightly optimising C compiler, with backend support for the Arm, x86, PowerPC, and Kalray VLIW architectures~\cite{six+20}, that has been implemented and proven correct using Coq. CompCert accepts most of the C99 language, and generally produces code of comparable performance to that produced by GCC at optimisation level \texttt{-O1}. It transforms its input through a series of ten intermediate languages before generating the final output. This design ensures that each individual pass remains relatively simple and hence feasible to prove correct. The correctness proof of the entire compiler is formed by composing the correctness proofs of each of its passes. 

That the Csmith compiler testing tool has found hundreds of bugs in GCC and LLVM but has never found a single bug in (the verified parts of) CompCert~\cite{csmith} is a testament to the reliability of this development approach. That said, we cannot guarantee that hardware produced via \vericertfun{} will never go wrong, because of fallibilities in components not covered by the correctness theorem, including the computer checking the proofs, the pretty-printer of the final Verilog design, the synthesis toolchain~\cite{verismith}, and the FPGA device itself.
 
\paragraph{The \vericert{} verified HLS tool}

Introduced by \citet{Herklotz2020}, \vericert{} is a verified C-to-Verilog HLS tool. It is an extension of CompCert, essentially augmenting the existing verified C compiler with a new hardware-oriented intermediate language (called HTL) and a Verilog backend. In its current form, \vericert{} performs no significant optimisations beyond those it inherits from CompCert's frontend. This results in performance generally about one order of magnitude slower than that achieved by comparable, unverified HLS tools such as LegUp~\cite{legup_CanisAndrew2011}.

\vericert{} branches off from CompCert at the intermediate language called \emph{register-transfer language} (RTL). Since that abbreviation is usually used in the hardware community for `register-transfer level', we shall henceforth avoid possible confusion by referring to the CompCert intermediate language as `3AC' (for `three-address code').

In the 3AC language, each function in the program is represented as a numbered list of instructions with gotos -- essentially, a control-flow graph (CFG). The essence of \vericert{}'s compilation strategy is to treat this CFG as a state machine, with each instruction in the CFG becoming a state, and each edge in the CFG becoming a transition. Moreover, program variables that do not have their address taken are mapped to hardware registers; other variables (including arrays and structs) are allocated in a block of RAM that represents the stack. More precisely, \vericert{} builds a \emph{finite state machine with datapath} (FSMD). This comprises two maps, both of which take the current state as their input: the \emph{control logic} map for determining the next state, and a \emph{datapath} map for updating the RAM and registers. These state machines are captured in \vericert{}'s new intermediate language, HTL. When \vericert{} compiles from HTL to the final Verilog output, these maps are converted from proper `mathematical' functions into syntactic Verilog case-statements, and each is placed inside an always-block.

The overall \vericert{} flow is shown at the top of Figure~\ref{fig:flow}. The key point to note here is the `inlining' step, which folds all function definitions into their call sites. This allows \vericert{} to make the simplifying assumption that there is only a single CFG, but has the unwanted effect of duplicating hardware. In this work, we remove some of this inlining and hence some of the duplication.

\paragraph{Resource sharing in HLS}

Resource sharing is a feature expected of most HLS compilers. In a typical architecture generated by HLS~\cite{coussy+09}, a number of `functional components' are selected from a library according to the needs of the specific design, and in the scheduling process, a clock cycle is chosen for each operation such that the components it requires are available. Given the need to mechanically verify the correctness of our implementation, \vericertfun{} follows a simpler approach: we share resources at the granularity of entire functions, rather than individual operations. Function-level resource sharing is implemented in commercial HLS compilers such as the Intel HLS compiler~\cite{intel_hls} or
Xilinx Vitis~\cite{xilinx_vitis}, and is guided by the programmer through appropriate pragmas.

Perna et al.~\cite{perna+12} developed a verified HLS tool for the Handel-C language, but, like \vericert{}, they did not implement function-level resource sharing, instead arranging that ``all procedure and function calls are expanded in the
front-end''.

\begin{figure}
\definecolor{colorflow}{HTML}{8EB85B}

\centering
\begin{tikzpicture}[yscale=-1]
\tikzset{flowlines/.style={draw, colorflow, ultra thick}}

\node(fun1) at (1.5,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (1.5,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node(CFG) at (0.75,4) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD) at (0.75,5) {state machine};
\node[anchor=west](HTL) at (-2,5) {\bf HTL:};
\node(module) at (0.75,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto, pos=0.2] {inlining} (CFG);
\draw[->] (CFG2) to (CFG);
\draw[->] (CFG) to node [auto] (smg) {state machine generation} (FSMD);
\draw[->] (FSMD) to node [auto] {Verilog generation} (module);
\coordinate (r) at (6,3);

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(r)] (background1) {};
\end{pgfonlayer}

\begin{scope}[yshift=55mm]
\node(fun1) at (2,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (2,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD1) at (2,4) {state machine};
\node(FSMD2) at (0,4) {state machine};
\node(FSMD3) at (2,5) {state machine};
\node(FSMD4) at (0,5) {state machine};
\node[anchor=west](HTL) at (-2,4) {\bf HTL:};
\node(module) at (1,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto] (smg) {state machine generation} (FSMD1);
\draw[->] (CFG2) to (FSMD2);
\draw[->] (FSMD1) to node [auto] {renaming} (FSMD3);
\draw[->] (FSMD2) to (FSMD4);
\draw[->] (FSMD3) to node [auto, pos=0.2] {Verilog generation} (module);
\draw[->] (FSMD4) to (module);
\coordinate (r) at (6,3);

\end{scope}

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(r)] (background2) {};
\end{pgfonlayer}

\draw[flowlines, -latex] (background1.south) to node [auto] {\bf this work} (background2.north -| background1.south);

\end{tikzpicture}
\caption{Key compilation passes and intermediate languages in \vericert{}~\cite{Herklotz2020} (top) and \vericertfun{} (bottom)}
\label{fig:flow}
\end{figure}

\section{Implementation of \vericertfun}

\tikzset{
st/.style={draw=black, fill=white, rounded corners, align=left, font=\tt\footnotesize, minimum width=40mm},
ed/.style={draw, ->},
exted/.style={draw, thick, dotted, ->, rounded corners},
lab/.style={anchor=south west, inner sep=1pt, font=\tiny},
tit/.style={anchor=north west, font=\tt\footnotesize},
edlab/.style={auto, inner sep=2pt, align=left, font=\tt\footnotesize}
}

In this section, we shall explain the implementation of \vericertfun, using Figure~\ref{fig:example_C} as a worked example. The basic idea is shown at the bottom of Figure~\ref{fig:flow}: we avoid inlining the function calls at the 3AC level (except in certain circumstances described below), instead maintaining one state machine per function. All the state machines run simultaneously, and function calls are implemented by sending messages between the state machines. We combine all of these state machines into a single Verilog module, after renaming variables as necessary to avoid clashes.

\begin{figure}
\centering
\begin{minipage}[b]{0.45\linewidth}
\begin{lstlisting}
int add(int a, int b) 
{ 
  |\HL5{return a + b;}| 
}

int main() 
{
  |\HL1{int v = 0;}|
  |\HL2{v = add(v, 1);}|
  |\HL3{v = add(v, 2);}|
  |\HL4{return v;}|
}
\end{lstlisting}
\caption{A simple C program with each instruction colour-coded so it can be tracked through the compilation stages}
\label{fig:example_C}
\end{minipage}
\hfill
\begin{minipage}[b]{0.5\linewidth}
% \begin{lstlisting}
% add (x2, x1) {
%   |\HL5{2: x3 = x2 + x1 + 0 (int)}|
%   |\HL5{1: return x3}|
% }
% main () {
%   |\HL1{9: x3 = 0}|
%   |\HL2{8: x6 = 1}|
%   |\HL2{7: x1 = "add"(x3, x6)}|
%   |\HL2{6: x3 = x1}|
%   |\HL3{5: x5 = 2}|
%   |\HL3{4: x2 = "add"(x3, x5)}|
%   |\HL3{3: x3 = x2}|
%   |\HL4{2: x4 = x3}|
%   |\HL4{1: return x4}|
% }
% \end{lstlisting}
\begin{tikzpicture}[node distance=2mm]
\node[st, fill=highlight5]    (a2) at (0,-1) {x3 = x2 + x1 + 0(int)};
\node[st, fill=highlight5, below=of a2] (a1) {return x3};
\draw[ed] (a2) to (a1);
\node[tit] at ([xshift=-4mm, yshift=5mm]a2.north west) (labadd) {add(x2, x1) \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]a1.south west) (labadd2) {\}};
\begin{scope}[yshift=-24mm]
\node[st, fill=highlight1]    (m9) at (0,-1) {x3 = 0};
\node[st, fill=highlight2, below=of m9] (m8) {x6 = 1};
\node[st, fill=highlight2, below=of m8] (m7) {x1 = "add"(x3, x6)};
\node[st, fill=highlight2, below=of m7] (m6) {x3 = x1};
\node[st, fill=highlight3, below=of m6] (m5) {x5 = 2};
\node[st, fill=highlight3, below=of m5] (m4) {x2 = "add"(x3, x5)};
\node[st, fill=highlight3, below=of m4] (m3) {x3 = x2};
\node[st, fill=highlight4, below=of m3] (m2) {x4 = x3};
\node[st, fill=highlight4, below=of m2] (m1) {return x4};
\draw[ed] (m9) to (m8);
\draw[ed] (m8) to (m7);
\draw[ed] (m7) to (m6);
\draw[ed] (m6) to (m5);
\draw[ed] (m5) to (m4);
\draw[ed] (m4) to (m3);
\draw[ed] (m3) to (m2);
\draw[ed] (m2) to (m1);
\node[tit] at ([xshift=-4mm, yshift=5mm]m9.north west) (labmain) {main() \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]m1.south west) (labmain2) {\}};
\end{scope}
\foreach\i in {1,2,...,9}
  \node[lab, anchor=south east] at (m\i.west) {\i};
\foreach\i in {1,2}
  \node[lab, anchor=south east] at (a\i.west) {\i};
\end{tikzpicture}
\caption{3AC representation comprising two CFGs}
\label{fig:example_3AC}
\end{minipage}

\end{figure}

Figure~\ref{fig:example_3AC} shows the 3AC representation of that C program, as obtained by the CompCert frontend. It takes the form of two CFGs, one per function. The control flow in this example is straightforward, but in general, 3AC programs can contain unrestricted gotos. The nodes of the CFGs are numbered in reverse order, as are the parameters of the \lstinline{add} function; both are conventions inherited from CompCert.

Figure~\ref{fig:example_HTL} depicts the result of converting those CFGs into two state machines. The conversion of 3AC instructions into Verilog instructions has been described already by \citet{Herklotz2020}; what is new here is the handling of function calls, so the following description concentrates on that aspect. Before we proceed, there are two conventions worth noting: first, ``$*{\rightarrow}\langle\mathit{node}\rangle$'' should be interpreted as an abbreviation for edges from all nodes to $\langle\mathit{node}\rangle$, and second, we allow the \lstinline{main} machine to refer to the \lstinline{add} machine's register $\langle\mathit{reg}\rangle$ by writing ``\lstinline{add.}$\langle\mathit{reg}\rangle$''.

The solid edges within the two state machines indicate the transfer of control between states, while the dashed edges between the state machines are more `fictional'. The ground truth is that both state machines run continuously, but it is convenient to think of only one of the machines as doing useful work at any point in time. So, the dashed edges indicate the points at which the `active' machine changes.

\definecolor{background}{HTML}{eeeeee}
\begin{figure}
\centering
\begin{tikzpicture}[node distance=5mm]
\node[st, fill=highlight1] (m9) at (0,0)     {reg\_3 <= 0;};
\node[st, fill=highlight2, below=of m9] (m8) {reg\_6 <= 1;};
\node[st, fill=highlight2, below=of m8] (m7) {add.rst <= 1; \\ add.param\_0 <= reg\_3; \\ add.param\_1 <= reg\_6;};
\node[st, fill=highlight2, below=of m7] (m12) {add.rst <= 0; \\ reg\_1 <= add.ret;};
\node[st, fill=highlight2, below=of m12] (m6) {reg\_3 <= reg\_1;};
\node[st, fill=highlight3, below=of m6] (m5)  {reg\_5 <= 2;};
\node[st, fill=highlight3, below=of m5] (m4) {add.rst <= 1; \\ add.param\_0 <= reg\_3; \\ add.param\_1 <= reg\_5;};
\node[st, fill=highlight3, below=of m4] (m10) {add.rst <= 0; \\ reg\_2 <= add.ret;};
\node[st, fill=highlight3, below=of m10] (m3) {reg\_3 <= reg\_2;};
\node[st, fill=highlight4, below=of m3] (m2)  {reg\_4 <= reg\_3;};
\node[st, fill=highlight4, below=of m2] (m1)  {fin = 1; \\ ret = reg\_4;};
\node[st, fill=highlight4, below=of m1] (m11) {fin <= 0;};

\node[st, fill=highlight5, above=38mm of m9] (a2) {reg\_7 <= \{\{x2 + x1\} + 0\};};
\node[st, fill=highlight5, below=of a2] (a1) {fin = 1; \\ ret = reg\_7;};
\node[st, fill=highlight5, below=of a1] (a3) {fin <= 0;};

\draw[ed] (m9) to node[edlab] {!rst} (m8);
\draw[ed] (m8) to node[edlab] {!rst} (m7);
\draw[ed] (m7) to node[edlab] {!rst} (m12);
\draw[ed] (m12) to node[edlab] {!rst \&\& add.fin} (m6);
\draw[ed] (m6) to node[edlab] {!rst} (m5);
\draw[ed] (m5) to node[edlab] {!rst} (m4);
\draw[ed] (m4) to node[edlab] {!rst} (m10);
\draw[ed] (m10) to node[edlab] {!rst \&\& add.fin} (m3);
\draw[ed] (m3) to node[edlab] {!rst} (m2);
\draw[ed] (m2) to node[edlab] {!rst} (m1);
\draw[ed] (m1) to node[edlab] {!rst} (m11);
\draw[ed] (m12) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add.fin} (m12);
\draw[ed] (m10) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add.fin} (m10);
\draw[ed] (m11) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (m11);

\node[inner sep=1pt] (ast1) at ([xshift=5mm, yshift=-5mm]m9.south east) {*};
\draw[ed] (ast1.north) to[bend right] node[edlab, swap] {rst} (m9.east);

\draw[ed] (a2) to node[edlab] {!rst} (a1);
\draw[ed] (a1) to node[edlab] {!rst} (a3);
\draw[ed] (a3) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (a3);

\node[inner sep=1pt] (ast2) at ([xshift=5mm, yshift=-5mm]a2.south east) {*};
\draw[ed] (ast2.north) to[bend right] node[edlab, swap] {rst} (a2.east);

\coordinate (nw1) at ([xshift=-3mm, yshift=6mm]m9.north west); 
\coordinate (se1) at ([xshift=16mm, yshift=-4mm]m11.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw1)(se1)] (main) {};
\end{pgfonlayer}
\node[tit, above=2mm of m9] (labmain) {\bfseries main()};
\coordinate (nw2) at ([xshift=-3mm, yshift=6mm]a2.north west); 
\coordinate (se2) at ([xshift=16mm, yshift=-4mm]a3.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw2)(se2)] (add) {};
\end{pgfonlayer}
\node[tit, above=2mm of a2] (labadd) {\bfseries add(x2,x1)};

\foreach\i in {1,2,...,12}
  \node[lab] at (m\i.north west) {\i};
\foreach\i in {1,2,3}
  \node[lab] at (a\i.north west) {\i};

\draw[exted] (m7.west) -- ++(-7mm,0) |- ([yshift=-1mm]a2.west);
\draw[exted] (m4.west) -- ++(-9mm,0) |- ([yshift=1mm]a2.west);
\draw[exted] ([yshift=-1mm]a3.east) -- ++(19mm,0) |- (m6.east);
\draw[exted] ([yshift=1mm]a3.east) -- ++(21mm,0) |- (m3.east);

\end{tikzpicture}
\caption{HTL representation comprising two state machines}
\label{fig:example_HTL}
\medskip
\begin{tikztimingtable}[timing/d/background/.style={fill=white}, timing/xunit=1em, timing/yunit=0.6em]
{\tt\footnotesize clk}\!\!\! & 0.5L 38{0.5C} \\
{\tt\footnotesize rst}\!\!\! & 0.5H 19L \\
{\tt\footnotesize fin}\!\!\! & 0.5X 17L H L \\
{\tt\footnotesize state}\!\!\! & 0.5X D{9} D{8} D{7} 4D{12} D{6} D{5} D{4} 4D{10} D{3} D{2} D{1} 2D{11} \\
{\tt\footnotesize rst}\!\!\! & 3.5X H 6L H 8L \\
{\tt\footnotesize fin}\!\!\! & 4.5X 2L H 6L H 5L \\
{\tt\footnotesize state}\!\!\! & 4.5X D{2} D{1} 5D{3} D{2} D{1} 6D{3} \\
\extracode
\foreach\i in {1,2,...,19}
\node[help lines, anchor=west, inner sep=0] at (\i-0.5,2.25) {\tiny \i};
\begin{pgfonlayer}{background}
  \vertlines[help lines]{0.5,1.5,...,18.5}
\end{pgfonlayer}
\draw[decoration={brace,mirror,raise=5pt},decorate] (-2.6,-1) to node[left=6pt] {\tt\footnotesize main} (-2.6,-6.5);
\draw[decoration={brace,mirror,raise=5pt},decorate] (-2.6,-7) to node[left=6pt] {\tt\footnotesize
add} (-2.6,-12.5);
\end{tikztimingtable}
\caption{Timing diagram for the state machines in Figure~\ref{fig:example_HTL}}
\label{fig:timingdiagram}

\end{figure}

In more detail: Execution begins in state 9 of the \lstinline{main} machine, and proceeds through successive states until it reaches state 7, in which the \lstinline{add} machine's \lstinline{rst} signal is raised. This puts the \lstinline{add} machine into state 2. When the \lstinline{main} machine advances to state 12, that \lstinline{rst} signal is lowered; the \lstinline{add} machine then begins its computation while the \lstinline{main} machine spins in state 12. When the \lstinline{add} machine has finished its computation (state 1), it asserts its \lstinline{fin} signal. This allows the \lstinline{main} machine to progress from state 12. Finally, the \lstinline{add} machine lowers its \lstinline{fin} and waits in state 3 until the next call.

The same sequence of events can also be understood using a timing diagram, as given in Figure~\ref{fig:timingdiagram}. In that diagram, the red lines indicate unspecified values. We see that each call of \lstinline{add} begins with a pulse on \lstinline{add.rst} (fifth waveform) and ends with a pulse on \lstinline{add.fin} (sixth waveform).

One technical challenge we encountered in the implementation of \vericertfun{} has to do with the fact that the caller and callee state machines modify each other's variables. This is problematic because each function is translated into a state machine independently, and hence the variable names used in the other state machines are not necessarily fixed yet. We work around this problem by introducing an additional ... \JW{TODO: finish explaining externctrl.}

\JW{Explain the assumption made about no pointers in called functions.}

\section{Proving \vericertfun{} correct}

The CompCert correctness theorem~\cite{compcert} expresses that every behaviour that can be exhibited by the compiled program is also a behaviour of the source program. \vericert{}~\cite{Herklotz2020} adapted this theorem for HLS by replacing `compiled program' with `generated Verilog design'. In both cases, a formal semantics is required for the source and target languages. \vericertfun{} targets the same fragment of the Verilog language as \citeauthor{Herklotz2020} already mechanised in Coq, so no changes are required there. 

Where changes \emph{are} required is in the semantics of the intermediate language HTL, which sits between CompCert's 3AC and the final Verilog.
When \citeauthor{Herklotz2020} designed HTL, they did not include a semantics for function calls because they assumed all function calls would already have been inlined. We have extended HTL so that its semantics is additionally parameterised by an environment that maps function names to state machines. We have added a semantics for function calls that looks up the named function in this environment, activates the corresponding state machine, and creates a new frame on the stack. We have also added a semantics for return statements that pops the current frame off the stack and reactivates the caller's state machine. \MP{TODO: Consider saying something here about how the effect of a function call is to put the state machine into a particular `callstate'. My concern is that this requires introducing a different notion of `state', distinct from the states of the state machine, and that this will probably confuse readers.}

At the point of writing, the correctness of \vericertfun{} from C to HTL has been mostly proven,
meaning basic instructions, as well as the function calls themselves and their interaction with the
stack is proven correct.  Proofs of the load and store instructions are not quite finished, as
additional invariants need to be proven to hold for those instructions.  The pass that renames
variables in HTL is yet to be proven, as is the pass that generates the final Verilog.

To give a rough idea of the scale and complexity of the task: the implementation of \vericertfun{} involved the addition of about \MP{\ref{???}} lines of Coq code to \vericert{} and took the first author \MP{\ref{???}} months. The correctness proof, so far, has taken \MP{\ref{???}} lines of Coq code and \MP{\ref{???}} months.

\section{Performance evaluation}

We now compare the performance of the hardware generated by \vericertfun{} against that generated by Vericert. Following \citet{Herklotz2020}, we use the PolyBench/C benchmark suite~\cite{polybench}. We used the Icarus Verilog simulator to determine the cycle counts of the generated designs. We used Xilinx Vivado 2017.1, targeting a Xilinx 7-series FPGA (XC7K70T) to determine area usage and fmax.

%Figure~\ref{fig:results} summarises our results. The x-axis shows the impact of resource sharing on
%the speed of the hardware (as calculated by the cycle count divided by fmax); we see that all the
%data points lie very close to 1, which suggests no significant impact. On average the cycle count
%increases by 0.7\%; this modest increase is in line with expectations because our translation
%introduces an extra state per function call. The impact on fmax is similarly minimal, ranging
%between a 1.5\% increase and a 3.1\% decrease (0.2\% decrease on average).

Figure~\ref{fig:results} shows the area savings of \vericertfun{} obtained compared to \vericert{},
and also shows their relative execution speed.  These are compared the \bambu{} high-level synthesis
tool~\cite{pilato13_bambu}, which is open source but unverified.  First, one can can note that
designs generated by \vericertfun{} and \vericert{} execute for a same amount of time, as any
variations can mostly be explained by slight differences in the maximum frequency calculation.  This
shows that the addition of the resource sharing does not negatively affect the execution time.  The
area difference between \vericertfun{} and \vericert{} is quite large, which shows the effectiveness
of the resource sharing that \vericertfun{} introduces.  On average, the area of \vericert{} designs
is 1.4$\times$ larger than designs produced by \bambu{}, whereas designs produced by \vericertfun{}
are on average 0.7$\times$ the size of \bambu{} designs.

\pgfplotstableread[col sep=comma]{data/time-ratio.csv}{\divtimingtable}
\pgfplotstableread[col sep=comma]{data/slice-ratio.csv}{\divslicetable}
\definecolor{vericertcol}{HTML}{66C2A5}
\definecolor{legupnooptcol}{HTML}{FC8D62}
\definecolor{legupnooptnochaincol}{HTML}{8DA0CB}
\newcommand\backgroundbar[2][5]{\draw[draw=none, fill=black!#1] (axis cs:#2*2+0.5,0.1) rectangle
  (axis cs:1+#2*2+0.5,300);}
\newcommand\legup{Bambu}

\begin{figure}
  \begin{tikzpicture}[scale=0.8]
    \begin{groupplot}[
      group style={
        group name=my plots,
        group size=1 by 2,
        xlabels at=edge bottom,
        xticklabels at=edge bottom,
        vertical sep=5pt,
      },
      ymode=log,
      ybar=0.4pt,
      width=0.6\textwidth,
      height=0.32\textwidth,
      /pgf/bar width=2pt,
      legend pos=south east,
      log ticks with fixed point,
      xticklabels from table={\divtimingtable}{benchmark},
      legend style={nodes={scale=0.7, transform shape}},
      x tick label style={rotate=90,anchor=east,font=\footnotesize},
      legend columns=-1,
      xtick=data,
      enlarge x limits={abs=0.5},
      ylabel style={font=\footnotesize},
      xtick style={draw=none},
      ]

      \nextgroupplot[ymin=1,ymax=7,ylabel={Execution time relative to \legup{}}]
      \pgfplotsinvokeforeach{0,...,12}{%
        \backgroundbar{#1}}
      \backgroundbar[10]{13}
      \addplot+[vericertcol] table [x expr=\coordindex,y=vericert,col sep=comma] from \divtimingtable;
      \addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divtimingtable;
      \draw (axis cs:-1,1) -- (axis cs:28,1);
      % JW: redraw axis border which has been partially covered by the grey bars
      \draw (axis cs:-0.5,0.8) rectangle (axis cs:27.5,300);

      \nextgroupplot[ymin=0.3,ymax=5,ylabel={Area relative to \legup{}}]
      \pgfplotsinvokeforeach{0,...,12}{%
        \backgroundbar{#1}}
      \backgroundbar[10]{13}
      \addplot+[vericertcol] table [x expr=\coordindex,y=vericert,col sep=comma] from \divslicetable;
      \addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divslicetable;
      \draw (axis cs:-1,1) -- (axis cs:28,1);
      % JW: redraw axis border which has been partially covered by the grey bars
      \draw (axis cs:-0.5,0.3) rectangle (axis cs:27.5,10);

      \legend{\vericert{},vericert-fun};
    \end{groupplot}
  \end{tikzpicture}
  \caption{Performance of \vericert{} compared to \vericertfun{}, using \legup{} as a baseline.}
\label{fig:results}
\end{figure}

\section{Future work}

Our immediate priority is to complete \vericertfun's correctness proof. In the medium term, we would
like to improve our implementation of resource sharing by dropping the requirement to inline
functions that access pointers; we anticipate that this will lead to further reductions in area
usage. We would also like to make \vericertfun generate designs with one Verilog module per C
function, as this is more idiomatic than packing all the state machines into a single module; we did
not do this yet because it would require extending the formal Verilog semantics to handle multiple
modules. \YH{TODO: Please check that sentence.}\YH{Yes that looks correct to me.} In the longer
term, we are considering how to implement resource sharing even more effectively in a verified HLS
tool, perhaps by implementing it as part of a resource-constrained scheduling algorithm~\cite{sdc}.


\bibliographystyle{ACM-Reference-Format}
\bibliography{references}

\appendix

We provide the Verilog output that is generated from our running example (Figures~\ref{fig:example_C} to~\ref{fig:example_HTL}).

\begin{lstlisting}
module main (clk, rst, ret, fin)
  input clk, rst;
  output reg [31:0] ret;
  output reg fin;
  reg[31:0] state;
  reg[31:0] reg_1, reg_2, reg_3, reg_4, reg_5, reg_6;
  reg add_rst, add_fin; 
  reg[31:0] add_ret, add_x2, add_x1, reg_7, add_state;
  
  always @(posedge clk) // control logic for "add"
    if ({add_rst == 1}) begin 
      add_state <= 2; add_fin <= 0; 
    end else begin 
      case (add_state)
        |\HL5{2: add\_state <= 1;}|
        |\HL5{1: add\_state <= 3;}|
        3: ;
        default: ;
      endcase
    end   
    
  always @(posedge clk) // datapath for "add"    
    case (add_state)
      |\HL5{2: reg\_7 <= {{add\_x2 + add\_x1} + 0};}|
      |\HL5{1: add\_fin = 1;}|
         |\HL5{add\_ret = reg\_7;}|
      3: add_fin <= 0;
    endcase

  always @(posedge clk) // control logic for "main"
    if ({rst == 1}) begin 
      state <= 9; fin <= 0; 
    end else begin
      case (state)
        |\HL1{9: state <= 8;}|
        |\HL2{8: state <= 7;}|
        |\HL2{7: state <= 12;}|
       |\HL2{12: if ({add\_fin == 1}) state <= 6;}|
        |\HL2{6: state <= 5;}|
        |\HL3{5: state <= 4;}|
        |\HL3{4: state <= 10;}|
       |\HL3{10: if ({add\_fin == 1}) state <= 3;}|
        |\HL3{3: state <= 2;}|
        |\HL4{2: state <= 1;}|
        |\HL4{1: state <= 11;}|
       |\HL0{11: ;}|
      endcase
    end

  always @(posedge clk) // datapath for "main"
    case (state)
      |\HL1{9: reg\_3 <= 0;}|
      |\HL2{8: reg\_6 <= 1;}|
      |\HL2{7: add\_rst <= 1;}|
         |\HL2{add\_x2 <= reg\_3;}|
         |\HL2{add\_x1 <= reg\_6;}|
     |\HL2{12: add\_rst <= 0;}|
         |\HL2{reg\_1 <= add\_ret;}|
      |\HL2{6: reg\_3 <= reg\_1;}|
      |\HL3{5: reg\_5 <= 2;}|
      |\HL3{4: add\_rst <= 1;}|
         |\HL3{add\_x2 <= reg\_3;}|
         |\HL3{add\_x1 <= reg\_5;}|
     |\HL3{10: add\_rst <= 0;}|
         |\HL3{reg\_2 <= add\_ret;}|
      |\HL3{3: reg\_3 <= reg\_2;}|
      |\HL4{2: reg\_4 <= reg\_3;}|
      |\HL4{1: fin = 1;}|
         |\HL4{ret = reg\_4;}|
     |\HL0{11: fin <= 0;}|
    endcase
endmodule
\end{lstlisting}

\end{document}