verified_resource_sharing.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522

\documentclass[conference]{IEEEtran}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{calc}  
\usepackage{enumitem} 
\usepackage{tabu}
\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage[sort, numbers]{natbib}

\usepackage[textsize=small,shadow]{todonotes}%
\usepackage{soul}
\usepackage{subcaption}
\usepackage{tikz-timing}
\usetikzlibrary{fit}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{shapes,calc,arrows.meta}
\usetikzlibrary{pgfplots.groupplots}
\pgfplotsset{compat=1.16}

\usepackage{listings}
\lstset{
basicstyle=\tt\small,
columns=fixed, %basewidth=0.5em, %tip from https://tex.stackexchange.com/a/179072
escapeinside=||,
}
\setlength{\fboxsep}{1.5pt} % to reduce \colorbox padding when highlighting

%\usepackage{minted}
%\setminted{
%fontsize=\small,
%escapeinside=||,
%}
%\usemintedstyle{manni}

\definecolor{highlight0}{HTML}{ffffff}
\definecolor{highlight1}{HTML}{b4d6d0} % 8dd3c7
\definecolor{highlight2}{HTML}{fcfcdc} % ffffb3
\definecolor{highlight3}{HTML}{c9c7d6} % bebada
\definecolor{highlight4}{HTML}{fcc1bb} % fb8072
\definecolor{highlight5}{HTML}{a9c2d4} % 80b1d3

\newcommand\HL[2]{\colorbox{highlight#1}{\vphantom{A}\smash{#2}}}

% Leave review comments using
% \jwcomment{...} (for John) or \yhcomment{...} (for Yann)
% Using either directly leaves a margin note, using it as,
% e.g. \jwcomment[inline]{...} leaves an inline comment
\newcommand{\jwcomment}[2][]{\todo[author={John}, color=ACMLightBlue, #1]{#2}}
\newcommand{\yhcomment}[2][]{\todo[author={Yann}, color=ACMGreen, #1]{#2}}
\newcommand{\mpcomment}[2][]{\todo[author={Michalis}, color=ACMMagenta, #1]{#2}}


\newif\ifCOMMENTS
\COMMENTStrue
\newcommand{\Comment}[3]{\ifCOMMENTS\textcolor{#1}{{\bf [\![#2:} #3{\bf ]\!]}}\fi}
\newcommand\JW[2][]{\st{#1}\Comment{red!75!black}{JW}{#2}}
\newcommand\YH[2][]{\st{#1}\Comment{blue!50!green}{YH}{#2}}
\newcommand\MP[2][]{\st{#1}\Comment{blue!50!magenta}{MP}{#2}}

\newcommand\JWchanged[1]{\textcolor{red!75!black}{#1}}

\newcommand\vericert{Vericert}
\newcommand\vericertfun{Vericert-Fun}
\newcommand\bambu{Bambu}

\title{Resource Sharing for Verified High-Level Synthesis}
\author{
\IEEEauthorblockN{Michalis Pardalos}
\IEEEauthorblockA{Imperial College London, UK \\ Email: mpardalos@gmail.com}
\and
\IEEEauthorblockN{Yann Herklotz}
\IEEEauthorblockA{Imperial College London, UK \\ Email: yann.herklotz15@imperial.ac.uk}
\and
\IEEEauthorblockN{John Wickerson}
\IEEEauthorblockA{Imperial College London, UK \\ Email: j.wickerson@imperial.ac.uk}
}


\IEEEoverridecommandlockouts
\IEEEpubid{\makebox[\columnwidth]{978-1-6654-8332-2/22/\$31.00~\copyright2022 IEEE \hfill}
\hspace{\columnsep}\makebox[\columnwidth]{ }}

\begin{document}

\maketitle

\IEEEpubidadjcol

\begin{abstract}
High-level synthesis (HLS) is playing an ever-increasing role in hardware design, but concerns have been raised about its reliability. Seeking to address these, Herklotz et al. have developed an HLS compiler called \vericert{} that has been mechanically proven (using the Coq proof assistant) to output Verilog designs that are behaviourally equivalent to the input C program. Unfortunately, \vericert{} cannot compete performance-wise with established HLS tools, and a major reason for this is \vericert{}'s complete lack of support for \emph{resource sharing}. 
  
This paper introduces \vericertfun: \vericert{} extended with function-level resource sharing. Where original \vericert{} creates one block of hardware per function \emph{call}, \vericertfun{} creates one block of hardware per function \emph{definition}. To enable this, we extend \vericert{}'s intermediate language HTL with the ability to represent \emph{multiple} state machines, and we implement function calls by transferring control between these state machines. We are working to extend \vericert{}'s correctness proof to cover the translation from C into this extended HTL and thence to Verilog. Benchmarking on the PolyBench/C suite indicates that \vericertfun{} generates hardware with about 0.8$\times$ the resource usage of \vericert{}'s on average, with minimal impact on execution time.
\end{abstract}

\section{Introduction}
\label{sec:introduction}

The drive for faster, more energy-efficient computation has led to a surge in demand for custom hardware accelerators. This, in turn, has led to interest in \emph{high-level synthesis} (HLS) as a means to program these devices.
Yet doubts have been raised about the reliability of the current crop of HLS tools. For instance, \citet{Herklotz2021_empiricalstudy} found numerous miscompilation bugs in Xilinx Vivado HLS~\cite{xilinx_vitis}, the Intel HLS Compiler~\cite{intel_hls}, and LegUp~\cite{legup_CanisAndrew2011}. This unreliability can be a significant hindrance for developers, and it undermines the usefulness of HLS in safety- or security-critical settings. %It is therefore essential to ensure that HLS tools are as reliable as possible.

Aiming to address this issue is \vericert{}~\cite{vericert}, a new HLS tool whose correctness has been verified to the highest possible standard: a computer-checked proof that any Verilog design it produces will behave the same way as the C program given as input. %It is based on the CompCert~\cite{compcert_Leroy2009} verified C compiler.
Yet it is not enough for an HLS tool simply to be \emph{correct}; the generated hardware must also enjoy high throughput, low latency, and good \emph{area efficiency} -- the last of which is the topic of this paper. 

A common optimisation employed by HLS tools to improve area efficiency is \emph{resource sharing}; that is, \JWchanged{mapping multiple operations to the same hardware unit}. Accordingly, our work adds function-level resource sharing to \vericert{}, yielding a new prototype HLS tool called `\vericertfun'. In line with the aims of the \vericert{} project, work is ongoing to extend the correctness proof.
The entire \vericertfun{} development is fully open-source~\cite{vericertfun-github}, and more details about the implementation and proofs are available in a technical report~\cite{pardalos_thesis}.

\section{Background}

\paragraph{The Coq proof assistant} \vericert{} is implemented in Coq~\cite{coq}, which means it consists of a collection of functions that define the compiler, together with the proof of a theorem that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, then translates the function definitions into OCaml code that can be compiled and executed. 
Developing software within a proof assistant like Coq is widely held to be the gold standard for correctness, and recent years have shown that substantial systems can be produced in this way, such as database systems~\cite{malecha+10}, web servers~\cite{chlipala15}, and OS kernels~\cite{gu+16}. Coq has also been deployed in hardware design, both in academia~\cite{braibant+13, bourgeat+20} and in industry~\cite{silveroak}. It has even been applied specifically to HLS: Faissole et al.~\cite{faissole+19} used it to verify that HLS optimisations respect dependencies in the source code.

\paragraph{The CompCert verified C compiler} Among the most celebrated applications of Coq is CompCert~\cite{compcert}, a lightly optimising C compiler with backend support for the Arm, x86, PowerPC, and Kalray VLIW architectures~\cite{six+20}. 
% CompCert accepts most of the C99 language, and generally performs comparably to GCC at \texttt{-O1}. 
It transforms its input through a series of ten intermediate languages before generating the final output. 
%This design ensures that each individual pass remains relatively simple and hence feasible to prove correct.
The correctness proof of the entire compiler is formed by composing the correctness proofs of each of those passes. 
That the Csmith compiler testing tool~\cite{csmith} has found hundreds of bugs in GCC and LLVM but none in (the verified parts of) CompCert is a testament to the reliability of this development approach. 
That said, we cannot guarantee \vericertfun{} designs will never go wrong, because of fallible components not covered by the theorem, such as the synthesis toolchain~\cite{verismith} and the FPGA itself.
 
\paragraph{The \vericert{} verified HLS tool}

Introduced by \citet{vericert}, \vericert{} is a verified C-to-Verilog HLS tool, built by extending CompCert with a new hardware-oriented intermediate language (called HTL) and a Verilog backend. 
\vericert{} branches off from CompCert at the intermediate language called \emph{register-transfer language} (which we shall call `3AC', for `three-address code', to avoid confusion with `register-transfer level').  
In 3AC, each function is represented as a numbered list of instructions with gotos -- i.e., a control-flow graph (CFG). \vericert{}'s compilation strategy is to treat this CFG as a state machine, with each instruction in the CFG being a state, and each edge in the CFG being a transition. The stack is implemented in a block of RAM, and program variables that do not have their address taken are mapped to hardware registers. More precisely, \vericert{} builds a \emph{finite state machine with datapath} (FSMD). This comprises two maps, both taking the current state as their input: the \emph{control logic} map for determining the next state, and a \emph{datapath} map for updating the RAM and registers. These state machines are captured in \vericert{}'s new intermediate language, HTL. When \vericert{} compiles from HTL to the final Verilog output, these maps are converted from mathematical functions into syntactic Verilog case-statements, and placed inside always-blocks.

\vericert{} currently performs no significant optimisations beyond those inherited from CompCert's frontend. This results in performance generally about an order of magnitude slower than that achieved by comparable, unverified HLS tools.
The overall \vericert{} flow is shown in Figure~\ref{fig:flow} (top). Note the `inlining' step, which folds all function definitions into their call sites. This allows \vericert{} to make the simplifying assumption that there is only a single CFG, but has the unwanted effect of duplicating hardware. \vericertfun{} removes some of this inlining and hence some of the duplication.

\paragraph{Resource sharing in HLS}

Resource sharing is a feature expected of most HLS compilers. In a typical HLS-generated architecture~\cite{coussy+09}, several `functional components' are selected from a library according to the needs of the specific design, and in the scheduling process, each operation is allotted a clock cycle in which the required components are all available. Given the need to mechanically verify the correctness of our implementation, \vericertfun{} follows a simpler approach: we share resources at the granularity of entire functions, rather than individual operations. Function-level resource sharing is implemented in commercial HLS compilers such as the Intel HLS compiler~\cite{intel_hls} or
Xilinx Vitis~\cite{xilinx_vitis}, and is guided by the programmer through pragmas.

Perna et al.~\cite{perna+12} developed a verified HLS tool for the Handel-C language, but, like \vericert{}, they did not implement function-level resource sharing, instead arranging that ``all procedure and function calls are expanded in the
front-end''.

\begin{figure}
\definecolor{colorflow}{HTML}{8EB85B}

\centering
\begin{tikzpicture}[yscale=-1]
\tikzset{flowlines/.style={draw, colorflow, ultra thick}}

\node(fun1) at (1.5,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (1.5,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node(CFG) at (0.75,4) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD) at (0.75,5) {state machine};
\node[anchor=west](HTL) at (-2,5) {\bf HTL:};
\node(module) at (0.75,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto, pos=0.2] {inlining} (CFG);
\draw[->] (CFG2) to (CFG);
\draw[->] (CFG) to node [auto] (smg) {state machine generation} (FSMD);
\draw[->] (FSMD) to node [auto] {Verilog generation} (module);
\coordinate (r) at (6,3);

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(r)] (background1) {};
\end{pgfonlayer}

\begin{scope}[yshift=55mm]
\node(fun1) at (2,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (2,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD1) at (2,4) {state machine};
\node(FSMD2) at (0,4) {state machine};
\node(FSMD3) at (2,5) {state machine};
\node(FSMD4) at (0,5) {state machine};
\node[anchor=west](HTL) at (-2,4) {\bf HTL:};
\node(module) at (1,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto] (smg) {state machine generation} (FSMD1);
\draw[->] (CFG2) to (FSMD2);
\draw[->] (FSMD1) to node [auto] {renaming} (FSMD3);
\draw[->] (FSMD2) to (FSMD4);
\draw[->] (FSMD3) to node [auto, pos=0.2] {Verilog generation} (module);
\draw[->] (FSMD4) to (module);
\coordinate (r) at (6,3);

\end{scope}

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(r)] (background2) {};
\end{pgfonlayer}

\draw[flowlines, -latex] (background1.south) to node [auto] {\bf this work} (background2.north -| background1.south);

\end{tikzpicture}
\caption{Key compilation passes and intermediate languages in \vericert{}~\cite{vericert} (top) and in \vericertfun{} (bottom)}
\label{fig:flow}
\end{figure}

\section{Implementation of \vericertfun}

\tikzset{
st/.style={draw=black, fill=white, rounded corners, align=left, font=\tt\footnotesize, minimum width=40mm},
ed/.style={draw, ->},
exted/.style={draw, thick, dotted, ->, rounded corners},
lab/.style={anchor=south west, inner sep=1pt, font=\tiny},
tit/.style={anchor=north west, font=\tt\footnotesize},
edlab/.style={auto, inner sep=2pt, align=left, font=\tt\footnotesize}
}

We now explain the implementation of \vericertfun{} using Figure~\ref{fig:example_C} as a worked example. The overall flow is shown in Figure~\ref{fig:flow} (bottom): we avoid inlining the function calls at the 3AC level (except in certain circumstances described below), instead maintaining one state machine per function. All the state machines run simultaneously, and function calls are implemented by sending messages between them. We then combine all of these state machines into a single Verilog module after renaming variables to avoid clashes.

\begin{figure}
\centering
\begin{minipage}[b]{0.45\linewidth}
\begin{lstlisting}
int add(int a, int b) 
{ 
  |\HL5{return a + b;}| 
}

int main() 
{
  |\HL1{int v = 0;}|
  |\HL2{v = add(v, 1);}|
  |\HL3{v = add(v, 2);}|
  |\HL4{return v;}|
}
\end{lstlisting}
\caption{A simple C program with each instruction colour-coded so it can be tracked through the compilation stages}
\label{fig:example_C}
\end{minipage}
\hfill
\begin{minipage}[b]{0.5\linewidth}
% \begin{lstlisting}
% add (x2, x1) {
%   |\HL5{2: x3 = x2 + x1 + 0 (int)}|
%   |\HL5{1: return x3}|
% }
% main () {
%   |\HL1{9: x3 = 0}|
%   |\HL2{8: x6 = 1}|
%   |\HL2{7: x1 = "add"(x3, x6)}|
%   |\HL2{6: x3 = x1}|
%   |\HL3{5: x5 = 2}|
%   |\HL3{4: x2 = "add"(x3, x5)}|
%   |\HL3{3: x3 = x2}|
%   |\HL4{2: x4 = x3}|
%   |\HL4{1: return x4}|
% }
% \end{lstlisting}
\begin{tikzpicture}[node distance=2mm]
\node[st, fill=highlight5]    (a2) at (0,-1) {x3 = x2 + x1 + 0(int)};
\node[st, fill=highlight5, below=of a2] (a1) {return x3};
\draw[ed] (a2) to (a1);
\node[tit] at ([xshift=-4mm, yshift=5mm]a2.north west) (labadd) {add(x2, x1) \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]a1.south west) (labadd2) {\}};
\begin{scope}[yshift=-24mm]
\node[st, fill=highlight1]    (m9) at (0,-1) {x3 = 0};
\node[st, fill=highlight2, below=of m9] (m8) {x6 = 1};
\node[st, fill=highlight2, below=of m8] (m7) {x1 = "add"(x3, x6)};
\node[st, fill=highlight2, below=of m7] (m6) {x3 = x1};
\node[st, fill=highlight3, below=of m6] (m5) {x5 = 2};
\node[st, fill=highlight3, below=of m5] (m4) {x2 = "add"(x3, x5)};
\node[st, fill=highlight3, below=of m4] (m3) {x3 = x2};
\node[st, fill=highlight4, below=of m3] (m2) {x4 = x3};
\node[st, fill=highlight4, below=of m2] (m1) {return x4};
\draw[ed] (m9) to (m8);
\draw[ed] (m8) to (m7);
\draw[ed] (m7) to (m6);
\draw[ed] (m6) to (m5);
\draw[ed] (m5) to (m4);
\draw[ed] (m4) to (m3);
\draw[ed] (m3) to (m2);
\draw[ed] (m2) to (m1);
\node[tit] at ([xshift=-4mm, yshift=5mm]m9.north west) (labmain) {main() \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]m1.south west) (labmain2) {\}};
\end{scope}
\foreach\i in {1,2,...,9}
  \node[lab, anchor=south east] at (m\i.west) {\i};
\foreach\i in {1,2}
  \node[lab, anchor=south east] at (a\i.west) {\i};
\end{tikzpicture}
\caption{3AC representation comprising two CFGs}
\label{fig:example_3AC}
\end{minipage}

\end{figure}

Figure~\ref{fig:example_3AC} shows the 3AC representation of that C program, as obtained by the CompCert frontend. We see two CFGs, one per function. The control flow in this example is straightforward, but in general, 3AC programs can contain unrestricted gotos. The nodes of the CFGs are numbered in reverse, as are the parameters of the \lstinline{add} function, following CompCert convention.
Figure~\ref{fig:example_HTL} depicts the result of converting those CFGs into two state machines. The conversion of 3AC instructions into Verilog instructions has been described already by \citet{vericert}; what is new here is the handling of function calls, so the following concentrates on that aspect. Note that ``$*{\rightarrow}\langle\mathit{node}\rangle$'' stands for edges from all nodes to $\langle\mathit{node}\rangle$.
The solid edges within the two state machines indicate the transfer of control between states, while the dashed edges between the state machines are more `fictional'. The ground truth is that both state machines run continuously, but it is convenient to think that only one machine does useful work at a time. So, the dashed edges indicate when the `active' machine changes.

\definecolor{background}{HTML}{eeeeee}
\begin{figure}
\centering
\begin{tikzpicture}[node distance=5mm]
\node[st, fill=highlight1] (m9) at (0,0)     {reg\_3 <= 0;};
\node[st, fill=highlight2, below=of m9] (m8) {reg\_6 <= 1;};
\node[st, fill=highlight2, below=of m8] (m7) {add\_0\_rst <= 1; \\ add\_0\_a <= reg\_3; \\ add\_0\_b <= reg\_6;};
\node[st, fill=highlight2, below=of m7] (m12) {add\_0\_rst <= 0; \\ reg\_1 <= add\_0\_ret;};
\node[st, fill=highlight2, below=of m12] (m6) {reg\_3 <= reg\_1;};
\node[st, fill=highlight3, below=of m6] (m5)  {reg\_5 <= 2;};
\node[st, fill=highlight3, below=of m5] (m4) {add\_1\_rst <= 1; \\ add\_1\_a <= reg\_3; \\ add\_1\_b <= reg\_5;};
\node[st, fill=highlight3, below=of m4] (m10) {add\_1\_rst <= 0; \\ reg\_2 <= add\_1\_ret;};
\node[st, fill=highlight3, below=of m10] (m3) {reg\_3 <= reg\_2;};
\node[st, fill=highlight4, below=of m3] (m2)  {reg\_4 <= reg\_3;};
\node[st, fill=highlight4, below=of m2] (m1)  {fin = 1; \\ ret = reg\_4;};
\node[st, fill=highlight4, below=of m1] (m11) {fin <= 0;};

\node[st, fill=highlight5, above=38mm of m9] (a2) {reg\_7 <= x2 + x1 + 0;};
\node[st, fill=highlight5, below=of a2] (a1) {fin = 1; \\ ret = reg\_7;};
\node[st, fill=highlight5, below=of a1] (a3) {fin <= 0;};

\draw[ed] (m9) to node[edlab] {!rst} (m8);
\draw[ed] (m8) to node[edlab] {!rst} (m7);
\draw[ed] (m7) to node[edlab] {!rst} (m12);
\draw[ed] (m12) to node[edlab] {!rst \&\& add\_0\_fin} (m6);
\draw[ed] (m6) to node[edlab] {!rst} (m5);
\draw[ed] (m5) to node[edlab] {!rst} (m4);
\draw[ed] (m4) to node[edlab] {!rst} (m10);
\draw[ed] (m10) to node[edlab] {!rst \&\& add\_1\_fin} (m3);
\draw[ed] (m3) to node[edlab] {!rst} (m2);
\draw[ed] (m2) to node[edlab] {!rst} (m1);
\draw[ed] (m1) to node[edlab] {!rst} (m11);
\draw[ed] (m12) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add\_0\_fin} (m12);
\draw[ed] (m10) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add\_1\_fin} (m10);
\draw[ed] (m11) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (m11);

\node[inner sep=1pt] (ast1) at ([xshift=5mm, yshift=-5mm]m9.south east) {*};
\draw[ed] (ast1.north) to[bend right] node[edlab, swap] {rst} (m9.east);

\draw[ed] (a2) to node[edlab] {!rst} (a1);
\draw[ed] (a1) to node[edlab] {!rst} (a3);
\draw[ed] (a3) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (a3);

\node[inner sep=1pt] (ast2) at ([xshift=5mm, yshift=-5mm]a2.south east) {*};
\draw[ed] (ast2.north) to[bend right] node[edlab, swap] {rst} (a2.east);

\coordinate (nw1) at ([xshift=-1mm, yshift=6mm]m9.north west); 
\coordinate (se1) at ([xshift=44mm, yshift=-4mm]m11.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw1)(se1)] (main) {};
\end{pgfonlayer}
\node[tit, above=2mm of m9] (labmain) {\bfseries main()};
\coordinate (nw2) at ([xshift=-1mm, yshift=6mm]a2.north west); 
\coordinate (se2) at ([xshift=16mm, yshift=-4mm]a3.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw2)(se2)] (add) {};
\end{pgfonlayer}
\node[tit, above=2mm of a2] (labadd) {\bfseries add(x2,x1)};

\foreach\i in {1,2,...,12}
  \node[lab] at (m\i.north west) {\i};
\foreach\i in {1,2,3}
  \node[lab] at (a\i.north west) {\i};

\draw[exted] (m7.west) -- ++(-5mm,0) |- ([yshift=-1mm]a2.west);
\draw[exted] (m4.west) -- ++(-7mm,0) |- ([yshift=1mm]a2.west);
\draw[exted] ([yshift=-1mm]a3.east) -- ++(21mm,0) |- (m6.east);
\draw[exted] ([yshift=1mm]a3.east) -- ++(23mm,0) |- (m3.east);

\node [align=left, right=3mm of m1, font=\tt\footnotesize] {
{\bfseries extern\_ctrl:} \\ 
add\_0\_rst $\mapsto$ (add, \textrm{reset}) \\
add\_0\_ret $\mapsto$ (add, \textrm{return}) \\
add\_0\_fin $\mapsto$ (add, \textrm{finish}) \\
add\_0\_a $\mapsto$ (add, \textrm{param 0}) \\
add\_0\_b $\mapsto$ (add, \textrm{param 1}) \\
add\_1\_rst $\mapsto$ (add, \textrm{reset}) \\
add\_1\_ret $\mapsto$ (add, \textrm{return}) \\
add\_1\_fin $\mapsto$ (add, \textrm{finish}) \\
add\_1\_a $\mapsto$ (add, \textrm{param 0}) \\
add\_1\_b $\mapsto$ (add, \textrm{param 1})
}; 

\end{tikzpicture}
\caption{HTL representation comprising two state machines}
\label{fig:example_HTL}
\medskip
\begin{tikztimingtable}[timing/d/background/.style={fill=white}, timing/xunit=1em, timing/yunit=0.6em]
{\tt\footnotesize clk}\!\!\! & 0.5L 38{0.5C} \\
{\tt\footnotesize rst}\!\!\! & 0.5H 19L \\
{\tt\footnotesize fin}\!\!\! & 0.5X 17L H L \\
{\tt\footnotesize state}\!\!\! & 0.5X D{9} D{8} D{7} 4D{12} D{6} D{5} D{4} 4D{10} D{3} D{2} D{1} 2D{11} \\
{\tt\footnotesize rst}\!\!\! & 3.5X H 6L H 8L \\
{\tt\footnotesize fin}\!\!\! & 4.5X 2L H 6L H 5L \\
{\tt\footnotesize state}\!\!\! & 4.5X D{2} D{1} 5D{3} D{2} D{1} 6D{3} \\
\extracode
\foreach\i in {1,2,...,19}
\node[help lines, anchor=west, inner sep=0] at (\i-0.5,2.25) {\tiny \i};
\begin{pgfonlayer}{background}
  \vertlines[help lines]{0.5,1.5,...,18.5}
\end{pgfonlayer}
\draw[decoration={brace,mirror,raise=5pt},decorate] (-2.6,-1) to node[left=6pt] {\tt\footnotesize main} (-2.6,-6.5);
\draw[decoration={brace,mirror,raise=5pt},decorate] (-2.6,-7) to node[left=6pt] {\tt\footnotesize
add} (-2.6,-12.5);
\end{tikztimingtable}
\caption{Timing diagram for the state machines in Figure~\ref{fig:example_HTL}}
\label{fig:timingdiagram}

\end{figure}

In more detail: Execution begins in state 9 of the \lstinline{main} machine, and proceeds through successive states until it reaches state 7, in which the \lstinline{add} machine's \lstinline{rst} signal is set. This causes the \lstinline{add} machine to advance to state 2. When \lstinline{main} advances to state 12, that \lstinline{rst} signal is unset; \lstinline{add} then begins its computation while \lstinline{main} spins in state 12. When \lstinline{add} has finished (state 1), it sets its \lstinline{fin} signal, which allows \lstinline{main} to leave state 12. Finally, \lstinline{add} unsets its \lstinline{fin} and waits in state 3 for the next call.
The same event sequence can also be understood using the timing diagram in Figure~\ref{fig:timingdiagram}, in which red lines indicate unspecified values. %We see that each call of \lstinline{add} begins with a pulse on \lstinline{add.rst} (fifth waveform) and ends with a pulse on \lstinline{add.fin} (sixth waveform).
We see that calls are initiated by triggering the \lstinline{rst} signal of the called module and that a function returns by setting its own \lstinline{fin} register. %There are no `call' or `return' instructions in HTL.

One technical challenge we encountered in the implementation of \vericertfun{} has to do with the fact that the called and callee state machines need to modify each other's variables. This is problematic because each function is translated independently, and hence the register names used in the other state machines may not be available. We work around this by introducing an additional component to our state machines: an `\lstinline{extern_ctrl}' mapping from local register names to pairs of module identifiers and roles in those modules. For instance, the first entry in \lstinline{extern_ctrl} in Figure~\ref{fig:example_HTL} tells us that the \lstinline{add_0_rst} register used by \lstinline{main} should resolve to whichever register plays the role of `reset' in \lstinline{add}. Once all the state machines have been generated, we erase \lstinline{extern_ctrl}. We do this in two steps. First, we rename all registers to be globally unique, which avoids unintended conflicts between registers in different modules (register names can only be assumed unique within their own module). We then rename all
registers mentioned in \lstinline{extern_ctrl} to the name of the actual register they target. %\JW{I'm still struggling to understand externctrl properly. I don't see why we can't resolve externctrl straight away, when each HTL state machine is first generated. Why do we have to wait until they are linked together?}\YH{I think that there are multiple reasons: The first is that it's very difficult to access the function information from other functions, because this requires resolving their ID in the global AST state and then inspecting it.  If one ends up using this information, it can be very tricky to prove the induction, because one isn't local to the current function anymore.  All the proof structure in CompCert assumes one only deals within the current function.  This also complicates the definition of the semantics.  Secondly, I think it just helps as a bridge, because functions have their completely separate register files in 3AC.  It's therefore quite difficult to map registers from one register file to the other unless one has something that describes which ones to map over.}

A second technical challenge we encountered in the implementation of \vericertfun{} has to do with an assumption made in \vericert{}'s correctness proof: that all pointers are stored as offsets from the main function's stack frame. This assumption was reasonable for \vericert{} because after full inlining, the main function is the only function. This assumption is baked into several of \vericert{}'s most complicated lemmas, including the correctness proof for load and store instructions, and so we have not sought to lift it in our current prototype of \vericertfun{}. Instead, we have made the compromise of only \emph{partially} eliminating the inlining pass. That is: \vericertfun{} inlines all functions that contain \lstinline{load}, \lstinline{store} or \lstinline{call} instructions. Thus, the benefits of resource sharing are currently only enjoyed by functions that do not contain load or store instructions and do not call other functions. 

\section{Proving \vericertfun{} correct}

The CompCert correctness theorem~\cite{compcert} states that every behaviour of the compiled program is also a behaviour of the source program. \citet{vericert} adapted this theorem for HLS by replacing `compiled program' with `generated Verilog design'. In both cases, a formal semantics is required for the source and target languages. \vericertfun{} targets the same fragment of the Verilog language as \citeauthor{vericert} already mechanised in Coq, so no changes are required there. 

Where changes \emph{are} required is in the semantics of the intermediate language HTL, which sits between CompCert's 3AC and the final Verilog.
When \citeauthor{vericert} designed HTL, they did not include a semantics for function calls because they assumed all function calls would already have been inlined. We have extended HTL so that its semantics is additionally parameterised by an environment that maps function names to state machines. Our semantics for function calls looks up the named function in this environment, activates the corresponding state machine, and pushes a new stack frame, and our semantics for return statements pops the current stack frame and reactivates the caller's state machine. % \MP{One aspect of this I didn't mention: This introduces non-determinism. When you set the reset of a module, the next state can be either a call or a normal executing state, ignoring the reset. This non-determinism does not affect the RTL-to-HTL proof, but could affect the HTL-to-Verilog one.}

At the point of writing, the correctness of \vericertfun{} from C to HTL has been mostly proven: basic instructions and function calls are proven correct, but proofs of load and store instructions still lack some key invariants. The pass that renames
variables in HTL is yet to be proven, as is the Verilog-generation pass. 
To give a rough idea of the scale and complexity of our work: the implementation of \vericertfun{} involved adding or changing about 700 lines of Coq code in \vericert{} and took the first author 4 months. The correctness proof has so far required about 2300 lines of additions or changes to the Coq code and 8 person-months of work.

\section{Performance evaluation}

\JWchanged{We now compare the quality of the hardware generated by \vericertfun{} against that of \vericert{}. We use the open-source (but unverified) \bambu{} tool~\cite{pilato13_bambu, ferrandi2021_bambu} as a baseline. We run \bambu{} (version 0.9.6) in the {\tt BAMBU\_AREA} configuration, which optimises for area ahead of latency, but do not provide any additional pragmas to control the HLS process.}
Following \citet{vericert}, we use the PolyBench/C benchmark suite~\cite{polybench} with division and modulo replaced with iterative software implementations because \vericert{} does not handle them efficiently. That is, \lstinline{a/b} and \lstinline{c%d}
are textually replaced with \lstinline{div(a,b)} and \lstinline{mod(c,d)}. These \lstinline{div} and \lstinline{mod} functions are the only function calls that are not inlined.
We used the Icarus Verilog simulator to determine the cycle counts of the generated designs. We used Xilinx Vivado 2017.1, targeting a Xilinx 7-series FPGA (XC7K70T) to determine area usage \JWchanged{(measured in slices)} and fmax.

%Figure~\ref{fig:results} summarises our results. The x-axis shows the impact of resource sharing on
%the speed of the hardware (as calculated by the cycle count divided by fmax); we see that all the
%data points lie very close to 1, which suggests no significant impact. On average the cycle count
%increases by 0.7\%; this modest increase is in line with expectations because our translation
%introduces an extra state per function call. The impact on fmax is similarly minimal, ranging
%between a 1.5\% increase and a 3.1\% decrease (0.2\% decrease on average).

Figure~\ref{fig:results} compares the hardware generated by \vericertfun{} with that of \vericert{}, using Bambu as a baseline. The top graph compares the area usage. We observe a substantial reduction in area usage across the benchmark programs, with \vericert{} consistently using more area than Bambu (1.5$\times$ on average) and \vericertfun{} requiring less area than \vericert{} (0.8$\times$ on average), but still more than \bambu{} (1.2$\times$ on average).  As expected, the benchmarks with several function calls (mvt, 2mm, 3mm, ludcmp) enjoy the biggest area savings, while those with only one function call (heat-3d, nussinov) require slightly more area because of the extra circuitry required.
The bottom graph compares the execution time. We observe that \vericertfun{} generates hardware that is slightly (about 4\%) slower than \vericert{}'s, which can be attributed to the latency overhead of performing a function call. Hardware from \vericert{} and \vericertfun{} is significantly slower than \bambu's, which can be attributed to \vericert{} employing far fewer optimisations than the unverified \bambu{} tool.

\pgfplotstableread[col sep=comma]{data/time-ratio.csv}{\divtimingtable}
\pgfplotstableread[col sep=comma]{data/slice-ratio.csv}{\divslicetable}
\definecolor{vericertcol}{HTML}{1b9e77}
\definecolor{legupnooptcol}{HTML}{d95f02}
\definecolor{legupnooptnochaincol}{HTML}{8DA0CB}
\newcommand\backgroundbar[2][5]{\draw[draw=none, fill=black!#1] (axis cs:#2*2+0.5,0.1) rectangle
  (axis cs:1+#2*2+0.5,300);}

\begin{figure}
  \begin{tikzpicture}[scale=0.8]
    \begin{groupplot}[
      group style={
        group name=my plots,
        group size=1 by 2,
        xlabels at=edge bottom,
        xticklabels at=edge bottom,
        vertical sep=5pt,
      },
      ymode=log,
      ybar=0.4pt,
      width=0.64\textwidth,
      height=0.25\textwidth,
      /pgf/bar width=2pt,
      legend pos=north east,
      log ticks with fixed point,
      xticklabels from table={\divtimingtable}{benchmark},
      legend style={nodes={scale=0.7, transform shape}},
      x tick label style={rotate=90,anchor=east,font=\footnotesize},
      legend columns=-1,
      xtick=data,
      enlarge x limits={abs=0.5},
      ylabel style={font=\footnotesize},
      xtick style={draw=none},
      ]
      
      \nextgroupplot[ymin=0.8,ymax=5,ylabel={Area vs. \bambu{}}, ytick={1,2,4}]
      \pgfplotsinvokeforeach{0,...,12}{%
        \backgroundbar{#1}}
      \backgroundbar[10]{13}
      \addplot+[vericertcol] table [x expr=\coordindex,y=vericert,col sep=comma] from \divslicetable;
      \addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divslicetable;
      \draw (axis cs:-1,1) -- (axis cs:28,1);
      % JW: redraw axis border which has been partially covered by the grey bars
      \draw (axis cs:-0.5,0.8) rectangle (axis cs:27.5,5);
      \legend{\vericert{},\vericertfun{}};

      \nextgroupplot[ymin=1,ymax=10,ylabel={Execution time vs. \bambu{}}, ytick={1,2,4,8}]
      \pgfplotsinvokeforeach{0,...,12}{%
        \backgroundbar{#1}}
      \backgroundbar[10]{13}
      \addplot+[vericertcol] table [x expr=\coordindex,y=vericert,col sep=comma] from \divtimingtable;
      \addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divtimingtable;
      \draw (axis cs:-1,1) -- (axis cs:28,1);
      % JW: redraw axis border which has been partially covered by the grey bars
      \draw (axis cs:-0.5,1) rectangle (axis cs:27.5,10);
    \end{groupplot}
  \end{tikzpicture}
  \caption{\vericertfun{} vs \vericertfun{}, with \bambu{} as a baseline.}
\label{fig:results}
\end{figure}

\section{Future work}

Our immediate priority is to complete \vericertfun's correctness proof. In the medium term, we intend to improve our implementation of resource sharing by dropping the requirement to inline functions that access pointers; we anticipate that this will lead to further area savings \JWchanged{and also allow \vericertfun{} to be evaluated on benchmarks with more interesting call graphs}. We would also like \vericertfun{} to generate designs with one Verilog module per C function, as this is more idiomatic than cramming all the state machines into a single module; we did not do this yet because it requires extending the Verilog semantics to handle multiple modules. \JWchanged{It would also be interesting to implement \emph{selective} inlining of functions~\cite{huang+15}, either guided by heuristics or by programmer-supplied pragmas. It is worth noting that having proven inlining correct in general, the amount of inlining performed can be adjusted without affecting the correctness proof.} In the longer term, we would like to combine resource sharing with operation scheduling, i.e. resource-constrained scheduling~\cite{sdc}.


\bibliographystyle{ACM-Reference-Format}
\bibliography{references}

\end{document}