verified_resource_sharing.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477

\documentclass[hyphens,prologue,x11names,rgb,sigconf]{acmart}

\usepackage[textsize=small,shadow]{todonotes}%
\usepackage{soul}
\usepackage{subcaption}
\usepackage{tikz-timing}
\usetikzlibrary{fit}
\usetikzlibrary{decorations.pathreplacing}

\usepackage{listings}
\lstset{
basicstyle=\tt,
columns=fixed, basewidth=0.5em, %tip from https://tex.stackexchange.com/a/179072
escapeinside=||,
}
\setlength{\fboxsep}{1.5pt} % to reduce \colorbox padding when highlighting

%\usepackage{minted}
%\setminted{
%fontsize=\small,
%escapeinside=||,
%}
%\usemintedstyle{manni}

\definecolor{highlight0}{HTML}{ffffff}
\definecolor{highlight1}{HTML}{b4d6d0} % 8dd3c7
\definecolor{highlight2}{HTML}{fcfcdc} % ffffb3
\definecolor{highlight3}{HTML}{c9c7d6} % bebada
\definecolor{highlight4}{HTML}{fcc1bb} % fb8072
\definecolor{highlight5}{HTML}{a9c2d4} % 80b1d3

\newcommand\HL[2]{\colorbox{highlight#1}{\vphantom{A}\smash{#2}}}

% Leave review comments using
% \jwcomment{...} (for John) or \yhcomment{...} (for Yann)
% Using either directly leaves a margin note, using it as,
% e.g. \jwcomment[inline]{...} leaves an inline comment
\newcommand{\jwcomment}[2][]{\todo[author={John}, color=ACMLightBlue, #1]{#2}}
\newcommand{\yhcomment}[2][]{\todo[author={Yann}, color=ACMGreen, #1]{#2}}
\newcommand{\mpcomment}[2][]{\todo[author={Michalis}, color=ACMMagenta, #1]{#2}}


\newif\ifCOMMENTS
\COMMENTStrue
\newcommand{\Comment}[3]{\ifCOMMENTS\textcolor{#1}{{\bf [\![#2:} #3{\bf ]\!]}}\fi}
\newcommand\JW[2][]{\st{#1}\Comment{red!75!black}{JW}{#2}}
\newcommand\YH[2][]{\st{#1}\Comment{blue!50!green}{YH}{#2}}
\newcommand\MP[2][]{\st{#1}\Comment{blue!50!magenta}{MP}{#2}}


\newcommand{\citeintext}[1] {\citeauthor{#1} in \citeyear{#1}~\cite{#1}}
\newcommand{\citeshort}[1] {\citeauthor{#1} \citeyear{#1}~\cite{#1}}

\begin{document}

\title{Resource Sharing for Verified High-Level Synthesis}

\author{Michail Pardalos}
\email{michail.pardalos17@imperial.ac.uk}
\affiliation{\institution{Imperial College London}}

\author{Yann Herklotz}
\email{yann.herklotz15@imperial.ac.uk}
\affiliation{\institution{Imperial College London}}

\author{John Wickerson}
\email{j.wickerson@imperial.ac.uk}
\affiliation{\institution{Imperial College London}}

\begin{abstract}
  High-level synthesis (HLS) is playing an ever-increasing role in hardware design, but concerns have been raised about its reliability. Seeking to address these concerns, Herklotz et al. have recently developed an HLS compiler, called Vericert, that has been mechanically proven (using the Coq proof assistant) to output Verilog designs that are behaviourally equivalent to the input C program. Unfortunately, Vericert's output cannot compete performance-wise with that of established HLS tools such as LegUp. A major reason for this is Vericert's complete lack of support for resource sharing. 
  
  In this paper, we present Vericert-Fun: Vericert extended with function-level resource sharing. Where original Vericert creates one block of hardware per function \emph{call} \MP{Is this an over-simplification? We don't really generate a "block of hardware" for function calls}, Vericert-Fun creates one block of hardware per function \emph{definition}. The enabling innovation is to extend Vericert's intermediate language HTL with the ability to represent \emph{multiple} state machines, and then to add support for function calls that transfer control between these state machines. We are working to extend Vericert's formal correctness proof to cover the translation from C source into this extended HTL language, and thence on to Verilog.
  
We have benchmarked Vericert-Fun's performance on the PolyBench/C suite, and our results show the generated hardware having a resource usage of 61\% of Vericert's on average and 46\% in the best case, for only a 3\% average decrease in max frequency and 1\% average increase in cycle count.
\end{abstract}

\maketitle

\section{Introduction}
\label{sec:introduction}

The drive for faster, more energy-efficient computation has led to a surge in demand for custom hardware accelerators. These devices are traditionally designed using hardware description languages such as Verilog or VHDL, but the complexities of designing in such a language, as well as the abundance of engineers trained in software rather than hardware development, has meant that \emph{high-level synthesis} (HLS) tools have become an enticing option. 

These tools are useful, but doubts have been raised about their reliability. For instance, \citet{Herklotz2021_empiricalstudy} found numerous miscompilation bugs in commercial HLS tools including Xilinx Vivado HLS~\cite{xilinx_vitis}, Intel i++~\cite{intel_hls}, and LegUp~\cite{legup_CanisAndrew2011}. This unreliability can be a significant hindrance in the development process, especially given the long iteration times of hardware design compared to software. It also undermines the usefulness of HLS in safety- or security-critical settings. It is therefore essential to ensure that HLS tools are as reliable as possible.

Vericert~\cite{Herklotz2020} is a new HLS tool that aims to address this issue. Its correctness has been checked to the highest possible standard: machine-checked proof. It provides a proof, checked using the Coq proof assistant~\cite{coq}, that every step of its translation from C to Verilog preserves the semantics of (i.e.\ behaves the same way as) its input program. This proof means that we can always trust any Verilog design produced by Vericert to behave the same way as the C program given as input. %It is based on the CompCert~\cite{compcert_Leroy2009} verified C compiler.

Clearly, however, it is not enough for an HLS tool simply to be \emph{correct}. The generated hardware must also meet several other desiderata, including high throughput, low latency, and good \emph{area efficiency}, which is the topic of this paper. A common optimisation employed by HLS tools to improve area efficiency is \emph{resource sharing}; that is, re-using hardware for more than one purpose. Accordingly, our work adds resource sharing to Vericert. Keeping with the aims of the Vericert project, work is ongoing to extend the correctness proof.

\section{Background}

\paragraph{The Coq proof assistant} Vericert is implemented using the Coq proof assistant~\cite{coq}. This means that it consists of a collection of functions that define the compilation process, together with the proof of a theorem stating that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, and then automatically translates the function definitions into OCaml code that can be compiled and executed. 

Engineering a software system within a proof assistant like Coq is widely held to be the gold standard for correctness. Recent years have shown that it is feasible to design substantial pieces of software in this way, such as database management systems~\cite{malecha+10}, web servers~\cite{chlipala15}, and operating system kernels~\cite{gu+16}. Coq has also been successfully deployed in the hardware design process, both in academia~\cite{braibant+13, bourgeat+20} and in industry~\cite{silveroak}. It has even been applied specifically to the HLS process: Faissole et al.~\cite{faissole+19} used it to verify that HLS optimisations respect dependencies present in the source code.

\paragraph{The CompCert verified C compiler} Among the most celebrated applications of Coq is the CompCert project~\cite{compcert}. CompCert is a lightly optimising C compiler, with backend support for the Arm, x86, PowerPC, and Kalray VLIW architectures~\cite{six+20}, that has been implemented and proven correct using Coq. CompCert accepts most of the C99 language, and generally produces code of comparable performance to that produced by GCC at optimisation level \texttt{-O1}. It transforms its input through a series of ten intermediate languages before generating the final output. This design ensures that each individual pass remains relatively simple and hence feasible to prove correct. The correctness proof of the entire compiler is formed by composing the correctness proofs of each of its passes.

\paragraph{The Vericert verified HLS tool}

Introduced by \citet{Herklotz2020}, Vericert is a verified C-to-Verilog HLS tool. It is an extension of CompCert, essentially augmenting the existing verified C compiler with a new hardware-oriented intermediate language (called HTL) and a Verilog backend. In its current form, Vericert performs no significant optimisations beyond those it inherits from CompCert's frontend. This results in performance generally about one order of magnitude slower than that achieved by comparable, unverified HLS tools such as LegUp~\cite{legup_CanisAndrew2011}.

Vericert branches off from CompCert at the intermediate language called \emph{register-transfer language} (RTL). Since that abbreviation is usually used in the hardware community for `register-transfer level', we shall henceforth avoid possible confusion by referring to the CompCert intermediate language as `3AC' (for `three-address code').

In the 3AC language, each function in the program is represented as a numbered list of instructions with gotos -- essentially, a control-flow graph (CFG). The essence of Vericert's compilation strategy is to treat this CFG as a state machine, with each instruction in the CFG becoming a state, and each edge in the CFG becoming a transition. Moreover, program variables that do not have their address taken are mapped to hardware registers; other variables (including arrays and structs) are allocated in a block of RAM that represents the stack. More precisely, Vericert builds a \emph{finite state machine with datapath} (FSMD). This comprises two maps, both of which take the current state as their input: the \emph{control logic} map for determining the next state, and a \emph{datapath} map for updating the RAM and registers. These state machines are captured in Vericert's new intermediate language, HTL. When Vericert compiles from HTL to the final Verilog output, these maps are converted from proper `mathematical' functions into syntactic Verilog case-statements, and each is placed inside an always-block.

\begin{figure}

\definecolor{colorflow}{HTML}{8EB85B}

\begin{tikzpicture}[yscale=-1]

\tikzset{flowlines/.style={draw, colorflow, ultra thick}}

\node(fun1) at (1.5,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (1.5,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node(CFG) at (0.75,4) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD) at (0.75,5) {state machine};
\node[anchor=west](HTL) at (-2,5) {\bf HTL:};
\node(module) at (0.75,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto, pos=0.2] {inlining} (CFG);
\draw[->] (CFG2) to (CFG);
\draw[->] (CFG) to node [auto] (smg) {state machine generation} (FSMD);
\draw[->] (FSMD) to node [auto] {Verilog generation} (module);

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(smg)] (background1) {};
\end{pgfonlayer}

\begin{scope}[yshift=55mm]
\node(fun1) at (2,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (2,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD1) at (2,4) {state machine};
\node(FSMD2) at (0,4) {state machine};
\node(FSMD3) at (2,5) {state machine};
\node(FSMD4) at (0,5) {state machine};
\node[anchor=west](HTL) at (-2,4) {\bf HTL:};
\node(module) at (1,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto] (smg) {state machine generation} (FSMD1);
\draw[->] (CFG2) to (FSMD2);
\draw[->] (FSMD1) to node [auto] {renaming} (FSMD3);
\draw[->] (FSMD2) to (FSMD4);
\draw[->] (FSMD3) to node [auto, pos=0.2] {Verilog generation} (module);
\draw[->] (FSMD4) to (module);

\end{scope}

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(smg)] (background2) {};
\end{pgfonlayer}

\draw[flowlines, -latex] (background1.south) to node [auto] {\bf this work} (background2.north -| background1.south);

\end{tikzpicture}
\caption{Key compilation passes and intermediate languages in Vericert~\cite{Herklotz2020} (top) and Vericert-Fun (bottom)}
\label{fig:flow}
\end{figure}

\section{Implementation of Vericert-Fun}

\tikzset{
st/.style={draw=black, fill=white, rounded corners, align=left, font=\tt, minimum width=40mm},
ed/.style={draw, ->},
exted/.style={draw, thick, dotted, ->, rounded corners},
lab/.style={anchor=south west, inner sep=1pt, font=\tiny},
tit/.style={anchor=north west, font=\tt},
edlab/.style={auto, inner sep=2pt, align=left, font=\tt}
}

Figure~\ref{fig:example_C} shows a simple C file that we shall employ as a worked example. Each instruction is colour-coded so it can be tracked through the compilation stages.

\begin{figure}
\centering
\begin{lstlisting}
int add(int a, int b) { 
  |\HL5{return a + b;}| 
}

int main() {
  |\HL1{int v = 0;}|
  |\HL2{v = add(v, 1);}|
  |\HL3{v = add(v, 2);}|
  |\HL4{return v;}|
}
\end{lstlisting}
\caption{Running example: C program}
\label{fig:example_C}
\end{figure}

Figure~\ref{fig:example_3AC} shows the 3AC representation of that C program, as obtained by the CompCert frontend. It takes the form of two CFGs, one per function. The control flow in this example is straightforward, but in general, 3AC programs can contain unrestricted gotos. The nodes of the CFGs are numbered in reverse order, as are the parameters of the \lstinline{add} function; both are conventions inherited from CompCert.

\begin{figure}
% \begin{lstlisting}
% add (x2, x1) {
%   |\HL5{2: x3 = x2 + x1 + 0 (int)}|
%   |\HL5{1: return x3}|
% }
% main () {
%   |\HL1{9: x3 = 0}|
%   |\HL2{8: x6 = 1}|
%   |\HL2{7: x1 = "add"(x3, x6)}|
%   |\HL2{6: x3 = x1}|
%   |\HL3{5: x5 = 2}|
%   |\HL3{4: x2 = "add"(x3, x5)}|
%   |\HL3{3: x3 = x2}|
%   |\HL4{2: x4 = x3}|
%   |\HL4{1: return x4}|
% }
% \end{lstlisting}
\begin{tikzpicture}[node distance=2mm]
\node[st, fill=highlight5]    (a2) at (0,-1) {x3 = x2 + x1 + 0 (int)};
\node[st, fill=highlight5, below=of a2] (a1) {return x3};
\draw[ed] (a2) to (a1);
\node[tit] at ([xshift=-4mm, yshift=5mm]a2.north west) (labadd) {add(x2, x1) \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]a1.south west) (labadd2) {\}};
\begin{scope}[yshift=-24mm]
\node[st, fill=highlight1]    (m9) at (0,-1) {x3 = 0};
\node[st, fill=highlight2, below=of m9] (m8) {x6 = 1};
\node[st, fill=highlight2, below=of m8] (m7) {x1 = "add"(x3, x6)};
\node[st, fill=highlight2, below=of m7] (m6) {x3 = x1};
\node[st, fill=highlight3, below=of m6] (m5) {x5 = 2};
\node[st, fill=highlight3, below=of m5] (m4) {x2 = "add"(x3, x5)};
\node[st, fill=highlight3, below=of m4] (m3) {x3 = x2};
\node[st, fill=highlight4, below=of m3] (m2) {x4 = x3};
\node[st, fill=highlight4, below=of m2] (m1) {return x4};
\draw[ed] (m9) to (m8);
\draw[ed] (m8) to (m7);
\draw[ed] (m7) to (m6);
\draw[ed] (m6) to (m5);
\draw[ed] (m5) to (m4);
\draw[ed] (m4) to (m3);
\draw[ed] (m3) to (m2);
\draw[ed] (m2) to (m1);
\node[tit] at ([xshift=-4mm, yshift=5mm]m9.north west) (labmain) {main() \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]m1.south west) (labmain2) {\}};
\end{scope}
\foreach\i in {1,2,...,9}
  \node[lab, anchor=south east] at (m\i.west) {\i};
\foreach\i in {1,2}
  \node[lab, anchor=south east] at (a\i.west) {\i};
\end{tikzpicture}
\caption{Running example: 3AC representation comprising two CFGs}
\label{fig:example_3AC}
\end{figure}

Figure~\ref{fig:example_HTL} depicts the result of converting those CFGs into two state machines. The conversion of 3AC instructions into Verilog instructions has been described already by \citet{Herklotz2020}; what is new here is the handling of function calls, so the following description concentrates on that aspect. Before we proceed, there are two conventions worth noting: first, ``$*{\rightarrow}\langle\mathit{node}\rangle$'' should be interpreted as an abbreviation for edges from all nodes to $\langle\mathit{node}\rangle$, and second, we allow the \lstinline{main} machine to refer to the \lstinline{add} machine's register $\langle\mathit{reg}\rangle$ by writing ``\lstinline{add.}$\langle\mathit{reg}\rangle$''.

The solid edges within the two state machines indicate the transfer of control between states, while the dashed edges between the state machines are more `fictional'. The ground truth is that both state machines run continuously, but it is convenient to think of only one of the machines as doing useful work at any point in time. So, the dashed edges indicate the points at which the `active' machine changes.

\definecolor{background}{HTML}{eeeeee}
\begin{figure}
\centering
\begin{tikzpicture}[node distance=5mm]
\node[st, fill=highlight1] (m9) at (0,0)     {reg\_3 <= 0;};
\node[st, fill=highlight2, below=of m9] (m8) {reg\_6 <= 1;};
\node[st, fill=highlight2, below=of m8] (m7) {add.rst <= 1; \\ add.param\_0 <= reg\_3; \\ add.param\_1 <= reg\_6;};
\node[st, fill=highlight2, below=of m7] (m12) {add.rst <= 0; \\ reg\_1 <= add.ret;};
\node[st, fill=highlight2, below=of m12] (m6) {reg\_3 <= reg\_1;};
\node[st, fill=highlight3, below=of m6] (m5)  {reg\_5 <= 2;};
\node[st, fill=highlight3, below=of m5] (m4) {add.rst <= 1; \\ add.param\_0 <= reg\_3; \\ add.param\_1 <= reg\_5;};
\node[st, fill=highlight3, below=of m4] (m10) {add.rst <= 0; \\ reg\_2 <= add.ret;};
\node[st, fill=highlight3, below=of m10] (m3) {reg\_3 <= reg\_2;};
\node[st, fill=highlight4, below=of m3] (m2)  {reg\_4 <= reg\_3;};
\node[st, fill=highlight4, below=of m2] (m1)  {fin = 1; \\ ret = reg\_4;};
\node[st, fill=highlight4, below=of m1] (m11) {fin <= 0;};

\node[st, fill=highlight5, above=40mm of m9] (a2) {reg\_7 <= \{\{x2 + x1\} + 0\};};
\node[st, fill=highlight5, below=of a2] (a1) {fin = 1; \\ ret = reg\_7;};
\node[st, fill=highlight5, below=of a1] (a3) {fin <= 0;};

\draw[ed] (m9) to node[edlab] {!rst} (m8);
\draw[ed] (m8) to node[edlab] {!rst} (m7);
\draw[ed] (m7) to node[edlab] {!rst} (m12);
\draw[ed] (m12) to node[edlab] {!rst \&\& add.fin} (m6);
\draw[ed] (m6) to node[edlab] {!rst} (m5);
\draw[ed] (m5) to node[edlab] {!rst} (m4);
\draw[ed] (m4) to node[edlab] {!rst} (m10);
\draw[ed] (m10) to node[edlab] {!rst \&\& add.fin} (m3);
\draw[ed] (m3) to node[edlab] {!rst} (m2);
\draw[ed] (m2) to node[edlab] {!rst} (m1);
\draw[ed] (m1) to node[edlab] {!rst} (m11);
\draw[ed] (m12) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add.fin} (m12);
\draw[ed] (m10) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add.fin} (m10);
\draw[ed] (m11) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (m11);

\node[inner sep=1pt] (ast1) at ([xshift=5mm, yshift=-5mm]m9.south east) {*};
\draw[ed] (ast1.north) to[bend right] node[edlab, swap] {rst} (m9.east);

\draw[ed] (a2) to node[edlab] {!rst} (a1);
\draw[ed] (a1) to node[edlab] {!rst} (a3);
\draw[ed] (a3) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (a3);

\node[inner sep=1pt] (ast2) at ([xshift=5mm, yshift=-5mm]a2.south east) {*};
\draw[ed] (ast2.north) to[bend right] node[edlab, swap] {rst} (a2.east);

\coordinate (nw1) at ([xshift=-3mm, yshift=6mm]m9.north west); 
\coordinate (se1) at ([xshift=16mm, yshift=-4mm]m11.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw1)(se1)] (main) {};
\end{pgfonlayer}
\node[tit, above=2mm of m9] (labmain) {\bfseries main()};
\coordinate (nw2) at ([xshift=-3mm, yshift=6mm]a2.north west); 
\coordinate (se2) at ([xshift=16mm, yshift=-4mm]a3.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw2)(se2)] (add) {};
\end{pgfonlayer}
\node[tit, above=2mm of a2] (labadd) {\bfseries add(x2,x1)};

\foreach\i in {1,2,...,12}
  \node[lab] at (m\i.north west) {\i};
\foreach\i in {1,2,3}
  \node[lab] at (a\i.north west) {\i};

\draw[exted] (m7.west) -- ++(-7mm,0) |- ([yshift=-1mm]a2.west);
\draw[exted] (m4.west) -- ++(-9mm,0) |- ([yshift=1mm]a2.west);
\draw[exted] ([yshift=-1mm]a3.east) -- ++(19mm,0) |- (m6.east);
\draw[exted] ([yshift=1mm]a3.east) -- ++(21mm,0) |- (m3.east);

\end{tikzpicture}
\caption{Running example: HTL representation comprising two state machines}
\label{fig:example_HTL}
\end{figure}

In more detail: Execution begins in state 9 of the \lstinline{main} machine, and proceeds through successive states until it reaches state 7, in which the \lstinline{add} machine's \lstinline{rst} signal is raised. This puts the \lstinline{add} machine into state 2. When the \lstinline{main} machine advances to state 12, that \lstinline{rst} signal is lowered; the \lstinline{add} machine then begins its computation while the \lstinline{main} machine spins in state 12. When the \lstinline{add} machine has finished its computation (state 1), it asserts its \lstinline{fin} signal. This allows the \lstinline{main} machine to progress from state 12. Finally, the \lstinline{add} machine lowers its \lstinline{fin} and waits in state 3 until the next call.

The same sequence of events can also be understood using a timing diagram, as given in Figure~\ref{fig:timingdiagram}. In that diagram, the red lines indicate unspecified values. We see that each call of \lstinline{add} begins with a pulse on \lstinline{add.rst} (fifth waveform) and ends with a pulse on \lstinline{add.fin} (sixth waveform).

\begin{figure}
\begin{tikztimingtable}[timing/d/background/.style={fill=white}, timing/xunit=1em]
{\tt\small clk} & 0.5L 38{0.5C} \\
{\tt\small rst} & 0.5H 19L \\
{\tt\small fin} & 0.5X 17L H L \\
{\tt\small state} & 0.5X D{9} D{8} D{7} 4D{12} D{6} D{5} D{4} 4D{10} D{3} D{2} D{1} 2D{11} \\
{\tt\small rst} & 3.5X H 6L H 8L \\
{\tt\small fin} & 4.5X 2L H 6L H 5L \\
{\tt\small state} & 4.5X D{2} D{1} 5D{3} D{2} D{1} 6D{3} \\
\extracode
\foreach\i in {1,2,...,19}
\node[help lines, anchor=west, inner sep=0] at (\i-0.5,2.25) {\tiny \i};
\begin{pgfonlayer}{background}
  \vertlines[help lines]{0.5,1.5,...,18.5}
\end{pgfonlayer}
\draw[decoration={brace,mirror,raise=5pt},decorate] (-3,-1) to node[left=6pt] {\tt\small main} (-3,-6.5);
\draw[decoration={brace,mirror,raise=5pt},decorate] (-3,-7) to node[left=6pt] {\tt\small add} (-3,-12.5);
\end{tikztimingtable}
\caption{Timing diagram for the state machines in Figure~\ref{fig:example_HTL}}
\label{fig:timingdiagram}
\end{figure}

\section{Proving Vericert-Fun correct}


\section{Rough notes}
A few points that we might want to address at some point:

\begin{itemize}

\item What is so good about having a machine-checked correctness proof for an HLS tool? Explain that it is the gold-standard for high-integrity computer systems. Give example of Csmith finding 0 bugs in the verified parts of CompCert, compared to 100s in GCC and LLVM. Explain that machine-checked correctness proofs are becoming more common these days thanks to advances in automated theorem proving technology. Explain that we can't guarantee that hardware produced via Vericert will never go wrong, because:
\begin{itemize}
\item the translation from the Coq source code of Vericert into executable OCaml is unverified,
\item the compilation of that OCaml program is unverified,
\item the machine that runs the compiled OCaml program could go wrong, 
\item the Coq proof assistant could contain a bug, as could the machine running it,
\item the pretty-printing of the final Verilog design is unverified,
\item the synthesis of that Verilog design into a netlist is unverified, as are the tools that perform place-and-route and FPGA bitstream generation,
\item the execution on the FPGA could go wrong, e.g. as a result of cosmic rays.
\end{itemize}

\item What does Vericert currently do?

\item What is needed for the correctness proof? (Semantics for C, semantics for Verilog, ...)

\item We considered having multiple Verilog modules; this would be more idiomatic, but this would require more invasive changes to the semantics.

\end{itemize}


\appendix

\section{Verilog output for running example}

\begin{lstlisting}
module main (clk, rst, ret, fin)
  input clk, rst;
  output reg [31:0] ret;
  output reg fin;
  reg[31:0] state;
  reg[31:0] reg_1, reg_2, reg_3, reg_4, reg_5, reg_6;
  reg add_rst, add_fin; 
  reg[31:0] add_ret, add_x2, add_x1, reg_7, add_state;
  
  always @(posedge clk) // control logic for "add"
    if ({add_rst == 1}) begin 
      add_state <= 2; add_fin <= 0; 
    end else begin 
      case (add_state)
        |\HL5{2: add\_state <= 1;}|
        |\HL5{1: add\_state <= 3;}|
        3: ;
        default: ;
      endcase
    end   
    
  always @(posedge clk) // datapath for "add"    
    case (add_state)
      |\HL5{2: reg\_7 <= {{add\_x2 + add\_x1} + 0};}|
      |\HL5{1: add\_fin = 1;}|
         |\HL5{add\_ret = reg\_7;}|
      3: add_fin <= 0;
    endcase

  always @(posedge clk) // control logic for "main"
    if ({rst == 1}) begin 
      state <= 9; fin <= 0; 
    end else begin
      case (state)
        |\HL1{9: state <= 8;}|
        |\HL2{8: state <= 7;}|
        |\HL2{7: state <= 12;}|
       |\HL2{12: if ({add\_fin == 1}) state <= 6;}|
        |\HL2{6: state <= 5;}|
        |\HL3{5: state <= 4;}|
        |\HL3{4: state <= 10;}|
       |\HL3{10: if ({add\_fin == 1}) state <= 3;}|
        |\HL3{3: state <= 2;}|
        |\HL4{2: state <= 1;}|
        |\HL4{1: state <= 11;}|
       |\HL0{11: ;}|
      endcase
    end

  always @(posedge clk) // datapath for "main"
    case (state)
      |\HL1{9: reg\_3 <= 0;}|
      |\HL2{8: reg\_6 <= 1;}|
      |\HL2{7: add\_rst <= 1;}|
         |\HL2{add\_x2 <= reg\_3;}|
         |\HL2{add\_x1 <= reg\_6;}|
     |\HL2{12: add\_rst <= 0;}|
         |\HL2{reg\_1 <= add\_ret;}|
      |\HL2{6: reg\_3 <= reg\_1;}|
      |\HL3{5: reg\_5 <= 2;}|
      |\HL3{4: add\_rst <= 1;}|
         |\HL3{add\_x2 <= reg\_3;}|
         |\HL3{add\_x1 <= reg\_5;}|
     |\HL3{10: add\_rst <= 0;}|
         |\HL3{reg\_2 <= add\_ret;}|
      |\HL3{3: reg\_3 <= reg\_2;}|
      |\HL4{2: reg\_4 <= reg\_3;}|
      |\HL4{1: fin = 1;}|
         |\HL4{ret = reg\_4;}|
     |\HL0{11: fin <= 0;}|
    endcase
endmodule
\end{lstlisting}


\bibliographystyle{ACM-Reference-Format}
\bibliography{references}

\end{document}