summaryrefslogtreecommitdiffstats
path: root/verified_resource_sharing.tex
blob: 14b2c299939e3bb9f62797c7744c584705047a78 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
\documentclass[conference]{IEEEtran}
\usepackage{amsmath}
\usepackage{xcolor}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{graphicx}
\usepackage{calc}  
\usepackage{enumitem} 
\usepackage{tabu}
\usepackage{tikz}
\usepackage{pgfplots}
\usepackage{pgfplotstable}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage[sort, numbers]{natbib}

\usepackage[textsize=small,shadow]{todonotes}%
\usepackage{soul}
\usepackage{subcaption}
\usepackage{tikz-timing}
\usetikzlibrary{fit}
\usetikzlibrary{decorations.pathreplacing}
\usetikzlibrary{shapes,calc,arrows.meta}
\usetikzlibrary{pgfplots.groupplots}
\pgfplotsset{compat=1.16}

\usepackage{listings}
\lstset{
basicstyle=\tt\small,
columns=fixed, %basewidth=0.5em, %tip from https://tex.stackexchange.com/a/179072
escapeinside=||,
}
\setlength{\fboxsep}{1.5pt} % to reduce \colorbox padding when highlighting

%\usepackage{minted}
%\setminted{
%fontsize=\small,
%escapeinside=||,
%}
%\usemintedstyle{manni}

\definecolor{highlight0}{HTML}{ffffff}
\definecolor{highlight1}{HTML}{b4d6d0} % 8dd3c7
\definecolor{highlight2}{HTML}{fcfcdc} % ffffb3
\definecolor{highlight3}{HTML}{c9c7d6} % bebada
\definecolor{highlight4}{HTML}{fcc1bb} % fb8072
\definecolor{highlight5}{HTML}{a9c2d4} % 80b1d3

\newcommand\HL[2]{\colorbox{highlight#1}{\vphantom{A}\smash{#2}}}

% Leave review comments using
% \jwcomment{...} (for John) or \yhcomment{...} (for Yann)
% Using either directly leaves a margin note, using it as,
% e.g. \jwcomment[inline]{...} leaves an inline comment
\newcommand{\jwcomment}[2][]{\todo[author={John}, color=ACMLightBlue, #1]{#2}}
\newcommand{\yhcomment}[2][]{\todo[author={Yann}, color=ACMGreen, #1]{#2}}
\newcommand{\mpcomment}[2][]{\todo[author={Michalis}, color=ACMMagenta, #1]{#2}}


\newif\ifCOMMENTS
\COMMENTStrue
\newcommand{\Comment}[3]{\ifCOMMENTS\textcolor{#1}{{\bf [\![#2:} #3{\bf ]\!]}}\fi}
\newcommand\JW[2][]{\st{#1}\Comment{red!75!black}{JW}{#2}}
\newcommand\YH[2][]{\st{#1}\Comment{blue!50!green}{YH}{#2}}
\newcommand\MP[2][]{\st{#1}\Comment{blue!50!magenta}{MP}{#2}}

\newcommand\vericert{Vericert}
\newcommand\vericertfun{Vericert-Fun}
\newcommand\bambu{Bambu}

\title{Resource Sharing for Verified High-Level Synthesis \\ (Work in Progress)}

\begin{document}

%\author{Michail Pardalos}
%\email{michail.pardalos17@imperial.ac.uk}
%\affiliation{\institution{Imperial College London}}

%\author{Yann Herklotz}
%\email{yann.herklotz15@imperial.ac.uk}
%\affiliation{\institution{Imperial College London}}

%\author{John Wickerson}
%\email{j.wickerson@imperial.ac.uk}
%\affiliation{\institution{Imperial College London}}

\maketitle

\begin{abstract}
High-level synthesis (HLS) is playing an ever-increasing role in hardware design, but concerns have been raised about its reliability. Seeking to address these, Herklotz et al. have developed an HLS compiler called \vericert{} that has been mechanically proven (using the Coq proof assistant) to output Verilog designs that are behaviourally equivalent to the input C program. Unfortunately, \vericert{} cannot compete performance-wise with established HLS tools, and a major reason for this is \vericert{}'s complete lack of support for \emph{resource sharing}. 
  
This paper introduces \vericertfun: \vericert{} extended with function-level resource sharing. Where original \vericert{} creates one block of hardware per function \emph{call}, \vericertfun{} creates one block of hardware per function \emph{definition}. To enable this, we extend \vericert{}'s intermediate language HTL with the ability to represent \emph{multiple} state machines, and we implement function calls by transferring control between these state machines. We are working to extend \vericert{}'s formal correctness proof to cover the translation from C into this extended HTL and thence to Verilog. Benchmarking on the PolyBench/C suite indicates that \vericertfun{} generates hardware with about half the resource usage of \vericert{}'s on average, with minimal impact on execution time.
\end{abstract}

\section{Introduction}
\label{sec:introduction}

The drive for faster, more energy-efficient computation has led to a surge in demand for custom hardware accelerators. This, in turn, has led to a great deal of interest in \emph{high-level synthesis} (HLS) as a means to program these devices.

However, doubts have been raised about the reliability of the current crop of HLS tools. For instance, \citet{Herklotz2021_empiricalstudy} found numerous miscompilation bugs in commercial HLS tools including Xilinx Vivado HLS~\cite{xilinx_vitis}, the Intel HLS Compiler~\cite{intel_hls}, and LegUp~\cite{legup_CanisAndrew2011}. This unreliability can be a significant hindrance in the development process, especially given the long iteration times of hardware design compared to software, and it undermines the usefulness of HLS in safety- or security-critical settings. %It is therefore essential to ensure that HLS tools are as reliable as possible.

Aiming to address this issue is \vericert{}~\cite{vericert}, a new HLS tool whose correctness has been checked to the highest possible standard: machine-checked proof. It provides a proof, checked using the Coq proof assistant~\cite{coq}, that every step of its translation from C to Verilog preserves the semantics of its input program. This means that we can always trust any Verilog design produced by \vericert{} to behave the same way as the C program given as input. %It is based on the CompCert~\cite{compcert_Leroy2009} verified C compiler.

Clearly, however, it is not enough for an HLS tool simply to be \emph{correct}; the generated hardware must also enjoy high throughput, low latency, and good \emph{area efficiency} -- the last of which is the topic of this paper. A common optimisation employed by HLS tools to improve area efficiency is \emph{resource sharing}; that is, re-using hardware for more than one purpose. Accordingly, our work adds function-level resource sharing to \vericert{}, yielding a new prototype HLS tool called `\vericertfun'. In keeping with the aims of the \vericert{} project, work is ongoing to extend the correctness proof.

The entire \vericertfun{} development is fully open-source~\cite{vericertfun-github}, and more details about the implementation and proofs are available in a technical report~\cite{pardalos_thesis}.

\section{Background}

\paragraph{The Coq proof assistant} \vericert{} is implemented using the Coq proof assistant~\cite{coq}. This means that it consists of a collection of functions that define the compilation process, together with the proof of a theorem stating that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, and then automatically translates the function definitions into OCaml code that can be compiled and executed. 

Engineering a software system within a proof assistant like Coq is widely held to be the gold standard for correctness. Recent years have shown that it is feasible to design substantial pieces of software in this way, such as database management systems~\cite{malecha+10}, web servers~\cite{chlipala15}, and operating system kernels~\cite{gu+16}. Coq has also been successfully deployed in the hardware design process, both in academia~\cite{braibant+13, bourgeat+20} and in industry~\cite{silveroak}. It has even been applied specifically to HLS: Faissole et al.~\cite{faissole+19} used it to verify that HLS optimisations respect dependencies in the source code.

\paragraph{The CompCert verified C compiler} Among the most celebrated applications of Coq is the CompCert project~\cite{compcert}. CompCert is a lightly optimising C compiler, with backend support for the Arm, x86, PowerPC, and Kalray VLIW architectures~\cite{six+20}, that has been implemented and proven correct in Coq. CompCert accepts most of the C99 language, and generally produces code of comparable performance to that produced by GCC at \texttt{-O1}. It transforms its input through a series of ten intermediate languages before generating the final output. This design ensures that each individual pass remains relatively simple and hence feasible to prove correct. The correctness proof of the entire compiler is formed by composing the correctness proofs of each of its passes. 

That the Csmith compiler testing tool has found hundreds of bugs in GCC and LLVM but none in (the verified parts of) CompCert~\cite{csmith} is a testament to the reliability of this development approach. That said, we cannot guarantee that hardware produced via \vericertfun{} will never go wrong, because of fallibilities in components not covered by the theorem, including the computer checking the proofs, the synthesis toolchain~\cite{verismith}, and the FPGA device itself.
 
\paragraph{The \vericert{} verified HLS tool}

Introduced by \citet{vericert}, \vericert{} is a verified C-to-Verilog HLS tool. It was built by extending CompCert with a new hardware-oriented intermediate language (called HTL) and a Verilog backend. \vericert{} currently performs no significant optimisations beyond those inherited from CompCert's frontend. This results in performance generally about one order of magnitude slower than that achieved by comparable, unverified HLS tools.

\vericert{} branches off from CompCert at the intermediate language called \emph{register-transfer language} (RTL). \JW{Got up to here; will finish word-chopping pass tomorrow morning.} 
Since that abbreviation is usually used in the hardware community for `register-transfer level', we shall henceforth avoid possible confusion by referring to the CompCert intermediate language as `3AC' (for `three-address code'). 
In the 3AC language, each function in the program is represented as a numbered list of instructions with gotos -- essentially, a control-flow graph (CFG). The essence of \vericert{}'s compilation strategy is to treat this CFG as a state machine, with each instruction in the CFG becoming a state, and each edge in the CFG becoming a transition. Moreover, program variables that do not have their address taken are mapped to hardware registers; other variables (including arrays and structs) are allocated in a block of RAM that represents the stack. More precisely, \vericert{} builds a \emph{finite state machine with datapath} (FSMD). This comprises two maps, both of which take the current state as their input: the \emph{control logic} map for determining the next state, and a \emph{datapath} map for updating the RAM and registers. These state machines are captured in \vericert{}'s new intermediate language, HTL. When \vericert{} compiles from HTL to the final Verilog output, these maps are converted from proper `mathematical' functions into syntactic Verilog case-statements, and each is placed inside an always-block.

The overall \vericert{} flow is shown at the top of Figure~\ref{fig:flow}. The key point to note here is the `inlining' step, which folds all function definitions into their call sites. This allows \vericert{} to make the simplifying assumption that there is only a single CFG, but has the unwanted effect of duplicating hardware. In this work, we remove some of this inlining and hence some of the duplication.

\paragraph{Resource sharing in HLS}

Resource sharing is a feature expected of most HLS compilers. In a typical architecture generated by HLS~\cite{coussy+09}, a number of `functional components' are selected from a library according to the needs of the specific design, and in the scheduling process, a clock cycle is chosen for each operation such that the components it requires are available. Given the need to mechanically verify the correctness of our implementation, \vericertfun{} follows a simpler approach: we share resources at the granularity of entire functions, rather than individual operations. Function-level resource sharing is implemented in commercial HLS compilers such as the Intel HLS compiler~\cite{intel_hls} or
Xilinx Vitis~\cite{xilinx_vitis}, and is guided by the programmer through appropriate pragmas.

Perna et al.~\cite{perna+12} developed a verified HLS tool for the Handel-C language, but, like \vericert{}, they did not implement function-level resource sharing, instead arranging that ``all procedure and function calls are expanded in the
front-end''.

\begin{figure}
\definecolor{colorflow}{HTML}{8EB85B}

\centering
\begin{tikzpicture}[yscale=-1]
\tikzset{flowlines/.style={draw, colorflow, ultra thick}}

\node(fun1) at (1.5,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (1.5,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node(CFG) at (0.75,4) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD) at (0.75,5) {state machine};
\node[anchor=west](HTL) at (-2,5) {\bf HTL:};
\node(module) at (0.75,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto, pos=0.2] {inlining} (CFG);
\draw[->] (CFG2) to (CFG);
\draw[->] (CFG) to node [auto] (smg) {state machine generation} (FSMD);
\draw[->] (FSMD) to node [auto] {Verilog generation} (module);
\coordinate (r) at (6,3);

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(r)] (background1) {};
\end{pgfonlayer}

\begin{scope}[yshift=55mm]
\node(fun1) at (2,2) {function};
\node(fun2) at (0,2) {function};
\node[anchor=west](C) at (-2,2) {\bf C:};
\node(CFG1) at (2,3) {CFG};
\node(CFG2) at (0,3) {CFG};
\node[anchor=west](TAC) at (-2,3) {\bf 3AC:};
\node(FSMD1) at (2,4) {state machine};
\node(FSMD2) at (0,4) {state machine};
\node(FSMD3) at (2,5) {state machine};
\node(FSMD4) at (0,5) {state machine};
\node[anchor=west](HTL) at (-2,4) {\bf HTL:};
\node(module) at (1,6) {module};
\node[anchor=west](Verilog) at (-2,6) {\bf Verilog:};
\draw[->] (fun1) to node [auto] {CompCert frontend} (CFG1);
\draw[->] (fun2) to (CFG2);
\draw[->] (CFG1) to node [auto] (smg) {state machine generation} (FSMD1);
\draw[->] (CFG2) to (FSMD2);
\draw[->] (FSMD1) to node [auto] {renaming} (FSMD3);
\draw[->] (FSMD2) to (FSMD4);
\draw[->] (FSMD3) to node [auto, pos=0.2] {Verilog generation} (module);
\draw[->] (FSMD4) to (module);
\coordinate (r) at (6,3);

\end{scope}

\begin{pgfonlayer}{background}
\node[flowlines, fit=(C)(module)(r)] (background2) {};
\end{pgfonlayer}

\draw[flowlines, -latex] (background1.south) to node [auto] {\bf this work} (background2.north -| background1.south);

\end{tikzpicture}
\caption{Key compilation passes and intermediate languages in \vericert{}~\cite{vericert} (top) and \vericertfun{} (bottom)}
\label{fig:flow}
\end{figure}

\section{Implementation of \vericertfun}

\tikzset{
st/.style={draw=black, fill=white, rounded corners, align=left, font=\tt\footnotesize, minimum width=40mm},
ed/.style={draw, ->},
exted/.style={draw, thick, dotted, ->, rounded corners},
lab/.style={anchor=south west, inner sep=1pt, font=\tiny},
tit/.style={anchor=north west, font=\tt\footnotesize},
edlab/.style={auto, inner sep=2pt, align=left, font=\tt\footnotesize}
}

In this section, we shall explain the implementation of \vericertfun, using Figure~\ref{fig:example_C} as a worked example. The basic idea is shown at the bottom of Figure~\ref{fig:flow}: we avoid inlining the function calls at the 3AC level (except in certain circumstances described below), instead maintaining one state machine per function. All the state machines run simultaneously, and function calls are implemented by sending messages between the state machines. We combine all of these state machines into a single Verilog module, after renaming variables as necessary to avoid clashes.

\begin{figure}
\centering
\begin{minipage}[b]{0.45\linewidth}
\begin{lstlisting}
int add(int a, int b) 
{ 
  |\HL5{return a + b;}| 
}

int main() 
{
  |\HL1{int v = 0;}|
  |\HL2{v = add(v, 1);}|
  |\HL3{v = add(v, 2);}|
  |\HL4{return v;}|
}
\end{lstlisting}
\caption{A simple C program with each instruction colour-coded so it can be tracked through the compilation stages}
\label{fig:example_C}
\end{minipage}
\hfill
\begin{minipage}[b]{0.5\linewidth}
% \begin{lstlisting}
% add (x2, x1) {
%   |\HL5{2: x3 = x2 + x1 + 0 (int)}|
%   |\HL5{1: return x3}|
% }
% main () {
%   |\HL1{9: x3 = 0}|
%   |\HL2{8: x6 = 1}|
%   |\HL2{7: x1 = "add"(x3, x6)}|
%   |\HL2{6: x3 = x1}|
%   |\HL3{5: x5 = 2}|
%   |\HL3{4: x2 = "add"(x3, x5)}|
%   |\HL3{3: x3 = x2}|
%   |\HL4{2: x4 = x3}|
%   |\HL4{1: return x4}|
% }
% \end{lstlisting}
\begin{tikzpicture}[node distance=2mm]
\node[st, fill=highlight5]    (a2) at (0,-1) {x3 = x2 + x1 + 0(int)};
\node[st, fill=highlight5, below=of a2] (a1) {return x3};
\draw[ed] (a2) to (a1);
\node[tit] at ([xshift=-4mm, yshift=5mm]a2.north west) (labadd) {add(x2, x1) \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]a1.south west) (labadd2) {\}};
\begin{scope}[yshift=-24mm]
\node[st, fill=highlight1]    (m9) at (0,-1) {x3 = 0};
\node[st, fill=highlight2, below=of m9] (m8) {x6 = 1};
\node[st, fill=highlight2, below=of m8] (m7) {x1 = "add"(x3, x6)};
\node[st, fill=highlight2, below=of m7] (m6) {x3 = x1};
\node[st, fill=highlight3, below=of m6] (m5) {x5 = 2};
\node[st, fill=highlight3, below=of m5] (m4) {x2 = "add"(x3, x5)};
\node[st, fill=highlight3, below=of m4] (m3) {x3 = x2};
\node[st, fill=highlight4, below=of m3] (m2) {x4 = x3};
\node[st, fill=highlight4, below=of m2] (m1) {return x4};
\draw[ed] (m9) to (m8);
\draw[ed] (m8) to (m7);
\draw[ed] (m7) to (m6);
\draw[ed] (m6) to (m5);
\draw[ed] (m5) to (m4);
\draw[ed] (m4) to (m3);
\draw[ed] (m3) to (m2);
\draw[ed] (m2) to (m1);
\node[tit] at ([xshift=-4mm, yshift=5mm]m9.north west) (labmain) {main() \{};
\node[tit] at ([xshift=-4mm, yshift=-1mm]m1.south west) (labmain2) {\}};
\end{scope}
\foreach\i in {1,2,...,9}
  \node[lab, anchor=south east] at (m\i.west) {\i};
\foreach\i in {1,2}
  \node[lab, anchor=south east] at (a\i.west) {\i};
\end{tikzpicture}
\caption{3AC representation comprising two CFGs}
\label{fig:example_3AC}
\end{minipage}

\end{figure}

Figure~\ref{fig:example_3AC} shows the 3AC representation of that C program, as obtained by the CompCert frontend. It takes the form of two CFGs, one per function. The control flow in this example is straightforward, but in general, 3AC programs can contain unrestricted gotos. The nodes of the CFGs are numbered in reverse order, as are the parameters of the \lstinline{add} function; both are conventions inherited from CompCert.

Figure~\ref{fig:example_HTL} depicts the result of converting those CFGs into two state machines. The conversion of 3AC instructions into Verilog instructions has been described already by \citet{vericert}; what is new here is the handling of function calls, so the following description concentrates on that aspect. Note that ``$*{\rightarrow}\langle\mathit{node}\rangle$'' is an abbreviation for edges from all nodes to $\langle\mathit{node}\rangle$.

The solid edges within the two state machines indicate the transfer of control between states, while the dashed edges between the state machines are more `fictional'. The ground truth is that both state machines run continuously, but it is convenient to think of only one of the machines as doing useful work at any point in time. So, the dashed edges indicate the points at which the `active' machine changes.

\definecolor{background}{HTML}{eeeeee}
\begin{figure}
\centering
\begin{tikzpicture}[node distance=5mm]
\node[st, fill=highlight1] (m9) at (0,0)     {reg\_3 <= 0;};
\node[st, fill=highlight2, below=of m9] (m8) {reg\_6 <= 1;};
\node[st, fill=highlight2, below=of m8] (m7) {add\_0\_rst <= 1; \\ add\_0\_a <= reg\_3; \\ add\_0\_b <= reg\_6;};
\node[st, fill=highlight2, below=of m7] (m12) {add\_0\_rst <= 0; \\ reg\_1 <= add\_0\_ret;};
\node[st, fill=highlight2, below=of m12] (m6) {reg\_3 <= reg\_1;};
\node[st, fill=highlight3, below=of m6] (m5)  {reg\_5 <= 2;};
\node[st, fill=highlight3, below=of m5] (m4) {add\_1\_rst <= 1; \\ add\_1\_a <= reg\_3; \\ add\_1\_b <= reg\_5;};
\node[st, fill=highlight3, below=of m4] (m10) {add\_1\_rst <= 0; \\ reg\_2 <= add\_1\_ret;};
\node[st, fill=highlight3, below=of m10] (m3) {reg\_3 <= reg\_2;};
\node[st, fill=highlight4, below=of m3] (m2)  {reg\_4 <= reg\_3;};
\node[st, fill=highlight4, below=of m2] (m1)  {fin = 1; \\ ret = reg\_4;};
\node[st, fill=highlight4, below=of m1] (m11) {fin <= 0;};

\node[st, fill=highlight5, above=38mm of m9] (a2) {reg\_7 <= x2 + x1 + 0;};
\node[st, fill=highlight5, below=of a2] (a1) {fin = 1; \\ ret = reg\_7;};
\node[st, fill=highlight5, below=of a1] (a3) {fin <= 0;};

\draw[ed] (m9) to node[edlab] {!rst} (m8);
\draw[ed] (m8) to node[edlab] {!rst} (m7);
\draw[ed] (m7) to node[edlab] {!rst} (m12);
\draw[ed] (m12) to node[edlab] {!rst \&\& add\_0\_fin} (m6);
\draw[ed] (m6) to node[edlab] {!rst} (m5);
\draw[ed] (m5) to node[edlab] {!rst} (m4);
\draw[ed] (m4) to node[edlab] {!rst} (m10);
\draw[ed] (m10) to node[edlab] {!rst \&\& add\_1\_fin} (m3);
\draw[ed] (m3) to node[edlab] {!rst} (m2);
\draw[ed] (m2) to node[edlab] {!rst} (m1);
\draw[ed] (m1) to node[edlab] {!rst} (m11);
\draw[ed] (m12) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add\_0\_fin} (m12);
\draw[ed] (m10) to[loop right, looseness=2, out=-4, in=4] node[edlab, swap] {!rst \&\& \\ !add\_1\_fin} (m10);
\draw[ed] (m11) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (m11);

\node[inner sep=1pt] (ast1) at ([xshift=5mm, yshift=-5mm]m9.south east) {*};
\draw[ed] (ast1.north) to[bend right] node[edlab, swap] {rst} (m9.east);

\draw[ed] (a2) to node[edlab] {!rst} (a1);
\draw[ed] (a1) to node[edlab] {!rst} (a3);
\draw[ed] (a3) to[loop below, looseness=10, pos=0.15] node[edlab] {!rst} (a3);

\node[inner sep=1pt] (ast2) at ([xshift=5mm, yshift=-5mm]a2.south east) {*};
\draw[ed] (ast2.north) to[bend right] node[edlab, swap] {rst} (a2.east);

\coordinate (nw1) at ([xshift=-1mm, yshift=6mm]m9.north west); 
\coordinate (se1) at ([xshift=44mm, yshift=-4mm]m11.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw1)(se1)] (main) {};
\end{pgfonlayer}
\node[tit, above=2mm of m9] (labmain) {\bfseries main()};
\coordinate (nw2) at ([xshift=-1mm, yshift=6mm]a2.north west); 
\coordinate (se2) at ([xshift=16mm, yshift=-4mm]a3.south east);
\begin{pgfonlayer}{background}
\node[draw, fill=background, rounded corners, fit=(nw2)(se2)] (add) {};
\end{pgfonlayer}
\node[tit, above=2mm of a2] (labadd) {\bfseries add(x2,x1)};

\foreach\i in {1,2,...,12}
  \node[lab] at (m\i.north west) {\i};
\foreach\i in {1,2,3}
  \node[lab] at (a\i.north west) {\i};

\draw[exted] (m7.west) -- ++(-5mm,0) |- ([yshift=-1mm]a2.west);
\draw[exted] (m4.west) -- ++(-7mm,0) |- ([yshift=1mm]a2.west);
\draw[exted] ([yshift=-1mm]a3.east) -- ++(21mm,0) |- (m6.east);
\draw[exted] ([yshift=1mm]a3.east) -- ++(23mm,0) |- (m3.east);

\node [align=left, right=3mm of m1, font=\tt\footnotesize] {
{\bfseries extern\_ctrl:} \\ 
add\_0\_rst $\mapsto$ (add, \textrm{reset}) \\
add\_0\_ret $\mapsto$ (add, \textrm{return}) \\
add\_0\_fin $\mapsto$ (add, \textrm{finish}) \\
add\_0\_a $\mapsto$ (add, \textrm{param 0}) \\
add\_0\_b $\mapsto$ (add, \textrm{param 1}) \\
add\_1\_rst $\mapsto$ (add, \textrm{reset}) \\
add\_1\_ret $\mapsto$ (add, \textrm{return}) \\
add\_1\_fin $\mapsto$ (add, \textrm{finish}) \\
add\_1\_a $\mapsto$ (add, \textrm{param 0}) \\
add\_1\_b $\mapsto$ (add, \textrm{param 1})
}; 

\end{tikzpicture}
\caption{HTL representation comprising two state machines}
\label{fig:example_HTL}
\medskip
\begin{tikztimingtable}[timing/d/background/.style={fill=white}, timing/xunit=1em, timing/yunit=0.6em]
{\tt\footnotesize clk}\!\!\! & 0.5L 38{0.5C} \\
{\tt\footnotesize rst}\!\!\! & 0.5H 19L \\
{\tt\footnotesize fin}\!\!\! & 0.5X 17L H L \\
{\tt\footnotesize state}\!\!\! & 0.5X D{9} D{8} D{7} 4D{12} D{6} D{5} D{4} 4D{10} D{3} D{2} D{1} 2D{11} \\
{\tt\footnotesize rst}\!\!\! & 3.5X H 6L H 8L \\
{\tt\footnotesize fin}\!\!\! & 4.5X 2L H 6L H 5L \\
{\tt\footnotesize state}\!\!\! & 4.5X D{2} D{1} 5D{3} D{2} D{1} 6D{3} \\
\extracode
\foreach\i in {1,2,...,19}
\node[help lines, anchor=west, inner sep=0] at (\i-0.5,2.25) {\tiny \i};
\begin{pgfonlayer}{background}
  \vertlines[help lines]{0.5,1.5,...,18.5}
\end{pgfonlayer}
\draw[decoration={brace,mirror,raise=5pt},decorate] (-2.6,-1) to node[left=6pt] {\tt\footnotesize main} (-2.6,-6.5);
\draw[decoration={brace,mirror,raise=5pt},decorate] (-2.6,-7) to node[left=6pt] {\tt\footnotesize
add} (-2.6,-12.5);
\end{tikztimingtable}
\caption{Timing diagram for the state machines in Figure~\ref{fig:example_HTL}}
\label{fig:timingdiagram}

\end{figure}

In more detail: Execution begins in state 9 of the \lstinline{main} machine, and proceeds through successive states until it reaches state 7, in which the \lstinline{add} machine's \lstinline{rst} signal is raised. This causes the \lstinline{add} machine to advance to state 2. When the \lstinline{main} machine advances to state 12, that \lstinline{rst} signal is lowered; the \lstinline{add} machine then begins its computation while the \lstinline{main} machine spins in state 12. When the \lstinline{add} machine has finished its computation (state 1), it asserts its \lstinline{fin} signal. This allows the \lstinline{main} machine to leave state 12. Finally, the \lstinline{add} machine lowers its \lstinline{fin} and waits in state 3 until the next call.

The same sequence of events can also be understood using a timing diagram, as given in Figure~\ref{fig:timingdiagram}. In that diagram, the red lines indicate unspecified values. %We see that each call of \lstinline{add} begins with a pulse on \lstinline{add.rst} (fifth waveform) and ends with a pulse on \lstinline{add.fin} (sixth waveform).
We see that calls are initiated by triggering the \lstinline{rst} signal of the called module. Similarly, a function returns by setting its own \lstinline{fin} register. %There are no `call' or `return' instructions in HTL.

One technical challenge we encountered in the implementation of \vericertfun{} has to do with the fact that the called and callee state machines need to modify each other's variables. This is problematic because each function is translated into a state machine independently, and hence the register names used in the other state machines are not necessarily available. We work around this problem by introducing an additional component to our state machines: a mapping, called \lstinline{extern_ctrl}, that maps local register names to pairs of module identifiers and roles in those modules. For instance, the first entry in \lstinline{extern_ctrl} in Figure~\ref{fig:example_HTL} tells us that the \lstinline{add_0_rst} register used by the \lstinline{main} state machine should resolve to whichever register plays the role of `reset' in the \lstinline{add} state machine. Once all the state machines have been generated, we erase \lstinline{extern_ctrl}. We do this in two steps. First, we perform a renaming pass through
the whole program, making all register names globally unique. This is necessary to avoid
unintended conflicts between register names in different modules, as register names can only be assumed to be unique within their own module. We then do a second pass, renaming
registers in \lstinline{extern_ctrl} to the name of the actual register they target. %\JW{I'm still struggling to understand externctrl properly. I don't see why we can't resolve externctrl straight away, when each HTL state machine is first generated. Why do we have to wait until they are linked together?}\YH{I think that there are multiple reasons: The first is that it's very difficult to access the function information from other functions, because this requires resolving their ID in the global AST state and then inspecting it.  If one ends up using this information, it can be very tricky to prove the induction, because one isn't local to the current function anymore.  All the proof structure in CompCert assumes one only deals within the current function.  This also complicates the definition of the semantics.  Secondly, I think it just helps as a bridge, because functions have their completely separate register files in 3AC.  It's therefore quite difficult to map registers from one register file to the other unless one has something that describes which ones to map over.}

A second technical challenge we encountered in the implementation of \vericertfun{} has to do with an assumption made in \vericert{}'s correctness proof: that all pointers used by the input program are stored as offsets from the main function's stack frame. This assumption was reasonable for \vericert{} because after full inlining, the only function is the main function. This assumption is baked into several of \vericert{}'s most complicated lemmas, including the correctness proof for \lstinline{load} and \lstinline{store} instructions, and so we have not sought to lift it in the current incarnation of \vericertfun{}. Instead, we have made the compromise of only \emph{partially} eliminating the inlining pass. That is: \vericertfun{} inlines all functions that contain \lstinline{load}, \lstinline{store} or \lstinline{call} instructions. Thus, the benefits of resource sharing are only enjoyed by functions that do not include these instructions.

\section{Proving \vericertfun{} correct}

The CompCert correctness theorem~\cite{compcert} expresses that every behaviour that can be exhibited by the compiled program is also a behaviour of the source program. \vericert{}~\cite{vericert} adapted this theorem for HLS by replacing `compiled program' with `generated Verilog design'. In both cases, a formal semantics is required for the source and target languages. \vericertfun{} targets the same fragment of the Verilog language as \citeauthor{vericert} already mechanised in Coq, so no changes are required there. 

Where changes \emph{are} required is in the semantics of the intermediate language HTL, which sits between CompCert's 3AC and the final Verilog.
When \citeauthor{vericert} designed HTL, they did not include a semantics for function calls because they assumed all function calls would already have been inlined. We have extended HTL so that its semantics is additionally parameterised by an environment that maps function names to state machines. We have added a semantics for function calls that looks up the named function in this environment, activates the corresponding state machine, and creates a new frame on the stack. We have also added a semantics for return statements that pops the current frame off the stack and reactivates the caller's state machine. % \MP{One aspect of this I didn't mention: This introduces non-determinism. When you set the reset of a module, the next state can be either a call or a normal executing state, ignoring the reset. This non-determinism does not affect the RTL-to-HTL proof, but could affect the HTL-to-Verilog one.}

At the point of writing, the correctness of \vericertfun{} from C to HTL has been mostly proven: basic instructions and function calls are proven correct, but proofs of load and store instructions are not finished as they lack some key invariants. The pass that renames
variables in HTL is yet to be proven, as is the pass that generates the final Verilog. 

To give a rough idea of the scale and complexity of the task: the implementation of \vericertfun{} involved adding or changing about 700 lines of Coq code in \vericert{} and took the first author 4 months. The correctness proof, so far, has required about 2300 lines of additions or changes to the Coq code and 8 person-months of work.

\section{Performance evaluation}

We now compare the quality of the hardware generated by \vericertfun{} against that generated by \vericert{}. Following \citet{vericert}, we use the PolyBench/C benchmark suite~\cite{polybench} with division and modulo replaced with iterative software implementations because \vericert{} does not handle them inefficiently. That is, \lstinline{a/b} and \lstinline{c%d}
are textually replaced with \lstinline{div(a,b)} and \lstinline{mod(c,d)}. These \lstinline{div} and \lstinline{mod} functions are the only \YH{Non-inlinable? There are always 4 functions defined per benchmark, but these are inlined by Vericert.} function calls in the benchmarks.

We used the Icarus Verilog simulator to determine the cycle counts of the generated designs. We used Xilinx Vivado 2017.1, targeting a Xilinx 7-series FPGA (XC7K70T) to determine area usage and fmax.

%Figure~\ref{fig:results} summarises our results. The x-axis shows the impact of resource sharing on
%the speed of the hardware (as calculated by the cycle count divided by fmax); we see that all the
%data points lie very close to 1, which suggests no significant impact. On average the cycle count
%increases by 0.7\%; this modest increase is in line with expectations because our translation
%introduces an extra state per function call. The impact on fmax is similarly minimal, ranging
%between a 1.5\% increase and a 3.1\% decrease (0.2\% decrease on average).

Figure~\ref{fig:results} compares the hardware generated by \vericertfun{} with that generated by \vericert{}, using the open-source (but unverified) \bambu{} HLS tool~\cite{pilato13_bambu} as a baseline. The top graph compares the area usage. We observe a consistent and substantial reduction in area usage across the benchmark programs, with \vericert{} consistently using more area than Bambu (1.5$\times$ on average) and \vericertfun{} being much closer to Bambu (1.2$\times$ on average). As expected, the benchmarks with the most function calls, such as mvt, 2mm, 3mm and ludcmp enjoy the biggest area savings.  Benchmarks where only one function call is made, such as heat-3d and nussinov naturally see a slight increase in area usage because of the extra circuitry required.

The bottom graph compares the execution time. We observe that \vericertfun{} generates hardware that is almost exactly as fast as \vericert{}'s, with \vericertfun{} hardware being only 4\% slower on average. This can be attributed to the latency overhead of performing a function call. Hardware from \vericert{} and \vericertfun{} is significantly slower than \bambu's, which can be attributed to \vericert{} employing far fewer optimisations than the unverified \bambu{} tool.

\pgfplotstableread[col sep=comma]{data/time-ratio.csv}{\divtimingtable}
\pgfplotstableread[col sep=comma]{data/slice-ratio.csv}{\divslicetable}
\definecolor{vericertcol}{HTML}{1b9e77}
\definecolor{legupnooptcol}{HTML}{d95f02}
\definecolor{legupnooptnochaincol}{HTML}{8DA0CB}
\newcommand\backgroundbar[2][5]{\draw[draw=none, fill=black!#1] (axis cs:#2*2+0.5,0.1) rectangle
  (axis cs:1+#2*2+0.5,300);}
\newcommand\legup{Bambu}

\begin{figure}
  \begin{tikzpicture}[scale=0.8]
    \begin{groupplot}[
      group style={
        group name=my plots,
        group size=1 by 2,
        xlabels at=edge bottom,
        xticklabels at=edge bottom,
        vertical sep=5pt,
      },
      ymode=log,
      ybar=0.4pt,
      width=0.64\textwidth,
      height=0.25\textwidth,
      /pgf/bar width=2pt,
      legend pos=north east,
      log ticks with fixed point,
      xticklabels from table={\divtimingtable}{benchmark},
      legend style={nodes={scale=0.7, transform shape}},
      x tick label style={rotate=90,anchor=east,font=\footnotesize},
      legend columns=-1,
      xtick=data,
      enlarge x limits={abs=0.5},
      ylabel style={font=\footnotesize},
      xtick style={draw=none},
      ]
      
      \nextgroupplot[ymin=0.8,ymax=5,ylabel={Area relative to \legup{}}, ytick={0.25,0.5,1,2,4}]
      \pgfplotsinvokeforeach{0,...,12}{%
        \backgroundbar{#1}}
      \backgroundbar[10]{13}
      \addplot+[vericertcol] table [x expr=\coordindex,y=vericert,col sep=comma] from \divslicetable;
      \addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divslicetable;
      \draw (axis cs:-1,1) -- (axis cs:28,1);
      % JW: redraw axis border which has been partially covered by the grey bars
      \draw (axis cs:-0.5,0.8) rectangle (axis cs:27.5,5);
      \legend{\vericert{},\vericertfun{}};

      \nextgroupplot[ymin=1,ymax=10,ylabel={Execution time relative to \legup{}}, ytick={1,2,4}]
      \pgfplotsinvokeforeach{0,...,12}{%
        \backgroundbar{#1}}
      \backgroundbar[10]{13}
      \addplot+[vericertcol] table [x expr=\coordindex,y=vericert,col sep=comma] from \divtimingtable;
      \addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divtimingtable;
      \draw (axis cs:-1,1) -- (axis cs:28,1);
      % JW: redraw axis border which has been partially covered by the grey bars
      \draw (axis cs:-0.5,1) rectangle (axis cs:27.5,10);
    \end{groupplot}
  \end{tikzpicture}
  \caption{Performance of \vericert{} compared to \vericertfun{}, using \legup{} as a baseline.}
\label{fig:results}
\end{figure}

\section{Future work}

Our immediate priority is to complete \vericertfun's correctness proof. In the medium term, we would
like to improve our implementation of resource sharing by dropping the requirement to inline
functions that access pointers; we anticipate that this will lead to further reductions in area
usage. We would also like to make \vericertfun generate designs with one Verilog module per C
function, as this is more idiomatic than packing all the state machines into a single module; we did
not do this yet because it would require extending the formal Verilog semantics to handle multiple
modules. It would also be interesting to design heuristics for selectively inlining functions~\cite{huang+15}. In the longer
term, we are considering how to implement resource sharing even more effectively in a verified HLS
tool, perhaps by implementing it as part of a resource-constrained scheduling algorithm~\cite{sdc}.


\bibliographystyle{ACM-Reference-Format}
\bibliography{references}

\appendix

We provide the Verilog output that is generated from our running example (Figures~\ref{fig:example_C} to~\ref{fig:example_HTL}).

\begin{lstlisting}
module main (clk, rst, ret, fin)
  input clk, rst;
  output reg [31:0] ret;
  output reg fin;
  reg[31:0] state;
  reg[31:0] reg_1, reg_2, reg_3, reg_4, reg_5, reg_6;
  reg add_rst, add_fin; 
  reg[31:0] add_ret, add_x2, add_x1, reg_7, add_state;
  
  always @(posedge clk) // control logic for "add"
    if ({add_rst == 1}) begin 
      add_state <= 2; add_fin <= 0; 
    end else begin 
      case (add_state)
        |\HL5{2: add\_state <= 1;}|
        |\HL5{1: add\_state <= 3;}|
        3: ;
        default: ;
      endcase
    end   
    
  always @(posedge clk) // datapath for "add"    
    case (add_state)
      |\HL5{2: reg\_7 <= {{add\_x2 + add\_x1} + 0};}|
      |\HL5{1: add\_fin = 1;}|
         |\HL5{add\_ret = reg\_7;}|
      3: add_fin <= 0;
    endcase

  always @(posedge clk) // control logic for "main"
    if ({rst == 1}) begin 
      state <= 9; fin <= 0; 
    end else begin
      case (state)
        |\HL1{9: state <= 8;}|
        |\HL2{8: state <= 7;}|
        |\HL2{7: state <= 12;}|
       |\HL2{12: if ({add\_fin == 1}) state <= 6;}|
        |\HL2{6: state <= 5;}|
        |\HL3{5: state <= 4;}|
        |\HL3{4: state <= 10;}|
       |\HL3{10: if ({add\_fin == 1}) state <= 3;}|
        |\HL3{3: state <= 2;}|
        |\HL4{2: state <= 1;}|
        |\HL4{1: state <= 11;}|
       |\HL0{11: ;}|
      endcase
    end

  always @(posedge clk) // datapath for "main"
    case (state)
      |\HL1{9: reg\_3 <= 0;}|
      |\HL2{8: reg\_6 <= 1;}|
      |\HL2{7: add\_rst <= 1;}|
         |\HL2{add\_x2 <= reg\_3;}|
         |\HL2{add\_x1 <= reg\_6;}|
     |\HL2{12: add\_rst <= 0;}|
         |\HL2{reg\_1 <= add\_ret;}|
      |\HL2{6: reg\_3 <= reg\_1;}|
      |\HL3{5: reg\_5 <= 2;}|
      |\HL3{4: add\_rst <= 1;}|
         |\HL3{add\_x2 <= reg\_3;}|
         |\HL3{add\_x1 <= reg\_5;}|
     |\HL3{10: add\_rst <= 0;}|
         |\HL3{reg\_2 <= add\_ret;}|
      |\HL3{3: reg\_3 <= reg\_2;}|
      |\HL4{2: reg\_4 <= reg\_3;}|
      |\HL4{1: fin = 1;}|
         |\HL4{ret = reg\_4;}|
     |\HL0{11: fin <= 0;}|
    endcase
endmodule
\end{lstlisting}

\end{document}