author     John Wickerson <j.wickerson@imperial.ac.uk>  2022-01-17 16:18:34 +0000
committer  John Wickerson <j.wickerson@imperial.ac.uk>  2022-01-17 16:18:34 +0000
commit     e367ab3b3a81916eb8bcafb1b588ed0b6bf087fe
tree       fb32f655dfeebe8b306d1f77b4af3839b4ac94d1
parent     15daff9a0bce19984a943974fdb148a8cabca8cc
various
-rw-r--r--  references.bib                 | 36
-rw-r--r--  verified_resource_sharing.tex  | 29
2 files changed, 34 insertions(+), 31 deletions(-)
diff --git a/references.bib b/references.bib
index 4a2b791..b2a80ee 100644
--- a/references.bib
+++ b/references.bib
@@ -1,20 +1,23 @@
-@article{Herklotz2020,
- abstract = {High-level synthesis (HLS), which refers to the automatic compilation of software into hardware, is rapidly gaining popularity. In a world increasingly reliant on application-specific hardware accelerators, HLS promises hardware designs of comparable performance and energy efficiency to those coded by hand in a hardware description language such as Verilog, while maintaining the convenience and the rich ecosystem of software development. However, current HLS tools cannot always guarantee that the hardware designs they produce are equivalent to the software they were given, thus undermining any reasoning conducted at the software level. Furthermore, there is mounting evidence that existing HLS tools are quite unreliable, sometimes generating wrong hardware or crashing when given valid inputs. To address this problem, we present the first HLS tool that is mechanically verified to preserve the behaviour of its input software. Our tool, called Vericert, extends the CompCert verified C compiler with a new hardware-oriented intermediate language and a Verilog back end, and has been proven correct in Coq. Vericert supports most C constructs, including all integer operations, function calls, local arrays, structs, unions, and general control-flow statements. An evaluation on the PolyBench/C benchmark suite indicates that Vericert generates hardware that is around an order of magnitude slower (only around 2\texttimes{} slower in the absence of division) and about the same size as hardware generated by an existing, optimising (but unverified) HLS tool.},
- author = {Herklotz, Yann and Pollard, James D. and Ramanathan, Nadesh and Wickerson, John},
- location = {New York, NY, USA},
- publisher = {Association for Computing Machinery},
- url = {https://doi.org/10.1145/3485494},
- date = {2021-10},
- doi = {10.1145/3485494},
- journaltitle = {Proc. ACM Program. Lang.},
- keywords = {high-level synthesis,Coq,Verilog,CompCert,C},
- number = {OOPSLA},
- title = {Formal Verification of High-Level Synthesis},
- volume = {5}
+@article{vericert,
+ author = {Yann Herklotz and
+ James D. Pollard and
+ Nadesh Ramanathan and
+ John Wickerson},
+ title = {Formal verification of high-level synthesis},
+ journal = {Proc. {ACM} Program. Lang.},
+ volume = {5},
+ number = {{OOPSLA}},
+ pages = {1--30},
+ year = {2021},
+ url = {https://doi.org/10.1145/3485494},
+ doi = {10.1145/3485494},
+ timestamp = {Sat, 08 Jan 2022 02:21:39 +0100},
+ biburl = {https://dblp.org/rec/journals/pacmpl/HerklotzPRW21.bib},
+ bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{intel_hls,
- publisher = {Intel},
+ author = {{Intel}},
title = {Intel High Level Synthesis Compiler},
year = 2022,
url = {https://www.intel.com/content/www/us/en/software/programmable/quartus-prime/hls-compiler.html},
@@ -24,10 +27,11 @@
author = {{Withheld for blind review}},
title = {Public Github repository for Vericert-Fun},
year = 2022,
+ url = {https://github.com/[withheld]/Vericert-Fun},
}
@misc{xilinx_vitis,
- publisher = {Xilinx Inc.},
+ author = {{Xilinx Inc.}},
title = {Vitis HLS},
year = 2022,
url = {https://www.xilinx.com/products/design-tools/vivado/integration/esl-design.html},
@@ -206,7 +210,7 @@
pages = {653--669},
publisher = {{USENIX} Association},
year = {2016},
- url = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/gu},
+ doi = {10.5555/3026877.3026928},
timestamp = {Tue, 02 Feb 2021 08:06:02 +0100},
biburl = {https://dblp.org/rec/conf/osdi/GuSCWKSC16.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
diff --git a/verified_resource_sharing.tex b/verified_resource_sharing.tex
index 77626ff..1fb12b9 100644
--- a/verified_resource_sharing.tex
+++ b/verified_resource_sharing.tex
@@ -92,7 +92,7 @@ escapeinside=||,
In this paper, we present \vericertfun: \vericert{} extended with function-level resource sharing. Where original \vericert{} creates one block of hardware per function \emph{call}, \vericertfun{} creates one block of hardware per function \emph{definition}. The enabling innovation is to extend \vericert{}'s intermediate language HTL with the ability to represent \emph{multiple} state machines, and then to add support for function calls that transfer control between these state machines. We are working to extend \vericert{}'s formal correctness proof to cover the translation from C source into this extended HTL language, and thence on to Verilog.
-We have benchmarked \vericertfun's performance on the PolyBench/C suite, and our results show the generated hardware having a resource usage of 49\% of \vericert{}'s on average for only a 1\% average increase in the execution time.
+We have benchmarked \vericertfun's performance on the PolyBench/C suite, and our results show the generated hardware having a resource usage of 0.49$\times$ that of \vericert{} on average, with a minimal increase (about 1\%) in execution time.
\end{abstract}
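
To make the sharing concrete: given a hypothetical C input like the one below (the names f and main are placeholders, not taken from the benchmarks), original Vericert generates a separate block of hardware for each of the two calls to f, whereas Vericert-Fun generates a single state machine for f and transfers control to it from main's state machine at each call site.

    /* Hypothetical input, for illustration only. */
    int f(int x) {
        return x * x + 1;
    }

    int main() {
        int a = f(3);     /* original Vericert: first copy of f's hardware  */
        int b = f(a);     /* original Vericert: second copy; Vericert-Fun   */
        return a + b;     /* reuses the single state machine built for f    */
    }
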
\section{Introduction}
@@ -106,6 +106,8 @@ These tools are useful, but doubts have been raised about their reliability. For
Clearly, however, it is not enough for an HLS tool simply to be \emph{correct}. The generated hardware must also meet several other desiderata, including high throughput, low latency, and good \emph{area efficiency}, which is the topic of this paper. A common optimisation employed by HLS tools to improve area efficiency is \emph{resource sharing}; that is, re-using hardware for more than one purpose. Accordingly, our work adds resource sharing to \vericert{}. Keeping with the aims of the \vericert{} project, work is ongoing to extend the correctness proof.
+The entire \vericertfun{} development is fully open-source~\cite{vericertfun-github}. Further details about the implementation and proofs are available in a technical report~\cite{pardalos_thesis}.
+
\section{Background}
\paragraph{The Coq proof assistant} \vericert{} is implemented using the Coq proof assistant~\cite{coq}. This means that it consists of a collection of functions that define the compilation process, together with the proof of a theorem stating that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, and then automatically translates the function definitions into OCaml code that can be compiled and executed.
@@ -429,13 +431,14 @@ When \citeauthor{Herklotz2020} designed HTL, they did not include a semantics fo
At the point of writing, the correctness of \vericertfun{} from C to HTL has been mostly proven: basic instructions and function calls are proven correct, but proofs of load and store instructions are not finished as they lack some key invariants. The pass that renames
variables in HTL is yet to be proven, as is the pass that generates the final Verilog.
-To give a rough idea of the scale and complexity of the task: the implementation of \vericertfun{} involved adding or changing about 700 lines of Coq code in \vericert{} and took the first author 4 months. The correctness proof, so far, has required about 2300 lines of additions or changes to the Coq code and 8 months.
-
-The entire \vericertfun{} development is fully open-source~\cite{vericertfun-github}. Further details about the implementation and proofs are available in a technical report~\cite{pardalos_thesis}.
+To give a rough idea of the scale and complexity of the task: the implementation of \vericertfun{} involved adding or changing about 700 lines of Coq code in \vericert{} and took the first author 4 months. The correctness proof, so far, has required about 2300 lines of additions or changes to the Coq code and 8 person-months of work.
\section{Performance evaluation}
-We now compare the performance of the hardware generated by \vericertfun{} against that generated by \vericert{}. Following \citet{Herklotz2020}, we use the PolyBench/C benchmark suite~\cite{polybench}, with divisions replaced by an iterative algorithm expressed in C. We used the Icarus Verilog simulator to determine the cycle counts of the generated designs. We used Xilinx Vivado 2017.1, targeting a Xilinx 7-series FPGA (XC7K70T) to determine area usage and fmax.
+We now compare the quality of the hardware generated by \vericertfun{} against that generated by \vericert{}. Following \citet{Herklotz2020}, we use the PolyBench/C benchmark suite~\cite{polybench}, with division and modulo replaced by iterative software implementations because \vericert{} does not handle them efficiently. That is, \lstinline{a/b} and \lstinline{c%d}
+are textually replaced with \lstinline{div(a,b)} and \lstinline{mod(c,d)}. These \lstinline{div} and \lstinline{mod} functions are the only function calls in the benchmarks.
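
A minimal sketch of what such an iterative replacement could look like is given below; the exact div and mod routines used in the benchmarks are not reproduced here, and this version assumes non-negative operands and a non-zero divisor.

    /* Sketch only: division and modulo by repeated subtraction.
       Assumes a >= 0 and b > 0. */
    int div(int a, int b) {
        int q = 0;
        while (a >= b) {  /* subtract b until less than b remains */
            a -= b;
            q++;
        }
        return q;
    }

    int mod(int a, int b) {
        while (a >= b)    /* whatever is left over is the remainder */
            a -= b;
        return a;
    }

With this textual preprocessing, every benchmark that divides performs a genuine function call, which is what exercises Vericert-Fun's sharing machinery.
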
+
+We used the Icarus Verilog simulator to determine the cycle counts of the generated designs. We used Xilinx Vivado 2017.1, targeting a Xilinx 7-series FPGA (XC7K70T), to determine area usage and fmax.
%Figure~\ref{fig:results} summarises our results. The x-axis shows the impact of resource sharing on
%the speed of the hardware (as calculated by the cycle count divided by fmax); we see that all the
@@ -444,13 +447,9 @@ We now compare the performance of the hardware generated by \vericertfun{} again
%introduces an extra state per function call. The impact on fmax is similarly minimal, ranging
%between a 1.5\% increase and a 3.1\% decrease (0.2\% decrease on average).
-Figure~\ref{fig:results} compares the quality of the hardware generated by \vericertfun{} with that generated by \vericert{}, using the open-source (but unverified) \bambu{} HLS tool~\cite{pilato13_bambu} as a baseline. The top graph compares the area usage. We observe a consistent and substantial reduction in area usage across the benchmark programs, with \vericert{} consistently using more area than Bambu (1.4$\times$ on average) and \vericertfun{} almost always using less (0.7$\times$ on average) \JW{got up to here.}
-
+Figure~\ref{fig:results} compares the hardware generated by \vericertfun{} with that generated by \vericert{}, using the open-source (but unverified) \bambu{} HLS tool~\cite{pilato13_bambu} as a baseline. The top graph compares the area usage. We observe a consistent and substantial reduction in area usage across the benchmark programs, with \vericert{} consistently using more area than \bambu{} (1.4$\times$ on average) and \vericertfun{} almost always using less (0.7$\times$ on average). As expected, the benchmarks with the most function calls, such as \YH{name of benchmark?} and \YH{name of benchmark?}, enjoy the biggest area savings. \JW{Revisit this text with the final experimental numbers.}
-First, one can can note that
-designs generated by \vericertfun{} and \vericert{} execute for a similar amount of time, \vericertfun{} being only 1\% slower on average. This is explained by the introduction of latency when performing a function call, but it shows that in general this does not affect the performance too much. The
-area difference between \vericertfun{} and \vericert{} is quite large, which shows the effectiveness
-of the resource sharing that \vericertfun{} introduces.
+The bottom graph compares the execution time. We observe that \vericertfun{} generates hardware that is almost exactly as fast as \vericert{}'s, with \vericertfun{} hardware being only 1\% slower on average; this small overhead is attributable to the extra latency of performing a function call. Hardware from both \vericert{} and \vericertfun{} is significantly slower than \bambu{}'s, which can be attributed to \vericert{} employing far fewer optimisations than the unverified \bambu{} tool.
\pgfplotstableread[col sep=comma]{data/time-ratio.csv}{\divtimingtable}
\pgfplotstableread[col sep=comma]{data/slice-ratio.csv}{\divslicetable}
@@ -488,7 +487,7 @@ of the resource sharing that \vericertfun{} introduces.
xtick style={draw=none},
]
- \nextgroupplot[ymin=0.3,ymax=5,ylabel={Area relative to \legup{}}]
+ \nextgroupplot[ymin=0.2,ymax=5,ylabel={Area relative to \legup{}}, ytick={0.25,0.5,1,2,4}]
\pgfplotsinvokeforeach{0,...,12}{%
\backgroundbar{#1}}
\backgroundbar[10]{13}
@@ -496,9 +495,9 @@ of the resource sharing that \vericertfun{} introduces.
\addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divslicetable;
\draw (axis cs:-1,1) -- (axis cs:28,1);
% JW: redraw axis border which has been partially covered by the grey bars
- \draw (axis cs:-0.5,0.3) rectangle (axis cs:27.5,10);
+ \draw (axis cs:-0.5,0.2) rectangle (axis cs:27.5,5);
- \nextgroupplot[ymin=1,ymax=7,ylabel={Execution time relative to \legup{}}]
+ \nextgroupplot[ymin=1,ymax=7,ylabel={Execution time relative to \legup{}}, ytick={1,2,4}]
\pgfplotsinvokeforeach{0,...,12}{%
\backgroundbar{#1}}
\backgroundbar[10]{13}
@@ -506,7 +505,7 @@ of the resource sharing that \vericertfun{} introduces.
\addplot+[legupnooptcol] table [x expr=\coordindex,y=vericert-fun,col sep=comma] from \divtimingtable;
\draw (axis cs:-1,1) -- (axis cs:28,1);
% JW: redraw axis border which has been partially covered by the grey bars
- \draw (axis cs:-0.5,0.8) rectangle (axis cs:27.5,300);
+ \draw (axis cs:-0.5,1) rectangle (axis cs:27.5,7);