summary refs log tree commit diff stats
diff options
context:
space:
mode:
author John Wickerson <j.wickerson@imperial.ac.uk> 2020-09-22 10:36:40 +0000
committer overleaf <overleaf@localhost> 2020-10-24 15:09:08 +0000
commit 25fc95d19a586f774a99630ea34e58fb76e4e629 (patch)
tree 5809da7bc117d8bfd2049b3bca90aa610589c5d5
parent dfac4f477dfa32611be640c2fef65646e717a6f0 (diff)
download fccm21_esrhls-25fc95d19a586f774a99630ea34e58fb76e4e629.tar.gz
fccm21_esrhls-25fc95d19a586f774a99630ea34e58fb76e4e629.zip
Update on Overleaf.
-rw-r--r-- conference.bib 16
-rw-r--r-- eval.tex 18
-rw-r--r-- intro.tex 6
-rw-r--r-- main.tex 7
-rw-r--r-- method.tex 6
5 files changed, 30 insertions, 23 deletions
diff --git a/conference.bib b/conference.bib
index f4c31dc..05a4c24 100644
--- a/conference.bib
+++ b/conference.bib
@@ -57,6 +57,7 @@
numpages = 12,
publisher = {ACM},
series = {PLDI '11},
+ doi={10.1145/1993498.1993532},
}
@inproceedings{lidbury15_many_core_compil_fuzzin,
@@ -121,7 +122,8 @@ with {LegUp} High-Level Synthesis},
author={Regehr, John and Chen, Yang and Cuoq, Pascal and Eide, Eric and Ellison, Chucky and Yang, Xuejun},
booktitle={Proceedings of the 33rd ACM SIGPLAN conference on Programming Language Design and Implementation},
pages={335--346},
- year={2012}
+ year={2012},
+ doi={10.1145/2254064.2254104},
}
@inproceedings{fuzzing+chen+13+taming,
@@ -129,7 +131,8 @@ with {LegUp} High-Level Synthesis},
author={Chen, Yang and Groce, Alex and Zhang, Chaoqiang and Wong, Weng-Keen and Fern, Xiaoli and Eide, Eric and Regehr, John},
booktitle={Proceedings of the 34th ACM SIGPLAN conference on Programming language design and implementation},
pages={197--208},
- year={2013}
+ year={2013},
+ doi={10.1145/2491956.2462173},
}
@article{fuzzing+liang+18+survey,
@@ -140,7 +143,8 @@ with {LegUp} High-Level Synthesis},
number={3},
pages={1199--1218},
year={2018},
- publisher={IEEE}
+ publisher={IEEE},
+ doi={10.1109/TR.2018.2834476}
}
@inproceedings{fuzz+sun+16+toward,
@@ -148,7 +152,8 @@ with {LegUp} High-Level Synthesis},
author={Sun, Chengnian and Le, Vu and Zhang, Qirun and Su, Zhendong},
booktitle={Proceedings of the 25th International Symposium on Software Testing and Analysis},
pages={294--305},
- year={2016}
+ year={2016},
+ doi={10.1145/2931037.2931074},
}
@inproceedings{fuzzing+zhang+19,
@@ -156,7 +161,8 @@ with {LegUp} High-Level Synthesis},
author={Zhang, Chengyu and Su, Ting and Yan, Yichen and Zhang, Fuyuan and Pu, Geguang and Su, Zhendong},
booktitle={Proceedings of the 2019 27th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
pages={763--773},
- year={2019}
+ year={2019},
+ doi={10.1145/3338906.3338932}
}
@article{perna12_mechan_wire_wise_verif_handel_c_synth,
diff --git a/eval.tex b/eval.tex
index 8d42ce3..03d1357 100644
--- a/eval.tex
+++ b/eval.tex
@@ -13,7 +13,7 @@
\draw[white] (-4.4,4.4) ellipse (3.75 and 2.75); % making the
\draw[white] (-10.2,4.4) ellipse (3.75 and 2.75); % outlines
\draw[white] (-7.3,2) ellipse (3.75 and 2.75); % fully opaque
- \node[align=center] at (-10.2,6.3) {\Large\textsf{\textbf{Xilinx Vivado HLS}} \\ \Large\textsf{\textbf{2019.1}}};
+ \node[align=center] at (-10.2,6.3) {\Large\textsf{\textbf{Xilinx Vivado HLS}} \\ \Large\textsf{\textbf{v2019.1}}};
\node at (-4.4,6.3) {\Large\textsf{\textbf{Intel i++ 18.1}}};
\node at (-7.3,0) {\Large\textsf{\textbf{LegUp 4.0}}};
@@ -40,7 +40,7 @@
Intel i++ & $\ge 1$\\
\bottomrule
\end{tabular}
- \caption{Unique bugs found in each tool. %\JW{is `all versions' correct here? and should we add version numbers like in the Venn?}\YH{Yes it is actually correct here, I don't mind adding the specific version either though}\JW{Ok let's leave it as-is.}
+ \caption{Unique bugs found in each tool. The ``$\ge$'' sign signifies a lower bound on the number of unique bugs found, and the counts correspond to unique test cases after reduction. %\JW{is `all versions' correct here? and should we add version numbers like in the Venn?}\YH{Yes it is actually correct here, I don't mind adding the specific version either though}\JW{Ok let's leave it as-is.}
}
\label{tab:unique_bugs}
\end{table}
@@ -67,10 +67,10 @@ We write `$\ge$' in the table to indicate that all the bug counts are lower boun
\subsection{Results across versions of an HLS tool}
-Besides comparing the reliability of different HLS tools, we also investigated the reliability of Vivado HLS over time. Figure~\ref{fig:sankey_diagram} shows the results of giving 3645 test-cases to Vivado HLS 2018.3, 2019.1 and 2019.2.
+Besides comparing the reliability of different HLS tools, we also investigated the reliability of Vivado HLS over time. Figure~\ref{fig:sankey_diagram} shows the results of giving 3645 test-cases to Vivado HLS v2018.3, v2019.1 and v2019.2.
Test-cases that pass and fail in the same tools are grouped together into a ribbon.
-For instance, the topmost ribbon represents the 31 test-cases that fail in all three versions of Vivado HLS. Other ribbons can be seen weaving in and out; these indicate that bugs were fixed or reintroduced in the various versions. The diagram demonstrates that Vivado HLS 2018.3 contains the most failing test-cases compared to the other versions, having 62 test-cases fail in total. %Interestingly, Vivado HLS 2019.1 and 2019.2 have a different number of failing test cases, meaning feature improvements that introduced bugs as well as bug fixes between those minor versions.
-Interestingly, as an indicator of reliability of HLS tools, the blue ribbon shows that there are test-cases that fail in v2018.3, pass in v2019.1 but then fail again in 2019.2.
+For instance, the topmost ribbon represents the 31 test-cases that fail in all three versions of Vivado HLS. Other ribbons can be seen weaving in and out; these indicate that bugs were fixed or reintroduced in the various versions. The diagram demonstrates that Vivado HLS v2018.3 contains the most failing test-cases compared to the other versions, having 62 test-cases fail in total. %Interestingly, Vivado HLS 2019.1 and 2019.2 have a different number of failing test cases, meaning feature improvements that introduced bugs as well as bug fixes between those minor versions.
+Interestingly, as an indicator of reliability of HLS tools, the blue ribbon shows that there are test-cases that fail in v2018.3, pass in v2019.1 but then fail again in v2019.2.
\definecolor{ribbon1}{HTML}{8dd3c7}
\definecolor{ribbon2}{HTML}{b3de69}
@@ -98,9 +98,9 @@ Interestingly, as an indicator of reliability of HLS tools, the blue ribbon show
\draw[white, fill=black] (1.8,4.1) rectangle (2.2,2.3);
\draw[white, fill=black] (3.8,4.1) rectangle (4.2,2.05);
- \node at (-0.2,4.5) {2018.3};
- \node at (2,4.5) {2019.1};
- \node at (4,4.5) {2019.2};
+ \node at (-0.2,4.5) {v2018.3};
+ \node at (2,4.5) {v2019.1};
+ \node at (4,4.5) {v2019.2};
%\node at (2,5) {Vivado HLS};
\node at (5.5,3.325) {31};
@@ -122,7 +122,7 @@ In addition to that, it can then be seen that Vivado HLS v2018.3 must have at le
\subsection{Some specific bugs found}
-This section describes some of the bugs that were found in the various tools that were tested. We describe two bugs in LegUp and one in Vivado HLS; in each case, the bug was first reduced automatically using \creduce{}, and then reduced further manually to achieve the minimal test-case. Although we did find test-case failures in Intel i++, the very long compilation times for that tool meant that we did not have time to reduce any of the failures down to an example that is minimal enough to present here.
+This section describes some of the bugs that were found in the various tools that were tested. We describe two bugs in LegUp and one in Vivado HLS; in each case, the bug was first reduced automatically using \creduce{}, and then reduced further manually to achieve the minimal test-case. Although we did find test-case failures in Intel i++, the long compilation times for that tool meant that we did not have time to reduce any of the failures down to an example that is minimal enough to present here.
\subsubsection{LegUp assertion error}
diff --git a/intro.tex b/intro.tex
index 990a241..896b185 100644
--- a/intro.tex
+++ b/intro.tex
@@ -15,7 +15,7 @@ int main() {
return b;
}
\end{minted}
- \caption{Miscompilation bug found in Xilinx Vivado HLS 2018.3 and 2019.2. The program returns \code{0x006535FF} but the correct result is \code{0x046535FF}.}
+ \caption{Miscompilation bug found in Xilinx Vivado HLS v2018.3, v2019.1 and v2019.2. The program returns \code{0x006535FF} but the correct result is \code{0x046535FF}.}
\label{fig:vivado_bug1}
\end{figure}
@@ -60,13 +60,13 @@ For this reason, we find it natural to adopt fuzzing for our HLS testing campaig
\paragraph{Our contribution}
This paper reports on our campaign to test HLS tools by fuzzing.
\begin{itemize}
- \item We use Csmith~\cite{yang11_findin_under_bugs_c_compil} to generate ten thousand valid C programs from within the subset of the C language that is supported by all the HLS tools we test. We augment each program with a random selection of HLS-specific directives.
+ \item We use Csmith~\cite{yang11_findin_under_bugs_c_compil} to generate thousands of valid C programs from within the subset of the C language that is supported by all the HLS tools we test. We also augment each program with a random selection of HLS-specific directives.
\item We give these programs to three widely used HLS tools: Xilinx Vivado HLS~\cite{xilinx20_vivad_high_synth}, LegUp HLS~\cite{canis13_legup} and the Intel HLS Compiler, which is also known as i++~\cite{intel20_sdk_openc_applic}. When we find a program that causes an HLS tool to crash, or to generate hardware that produces a different result from GCC, we reduce it to a minimal example with the help of the \creduce{} tool~\cite{creduce}.
\item Our testing campaign revealed that all three tools could be made to crash while compiling or to generate wrong RTL. In total, 6700 test cases were run through each tool out of which 272 test cases failed in at least one of the tools. Test case reduction was then performed on some of these failing test cases to obtain at least 6 unique failing test cases.
- \item To investigate whether HLS tools are getting more or less reliable over time, we also tested three different versions of Vivado HLS (2018.3, 2019.1, and 2019.2). We found that in general there about half as many failures in versions 2019.1 and 2019.2 compared to 2018.3. However, there were also test-cases that only failed in versions 2019.1 and 2019.2, meaning bugs were probably introduced due to the addition of new features.
+ \item To investigate whether HLS tools are getting more or less reliable over time, we also tested three different versions of Vivado HLS (v2018.3, v2019.1, and v2019.2). We found that in general there are about half as many failures in versions v2019.1 and v2019.2 compared to v2018.3. However, there were also test-cases that only failed in versions v2019.1 and v2019.2, meaning bugs were probably introduced due to the addition of new features.
\end{itemize}
% we test, and then augment each program with randomly chosen HLS-specific directives. We synthesise each C program to RTL, and use a Verilog simulator to calculate its return value. If synthesis crashes, or if this return value differs from the return value obtained by executing a binary compiled from the C program by GCC, then we have found a candidate bug. We then use trial-and-error to reduce the C program to a minimal version that still triggers a bug.
diff --git a/main.tex b/main.tex
index d916f6a..afb82b1 100644
--- a/main.tex
+++ b/main.tex
@@ -1,4 +1,4 @@
-\documentclass[hyphens,prologue,x11names,rgb,sigconf,anonymous]{acmart}
+\documentclass[hyphens,prologue,x11names,rgb,sigconf,anonymous,review]{acmart}
\usepackage[english]{babel}
\usepackage{graphicx}
@@ -49,7 +49,6 @@
%%
%% end of the preamble, start of the body of the document source.
-\hypersetup{draft}
\begin{document}
%%
@@ -119,6 +118,8 @@ High-level synthesis (HLS) is becoming an increasingly important part of the com
As such, HLS tools are increasingly relied upon. In this paper, we investigate whether they are trustworthy.
We have subjected three widely used HLS tools -- LegUp, Xilinx Vivado HLS, and the Intel HLS Compiler -- to a rigorous fuzzing campaign using thousands of random, valid C programs that we generated using a modified version of the Csmith tool. For each C program, we compiled it to a hardware design using the HLS tool under test and checked whether that hardware design generates the same output as an executable generated by the GCC compiler. When discrepancies arose between GCC and the HLS tool under test, we reduced the C program to a minimal example in order to zero in on the potential bug. Our testing campaign has revealed that all three HLS tools can be made either to crash or to generate wrong code when given valid C programs, and thereby underlines the need for these increasingly trusted tools to be more rigorously engineered.
+Out of 6700 test cases, we found 272 programs that failed in at least one tool, out of which we were able to identify at least 6 unique bugs.
+
\end{abstract}
%%
@@ -176,6 +177,8 @@ We have subjected three widely used HLS tools -- LegUp, Xilinx Vivado HLS, and t
%For final version of paper.
%\end{acks}
+\bigskip
+
\bibliographystyle{ACM-Reference-Format}
\bibliography{conference.bib}
diff --git a/method.tex b/method.tex
index 052f8b3..6651d33 100644
--- a/method.tex
+++ b/method.tex
@@ -58,7 +58,6 @@ This is vital for our work since we want to generate programs that are HLS-frien
\code{statement\_break/goto/continue\_prob} & Reduced \\
\code{float\_as\_ltype\_prob} & Disabled \\
\code{pointer\_as\_ltype\_prob} & Disabled \\
- \code{void\_prob} & Disabled \\
\code{union\_as\_ltype\_prob} & Disabled \\
\code{more\_struct\_union\_type\_prob} & Disabled \\
\code{safe\_ops\_signed\_prob} & Disabled \\
@@ -95,7 +94,6 @@ We also disallow assignments being embedded within expressions, since HLS genera
We eliminate any floating-point numbers since they typically involve external libraries or use of hard IPs on FPGAs, which in turn make it hard to reduce bugs to their minimal form.
We also disable the generation of pointers for HLS testing, since pointer support in HLS tools is either absent or immature~\cite{xilinx20_vivad_high_synth}.
%\YH{I've looked at the documentation and even pointer to pointer is supported, but maybe not pointer to pointer to pointer. I think there was some other pointer assignment that didn't quite work, but I don't remember now. Immature might be a good description though.}
-We also disable void functions, since we are not supporting pointers.
%\YH{Figure \ref{fig:eval:vivado:mismatch} actually has void functions...} \JW{Hm, true. Perhaps these were introduced during reduction.}
We disable the generation of unions as these were not supported by some of the tools such as LegUp 4.0.
@@ -173,7 +171,7 @@ Having generated HLS-friendly programs and automatically augmented them with dir
%Figure~\ref{fig:method:toolflow} shows the three stages of testing, depicted as the testing environment in the dashed area.
For each HLS tool in turn, we compile the C program to RTL and then simulate the RTL.
Independently, we also compile the C program using GCC and execute it.
-To ensure that our testing is scalable for a large number of large, random programs, we also enforce several time-outs: we set a 5-minute time-out for C execution and a 2-hour time-out for RTL simulation.
+To ensure that our testing is scalable for a large number of large, random programs, we also enforce several time-outs: we set a 5-minute time-out for C execution and a 2-hour time-out for C-to-RTL synthesis and RTL simulation.
We do not count time-outs as bugs, but we record them.
%% JW: This paragraph is not really needed because we repeat the sentiment in the next subsection anyway.
@@ -199,7 +197,7 @@ As the programs generated by Csmith can be fairly large, we must systematically
Reduction is performed by iteratively removing some part of the original program and then providing the reduced program to the HLS tool for re-synthesis and co-simulation.
The goal is to find the smallest program that still triggers the bug.
We apply two consecutive methods of reduction in this work.
-The first step is to reduce the labels and pragmas that were added afterwards to make sure that these do not affect the behaviour of the program. These are reduced iteratively until there are no more declarations left or the bug does not get triggered anymore.
+The first step is to reduce the labels and pragmas that were added afterwards to make sure that these do not affect the behaviour of the program. These are reduced until there are no more declarations left or the bug does not get triggered anymore.
% \NR{We can add one or two more sentences summarising how we reduce the programs. Zewei is probably the best person to add these sentences.}\YH{Added some more lines, we can ask Zewei if she is OK with that.}
%Although, our custom reduction gives us the freedom and control of how to reduce buggy programs, it is arduous and requires a lot of manual effort.
We then use the \creduce{} tool~\cite{creduce} to automatically reduce the remaining C program.