summaryrefslogtreecommitdiffstats
path: root/eval.tex
diff options
context:
space:
mode:
authorYann Herklotz <ymh15@ic.ac.uk>2020-12-17 14:04:42 +0000
committeroverleaf <overleaf@localhost>2020-12-31 14:48:38 +0000
commita5249eb597549437802d2ed852919e5b9a923840 (patch)
tree29a32aa1fba1dc0211be88497884d0c7a2db1690 /eval.tex
parentea9289245fbc493530e9435faf498cc4a824c70f (diff)
downloadfccm21_esrhls-a5249eb597549437802d2ed852919e5b9a923840.tar.gz
fccm21_esrhls-a5249eb597549437802d2ed852919e5b9a923840.zip
Update on Overleaf.
Diffstat (limited to 'eval.tex')
-rw-r--r--eval.tex207
1 files changed, 85 insertions, 122 deletions
diff --git a/eval.tex b/eval.tex
index 49e39c5..9046a15 100644
--- a/eval.tex
+++ b/eval.tex
@@ -1,76 +1,69 @@
\section{Evaluation}\label{sec:evaluation}
+We generate \totaltestcases{} test-cases and provide them to three HLS tools: Vivado HLS, LegUp HLS and Intel i++.
+We use the same test-cases across all tools for fair comparison (except the HLS directives, which have tool-specific syntax).
+We were able to test three different versions of Vivado HLS (v2018.3, v2019.1 and v2019.2).
+We tested one version of Intel i++ (version 18.1), and one version of LegUp (4.0).
+LegUp 7.5 is GUI-based and therefore we could not script our tests.
+However, we were able to manually reproduce all the bugs found in LegUp 4.0 in LegUp 7.5.
+
+% Three different tools were tested, including three different versions of Vivado HLS. We were only able to test one version of LegUp HLS (version 4.0), because although LegUp 7.5 is available, it is GUI-based and not amenable to scripting. However, bugs we found in LegUp 4.0 were reproduced manually in LegUp 7.5.
+% LegUp and Vivado HLS were run under Linux, while the Intel HLS Compiler was run under Windows.
+
+\subsection{Results across different HLS tools}
+
\definecolor{vivado}{HTML}{7fc97f}
\definecolor{intel}{HTML}{beaed4}
\definecolor{legup}{HTML}{fdc086}
\begin{figure}
\centering
\begin{tikzpicture}[scale=0.61]
- \draw (-14.5,7.65) rectangle (0,-1);
- \fill[vivado,fill opacity=0.5] (-4.4,4.4) ellipse (3.75 and 2.75);
- \fill[intel,fill opacity=0.5] (-10.2,4.4) ellipse (3.75 and 2.75);
- \fill[legup,fill opacity=0.5] (-7.3,2) ellipse (3.75 and 2.75);
- \draw[white, thick] (-4.4,4.4) ellipse (3.75 and 2.75);
- \draw[white, thick] (-10.2,4.4) ellipse (3.75 and 2.75);
- \draw[white, thick] (-7.3,2) ellipse (3.75 and 2.75);
- \node[align=center, anchor=north] at (-10.2,6.5) {\textsf{\textbf{Xilinx Vivado HLS}} \\ \textsf{\textbf{v2019.1}}};
- \node[anchor=north] at (-4.4,6.5) {\textsf{\textbf{Intel i++ 18.1}}};
- \node at (-7.3,0) {\textsf{\textbf{LegUp 4.0}}};
-
- \node at (-5.5,3) {1 (\textcolor{red}{1})};
- \node at (-9.1,3) {4 (\textcolor{red}{0})};
- \node at (-3,4.1) {26 (\textcolor{red}{540})};
- \node at (-11.6,4.1) {79 (\textcolor{red}{20})};
- \node at (-7.3,1) {162 (\textcolor{red}{6})};
- \node at (-7.3,5.1) {0 (\textcolor{red}{5})};
- \node at (-7.3,3.9) {0 (\textcolor{red}{0})};
- \node at (-13.6,-0.5) {5856};
+ \draw (-7.2,7.0) rectangle (7.2,0.7);
+ \fill[intel,fill opacity=0.5] (2.5,4.4) ellipse (3.75 and 1.5);
+ \fill[vivado,fill opacity=0.5] (-2.5,4.4) ellipse (3.75 and 1.5);
+ \fill[legup,fill opacity=0.5] (0,3) ellipse (3.75 and 1.5);
+ \draw[white, thick] (2.5,4.4) ellipse (3.75 and 1.5);
+ \draw[white, thick] (-2.5,4.4) ellipse (3.75 and 1.5);
+ \draw[white, thick] (0,3) ellipse (3.75 and 1.5);
+ \node[align=center, anchor=south west] at (-6.4,6) {\textcolor{vivado}{\bf Xilinx Vivado HLS v2019.1}};
+ \node[anchor=south east] at (6.4,6) {\textcolor{intel}{\bf Intel i++ 18.1}};
+ \node at (4,1.6) {\textcolor{legup}{\bf LegUp 4.0}};
+
+ \node at (1.8,3.5) {1 (\textcolor{red}{1})};
+ \node at (-1.8,3.5) {4 (\textcolor{red}{0})};
+ \node at (4.0,4.5) {26 (\textcolor{red}{540})};
+ \node at (-4.0,4.5) {79 (\textcolor{red}{20})};
+ \node at (0,2.1) {162 (\textcolor{red}{6})};
+ \node at (0,4.9) {0 (\textcolor{red}{5})};
+ \node at (0,3.9) {0 (\textcolor{red}{0})};
+ \node at (-6,1.4) {5856};
\end{tikzpicture}
-\caption{A Venn diagram showing the number of failures in each tool out of 6700 test-cases that were run. Overlapping regions mean that the test-cases failed in multiple tools. The numbers in parentheses represent the number of test-cases that timed out.}\label{fig:existing_tools}
+\caption{The number of failures per tool out of \totaltestcases{} test-cases. Overlapping regions mean that the same test-cases failed in multiple tools. The numbers in parentheses report how many test-cases timed out.}\label{fig:existing_tools}
\end{figure}
-\begin{table}
- \centering
- \begin{tabular}{lr}\toprule
+Figure~\ref{fig:existing_tools} shows a Venn diagram of our results.
+We see that 167 (2.5\%), 83 (1.2\%) and 26 (0.4\%) test-cases fail in LegUp, Vivado HLS and Intel i++ respectively.
+Despite i++ having the lowest failure rate, it has the highest time-out rate (540 test-cases), because of its remarkably long compilation time.
+% We remark that although the Intel HLS Compiler had the smallest number of confirmed test-case failures, it had the most time-outs (which could be masking additional failures)
+Note that the absolute numbers here do not necessarily correspond to the number of bugs in the tools, because a single bug in a language feature that appears frequently in our test suite could cause many programs to crash or fail.
+Hence, we reduce many of the failing test-cases in an effort to identify unique bugs; these are summarised in the table below.
+
+\begin{table}[h]
+\centering
+\begin{tabular}{lr}\toprule
\textbf{Tool} & \textbf{Unique Bugs}\\
\midrule
- Xilinx Vivado HLS (all versions) & $\ge 2$\\
+ Xilinx Vivado HLS v2019.1 & $\ge 2$\\
LegUp HLS & $\ge 3$\\
Intel i++ & $\ge 1$\\
\bottomrule
\end{tabular}
- \caption{Unique bugs found in each tool. The ``$\ge$'' sign signifies a lower bound on the number of unique bugs found after test-case reduction. %\JW{is `all versions' correct here? and should we add version numbers like in the Venn?}\YH{Yes it is actually correct here, I don't mind adding the specific version either though}\JW{Ok let's leave it as-is.}
- }
- \label{tab:unique_bugs}
\end{table}
-We generate 6700 test-cases and provide them to three HLS tools: Vivado HLS, LegUp HLS and Intel i++.
-We use the same test-cases across all tools for fair comparison.
-We were able to test three different versions of Vivado HLS (v2018.3, v2019.1 and v2019.2).
-We tested one version of Intel i++ (version 18.1), and one version of LegUp (4.0).
-LegUp 7.5 is GUI-based and therefore we could not script our tests.
-However, we were able to manually reproduce bugs found in LegUp 4.0 in LegUp 7.5.
-
-% Three different tools were tested, including three different versions of Vivado HLS. We were only able to test one version of LegUp HLS (version 4.0), because although LegUp 7.5 is available, it is GUI-based and not amenable to scripting. However, bugs we found in LegUp 4.0 were reproduced manually in LegUp 7.5.
-% LegUp and Vivado HLS were run under Linux, while the Intel HLS Compiler was run under Windows.
-
-\subsection{Results across different HLS tools}
-
-Figure~\ref{fig:existing_tools} shows a Venn diagram of our results.
-We see that 167 (2.5\%), 83 (1.2\%) and 26 (0.4\%) test-cases fail in LegUp, Vivado HLS and Intel i++ respectively.
-Despite i++ having the lowest failure rate, it has the highest time-out rate (540 test-cases), because of its remarkably long compilation time.
-% We remark that although the Intel HLS Compiler had the smallest number of confirmed test-case failures, it had the most time-outs (which could be masking additional failures)
-Note that the absolute numbers here do not necessary correspond to the number of bugs in the tools, because a single bug in a language feature that appears frequently in our test suite could cause many programs to crash or fail.
-Hence, we reduce many of the failing test-cases to identify unique bugs, as summarised in Table~\ref{tab:unique_bugs}.
-We write `$\ge$' in the table to indicate that all the bug counts are lower bounds -- we did not have time to go through the test-case reduction process for every failure.
+We write `$\ge$' above to emphasise that all the bug counts are lower bounds -- we did not have time to go through the rather arduous test-case reduction process for every failure.
\subsection{Results across versions of an HLS tool}
-Besides comparing the reliability of different HLS tools, we also investigated the reliability of Vivado HLS over time. Figure~\ref{fig:sankey_diagram} shows the results of giving 3645 test-cases to Vivado HLS v2018.3, v2019.1 and v2019.2.
-Test-cases that pass and fail in the same tools are grouped together into a ribbon.
-For instance, the topmost ribbon represents the 31 test-cases that fail in all three versions of Vivado HLS. Other ribbons can be seen weaving in and out; these indicate that bugs were fixed or reintroduced in the various versions. The diagram demonstrates that Vivado HLS v2018.3 contains the most failing test-cases compared to the other versions, having 62 test-cases fail in total. %Interestingly, Vivado HLS 2019.1 and 2019.2 have a different number of failing test cases, meaning feature improvements that introduced bugs as well as bug fixes between those minor versions.
-Interestingly, as an indicator of reliability of HLS tools, the blue ribbon shows that there are test-cases that fail in v2018.3, pass in v2019.1 but then fail again in v2019.2.
-
\definecolor{ribbon1}{HTML}{8dd3c7}
\definecolor{ribbon2}{HTML}{b3de69}
\definecolor{ribbon3}{HTML}{bebada}
@@ -110,37 +103,35 @@ Interestingly, as an indicator of reliability of HLS tools, the blue ribbon show
\node[white] at (2,2.5) {36};
\node[white] at (4,2.25) {41};
\end{tikzpicture}
- \caption{A Sankey diagram that tracks 3645 test-cases through three different versions of Vivado HLS. The ribbons collect the test-cases that pass and fail together. The black bars are labelled with the total number of test-case failures per version. The 3573 test-cases that pass in all three versions are not depicted.
+ \caption{A Sankey diagram that tracks \vivadotestcases{} test-cases through three different versions of Vivado HLS. The ribbons collect the test-cases that pass and fail together. The black bars are labelled with the total number of test-case failures per version. The 3573 test-cases that pass in all three versions are not depicted.
}\label{fig:sankey_diagram}
\end{figure}
-% \NR{Why are there missing numbers in the ribbons?}
-As in our Venn diagram, the absolute numbers in Figure~\ref{fig:sankey_diagram} do not necessary correspond to the number of bugs. However, we can deduce from this diagram that there must be at least six unique bugs in Vivado HLS, given that a ribbon must contain at least one unique bug.
-%\YH{Contradicts value of 3 in Table~\ref{tab:unique_bugs}, maybe I can change that to 6?} \JW{I'd leave it as-is personally; we have already put a `$\ge$' symbol in the table, so I think it's fine.}
-In addition to that, it can then be seen that Vivado HLS v2018.3 must have at least 4 individual bugs, of which two were fixed and two others stayed in Vivado HLS v2019.1. However, with the release of v2019.1, new bugs were introduced as well. % Finally, for version 2019.2 of Vivado HLS, there seems to be a bug that was reintroduced which was also present in Vivado 2018.3, in addition to a new bug. In general it seems like each release of Vivado HLS will have new bugs present, however, will also contain many previous bug fixes. However, it cannot be guaranteed that a bug that was previously fixed will remain fixed in future versions as well.
+Besides comparing the reliability of different HLS tools, we also investigated the reliability of Vivado HLS over time. Figure~\ref{fig:sankey_diagram} shows the results of giving \vivadotestcases{} test-cases to Vivado HLS v2018.3, v2019.1 and v2019.2.
+Test-cases that pass and fail in the same tools are grouped together into a ribbon.
+For instance, the topmost ribbon represents the 31 test-cases that fail in all three versions of Vivado HLS. Other ribbons can be seen weaving in and out; these indicate that bugs were fixed or reintroduced in the various versions. We see that Vivado HLS v2018.3 had the most test-case failures (62).
+Interestingly, as an indicator of reliability of HLS tools, the blue ribbon shows that there are test-cases that fail in v2018.3, pass in v2019.1 but then fail again in v2019.2.
+As in our Venn diagram, the absolute numbers here do not necessarily correspond to the number of actual bugs, but we can deduce that there must be at least six unique bugs in Vivado HLS, given that each ribbon corresponds to at least one unique bug.
-\subsection{Some specific bugs found}
-This section describes some of the bugs that were found in the various tools that were tested. We describe two bugs in LegUp and one in Vivado HLS; in each case, the bug was first reduced automatically using \creduce{}, and then reduced further manually to achieve the minimal test-case. Although we did find test-case failures in Intel i++, the long compilation times for that tool meant that we did not have time to reduce any of the failures down to an example that is minimal enough to present here.
-\subsubsection{LegUp assertion error}
-The code shown in Figure~\ref{fig:eval:legup:assert} leads to an assertion error in LegUp 4.0 and 7.5 even though it should compile without any errors.
-An assertion error counts as a crash of the tool, as it means that an unexpected state was reached by this input.
-This shows that there is a bug in one of the compilation passes in LegUp, however, due to the assertion the bug is caught in the tool before it produces an incorrect design.
+%\YH{Contradicts value of 3 in Table~\ref{tab:unique_bugs}, maybe I can change that to 6?} \JW{I'd leave it as-is personally; we have already put a `$\ge$' symbol in the table, so I think it's fine.}
+%In addition to that, it can then be seen that Vivado HLS v2018.3 must have at least 4 individual bugs, of which two were fixed and two others stayed in Vivado HLS v2019.1. However, with the release of v2019.1, new bugs were introduced as well. % Finally, for version 2019.2 of Vivado HLS, there seems to be a bug that was reintroduced which was also present in Vivado 2018.3, in addition to a new bug. In general it seems like each release of Vivado HLS will have new bugs present, however, will also contain many previous bug fixes. However, it cannot be guaranteed that a bug that was previously fixed will remain fixed in future versions as well.
-\begin{figure}
+\subsection{Some specific bugs found}
+
+We now describe two more of the bugs we found: one crash bug in LegUp and one miscompilation bug in Vivado HLS. As in Example~\ref{ex:vivado_miscomp}, each bug was first reduced automatically using \creduce{}, and then reduced further manually to achieve the minimal test-case. Although we did find test-case failures in Intel i++, the long compilation times for that tool meant that we did not have time to reduce any of the failures down to an example that is minimal enough to present here.
+
+\begin{example}[A crash bug in LegUp]
+The program shown below leads to an internal compiler error (an unhandled assertion in this case) in LegUp 4.0 and 7.5.
\begin{minted}{c}
int a[2][2][1] = {{{0},{1}},{{0},{0}}};
-
-int main() {
- a[0][1][0] = 1;
-}
+int main() { a[0][1][0] = 1; }
\end{minted}
-\caption{This program causes an assertion failure in LegUp HLS when \texttt{NO\_INLINE} is set.}\label{fig:eval:legup:assert}
-\end{figure}
-
-The buggy test-case has to do with initialisation and assignment to a three-dimensional array, for which the above piece of code is the minimal example. However, in addition to that it requires the \texttt{NO\_INLINE} flag to be set, which disables function inlining. The code initialises the array with zeroes except for \texttt{a[0][1][0]}, which is set to one. Then the main function assigns one to that same location. This code on its own should not actually produce a result and should just terminate by returning 0, which is also what the design that LegUp generates does when the \texttt{NO\_INLINE} flag is turned off.
+%An assertion error counts as a crash of the tool, as it means that an unexpected state was reached by this input.
+%This shows that there is a bug in one of the compilation passes in LegUp, however, due to the assertion the bug is caught in the tool before it produces an incorrect design.
+It initialises a 3D array with zeroes, and then assigns to one element. The bug only appears when function inlining is disabled (\texttt{NO\_INLINE}). % The code initialises the array with zeroes except for \texttt{a[0][1][0]}, which is set to one. Then the main function assigns one to that same location. This code on its own should not actually produce a result and should just terminate by returning 0, which is also what the design that LegUp generates does when the \texttt{NO\_INLINE} flag is turned off.
%The following code also produces an assertion error in LegUp, which is a different one this time. This bug was not discovered during the main test runs of 10 thousand test cases, but beforehand, which meant that we disabled unions from being generated. However, this bug also requires the \texttt{volatile} keyword which seems to be the reason for quite a few mismatches in LegUp and Vivado.
%
@@ -152,68 +143,40 @@ The buggy test-case has to do with initialisation and assignment to a three-dime
%int main() { return un.a; }
%\end{minted}
-\subsubsection{LegUp miscompilation}
-
-The test-case in Figure~\ref{fig:eval:legup:wrong} produces an incorrect Verilog in LegUp 4.0 and 7.5, which means that the results of RTL simulation is different to the C execution.
+\end{example}
\begin{figure}
\begin{minted}{c}
-volatile int a = 0;
-int b = 1;
-
-int main() {
- int d = 1;
- if (d + a)
- b || 1;
- else
- b = 0;
- return b;
-}
-\end{minted}
-\caption{An output mismatch: LegUp HLS returns 0 but the correct result is 1.}\label{fig:eval:legup:wrong}
-\end{figure}
-
-In the code above, \texttt{b} has value 1 when run in GCC, but has value 0 when run with LegUp. If the \texttt{volatile} keyword is removed from \texttt{a}, then the Verilog produces the correct result. As \texttt{a} and \texttt{d} are constants, the \code{if} statement should always produce go into the \texttt{true} branch, meaning \texttt{b} should never be set to 0. The \texttt{true} branch of the \code{if} statement only executes an expression which is not assigned to any variable, meaning the initial state of all variables should not change. However, LegUp HLS generates a design which enters the \texttt{else} branch instead and assigns \texttt{b} to be 0. The cause of this bug seems to be the use of \texttt{volatile} keyword, which interferes with the analysis that attempts to simplify the \code{if} statement.
-
-\subsubsection{Vivado HLS miscompilation}
-
-Figure~\ref{fig:eval:vivado:mismatch} shows code that does not output the right result when compiled with all Vivado HLS versions.
-It returns \texttt{0x0} with Vivado HLS, instead of \texttt{0xF}. This test-case is much larger compared to the other test-cases that were reduced.
-We could not reduce this program any further, as everything in the code was necessary to trigger the bug.
-
-The array \texttt{a} is initialised to all zeroes, as well as the other global variables \texttt{g} and \texttt{c}, so as to not introduce any undefined behaviour. However, \texttt{g} is also given the \texttt{volatile} keyword, which ensures that the variable is not optimised away. The function \texttt{d} then accumulates the values \texttt{b} that it is passed into a hash stored in \texttt{c}. Each \texttt{b} is eight bits wide, so function \texttt{e} calls the function seven times for some of the bits in the 64-bit value of \texttt{f} that it is passed. Finally, in the main function, the array is initialised partially with a \code{for} loop, after which the \texttt{e} function is called twice, once on the volatile function and once on a constant. Interestingly, the second function call with the constant is also necessary to trigger the bug.
-
-\begin{figure}
-\begin{minted}{c}
-volatile unsigned int g = 0;
-int a[256] = {0};
-int c = 0;
-
-void d(char b) {
- c = (c & 4095) ^ a[(c ^ b) & 15];
-}
-
-void e(long f) {
- d(f); d(f >> 8); d(f >> 16); d(f >> 24);
- d(f >> 32); d(f >> 40); d(f >> 48);
-}
-
+static volatile int a[9][1][7];
int main() {
- for (int i = 0; i < 56; i++)
- a[i] = i;
- e(g);
- e(-2L);
- return c;
+ int tmp = 1;
+ for (int b = 0; b < 2; b++) {
+ a[0][0][0] = 3;
+ a[0][0][0] = a[0][0][0];
+ }
+ for (int i = 0; i < 9; i++)
+ for (int k = 0; k < 7; k++)
+ tmp ^= a[i][0][k];
+ return tmp;
}
\end{minted}
-\caption{An output mismatch: Vivado HLS returns \texttt{0x0} but the correct result is \texttt{0xF}.}\label{fig:eval:vivado:mismatch}
+\caption{Miscompilation bug in Intel i++. It should return 2 because \code{3 \^{} 1 = 2}; however, Intel i++ returns 0 instead.}\label{fig:eval:intel:mismatch}
\end{figure}
-\subsubsection{Intel i++ miscompilation}
+%\begin{example}[A miscompilation bug in Vivado HLS]
+
+%Figure~\ref{fig:eval:vivado:mismatch} shows code that should output \texttt{0xF}, but outputs \texttt{0x0} when compiled with Vivado HLS (all three versions).
+%This test-case is much larger compared to the other test-cases that were reduced.
+%We could not reduce this program any further, as everything in the code was necessary to trigger the bug.
+%This test-case is interesting because the root cause appears to be the hashing boilerplate that we added to Csmith, rather than the randomly-generated code that Csmith produced. In the main function, an array is partially initialised, and then a function \texttt{hashl} for hashing a long integer is called twice, once on a volatile global \texttt{x}, and once on a constant. That hashing function invokes a second hashing function on some of the bytes in the long integer.
-\JW{Write about one of the Intel bugs here.}
+%The array \texttt{arr} is initialised to all zeroes, as well as the other global variables \texttt{x} and \texttt{y}, so as to not introduce any undefined behaviour. However, \texttt{x} is also given the \texttt{volatile} keyword, which ensures that the variable is not optimised away. The function \texttt{hashc} then accumulates the values that it is passed into a hash stored in \texttt{h}. Each \texttt{b} is eight bits wide, so function \texttt{hashl} calls the function seven times for some of the bits in the 64-bit value of \texttt{f} that it is passed. Finally, in the main function, the array is initialised partially with a \code{for} loop, after which the \texttt{hashl} function is called twice, once on the volatile function and once on a constant. Interestingly, the second function call with the constant is also necessary to trigger the bug.
+%\end{example}
+\begin{example}[A miscompilation bug in Intel i++]
Figure~\ref{fig:eval:intel:mismatch} shows a miscompilation bug that was found in Intel i++. Intel i++ does not seem to notice the assignment to 3 in the first \code{for} loop, or performs an optimisation that analyses the array incorrectly, resulting in a wrong value being returned.
+\end{example}
%%% Local Variables:
%%% mode: latex