summaryrefslogtreecommitdiffstats
path: root/eval.tex
diff options
context:
space:
mode:
authorJohn Wickerson <j.wickerson@imperial.ac.uk>2021-03-26 21:16:22 +0000
committeroverleaf <overleaf@localhost>2021-03-28 14:43:30 +0000
commit546590b262efafc33133059f1184268dd25d89fe (patch)
treef379f428c06d1fafe2e0932d03cc500d8a40c5de /eval.tex
parentb16f3242909e90bdb547ab1500d46f5e38527981 (diff)
downloadfccm21_esrhls-546590b262efafc33133059f1184268dd25d89fe.tar.gz
fccm21_esrhls-546590b262efafc33133059f1184268dd25d89fe.zip
Update on Overleaf.
Diffstat (limited to 'eval.tex')
-rw-r--r--eval.tex270
1 files changed, 161 insertions, 109 deletions
diff --git a/eval.tex b/eval.tex
index e17d955..2bf08a1 100644
--- a/eval.tex
+++ b/eval.tex
@@ -1,11 +1,10 @@
\section{Evaluation}\label{sec:evaluation}
-We generate \totaltestcases{} test-cases and provide them to four HLS tools: Vivado HLS, LegUp HLS, Intel i++ and Bambu.
+We generate \totaltestcases{} test-cases and provide them to four HLS tools: Vivado HLS, LegUp HLS, Intel i++, and Bambu.
We use the same test-cases across all tools for fair comparison (except the HLS directives, which have tool-specific syntax).
We were able to test three different versions of Vivado HLS (v2018.3, v2019.1 and v2019.2).
We tested one version of Intel i++ (version 18.1), LegUp (4.0) and Bambu (v0.9.7).
-LegUp 7.5 is GUI-based and therefore we could not script our tests.
-However, we were able to manually reproduce all the bugs found in LegUp 4.0 in LegUp 7.5.
+LegUp 7.5 is GUI-based, so we could not script our tests; however, we were able to manually reproduce in LegUp 7.5 all the bugs found in LegUp 4.0.
% Three different tools were tested, including three different versions of Vivado HLS. We were only able to test one version of LegUp HLS (version 4.0), because although LegUp 7.5 is available, it is GUI-based and not amenable to scripting. However, bugs we found in LegUp 4.0 were reproduced manually in LegUp 7.5.
% LegUp and Vivado HLS were run under Linux, while the Intel HLS Compiler was run under Windows.
@@ -19,14 +18,40 @@ However, we were able to manually reproduce all the bugs found in LegUp 4.0 in L
\definecolor{timeout}{HTML}{ef4c4c}
\begin{figure}
\centering
+% \begin{tikzpicture}[scale=0.61]
+% \draw (-7.2,7.0) rectangle (7.2,0.7);
+% \fill[vivado,fill opacity=0.5] (0.5,4.4) ellipse (3.75 and 1.5);
+% \fill[intel,fill opacity=0.5] (-4,4.8) ellipse (2.5 and 1.3);
+% \fill[bambu,fill opacity=0.5] (2.5,3) ellipse (3.75 and 1.5);
+% \fill[legup,fill opacity=0.5] (-2.5,3) ellipse (3.75 and 1.5);
+% \draw[white, thick] (0.5,4.4) ellipse (3.75 and 1.5);
+% \draw[white, thick] (-4,4.8) ellipse (2.5 and 1.3);
+% \draw[white, thick] (2.5,3) ellipse (3.75 and 1.5);
+% \draw[white, thick] (-2.5,3) ellipse (3.75 and 1.5);
+% \node[align=center, anchor=south west] at (-0.5,6) {\textcolor{vivado}{\bf Xilinx Vivado HLS v2019.1}};
+% \node[anchor=south west] at (-6.4,6) {\textcolor{intel}{\bf Intel i++ 18.1}};
+% \node at (-3,1.1) {\textcolor{legup}{\bf LegUp 4.0}};
+% \node at (3,1.1) {\textcolor{bambu}{\bf Bambu PandA 0.9.7}};
+
+% \node at (-3.5,2.5) {\small 159 (\textcolor{timeout}{4})};
+% \node at (-5,5) {\small 26 (\textcolor{timeout}{540})};
+% \node at (-4,3.9) {\small 1 (\textcolor{timeout}{1})};
+% \node at (-2.3,4.8) {\small 0 (\textcolor{timeout}{5})};
+% \node at (-1.5,3.8) {\small 4 (\textcolor{timeout}{0})};
+% \node at (0,2.5) {\small 3 (\textcolor{timeout}{2})};
+% \node at (3.5,2.5) {\small 906 (\textcolor{timeout}{14})};
+% \node at (2.5,3.8) {\small 9 (\textcolor{timeout}{0})};
+% \node at (0,5) {\small 70 (\textcolor{timeout}{20})};
+% \node at (-6,1.4) {4936};
+% \end{tikzpicture}
\begin{tikzpicture}[scale=0.61]
\draw (-7.2,7.0) rectangle (7.2,0.7);
- \fill[vivado,fill opacity=0.5] (0.5,4.4) ellipse (3.75 and 1.5);
- \fill[intel,fill opacity=0.5] (-4,4.8) ellipse (2.5 and 1.3);
+ \fill[vivado,fill opacity=0.5] (0.9,4.4) ellipse (3.3 and 1.5);
+ \fill[intel,fill opacity=0.5] (-4.5,4.8) ellipse (2.0 and 1.3);
\fill[bambu,fill opacity=0.5] (2.5,3) ellipse (3.75 and 1.5);
\fill[legup,fill opacity=0.5] (-2.5,3) ellipse (3.75 and 1.5);
- \draw[white, thick] (0.5,4.4) ellipse (3.75 and 1.5);
- \draw[white, thick] (-4,4.8) ellipse (2.5 and 1.3);
+ \draw[white, thick] (0.9,4.4) ellipse (3.3 and 1.5);
+ \draw[white, thick] (-4.5,4.8) ellipse (2.0 and 1.3);
\draw[white, thick] (2.5,3) ellipse (3.75 and 1.5);
\draw[white, thick] (-2.5,3) ellipse (3.75 and 1.5);
\node[align=center, anchor=south west] at (-0.5,6) {\textcolor{vivado}{\bf Xilinx Vivado HLS v2019.1}};
@@ -34,112 +59,42 @@ However, we were able to manually reproduce all the bugs found in LegUp 4.0 in L
\node at (-3,1.1) {\textcolor{legup}{\bf LegUp 4.0}};
\node at (3,1.1) {\textcolor{bambu}{\bf Bambu PandA 0.9.7}};
- \node at (-3.5,2.5) {\small 159 (\textcolor{timeout}{4})};
- \node at (-5,5) {\small 26 (\textcolor{timeout}{540})};
- \node at (-4,3.9) {\small 1 (\textcolor{timeout}{1})};
- \node at (-2.3,4.8) {\small 0 (\textcolor{timeout}{5})};
- \node at (-1.5,3.8) {\small 4 (\textcolor{timeout}{0})};
- \node at (0,2.5) {\small 3 (\textcolor{timeout}{2})};
- \node at (3.5,2.5) {\small 906 (\textcolor{timeout}{14})};
- \node at (2.5,3.8) {\small 9 (\textcolor{timeout}{0})};
- \node at (0,5) {\small 70 (\textcolor{timeout}{20})};
- \node at (-6,1.4) {4936};
+ \node at (-3.5,2.5) {\small 159};
+ \node at (-5,5) {\small 26};
+ \node at (-4,3.9) {\small 1};
+ \node at (-1.5,3.8) {\small 4};
+ \node at (0,2.5) {\small 3};
+ \node at (3.5,2.5) {\small 906};
+ \node at (2.5,3.8) {\small 9};
+ \node at (0,5) {\small 70};
+ \node at (-6,1.4) {5522};
\end{tikzpicture}
-\caption{The number of failures per tool out of \totaltestcases{} test-cases. Overlapping regions mean that the same test-cases failed in multiple tools. The numbers in parentheses report how many test-cases timed out.}\label{fig:existing_tools}
+\caption{The number of failures per tool out of \totaltestcases{} test-cases. Overlapping regions mean that the same test-cases failed in multiple tools. \JW{Todo: update the Bambu numbers.} %The numbers in parentheses report how many test-cases timed out.
+}\label{fig:existing_tools}
\end{figure}
-Figure~\ref{fig:existing_tools} shows a Venn diagram of our results.
+Figure~\ref{fig:existing_tools} shows an Euler diagram of our results.
We see that 918 (13.7\%), 167 (2.5\%), 83 (1.2\%) and 26 (0.4\%) test-cases fail in Bambu, LegUp, Vivado HLS and Intel i++ respectively.
+\JW{Somewhere around here mention that Bambu originally had M failures, but after a single bugfix, it went down to N failures. Maybe mention that we would have extended the same courtesy to the other tools had they released fixed versions of their tools promptly?}
Despite i++ having the lowest failure rate, it has the highest time-out rate (540 test-cases), because of its remarkably long compilation time.
% We remark that although the Intel HLS Compiler had the smallest number of confirmed test-case failures, it had the most time-outs (which could be masking additional failures)
Note that the absolute numbers here do not necessarily correspond to the number of bugs in the tools, because a single bug in a language feature that appears frequently in our test suite could cause many programs to crash or fail.
-Hence, we reduce many of the failing test-cases in an effort to identify unique bugs; these are summarised in the table below.\footnote{Link to detailed bug reports available from PC Chair.}
+Moreover, we are reluctant to draw conclusions about the relative reliability of each tool by comparing the number of test-case failures, because these numbers are so sensitive to the parameters of the randomly generated test suite we used. In other words, we can confirm the \emph{presence} of bugs, but cannot deduce the \emph{number} of them (nor their importance).
-\begin{table}[h]
-\centering
-\begin{tabular}{lr}\toprule
- \textbf{Tool} & \textbf{Unique Bugs}\\
- \midrule
- Xilinx Vivado HLS v2019.1 & $\ge 2$\\
- LegUp HLS & $\ge 3$\\
- Intel i++ & $\ge 1$\\
- Bambu HLS & $\ge 2$\\
- \bottomrule
- \end{tabular}
-\end{table}
-
-We write `$\ge$' above to emphasise that all the bug counts are lower bounds -- we did not have time to go through the rather arduous test-case reduction process for every failure.
-
-\subsection{Results across versions of an HLS tool}
-
-\definecolor{ribbon1}{HTML}{8dd3c7}
-\definecolor{ribbon2}{HTML}{b3de69}
-\definecolor{ribbon3}{HTML}{bebada}
-\definecolor{ribbon4}{HTML}{fb8072}
-\definecolor{ribbon5}{HTML}{80b1d3}
-\definecolor{ribbon6}{HTML}{fdb462}
-\begin{figure}
- \centering
- \begin{tikzpicture}[xscale=1.25]
- \draw[white, fill=ribbon1] (-1.0,4.1) -- (0.0,4.1000000000000005) to [out=0,in=180] (2.0,4.1000000000000005) to [out=0,in=180] (4.0,4.1000000000000005) -- (6.0,4.1000000000000005) -- %(7.55,3.325) --
- (6.0,2.5500000000000003) -- (4.0,2.5500000000000003) to [out=180,in=0] (2.0,2.5500000000000003) to [out=180,in=0] (0.0,2.5500000000000003) -- (-1.0,2.55) -- cycle;
- \draw[white, fill=ribbon2] (-1.0,2.55) -- (0.0,2.5500000000000003) to [out=0,in=180] (2.0,1.8) to [out=0,in=180] (4.0,1.55) -- (6.0,1.55) -- %(7.3,0.9) --
- (6.0,0.25) -- (4.0,0.25) to [out=180,in=0] (2.0,0.5) to [out=180,in=0] (0.0,1.25) -- (-1.0,1.25) -- cycle;
- \draw[white, fill=ribbon3] (-1.0,1.25) -- (0.0,1.25) to [out=0,in=180] (2.0,2.5500000000000003) to [out=0,in=180] (4.0,0.25) -- (6.0,0.25) -- %(6.05,0.225) --
- (6.0,0.2) -- (4.0,0.2) to [out=180,in=0] (2.0,2.5) to [out=180,in=0] (0.0,1.2000000000000002) -- (-1.0,1.2) -- cycle;
- \draw[white, fill=ribbon4] (-1.0,0.5) -- (0.0,0.5) to [out=0,in=180] (2.0,2.5) to [out=0,in=180] (4.0,0.2) -- (6.0,0.2) -- %(6.2,0.1) --
- (6.0,0.0) -- (4.0,0.0) to [out=180,in=0] (2.0,2.3000000000000003) to [out=180,in=0] (0.0,0.30000000000000004) -- (-1.0,0.3) -- cycle;
- \draw[white, fill=ribbon5] (-1.0,1.2) -- (0.0,1.2000000000000002) to [out=0,in=180] (2.0,0.5) to [out=0,in=180] (4.0,2.5500000000000003) -- (6.0,2.5500000000000003) -- %(6.2,2.45) --
- (6.0,2.35) -- (4.0,2.35) to [out=180,in=0] (2.0,0.30000000000000004) to [out=180,in=0] (0.0,1.0) -- (-1.0,1.0) -- cycle;
- \draw[white, fill=ribbon6] (-1.0,0.3) -- (0.0,0.30000000000000004) to [out=0,in=180] (2.0,0.30000000000000004) to [out=0,in=180] (4.0,2.35) -- (6.0,2.35) -- %(6.3,2.2) --
- (6.0,2.0500000000000003) -- (4.0,2.0500000000000003) to [out=180,in=0] (2.0,0.0) to [out=180,in=0] (0.0,0.0) -- (-1.0,0.0) -- cycle;
+We have reduced several of the failing test-cases in an effort to identify particular bugs, and our findings are summarised in Table~\ref{tab:bugsummary}. We emphasise that the bug counts here are lower bounds -- we did not have time to go through the arduous test-case reduction process for every failure.
+Figures~\ref{fig:eval:legup:crash}, \ref{fig:eval:intel:mismatch}, and~\ref{fig:eval:bambu:mismatch} present three of the bugs we found. As in Example~\ref{ex:vivado_miscomp}, each bug was first reduced automatically using \creduce{}, and then further reduced manually to obtain a minimal test-case.
- \draw[white, fill=black] (-0.4,4.1) rectangle (0.0,1.0);
- \draw[white, fill=black] (1.8,4.1) rectangle (2.2,2.3);
- \draw[white, fill=black] (3.8,4.1) rectangle (4.2,2.05);
-
- \node at (-0.2,4.5) {v2018.3};
- \node at (2,4.5) {v2019.1};
- \node at (4,4.5) {v2019.2};
- %\node at (2,5) {Vivado HLS};
-
- \node at (5.5,3.325) {31};
- \node at (5.5,0.9) {26};
- \node at (5.5,2.2) {6};
-
- \node[white] at (-0.2,1.2) {62};
- \node[white] at (2,2.5) {36};
- \node[white] at (4,2.25) {41};
- \end{tikzpicture}
- \caption{A Sankey diagram that tracks \vivadotestcases{} test-cases through three different versions of Vivado HLS. The ribbons collect the test-cases that pass and fail together. The black bars are labelled with the total number of test-case failures per version. The 3573 test-cases that pass in all three versions are not depicted.
- }\label{fig:sankey_diagram}
-\end{figure}
-
-Besides comparing the reliability of different HLS tools, we also investigated the reliability of Vivado HLS over time. Figure~\ref{fig:sankey_diagram} shows the results of giving \vivadotestcases{} test-cases to Vivado HLS v2018.3, v2019.1 and v2019.2.
-Test-cases that pass and fail in the same tools are grouped together into a ribbon.
-For instance, the topmost ribbon represents the 31 test-cases that fail in all three versions of Vivado HLS. Other ribbons can be seen weaving in and out; these indicate that bugs were fixed or reintroduced in the various versions. We see that Vivado HLS v2018.3 had the most test-case failures (62).
-Interestingly, as an indicator of reliability of HLS tools, the blue ribbon shows that there are test-cases that fail in v2018.3, pass in v2019.1 but then fail again in v2019.2.
-As in our Venn diagram, the absolute numbers here do not necessary correspond to the number of actual bugs, but we can deduce that there must be at least six unique bugs in Vivado HLS, given that each ribbon corresponds to at least one unique bug.
-
-
-
-
-%\YH{Contradicts value of 3 in Table~\ref{tab:unique_bugs}, maybe I can change that to 6?} \JW{I'd leave it as-is personally; we have already put a `$\ge$' symbol in the table, so I think it's fine.}
-%In addition to that, it can then be seen that Vivado HLS v2018.3 must have at least 4 individual bugs, of which two were fixed and two others stayed in Vivado HLS v2019.1. However, with the release of v2019.1, new bugs were introduced as well. % Finally, for version 2019.2 of Vivado HLS, there seems to be a bug that was reintroduced which was also present in Vivado 2018.3, in addition to a new bug. In general it seems like each release of Vivado HLS will have new bugs present, however, will also contain many previous bug fixes. However, it cannot be guaranteed that a bug that was previously fixed will remain fixed in future versions as well.
-
-\subsection{Some specific bugs found}
-
-We now describe three more of the bugs we found: one crash bug in LegUp, and a miscompilation in Intel and Bambu respectively. As in Example~\ref{ex:vivado_miscomp}, each bug was first reduced automatically using \creduce{}, and then reduced further manually to achieve the minimal test-case.
-
-\begin{example}[A crash bug in LegUp]
-The program shown below leads to an internal compiler error (an unhandled assertion in this case) in LegUp 4.0 and 7.5.
+\begin{figure}[t]
\begin{minted}{c}
int a[2][2][1] = {{{0},{1}},{{0},{0}}};
int main() { a[0][1][0] = 1; }
\end{minted}
+\caption{This program leads to an internal compiler error (an unhandled assertion in this case) in LegUp 4.0 and 7.5. It initialises a 3D array that is zero everywhere except for one element, and then assigns to that same element. The bug only appears when function inlining is disabled (\texttt{NO\_INLINE}).}
+\label{fig:eval:legup:crash}
+\end{figure}
%An assertion error counts as a crash of the tool, as it means that an unexpected state was reached by this input.
%This shows that there is a bug in one of the compilation passes in LegUp, however, due to the assertion the bug is caught in the tool before it produces an incorrect design.
-It initialises a 3D array with zeroes, and then assigns to one element. The bug only appears when function inlining is disabled (\texttt{NO\_INLINE}). % The code initialises the array with zeroes except for \texttt{a[0][1][0]}, which is set to one. Then the main function assigns one to that same location. This code on its own should not actually produce a result and should just terminate by returning 0, which is also what the design that LegUp generates does when the \texttt{NO\_INLINE} flag is turned off.
+% The code initialises the array with zeroes except for \texttt{a[0][1][0]}, which is set to one. Then the main function assigns one to that same location. This code on its own should not actually produce a result and should just terminate by returning 0, which is also what the design that LegUp generates does when the \texttt{NO\_INLINE} flag is turned off.
%The following code also produces an assertion error in LegUp, which is a different one this time. This bug was not discovered during the main test runs of 10 thousand test cases, but beforehand, which meant that we disabled unions from being generated. However, this bug also requires the \texttt{volatile} keyword which seems to be the reason for quite a few mismatches in LegUp and Vivado.
%
@@ -151,9 +106,9 @@ It initialises a 3D array with zeroes, and then assigns to one element. The bug
%int main() { return un.a; }
%\end{minted}
-\end{example}
+%\end{example}
-\begin{figure}
+\begin{figure}[t]
\begin{minted}{c}
static volatile int a[9][1][7];
int main() {
@@ -168,10 +123,10 @@ int main() {
return tmp;
}
\end{minted}
-\caption{Miscompilation bug in Intel i++. It should return 2 because \code{3 \^{} 1 = 2}, however, Intel i++ returns 0 instead.}\label{fig:eval:intel:mismatch}
+\caption{This program miscompiles in Intel i++. It should return 2 because \code{3 \^{} 1 = 2}, but Intel i++ generates a design that returns 0 instead. Perhaps the assignment to 3 in the first for-loop is being overlooked.}\label{fig:eval:intel:mismatch}
\end{figure}
-\begin{figure}
+\begin{figure}[t]
\begin{minted}{c}
static int b = 0x10000;
static volatile short a = 0;
@@ -182,7 +137,7 @@ int main() {
return b;
}
\end{minted}
-\caption{Miscompilation bug in Bambu. As the value of \texttt{b} is shifted to the right by 8, the output should be \texttt{0x100}. However, Bambu outputs 0.}\label{fig:eval:bambu:mismatch}
+\caption{This program miscompiles in Bambu. As the value of \texttt{b} is shifted to the right by 8, the output should be \texttt{0x100}, but Bambu generates a design that returns 0. The increment operation on \texttt{a} appears unrelated, but is necessary to trigger the bug.}\label{fig:eval:bambu:mismatch}
\end{figure}
%\begin{example}[A miscompilation bug in Vivado HLS]
@@ -196,13 +151,110 @@ int main() {
%\end{example}
-\begin{example}[A miscompilation bug in Intel i++]
-Figure~\ref{fig:eval:intel:mismatch} shows a miscompilation bug that was found in Intel i++. Intel i++ does not seem to notice the assignment to 3 in the previous for loop, or tries to perform some optimisations that seem to analyse the array incorrectly and therefore results in a wrong value being returned.
-\end{example}
+%\begin{example}[A miscompilation bug in Intel i++]
+%Figure~\ref{fig:eval:intel:mismatch} shows a miscompilation bug found in Intel i++. Intel i++ does not seem to notice the assignment to 3 in the previous for loop, or tries to perform some optimisations that seem to analyse the array incorrectly and therefore results in a wrong value being returned.
+%\end{example}
+
+%\begin{example}[A miscompilation bug in Bambu]
+%Figure~\ref{fig:eval:bambu:mismatch} shows a miscompilation bug in Bambu, where the result of the value in \texttt{b} is affected by the increment operation on \texttt{a}.
+%\end{example}
+
+%We have reduced several of the failing test-cases in an effort to identify particular bugs; these are summarised in the table below.\footnote{Link to detailed bug reports available from PC Chair.} \JW{One reviewer complained about this table not having a caption.} \JW{How about we extend this table so it has one bug per row? See my attempt in Table~\ref{tab:bugsummary}.}
+
+%\begin{table}[h]
+%\centering
+%\begin{tabular}{lr}\toprule
+% \textbf{Tool} & \textbf{Unique Bugs}\\
+% \midrule
+% Xilinx Vivado HLS v2019.1 & $\ge 2$\\
+% LegUp HLS & $\ge 3$\\
+% Intel i++ & $\ge 1$\\
+% Bambu HLS & $\ge 2$\\
+% \bottomrule
+% \end{tabular}
+%\end{table}
+
+\begin{table}[t]
+\centering
+\caption{A summary of the bugs we found.}
+\label{tab:bugsummary}
+\begin{tabular}{llll}\toprule
+ \textbf{Tool} & \textbf{Bug type} & \textbf{Details} & \textbf{Status} \\
+ \midrule
+ Vivado HLS & miscompile & Fig.~\ref{fig:vivado_bug1} & reported, confirmed \\
+ Vivado HLS & miscompile & webpage & reported \\
+ LegUp HLS & crash & Fig.~\ref{fig:eval:legup:crash} & reported, confirmed \\
+ LegUp HLS & crash & webpage & \JW{status?} \\
+ LegUp HLS & miscompile & webpage & reported, confirmed \\
+ Intel i++ & miscompile & Fig.~\ref{fig:eval:intel:mismatch} & reported \\
+ Bambu HLS & miscompile & Fig.~\ref{fig:eval:bambu:mismatch} & reported, confirmed, fixed \\
+ Bambu HLS & miscompile & webpage & reported, confirmed \\
+ \bottomrule
+ \end{tabular}
+\end{table}
+
+%We write `$\ge$' above to emphasise that all the bug counts are lower bounds -- we did not have time to go through the rather arduous test-case reduction process for every failure.
+
+\subsection{Results across versions of an HLS tool}
+
+\definecolor{ribbon1}{HTML}{8dd3c7}
+\definecolor{ribbon2}{HTML}{b3de69}
+\definecolor{ribbon3}{HTML}{bebada}
+\definecolor{ribbon4}{HTML}{fb8072}
+\definecolor{ribbon5}{HTML}{80b1d3}
+\definecolor{ribbon6}{HTML}{fdb462}
+\begin{figure}[t]
+ \centering
+ \begin{tikzpicture}[xscale=1.25]
+ \draw[white, fill=ribbon1] (-1.0,4.1) -- (0.0,4.1) to [out=0,in=180] (2.0,4.1) to [out=0,in=180] (4.0,4.1) -- (6.0,4.1) -- %(7.55,3.325) --
+ (6.0,2.55) -- (4.0,2.55) to [out=180,in=0] (2.0,2.55) to [out=180,in=0] (0.0,2.55) -- (-1.0,2.55) -- cycle;
+ \draw[white, fill=ribbon2] (-1.0,2.55) -- (0.0,2.55) to [out=0,in=180] (1.8,1.8) -- (2.2,1.8) to [out=0,in=180] (4.0,1.55) -- (6.0,1.55) -- %(7.3,0.9) --
+ (6.0,0.25) -- (4.0,0.25) to [out=180,in=0] (2.2,0.5) -- (1.8,0.5) to [out=180,in=0] (0.0,1.25) -- (-1.0,1.25) -- cycle;
+ \draw[white, fill=ribbon3] (-1.0,1.25) -- (0.0,1.25) to [out=0,in=180] (1.8,2.55) -- (2.2,2.55) to [out=0,in=180] (4.0,0.25) -- (6.0,0.25) -- %(6.05,0.225) --
+ (6.0,0.2) -- (4.0,0.2) to [out=180,in=0] (2.2,2.5) -- (1.8,2.5) to [out=180,in=0] (0.0,1.2) -- (-1.0,1.2) -- cycle;
+ \draw[white, fill=ribbon4] (-1.0,0.5) -- (0.0,0.5) to [out=0,in=180] (1.8,2.5) -- (2.2,2.5)to [out=0,in=180] (4.0,0.2) -- (6.0,0.2) -- %(6.2,0.1) --
+ (6.0,0.0) -- (4.0,0.0) to [out=180,in=0] (2.2,2.3) -- (1.8,2.3) to [out=180,in=0] (0.0,0.3) -- (-1.0,0.3) -- cycle;
+ \draw[white, fill=ribbon5] (-1.0,1.2) -- (0.0,1.2) to [out=0,in=180] (1.8,0.5) -- (2.2,0.5) to [out=0,in=180] (4.0,2.55) -- (6.0,2.55) -- %(6.2,2.45) --
+ (6.0,2.35) -- (4.0,2.35) to [out=180,in=0] (2.2,0.3) -- (1.8,0.3) to [out=180,in=0] (0.0,1.0) -- (-1.0,1.0) -- cycle;
+ \draw[white, fill=ribbon6] (-1.0,0.3) -- (0.0,0.3) to [out=0,in=180] (1.8,0.3) -- (2.2,0.3) to [out=0,in=180] (4.0,2.35) -- (6.0,2.35) -- %(6.3,2.2) --
+ (6.0,2.05) -- (4.0,2.05) to [out=180,in=0] (2.2,0.0) -- (1.8,0.0) to [out=180,in=0] (0.0,0.0) -- (-1.0,0.0) -- cycle;
+
+ \draw[white, fill=black] (-0.4,4.1) rectangle (0.0,1.0);
+ \draw[white, fill=black] (1.8,4.1) rectangle (2.2,2.3);
+ \draw[white, fill=black] (4.0,4.1) rectangle (4.4,2.05);
+
+ \node at (-0.2,4.5) {v2018.3};
+ \node at (2,4.5) {v2019.1};
+ \node at (4.2,4.5) {v2019.2};
+ %\node at (2,5) {Vivado HLS};
+
+ \node at (5.5,3.325) {31};
+ \node at (5.5,0.9) {26};
+ \node at (5.5,2.2) {6};
+
+ \node[white] at (-0.2,1.2) {62};
+ \node[white] at (2,2.5) {36};
+ \node[white] at (4.2,2.25) {41};
+ \end{tikzpicture}
+ \caption{A Sankey diagram that tracks \vivadotestcases{} test-cases through three different versions of Vivado HLS. The ribbons collect the test-cases that pass and fail together. The black bars are labelled with the total number of test-case failures per version. The 3573 test-cases that pass in all three versions are not depicted.
+ }\label{fig:sankey_diagram}
+\end{figure}
+
+Besides studying the reliability of different HLS tools, we also studied the reliability of Vivado HLS over time. Figure~\ref{fig:sankey_diagram} shows the results of giving \vivadotestcases{} test-cases to Vivado HLS v2018.3, v2019.1 and v2019.2.
+Test-cases that pass and fail in the same versions are grouped together into a ribbon.
+For instance, the topmost ribbon represents the 31 test-cases that fail in all three versions of Vivado HLS. Other ribbons can be seen weaving in and out; these indicate that bugs were fixed or reintroduced in the various versions. We see that Vivado HLS v2018.3 had the most test-case failures (62).
+Interestingly, the blue ribbon shows that there are test-cases that fail in v2018.3, pass in v2019.1, and then fail again in v2019.2.
+As in our Euler diagram, the absolute numbers here do not necessarily correspond to the number of actual bugs, though we can observe that there must be at least six unique bugs in Vivado HLS, given that each ribbon corresponds to at least one unique bug.
+
+
+
+
+%\YH{Contradicts value of 3 in Table~\ref{tab:unique_bugs}, maybe I can change that to 6?} \JW{I'd leave it as-is personally; we have already put a `$\ge$' symbol in the table, so I think it's fine.}
+%In addition to that, it can then be seen that Vivado HLS v2018.3 must have at least 4 individual bugs, of which two were fixed and two others stayed in Vivado HLS v2019.1. However, with the release of v2019.1, new bugs were introduced as well. % Finally, for version 2019.2 of Vivado HLS, there seems to be a bug that was reintroduced which was also present in Vivado 2018.3, in addition to a new bug. In general it seems like each release of Vivado HLS will have new bugs present, however, will also contain many previous bug fixes. However, it cannot be guaranteed that a bug that was previously fixed will remain fixed in future versions as well.
+
+%\subsection{Some specific bugs found}
+
-\begin{example}[A miscompilation bug in Bambu]
-Figure~\ref{fig:eval:bambu:mismatch} shows a miscompilation bug in Bambu, where the result of the value in \texttt{b} is affected by the increment operation on \texttt{a}.
-\end{example}
%%% Local Variables:
%%% mode: latex