\documentclass[hyphens,prologue,x11names,rgb,sigconf]{acmart}

\def\ymhgistrue{1}\def\ymhgident%
  {1}
% ^^^ Set ident to 0 for anonymous, and 1 for personal info

\usepackage[english]{babel}
\usepackage[nounderscore]{syntax}
\usepackage{graphicx}
\usepackage{siunitx}
\usepackage{minted}
\usepackage{amsthm}
\usepackage{pgfplots}
\usepackage{tikz}
\usepackage{subcaption}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{multicol}
\usepackage{hyperref}
%\usepackage{balance}

\copyrightyear{2020}
\acmYear{2020}
\setcopyright{acmcopyright}
\acmConference[FPGA '20]{Proceedings of the 2020 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays}{February 23--25, 2020}{Seaside, CA, USA}
\acmBooktitle{Proceedings of the 2020 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA '20), February 23--25, 2020, Seaside, CA, USA}
\acmPrice{15.00}
\acmDOI{10.1145/3373087.3375310}
\acmISBN{978-1-4503-7099-8/20/02}

%%
%% The majority of ACM publications use numbered citations and
%% references.  The command \citestyle{authoryear} switches to the
%% "author year" style.
%%
%% If you are preparing content for an event
%% sponsored by ACM SIGGRAPH, you must use the "author year" style of
%% citations and references.
%% Uncommenting
%% the next command will enable that style.
%% \citestyle{acmauthoryear}

\settopmatter{printacmref=true}

\renewcommand{\syntleft}{\it\/}
\renewcommand{\syntright}{}
\renewcommand{\litleft}{\ttfamily}
\renewcommand{\litright}{}

\theoremstyle{remark}
\newtheorem*{remark}{Remark}

\setminted{fontsize=\small,baselinestretch=1,frame=lines,linenos,numbersep=5pt}

\pgfplotsset{compat=1.15}
\usetikzlibrary{calc,arrows.meta}
\usetikzlibrary{positioning}
\usepgfplotslibrary{groupplots}
%\usetikzlibrary{external}
%\tikzexternalize[prefix=figures_conference/]

\tikzset{
  small dot/.style={fill=red,circle,scale=1},
}

\newcommand{\tool}[2][]{%
  \ifx\hfuzz#1\hfuzz #2%
  \else #2 #1%
  \fi}%
\newcommand{\xilinx}[1][]{\tool[#1]{Xilinx}}%
\newcommand{\intel}[1][]{\tool[#1]{Intel}}%
\newcommand{\quartus}[1][]{\tool[#1]{Quartus Prime}}%
\newcommand{\quartuslite}[1][]{\tool[#1]{Quartus Prime Lite}}%
\newcommand{\yosys}[1][]{\tool[#1]{Yosys}}%
\newcommand{\xst}[1][]{\tool[#1]{XST}}%
\newcommand{\vivado}[1][]{\tool[#1]{Vivado}}%
\newcommand{\verismith}[1][]{\tool[#1]{Verismith}}%
\newcommand{\iverilog}[1][]{\tool[#1]{Icarus Verilog}}%
\newcommand{\bugtracker}[1]{%
  \ifx\ymhgistrue\ymhgident%
  \footnote{#1}%
  \else%
  \footnote{Bug tracker URL redacted for blind review.}%
  \fi%
}
\newcommand{\personallink}[1]{%
  \ifx\ymhgistrue\ymhgident%
  \footnote{#1}%
  \else%
  \footnote{URL redacted for blind review.}%
  \fi%
}

\pgfplotsset{tiny/.style={width=4.5cm,
    height=,
    legend style={font=\tiny},
    tick label style={font=\tiny},
    label style={font=\tiny},
    title style={font=\footnotesize},
    every axis title shift=0pt,
    max space between ticks=12,
    every mark/.append style={mark size=6},
    major tick length=0.1cm,
    minor tick length=0.066cm,
    every legend image post/.append style={scale=0.8},
  },
}

%
\definecolor{bblue}{HTML}{4F81BD}%
\definecolor{lightbblue}{HTML}{89B1E1}%
\definecolor{rred}{HTML}{C0504D}%
\definecolor{lightrred}{HTML}{FA8D8A}%
\definecolor{ggreen}{HTML}{9BBB59}%
\definecolor{ppurple}{HTML}{9F4C7C}%
\definecolor{yyellow}{HTML}{FFA000}

\definecolor{ribbon1}{HTML}{66C2A5}
\definecolor{ribbon2}{HTML}{FC8D62}
\definecolor{ribbon3}{HTML}{8DA0CB}
\definecolor{ribbon4}{HTML}{E78AC3}
\definecolor{ribbon5}{HTML}{A6D854}
\definecolor{ribbon6}{HTML}{FFD92F}

\definecolor{mscol1}{HTML}{E66101}
\definecolor{mscol2}{HTML}{5E3C99}
\definecolor{ccol1}{HTML}{FDB863}
\definecolor{ccol2}{HTML}{B2ABD2}

\definecolor{verireduce}{HTML}{E66101}
\definecolor{creduce}{HTML}{5E3C99}

\pgfplotsset{
  verismith missynth/.style={verireduce, draw=verireduce, mark=*},
  creduce missynth/.style={creduce, draw=creduce, mark=triangle*},
  verismith crash/.style={verireduce, draw=verireduce, mark=o},
  creduce crash/.style={creduce, draw=creduce, mark=triangle}
}

\definecolor{diag1}{HTML}{CCEBC5}
\definecolor{diag2}{HTML}{B3CDE3}
\definecolor{diagerr}{HTML}{FBB4AE}

\definecolor{distr1}{HTML}{1B9E77}
\definecolor{distr2}{HTML}{D95F02}
\definecolor{distr3}{HTML}{7570B3}
\definecolor{distr4}{HTML}{E7298A}
\definecolor{distr5}{HTML}{66A61E}
\definecolor{distr6}{HTML}{E6AB02}
\definecolor{distr7}{HTML}{A6761D}
\definecolor{distr8}{HTML}{666666}

\definecolor{combined}{HTML}{008080}
\definecolor{uncombined}{HTML}{008080}

%%
%% end of the preamble, start of the body of the document source.
\begin{document}

\fancyhead{}

%%
%% The "title" command has an optional parameter,
%% allowing the author to define a "short title" to be used in page headers.
\title{Finding and Understanding Bugs in FPGA Synthesis Tools}

%%
%% The "author" command and its associated commands are used to define
%% the authors and their affiliations.
%% Of note is the shared affiliation of the first two authors, and the
%% "authornote" and "authornotemark" commands
%% used to denote shared contribution to the research.
\author{Yann Herklotz}
\email{yann.herklotz15@imperial.ac.uk}
\affiliation{%
  \institution{Imperial College London}
  \streetaddress{South Kensington Campus}
  \city{London}
  \country{UK}
  \postcode{SW7 2AZ}
}

\author{John Wickerson}
\email{j.wickerson@imperial.ac.uk}
\affiliation{%
  \institution{Imperial College London}
  \streetaddress{South Kensington Campus}
  \city{London}
  \country{UK}
  \postcode{SW7 2AZ}
}

%%
%% By default, the full list of authors will be used in the page
%% headers. Often, this list is too long, and will overlap
%% other information printed in the page headers. This command allows
%% the author to define a more concise list
%% of authors' names for this purpose.
\renewcommand{\shortauthors}{Yann Herklotz and John Wickerson}

%%
%% The abstract is a short summary of the work to be presented in the
%% article.
\begin{abstract}
  All software ultimately relies on hardware functioning correctly. Hardware correctness is becoming increasingly important due to the growing use of custom accelerators using FPGAs to speed up applications on servers. Furthermore, the increasing complexity of hardware also leads to ever more reliance on automation, meaning that the correctness of synthesis tools is vital for the reliability of the hardware.

  This paper aims to improve the quality of FPGA synthesis tools by introducing a method to test them automatically using randomly generated, correct Verilog, and checking that the synthesised netlist is always equivalent to the original design. The main contributions of this work are twofold: firstly a method for generating random behavioural Verilog free of undefined values, and secondly a Verilog test case reducer used to locate the cause of the bug that was found. These are implemented in a tool called \verismith{}. This paper also provides a qualitative and quantitative analysis of the bugs found in \yosys{}, \vivado{}, \xst{} and \quartus{}. Every synthesis tool except \quartus{} was found to introduce discrepancies between the netlist and the design. In addition to that, \vivado{} and a development version of \yosys{} were found to crash when given valid input. Using \verismith{}, eleven bugs were reported to tool vendors, of which six have already been fixed.
\end{abstract}

%%
%% The code below is generated by the tool at http://dl.acm.org/ccs.cfm.
%% Please copy and paste the code instead of the example below.
%%

\begin{CCSXML}
<ccs2012>
<concept>
<concept_id>10010583.10010682</concept_id>
<concept_desc>Hardware~Electronic design automation</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10010583.10010682.10010689</concept_id>
<concept_desc>Hardware~Hardware description languages and compilation</concept_desc>
<concept_significance>500</concept_significance>
</concept>
<concept>
<concept_id>10011007.10011074.10011099</concept_id>
<concept_desc>Software and its engineering~Software verification and validation</concept_desc>
<concept_significance>500</concept_significance>
</concept>
</ccs2012>
\end{CCSXML}

\ccsdesc[500]{Hardware~Electronic design automation}
\ccsdesc[500]{Hardware~Hardware description languages and compilation}
\ccsdesc[500]{Software and its engineering~Software verification and validation}

%%
%% Keywords. The author(s) should pick words that accurately describe
%% the work being presented. Separate the keywords with commas.
\keywords{fuzzing, logic synthesis, Verilog, test case reduction}

%%
%% This command processes the author and affiliation and title
%% information and builds the first part of the formatted document.
\maketitle

\section{Introduction}

Almost all digital computation performed in the world today relies, in one way or another, on a logic synthesis tool. Computation specified in RTL passes through a logic synthesis tool before being implemented on an FPGA or an ASIC.\@ Even designs that are expressed in higher-level languages eventually get synthesised down to RTL.\@ Computation that is executed in software is carried out on a processor whose design has also, at some point, passed through a logic synthesis tool.

These tools are not only \emph{pervasive}: they are \emph{trusted}. That is, any bugs they contain undermine efforts to ensure the correctness of hardware designs. For instance, the Silver processor has been formally proven to implement its ISA correctly~\cite{loow19_verif_compil_verif_proces}, and the Kami platform enables hardware designs to be formally verified using Coq~\cite{choi17_kami}. Yet in both cases, the final hardware is only as reliable as the logic synthesis tool that produces it. That these tools are trusted is explicitly acknowledged -- Silver's correctness proof assumes that the ``toolchain taking Verilog to FPGA bitstreams is bug-free'', while Kami's guarantees hold only ``if we trust [the] compiler to preserve the semantics.'' We ask in this paper whether this trust is well placed.

\begin{figure}
\begin{minted}{verilog}
module top (y, clk, w1);
   output y;
   input clk;
   input signed [1:0] w1;
   reg r1 = 1'b0;
   assign y = r1;
   always @(posedge clk) 
     if ({-1'b1 == w1}) r1 <= 1'b1;
endmodule
\end{minted}
  \caption[Synthesis bug in \vivado{}.]{\vivado{} bug found automatically by \verismith{}. \vivado{} incorrectly expands \texttt{-1'b1} to \texttt{-2'b11} instead of \texttt{-2'b01}. The bug was reported and confirmed by \xilinx{}.\footnotemark{}}\label{fig:synth_bug}
\end{figure}
\footnotetext{\url{https://forums.xilinx.com/t5/Synthesis/Vivado-2019-1-Unsigned-bit-extension-in-if-statement/td-p/981789}}

Logic synthesis tools are prone to bugs because of the complexity involved in performing the aggressive optimisations that are required to meet power consumption and timing demands. Moreover, these bugs are likely to be particularly egregious because they can be hard to detect. This is especially the case when synthesising large designs, because post-synthesis simulation or verification is often skipped, or is only performed towards the end of the development cycle, due to time constraints. Even when these bugs \emph{are} detected during post-synthesis testing, the root cause can be extremely challenging to isolate and work around~\cite{mcdonald06_logic_equiv_check_arriv_fpga_devel}. With hardware designs growing ever larger and increasingly being created by software engineers operating HLS tools, this inability to debug is becoming ever more troublesome.

In this paper, we describe the design and implementation of a tool called \verismith{}~\cite{herklotz_verismith} for finding and understanding bugs in logic synthesis tools that target FPGAs. \verismith{} generates pseudo-random, valid, deterministic Verilog designs, feeds each to a synthesis tool, and uses an SMT solver or the ABC~\cite{brayton10_abc} circuit verification tool to check that the output is logically equivalent to the input. If they are not equivalent, it has found a bug. \verismith{} then iteratively reduces the Verilog design with the aim of finding the smallest (and hence most understandable) program that still triggers a bug. An example of such a bug that was found by \verismith{} is shown in Figure~\ref{fig:synth_bug}. The test case has been tweaked manually for readability, but it was found and reduced automatically by \verismith{}.

\begin{table}
  \renewcommand\tabcolsep{1mm}
  \centering
  \begin{tabular}{lllp{2cm}} \toprule
    \textbf{Tool} & \textbf{Vendor} & \textbf{License} & \textbf{Versions} \\
    \midrule
    \xst{}~\cite{xilinx_xst_synth_overv} & \xilinx{} & Commercial & 14.7 \\
    \vivado{}~\cite{xilinx_vivad_desig_suite} & \xilinx{} & Commercial & 2016.1, 2016.2, 2017.4, 2018.2, 2019.1
    \\
    \quartus{}~\cite{intel_intel_quart} & \intel{} & Commercial & 19.2 \\
    \quartuslite{}~\cite{intel_intel_quart} & \intel{} & Commercial & 19.1 \\
    \yosys{}~\cite{wolf_yosys_open_synth_suite} & \yosys{} & ISC License & 0.8, 0.9, 3333e00, 70d0f38  \\
    \bottomrule
  \end{tabular}
  \caption{Versions of the synthesis tools that were tested.}\label{table:tool_version}
\end{table}

We ran \verismith{} on five major synthesis tools for FPGAs for a total of 18000 CPU hours, as can be seen in Table~\ref{table:tool_version}. We found two classes of bugs: Verilog designs that cause the synthesis tool to produce incorrect output, and Verilog designs that cause the synthesis tool to crash. We reported a total of 7 unique test cases that are mis-synthesised: 4 in \yosys{}, 3 in \vivado{}. We also reported 3 unique crash bugs: 1 in \yosys{} and 2 in \vivado{}. In addition, a bug was also found in \iverilog{}~\cite{williams_icarus_veril}, which is a simulator used in \verismith{} to check counterexamples returned by the equivalence check. All 11 test cases have been reported to the manufacturers; the \yosys{} and \iverilog{} bugs have all been confirmed and fixed, whereas the \vivado{} bugs have been confirmed and are awaiting a fix. Testing \quartus{} proved difficult, however, once it worked, we did not find any failing test cases, meaning it was quite stable. Failing test cases found in \xst{} were not reported, as it is no longer being actively maintained.

The three main contributions of this paper are the following:

\begin{itemize}
\item We present an algorithm for generating random, valid, deterministic Verilog designs that employ a variety of combinational and behavioural constructs, shown in Section~\ref{sec:generation}.
\item We explain how to check whether a given Verilog design triggers a bug in a synthesis tool under test in Section~\ref{sec:equivalence} and then present an algorithm for reducing Verilog designs to find the smallest program that triggers the bug in Section~\ref{sec:reducer}.
\item Finally, in Section~\ref{sec:results}, we report the results of synthesising our generated programs using three logic synthesis tools and evaluate how our design decisions affect the ability of \verismith{} to find bugs in these tools.
\end{itemize}

%% TODO: Think about what to do with the artifacts
\verismith{} is fully open source and can be found on GitHub.\personallink{\url{https://github.com/ymherklotz/verismith}} %All the runs that were performed are also hosted on AWS\personallink{\url{http://verifuzz-data.s3.amazonaws.com}}.

\section{Overview of Verismith}\label{sec:verismith}

\verismith{} is the implementation of the Verilog generation and test case reduction algorithm with the goal of finding bugs in synthesis tools. Figure~\ref{fig:diff_testing} shows the main workflow in \verismith{}. First, a random design is generated and passed to the synthesis tool, which should produce an equivalent netlist. If the synthesis tool crashes, a bug has been found and the initial design would therefore be reduced to a minimal test case that still triggers the crash. This is shown by the red dashed arrow which shortcuts an error occurring in synthesis to the reduction step. However, if synthesis completes successfully, the netlist is compared to the initial design. If they differ, the resultant design needs to be reduced as well, which is depicted by the other red dashed arrow.

\verismith{} generates semantically correct and deterministic Verilog, meaning that it should always pass synthesis and the values of output wires should be uniquely determined by values of the input wires. An equivalence check can therefore be performed between the generated design and the synthesised netlist to determine if the synthesis was correct. If the netlist is shown not to be equivalent to the design, it must mean that there is a synthesis bug, as false negatives and false positives are not possible. However, there is the possibility that the SMT solver does not give an answer in time, in which case it cannot be determined if the design is equivalent to the netlist.

\begin{figure}
  \centering
  \begin{tikzpicture}
    \node[draw,fill=diag1,text width=2cm,align=center] (verilog) at (0,0)
    {Verilog generation};
    \node[draw,fill=diag2,circle,text width=1.5cm,align=center,inner sep=-3pt]
    (verdes) at (2.5,0) {Verilog design};
    \node[draw,fill=diag2,circle,text width=1.5cm,align=center,inner sep=-3pt]
    (vernet) at (6.5,0) {Verilog netlist};
    \node[draw,fill=diag2,circle,text width=1.5cm,align=center,inner sep=-3pt]
    (vertest) at (0,-2) {Reduced test case};
    \node[draw,fill=diag1,minimum height=1cm] (synth) at (4.5,0) {Synthesis};
    \node[draw,fill=diag1,minimum height=1cm,text width=2cm,align=center]
    (equiv) at (6,-2) {Equivalence check};
    \node[draw,fill=diag1,minimum height=1cm,text width=2cm,align=center]
    (reduce) at (2.5,-2) {Reduction};
    \node at (4.3,-2.4) {{\color{rred} \small fail}};
    \node at (3.8,-1.2) {{\color{rred} \small crash}};
    \draw[thick,->] (verilog) -- (verdes);
    \draw[thick,->] (verdes) -- (synth);
    \draw[thick,->] (synth) -- (vernet);
    \draw[thick,->] (verdes) -- ($ (equiv.north) + (-0.5,0) $);
    \draw[thick,->] (vernet) -- ($ (equiv.north) + (0.5,0) $);
    \draw[rred,thick,dashed,->] (synth.south) to [out=270,in=0] ($ (reduce.east) + (0,0.25) $);
    \draw[thick,->] (verdes) -- ($ (reduce.north) $);
    \draw[rred,thick,dashed,->] (equiv) to [out=180,in=0] ($ (reduce.east) + (0,-0.25) $);
    \draw[thick,->] (reduce) -- (vertest);
  \end{tikzpicture}
  \caption{Overview of the testing approach used in \verismith{} by generating random Verilog. If the synthesis tool crashes or the equivalence check fails, the test case is reduced into a minimal representation, shown by the red dashed arrows.}\label{fig:diff_testing}
\end{figure}

\verismith{} was implemented in Haskell because its algebraic data types are well-suited for capturing the syntax of a language like Verilog, and its pure functions make it easier to reason about and test functions.

\section{Generating Verilog}\label{sec:generation}

To test the synthesis tools, valid random Verilog needs to be generated so that the synthesis tool successfully produces a netlist that can be compared to the original design.

\subsection{Target language}

The synthesisable subset of Verilog 2005~\cite{05_veril_regis_trans_level_synth} was chosen as the target HDL as it is widely supported. Every generated Verilog file contains a list of module definitions with an arbitrary number of inputs and parameters, and one output. The module body consists of a list of module items, which can be any of the following constructs:

\begin{itemize}
\item continuous assignment,
\item local parameter declaration,
\item module instantiation,
\item always and initial block, and
\item wire and variable declaration.
\end{itemize}

Inside always blocks and initial blocks, behavioural Verilog can then be generated, which supports the following constructs:

\begin{itemize}
\item blocking and nonblocking assignments,
\item conditional statements, and
\item for loops.
\end{itemize}

Finally, many different expressions are supported, such as:

\begin{itemize}
\item all synthesisable unary and binary operators,
\item bit selection,
\item function calls,
\item concatenation, and
\item ternary conditional operators.
\end{itemize}

The most notable features that are missing from this grammar subset are function and task definitions, but we expect adding them to be straightforward. In addition to that, the synthesisable subset of Verilog specifies many constructs that should be ignored by synthesis tools, so these are also not generated by \verismith{}. These are constructs that do not have a direct hardware equivalent and include delays, specify blocks or system task calls. Although initial blocks and variable initialisation are supposed to be ignored by the synthesis tool, except when modelling ROMs~\cite[section 7.7.9.1]{05_veril_regis_trans_level_synth}, both of these features are supported in all the synthesis tools that were tested for FPGAs, as these can be powered up into a known state. These are therefore used by \verismith{} to set the design in a known state at the start.

\subsection{Properties of generated programs}

The data types specifying the syntax strictly follow the grammar, with the result that only syntactically valid programs can be represented. However, this cannot guarantee that the programs are semantically valid as well.

We define semantically valid programs as ones that should be accepted by simulators and synthesis tools without producing any errors or critical warnings. Even though random programs following the grammar will pass the parsing stage of these tools, they will most likely error out when they are processed. To ensure semantic validity, several rules have to be followed, including:

\begin{itemize}
\item module inputs have to be nets, 
\item no continuous assignments to variables or blocking/non\-blocking assignments to nets occur, and
\item every variable that is used has to be declared.
\end{itemize}
% Change to what it says about determinism

Moreover, the generated Verilog should be deterministic. This simplifies the equivalence checking stage, as there cannot be any false positives or false negatives. Therefore the equivalence check indicates if there is a bug or not. In addition, we argue that bugs found using purely deterministic Verilog are more severe than bugs that include undefined values, as most Verilog in production is deterministic and it is generally bad practice to write Verilog that depends on undefined values. Furthermore, undefined values might mask bugs in the synthesis tool by allowing it to optimise away large sections of the design. Therefore, other Verilog patterns have to be avoided even though they are semantically and syntactically correct. For example, some constructs that lead to nondeterministic behaviour are the following:

\begin{itemize}
\item driving a wire from two different sources,
\item dividing by zero,
\item passing fewer bits to a module than it expects,
\item selecting bits that are out of range, and
\item using a net that has not been declared previously.
\end{itemize}

\subsection{Generation algorithm}\label{sec:generation_algorithm}

\begin{figure}
  \centering
\begin{minted}{js}
[probability]
  expr.binary = 5
  expr.concatenation = 3
  expr.number = 1
  expr.rangeselect = 5
  expr.signed = 5
  expr.string = 0
  expr.ternary = 5
  expr.unary = 5
  expr.unsigned = 5
  expr.variable = 5
  moditem.assign = 5
  moditem.combinational = 1
  moditem.instantiation = 1
  moditem.sequential = 1
  statement.blocking = 0
  statement.conditional = 1
  statement.forloop = 1
  statement.nonblocking = 3

[property]
  module.depth = 2
  module.max = 5
  output.combine = false
  sample.method = "random"
  sample.size = 10
  size = 20
  statement.depth = 3
\end{minted}
  \caption{Configuration file used to tweak properties of the generated program including frequencies of constructs.}\label{fig:config}
\end{figure}

To generate Verilog that avoids these constructs, the syntax tree is built sequentially, line by line, using a context to keep track of important facts about previously generated code. This inherently prohibits combinational loops, as all the values being assigned to the current wire will already have been assigned beforehand. The context contains: (1) a list of nets and variables that are declared and assigned in the current context, (2) a list of modules that can safely be instantiated and (3) a list of parameters that are available in the module. These are used to safely create module items and make sure that they do not introduce undefined values or race conditions. When creating a new variable assignment, the nets can be safely taken from the context as these are guaranteed to have been assigned previously.

The context also contains relative frequencies which are attached to each construct that can be generated. These determine how often a construct will appear in the output. We tweaked these manually using a configuration file, shown in Figure~\ref{fig:config}, to obtain a good coverage of all the features, while keeping the synthesis and equivalence checking time to a minimum. In particular, maximum statement list length and depth were heavily tweaked to reduce the nesting depth of statements, as that would increase the synthesis time and equivalence checking time exponentially. Operations like divide and modulo were removed for most of the testing, because with nets or variables containing a large number of bits, the circuits generated by the synthesis tools were too large to be efficiently optimised and checked for equivalence.

The output of the Verilog generation is a Verilog file containing multiple modules. The entry point is a top-level module. For every module, a random number of inputs with random sizes are chosen and added to the context. A clock for sequential blocks and an output port are also added. Random parameters are also declared and added to the context so that they are available in any expressions inside the module. Finally a list of module items are generated.

To ensure that expressions remain deterministic, extra checks are performed to ascertain that no undefined values are added to an expression. Checks can either be performed statically at generation or dynamically at runtime. For example, if bits are selected from a net, the size of the net is checked against the range of the selection statically. However, runtime checks need to be added to operations like division to check against division by zero. This is similar to the safe math wrappers that are used in an existing C fuzzer called Csmith~\cite{yang11_findin_under_bugs_c_compil} to avoid undefined behaviour like signed overflow.

% \begin{enumerate}
% \item A construct is selected using a frequency distribution from all valid constructs that could be inserted at that point. The frequencies are defined in a configuration file passed to \verismith{} when it is invoked. If needed, checks are performed to ensure that the chosen construct is valid at the current position. For example, a conditional statement cannot be generated if the maximum statement depth has been reached. Another example is that nonblocking assignment cannot be generated in a combinational block.
% \item If a new net or variable is being defined in the construct, for example in an assignment or in a for loop, it is added to the context and declared properly at the start of the module. If the construct requires existing variables or nets to be used in an expression or as arguments to a module instantiation, they are taken from the current context.
% \item To ensure that expressions remain deterministic, extra checks are performed to ascertain that no undefined values are added to an expression. Checks can either be performed statically at generation or dynamically at runtime. For example, if bits are selected from a variable or net, the size of the net is checked against the range of the selection statically. However, runtime checks need to be added to operations like division to check against division by zero. This is similar to the safe math wrappers that are used in an existing C fuzzer called Csmith~\cite{yang11_findin_under_bugs_c_compil} to avoid undefined behaviour like signed overflow.
% \end{enumerate}

\begin{figure}
  \inputminted[breaklines]{verilog}{./data/example_gen.v}
  \caption{Example module generated by \verismith{} showing the distinct sections that are produced.}\label{fig:gen_module}
\end{figure}

Once the module items have finished generating, all the internally declared variables and nets are concatenated and assigned to the output, so that any discrepancies in the internals of the module are detected by the formal verification step. An example of a generated module is shown in Figure~\ref{fig:gen_module}, which declares many variables and concatenates them to the output \texttt{y}. Figure~\ref{fig:gen_module} also shows the different sections that are created by \verismith{}. First come all the declarations of the nets and variables that are used and assigned to somewhere in the body of the module. Then follows the assignment of the internal state of the module to the output wire \texttt{y}, so that any errors in any of the assignments will be detected in the output. Finally, the main body of the module contains a list of random constructs, which were generated according to the configuration file that was passed to \verismith{}.

\begin{remark}
If the number of IO ports is limited, because the design first needs to fit onto an actual FPGA such as in \quartuslite{}, the output can be reduced to one bit by applying the XOR reduction operator. For example, the concatenation

\begin{minted}[linenos=false]{verilog}
assign y = {reg3,reg4,reg5,wire6,wire7,reg8,wire9};
\end{minted}

\noindent can be changed to

\begin{minted}[linenos=false]{verilog}
assign y = ^{reg3,reg4,reg5,wire6,wire7,reg8,wire9};
\end{minted}

\noindent This continues to hold all the necessary information to detect a single bitflip in the internal state of the module, however, synthesis and verification are much slower because of the larger circuit. We demonstrate this empirically in Section~\ref{sec:results}.
\end{remark}

% Finally, state-based Verilog generation was chosen over a simpler directed acyclic graph (DAG) generation method. The DAG method generates a random DAG which is interpreted as a circuit with random gates at every node and wires at every edge. More complicated constructs such as flip-flops could be represented by adding cycles to the graph where one of the edges would be clocked, which preserves the determinism. However, the main problem with this method is that representing behavioural Verilog constructs is not possible and it would therefore only be able to generate random netlists. The state-based Verilog generation, on the other hand, directly generates the abstract syntax tree (AST) and can therefore generate any constructs that are present in the grammar.

\section{Equivalence checking}\label{sec:equivalence}

The equivalence check is a crucial step in verifying that the synthesis tool behaved properly by proving that the synthesised netlist is equivalent to the original design. The equivalence check itself is performed using \yosys{}~\cite{wolf_yosys_open_synth_suite} and the ABC~\cite{brayton10_abc} back end. However, an SMT solver such as Z3~\cite{moura08_z3} can also be used as a back end to perform the equivalence check. As only deterministic Verilog is being tested, which does not contain any undefined values, the equivalence check proves that the design is equivalent to the netlist over all possible inputs.

The equivalence is checked using the following property: the output wires of the randomly generated Verilog design and the synthesised netlist should always be equal at the clock edge given the same inputs. This is expressed in Verilog by instantiating both modules and asserting that the outputs are equal:

\begin{minted}{systemverilog}
module equiv( input clk, input [6:0] w0, input [10:0] w1
            , input signed [10:0] w2, input [11:0] w3 );
   wire [49:0] y1, y2;
   top t1(y1, clk, w0, w1, w2, w3);
   top_synth_netlist t2(y2, clk, w0, w1, w2, w3);
   always @(posedge clk) assert(y1 == y2);
endmodule
\end{minted}

\noindent where \texttt{y1} and \texttt{y2} are the outputs of the design and the netlist respectively. The Verilog is passed through \yosys{} synthesis to obtain either SMT-LIBv2~\cite{barrett17_smt_lib_stand} for an SMT solver, or a netlist for ABC.\@

As this process is performed by \yosys{} itself, when testing \yosys{} synthesis there might be bugs that are not found by the equivalence check, as the same bug will be present when design is passed to the SMT solver or to ABC.\@ However, bugs can still be found in optimisations that \yosys{} only applies when it is properly synthesising the design instead of passing it to an external solver. In addition to that, if \yosys{} is tested with multiple other synthesis tools, the synthesised netlist produced by \yosys{} can be compared to the other synthesised netlist instead of the original design. Therefore, the bug in \yosys{} should not trigger anymore as it is only synthesising two netlists. Finally, a test bench can also be created for the generated design and the netlist by passing random test vectors to both top-level modules and checking that the output remains the same.

After the equivalence check is performed, the checker returns a counterexample which can be added to a testbench and hence simulated, to make sure that it does indeed expose a difference between the design and the netlist.

\section{Test case reduction}\label{sec:reducer}

% - describe the reason it is needed
% - describe specialised reduction algorithm

Reducing an HDL is different from reducing programming languages. The time it takes to discover the presence of an error is much higher with synthesis and equivalence checking than with compilation and execution. Existing reduction methods such as delta debugging~\cite{zeller02_simpl_isolat_failur_induc_input} or hierarchical delta debugging~\cite{misherghi06_hdd} are effective at reducing failing test cases for programming languages, but rely on a quick feedback loop which tells the reducer if the current version of the test case still triggers a bug, i.e.\ that it is still interesting.

\setlength{\tabcolsep}{3pt}

\begin{table*}
  \centering
  \begin{tabular}{lrp{5mm}r@{~}rrr}
    \toprule
    \textbf{Tool} & \textbf{Total test cases} & \multicolumn{3}{r}{\textbf{Failing test cases}} & \textbf{Distinct failing test cases} & \textbf{Bug reports} \\
    \midrule
    \yosys{ 0.8}                                            & 26400 & & 7164 & (27.1\%) & $\ge 1$ & 0 \\
    \yosys{ 3333e00}                                        & 51000 & & 7224 & (14.2\%) & $\ge 4$ & 3 \\
    \yosys{ 70d0f38} (crash)                                        & 11 & & 1 & (9.09\%) & $\ge 1$ & 1 \\
    \yosys{ 0.9}                                            & 26400 & & 611  & (2.31\%) & $\ge 1$ & 1 \\
    \vivado{ 18.2}                                          & 47992 & & 1134 & (2.36\%) & $\ge 5$ & 3 \\
    \vivado{ 18.2} (crash)                                & 47992 & & 566  & (1.18\%) & 5       & 2 \\
    \xst{ 14.7}                                             & 47992 & & 539  & (1.12\%) & $\ge 2$ & 0 \\
    \quartus{ 19.2}                                         & 80300 & & 0    & (0\%)    & 0       & 0 \\
    \quartuslite{ 19.1}                                    & 43    & & 17   & (39.5\%) & 1       & 0 \\
    \quartuslite{ 19.1} (No \texttt{\$signed})             & 137   & & 0    & (0\%)    & 0       & 0 \\
    \midrule
    \iverilog{ 10.3}                                        & 26400 & & 616  & (2.33\%) & $\ge 1$  & 1 \\
    \bottomrule
  \end{tabular}
  \caption{Summary of failing test cases found in each tool that was tested.}\label{table:unique_bugs}
\end{table*}

Therefore, we developed a general reduction approach similar to hierarchical delta debugging to speed up the reduction of an arbitrary Verilog design. Then, due to the structure of the random test cases, an optimisation can be added to improve the efficiency when dealing with those test cases. As the reducer is tightly coupled with the generator, the original AST can be used to analyse the source and reduce it further. However, the reducer still works in a standalone manner for the supported subset of Verilog.

The main goal of the reduction algorithm is to quickly reduce the size of the program, so that it can be analysed as soon as possible and testing can be resumed. As synthesis and verification are time consuming, even for small designs, it is crucial to minimise the number of steps needed by the reduction algorithm. To achieve this, a depth-first binary search is performed on the syntax tree, with different levels of granularity. At every node, the current program is checked against the synthesis tool to check that the bug is still present.

The steps of the general reduction are detailed below. Each of the steps will result in a binary choice which is explored in a greedy fashion, meaning that the first option is taken until the bug is not present anymore. The second option is only explored if the first option would eliminate the bug from the design. If the bug is not present there either, the reduction algorithm backtracks to the last version that still contains the failing test case and terminates. Each of the following steps is repeated until it cannot be applied anymore without eliminating the bug.

\begin{enumerate}
\item Half the modules, excluding the top-level module, are removed from the current Verilog file. All instantiations of modules that were removed and that are present in the leftover modules are eliminated as well. Any wires that were connected to the output ports in the module instantiations are also pruned and any references to those modules in any expressions are also removed so that no extra undefined values are added.
\item Half of the module items are removed. For all the assignments that are removed, the declaration of the variable or net that was being assigned is also removed together with any uses of the variable or net in any expressions in that module.
\item The same is done for all the statements inside the always blocks. Half of the assignments in each always block are removed. If a conditional statement is encountered, it is reduced by picking one of the branches, and choosing the other if the bug is not present anymore.
\item Finally, expressions are simplified and reduced to narrow down on the bug. Concatenations are split into two, and binary operators are removed and replaced by their LHS and RHS.\@
\end{enumerate}

By reducing the Verilog at different levels of granularity, as much code as possible is removed at every step. As a binary choice is made at every step, the reduction will converge to a result, which may, however, not be optimal.

An optimisation can be carried out if the test case was generated using \verismith{}, because all the internal variables and nets are concatenated and added to the only output. The binary search can then be performed only on the concatenation, while deleting all references to variables or nets that are removed from the concatenation. Once the wire in question is found, the standard reduction algorithm can attempt to reduce it further. However, normally only one assignment is left which can consequently be identified manually.

This optimisation only helps if the test case is processed sensibly by the synthesis tool, where the netlist is produced but is not equivalent to the input. It may fail to reduce the test cases properly when the synthesis tool crashes, as it only operates on the output of the module, which may not be the cause of the crash.

\begin{remark}
Another optimisation that could be performed to increase the speed of the reduction is to use the counterexample provided by the equivalence checker in iterations of the reducer using simulation, instead of rerunning the equivalence check at every iteration.
\end{remark}

\section{Evaluation}\label{sec:results}

\verismith{} was run for 18000 CPU hours, testing the following synthesis tools: \vivado{}~\cite{xilinx_vivad_desig_suite}, \xst{}~\cite{xilinx_xst_synth_overv}, \quartus{}~\cite{intel_intel_quart}, \quartuslite{}~\cite{intel_intel_quart}, and \yosys{}~\cite{wolf_yosys_open_synth_suite}. Our experiments were designed to answer five questions. First, how many bugs can we detect in various synthesis tools? Second, how does increasing the size of the generated Verilog designs affect the efficiency of our testing approach? Third, what is the effect of XOR-ing all the outputs of a design into a single wire? Fourth, how does the stability of synthesis tools change with each new release? Finally, how does our custom Verilog test case reducer fare against an existing delta-debugging tool called C-Reduce~\cite{regehr12_test_reduc_c_compil_bugs}?

\begin{remark}
  Many problems were encountered when testing \quartus{}. The lack of documentation about the d-flip-flop used by \quartus{} required a lot of trial and error to get it working properly. In addition to that, optimisations in \quartus{}, such as multiply-accumulate, had to be disabled as these would use hardware of the specific FPGA that was targeted, therefore using encrypted modules. Finally, to be able to generate a Verilog netlist using \quartuslite{}, the design had to be first fitted on a real FPGA, which meant that virtual pins had to be used to fit all the IO ports, or the output had to be combined into one bit.
\end{remark}

\subsection{How many bugs were found?}

Table~\ref{table:unique_bugs} presents the breakdown of all the bugs found in the synthesis tools. These tools were tested separately and were therefore given different test cases. The only tools where we observed any crashes were \vivado{} and \yosys{}. Crashes in \vivado{} are therefore shown in a separate row. The crash found in \yosys{} was found when briefly testing a different development version of \yosys{ (commit hash 70d0f38)}. All the other rows of the table refer only to test cases that mis-synthesise. The only tool where all test cases succeeded was \quartus{}, even after ramping up the number of test cases to over 80,000. In a surprising contrast, we found the highest percentage of failures in its sister tool, \quartuslite{}. We only ran a small number of test cases many were failing. Upon inspection, we found that these failures could all be traced back to the \texttt{\$signed} function, which converts an expression to signed and sign extends it if necessary. When we disabled this construction in our random generator, we found no further failures. Alongside the various Verilog synthesis tools, we also tested a Verilog simulator called \iverilog{}, which was used to check the counter examples returned by the equivalence checker. This revealed one bug in the simulator, which was reported and fixed.

A test case failure was identified as being unique if one minimal test case could not be reduced into a different minimal test case. This often required manual intervention to go from the automatically reduced test case to the minimal test case that could be compared to the bugs that had already been identified. One caveat regarding identifying unique mis-synthesis bugs has to be noted, because there may be one bug in the synthesis tool that is identified multiple times with different unique minimal test cases. For the open source synthesis tools, this can be verified by following the bug report, however, for commercial synthesis tools, this cannot be checked. Unique crashes were easier to identify, as the tools dump a stack trace that shows the exact position where the crash occurred. Therefore, crashes at different positions could be identified as being unique.

Only the failing test cases in \yosys{} can be analysed properly, as the other failing test cases have either not been fixed, or the fix itself has not been disclosed. \yosys{} was tested using three different versions, the stable version of \yosys{ 0.8} before any of our bug fixes were integrated, the master branch of \yosys{ (commit hash 3333e00)} as it was being developed, just before our bug fixes were introduced and finally the most recent stable release which is \yosys{ 0.9}. The master branch of \yosys{} was tested to assist the current development of Yosys and be able to report bugs before they affect users.

All the bugs mentioned were reported and confirmed by the tool maintainers and vendors. Most of the \yosys{} bugs were fixed within the day, whereas \xilinx{} confirmed the bug and intend to ``fix this issue in future releases''.\bugtracker{https://forums.xilinx.com/t5/Synthesis/Vivado-2019-1-Bit-selection-synthesis-mismatch/m-p/982632\#M31484} The following subsections show examples of bugs that were found.

\subsubsection[\yosys{} peephole
  optimisation]{\yosys{} peephole
  optimisation.\bugtracker{\url{https://github.com/YosysHQ/yosys/issues/1047}}}

An example of a bug in \yosys{} is in the peephole optimisation pass as shown below. A peephole optimisation is the replacement of a specific sequence of instructions with a more efficient but equivalent sequence of instructions.

\begin{minted}[]{verilog}
module top (y, w);
   output y;
   input [2:0] w;
   assign y = 1'b1 >> (w * (3'b110));
endmodule
\end{minted}

The piece of code above was identified for a peephole optimisation which optimised the multiply and shift operations where one of the operands in the multiply is constant. However, the code above was optimised in a way that did not truncate the result of the multiplication, meaning that if the input is \texttt{w = 3'b100}, \texttt{y} would be set to 0 because the shift amount would be set to \texttt{6'b011000} instead of the correct value \texttt{3'b000}. Therefore, the correct output of the module should be 1 when \texttt{w} is set to \texttt{3'b100}.

\subsubsection[\yosys{} peephole crash]{\yosys{} peephole crash.\bugtracker{\url{https://github.com/YosysHQ/yosys/issues/993}}}

A crash was also found in the peephole optimisation, which is shown in the code below.

\begin{minted}[]{verilog}
module top(y, clk, wire1);
   output [1:0] y;
   input clk;
   input [1:0] wire1;
   reg [1:0] reg1 = 0, reg2 = 0;
   assign y = reg2;
   always @(posedge clk) reg1 <= wire1 == 1;
   always @(posedge clk) reg2 <= 1 >> reg1[1:1];
endmodule
\end{minted}

It will shift 1 to the right by one bit if the second bit in \texttt{reg1} is set. However, the code performing the optimisation did not check the vector’s length before attempting to access the last element in the vector. This therefore led to the code crashing because it tried to index an element outside of the vector.

\subsubsection[\vivado{} bug]{\vivado{}
  bug.\bugtracker{\url{https://forums.xilinx.com/t5/Synthesis/Vivado-2019-1-Bit-selection-synthesis-mismatch/m-p/982632\#M31484}}}

Another bug was found which ignored the bit selection. The code sample for that bug is slightly more complex.
\vspace{1.5em}

\begin{minted}{verilog}
module top (y, clk, w0);
   output [1:0] y;
   input clk;
   input [1:0] w0;
   reg [2:0] r1 = 3'b0;
   reg [1:0] r0 = 2'b0;
   assign y = r1;
   always @(posedge clk) begin
      r0 <= 1'b1;
      if (r0) r1 <= r0 ? w0[0:0] : 1'b0;
      else r1 <= 3'b1;
   end
endmodule
\end{minted}

For an input of \texttt{w0 = 2b'10} for two clock cycles, the final output should be \texttt{2'd0}, because the if statement is entered on the second clock cycle and the LSB of \texttt{w0} is assigned to \texttt{r1}, which is \texttt{1'b0}. However, with \vivado{} the output is \texttt{2'10} instead, meaning \vivado{} does not truncate the value of the input to the LSB in \texttt{w0[0:0]}.

\subsection{How does program size affect efficiency?}

\begin{figure*}
  \centering
  \begin{subfigure}{0.45\textwidth}
    \begin{tikzpicture}
      \begin{semilogxaxis}[width=\linewidth,
        major x tick style=transparent,
        xlabel=Average lines of code in generated programs,
        legend style={at={(0.5,-0.15)},anchor=north,legend columns=-1},
        ylabel=Number of test cases,
        legend pos=north east,
        legend style={font=\scriptsize},
        xmin=50,
        xmax=8800,
        ybar=-3.6pt,
        bar width=3.6pt,
        ymin=0
        % nodes near coords,
        % nodes near coords align={vertical}
        ]
        \addplot[style={distr1,fill=distr1,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,36) (55.2273245010972,42) (61.0011474309897,113) (67.3786032822108,210) (74.422799757947,341) (82.2034422502463,458) (90.7975236052311,501) (100.290086000843,507) (110.77506247623,353) (122.356206440081,241) (135.148118355792,115) (149.27737976296,39) (164.88380585685,11) (182.121829020643,3) (201.162027001128,0) (222.192810849795,0) (245.422289332251,0) (271.080328255088,0) (299.420825088154,0) (330.724221390595,0) (365.300277902221,0) (403.491139760939,0) (445.674722177899,0) (492.268450073103,0) (543.733388676789,0) (600.578805970684,0) (663.36721211649,0) (732.71992573891,0) (809.323222144053,0) (893.935124312463,0) (987.392903866661,0) (1090.6213662385,0) (1204.64200202167,0) (1330.58309506604,0) (1469.69088733773,0) (1623.34191102612,0) (1793.05660992942,0) (1980.51438490818,0) (2187.5702122883,0) (2416.2729996596,0) (2668.88586070881,0) (2947.90850971511,0) (3256.10199731164,0) (3596.51603228401,0) (3972.51915976695,0) (4387.83209446551,0) (4846.56453874751,0) (5353.25584993839,0) (5912.91995923888,0) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0) 
        };
        \addplot[style={distr2,fill=distr2,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,1) (67.3786032822108,2) (74.422799757947,1) (82.2034422502463,6) (90.7975236052311,12) (100.290086000843,29) (110.77506247623,52) (122.356206440081,99) (135.148118355792,128) (149.27737976296,191) (164.88380585685,219) (182.121829020643,323) (201.162027001128,355) (222.192810849795,294) (245.422289332251,209) (271.080328255088,137) (299.420825088154,71) (330.724221390595,28) (365.300277902221,14) (403.491139760939,2) (445.674722177899,1) (492.268450073103,0) (543.733388676789,0) (600.578805970684,0) (663.36721211649,0) (732.71992573891,0) (809.323222144053,0) (893.935124312463,0) (987.392903866661,0) (1090.6213662385,0) (1204.64200202167,0) (1330.58309506604,0) (1469.69088733773,0) (1623.34191102612,0) (1793.05660992942,0) (1980.51438490818,0) (2187.5702122883,0) (2416.2729996596,0) (2668.88586070881,0) (2947.90850971511,0) (3256.10199731164,0) (3596.51603228401,0) (3972.51915976695,0) (4387.83209446551,0) (4846.56453874751,0) (5353.25584993839,0) (5912.91995923888,0) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0)
        };
        \addplot[style={distr3,fill=distr3,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,0) (67.3786032822108,0) (74.422799757947,0) (82.2034422502463,0) (90.7975236052311,0) (100.290086000843,0) (110.77506247623,0) (122.356206440081,0) (135.148118355792,0) (149.27737976296,0) (164.88380585685,0) (182.121829020643,0) (201.162027001128,0) (222.192810849795,1) (245.422289332251,0) (271.080328255088,3) (299.420825088154,7) (330.724221390595,9) (365.300277902221,12) (403.491139760939,32) (445.674722177899,46) (492.268450073103,74) (543.733388676789,93) (600.578805970684,129) (663.36721211649,166) (732.71992573891,186) (809.323222144053,171) (893.935124312463,116) (987.392903866661,58) (1090.6213662385,43) (1204.64200202167,16) (1330.58309506604,2) (1469.69088733773,1) (1623.34191102612,0) (1793.05660992942,0) (1980.51438490818,0) (2187.5702122883,0) (2416.2729996596,0) (2668.88586070881,0) (2947.90850971511,0) (3256.10199731164,0) (3596.51603228401,0) (3972.51915976695,0) (4387.83209446551,0) (4846.56453874751,0) (5353.25584993839,0) (5912.91995923888,0) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0)
        };
        \addplot[style={distr4,fill=distr4,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,0) (67.3786032822108,0) (74.422799757947,0) (82.2034422502463,0) (90.7975236052311,0) (100.290086000843,0) (110.77506247623,0) (122.356206440081,0) (135.148118355792,0) (149.27737976296,0) (164.88380585685,0) (182.121829020643,0) (201.162027001128,0) (222.192810849795,0) (245.422289332251,0) (271.080328255088,0) (299.420825088154,0) (330.724221390595,0) (365.300277902221,0) (403.491139760939,0) (445.674722177899,0) (492.268450073103,2) (543.733388676789,2) (600.578805970684,6) (663.36721211649,12) (732.71992573891,14) (809.323222144053,30) (893.935124312463,56) (987.392903866661,65) (1090.6213662385,84) (1204.64200202167,103) (1330.58309506604,123) (1469.69088733773,109) (1623.34191102612,66) (1793.05660992942,45) (1980.51438490818,18) (2187.5702122883,10) (2416.2729996596,1) (2668.88586070881,0) (2947.90850971511,0) (3256.10199731164,0) (3596.51603228401,0) (3972.51915976695,0) (4387.83209446551,0) (4846.56453874751,0) (5353.25584993839,0) (5912.91995923888,0) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0)
        };
        \addplot[style={distr5,fill=distr5,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,0) (67.3786032822108,0) (74.422799757947,0) (82.2034422502463,0) (90.7975236052311,0) (100.290086000843,0) (110.77506247623,0) (122.356206440081,0) (135.148118355792,0) (149.27737976296,0) (164.88380585685,0) (182.121829020643,0) (201.162027001128,0) (222.192810849795,0) (245.422289332251,0) (271.080328255088,0) (299.420825088154,0) (330.724221390595,0) (365.300277902221,0) (403.491139760939,0) (445.674722177899,0) (492.268450073103,0) (543.733388676789,0) (600.578805970684,0) (663.36721211649,2) (732.71992573891,0) (809.323222144053,2) (893.935124312463,14) (987.392903866661,16) (1090.6213662385,29) (1204.64200202167,50) (1330.58309506604,76) (1469.69088733773,83) (1623.34191102612,99) (1793.05660992942,99) (1980.51438490818,81) (2187.5702122883,49) (2416.2729996596,35) (2668.88586070881,16) (2947.90850971511,3) (3256.10199731164,2) (3596.51603228401,1) (3972.51915976695,1) (4387.83209446551,0) (4846.56453874751,0) (5353.25584993839,0) (5912.91995923888,0) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0)
        };
        \addplot[style={distr6,fill=distr6,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,0) (67.3786032822108,0) (74.422799757947,0) (82.2034422502463,0) (90.7975236052311,0) (100.290086000843,0) (110.77506247623,0) (122.356206440081,0) (135.148118355792,0) (149.27737976296,0) (164.88380585685,0) (182.121829020643,0) (201.162027001128,0) (222.192810849795,0) (245.422289332251,0) (271.080328255088,0) (299.420825088154,0) (330.724221390595,0) (365.300277902221,0) (403.491139760939,0) (445.674722177899,0) (492.268450073103,0) (543.733388676789,0) (600.578805970684,0) (663.36721211649,0) (732.71992573891,0) (809.323222144053,0) (893.935124312463,0) (987.392903866661,0) (1090.6213662385,2) (1204.64200202167,2) (1330.58309506604,1) (1469.69088733773,2) (1623.34191102612,4) (1793.05660992942,15) (1980.51438490818,16) (2187.5702122883,28) (2416.2729996596,49) (2668.88586070881,58) (2947.90850971511,66) (3256.10199731164,50) (3596.51603228401,59) (3972.51915976695,37) (4387.83209446551,15) (4846.56453874751,7) (5353.25584993839,1) (5912.91995923888,0) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0)
        };
        \addplot[style={distr7,fill=distr7,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,0) (67.3786032822108,0) (74.422799757947,0) (82.2034422502463,0) (90.7975236052311,0) (100.290086000843,0) (110.77506247623,0) (122.356206440081,0) (135.148118355792,0) (149.27737976296,0) (164.88380585685,0) (182.121829020643,0) (201.162027001128,0) (222.192810849795,0) (245.422289332251,0) (271.080328255088,0) (299.420825088154,0) (330.724221390595,0) (365.300277902221,0) (403.491139760939,0) (445.674722177899,0) (492.268450073103,0) (543.733388676789,0) (600.578805970684,0) (663.36721211649,0) (732.71992573891,0) (809.323222144053,0) (893.935124312463,0) (987.392903866661,1) (1090.6213662385,0) (1204.64200202167,0) (1330.58309506604,0) (1469.69088733773,0) (1623.34191102612,4) (1793.05660992942,4) (1980.51438490818,7) (2187.5702122883,12) (2416.2729996596,23) (2668.88586070881,41) (2947.90850971511,36) (3256.10199731164,51) (3596.51603228401,65) (3972.51915976695,43) (4387.83209446551,37) (4846.56453874751,24) (5353.25584993839,10) (5912.91995923888,3) (6531.09498675799,0) (7213.89804362345,0) (7968.08576346046,0) (8801.12116222408,0)
        };
        \addplot[style={distr8,fill=distr8,mark=none,fill opacity=0.5,draw opacity=0}] coordinates {
          (50.0,0) (55.2273245010972,0) (61.0011474309897,0) (67.3786032822108,0) (74.422799757947,0) (82.2034422502463,0) (90.7975236052311,0) (100.290086000843,0) (110.77506247623,0) (122.356206440081,0) (135.148118355792,0) (149.27737976296,0) (164.88380585685,0) (182.121829020643,0) (201.162027001128,0) (222.192810849795,0) (245.422289332251,0) (271.080328255088,0) (299.420825088154,0) (330.724221390595,0) (365.300277902221,0) (403.491139760939,0) (445.674722177899,0) (492.268450073103,0) (543.733388676789,0) (600.578805970684,0) (663.36721211649,0) (732.71992573891,0) (809.323222144053,0) (893.935124312463,0) (987.392903866661,0) (1090.6213662385,0) (1204.64200202167,0) (1330.58309506604,0) (1469.69088733773,0) (1623.34191102612,0) (1793.05660992942,1) (1980.51438490818,4) (2187.5702122883,0) (2416.2729996596,2) (2668.88586070881,5) (2947.90850971511,8) (3256.10199731164,16) (3596.51603228401,24) (3972.51915976695,24) (4387.83209446551,28) (4846.56453874751,32) (5353.25584993839,25) (5912.91995923888,24) (6531.09498675799,8) (7213.89804362345,2) (7968.08576346046,1) (8801.12116222408,0)
        };
        \node at (axis cs: 100,530) {10};
        \node at (axis cs: 200,380) {15};
        \node at (axis cs: 750,210) {20};
        \node at (axis cs:1300,150) {21};
        \node at (axis cs:2000,125) {26};
        \node at (axis cs:3000, 90) {27};
        \node at (axis cs:4000, 90) {30};
        \node at (axis cs:6000, 50) {35};
      \end{semilogxaxis}
    \end{tikzpicture}
    \caption{Distribution of test case sizes in each experiment.}\label{fig:size_bugs_distr}
  \end{subfigure}%
  \hfill
  \begin{subfigure}{0.45\textwidth}
    \centering
    \begin{tikzpicture}
      \begin{axis}[width=\linewidth,
        major x tick style=transparent,
        symbolic x coords={91,181,438,792,929,1700,2110,4230},
        xlabel=Average lines of code in generated programs,
        ylabel=Number of test cases,
        clip=false,
        legend pos=north east,
        legend style={font=\scriptsize},
        xtick=data,
        every axis x label/.style={at={($(ticklabel cs:0.6) + (0,-0.02)$)},anchor=near ticklabel,},
        ybar=0,
        bar width=9pt,
        ymin=0,
        % nodes near coords,
        % nodes near coords align={vertical}
        ]
        \fill[distr1!50] (axis cs:91,-15) circle (3pt);
        \fill[distr2!50] (axis cs:181,-15) circle (3pt);
        \fill[distr3!50] (axis cs:438,-15) circle (3pt);
        \fill[distr4!50] (axis cs:792,-15) circle (3pt);
        \fill[distr5!50] (axis cs:929,-15) circle (3pt);
        \fill[distr6!50] (axis cs:1700,-15) circle (3pt);
        \fill[distr7!50] (axis cs:2110,-15) circle (3pt);
        \fill[distr8!50] (axis cs:4230,-15) circle (3pt);
        \addplot[style={black,fill=black!70,mark=none}] coordinates {
          (91,122) (181,87) (438,63) (792,56) (929,43)
          (1700,14) (2110,10) (4230,7)
        };
        \addplot[style={black,fill=white,mark=none}] coordinates {
          (91,0) (181,2) (438,4) (792,12) (929,10)
          (1700,8) (2110,12) (4230,17)
        };
        \legend{Bugs found,Crashes found};
      \end{axis}
    \end{tikzpicture}
    \caption{Efficiency of various sizes at finding failing test cases.}\label{fig:size_bugs_found}
  \end{subfigure}
  \caption{How the bug-finding efficiency varies with generated program size.}
\end{figure*}

The efficiency of generating different sizes of Verilog code was also analysed to identify the optimal size that finds the most failing test cases. This was measured by conducting eight separate experiments, each with Verismith configured to generate programs of a different size, and each experiment given 48 hours to find as many bugs as possible.

Figure~\ref{fig:size_bugs_distr} shows how the distribution of test case sizes is affected by the {\tt size} parameter given to Verismith. Each of the eight distributions in the figure is labelled with the value of the {\tt size} parameter that generated it. As the {\tt size} parameter becomes larger, we observe that it becomes harder to control the test case size (hence the more spread out distributions) -- this is because of the inherent randomness in the generation algorithm.

Figure 5b compares the eight experiments by how many test case failures they found. The average test case size in each experiment is displayed along the x-axis. We see that shorter programs are more effective at triggering mis-synthesis bugs, and longer programs are more effective at triggering crash bugs. Therefore, generating programs that have a size of around 700 lines of code might be optimal to find the largest variety of bugs.

Larger inputs should find more failing test cases, as more combinations are tried that might trigger a bug in the synthesis tool. The designs also become much more complex as the size of the Verilog increases. However, it is more efficient to run multiple small random Verilog modules instead of large ones, because in the same amount of time, many more failings test cases are found. The run time of the synthesis and the equivalence checking is the limiting factor, as both of these increase exponentially with the size of the input. On the other hand, the number of crashes still increases with the size of the input. This is because a crash normally occurs at the start of the synthesis process, which means that the complexity of the input does not affect the time taken to discover the crash. Larger inputs will therefore have a greater chance of crashing the synthesis tool.

Thus, it seems that it is more useful to generate small Verilog modules which will synthesise and pass equivalence checking as fast as possible. However, solely generating programs of the smallest size might only result in finding the same bug repeatedly. Instead, it is better to generate Verilog designs at around 700 lines of code so that a larger variety of inputs can be created.

\subsection{What is the effect of XOR-ing the outputs?}

\begin{figure}
  \centering
  \begin{tikzpicture}
    \begin{groupplot}[%
      group style = {group size = 2 by 1, horizontal sep=3mm, ylabels at=edge left, yticklabels at=edge left},
      width=50mm,
      height=40mm,
      ylabel=Time taken (\si{\second}),
      xmin=0,
      ymin=0,
      xmax=3500,
      xtick distance=1000,
      ymax=1000,
      ]
      \nextgroupplot
      \addplot[only marks,uncombined,draw=uncombined,fill opacity=0.3,draw opacity=0,mark=*]
      table[x=Size,y=Equivalence check time,col sep=comma]
      {data/length_no_combine_out_downscale.csv};
      \nextgroupplot[xlabel={Lines of code in the test case}, xlabel style={xshift=-20mm}]
      \addplot[only marks,combined,draw=combined,fill opacity=0.3,draw opacity=0,mark=*]
      table[x=Size,y=Equivalence check time,col sep=comma] {data/length.csv};
    \end{groupplot}
  \end{tikzpicture}
\caption{Synthesis and equivalence checking time as program size increases for test cases. Left: output combined using concatenation. Right: output combined to one bit using unary XOR operator.}\label{fig:synth_ec_time_all}
\end{figure}

Figure~\ref{fig:synth_ec_time_all} compares concatenating the output to combining the output into one bit, as was mentioned in Section~\ref{sec:generation_algorithm}, by comparing the time taken to perform the equivalence check. Synthesis time is not displayed, because it did not change when the output was combined into one bit or not, and scaled exponentially with size in both cases. Equivalence checking is the limiting factor when performing the random testing, because it scales exponentially but can also take much longer depending on the test case. The horizontal line of points at the very top are the test cases that timed out at 900 seconds. The graph on the left shows that equivalence checking time mostly scales exponentially with the size of the test case, with around 5\% of test cases failing. However, when the output is combined into one bit, which is shown by the graph on the right, the equivalence checking time scales much worse as the lines of code of the test cases increase, with around 29\% of test cases failing overall. The time taken increases in a nearly vertical line, meaning even small test cases timeout most of the time. The reason for this is that the circuit generated by the unary XOR operator takes a long time to verify, as all possible paths are explored.

Reducing the output to one bit increases the time to perform the equivalence check dramatically, making it impossible to generate programs of more than 500 lines and check for bugs that were introduced. Therefore, the output should not be combined unless it is absolutely necessary.

\subsection{How stable are synthesis tools?}

By fuzzing different versions of synthesis tools, the general stability of the tools can be observed. One might expect newer versions of the tool either to have fewer bugs, as they are reported and fixed, or to have more bugs, as new features are added. In total, 837 test cases were run through the four different versions of \vivado{}, however, 134 test cases timed out and were therefore disregarded. Figure~\ref{fig:vivado_over_time} shows how many test cases produced failures in each version of \vivado{}. Each horizontal ribbon represents a group of test cases that produced failures in the same tools and the larger the ribbon, the more test cases followed the same trajectory.

Figure~\ref{fig:vivado_over_time} shows that \vivado{} 2016.1 and 2016.2 have exactly the same test case failures. From 2016.2 to 2017.4, the bugs originating from one group of test cases were fixed, but two different groups of test cases starts to fail. The total number of failing test cases is also higher in 2017.4 than in previous versions. Finally, two groups of test cases are fixed for version 2018.2, but another group of test cases start to fail, which is the largest proportion of test cases in the diagram. There is one group that stays constant in all the versions of \vivado{} comprising 15 test cases, which are likely due to failures that have not been found or reported yet.

\begin{figure}
  \centering
  \resizebox{0.45\textwidth}{!}{
  \begin{tikzpicture}
    \fill[fill=ribbon1] (-0.5,4.2) -- (7,4.2) -- (7.5,3.6) -- (7,3) -- (-0.5,3) --
    (-0.2,3.6) -- cycle;
    \fill[fill=ribbon2] (-0.5,2.2) -- (2.4,2.2) to [out=0,in=180] (4.4,2.9) --
    (4.6,2.9) to [out=0,in=180] (6.6,3) -- (7,3) -- (7.4,2.55) -- (7,2.1) --
    (6.6,2.1) to [out=180,in=0] (4.6,2) -- (4.4,2) to [out=180,in=0] (2.4,1.3) --
    (-0.5,1.3) -- (-0.2,1.75) -- cycle;
    \fill[fill=ribbon3] (-0.5,2.9) -- (2.4,2.9) to [out=0,in=180] (6.6,0.4) --
    (7,0.4) -- (7.3,0.2) -- (7,0) -- (6.6,0) to [out=180,in=0] (2.4,2.5) --
    (-0.5,2.5) -- (-0.2,2.7) -- cycle;
    \fill[fill=ribbon4] (-0.5,1.2) -- (3,1.2) to [out=0,in=180] (6.6,2.1)
    -- (7,2.1) -- (7.5,1.5) -- (7,0.9)
    -- (6.6,0.9) to [out=180,in=0] (3,0) -- (-0.5,0) -- (-0.1,0.6) -- cycle;
    \fill[fill=ribbon5] (-0.5,3) -- (4.6,3) to [out=0,in=180] (6.6,0.5) --
    (7,0.5) -- (7.1,0.45) -- (7,0.4) -- (6.6,0.4) to [out=180,in=0] (4.6,2.9) --
    (-0.5,2.9) -- (-0.4,2.95) -- cycle;
    \fill[fill=ribbon6] (-0.5,1.3) -- (2.4,1.3) to [out=0,in=180] (4.4,2) -- (4.6,2) to
    [out=0,in=180] (6.6,0) -- (7,0) -- (7.1,-0.05) -- (7,-0.1) -- (6.6,-0.1) to
    [out=180,in=0] (4.6,1.9) -- (4.4,1.9) to [out=180,in=0] (2.4,1.2) --
    (-0.5,1.2) -- (-0.5,1.25) -- cycle;
    \filldraw[fill=white] (-0.1,2.5) rectangle (0.3,4.2);
    \filldraw[fill=white] (2.1,2.5) rectangle (2.5,4.2);
    \filldraw[fill=white] (4.3,1.9) rectangle (4.7,4.2);
    \filldraw[fill=white] (6.5,0.9) rectangle (6.9,4.2);
    \node at (0.1,4.6) {2016.1};
    \node at (2.3,4.6) {2016.2};
    \node at (4.5,4.6) {2017.4};
    \node at (6.7,4.6) {2018.2};
    \node at (1.1,0.6) {17};
    \node at (1.1,3.6) {15};
    \node at (1.1,1.75) {11};
    \node at (1.1,2.7) {6};
    \node at (0.1,2.7) {22};
    \node at (2.3,2.7) {22};
    \node at (4.5,2.1) {28};
    \node at (6.7,1.1) {43};
    \node at (3.4,5) {Vivado version};
  \end{tikzpicture}}
  \caption{Tracking the same set of test cases across four versions of Vivado. The white rectangles indicate the total number of failing test cases per version. Test cases that fail in the same versions are grouped into a ribbon, and each ribbon is labelled with the number of test cases it contains. The interleaving of ribbons shows how bugs may have been introduced or fixed in each version.}\label{fig:vivado_over_time}
\end{figure}

\subsection{How effective is our reducer?}

To test the efficiency of the test case reducer that was implemented in \verismith{}, it was compared against an existing test case reducer for C/C++, called C-Reduce~\cite{regehr12_test_reduc_c_compil_bugs}. C-Reduce is a general reducer containing various passes that perform different reduction algorithms. As it is tailored to reduce C-like languages, C-Reduce is much less effective at reducing unknown languages, because it cannot analyse and perform reductions on the AST.\@ However, it can still apply other passes which are independent of the input language.

C-Reduce has a notion of test case validity and checks for undefined and unspecified behaviour when reducing C so that it can be avoided. This is necessary, as the reducer might otherwise discard the original bug and reduce the undefined behaviour instead. To be able to compare C-Reduce to \verismith{}, it requires a similar notion of how to avoid undefined behaviour when reducing Verilog. We added a few context-free heuristics to the script that check if the test case is still \emph{interesting}, i.e.\ that the bug is still present. The heuristics are the following:

\begin{enumerate}
\item Check that it can be parsed by \verismith{}, to guarantee that the test case is still in the supported subset of Verilog and is syntactically valid.
\item Apply context-free heuristics, such as requiring all registers to be initialised.
\item Check that the synthesis tool does not output any warning to catch any other errors that may have been introduced.
\end{enumerate}

C-Reduce was tested against our implementation of the reduction algorithm using 30 randomly selected test cases that fail in Yosys. C-Reduce was run with a default configuration and all the C/C++ passes turned off. As per the default, C-Reduce was run in parallel on four cores. Verismith, being a single-threaded design, ran on only one core. Figure~\ref{fig:reducer_comparison} shows the results of the comparison, differentiating between crashes and mis-synthesis bugs. Six out of the 30 test cases were reduced to contain only undefined behaviour using C-Reduce and discarded the original cause of the discrepancy between the netlist and the design. This was often because inputs to module instantiations were removed, which led to undefined values in those inputs. To mitigate this, context-sensitive heuristics would be needed, to make sure the right amount of inputs is present in a module instantiation. In addition to that, because C-Reduce does not know the Verilog syntax, most of the minimal test cases were unreadable and were therefore passed through Verismith to compare the final sizes properly. Figure~\ref{fig:reducer_comparison} allows us to draw the following conclusions:

\begin{figure}
  \centering
  \begin{tikzpicture}
    \begin{loglogaxis}[
      width=0.95\linewidth,
      legend style={nodes={scale=0.7, transform shape}},
      legend pos=south east,
      xlabel=Final size of reduced test case (lines of code),
      ylabel=Time taken for reduction (s),
      xticklabel style={
        /pgf/number format/fixed,
        /pgf/number format/precision=2
      },
      scaled x ticks=false,
      grid=major,
      major grid style={line width=.2pt,draw=black!20},
      legend columns=2,
      legend cell align=left,
      ymin=0.4
      ]
      \addplot[only marks, verismith missynth]
      coordinates {
        (8,24.228676396) (12,48.020800025) (9,239.19471845) (19,78.85903502) (11,73.373059156) (265,396.931663908) (9,28.596337771) (9,121.520180855) (56,392.16407163) (12,9.194543707) (443,1106.024588047) (9,88.786871352) (9,166.899571358) (120,142.222511227) (9,73.07548465) (9,71.661578111) (26,47.691798645) (87,153.32273806) (9,89.500623124)
      };
      \addplot[only marks, creduce missynth]
      coordinates {
        (23,546.138) (40,549.916) (63,2954.062) (66,2565.887) (65,4355.297) (59,1695.4) (32,2355.947) (60,5091.577) (64,1611.466) (15,89.294) (137,6825.607) (74,13896.513) (64,9126.388) (90,4550.126) (43,5553.653) (27,1096.045) (16,340.533) (68,2814.821) (85,10592.831)
      };
      \addplot[only marks, verismith crash]
      coordinates {
        (462,31.025951033) (405,32.882851417) (75,49.630976912) (49,4.096207305) (234,10.081659552) (191,23.921349163) (99,10.894472949) (115,16.908927877) (32,18.313146602) (12,1.4856776) (230,30.366253517)
      };
      \addplot[only marks, creduce crash]
      coordinates {
        (30,225.177) (69,289.044) (33,136.239) (33,194.167) (52,352.996) (60,313.808) (54,285.255) (63,221.02) (60,137.885) (34,223.37) (54,213.942)
      };
      \legend{\verismith{}: mis-synthesis,C-Reduce: mis-synthesis,\verismith{}: crash,C-Reduce: crash}
    \end{loglogaxis}
  \end{tikzpicture}
  \caption{Comparing the effectiveness of test case reduction by Verismith and by C-Reduce using 30 randomly selected test cases that fail in Yosys. Only 24 test cases were successfully reduced by C-Reduce, whereas Verismith reduced all 30. Both axes use a logarithmic scale. Points towards the bottom left are favoured.}\label{fig:reducer_comparison}
\end{figure}

\paragraph{Verismith is much faster than C-Reduce}

The average time taken by Verismith is 119\si{\second}, while the average time taken by C-Reduce is 2640\si{\second} (note that the logarithmic scale on the y-axis de-emphasises this discrepancy). This is expected, as Verismith performs a strict binary choice at every pass, and does not consider additional alternatives, so that the number of synthesis runs is minimised. Additionally, Verismith has access to the original AST and performs semantically valid reductions that will not introduce undefined behaviour. 

\paragraph{Verismith reduces mis-synthesis bugs further than C-Reduce}

The median size of the reduced test case for Verismith is 11 lines, whereas for C-Reduce this is 61 lines. This can be explained by the fact that Verismith has access to the AST and can therefore perform more semantically valid transformations. However, the average number of lines reduced is about the same for both tools, as there are cases where Verismith does not find the optimal reduction.

\paragraph{C-Reduce reduces crash bugs further than Verismith}

Even though Verismith is faster than C-Reduce in most cases, C-Reduce seems to be better at reducing crashes, taking only slightly longer than Verismith but achieving a smaller reduced test case in general. This is because Verismith always tries to perform semantically valid transformations, which is not relevant when reducing crashes. Checking that the Verilog is valid is not important as long as the tool still crashes with the original error message.

\section{Related work}

\paragraph{Random Verilog generation}

VlogHammer~\cite{wolf_vlogh} is also a Verilog fuzzer that targets the major commercial synthesis tools, as well as several simulators. It has found around 75 bugs to date which have been reported to the tool vendors. In contrast to our tool, VLogHammer does not generate programs with multiple modules and it does not support behavioural Verilog (e.g. always blocks). Whereas our tool only generates deterministic Verilog (which we have argued is the most important part of the language), VLogHammer generates nondeterministic Verilog, and requires an additional simulation step to avoid false positives. Finally, VLogHammer does not perform test case reduction, instead only generating small modules that can be analysed manually if they fail.

Another random Verilog generator is VERGEN~\cite{ratchev03_verif_correc_fpga_logic_synth_algor}, which generates behavioural Verilog by randomly combining high-level logic blocks such as state machines, MUXes and shift registers. However, because it generates these predefined constructs, it produces well-behaved code which is unlikely to test many different combinations of Verilog constructs.

American Fuzzy Lop (AFL)~\cite{zalewski15_americ} is a general-purpose fuzzer for binaries and uses instrumentation to guide the mutation of existing test cases. However, given that synthesis tools are highly complex programs with a large number of different states, it can be difficult to identify the correct set of inputs to enable the fuzzer to find a bug. In addition to that, the fuzzer has no notion of correct behaviour, and can therefore only detect crashes. We ran AFL on Yosys for 144 CPU hours and did not find a crash.

Random generation and differential testing has also successfully been applied to fuzz various OpenCL implementations~\cite{lidbury15_many_compil_fuzzin}, including the Intel FPGA SDK for OpenCL~\cite{intel_intel_fpga_sdk_openc}.

\paragraph{Equivalence checking}

Differential testing~\cite{mckeeman98_differ_testin_softw} is the standard method for checking compiler correctness, by passing the input to two or more different compilers for the same language and checking if the output behaves in the same way. If one achieves a different result, then it is assumed that there must be a bug. Csmith~\cite{yang11_findin_under_bugs_c_compil} uses this technique to check if the output of different C compilers is correct. However, as the output of synthesis tools is Verilog, it can be checked formally for equivalence with the initial design to ensure that no bugs were present in the synthesis tool.

Modern commercial logical equivalence checkers, such as Conformal~\cite{cadence_confor_equiv_check} could also solve the problem of comparing netlists to the original design, as they are often built with that use case in mind.

\paragraph{Test case reduction}

Test case reduction is usually performed by a process called delta-debugging~\cite{zeller02_simpl_isolat_failur_induc_input} which splits the source code into parts using simple lexer rules and tries to remove as many parts as possible. C-Reduce~\cite{regehr12_test_reduc_c_compil_bugs} is an example of an advanced reduction algorithm for C-like languages that makes use of parallel executions of different subsets of the test case to identify one that is still interesting, which is reduced further.

\paragraph{Verified synthesis}

There has also been work on verified synthesis, such as PBS~\cite{aagaard91}, written in ML and verified mechanically using Nuprl~\cite{constable86_implem_mathem_nuprl_proof_devel_system}, or $\Pi$-ware~\cite{flor18_pi_ware}, which is a high level HDL in Agda where the synthesis process to gates is formally proven. Such systems should, in theory, withstand our random testing. However, it is an enormous effort to build a fully verified synthesis tool that offers comparable performance to state-of-the-art tools. Moreover, there could still be bugs in the non-verified parts of these tools -- as was found to be the case with the CompCert~\cite{leroy06_formal_certif_compil_back} verified C compiler when tested using Csmith~\cite{yang11_findin_under_bugs_c_compil}.

\section{Conclusion and further work}

This paper introduced a method for behavioural Verilog generation and Verilog reduction, which was implemented in an open-source tool called \verismith{}. This tool successfully found and reported a total of eleven bugs in \yosys{}, \vivado{} and \iverilog{}, which are now either fixed or confirmed and scheduled to be fixed in upcoming releases.

The main limitation of \verismith{} is that it cannot produce a design that contains undefined values, meaning no bugs can be found that are caused by them. Another limitation is that implementations for all the FPGA primitives are needed for the different tools that are used. Finally, only synthesis tools that can output Verilog are supported, which may limit which synthesis tools can be tested.

Further work could be done on supporting a larger subset of Verilog, which improves the testing of the synthesis tools. In addition to that, undefined values could also be supported, which would further increase the variety of test cases that could be generated. Undefined values could be introduced in a controlled manner to support this, as \verismith{} already has a notion of determinism. This would allow for control over how much of the output should be undefined, which would reduce the risk of undefined values masking possible bugs and affecting a large proportion of the output.

It is worth asking whether the bugs we have found using Verismith really matter, or whether they would only be triggered by code patterns that are unlikely to appear in production code. This is hard to answer definitively, but it is worth noting that one member of the \xilinx{} user community remarked that the bug we found in \vivado{} ``looks to me to be a rather critical bug''.\bugtracker{https://forums.xilinx.com/t5/Synthesis/Vivado-2019-1-Bit-selection-synthesis-mismatch/m-p/982632\#M31484} In general it does seem like these tools cannot be completely trusted, because they can generate a netlist that is not equivalent to the original or even crash given correct and deterministic Verilog.

It is our hope that tools like \verismith{} can not only be valuable to designers of logic synthesis tools as a way to catch more bugs, but can also provide designers with a safety net that gives them the confidence to implement ever more ambitious optimisations.

\begin{acks}
We acknowledge financial support of a PhD studentship from the Research Institute on Verified Trustworthy Software Systems (VeTSS), which is funded by the National Cyber Security Centre (NCSC), and the EPSRC IRIS programme grant (EP/R006865/1). We thank Alastair Donaldson, Eric Eide, Martin Ferianc, Brent Nelson, and the FPGA '20 reviewers for valuable suggestions.
\end{acks}

\bibliographystyle{ACM-Reference-Format}
\addtolength{\textheight}{-310pt}
% \balance
\bibliography{conference.bib}

\end{document}
\endinput

%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: