summaryrefslogtreecommitdiffstats
path: root/algorithm.tex
diff options
context:
space:
mode:
authorYann Herklotz <git@yannherklotz.com>2021-04-14 11:53:23 +0100
committerYann Herklotz <git@yannherklotz.com>2021-04-14 11:53:23 +0100
commitfb42c6f69b1f3e5d5362fcb483dc195201a63fbd (patch)
treee4128def390795926ae34c237ac200d8c2a17bf5 /algorithm.tex
parentb22cf3b97ea6b745dac1d54bab52e8ddc3341a79 (diff)
downloadoopsla21_fvhls-fb42c6f69b1f3e5d5362fcb483dc195201a63fbd.tar.gz
oopsla21_fvhls-fb42c6f69b1f3e5d5362fcb483dc195201a63fbd.zip
Fix main diagram
Diffstat (limited to 'algorithm.tex')
-rw-r--r--algorithm.tex113
1 files changed, 69 insertions, 44 deletions
diff --git a/algorithm.tex b/algorithm.tex
index b620447..930e3ca 100644
--- a/algorithm.tex
+++ b/algorithm.tex
@@ -79,8 +79,9 @@ It has an unlimited number of pseudo-registers, and is represented as a control
\begin{subfigure}[t]{1\linewidth}
\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
int main() {
- int x[1] = {3};
- return x[0];
+ int x[2] = {3, 6};
+ int i = 1;
+ return x[i];
}
\end{minted}
\caption{Example C code passed to \vericert{}.}\label{fig:accumulator_c}
@@ -88,51 +89,60 @@ int main() {
\begin{subfigure}[b]{1\linewidth}
\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
main() {
- x2 = 3
- int32[stack(0)] = x2
- x1 = int32[stack(0)]
- return x1
+ x5 = 3
+ int32[stack(0)] = x5
+ x4 = 6
+ int32[stack(4)] = x4
+ x1 = 1
+ x3 = stack(0) (int)
+ x2 = int32[x3 + x1
+ * 4 + 0]
+ return x2
}
\end{minted}
- \caption{3AC produced by \compcert{}.}\label{fig:accumulator_rtl}
+ \caption{3AC produced by the \compcert{} front-end without any optimisations.}\label{fig:accumulator_rtl}
\end{subfigure}
\end{subfigure}\hfill%
\begin{subfigure}[b]{0.65\linewidth}
- \vspace{1em}
\begin{minted}[fontsize=\tiny,linenos,xleftmargin=20pt]{verilog}
module main(reset, clk, finish, return_val);
- input [0:0] clk, reset; output reg [31:0] return_val = 0;
+ input [0:0] reset, clk;
output reg [0:0] finish = 0;
- reg [31:0] state = 0, d_out = 0, d_in = 0, reg_1 = 0, addr = 0, reg_2 = 0;
- reg [0:0] en = 0, wr_en = 0, u_en = 0; reg [31:0] stack [0:0];
- // RAM template
+ output reg [31:0] return_val = 0;
+ reg [31:0] reg_3 = 0, addr = 0, d_in = 0, reg_5 = 0, wr_en = 0;
+ reg [0:0] en = 0, u_en = 0;
+ reg [31:0] state = 0, reg_2 = 0, reg_4 = 0, d_out = 0, reg_1 = 0;
+ reg [31:0] stack [1:0];
always @(negedge clk)
if ({u_en != en}) begin
if (wr_en) stack[addr] <= d_in; else d_out <= stack[addr];
en <= u_en;
end
- // Data-path
always @(posedge clk)
case (state)
- 32'd6: reg_1 <= d_out;
- 32'd4: reg_2 <= 32'd3;
- 32'd3: begin u_en <= ( ! u_en); wr_en <= 32'd1;
- d_in <= reg_2; addr <= 32'd0; end
- 32'd2: begin u_en <= ( ! u_en); wr_en <= 32'd0; addr <= 32'd0; end
- 32'd1: begin finish <= 32'd1; return_val <= reg_1; end
- default:;
+ 32'd11: reg_2 <= d_out;
+ 32'd8: reg_5 <= 32'd3;
+ 32'd7: begin u_en <= ( ! u_en); wr_en <= 32'd1;
+ d_in <= reg_5; addr <= 32'd0; end
+ 32'd6: reg_4 <= 32'd6;
+ 32'd5: begin u_en <= ( ! u_en); wr_en <= 32'd1;
+ d_in <= reg_4; addr <= 32'd1; end
+ 32'd4: reg_1 <= 32'd1;
+ 32'd3: reg_3 <= 32'd0;
+ 32'd2: begin u_en <= ( ! u_en); wr_en <= 32'd0;
+ addr <= {{{reg_3 + 32'd0} + {reg_1 * 32'd4}} / 32'd4}; end
+ 32'd1: begin finish = 32'd1; return_val = reg_2; end
+ default: ;
endcase
- // Control-logic
always @(posedge clk)
- if ({reset == 32'd1}) state <= 32'd4;
+ if ({reset == 32'd1}) state <= 32'd8;
else case (state)
- 32'd6: state <= 32'd1;
- 32'd4: state <= 32'd3;
- 32'd3: state <= 32'd2;
- 32'd2: state <= 32'd6;
- 32'd1: ;
- default:;
- endcase
+ 32'd11: state <= 32'd1; 32'd4: state <= 32'd3;
+ 32'd8: state <= 32'd7; 32'd3: state <= 32'd2;
+ 32'd7: state <= 32'd6; 32'd2: state <= 32'd11;
+ 32'd6: state <= 32'd5; 32'd1: ;
+ 32'd5: state <= 32'd4; default: ;
+ endcase
endmodule
\end{minted}
\caption{Verilog produced by \vericert{}. It demonstrates the instantiation of the RAM, the data-path and finally the control-logic.}\label{fig:accumulator_v}
@@ -147,7 +157,7 @@ In this section, we describe the stages of the \vericert{} translation, referrin
\subsubsection{Translating C to 3AC}
The first stage of the translation uses unmodified \compcert{} to transform the C input into a 3AC intermediate representation, shown in Figure~\ref{fig:accumulator_rtl}.
-As part of this translation, function inlining is also performed on all functions, which allows us to support function calls without having to support the \texttt{Icall} 3AC instruction. Although the duplication of the function bodies caused by inlining can increase the area of the hardware, it can have a positive effect on latency. Inlining precludes support for recursive function calls, but this feature isn't supported in most other HLS tools either~\cite{davidthomas_asap16}.
+As part of this translation, function inlining is also performed on all functions, which allows us to support function calls without having to support the \texttt{Icall} 3AC instruction. Although the duplication of the function bodies caused by inlining can increase the area of the hardware, it can have a positive effect on latency\YH{TODO: Add sentence on why inlining is often the better choice}. Inlining precludes support for recursive function calls, but this feature isn't supported in most other HLS tools either~\cite{davidthomas_asap16}.
%\JW{Is that definitely true? Was discussing this with Nadesh and George recently, and I ended up not being so sure. Inlining could actually lead to \emph{reduced} resource usage because once everything has been inlined, the (big) scheduling problem could then be solved quite optimally. Certainly inlining is known to increase register pressure, but that's not really an issue here. If we're not sure, we could just say that inlining everything leads to bloated Verilog files and the inability to support recursion, and leave it at that.}\YH{I think that is true, just because we don't do scheduling. With scheduling I think that's true, inlining actually becomes quite good.}
@@ -180,18 +190,17 @@ Figure~\ref{fig:accumulator_diagram} shows the resulting FSMD architecture. The
\fill[white,rounded corners=10pt] (7,0.5) rectangle (11.5,2.2);
\node at (8,2) {\tiny Next State FSM};
- \foreach \x in {15,...,9}
- {\pgfmathtruncatemacro{\y}{\x-9}%
- \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.6) {\tiny \x};}
\foreach \x in {8,...,2}
{\pgfmathtruncatemacro{\y}{8-\x}%
- \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.1) {\tiny \x};}
+ \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.35) {\tiny \x};}
\node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s1c) at (11,1.35) {\tiny 1};
\node[draw,circle,inner sep=0,minimum size=13,scale=0.8] (s1) at (s1c) {};
- \foreach \x in {15,...,2}
+ \foreach \x in {8,...,3}
{\pgfmathtruncatemacro{\y}{\x-1}\draw[-{Latex[length=1mm,width=0.7mm]}] (s\x) -- (s\y);}
- \draw[-{Latex[length=1mm,width=0.7mm]}] (10.8,2) to [out=200,in=70] (s15);
- \draw[-{Latex[length=1mm,width=0.7mm]}] (s3) to [out=210,in=330] (s7);
+ \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s11) at (10.5,0.9) {\tiny 11};
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (s2) -- (s11);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (s11) -- (s1);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (7.2,1.7) to [out=0,in=100] (s8);
\node[draw,fill=white] (nextstate) at (9.25,3) {\tiny Current State};
\draw[-{Latex[length=1mm,width=0.7mm]}] let \p1 = (nextstate) in
@@ -205,7 +214,7 @@ Figure~\ref{fig:accumulator_diagram} shows the resulting FSMD architecture. The
\fill[white,rounded corners=10pt] (2,0.5) rectangle (5,3);
\filldraw[fill=white] (0.25,0.5) rectangle (1.5,2.75);
\node at (2.6,2.8) {\tiny Update};
- \node[align=center] at (0.875,2.4) {\tiny \texttt{RAM}\\[-1.5ex]\tiny\texttt{(Array)}};
+ \node[align=center] at (0.875,2.4) {\tiny \texttt{RAM}};
\node[scale=0.4] at (4.7,1.5) {\texttt{state}};
\draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (5,1.5);
\draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (7,1.5);
@@ -214,15 +223,31 @@ Figure~\ref{fig:accumulator_diagram} shows the resulting FSMD architecture. The
\node[scale=0.4,rotate=60] at (2.5,0.75) {\texttt{clk}};
\node[scale=0.4,rotate=60] at (2.7,0.75) {\texttt{rst}};
+ \node[scale=0.4,right,inner sep=5pt] (ram1) at (2,2.1) {\texttt{u\_en}};
+ \node[scale=0.4,right,inner sep=5pt] (ram2) at (2,1.9) {\texttt{wr\_en}};
+ \node[scale=0.4,right,inner sep=5pt] (ram3) at (2,1.7) {\texttt{addr}};
+ \node[scale=0.4,right,inner sep=5pt] (ram4) at (2,1.5) {\texttt{d\_in}};
+ \node[scale=0.4,right,inner sep=5pt] (ram5) at (2,1.3) {\texttt{d\_out}};
+
+ \node[scale=0.4,left,inner sep=5pt] (r1) at (1.5,2.1) {\texttt{u\_en}};
+ \node[scale=0.4,left,inner sep=5pt] (r2) at (1.5,1.9) {\texttt{wr\_en}};
+ \node[scale=0.4,left,inner sep=5pt] (r3) at (1.5,1.7) {\texttt{addr}};
+ \node[scale=0.4,left,inner sep=5pt] (r4) at (1.5,1.5) {\texttt{d\_in}};
+ \node[scale=0.4,left,inner sep=5pt] (r5) at (1.5,1.3) {\texttt{d\_out}};
+
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram1) -- (r1);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram2) -- (r2);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram3) -- (r3);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (ram4) -- (r4);
+ \draw[-{Latex[length=1mm,width=0.7mm]}] (r5) -- (ram5);
+
\draw[-{Latex[length=1mm,width=0.7mm]}] (4,0.5) -- (4,-0.5);
\draw[-{Latex[length=1mm,width=0.7mm]}] (3.75,0.5) -- (3.75,-0.5);
\draw[-{Latex[length=1mm,width=0.7mm]}] (2.45,-0.5) -- (2.45,0.5);
\draw[-{Latex[length=1mm,width=0.7mm]}] (2.65,-0.5) -- (2.65,0.5);
- \foreach \x in {0,...,2}
- {\draw (0.25,1.25-0.25*\x) -- (1.5,1.25-0.25*\x); \node at (0.875,1.13-0.25*\x) {\tiny \x};}
- \draw[-{Latex[length=1mm,width=0.7mm]}] (1.5,1.5) -- (2,1.5);
- \draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.75) -- (1.5,1.75);
+ \foreach \x in {0,...,1}
+ {\draw (0.25,1-0.25*\x) -- (1.5,1-0.25*\x); \node at (0.875,0.88-0.25*\x) {\tiny \x};}
%\node[scale=0.4] at (1.2,2.2) {\texttt{wr\_en}};
%\node[scale=0.4] at (1.2,2) {\texttt{wr\_addr}};
@@ -251,8 +276,8 @@ Figure~\ref{fig:accumulator_diagram} shows the resulting FSMD architecture. The
\node[scale=0.4] at (3.5,4.2) {\texttt{reg\_1}};
\node[scale=0.4] at (3.5,4) {\texttt{reg\_2}};
\node[scale=0.4] at (3.5,3.8) {\texttt{reg\_3}};
- \node[scale=0.4] at (3.5,3.6) {$\cdots$};
- \node[scale=0.4] at (3.5,3.4) {\texttt{reg\_8}};
+ \node[scale=0.4] at (3.5,3.6) {\texttt{reg\_4}};
+ \node[scale=0.4] at (3.5,3.4) {\texttt{reg\_5}};
\end{tikzpicture}}
\caption{The FSMD for the example shown in Figure~\ref{fig:accumulator_c_rtl}, split into a data-path and control logic for the next state calculation. The Update block takes the current state, current values of all registers and at most one value stored in the array, and calculates a new value that can either be stored back in the array or in a register.}\label{fig:accumulator_diagram}
\end{figure*}