summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYann Herklotz <git@yannherklotz.com>2020-11-18 11:23:18 +0000
committerYann Herklotz <git@yannherklotz.com>2020-11-18 11:23:29 +0000
commita39760b961cce7d8ad4d0e7774f3ef06c4eb9af2 (patch)
treed2ced8485cc8ffb28a36e1fe5540fc5d555c0628
parentd28f866bc80a61332b2942651b64f5def6753635 (diff)
downloadoopsla21_fvhls-a39760b961cce7d8ad4d0e7774f3ef06c4eb9af2.tar.gz
oopsla21_fvhls-a39760b961cce7d8ad4d0e7774f3ef06c4eb9af2.zip
Update pdf
-rw-r--r--algorithm.tex63
-rw-r--r--data/accumulator_fsmd2.pdfbin28925 -> 32479 bytes
2 files changed, 28 insertions, 35 deletions
diff --git a/algorithm.tex b/algorithm.tex
index a2f8c8c..e07406a 100644
--- a/algorithm.tex
+++ b/algorithm.tex
@@ -74,23 +74,21 @@ int main() {
\begin{subfigure}[b]{0.49\linewidth}
\begin{minted}[fontsize=\footnotesize]{c}
main() {
- 16: x9 = 1
- 15: int32[stack(0)] = x9
- 14: x8 = 2
- 13: int32[stack(4)] = x8
- 12: x7 = 3
- 11: int32[stack(8)] = x7
- 10: x3 = 0
- 9: nop
- 8: x1 = 0
- 7: x6 = stack(0) (int)
- 6: x5 = int32[x6 + x1 * 4 + 0]
- 5: x3 = x3 + x5 + 0 (int)
- 4: x1 = x1 + 1 (int)
- 3: if (x1 <s 3) goto 7
- else goto 2
- 2: x4 = x3
- 1: return x4
+ 15: x8 = 1
+ 14: int32[stack(0)] = x8
+ 13: x7 = 2
+ 12: int32[stack(4)] = x7
+ 11: x6 = 3
+ 10: int32[stack(8)] = x6
+ 9: x2 = 0
+ 8: x1 = 0
+ 7: x5 = stack(0) (int)
+ 6: x4 = int32[x5 + x1 * 4 + 0]
+ 5: x2 = x2 + x4 + 0 (int)
+ 4: x1 = x1 + 1 (int)
+ 3: if (x1 <s 3) goto 7 else goto 2
+ 2: x3 = x2
+ 1: return x3
}
\end{minted}
\caption{3AC produced by \compcert{}.}\label{fig:accumulator_rtl}
@@ -114,21 +112,18 @@ The first step of the translation is to use \compcert{} to transform the input C
% + TODO Explain how memory is mapped
-The first translation performed in Vericert is from 3AC to a new hardware translation language (HTL), which is one step towards being completely translated to hardware described in Verilog. The main translation that is performed is going from a CFG representation of the computation to a finite state machine with datapath (FSMD)~\cite{hwang99_fsmd}\JW{I feel like this could use some sort of citation, but I'm not sure what. I guess this is all from "Hardware Design 101", right?}\YH{I think I found a good one actually, which goes over the basics.} representation in HTL.\@ The core idea of the FSMD representation is that it separates the control flow from the operations on the memory and registers, so that the state transitions can be translated into a simple finite state machine (FSM) and each state then contains data operations that update the memory and registers. Figure~\ref{fig:accumulator_diagram} shows the resulting architecture of the FSMD. \JW{I think it would be worth having a sentence to explain how the C model of memory is translated to a hardware-centric model of memory. For instance, in C we have global variables/arrays, stack-allocated variables/arrays, and heap-allocated variables/arrays (anything else?). In Verilog we have registers and RAM blocks. So what's the correspondence between the two worlds? Globals and heap-allocated are not handled, stack-allocated variables become registers, and stack-allocated arrays become RAM blocks? Am I close?}\YH{Stack allocated variables become RAM as well, so that we can deal with addresses easily and take addresses of any variable.} Hardware does not have the same memory model as C
+The first translation performed in Vericert is from 3AC to a new hardware translation language (HTL), which is one step towards being completely translated to hardware described in Verilog. The main translation that is performed is going from a CFG representation of the computation to a finite state machine with datapath (FSMD)~\cite{hwang99_fsmd}\JW{I feel like this could use some sort of citation, but I'm not sure what. I guess this is all from "Hardware Design 101", right?}\YH{I think I found a good one actually, which goes over the basics.} representation in HTL.\@ The core idea of the FSMD representation is that it separates the control flow from the operations on the memory and registers, so that the state transitions can be translated into a simple finite state machine (FSM) and each state then contains data operations that update the memory and registers. Figure~\ref{fig:accumulator_diagram} shows the resulting architecture of the FSMD. \JW{I think it would be worth having a sentence to explain how the C model of memory is translated to a hardware-centric model of memory. For instance, in C we have global variables/arrays, stack-allocated variables/arrays, and heap-allocated variables/arrays (anything else?). In Verilog we have registers and RAM blocks. So what's the correspondence between the two worlds? Globals and heap-allocated are not handled, stack-allocated variables become registers, and stack-allocated arrays become RAM blocks? Am I close?}\YH{Stack allocated variables become RAM as well, so that we can deal with addresses easily and take addresses of any variable.} Hardware does not have the same memory model as C, the memory model therefore needs to be translated in the following way. Global variables are not translated in Vericert at the moment, however, the stack of the main function will become the RAM seen in Figure~\ref{fig:accumulator_diagram}. Variables that have their address is taken will therefore be stored in the RAM, as well as any arrays or structs defined in the function. Variables that did not have their address taken will be kept in registers.
\begin{figure*}
\centering
- \includegraphics[scale=0.3,trim={10cm 10cm 7cm 7cm},clip=true]{data/accumulator_fsmd2.pdf}
+ \includegraphics[scale=0.3,trim={10cm 8cm 5cm 5cm},clip=true]{data/accumulator_fsmd2.pdf}
\caption{The FSMD for our running example. \JW{Maybe replace `State' with `Current State'? And maybe `Calculate State' could be clearer as `Calculate Next State'?} \JW{Is it worth distinguishing between the different types of box? We have a RAM box which is a single hardware block, then the Registers and State boxes both indicate a collection of registers, and finally the Update and Calculate boxes indicate combinational logic. Perhaps the combinational logic bits could be visualised as clouds? (Or if clouds are a bit of a faff to draw, then rounded rectangles, or something?)} \JW{Can we label `data path' and `control path' on the diagram? Looks like the diagram is nicely split into these two parts already.} \JW{Can state 15 (or should it be state 16??) have a dangling incoming arrow to indicate that it is the start state? And perhaps state 1 could have a double outline to indicate that it is an `accepting' state? Since there's space above the `Calculate State' box, I'd be mildly in favour of expanding that box a bit so that it included all 15 states explicitly (snaking back and forth).}}\label{fig:accumulator_diagram}
\end{figure*}
-The translation from 3AC to HTL is quite straightforward, as each 3AC instruction either matches up quite well to a hardware construct, or does not have to be handled by the translation, such as function calls.
-\JW{Repeated `quite'.}
-At each instruction, the control flow is separated from the data computation and is then added to the control logic and data-flow map respectively.
-\JW{I suspect that you could safely chop that sentence.}
-For example, in state 16 \JWcouldcut{in the 3AC code} in figure~\ref{fig:accumulator_rtl}, the register \texttt{x9} is initialised to 1, after which the control flow moves to state 15. This is encoded in HTL by initialising a 32-bit register \texttt{reg\_9} to 1 \JWcouldcut{as well} in the data-flow \JWcouldcut{graph} section, and also adding a \JWcouldcut{state} transition to the state 15 in the control logic \JWcouldcut{transition}\JW{section}. Simple operator instructions are translated in a similar way. For example, in state 5, the value in the array is added to the current value of the accumulated sum, which is simply translated to an addition of the equivalent registers in the HTL code.
-
-In the \JWcouldcut{accumulator} C code, the for loop \JW{for-loop is easier to parse} is translated to a branch statement in state 3, which can also be translated to equivalent HTL code by placing the comparison into the control logic part of the main function, and not performing any data operations in the data-flow section. On the next clock cycle, the state will therefore transition to state 7 or state 2 depending on if \texttt{reg\_3} is less than 3 or not. \JW{The idea of a for-loop translating to a branch statement in the CFG is standard; no need to explain that at all I'd say. So I'd start this paragraph by focusing on the signedness issue:}
+The translation from 3AC to HTL is straightforward, as each 3AC instruction either matches up quite well to a hardware construct, or does not have to be handled by the translation, such as function calls.
+%At each instruction, the control flow is separated from the data computation and is then added to the control logic and data-flow map respectively.
+%\JW{I suspect that you could safely chop that sentence.}
+For example, in state 16 in figure~\ref{fig:accumulator_rtl}, the register \texttt{x9} is initialised to 1, after which the control flow moves to state 15. This is encoded in HTL by initialising a 32-bit register \texttt{reg\_9} to 1 in the data-flow section, and also adding a transition to the state 15 in the control logic section. Simple operator instructions are translated in a similar way. For example, in state 5, the value in the array is added to the current value of the accumulated sum, which is simply translated to an addition of the equivalent registers in the HTL code.
\paragraph{Key challenge: signedness} Note that the comparison in state 3 is signed. By default, all operators and registers in Verilog and HTL are unsigned, so to force an operation to handle the bits as signed, both operators have to be forced to signed. In addition to that, Verilog resizes expressions to the largest needed size by default, which can affect the result of the computation. This feature is also not supported by the Verilog semantics we used, and there would therefore be a mismatch between the Verilog semantics and the actual behaviour of Verilog according to the standard. To bypass this issue braces are used to stop the Verilog simulator or synthesis tool from resizing anything inside the braces. Instead, explicit resizing is used in the semantics and operations can only be performed on two registers that have the same size.
@@ -154,14 +149,13 @@ module main(reset, clk, finish, return_val);
reg_5, reg_3, reg_7;
always @(posedge clk)
case (state)
- 32'd16: reg_9 <= 32'd1;
- 32'd15: stack[32'd0] <= reg_9;
- 32'd14: reg_8 <= 32'd2;
- 32'd13: stack[32'd1] <= reg_8;
- 32'd12: reg_7 <= 32'd3;
- 32'd11: stack[32'd2] <= reg_7;
- 32'd10: reg_3 <= 32'd0;
- 32'd9: ;
+ 32'd15: reg_9 <= 32'd1;
+ 32'd14: stack[32'd0] <= reg_9;
+ 32'd13: reg_8 <= 32'd2;
+ 32'd12: stack[32'd1] <= reg_8;
+ 32'd11: reg_7 <= 32'd3;
+ 32'd10: stack[32'd2] <= reg_7;
+ 32'd9: reg_3 <= 32'd0;
32'd8: reg_1 <= 32'd0;
32'd7: reg_6 <= 32'd0;
32'd6: reg_5 <= stack[{{{reg_6 + 32'd0}
@@ -186,7 +180,6 @@ module main(reset, clk, finish, return_val);
state <= 32'd16;
else
case (state)
- 32'd16: state <= 32'd15;
32'd15: state <= 32'd14;
32'd14: state <= 32'd13;
32'd13: state <= 32'd12;
diff --git a/data/accumulator_fsmd2.pdf b/data/accumulator_fsmd2.pdf
index 3731936..6c848ba 100644
--- a/data/accumulator_fsmd2.pdf
+++ b/data/accumulator_fsmd2.pdf
Binary files differ