From db0b534ae51bbb31ab33f1354ab807c04292838f Mon Sep 17 00:00:00 2001
From: Yann Herklotz
Date: Tue, 3 May 2022 09:51:01 +0100
Subject: Add pipelining text

---
 chapters/pipelining.tex       | 59 +++++++++++++++++++++++++++++++++++++++----
 chapters/pipelining_notes.org | 24 +++++++++++++++++-
 2 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/chapters/pipelining.tex b/chapters/pipelining.tex
index 6c351c8..5aa03dc 100644
--- a/chapters/pipelining.tex
+++ b/chapters/pipelining.tex
@@ -13,7 +13,12 @@
 Standard instruction scheduling only addresses parallelisation inside hyperblocks, which are linear
 sections of code. However, loops are often the most critical sections in code, and scheduling only
-addresses parallelisation within one iteration.
+addresses parallelisation within one iteration. Traditionally, loop pipelining is performed as part
+of the normal scheduling step in \HLS, as the scheduling algorithm can be generalised to support
+these new constraints. However, it becomes expensive to check the correctness of a scheduling
+algorithm that transforms the code in so many ways. It is better to separate loop pipelining into
+its own translation pass that performs a source-to-source translation of the code. The final
+result should be similar to that obtained by performing loop pipelining together with scheduling.
 
 \section{Loop pipelining example}
 
@@ -53,10 +58,54 @@
 \stopfloatcombination
 \stopplacemarginfigure
 
-\in{Figure}[fig:pipelined-loop] shows an example of pipelining a loop which accumulates values and
-modifies an array. In \in{Figure}{a}[fig:pipelined-loop], the body of the loop cannot be scheduled
-in less than three cycles, assuming that a load takes two clock cycles. However, after transforming
-the code into the pipelined version on the right
+\in{Figure}[fig:pipelined-loop] shows an example of \emph{software pipelining} of a loop which
+accumulates values and modifies an array. In \in{Figure}{a}[fig:pipelined-loop], the body of the
+loop cannot be scheduled in less than three cycles, assuming that a load takes two clock cycles.
+However, after transforming the code into the pipelined version in
+\in{Figure}{b}[fig:pipelined-loop], the number of inter-iteration dependencies has been reduced by
+moving the store into the next iteration of the loop. This means that the body of the loop could
+now be scheduled in two clock cycles.
+
+Pipelining the loop under resource constraints can be performed using iterative modulo
+scheduling~\cite[rau94_iterat_sched], followed by modulo variable
+expansion~\cite[lam88_softw_pipel]. The optimisation performs the following steps.
+
+\startitemize[n]
+\item Calculate a minimum \II, which is the lowest possible \II\ given the resources used by the
+  code inside the loop and the distance and delay of inter-iteration dependencies.
+\item Perform modulo scheduling with an increasing \II\ until a feasible schedule is found. The
+  scheduling is performed by adding instructions to a \MRT~\cite[lam88_softw_pipel], assigning
+  operations to each resource for one iteration.
+\item Once a valid modulo schedule is found, the loop code can be generated from it. To keep the
+  \II\ that was found in the previous step, modulo variable expansion may be necessary, which
+  requires unrolling the loop.
+\stopitemize
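+
+To make the shape of this transformation concrete, the following C sketch shows a loop in the
+spirit of \in{Figure}[fig:pipelined-loop] before and after pipelining; the code and variable names
+are only illustrative and simplified compared to the figure, and only the store is moved into the
+next iteration.
+
+\starttyping
+/* Illustrative sketch only: the loop body is an assumption, not the exact
+   code from the figure. Each iteration loads a value, accumulates it and
+   writes the running sum back into the array. */
+void accumulate(int *a, int n)
+{
+  int sum = 0;
+  for (int i = 0; i < n; i++) {
+    int x = a[i];   /* load, assumed to take two cycles */
+    sum = sum + x;  /* addition depends on the load     */
+    a[i] = sum;     /* store depends on the addition    */
+  }
+}
+
+/* Pipelined version: the store of iteration i-1 is deferred into iteration i,
+   so it no longer has to wait for the load and addition of the same
+   iteration. */
+void accumulate_pipelined(int *a, int n)
+{
+  int sum = 0;
+  if (n > 0) {
+    int x = a[0];               /* prologue: first load               */
+    sum = sum + x;
+    for (int i = 1; i < n; i++) {
+      int x_next = a[i];        /* load belonging to iteration i      */
+      a[i - 1] = sum;           /* store belonging to iteration i - 1 */
+      sum = sum + x_next;
+    }
+    a[n - 1] = sum;             /* epilogue: final deferred store     */
+  }
+}
+\stoptyping
+
+In the pipelined version the store no longer depends on the load and addition of the same
+iteration, which is what allows the loop body to be scheduled with a shorter initiation interval.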
+
+\section{Verification of pipelining}
+
+Verification of pipelining has already been performed in CompCert by
+\cite[authoryears][tristan10_simpl_verif_valid_softw_pipel]. Assuming that one has a loop body
+$\mathcal{B}$, the pipelined code can be described by a prologue $\mathcal{P}$, a steady state
+$\mathcal{S}$, an epilogue $\mathcal{E}$, and two additional values: $\mu$, the minimum number of
+iterations that need to be performed to be able to use the pipelined loop, and $\delta$, the
+unroll factor of the steady state.
+
+The goal is to verify the equivalence of the original loop and the pipelined loop using a
+validation algorithm, and then to prove that the validation algorithm is \emph{sound}, i.e.\ if it
+finds the two loops to be equivalent, then they behave the same according to the language
+semantics. Essentially, as the loop pipelining algorithm only modifies loops, it is enough to show
+that the input loop $X$ is equivalent to the pipelined version of the loop $Y$. Assuming that the
+validation algorithm is called $\alpha$, we therefore want to show that the following statement
+holds, where $X_N$ means that we are considering $N$ iterations of the loop.
+\placeformula\startformula
+  \forall N,\quad \alpha(X_N) = \startmathcases
+  \NC \alpha(Y_{N/\delta}; X_{N\%\delta}) \NC N \ge \mu \NR
+  \NC \alpha(X_N) \NC \text{otherwise} \NR
+\stopmathcases\stopformula
+
+This states that the formula should hold for any number of iterations $N$ of the loop $X$. As the
+number of loop iterations $N$ is not always known at compile time, this may become an infinite
+property that needs to be checked.
 
 \startmode[section]
 \section{Bibliography}
diff --git a/chapters/pipelining_notes.org b/chapters/pipelining_notes.org
index 0680800..01ff668 100644
--- a/chapters/pipelining_notes.org
+++ b/chapters/pipelining_notes.org
@@ -49,9 +49,31 @@
 A3 x6 = x6 + 1 (int)
 
 | Instr | Stage | Code |
 |-------+-------+-------------------------------------------------------------------|
-|     0 |     0 | ~x18 = x6 - 1~, ~x16 = int32[x4+x18*4+0]~, ~x12 = int32[x3+x6*4]~ |
+|     0 |     0 | ~x18 = x6 - 1~, ~x13 = int32[x4+x18*4+0]~, ~x12 = int32[x3+x6*4]~ |
 |       |     1 | ~x7 = x12 * x13~ |
 |       |     2 | ~x11 = x8 + x7~ |
 |       |     3 | ~x6 = x6 + 1~, ~int32[x4+x6*4] = x11~ |
 |     1 |     0 | ~x16 = int32[x4+x18*4]~ |
 |       |     1 | ~x8 = x16 * x1~ |
+
+
+#+begin_src
+x18[0] = x6[0] - 1
+|| x13[0] = int32[x4[0]+x18[0]*4+0]
+|| x12[0] = int32[x3[0]+x6[0]*4]
+|| x7[1] = x12[1] * x13[1]
+|| x11[2] = x8[2] + x7[2]
+|| int32[x4[3]+x6[3]*4] = x11[3]
+|| x6[3] = x6[3] + 1
+
+x16[0] = int32[x4+x18*4]
+|| x8[1] = x16[1] * x1[1]
+
+x18>>
+x16>>
+x12>>
+x13>>
+x7>>
+x8>>
+x11>>
+#+end_src
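+
+Rough C sketch of the overall shape the pipelined code takes in the verification section: prologue
+P, steady state S unrolled DELTA times, epilogue E, and a minimum iteration count MU. The bodies
+and constants below are placeholders (not generated code) and the index arithmetic is simplified;
+only the control structure is meant to be illustrative.
+
+#+begin_src C
+#define MU 4     /* placeholder minimum iteration count           */
+#define DELTA 2  /* placeholder unroll factor of the steady state */
+
+static void B(int i) { /* original loop body for iteration i (stub) */ }
+static void P(void)  { /* prologue of the pipelined loop (stub)     */ }
+static void S(int i) { /* one steady-state step, covering DELTA iterations (stub) */ }
+static void E(void)  { /* epilogue of the pipelined loop (stub)     */ }
+
+/* Original loop: n iterations of the body B. */
+void loop_original(int n) {
+  for (int i = 0; i < n; i++)
+    B(i);
+}
+
+/* Pipelined shape: the pipelined path is only taken when at least MU
+   iterations remain; the leftover n % DELTA iterations fall back to the
+   original body, mirroring the alpha(Y_{N/delta}; X_{N%delta}) case. */
+void loop_pipelined(int n) {
+  if (n >= MU) {
+    P();
+    for (int i = 0; i < (n / DELTA) * DELTA; i += DELTA)
+      S(i);                                  /* steady state              */
+    E();
+    for (int i = (n / DELTA) * DELTA; i < n; i++)
+      B(i);                                  /* remaining n % DELTA steps */
+  } else {
+    for (int i = 0; i < n; i++)
+      B(i);                                  /* too few iterations        */
+  }
+}
+#+end_src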