summaryrefslogtreecommitdiffstats
path: root/hls.tex
blob: d06b066277f5b7e857623af433432f292be8f9d4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
\startcomponent hls
\product main

\def\slowdownOrig{27}
\def\slowdownDiv{2}
\def\areaIncr{1.1}

\chapter[sec:hls]{High-Level Synthesis}

\paragraph{Can you trust your high-level synthesis tool?} As latency, throughput, and energy
efficiency become increasingly important, custom hardware accelerators are being designed for
numerous applications.  Alas, designing these accelerators can be a tedious and error-prone process
using a hardware description language (HDL) such as Verilog.  An attractive alternative is
\emph{high-level synthesis} (HLS), in which hardware designs are automatically compiled from
software written in a high-level language like C.  Modern HLS tools such as
LegUp~\cite[canis11_legup], Vivado HLS~\cite[xilinx20_vivad_high_synth], Intel i++~\cite[intel_hls],
and Bambu HLS~\cite[bambu_hls] promise designs with comparable performance and energy efficiency to
those hand-written in an HDL~\cite[homsirikamol+14, silexicahlshdl, 7818341], while offering the
convenient abstractions and rich ecosystems of software development.  But existing HLS tools cannot
always guarantee that the hardware designs they produce are equivalent to the software they were
given, and this undermines any reasoning conducted at the software level.

Indeed, there are reasons to doubt that HLS tools actually \emph{do} always preserve equivalence.
For instance, Vivado HLS has been shown to apply pipelining optimisations
incorrectly\footnote{\goto{https://bit.ly/vivado-hls-pipeline-bug}[url(https://bit.ly/vivado-hls-pipeline-bug)]}
or to silently generate wrong code should the programmer stray outside the fragment of C that it
supports.\footnote{\goto{https://bit.ly/vivado-hls-pointer-bug}[url(https://bit.ly/vivado-hls-pointer-bug)]}
Meanwhile, \cite[authoryear][lidbury15_many_core_compil_fuzzin] had to abandon their attempt to fuzz-test
Altera's (now Intel's) OpenCL compiler since it \quotation{either crashed or emitted an internal
  compiler error} on so many of their test inputs.  More recently,
\cite[authoryear][herklotz21_empir_study_reliab_high_level_synth_tools] fuzz-tested three commercial HLS tools
using Csmith~\cite[yang11_findin_under_bugs_c_compil], and despite restricting the generated
programs to the C fragment explicitly supported by all the tools, they still found that on average
2.5\% of test-cases were compiled to designs that behaved incorrectly.

\paragraph{Existing workarounds.} Aware of the reliability shortcomings of HLS tools, hardware
designers routinely check the generated hardware for functional correctness. This is commonly done
by simulating the generated design against a large test-bench. But unless the test-bench covers all
inputs exhaustively -- which is often infeasible -- there is a risk that bugs remain.

One alternative is to use \emph{translation validation}~\cite[pnueli98_trans] to prove equivalence
between the input program and the output design. Translation validation has been successfully
applied to several HLS
optimisations~\cite[kim04_autom_fsmd,karfa06_formal_verif_method_sched_high_synth,chouksey20_verif_sched_condit_behav_high_level_synth,banerjee14_verif_code_motion_techn_using_value_propag,chouksey19_trans_valid_code_motion_trans_invol_loops].
Nevertheless, it is an expensive task, especially for large designs, and it must be repeated every
time the compiler is invoked.  For example, the translation validation for Catapult
C~\cite[mentor20_catap_high_level_synth] may require several rounds of expert
`adjustments'~\cite[alternative=authoryear,right={, p.~3)}][slec_whitepaper] to the input C program
before validation succeeds. And even when it succeeds, translation validation does not provide
watertight guarantees unless the validator itself has been mechanically proven
correct~\cite[tristan08_formal_verif_trans_valid], which has not been the case in HLS tools to date.

Our position is that none of the above workarounds are necessary if the HLS tool can simply be
trusted to work correctly.

\paragraph{Our solution.} We have designed a new HLS tool in the Coq theorem prover and proved that
any output design it produces always has the same behaviour as its input program. Our tool, called
Vericert, is automatically extracted to an OCaml program from Coq, which ensures that the object of
the proof is the same as the implementation of the tool. Vericert is built by extending the CompCert
verified C compiler~\cite[leroy09_formal_verif_realis_compil] with a new hardware-specific
intermediate language and a Verilog back end. It supports most C constructs, including integer
operations, function calls (which are all inlined), local arrays, structs, unions, and general
control-flow statements, but currently excludes support for case statements, function pointers,
recursive function calls, non-32-bit integers, floats, and global variables.

\paragraph{Contributions and Outline.} The contributions of this paper are as follows:

\startitemize[]
\item We present Vericert, the first mechanically verified HLS tool that compiles C to Verilog. In
  \in{Section}[sec:hls:design], we describe the design of Vericert, including certain optimisations
  related to memory accesses and division.
\item We state the correctness theorem of Vericert with respect to an existing semantics for Verilog
  due to \cite[authoryear][loow19_proof_trans_veril_devel_hol]. In \in{Section}[sec:hls:verilog], we
  describe how we extended this semantics to make it suitable as an HLS target.  We also describe
  how the Verilog semantics is integrated into CompCert's language execution model and its framework
  for performing simulation proofs. A mapping of CompCert's infinite memory model onto a finite
  Verilog array is also described.
\item In \in{Section}[sec:hls:proof], we describe how we proved the correctness theorem. The proof
  follows standard CompCert techniques -- forward simulations, intermediate specifications, and
  determinism results -- but we encountered several challenges peculiar to our hardware-oriented
  setting.  These include handling discrepancies between the byte-addressed memory assumed by the
  input software and the word-addressed memory that we implement in the output hardware, different
  handling of unsigned comparisons between C and Verilog, and carefully implementing memory reads
  and writes so that these behave properly as a RAM in hardware.
\item In \in{Section}[sec:hls:evaluation], we evaluate Vericert on the PolyBench/C benchmark
  suite~\cite[polybench], and compare the performance of our generated hardware against an existing,
  unverified HLS tool called LegUp~\cite[canis11_legup]. We show that Vericert generates hardware
  that is \slowdownOrig$\times$ slower (\slowdownDiv$\times$ slower in the absence of division) and
  \areaIncr$\times$ larger than that generated by LegUp. This performance gap can be largely
  attributed to Vericert's current lack of support for instruction-level parallelism and the absence
  of an efficient, pipelined division operator. We intend to close this gap in the future by
  introducing (and verifying) HLS optimisations of our own, such as scheduling and memory analysis.
  This section also reports on our campaign to fuzz-test Vericert using over a hundred thousand
  random C programs generated by Csmith~\cite[yang11_findin_under_bugs_c_compil] in order to confirm
  that its correctness theorem is watertight.
\stopitemize

\paragraph{Companion material.} Vericert is fully open source and available on GitHub at

\startalignment[center]
  \goto{\hyphenatedurl{https://github.com/ymherklotz/vericert}}[url(https://github.com/ymherklotz/vericert)].
\stopalignment

A snapshot of the Vericert development is also available in a Zenodo
repository~\cite[yann_herklotz_2021_5093839].

\section[sec:hls:design]{Designing a Verified HLS Tool}

This section describes the main architecture of the HLS tool, and the way in which the Verilog back
end was added to CompCert.  This section also covers an example of converting a simple C program
into hardware, expressed in the Verilog language.

\subsection{Main Design Decisions}

\paragraph{Choice of source language.} C was chosen as the source language as it remains the most
common source language amongst production-quality HLS tools~\cite[canis11_legup,
  xilinx20_vivad_high_synth, intel_hls, bambu_hls]. This, in turn, may be because it is
\quotation{[t]he starting point for the vast majority of algorithms to be implemented in
  hardware}~\cite[5522874], lending a degree of practicality.  The availability of
CompCert~\cite[leroy09_formal_verif_realis_compil] also provides a solid basis for formally verified
C compilation.  We considered Bluespec~\cite[nikhil04_blues_system_veril], but decided that although
it \quotation{can be classed as a high-level language}~\cite[greaves_note], it is too hardware-oriented to
be suitable for traditional HLS.  We also considered using a language with built-in parallel
constructs that map well to parallel hardware, such as occam~\cite[page91_compil_occam],
Spatial~\cite[spatial] or Scala~\cite[chisel].

\paragraph{Choice of target language.} Verilog~\cite[06_ieee_stand_veril_hardw_descr_languag] is an
HDL that can be synthesised into logic cells which can either be placed onto a field-programmable
gate array (FPGA) or turned into an application-specific integrated circuit (ASIC).  Verilog was
chosen as the output language for Vericert because it is one of the most popular HDLs and there
already exist a few formal semantics for it that could be used as a
target~\cite[loow19_verif_compil_verif_proces, meredith10_veril].  Bluespec, previously ruled out as
a source language, is another possible target and there exists a formally verified translation to
circuits using
K\^{o}ika~\cite{bourgeat20_essen_blues}.

\paragraph{Choice of implementation language.} We chose Coq as the implementation language because
of its mature support for code extraction; that is, its ability to generate OCaml programs directly
from the definitions used in the theorems.  We note that other authors have had some success
reasoning about the HLS process using other theorem provers such as Isabelle~\cite[ellis08].
CompCert~\cite[leroy09_formal_verif_realis_compil] was chosen as the front end because it has a well
established framework for simulation proofs about intermediate languages, and it already provides a
validated C parser~\cite[jourdan12_valid_lr_parser].  The Vellvm
framework~\cite[zhao12_formal_llvm_inter_repres_verif_progr_trans] was also considered because
several existing HLS tools are already LLVM-based, but additional work would be required to support
a high-level language like C as input.  The .NET framework has been used as a basis for other HLS
tools, such as Kiwi~\cite[kiwi], and LLHD~\cite[schuiki20_llhd] has been recently proposed as an
intermediate language for hardware design, but neither are suitable for us because they lack formal
semantics.

\startplacefigure[title={Vericert as a Verilog back end to CompCert.},reference=fig:rtlbranch]
  \hbox{\starttikzpicture [language/.style={fill=white,rounded corners=3pt,minimum height=7mm},
    continuation/.style={},
    linecount/.style={rounded corners=3pt,dashed}]
    \fill[compcert,rounded corners=3pt] (-1,-0.5) rectangle (9.9,2);
    \fill[formalhls,rounded corners=3pt] (-1,-1) rectangle (9.9,-2.4);
    %\draw[linecount] (-0.95,-0.45) rectangle (3.6,1);
    %\draw[linecount] (4,-0.45) rectangle (7.5,1);
    \node[language] at (-0.3,0) (clight) {Clight};
    \node[continuation] at (1,0) (conta) {$\cdots$};
    \node[language] at (2.7,0) (cminor) {CminorSel};
    \node[language] at (4.7,0) (rtl) {3AC};
    \node[language] at (6.2,0) (ltl) {LTL};
    \node[language,anchor=west] at (8.4,0) (aarch) {aarch64};
    \node[language,anchor=west] at (8.4,0.8) (x86) {x86};
    \node[continuation,anchor=west] at (8.4,1.4) (backs) {$\cdots$};
    \node[continuation] at (7.3,0) (contb) {$\cdots$};
    \node[language] at (4.7,-1.5) (htl) {HTL};
    \node[language] at (6.7,-1.5) (verilog) {Verilog};
    \node[anchor=west] at (-0.9,1.6) {\bold{CompCert}};
    \node[anchor=west] at (-0.9,-1.4) {\bold{Vericert}};
    %%\node[anchor=west] at (-0.9,0.7) {\small $\sim$ 27 kloc};
    %%\node[anchor=west] at (4.1,0.7) {\small $\sim$ 46 kloc};
    %%\node[anchor=west] at (2,-1.5) {\small $\sim$ 17 kloc};
    \node[align=center] at (3.2,-2) {RAM\\ insertion};
    \draw[->,thick] (clight) -- (conta);
    \draw[->,thick] (conta) -- (cminor);
    \draw[->,thick] (cminor) -- (rtl);
    \draw[->,thick] (rtl) -- (ltl);
    \draw[->,thick] (ltl) -- (contb);
    \draw[->,thick] (contb) -- (aarch);
    \draw[->,thick] (contb) to [out=0,in=200] (x86.west);
    \draw[->,thick] (contb) to [out=0,in=190] (backs.west);
    \draw[->,thick] (rtl) -- (htl);
    \draw[->,thick] (htl) -- (verilog);
    \draw[->,thick] (htl.west) to [out=180,in=150] (4,-2.2) to [out=330,in=270] (htl.south);
  \stoptikzpicture}
\stopplacefigure

\paragraph{Architecture of Vericert.} The main work flow of Vericert is given in
\in{Figure}[fig:rtlbranch], which shows those parts of the translation that are performed in CompCert,
and those that have been added.  This includes translations to two new intermediate languages added
in Vericert, HTL and Verilog, as well as an additional optimisation pass labelled as \quotation{RAM
insertion}.

\def\numcompcertlanguages{ten}

CompCert translates Clight\footnote{A deterministic subset of C with pure expressions.} input into
assembly output via a sequence of intermediate languages; we must decide which of these
\numcompcertlanguages{} languages is the most suitable starting point for the HLS-specific
translation stages.

We select CompCert's three-address code (3AC)\footnote{This is known as register transfer language
  (RTL) in the CompCert literature. `3AC' is used in this paper instead to avoid confusion with
  register-transfer level (RTL), which is another name for the final hardware target of the HLS
  tool.} as the starting point. Branching off \emph{before} this point (at CminorSel or earlier)
denies CompCert the opportunity to perform optimisations such as constant propagation and dead-code
elimination, which, despite being designed for software compilers, have been found useful in HLS
tools as well~\cite[cong+11]. And if we branch off \emph{after} this point (at LTL or later) then
CompCert has already performed register allocation to reduce the number of registers and spill some
variables to the stack; this transformation is not required in HLS because there are many more
registers available, and these should be used instead of RAM whenever
possible.

3AC is also attractive because it is the closest intermediate language to LLVM IR, which is used by
several existing HLS compilers.  It has an unlimited number of pseudo-registers, and is represented
as a control flow graph (CFG) where each instruction is a node with links to the instructions that
can follow it. One difference between LLVM IR and 3AC is that 3AC includes operations that are
specific to the chosen target architecture; we chose to target the x86\_32 back end because it
generally produces relatively dense 3AC thanks to the availability of complex addressing modes.

\subsection{An Introduction to Verilog}

This section will introduce Verilog for readers who may not be familiar with the language,
concentrating on the features that are used in the output of Vericert.  Verilog is a hardware
description language (HDL) and is used to design hardware ranging from complete CPUs that are
eventually produced as integrated circuits, to small application-specific accelerators that are
placed on FPGAs.  Verilog is a popular language because it allows for fine-grained control over the
hardware, and also provides high-level constructs to simplify development.

Verilog behaves quite differently to standard software programming languages due to it having to
express the parallel nature of hardware.  The basic construct to achieve this is the always-block,
which is a collection of assignments that are executed every time some event occurs.  In the case of
Vericert, this event is either a positive (rising) or a negative (falling) clock edge.  All
always-blocks triggering on the same event are executed in parallel. Always-blocks can also express
control-flow using if-statements and case-statements.

\startplacefigure[reference=fig:tutorial:state_machine]
  \setupinterlinespace[line=2.8ex]
  \startfloatcombination [nx=2, ny=1]
    \startplacefigure
      \startframedtext[width={0.55\textwidth},frame=on,offset=none,bodyfont=11pt]
      \starthlverilog
        module main(input rst, input y, input clk,
                    output reg z);
          reg tmp, state;
          always @(posedge clk)
            case (state)
              1'b0: tmp <= y;
              1'b1: begin tmp <= 1'b0; z <= tmp; end
            endcase
          always @(posedge clk)
            if (rst) state <= 1'b0;
            else case (state)
              1'b0: if (y) state <= 1'b1;
                    else state <= 1'b0;
              1'b1: state <= 1'b0;
            endcase
        endmodule
      \stophlverilog
    \stopframedtext
    \stopplacefigure
    \startplacefigure
      % NOTE(review): this right-hand listing is a byte-for-byte duplicate of
      % the left-hand one; the commented-out TikZ state-machine diagram below
      % was presumably intended as this panel -- confirm and restore it.
      \startframedtext[width={0.35\textwidth}]
      \starthlverilog
        module main(input rst, input y, input clk,
                    output reg z);
          reg tmp, state;
          always @(posedge clk)
            case (state)
              1'b0: tmp <= y;
              1'b1: begin tmp <= 1'b0; z <= tmp; end
            endcase
          always @(posedge clk)
            if (rst) state <= 1'b0;
            else case (state)
              1'b0: if (y) state <= 1'b1;
                    else state <= 1'b0;
              1'b1: state <= 1'b0;
            endcase
        endmodule
      \stophlverilog
    \stopframedtext
    \stopplacefigure
  \stopfloatcombination
\stopplacefigure

%\begin{figure}
%  \centering
%  \begin{subfigure}{0.55\linewidth}
%  \end{subfigure}\hfill%
%  \begin{subfigure}{0.45\linewidth}
%    \centering
%    \begin{tikzpicture}
%      \node[draw,circle,inner sep=6pt] (s0) at (0,0) {$S_{\mathit{start}} / \mono{x}$};
%      \node[draw,circle,inner sep=8pt] (s1) at (1.5,-3) {$S_{1} / \mono{1}$};
%      \node[draw,circle,inner sep=8pt] (s2) at (3,0) {$S_{0} / \mono{1}$};
%      \node (s2s) at ($(s2.west) + (-0.3,1)$) {\mono{00}};
%      \node (s2ss) at ($(s2.east) + (0.3,1)$) {\mono{1x}};
%      \draw[-{Latex[length=2mm,width=1.4mm]}] ($(s0.west) + (-0.3,1)$) to [out=0,in=120] (s0);
%      \draw[-{Latex[length=2mm,width=1.4mm]}] (s0)
%           to [out=-90,in=150] node[midway,left] {\mono{01}} (s1);
%      \draw[-{Latex[length=2mm,width=1.4mm]}] (s1)
%      to [out=80,in=220] node[midway,left] {\mono{xx}} (s2);
%      \draw[-{Latex[length=2mm,width=1.4mm]}] (s2)
%      to [out=260,in=50] node[midway,right] {\mono{01}} (s1);
%      \draw[-{Latex[length=2mm,width=1.4mm]}] (s2)
%      to [out=120,in=40] ($(s2.west) + (-0.3,0.7)$) to [out=220,in=170] (s2);
%      \draw[-{Latex[length=2mm,width=1.4mm]}] (s2)
%      to [out=60,in=130] ($(s2.east) + (0.3,0.7)$) to [out=310,in=10] (s2);
%    \end{tikzpicture}
%  \end{subfigure}
%  \Description{Verilog code of a state machine, and its equivalent state machine diagram.}
%  \caption{A simple state machine implemented in Verilog, with its diagrammatic representation on the right. The \mono{x} stands for ``don't care'' and each transition is labelled with the values of the inputs \mono{rst} and \mono{y} that trigger the transition.  The output that will be produced is shown in each state.}%
%  \label{fig:tutorial:state_machine}
%\end{figure}

A simple state machine can be implemented as shown in \in{Figure}[fig:tutorial:state_machine].  At
every positive edge of the clock (\mono{clk}), both of the always-blocks will trigger
simultaneously.  The first always-block controls the values in the register \mono{tmp} and the output
\mono{z}, while the second always-block controls the next state the state machine should go to.
When the \mono{state} is 0, \mono{tmp} will be assigned to the input \mono{y} using nonblocking
assignment, denoted by \mono{<=}.  Nonblocking assignment assigns registers in parallel at the end
of the clock cycle, rather than sequentially throughout the always-block. In the second
always-block, the input \mono{y} will be checked, and if it's high it will move on to the next
state, otherwise it will stay in the current state.  When \mono{state} is 1, the first always-block
will reset the value of \mono{tmp} and then set \mono{z} to the original value of \mono{tmp}, since
nonblocking assignment does not change its value until the end of the clock cycle.  Finally, the
last always-block will set the state to 0 again.

%\begin{figure}
%  \centering
%  \begin{subfigure}[b]{0.3\linewidth}
%    \begin{subfigure}[t]{1\linewidth}
%\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
%int main() {
%    int x[2] = {3, 6};
%    int i = 1;
%    return x[i];
%}
%\end{minted}
%      \caption{Example C code passed to Vericert.}\label{fig:accumulator_c}
%    \end{subfigure}\\\vspace{3em}
%    \begin{subfigure}[b]{1\linewidth}
%\begin{minted}[fontsize=\footnotesize,linenos,xleftmargin=20pt]{c}
%main() {
%    x5 = 3
%    int32[stack(0)] = x5
%    x4 = 6
%    int32[stack(4)] = x4
%    x1 = 1
%    x3 = stack(0) (int)
%    x2 = int32[x3 + x1
%               * 4 + 0]
%    return x2
%}
%\end{minted}
%      \caption{3AC produced by the CompCert front-end without any optimisations.}\label{fig:accumulator_rtl}
%    \end{subfigure}
%  \end{subfigure}\hfill%
%  \begin{subfigure}[b]{0.65\linewidth}
%\begin{minted}[fontsize=\tiny,linenos,xleftmargin=20pt]{verilog}
%module main(reset, clk, finish, return_val);
%  input [0:0] reset, clk;
%  output reg [0:0] finish = 0;
%  output reg [31:0] return_val = 0;
%  reg [31:0] reg_3 = 0, addr = 0, d_in = 0, reg_5 = 0, wr_en = 0;
%  reg [0:0] en = 0, u_en = 0;
%  reg [31:0] state = 0, reg_2 = 0, reg_4 = 0, d_out = 0, reg_1 = 0;
%  reg [31:0] stack [1:0];
%  // RAM interface
%  always @(negedge clk)
%    if ({u_en != en}) begin
%      if (wr_en) stack[addr] <= d_in;
%      else d_out <= stack[addr];
%      en <= u_en;
%    end
%  // Data-path
%  always @(posedge clk)
%    case (state)
%      32'd11: reg_2 <= d_out;
%      32'd8: reg_5 <= 32'd3;
%      32'd7: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
%                   d_in <= reg_5; addr <= 32'd0; end
%      32'd6: reg_4 <= 32'd6;
%      32'd5: begin u_en <= ( ~ u_en); wr_en <= 32'd1;
%                   d_in <= reg_4; addr <= 32'd1; end
%      32'd4: reg_1 <= 32'd1;
%      32'd3: reg_3 <= 32'd0;
%      32'd2: begin u_en <= ( ~ u_en); wr_en <= 32'd0;
%                   addr <= {{{reg_3 + 32'd0} + {reg_1 * 32'd4}} / 32'd4}; end
%      32'd1: begin finish = 32'd1; return_val = reg_2; end
%      default: ;
%    endcase
%  // Control logic
%  always @(posedge clk)
%    if ({reset == 32'd1}) state <= 32'd8;
%    else case (state)
%           32'd11: state <= 32'd1;        32'd4: state <= 32'd3;
%           32'd8: state <= 32'd7;         32'd3: state <= 32'd2;
%           32'd7: state <= 32'd6;         32'd2: state <= 32'd11;
%           32'd6: state <= 32'd5;         32'd1: ;
%           32'd5: state <= 32'd4;         default: ;
%         endcase
%endmodule
%\end{minted}
%\caption{Verilog produced by Vericert. It demonstrates the instantiation of the RAM (lines 9--15), the data-path (lines 16--32) and the control logic (lines 33--42).}\label{fig:accumulator_v}
%\end{subfigure}
%\caption{Translating a simple program from C to Verilog.}\label{fig:accumulator_c_rtl}
%% \NR{Is the default in line 41 of (c) supposed to be empty?}\YH{Yep, that's how it's generated.}
%\end{figure}

\subsection{Translating C to Verilog by Example}
% NOTE(review): the accumulator figure is currently commented out above, so
% this reference (and those in the next subsection) will not resolve until it
% is restored.
\in{Figure}[fig:accumulator_c_rtl] illustrates the translation of a simple program that stores and retrieves values from an array.
In this section, we describe the stages of the Vericert translation, referring to this program as an example.

\subsubsection{Translating C to 3AC}

The first stage of the translation uses unmodified CompCert to transform the C input, shown in
\in{Figure}[fig:accumulator_c], into a 3AC intermediate representation, shown in
\in{Figure}[fig:accumulator_rtl].  As part of this translation, function inlining is performed on all
functions, which allows us to support function calls without having to support the \mono{Icall} 3AC
instruction.  Although the duplication of the function bodies caused by inlining can increase the
area of the hardware, it can have a positive effect on latency and is therefore a common HLS
optimisation~\cite[noronha17_rapid_fpga]. Inlining precludes support for recursive function calls,
but this feature is not supported in most HLS tools anyway~\cite[davidthomas_asap16].

%\JW{Is that definitely true? Was discussing this with Nadesh and George recently, and I ended up not being so sure. Inlining could actually lead to \emph{reduced} resource usage because once everything has been inlined, the (big) scheduling problem could then be solved quite optimally. Certainly inlining is known to increase register pressure, but that's not really an issue here. If we're  not sure, we could just say that inlining everything leads to bloated Verilog files and the inability to support recursion, and leave it at that.}\YH{I think that is true, just because we don't do scheduling.  With scheduling I think that's true, inlining actually becomes quite good.}

\subsubsection{Translating 3AC to HTL}

%   + TODO Explain the main mapping in a short simple way

%   + TODO Clarify connection between CFG and FSMD

%   + TODO Explain how memory is mapped
%\JW{I feel like this could use some sort of citation, but I'm not sure what. I guess this is all from "Hardware Design 101", right?}\YH{I think I found a good one actually, which goes over the basics.}
%\JW{I think it would be worth having a sentence to explain how the C model of memory is translated to a hardware-centric model of memory. For instance, in C we have global variables/arrays, stack-allocated variables/arrays, and heap-allocated variables/arrays (anything else?). In Verilog we have registers and RAM blocks. So what's the correspondence between the two worlds? Globals and heap-allocated are not handled, stack-allocated variables become registers, and stack-allocated arrays become RAM blocks? Am I close?}\YH{Stack allocated variables become RAM as well, so that we can deal with addresses easily and take addresses of any variable.} \JW{I see, thanks. So, in short, the only registers in your hardware designs are those that store things like the current state, etc. You generate a fixed number of registers every time you synthesis -- you don't generate extra registers to store any of the program variables. Right?}

The next translation is from 3AC to a new hardware translation language (HTL). %, which is one step towards being completely translated to hardware described in Verilog.
This involves going from a CFG representation of the computation to a finite state machine with data-path (FSMD) representation~\cite{hwang99_fsmd}. The core idea of the FSMD representation is that it separates the control flow from the operations on the memory and registers. %\JP{I've become less comfortable with this term, but it's personal preference so feel free to ignore. I think `generalised finite state machine' (i.e.\ thinking of the entire `data-path' as contributing to the overall state) is more accurate.}\YH{Hmm, yes, I mainly chose FSMD because there is quite a lot of literature around it.  I think for now I'll keep it but for the final draft we could maybe change it.}
%This means that the state transitions can be translated into a simple finite state machine (FSM) where each state contains data operations that update the memory and registers.
Hence, an HTL program consists of two maps from states to Verilog statements: the \emph{control logic} map, which expresses state transitions, and the \emph{data-path} map, which expresses computations.
Fig.~\ref{fig:accumulator_diagram} shows the resulting FSMD architecture. The right-hand block is the control logic that computes the next state, while the left-hand block updates all the registers and RAM based on the current program state.

The HTL language was mainly introduced to simplify the proof of translation from 3AC to Verilog, as these languages have very different semantics.
It serves as an intermediate language with similar semantics to 3AC at the top level, using maps to represent what to execute at every state, and similar semantics to Verilog at the lower level by already using Verilog statements instead of more abstract instructions.
Compared to plain Verilog, HTL is simpler to manipulate and analyse, thereby making it easier to prove optimisations like proper RAM insertion.

%\begin{figure*}
%  \centering
%\definecolor{control}{HTML}{b3e2cd}
%\definecolor{data}{HTML}{fdcdac}
%\begin{tikzpicture}
%  \begin{scope}[scale=1.15]
%  \fill[control,fill opacity=1] (6.5,0) rectangle (12,5);
%  \fill[data,fill opacity=1] (0,0) rectangle (5.5,5);
%  \node at (1,4.7) {Data-path};
%  \node at (7.5,4.7) {Control Logic};
%
%  \fill[white,rounded corners=10pt] (7,0.5) rectangle (11.5,2.2);
%  \node at (8,2) {\footnotesize Next State FSM};
%  \foreach \x in {8,...,2}
%    {\pgfmathtruncatemacro{\y}{8-\x}%
%      \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s\x) at (7.5+\y/2,1.35) {\tiny \x};}
%  \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s1c) at (11,1.35) {\tiny 1};
%  \node[draw,circle,inner sep=0,minimum size=13,scale=0.8] (s1) at (s1c) {};
%  \foreach \x in {8,...,3}
%    {\pgfmathtruncatemacro{\y}{\x-1}\draw[-{Latex[length=1mm,width=0.7mm]}] (s\x) -- (s\y);}
%  \node[draw,circle,inner sep=0,minimum size=10,scale=0.8] (s11) at (10.5,0.9) {\tiny 11};
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (s2) -- (s11);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (s11) -- (s1);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.2,1.7) to [out=0,in=100] (s8);
%
%  \node[draw,fill=white] (nextstate) at (9.25,3) {\tiny Current State};
%  \draw[-{Latex[length=1mm,width=0.7mm]}] let \p1 = (nextstate) in
%    (11.5,1.25) -| (11.75,\y1) -- (nextstate);
%  \draw let \p1 = (nextstate) in (nextstate) -- (6,\y1) |- (6,1.5);
%  \node[scale=0.5,rotate=60] at (7.5,0.75) {\mono{clk}};
%  \node[scale=0.5,rotate=60] at (7.7,0.75) {\mono{rst}};
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.65,-0.5) -- (7.65,0.5);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (7.45,-0.5) -- (7.45,0.5);
%
%  \fill[white,rounded corners=10pt] (2,0.5) rectangle (5,3);
%  \filldraw[fill=white] (0.25,0.5) rectangle (1.5,2.75);
%  \node at (2.6,2.8) {\footnotesize Update};
%  \node[align=center] at (0.875,2.55) {\footnotesize \mono{RAM}};
%  \node[scale=0.5] at (4.7,1.5) {\mono{state}};
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (5,1.5);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (6,1.5) -- (7,1.5);
%  \node[scale=0.5,rotate=60] at (4.1,0.9) {\mono{finished}};
%  \node[scale=0.5,rotate=60] at (3.9,0.95) {\mono{return\_val}};
%  \node[scale=0.5,rotate=60] at (2.5,0.75) {\mono{clk}};
%  \node[scale=0.5,rotate=60] at (2.7,0.75) {\mono{rst}};
%
%  \node[scale=0.5,right,inner sep=5pt] (ram1) at (2,2.1) {\mono{u\_en}};
%  \node[scale=0.5,right,inner sep=5pt] (ram2) at (2,1.9) {\mono{wr\_en}};
%  \node[scale=0.5,right,inner sep=5pt] (ram3) at (2,1.7) {\mono{addr}};
%  \node[scale=0.5,right,inner sep=5pt] (ram4) at (2,1.5) {\mono{d\_in}};
%  \node[scale=0.5,right,inner sep=5pt] (ram5) at (2,1.3) {\mono{d\_out}};
%
%  \node[scale=0.5,left,inner sep=5pt] (r1) at (1.5,2.1) {\mono{u\_en}};
%  \node[scale=0.5,left,inner sep=5pt] (r2) at (1.5,1.9) {\mono{wr\_en}};
%  \node[scale=0.5,left,inner sep=5pt] (r3) at (1.5,1.7) {\mono{addr}};
%  \node[scale=0.5,left,inner sep=5pt] (r4) at (1.5,1.5) {\mono{d\_in}};
%  \node[scale=0.5,left,inner sep=5pt] (r5) at (1.5,1.3) {\mono{d\_out}};
%
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram1) -- (r1);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram2) -- (r2);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram3) -- (r3);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (ram4) -- (r4);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (r5) -- (ram5);
%
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (4,0.5) -- (4,-0.5);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (3.75,0.5) -- (3.75,-0.5);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (2.45,-0.5) -- (2.45,0.5);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (2.65,-0.5) -- (2.65,0.5);
%
%  \foreach \x in {0,...,1}
%  {\draw (0.25,1-0.25*\x) -- (1.5,1-0.25*\x); \node at (0.875,0.88-0.25*\x) {\tiny \x};}
%
%  %\node[scale=0.5] at (1.2,2.2) {\mono{wr\_en}};
%  %\node[scale=0.5] at (1.2,2) {\mono{wr\_addr}};
%  %\node[scale=0.5] at (1.2,1.8) {\mono{wr\_data}};
%  %\node[scale=0.5] at (1.2,1.4) {\mono{r\_addr}};
%  %\node[scale=0.5] at (1.2,1.2) {\mono{r\_data}};
%  %
%  %\node[scale=0.5] at (2.3,2.2) {\mono{wr\_en}};
%  %\node[scale=0.5] at (2.3,2) {\mono{wr\_addr}};
%  %\node[scale=0.5] at (2.3,1.8) {\mono{wr\_data}};
%  %\node[scale=0.5] at (2.3,1.4) {\mono{r\_addr}};
%  %\node[scale=0.5] at (2.3,1.2) {\mono{r\_data}};
%  %
%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.2) -- (1.5,2.2);
%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,2) -- (1.5,2);
%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.8) -- (1.5,1.8);
%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (2,1.4) -- (1.5,1.4);
%  %\draw[-{Latex[length=1mm,width=0.7mm]}] (1.5,1.2) -- (2,1.2);
%
%  \filldraw[fill=white] (2.8,3.25) rectangle (4.2,4.75);
%  \node at (3.5,4.55) {\footnotesize \mono{Registers}};
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (2,2.4) -| (1.75,4) -- (2.8,4);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (4.2,4) -- (5.25,4) |- (5,2.4);
%  \draw[-{Latex[length=1mm,width=0.7mm]}] (5.25,2.4) -- (6.2,2.4) |- (7,1.8);
%
%  \node[scale=0.5] at (3.5,4.2) {\mono{reg\_1}};
%  \node[scale=0.5] at (3.5,4) {\mono{reg\_2}};
%  \node[scale=0.5] at (3.5,3.8) {\mono{reg\_3}};
%  \node[scale=0.5] at (3.5,3.6) {\mono{reg\_4}};
%  \node[scale=0.5] at (3.5,3.4) {\mono{reg\_5}};
%\end{scope}
%\end{tikzpicture}
%  \Description{Diagram displaying the data-path and its internal modules, as well as the control logic and its state machine.}
%  \caption{The FSMD for the example shown in Fig.~\ref{fig:accumulator_c_rtl}, split into a data-path and control logic for the next state calculation.  The Update block takes the current state, current values of all registers and at most one value stored in the RAM, and calculates a new value that can either be stored back in the RAM or in a register.}\label{fig:accumulator_diagram}
%\end{figure*}

%\JP{Does it? Verilog has neither physical registers nor RAMs, just language constructs which the synthesiser might implement with registers and RAMs. We should be clear whether we're talking about the HDL representation, or the synthesised result: in our case these can be very different since we don't target any specific architectural features of an FPGA fabric of ASIC process.}
\paragraph{Translating memory}
Typically, HLS-generated hardware consists of a sea of registers and RAMs.
This memory view is very different from the C memory model, so we perform the following translation from CompCert's abstract memory model to a concrete RAM.
Variables that do not have their address taken are kept in registers, which correspond to the registers in 3AC.
All address-taken variables, arrays, and structs are kept in RAM.
The stack of the main function becomes an unpacked array of 32-bit integers representing the RAM block.  Any loads and stores are temporarily translated to direct accesses to this array, where each address has its offset removed and is divided by four.  In a separate HTL-to-HTL conversion, these direct accesses are then translated to proper loads and stores that use a RAM interface to communicate with the RAM, shown on lines 21, 24 and 28 of Fig.~\ref{fig:accumulator_v}.  This pass inserts a RAM block with the interface around the unpacked array.  Without this interface and without the RAM block, the synthesis tool processing the Verilog hardware description would not identify the array as a RAM, and would instead implement it using many registers.  This interface is shown on lines 9--15 in the Verilog code in Fig.~\ref{fig:accumulator_v}.
A high-level overview of the architecture and of the RAM interface can be seen in Fig.~\ref{fig:accumulator_diagram}.

\paragraph{Translating instructions}

Most 3AC instructions correspond to hardware constructs.
%Each 3AC instruction either corresponds to a hardware construct or does not have to be handled by the translation, such as function calls (because of inlining). \JW{Are function calls the only 3AC instruction that we ignore? (And I guess return statements too for the same reason.)}\YH{Actually, return instructions are translated (because you can return from main whenever), so call instructions (Icall, Ibuiltin and Itailcall) are the only functions that are not handled.}
% JW: Thanks; please check proposed new text.
For example, line 2 in Fig.~\ref{fig:accumulator_rtl} shows a 32-bit register \mono{x5} being initialised to 3, after which the control flow moves execution to line 3. This initialisation is also encoded in the Verilog generated from HTL at state 8 in both the control logic and data-path always-blocks, shown at lines 33 and 16 respectively in Fig.~\ref{fig:accumulator_v}.  Simple operator instructions are translated in a similar way.  For example, the add instruction is just translated to the built-in add operator, similarly for the multiply operator.  All 32-bit instructions can be translated in this way, but some special instructions require extra care. One such instruction is the \mono{Oshrximm} instruction, which is discussed further in Section~\ref{sec:algorithm:optimisation:oshrximm}. Another is the \mono{Oshldimm} instruction, which is a left rotate instruction that has no Verilog equivalent and therefore has to be implemented in terms of other operations and proven to be equivalent.
% In addition to any non-32-bit operations, the remaining
The only 32-bit instructions that we do not translate are case-statements (\mono{Ijumptable}) and those instructions related to function calls (\mono{Icall}, \mono{Ibuiltin}, and \mono{Itailcall}), because we enable inlining by default.

\subsubsection{Translating HTL to Verilog}

Finally, we have to translate the HTL code into proper Verilog. % and prove that it behaves the same as the 3AC according to the Verilog semantics.
The challenge here is to translate our FSMD representation into a Verilog AST.  However, as all the instructions in HTL are already expressed as Verilog statements, only the top-level data-path and control logic maps need to be translated to valid Verilog case-statements.  We also require declarations for all the variables in the program, as well as declarations of the inputs and outputs to the module, so that the module can be used inside a larger hardware design.  In addition to translating the maps of Verilog statements, an always-block that will behave like the RAM also has to be created, which is only modelled abstractly at the HTL level.
Fig.~\ref{fig:accumulator_v} shows the final Verilog output that is generated for our example.

Although this translation seems quite straight\-forward, proving that this translation is correct is complex.
All the implicit assumptions that were made in HTL need to be translated explicitly to Verilog statements and it needs to be shown that these explicit behaviours are equivalent to the assumptions made in the HTL semantics.  One main example of this is proving that the specification of the RAM in HTL does indeed behave in the same way as its Verilog implementation.
We discuss these proofs in upcoming sections.

 %In general, the generated Verilog structure has similar to that of the HTL code.
%The key difference is that the control and datapath maps become Verilog case-statements.
%Other additions are the initialisation of all the variables in the code to the correct bitwidths and the declaration of the inputs and outputs to the module, so that the module can be used inside a larger hardware design.

\subsection{Optimisations}

Although we would not claim that Vericert is a proper `optimising' HLS compiler yet, we have nonetheless made several design choices that aim to improve the quality of the hardware designs it produces.

\subsubsection{Byte- and Word-Addressable Memories}

One big difference between C and Verilog is how memory is represented.  Although Verilog arrays use similar syntax to C arrays, they must be treated quite differently. To make loads and stores as efficient as possible, the RAM needs to be word-addressable, which means that an entire integer can be loaded or stored in one clock cycle.
However, the memory model that CompCert uses for its intermediate languages is byte-addre\-ssa\-ble~\cite{blazy05_formal_verif_memor_model_c}.  If a byte-addressable memory was used in the target hardware, which is closer to CompCert's memory model, then a load and store would instead take four clock cycles, because a RAM can only perform one read and write per clock cycle.  It therefore has to be proven that the byte-addressable memory behaves in the same way as the word-addressable memory in hardware.  Any modifications of the bytes in the CompCert memory model also have to be shown to modify the word-addressable memory in the same way.  Since only integer loads and stores are currently supported in Vericert, it follows that the addresses given to the loads and stores will be multiples of four.  Translating from byte-addressed memory to word-addressed memory can then be done by dividing the address by four.

\subsubsection{Implementation of RAM Interface}\label{sec:algorithm:optimisation:ram}
The simplest way to implement loads and stores in Vericert would be to access the Verilog array directly from within the data-path (i.e., inside the always-block on lines 16--32 of Fig.~\ref{fig:accumulator_v}). This would be correct, but when a Verilog array is accessed at several program points, the synthesis tool is unlikely to detect that it can be implemented as a RAM block, and will resort to using lots of registers instead, ruining the circuit's area and performance.  To avert this, we arrange that the data-path does not access memory directly, but simply sets the address it wishes to access and then toggles the \mono{u\_en} flag. This activates the RAM interface (lines 9--15 of Fig.~\ref{fig:accumulator_v}) on the next falling clock edge, which performs the requested load or store. By factoring all the memory accesses out into a separate interface, we ensure that the underlying array is only accessed from a single program point in the Verilog code, and thus ensure that the synthesis tool will correctly infer a RAM block.\footnote{Interestingly, the Verilog code shown for the RAM interface must not be modified, because the synthesis tool will only generate a RAM when the code matches a small set of specific patterns.}
%\JW{I tweaked this slightly in an attempt to clarify; please check.} %\NR{Bring forward this sentence to help with flow.}

%\JW{I think the following sentence could be cut as we've said this kind of thing a couple of times already.} Without the interface, the array would be implemented using registers, which would increase the size of the hardware considerably.

Therefore, an extra compiler pass is added from HTL to HTL to extract all the direct accesses to the Verilog array and replace them by signals that access the RAM interface in a separate always-block. The translation is performed by going through all the instructions and replacing each load and store expression in turn.  Stores can simply be replaced by the necessary wires directly. Loads are a little more subtle: loads that use the RAM interface take two clock cycles where a direct load from an array takes only one, so this pass inserts an extra state after each load.

%\JW{I've called that negedge always-block the `RAM driver' in my proposed text above -- that feels like quite a nice a word for it to my mind -- what do you think?}\YH{Yes I quite like it!}
%Verilog arrays can be used in a variety of ways, however, these do not all produce optimal hardware designs.  If, for example, arrays in Verilog are accessed immediately in the data-path, then the synthesis tool is not be able to identify it as having the right properties for a RAM, and would instead implement the array using registers.  This is extremely expensive, and for large memories this can easily blow up the area usage of the FPGA, and because of the longer wires that are needed, it would also affect the performance of the circuit.  The synthesis tools therefore provide code snippets that they know how to transform into various constructs, including snippets that will generate proper RAMs in the final hardware.  This process is called memory inference.  The initial translation from 3AC to HTL converts loads and stores to direct accesses to the memory, as this preserves the same behaviour without having to insert more registers and logic.  We therefore have another pass from HTL to itself which performs the translation from this na\"ive use of arrays to a representation which always allows for memory inference.  This pass creates a separate always-block to perform the loads and stores to the memory, and adds the necessary data, address and enable signals to communicate with that always-block from other always-blocks.  This always-block is shown between lines 10-15 in Fig.~\ref{fig:accumulator_v}.

There are two interesting parts to the inserted RAM interface.  Firstly, the memory updates are triggered on the negative (falling) edge of the clock, out of phase with the rest of the design which is triggered on the positive (rising) edge of the clock.  The advantage of this is that instead of loads and stores taking three clock cycles and two clock cycles respectively, they only take two clock cycles and one clock cycle instead, greatly improving their performance. %\JW{Is this a standard `trick' in hardware design? If so it might be nice to cite it.}\YH{Hmm, not really, because it has the downside of kind of halving your available clock period. However, RAMs normally come in both forms on the FPGA (Page 12, Fig. 2, \url{https://www.intel.com/content/dam/www/programmable/us/en/pdfs/literature/ug/ug_ram_rom.pdf})}
% JW: thanks!
Using the negative edge of the clock is widely supported by synthesis tools, and does not affect the maximum frequency of the final design.

Secondly, the logic in the enable signal of the RAM (\mono{en != u\_en}) is also atypical in hardware designs.  Enable signals are normally manually controlled and inserted into the appropriate states, by using a check like the following in the RAM: \mono{en == 1}.  This means that the RAM only turns on when the enable signal is set.  However, to make the proof simpler and avoid reasoning about possible side effects introduced by the RAM being enabled but not used, a RAM which disables itself after every use would be ideal.  One method for implementing this would be to insert an extra state after each load or store that disables the RAM, but this extra state would eliminate the speed advantage of the negative-edge-triggered RAM. Another method would be to determine the next state after each load or store and disable the RAM in that state, but this could quickly become complicated, especially in the case where the next state also contains a memory operation, and hence the disable signal should not be added. The method we ultimately chose was to have the RAM become enabled not when the enable signal is high, but when it \emph{toggles} its value.  This can be arranged by keeping track of the old value of the enable signal in \mono{en} and comparing it to the current value \mono{u\_en} set by the data-path.  When the values are different, the RAM gets enabled, and then \mono{en} is set to the value of \mono{u\_en}. This ensures that the RAM will always be disabled straight after it was used, without having to insert or modify any other states.

%We can instead generate a second enable signal that is set by the user, and the original enable signal is then updated by the RAM to be equal to the value that the user set.  This means that the RAM should be enabled whenever the two signals are different, and disabled otherwise.

%\begin{figure}
%  \centering
%  \begin{subfigure}[b]{0.48\linewidth}
%    \begin{tikztimingtable}[timing/d/background/.style={fill=white}]
%      \small clk & 2L 3{6C} \\
%      \small u\_en & 2D{u\_en} 18D{$\overline{\text{u\_en}}$}\\
%      \small addr & 2U 18D{3} \\
%      \small wr\_en & 2U 18L \\
%      \small en & 8D{u\_en} 12D{$\overline{\text{u\_en}}$}\\
%      \small d\_out & 8U 12D{0xDEADBEEF} \\
%      \small r & 14U 6D{0xDEADBEEF} \\
%      \extracode
%      \node[help lines] at (2,2.25) {\tiny 1};
%      \node[help lines] at (8,2.25) {\tiny 2};
%      \node[help lines] at (14,2.25) {\tiny 3};
%      \begin{pgfonlayer}{background}
%        \vertlines[help lines]{2,8,14}
%      \end{pgfonlayer}
%    \end{tikztimingtable}
%    \caption{Timing diagram for loads. At time 1, the \mono{u\_en} signal is toggled to enable the RAM. At time 2, \mono{d\_out} is set to the value stored at the address in the RAM, which is finally assigned to the register \mono{r} at time 3.}\label{fig:ram_load}
%  \end{subfigure}\hfill%
%  \begin{subfigure}[b]{0.48\linewidth}
%    \begin{tikztimingtable}[timing/d/background/.style={fill=white}]
%      \small clk & 2L 2{7C} \\
%      \small u\_en & 2D{u\_en} 14D{$\overline{\text{u\_en}}$}\\
%      \small addr & 2U 14D{3} \\
%      \small wr\_en & 2U 14H \\
%      \small d\_in & 2U 14D{0xDEADBEEF} \\
%      \small en & 9D{u\_en} 7D{$\overline{\text{u\_en}}$}\\
%      \small stack[addr] & 9U 7D{0xDEADBEEF} \\
%      \extracode
%      \node[help lines] at (2,2.25) {\tiny 1};
%      \node[help lines] at (9,2.25) {\tiny 2};
%      \begin{pgfonlayer}{background}
%        \vertlines[help lines]{2,9}
%      \end{pgfonlayer}
%    \end{tikztimingtable}
%    \caption{Timing diagram for stores. At time 1, the \mono{u\_en} signal is toggled to enable the RAM, and the address \mono{addr} and the data to store \mono{d\_in} are set. On the negative edge at time 2, the data is stored into the RAM.}\label{fig:ram_store}
%  \end{subfigure}
%  \Description{Timing diagrams of loads and stores, showing which signals are modified at which time step.}
%  \caption{Timing diagrams showing the execution of loads and stores over multiple clock cycles.}\label{fig:ram_load_store}
%\end{figure}

Fig.~\ref{fig:ram_load_store} gives an example of how the RAM interface behaves when values are loaded and stored.

\subsubsection{Implementing the \mono{Oshrximm} Instruction}\label{sec:algorithm:optimisation:oshrximm}

% Mention that this optimisation is not performed sometimes (clang -03).

Many of the CompCert instructions map well to hardware, but \mono{Oshrximm} (efficient signed division by a power of two using a logical shift) is expensive if implemented na\"ively. The problem is that in CompCert it is specified as a signed division:
\begin{equation*}
  \mono{Oshrximm } x\ y = \text{round\_towards\_zero}\left(\frac{x}{2^{y}}\right)
\end{equation*}
(where $x, y \in \mathbb{Z}$, $0 \leq y < 31$, and $-2^{31} \leq x < 2^{31}$) and instantiating divider circuits in hardware is well known to cripple performance. Moreover, since Vericert requires the result of a divide operation to be ready within a single clock cycle, the divide circuit needs to be entirely combinational. This is inefficient in terms of area, but also in terms of latency, because it means that the maximum frequency of the hardware must be reduced dramatically so that the divide circuit has enough time to finish.  It should therefore be implemented using a sequence of shifts.

CompCert eventually performs a translation from this representation into assembly code which uses
shifts to implement the division, however, the specification of the instruction in 3AC itself still
uses division instead of shifts, meaning this proof of the translation cannot be reused.  In
Vericert, the equivalence of the representation in terms of divisions and shifts is proven over the
integers and the specification, thereby making it simpler to prove the correctness of the Verilog
implementation in terms of shifts.

\section{A Formal Semantics for Verilog}\label{sec:verilog}

This section describes the Verilog semantics that was chosen for the
target language, including the changes that were made to the semantics
to make it a suitable HLS target. The Verilog standard is quite
large, % NOTE(review): a citation (presumably to the IEEE 1364-2005 Verilog standard) was dropped here during conversion; restore it.
but the syntax and semantics can be reduced to a small subset that
Vericert needs to target. This section also describes how Vericert's
representation of memory differs from CompCert's memory model.

The Verilog semantics we use is ported to Coq from a semantics written
in HOL4 by L\"o\"ow and Myreen and used to prove the translation from
HOL4 to Verilog. % NOTE(review): the citation to L\"o\"ow and Myreen's HOL4 Verilog semantics was dropped during conversion; restore it.
This semantics is quite practical as it is restricted to a small subset
of Verilog, which can nonetheless be used to model the hardware
constructs required for HLS. The main features that are excluded are
continuous assignment and combinational always-blocks; these are
modelled in other semantics such as that by Meredith et
al. % NOTE(review): the citation to Meredith et al.'s Verilog semantics was dropped during conversion; restore it.

The semantics of Verilog differs from regular programming languages, as
it is used to describe hardware directly, which is inherently parallel,
rather than an algorithm, which is usually sequential. The main
construct in Verilog is the always-block. A module can contain multiple
always-blocks, all of which run in parallel. These always-blocks further
contain statements such as if-statements or assignments to variables. We
support only \emph{synchronous} logic, which means that the always-block
is triggered on (and only on) the positive or negative edge of a clock
signal.

The semantics combines the big-step and small-step styles. The overall
execution of the hardware is described using a small-step semantics,
with one small step per clock cycle; this is appropriate because
hardware is routinely designed to run for an unlimited number of clock
cycles and the big-step style is ill-suited to describing infinite
executions. Then, within each clock cycle, a big-step semantics is used
to execute all the statements. An example of a rule for executing an
always-block that is triggered at the positive edge of the clock is
shown below, where $\Sigma$ is the state of the registers in the module
and $s$ is the statement inside the always-block:

\[ \inferrule[Always]{(\Sigma, s)\downarrow_{\text{stmnt}} \Sigma'}{(\Sigma, \yhkeyword{always @(posedge clk) } s) \downarrow_{\text{always}^{+}} \Sigma'} \]

This rule says that assuming the statement $s$ in the always-block runs
with state $\Sigma$ and produces the new state $\Sigma'$, the
always-block will result in the same final state.

Two types of assignments are supported in always-blocks: nonblocking and
blocking assignment. Nonblocking assignments all take effect
simultaneously at the end of the clock cycle, while blocking assignments
happen instantly so that later assignments in the clock cycle can pick
them up. To model both of these assignments, the state $\Sigma$ has to
be split into two maps: $\Gamma$, which contains the current values of
all variables and arrays, and $\Delta$, which contains the values that
will be assigned at the end of the clock cycle. $\Sigma$ can therefore
be defined as follows: $\Sigma = (\Gamma, \Delta)$. Nonblocking
assignment can therefore be expressed as follows:
\[ \inferrule[Nonblocking Reg]{\yhkeyword{name}\ d = \yhkeyword{OK}\ n \\ (\Gamma, e) \downarrow_{\text{expr}} v}{((\Gamma, \Delta), d\ \yhkeyword{ <= } e) \downarrow_{\text{stmnt}} (\Gamma, \Delta [n \mapsto v])} \]

where assuming that $\downarrow_{\text{expr}}$ evaluates an expression
$e$ to a value $v$, the nonblocking assignment $d\ \mono{ <= } e$
updates the future state of the variable $d$ with value $v$.

Finally, the following rule dictates how the whole module runs in one
clock cycle:
\[ \inferrule[Module]{(\Gamma, \epsilon, \vec{m})\ \downarrow_{\text{module}} (\Gamma', \Delta')}{(\Gamma, \yhkeyword{module } \yhconstant{main} \yhkeyword{(...);}\ \vec{m}\ \yhkeyword{endmodule}) \downarrow_{\text{program}} (\Gamma'\ //\ \Delta')} \]
where $\Gamma$ is the initial state of all the variables, $\epsilon$ is
the empty map because the $\Delta$ map is assumed to be empty at the
start of the clock cycle, and $\vec{m}$ is a list of variable
declarations and always-blocks that $\downarrow_{\text{module}}$
evaluates sequentially to obtain $(\Gamma', \Delta')$. The final state
is obtained by merging these maps using the $//$ operator, which gives
priority to the right-hand operand in a conflict. This rule ensures that
the nonblocking assignments overwrite at the end of the clock cycle any
blocking assignments made during the cycle.

\subsection{Changes to the Semantics}\label{changes-to-the-semantics}

Five changes were made to the semantics proposed by L\"o\"ow and Myreen
to make it suitable as an HLS target.

\subsubsection{Adding array support}\label{adding-array-support}

The main change is the addition of support for arrays, which are needed
to model RAM in Verilog. RAM is needed to model the stack in C
efficiently, without having to declare a variable for each possible
stack location. Consider the following Verilog code:

\starthlverilog
reg [31:0] x[1:0];
always @(posedge clk) begin x[0] = 1; x[1] <= 1; end
\stophlverilog

which modifies one array element using blocking assignment and then a
second using nonblocking assignment. If the existing semantics were used
to update the array, then during the merge, the entire array \type{x}
from the nonblocking association map would replace the entire array from
the blocking association map. This would replace \type{x[0]} with its
original value and therefore behave incorrectly. Accordingly, we
modified the maps so they record updates on a per-element basis. Our
state $\Gamma$ is therefore further split up into $\Gamma_{r}$ for
instantaneous updates to variables, and $\Gamma_{a}$ for instantaneous
updates to arrays ($\Gamma = (\Gamma_{r}, \Gamma_{a})$); $\Delta$ is
split similarly ($\Delta = (\Delta_{r}, \Delta_{a})$). The merge
function then ensures that only the modified indices get updated when
$\Gamma_{a}$ is merged with the nonblocking map equivalent $\Delta_{a}$.

\subsubsection[title={Adding negative edge
support},reference={adding-negative-edge-support}]

To reason about circuits that execute on the negative edge of the clock
(such as our RAM interface described in
Section~\goto{{[}sec:algorithm:optimisation:ram{]}}[sec:algorithm:optimisation:ram]),
support for negative-edge-triggered always-blocks was added to the
semantics. This is shown in the modifications of the {\sc Module} rule
shown below:

%\startformula \inferrule[Module]{(\Gamma, \epsilon, \vec{m})\ \downarrow_{\text{module}^{+}} (\Gamma', \Delta') \\ (\Gamma'\ //\ \Delta', \epsilon, \vec{m}) \downarrow_{\text{module}^{-}} (\Gamma'', \Delta'')}{(\Gamma, \yhkeyword{module}\ \yhconstant{main} \yhkeyword{(...);}\ \vec{m}\ \yhkeyword{endmodule}) \downarrow_{\text{program}} (\Gamma''\ //\ \Delta'')} \stopformula

The main execution of the module $\downarrow_{\text{module}}$ is split
into $\downarrow_{\text{module}^{+}}$ and
$\downarrow_{\text{module}^{-}}$, which are rules that only execute
always-blocks triggered at the positive and at the negative edge
respectively. The positive-edge-triggered always-blocks are processed in
the same way as in the original {\sc Module} rule. The output maps
$\Gamma'$ and $\Delta'$ are then merged and passed as the blocking
assignments map into the negative edge execution, so that all the
blocking and nonblocking assignments are present. Finally, all the
negative-edge-triggered always-blocks are processed and merged to give
the final state.

\subsubsection[adding-declarations]{Adding declarations}

Explicit support for declaring inputs, outputs and internal variables
was added to the semantics to make sure that the generated Verilog also
contains the correct declarations. This adds some guarantees to the
generated Verilog and ensures that it synthesises and simulates
correctly.

\subsubsection[removing-support-for-external-inputs-to-modules]{Removing support for external inputs
  to modules}

Support for receiving external inputs was removed from the semantics for
simplicity, as these are not needed for an HLS target. The main module
in Verilog models the main function in C, and since the inputs to a C
function should not change during its execution, there is no need for
external inputs for Verilog modules.

\subsubsection[simplifying-representation-of-bitvectors]{Simplifying representation of bitvectors}

Finally, we use 32-bit integers to represent bitvectors rather than
arrays of booleans. This is because our tool currently only supports
types represented by 32 bits.

\subsection[sec:verilog:integrating]{Integrating the Verilog Semantics into CompCert's Model}

%\startformula \begin{gathered}
%      \inferrule[Step]{\Gamma_r[\mathit{rst}] = 0 \\ \Gamma_r[\mathit{fin}] = 0 \\ (m, (\Gamma_r, \Gamma_a))\ \downarrow_{\text{program}} (\Gamma_r', \Gamma_a')}{\yhconstant{State}\ \mathit{sf}\ m\ \ \Gamma_r[\sigma]\ \ \Gamma_r\ \Gamma_a \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ \ \Gamma_r'[\sigma]\ \ \Gamma_r'\ \Gamma_a'}\\
%      %
%      \inferrule[Finish]{\Gamma_r[\mathit{fin}] = 1}{\yhconstant{State}\ \mathit{sf}\ m\ \sigma\ \Gamma_r\ \Gamma_a \longrightarrow \yhconstant{Returnstate}\ \mathit{sf}\ \Gamma_r[ \mathit{ret}]}\\
%      %
%      \inferrule[Call]{ }{\yhconstant{Callstate}\ \mathit{sf}\ m\ \vec{r} \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ n\ ((\yhfunction{init\_params}\ \vec{r}\ a)[\sigma \mapsto n, \mathit{fin} \mapsto 0, \mathit{rst} \mapsto 0])\ \epsilon}\\
%      %
%      \inferrule[Return]{ }{\yhconstant{Returnstate}\ (\yhconstant{Stackframe}\ r\ m\ \mathit{pc}\ \Gamma_r\ \Gamma_a :: \mathit{sf})\ v \longrightarrow \yhconstant{State}\ \mathit{sf}\ m\ \mathit{pc}\ (\Gamma_{r} [ \sigma \mapsto \mathit{pc}, r \mapsto v ])\ \Gamma_{a}}
%    \end{gathered} \stopformula

The CompCert computation model defines a set of states through which
execution passes. In this subsection, we explain how we extend our
Verilog semantics with four special-purpose registers in order to
integrate it into CompCert.

Executions in this model pass through three main states:

%\startdescription{\type{State} $\mathit{sf}$ $m$ $v$ $\Gamma_{r}$
%$\Gamma_{a}$}
%  The main state when executing a function, with stack frame
%  $\mathit{sf}$, current module $m$, current state $v$ and variable
%  states $\Gamma_{r}$ and $\Gamma_{a}$.
%\stopdescription
%
%\startdescription{\type{Callstate} $\mathit{sf}$ $m$ $\vec{r}$}
%  The state that is reached when a function is called, with the current
%  stack frame $\mathit{sf}$, current module $m$ and arguments $\vec{r}$.
%\stopdescription
%
%\startdescription{\type{Returnstate} $\mathit{sf}$ $v$}
%  The state that is reached when a function returns back to the caller,
%  with stack frame $\mathit{sf}$ and return value $v$.
%\stopdescription
%
%To support this computational model, we extend the Verilog module we
%generate with the following four registers and a RAM block:
%
%\startdescription{program counter}
%  The program counter can be modelled using a register that keeps track
%  of the state, denoted as $\sigma$.
%\stopdescription
%
%\startdescription{function entry point}
%  When a function is called, the entry point denotes the first
%  instruction that will be executed. This can be modelled using a reset
%  signal that sets the state accordingly, denoted as $\mathit{rst}$.
%\stopdescription
%
%\startdescription{return value}
%  The return value can be modelled by setting a finished flag to 1 when
%  the result is ready, and putting the result into a 32-bit output
%  register. These are denoted as $\mathit{fin}$ and $\mathit{ret}$
%  respectively.
%\stopdescription
%
%\startdescription{stack}
%  The function stack can be modelled as a RAM block, which is
%  implemented using an array in the module, and denoted as
%  $\mathit{stk}$.
%\stopdescription

Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module] shows the
inference rules for moving between the computational states. The first,
{\sc Step}, is the normal rule of execution. It defines one step in the
\type{State} state, assuming that the module is not being reset, that
the finish state has not been reached yet, that the current and next
state are $v$ and $v'$, and that the module runs from state $\Gamma$ to
$\Gamma'$ using the {\sc Step} rule. The {\sc Finish} rule returns the
final value of running the module and is applied when the $\mathit{fin}$
register is set; the return value is then taken from the $\mathit{ret}$
register.

Note that there is no step from \type{State} to \type{Callstate}; this
is because function calls are not supported, and it is therefore
impossible in our semantics ever to reach a \type{Callstate} except for
the initial call to main. So the {\sc Call} rule is only used at the
very beginning of execution; likewise, the {\sc Return} rule is only
matched for the final return value from the main function. Therefore, in
addition to the rules shown in
Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module], an initial
state and final state need to be defined:

%\startformula \begin{gathered}
%  \inferrule[Initial]{\yhfunction{is\_internal}\ P.\mono{main}}{\yhfunction{initial\_state}\ (\yhconstant{Callstate}\ []\ P.\mono{main}\ [])}\qquad
%  \inferrule[Final]{ }{\yhfunction{final\_state}\ (\yhconstant{Returnstate}\ []\ n)\ n}\end{gathered} \stopformula

where the initial state is the \type{Callstate} with an empty stack
frame and no arguments for the \type{main} function of program $P$,
where this \type{main} function needs to be in the current translation
unit. The final state results in the program output of value $n$ when
reaching a \type{Returnstate} with an empty stack frame.

\subsection[title={Memory Model},reference={sec:verilog:memory}]

The Verilog semantics do not define a memory model for Verilog, as this
is not needed for a hardware description language. There is no
preexisting architecture that Verilog will produce; it can describe any
memory layout that is needed. Instead of having specific semantics for
memory, the semantics only needs to support the language features that
can produce these different memory layouts, these being Verilog arrays.
We therefore define semantics for updating Verilog arrays using blocking
and nonblocking assignment. We then have to prove that the C memory
model that CompCert uses matches the interpretation of arrays used in
Verilog. The CompCert memory model is infinite, whereas our representation of
arrays in Verilog is inherently finite. There have already been efforts
to define a general finite memory model for all intermediate languages
in CompCert, such as CompCertS~ or CompCertTSO~, or keeping the intermediate
languages intact and translating to a more concrete finite memory model
in the back end, such as in CompCertELF~. We also define such a translation
from CompCert's standard
infinite memory model to finite arrays that can be represented in
Verilog. There is therefore no more notion of an abstract memory model
and all the interactions to memory are encoded in the hardware itself.

This translation is represented in
Fig.~\goto{{[}fig:memory_model_transl{]}}[fig:memory_model_transl].
CompCert's memory model defines a map from blocks to maps from memory addresses to memory
contents. Each block represents an area in memory; for example, a block
can represent a global variable or a stack for a function. As there are
no global variables, the main stack can be assumed to be block 0, and
this is the only block we translate. Meanwhile, our Verilog semantics
defines two finite arrays of optional values, one for the blocking
assignments map $\Gamma_{\rm a}$ and one for the nonblocking assignments
map $\Delta_{\rm a}$. The optional values are present to ensure correct
merging of the two association maps at the end of the clock cycle. The
invariant used in the proofs is that block 0 should be equivalent to the
merged representation of the $\Gamma_{\rm a}$ and $\Delta_{\rm a}$ maps.

\section[sec:proof]{Correctness Proof}

Now that the Verilog semantics have been adapted to the CompCert model,
we are in a position to formally prove the correctness of our
C-to-Verilog compilation. This section describes the main correctness
theorem that was proven and the key ideas in the proof. The full Coq
proof is available online~.

\subsection[main-challenges-in-the-proof]{Main Challenges in the Proof}

The proof of correctness of the Verilog back end is quite different from
the usual proofs performed in CompCert, mainly because of the difference
in the memory model and semantic differences between Verilog and
CompCert's existing intermediate languages.

\startitemize
\item
  As already mentioned in
  Section~\goto{{[}sec:verilog:memory{]}}[sec:verilog:memory], because
  the memory model in our Verilog semantics is finite and concrete, but
  the CompCert memory model is more abstract and infinite with
  additional bounds, the equivalence of these models needs to be proven.
  Moreover, our memory is word-addressed for efficiency reasons, whereas
  CompCert's memory is byte-addressed.
\item
  Second, the Verilog semantics operates quite differently to the usual
  intermediate languages in CompCert. All the CompCert intermediate
  languages use a map from control-flow nodes to instructions. An
  instruction can therefore be selected using an abstract program
  pointer. Meanwhile, in the Verilog semantics the whole design is
  executed at every clock cycle, because hardware is inherently
  parallel. The program pointer is part of the design as well, not just
  part of an abstract state. This makes the semantics of Verilog
  simpler, but comparing it to the semantics of 3AC becomes more
  challenging, as one has to map the abstract notion of the state to
  concrete values in registers.
\stopitemize

Together, these differences mean that translating 3AC directly to
Verilog is infeasible, as the differences in the semantics are too
large. Instead, HTL, which was introduced in
Section~\goto{{[}sec:design{]}}[sec:design], bridges the gap in the
semantics between the two languages. HTL still consists of maps, like
many of the other CompCert languages, but each state corresponds to a
Verilog statement.

\subsection[formulating-the-correctness-theorem]{Formulating the Correctness Theorem}

The main correctness theorem is analogous to that stated in CompCert~: for all
Clight source programs $C$, if the translation to the target Verilog
code succeeds, $\mathit{Safe}(C)$ holds and the target Verilog has
behaviour $B$ when simulated, then $C$ will have the same behaviour $B$.
$\mathit{Safe}(C)$ means all observable behaviours of $C$ are safe,
which can be defined as
$\forall B,\ C \Downarrow B \implies B \in \mono{Safe}$. A behaviour
is in \type{Safe} if it is either a final state (in the case of
convergence) or divergent, but it cannot \quote{go wrong}. (This means
that the source program must not contain undefined behaviour.) In
CompCert, a behaviour is also associated with a trace of I/O events, but
since external function calls are not supported in our tool, this trace
will always be empty.

Whenever the translation from $C$ succeeds and produces Verilog $V$, and
all observable behaviours of $C$ are safe, then $V$ has behaviour $B$
only if $C$ has behaviour $B$.
%\startformula \forall C, V, B,\quad \yhfunction{HLS} (C) = \yhconstant{OK} (V) \land \mathit{Safe}(C) \implies (V \Downarrow B \implies C \Downarrow B). \stopformula

Why is this correctness theorem also the right one for HLS? It could be
argued that hardware inherently runs forever and therefore does not
produce a definitive final result. This would mean that the correctness
theorem would probably be unhelpful with proving hardware correctness,
as the behaviour would always be divergent. However, in practice, HLS
does not normally produce the top-level of the design that needs to
connect to other components, therefore needing to run forever. Rather,
HLS often produces smaller components that take an input, execute, and
then terminate with an answer. To start the execution of the hardware
and to signal to the HLS component that the inputs are ready, the
$\mathit{rst}$ signal is set and unset. Then, once the result is ready,
the $\mathit{fin}$ signal is set and the result value is placed in
$\mathit{ret}$. These signals are also present in the semantics of
execution shown in
Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module]. The
correctness theorem therefore also uses these signals, and the proof
shows that once the $\mathit{fin}$ flag is set, the value in
$\mathit{ret}$ is correct according to the semantics of Verilog and
Clight. Note that the compiler is allowed to fail and not produce any
output; the correctness theorem only applies when the translation
succeeds.

How can we prove this theorem? First, note that the theorem is a
\quote{backwards simulation} result (every target behaviour must also be
a source behaviour), following the terminology used in the literature~.
The reverse direction (every source behaviour must also be a target
behaviour) is not demanded because compilers are permitted to resolve
any non-determinism present in their source programs. However, since
Clight programs are all deterministic, as are the Verilog programs in
the fragment we consider, we can actually reformulate the correctness
theorem above as a forwards simulation result (following standard
practice), which makes it easier to prove. To prove this forward
simulation, it suffices to prove forward simulations between each pair
of consecutive intermediate languages, as these results can be composed
to prove the correctness of the whole HLS tool. The forward simulation
from 3AC to HTL is stated in Lemma~\goto{{[}lemma:htl{]}}[lemma:htl]
(Section~\goto{1.3}[sec:proof:3ac_htl]), the forward simulation for the
RAM insertion is shown in
Lemma~\goto{{[}lemma:htl_ram{]}}[lemma:htl_ram]
(Section~\goto{1.4}[sec:proof:ram_insertion]), then the forward
simulation between HTL and Verilog is shown in
Lemma~\goto{{[}lemma:verilog{]}}[lemma:verilog]
(Section~\goto{1.5}[sec:proof:htl_verilog]), and finally, the proof that
Verilog is deterministic is given in
Lemma~\goto{{[}lemma:deterministic{]}}[lemma:deterministic]
(Section~\goto{1.6}[sec:proof:deterministic]).

\subsection[sec:proof:3ac_htl]{Forward Simulation from 3AC to HTL}

As HTL is quite far removed from 3AC, this first translation is the most
involved and therefore requires a larger proof, because the translation
from 3AC instructions to Verilog statements needs to be proven correct
in this step. In addition to that, the semantics of HTL are also quite
different to the 3AC semantics. Instead of defining small-step semantics
for each construct in Verilog, the semantics are defined over one clock
cycle and mirror the semantics defined for Verilog.
Lemma~\goto{{[}lemma:htl{]}}[lemma:htl] shows the result that needs to
be proven in this subsection.

\reference[lemma:htl]{} Writing \type{tr_htl} for the translation from
3AC to HTL, we have:
%\startformula \forall c, h, B \in \mono{Safe},\quad \yhfunction{tr\_htl} (c) = \yhconstant{OK} (h) \land c \Downarrow B \implies h \Downarrow B. \stopformula

{\em Proof sketch.} We prove this lemma by first establishing a
specification of the translation function $\mono{tr\_htl}$ that
captures its important properties, and then splitting the proof into two
parts: one to show that the translation function does indeed meet its
specification, and one to show that the specification implies the
desired simulation result. This strategy is in keeping with standard
practice.~◻

\subsubsection[sec:proof:3ac_htl:specification]{From Implementation to Specification}

The specification for the translation of 3AC instructions into HTL
data-path and control logic can be defined by the following predicate:
%\startformula \yhfunction{spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ i\ \mathit{data}\ \mathit{control} \stopformula

Here, the $\mathit{control}$ and $\mathit{data}$ parameters are the
statements that the current 3AC instruction $i$ should translate to. The
other parameters are the special registers defined in
Section~\goto{{[}sec:verilog:integrating{]}}[sec:verilog:integrating].
An example of a rule describing the translation of an arithmetic/logical
operation from 3AC is the following:
%\startformula \inferrule[Iop]{\yhfunction{tr\_op}\ \mathit{op}\ \vec{a} = \yhconstant{OK}\ e}{\yhfunction{spec\_instr}\ \mathit{fin}\ \mathit{ret}\ \sigma\ \mathit{stk}\ (\yhconstant{Iop}\ \mathit{op}\ \vec{a}\ d\ n)\ (d\ \yhkeyword{<=}\ e)\ (\sigma\ \yhkeyword{<=}\ n)} \stopformula

Assuming that the translation of the operator $\mathit{op}$ with
operands $\vec{a}$ is successful and results in expression $e$, the rule
describes how the destination register $d$ is updated to $e$ via a
non-blocking assignment in the data path, and how the program counter
$\sigma$ is updated to point to the next CFG node $n$ via another
non-blocking assignment in the control logic.

In the following lemma, $\mono{spec\_htl}$ is the top-level
specification predicate, which is built using $\mono{spec\_instr}$
at the level of instructions.

\reference[lemma:specification]{} If a 3AC program $c$ is translated
correctly to an HTL program $h$, then the specification of the
translation holds.
%\startformula \forall c, h,\quad \yhfunction{tr\_htl} (c) = \yhconstant{OK}(h) \implies \yhfunction{spec\_htl}\ c\ h. \stopformula

\subsubsection[from-specification-to-simulation]{From Specification to Simulation}

To prove that the specification predicate implies the desired forward
simulation, we must first define a relation that matches each 3AC state
to an equivalent HTL state. This relation also captures the assumptions
made about the 3AC code that we receive from CompCert. These assumptions then
have to be proven to always hold assuming the HTL code was created by
the translation algorithm. Some of the assumptions that need to be made
about the 3AC and HTL code for a pair of states to match are:

\startitemize
\item
  The 3AC register file $R$ needs to be \quote{less defined} than the
  HTL register map $\Gamma_{r}$ (written $R \le \Gamma_{r}$). This means
  that all entries should be equal to each other, unless a value in $R$
  is undefined, in which case any value can match it.
\item
  The RAM values represented by each Verilog array in $\Gamma_{a}$ need
  to match the 3AC function's stack contents, which are part of the
  memory $M$; that is: $M \le \Gamma_{a}$.
\item
  The state is well formed, which means that the value of the state
  register matches the current value of the program counter; that is:
  $\mathit{pc} = \Gamma_{r}[\sigma]$.
\stopitemize

We also define the following set $\mathcal{I}$ of invariants that must
hold for the current state to be valid:

\startitemize
\item
  that all pointers in the program use the stack as a base pointer,
\item
  that any loads or stores to locations outside of the bounds of the
  stack result in undefined behaviour (and hence we do not need to
  handle them),
\item
  that $\mathit{rst}$ and $\mathit{fin}$ are not modified and therefore
  stay at a constant 0 throughout execution, and
\item
  that the stack frames match.
\stopitemize

We can now define the simulation diagram for the translation. The 3AC
state can be represented by the tuple $(R,M,\mathit{pc})$, which
captures the register file, memory, and program counter. The HTL state
can be represented by the pair $(\Gamma_{r}, \Gamma_{a})$, which
captures the states of all the registers and arrays in the module.
Finally, $\mathcal{I}$ stands for the other invariants that need to hold
for the states to match.

\reference[lemma:simulation_diagram]{} Given the 3AC state
$(R,M,\mathit{pc})$ and the matching HTL state
$(\Gamma_{r}, \Gamma_{a})$, assuming one step in the 3AC semantics
produces state $(R',M',\mathit{pc}')$, there exist one or more steps in
the HTL semantics that result in matching states
$(\Gamma_{r}', \Gamma_{a}')$. This is all under the assumption that the
specification $\mono{spec\_{htl}}$ holds for the translation.

{\em Proof sketch.} This simulation diagram is proven by induction over
the operational semantics of 3AC, which allows us to find one or more
steps in the HTL semantics that will produce the same final matching
state.~◻

\subsection[sec:proof:ram_insertion]{Forward Simulation of RAM Insertion}

%\startformula \begin{gathered}
%      \inferrule[Idle]{\Gamma_{\rm r}[\mathit{r.en}] = \Gamma_{\rm r}[\mathit{r.u_{en}}]}{((\Gamma_{\rm r}, \Gamma_{\rm a}), \Delta, r) \downarrow_{\text{ram}} \Delta}\\
%%
%      \inferrule[Load]{\Gamma_{\rm r}[\mathit{r.en}] \ne \Gamma_{\rm r}[\mathit{r.u_{en}}] \\ \Gamma_{\rm r}[\mathit{r.wr_{en}}] = 0}{((\Gamma_{\rm r}, \Gamma_{\rm a}), (\Delta_{\rm r}, \Delta_{\rm a}), r) \downarrow_{\text{ram}} (\Delta_{\rm r}[\mathit{r.en} \mapsto \mathit{r.u_{en}}, \mathit{r.d_{out}} \mapsto (\Gamma_{\rm a}[\mathit{r.mem}])[ \mathit{r.addr}]], \Delta_{\rm a}) }\\
%%
%      \inferrule[Store]{\Gamma_{\rm r}[\mathit{r.en}] \ne \Gamma_{\rm r}[\mathit{r.u_{en}}] \\ \Gamma_{\rm r}[\mathit{r.wr_{en}}] = 1}{((\Gamma_{\rm r}, \Gamma_{\rm a}), (\Delta_{\rm r}, \Delta_{\rm a}), r) \downarrow_{\text{ram}} (\Delta_{\rm r}[\mathit{r.en} \mapsto \mathit{r.u\_en}], \Delta_{\rm a}[\mathit{r.mem} \mapsto (\Gamma_{\rm a}[ \mathit{r.mem}])[\mathit{r.addr} \mapsto \mathit{r.d_{in}}]]) }
%    \end{gathered} \stopformula

HTL can only represent a single state machine, so we must model the RAM
abstractly to reason about the correctness of replacing the direct read
and writes to the array by loads and stores to a RAM. The specification
for the RAM is shown in
Fig.~\goto{{[}fig:htl_ram_spec{]}}[fig:htl_ram_spec], which defines how
the RAM $r$ will behave for all the possible combinations of the input
signals.

\subsubsection[from-implementation-to-specification]{From Implementation to Specification}

The first step in proving the simulation correct is to build a
specification of the translation algorithm. There are three
possibilities for the transformation of an instruction. For each Verilog
statement in the map at location $i$, the statement is either a load, a
store, or neither. The load or store is translated to the equivalent
representation using the RAM specification and all other instructions
are left intact. An example of the specification for the translation of
the store instruction is shown below, where $\sigma$ is the state register,
$r$ is the RAM, $d$ and $c$ are the input data-path and control logic
maps, and $i$ is the current state. ($n$ is the newly inserted state,
which only applies to the translation of loads.)

%\startformula \begin{gathered}
%  \inferrule[Store Spec]{ d[i] = (r.mem\mono{[}e_{1}\mono{]} \mono{ <= } e_{2}) \\ t = (r.u\_en \mono{ <= } \neg r.u\_en; r.wr\_en \mono{ <= } 1; r.d\_in \mono{ <= } e_{2}; r.addr \mono{ <= } e_{1})}%
%  {\yhfunction{spec\_ram\_tr}\ \sigma\ r\ d\ c\ d[i \mapsto t]\ c\ i\ n}\end{gathered} \stopformula

A similar specification is created for the load. We then also prove
that the implementation of the translation satisfies this
specification.

\subsubsection[from-specification-to-simulation-1]{From Specification to Simulation}

Another simulation proof is performed to prove that the insertion of the
RAM is semantics preserving. As in
Lemma~\goto{{[}lemma:simulation_diagram{]}}[lemma:simulation_diagram],
we require some invariants that always hold at the start and end of the
simulation. The invariants needed for the simulation of the RAM
insertion are quite different to the previous ones, so we can define
these invariants $\mathcal{I}_{r}$ to be the following:

\startitemize
\item
  The association map for arrays $\Gamma_{a}$ always needs to have the
  same arrays present, and these arrays should never change in size.
\item
  The RAM should always be disabled at the start and the end of each
  simulation step. (This is why self-disabling RAM is needed.)
\stopitemize

The other invariants and assumptions for defining two matching states in
HTL are quite similar to the simulation performed in
Lemma~\goto{{[}lemma:simulation_diagram{]}}[lemma:simulation_diagram],
such as ensuring that the states have the same value, and that the
values in the registers are less defined. In particular, the less
defined relation matches up all the registers, except for the new
registers introduced by the RAM.

\reference[lemma:htl_ram]{} Given an HTL program, the forward-simulation
relation should hold after inserting the RAM and wiring the load, store,
and control signals.

%\startformula \begin{aligned}
%    \forall h, h', B \in \mono{Safe},\quad \yhfunction{tr\_ram\_ins}(h) = h' \land h \Downarrow B \implies h' \Downarrow B.
%  \end{aligned} \stopformula

\subsection[sec:proof:htl_verilog]{Forward Simulation from HTL to Verilog}

The HTL-to-Verilog simulation is conceptually simple, as the only
transformation is from the map representation of the code to the
case-statement representation. The proof is more involved, as the
semantics of a map structure is quite different to that of the
case-statement to which it is converted.

\reference[lemma:verilog]{} In the following, we write
$\mono{tr\_verilog}$ for the translation from HTL to Verilog.
(Note that this translation cannot fail, so we do not need the
\yhconstant{OK} constructor here.)

%\startformula \begin{aligned}
    %\forall h, V, B \in \mono{Safe},\quad \yhfunction{tr\_verilog} (h) = V \land h \Downarrow B \implies V \Downarrow B.
  %\end{aligned} \stopformula

{\em Proof sketch.} The translation from maps to case-statements is done
by turning each node of the tree into a case-expression containing the
same statements. The main difficulty is that a random-access structure
is being transformed into an inductive structure where a certain number
of constructors need to be called to get to the correct case.~◻

\subsection[sec:proof:deterministic]{Deterministic Verilog Semantics}

The final lemma we need is that the Verilog semantics is deterministic.
This result allows us to replace the forwards simulation we have proved
with the backwards simulation we desire.

\reference[lemma:deterministic]{} If a Verilog program $V$ admits
behaviours $B_1$ and $B_2$, then $B_1$ and $B_2$ must be the same.

%\startformula \forall V, B_{1}, B_{2},\quad V \Downarrow B_{1} \land V \Downarrow B_{2} \implies B_{1} = B_{2}. \stopformula

{\em Proof sketch.} The Verilog semantics is deterministic because the
order of operation of all the constructs is defined, so there is only
one way to evaluate the module, and hence only one possible behaviour.
This was proven for the small-step semantics shown in
Fig.~\goto{{[}fig:inference_module{]}}[fig:inference_module].~◻

\subsection[coq-mechanisation]{Coq Mechanisation}

% column layout: lrrrrr
& {\bf Coq code} & {\bf OCaml code} & {\bf Spec} & {\bf Proofs} & {\bf Total}\crlf
Data structures and libraries & 280 & --- & --- & 500 & 780\crlf
Integers and values & 98 & --- & 15 & 968 & 1081\crlf
HTL semantics & 50 & --- & 181 & 65 & 296\crlf
HTL generation & 590 & --- & 66 & 3069 & 3725\crlf
RAM generation & 253 & --- & --- & 2793 & 3046\crlf
Verilog semantics & 78 & --- & 431 & 235 & 744\crlf
Verilog generation & 104 & --- & --- & 486 & 590\crlf
Top-level driver, pretty printers & 318 & 775 & --- & 189 & 1282\crlf
{\bf Total} & 1721 & 775 & 693 & 8355 & 11544\crlf

The lines of code for the implementation and proof of our tool can be found in
Table~\goto{{[}tab:proof_statistics{]}}[tab:proof_statistics]. Overall,
it took about 1.5 person-years to build -- about three person-months on
implementation and 15 person-months on proofs. The largest proof is the
correctness proof for the HTL generation, which required equivalence
proofs between all integer operations supported by CompCert and those
supported in hardware. From the 3069 lines of proof code in the HTL generation,
1189 are for the correctness proof of just the load and store
instructions. These were tedious to prove correct because of the
substantial difference between the memory models used, and the need to
prove properties such as stores outside of the allocated memory being
undefined, so that a finite array could be used. In addition to that,
since pointers in HTL and Verilog are represented as integers, instead
of as a separate \quote{pointer} type like in the CompCert semantics, it was
painful to reason about them. Many new theorems had to be proven about
them in order to prove the conversion from pointer to integer. Moreover, the
second-largest proof of the correct RAM generation includes many proofs
about the extensional equality of array operations, such as merging
arrays with different assignments. As the negative edge implies that two
merges take place every clock cycle, the proofs about the equality of
the contents in memory and in the merged arrays become more tedious too.

Looking at the trusted computing base of our tool, the Verilog semantics is 431
lines of code. This and the Clight semantics from CompCert are the only parts of
the compiler that need to be trusted. The Verilog semantics
specification is therefore much smaller compared to the 1721 lines of
the implementation that are written in Coq, which are the verified parts
of the HLS tool, even when the Clight semantics are added. In addition
to that, understanding the semantics specification is simpler than
trying to understand the translation algorithm. We conclude that the
trusted base has been successfully reduced.

\stopcomponent