From 7cdd676d002e33015b496f609538a9e86d77c543 Mon Sep 17 00:00:00 2001
From: Xavier Leroy
Date: Thu, 8 Aug 2019 11:18:38 +0200
Subject: AArch64 port

This commit adds a back-end for the AArch64 architecture, namely ARMv8
in 64-bit mode.
---
 aarch64/Asmexpand.ml | 436 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 436 insertions(+)
 create mode 100644 aarch64/Asmexpand.ml

diff --git a/aarch64/Asmexpand.ml b/aarch64/Asmexpand.ml
new file mode 100644
index 00000000..71bd0042
--- /dev/null
+++ b/aarch64/Asmexpand.ml
@@ -0,0 +1,436 @@
(* *********************************************************************)
(*                                                                     *)
(*              The Compcert verified compiler                         *)
(*                                                                     *)
(*          Xavier Leroy, Collège de France and INRIA Paris            *)
(*                                                                     *)
(*  Copyright Institut National de Recherche en Informatique et en     *)
(*  Automatique.  All rights reserved.  This file is distributed       *)
(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
(*                                                                     *)
(* *********************************************************************)

(* Expanding built-ins and some pseudo-instructions by rewriting
   of the AArch64 assembly code. *)

open Asm
open Asmexpandaux
open AST
open Camlcoq
module Ptrofs = Integers.Ptrofs

exception Error of string

(* Useful constants *)

let _0 = Z.zero
let _1 = Z.one
let _2 = Z.of_sint 2
let _4 = Z.of_sint 4
let _8 = Z.of_sint 8
let _16 = Z.of_sint 16
let _m1 = Z.of_sint (-1)

(* Emit instruction sequences that set or offset a register by a constant. *)

let expand_loadimm32 (dst: ireg) n =
  List.iter emit (Asmgen.loadimm32 dst n [])

let expand_addimm64 (dst: iregsp) (src: iregsp) n =
  List.iter emit (Asmgen.addimm64 dst src n [])

let expand_storeptr (src: ireg) (base: iregsp) ofs =
  List.iter emit (Asmgen.storeptr src base ofs [])

(* Handling of varargs *)

(* Determine the number of int registers, FP registers, and stack locations
   used to pass the fixed parameters. *)

let rec next_arg_locations ir fr stk = function
  | [] ->
      (ir, fr, stk)
  | (Tint | Tlong | Tany32 | Tany64) :: l ->
      if ir < 8
      then next_arg_locations (ir + 1) fr stk l
      else next_arg_locations ir fr (stk + 8) l
  | (Tfloat | Tsingle) :: l ->
      if fr < 8
      then next_arg_locations ir (fr + 1) stk l
      else next_arg_locations ir fr (stk + 8) l

(* Allocate memory on the stack and use it to save the registers
   used for parameter passing.  As an optimization, do not save
   the registers used to pass the fixed parameters. *)

let int_param_regs = [| X0; X1; X2; X3; X4; X5; X6; X7 |]
let float_param_regs = [| D0; D1; D2; D3; D4; D5; D6; D7 |]
let size_save_register_area = 8*8 + 8*16

let save_parameter_registers ir fr =
  emit (Psubimm(X, XSP, XSP, Z.of_uint size_save_register_area));
  let i = ref ir in
  while !i < 8 do
    let pos = 8*16 + !i*8 in
    if !i land 1 = 0 then begin
      emit (Pstp(int_param_regs.(!i), int_param_regs.(!i + 1),
                 ADimm(XSP, Z.of_uint pos)));
      i := !i + 2
    end else begin
      emit (Pstrx(int_param_regs.(!i), ADimm(XSP, Z.of_uint pos)));
      i := !i + 1
    end
  done;
  for i = fr to 7 do
    let pos = i*16 in
    emit (Pstrd(float_param_regs.(i), ADimm(XSP, Z.of_uint pos)))
  done
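(* Illustration only, not part of the original commit: a worked example
   of [next_arg_locations], which the vararg machinery below relies on.
   For a variadic function whose fixed parameters have types
   (int, double, long), i.e. the type list [Tint; Tfloat; Tlong], the
   two integer-class arguments consume X0 and X1, the FP argument
   consumes D0, and nothing spills to the stack, so the call below
   evaluates to (2, 1, 0). *)
let _example_next_arg_locations () =
  next_arg_locations 0 0 0 [Tint; Tfloat; Tlong]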
(* Initialize a va_list as per va_start.
   Register r points to the following struct:

   typedef struct __va_list {
     void *__stack;    // next stack parameter
     void *__gr_top;   // top of the save area for int regs
     void *__vr_top;   // top of the save area for float regs
     int __gr_offs;    // offset from gr_top to next int reg
     int __vr_offs;    // offset from vr_top to next FP reg
   }
*)

let current_function_stacksize = ref 0L

let expand_builtin_va_start r =
  if not (is_current_function_variadic ()) then
    invalid_arg "Fatal error: va_start used in non-vararg function";
  let (ir, fr, stk) =
    next_arg_locations 0 0 0 (get_current_function_args ()) in
  let stack_ofs = Int64.(add !current_function_stacksize (of_int stk))
  and gr_top_ofs = !current_function_stacksize
  and vr_top_ofs = Int64.(sub !current_function_stacksize 64L)
  and gr_offs = - ((8 - ir) * 8)
  and vr_offs = - ((8 - fr) * 16) in
  (* va->__stack = sp + stack_ofs *)
  expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 stack_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 0L)));
  (* va->__gr_top = sp + gr_top_ofs *)
  if gr_top_ofs <> stack_ofs then
    expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 gr_top_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 8L)));
  (* va->__vr_top = sp + vr_top_ofs *)
  expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 vr_top_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 16L)));
  (* va->__gr_offs = gr_offs *)
  expand_loadimm32 X16 (coqint_of_camlint (Int32.of_int gr_offs));
  emit (Pstrw(X16, ADimm(RR1 r, coqint_of_camlint64 24L)));
  (* va->__vr_offs = vr_offs *)
  expand_loadimm32 X16 (coqint_of_camlint (Int32.of_int vr_offs));
  emit (Pstrw(X16, ADimm(RR1 r, coqint_of_camlint64 28L)))

(* Handling of annotations *)

let expand_annot_val kind txt targ args res =
  emit (Pbuiltin (EF_annot(kind, txt, [targ]), args, BR_none));
  match args, res with
  | [BA(IR src)], BR(IR dst) ->
      if dst <> src then emit (Pmov (RR1 dst, RR1 src))
  | [BA(FR src)], BR(FR dst) ->
      if dst <> src then emit (Pfmov (dst, src))
  | _, _ ->
      raise (Error "ill-formed __builtin_annot_val")

(* Handling of memcpy *)

(* We assume unaligned memory accesses are efficient.  Hence we use
   memory accesses as wide as we can, up to 16 bytes.
   Temporary registers used: x15 x16 x17 x29 x30. *)
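(* Illustration only, not part of the original commit: the widest-first
   decomposition performed by [expand_builtin_memcpy_small] below.
   This sketch returns just the sequence of access widths used for a
   copy of [sz] bytes; e.g. for sz = 23 it returns [16; 4; 2; 1],
   matching the ldp/ldr/ldrh/ldrb accesses emitted for a 23-byte copy. *)
let rec _access_widths sz =
  if sz >= 16 then 16 :: _access_widths (sz - 16)
  else if sz >= 8 then 8 :: _access_widths (sz - 8)
  else if sz >= 4 then 4 :: _access_widths (sz - 4)
  else if sz >= 2 then 2 :: _access_widths (sz - 2)
  else if sz >= 1 then [1]
  else []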
let offset_in_range ofs =
  let ofs = Z.to_int64 ofs in 0L <= ofs && ofs < 0x1000L

let memcpy_small_arg sz arg tmp =
  match arg with
  | BA (IR r) ->
      (RR1 r, _0)
  | BA_addrstack ofs ->
      if offset_in_range ofs
      && offset_in_range (Ptrofs.add ofs (Ptrofs.repr (Z.of_uint sz)))
      then (XSP, ofs)
      else begin expand_addimm64 (RR1 tmp) XSP ofs; (RR1 tmp, _0) end
  | _ ->
      assert false

let expand_builtin_memcpy_small sz al src dst =
  let (tsrc, tdst) =
    if dst <> BA (IR X17) then (X17, X29) else (X29, X17) in
  let (rsrc, osrc) = memcpy_small_arg sz src tsrc in
  let (rdst, odst) = memcpy_small_arg sz dst tdst in
  let rec copy osrc odst sz =
    if sz >= 16 then begin
      emit (Pldp(X16, X30, ADimm(rsrc, osrc)));
      emit (Pstp(X16, X30, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _16) (Ptrofs.add odst _16) (sz - 16)
    end
    else if sz >= 8 then begin
      emit (Pldrx(X16, ADimm(rsrc, osrc)));
      emit (Pstrx(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _8) (Ptrofs.add odst _8) (sz - 8)
    end
    else if sz >= 4 then begin
      emit (Pldrw(X16, ADimm(rsrc, osrc)));
      emit (Pstrw(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _4) (Ptrofs.add odst _4) (sz - 4)
    end
    else if sz >= 2 then begin
      emit (Pldrh(W, X16, ADimm(rsrc, osrc)));
      emit (Pstrh(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _2) (Ptrofs.add odst _2) (sz - 2)
    end
    else if sz >= 1 then begin
      emit (Pldrb(W, X16, ADimm(rsrc, osrc)));
      emit (Pstrb(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _1) (Ptrofs.add odst _1) (sz - 1)
    end
  in copy osrc odst sz

let memcpy_big_arg arg tmp =
  match arg with
  | BA (IR r) -> emit (Pmov(RR1 tmp, RR1 r))
  | BA_addrstack ofs -> expand_addimm64 (RR1 tmp) XSP ofs
  | _ -> assert false

let expand_builtin_memcpy_big sz al src dst =
  assert (sz >= 16);
  memcpy_big_arg src X30;
  memcpy_big_arg dst X29;
  let lbl = new_label () in
  expand_loadimm32 X15 (Z.of_uint (sz / 16));
  emit (Plabel lbl);
  emit (Pldp(X16, X17, ADpostincr(RR1 X30, _16)));
  emit (Pstp(X16, X17, ADpostincr(RR1 X29, _16)));
  emit (Psubimm(W, RR1 X15, RR1 X15, _1));
  emit (Pcbnz(W, X15, lbl));
  if sz mod 16 >= 8 then begin
    emit (Pldrx(X16, ADpostincr(RR1 X30, _8)));
    emit (Pstrx(X16, ADpostincr(RR1 X29, _8)))
  end;
  if sz mod 8 >= 4 then begin
    emit (Pldrw(X16, ADpostincr(RR1 X30, _4)));
    emit (Pstrw(X16, ADpostincr(RR1 X29, _4)))
  end;
  if sz mod 4 >= 2 then begin
    emit (Pldrh(W, X16, ADpostincr(RR1 X30, _2)));
    emit (Pstrh(X16, ADpostincr(RR1 X29, _2)))
  end;
  if sz mod 2 >= 1 then begin
    emit (Pldrb(W, X16, ADpostincr(RR1 X30, _1)));
    emit (Pstrb(X16, ADpostincr(RR1 X29, _1)))
  end

let expand_builtin_memcpy sz al args =
  let (dst, src) =
    match args with [d; s] -> (d, s) | _ -> assert false in
  if sz < 64
  then expand_builtin_memcpy_small sz al src dst
  else expand_builtin_memcpy_big sz al src dst

(* Handling of volatile reads and writes *)

let expand_builtin_vload_common chunk base ofs res =
  let addr = ADimm(base, ofs) in
  match chunk, res with
  | Mint8unsigned, BR(IR res) ->
      emit (Pldrb(W, res, addr))
  | Mint8signed, BR(IR res) ->
      emit (Pldrsb(W, res, addr))
  | Mint16unsigned, BR(IR res) ->
      emit (Pldrh(W, res, addr))
  | Mint16signed, BR(IR res) ->
      emit (Pldrsh(W, res, addr))
  | Mint32, BR(IR res) ->
      emit (Pldrw(res, addr))
  | Mint64, BR(IR res) ->
      emit (Pldrx(res, addr))
  | Mfloat32, BR(FR res) ->
      emit (Pldrs(res, addr))
  | Mfloat64, BR(FR res) ->
      emit (Pldrd(res, addr))
  | _ ->
      assert false
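(* Illustration only, not part of the original commit: the addressing
   idiom shared by the volatile load/store expansions below.  An ADimm
   offset is usable only when [offset_in_range] holds (the expansions
   below actually check ofs + size_chunk chunk, so that the whole
   access stays in range); otherwise the address is first materialized
   into X16.  A sketch of the resulting (base, offset) choice for a
   stack slot: *)
let _effective_address ofs =
  if offset_in_range ofs
  then (XSP, ofs)
  else begin expand_addimm64 (RR1 X16) XSP ofs; (RR1 X16, _0) end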
let expand_builtin_vload chunk args res =
  match args with
  | [BA(IR addr)] ->
      expand_builtin_vload_common chunk (RR1 addr) _0 res
  | [BA_addrstack ofs] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk)) then
        expand_builtin_vload_common chunk XSP ofs res
      else begin
        expand_addimm64 (RR1 X16) XSP ofs; (* X16 <- SP + ofs *)
        expand_builtin_vload_common chunk (RR1 X16) _0 res
      end
  | [BA_addptr(BA(IR addr), BA_long ofs)] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk)) then
        expand_builtin_vload_common chunk (RR1 addr) ofs res
      else begin
        expand_addimm64 (RR1 X16) (RR1 addr) ofs; (* X16 <- addr + ofs *)
        expand_builtin_vload_common chunk (RR1 X16) _0 res
      end
  | _ ->
      assert false

let expand_builtin_vstore_common chunk base ofs src =
  let addr = ADimm(base, ofs) in
  match chunk, src with
  | (Mint8signed | Mint8unsigned), BA(IR src) ->
      emit (Pstrb(src, addr))
  | (Mint16signed | Mint16unsigned), BA(IR src) ->
      emit (Pstrh(src, addr))
  | Mint32, BA(IR src) ->
      emit (Pstrw(src, addr))
  | Mint64, BA(IR src) ->
      emit (Pstrx(src, addr))
  | Mfloat32, BA(FR src) ->
      emit (Pstrs(src, addr))
  | Mfloat64, BA(FR src) ->
      emit (Pstrd(src, addr))
  | _ ->
      assert false

let expand_builtin_vstore chunk args =
  match args with
  | [BA(IR addr); src] ->
      expand_builtin_vstore_common chunk (RR1 addr) _0 src
  | [BA_addrstack ofs; src] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk)) then
        expand_builtin_vstore_common chunk XSP ofs src
      else begin
        expand_addimm64 (RR1 X16) XSP ofs; (* X16 <- SP + ofs *)
        expand_builtin_vstore_common chunk (RR1 X16) _0 src
      end
  | [BA_addptr(BA(IR addr), BA_long ofs); src] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk)) then
        expand_builtin_vstore_common chunk (RR1 addr) ofs src
      else begin
        expand_addimm64 (RR1 X16) (RR1 addr) ofs; (* X16 <- addr + ofs *)
        expand_builtin_vstore_common chunk (RR1 X16) _0 src
      end
  | _ ->
      assert false

(* Handling of compiler-inlined builtins *)

let expand_builtin_inline name args res =
  match name, args, res with
  (* Synchronization *)
  | "__builtin_membar", [], _ ->
      ()
  | "__builtin_nop", [], _ ->
      emit Pnop
  (* Byte swap *)
  | ("__builtin_bswap" | "__builtin_bswap32"), [BA(IR a1)], BR(IR res) ->
      emit (Prev(W, res, a1))
  | "__builtin_bswap64", [BA(IR a1)], BR(IR res) ->
      emit (Prev(X, res, a1))
  | "__builtin_bswap16", [BA(IR a1)], BR(IR res) ->
      emit (Prev16(W, res, a1));
      emit (Pandimm(W, res, RR0 res, Z.of_uint 0xFFFF))
  (* Count leading zeros and leading sign bits *)
  | "__builtin_clz", [BA(IR a1)], BR(IR res) ->
      emit (Pclz(W, res, a1))
  | ("__builtin_clzl" | "__builtin_clzll"), [BA(IR a1)], BR(IR res) ->
      emit (Pclz(X, res, a1))
  | "__builtin_cls", [BA(IR a1)], BR(IR res) ->
      emit (Pcls(W, res, a1))
  | ("__builtin_clsl" | "__builtin_clsll"), [BA(IR a1)], BR(IR res) ->
      emit (Pcls(X, res, a1))
  (* Float arithmetic *)
  | "__builtin_fabs", [BA(FR a1)], BR(FR res) ->
      emit (Pfabs(D, res, a1))
  | "__builtin_fsqrt", [BA(FR a1)], BR(FR res) ->
      emit (Pfsqrt(D, res, a1))
  | "__builtin_fmadd", [BA(FR a1); BA(FR a2); BA(FR a3)], BR(FR res) ->
      emit (Pfmadd(D, res, a1, a2, a3))
  | "__builtin_fmsub", [BA(FR a1); BA(FR a2); BA(FR a3)], BR(FR res) ->
      emit (Pfmsub(D, res, a1, a2, a3))
  | "__builtin_fnmadd", [BA(FR a1); BA(FR a2); BA(FR a3)], BR(FR res) ->
      emit (Pfnmadd(D, res, a1, a2, a3))
  | "__builtin_fnmsub", [BA(FR a1); BA(FR a2); BA(FR a3)], BR(FR res) ->
      emit (Pfnmsub(D, res, a1, a2, a3))
  (* Vararg *)
  | "__builtin_va_start", [BA(IR a)], _ ->
      expand_builtin_va_start a
  (* Catch-all *)
  | _ ->
      raise (Error ("unrecognized builtin " ^ name))
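(* Illustration only, not part of the original commit: most cases above
   map a builtin to a single instruction, but __builtin_bswap16 needs
   two, because REV16 byte-swaps each 16-bit lane of the W register and
   bits 16-31 must then be cleared to zero-extend the 16-bit result.
   The call below therefore emits "rev16 w0, w0" followed by
   "and w0, w0, #0xffff". *)
let _example_expand_bswap16 () =
  expand_builtin_inline "__builtin_bswap16" [BA (IR X0)] (BR (IR X0))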
"__builtin_va_start", [BA(IR a)], _ -> + expand_builtin_va_start a + (* Catch-all *) + | _ -> + raise (Error ("unrecognized builtin " ^ name)) + +(* Expansion of instructions *) + +let expand_instruction instr = + match instr with + | Pallocframe (sz, ofs) -> + emit (Pmov (RR1 X29, XSP)); + if is_current_function_variadic() then begin + let (ir, fr, _) = + next_arg_locations 0 0 0 (get_current_function_args ()) in + save_parameter_registers ir fr; + current_function_stacksize := + Int64.(add (Z.to_int64 sz) (of_int size_save_register_area)) + end else begin + current_function_stacksize := Z.to_int64 sz + end; + expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz)); + expand_storeptr X29 XSP ofs + | Pfreeframe (sz, ofs) -> + expand_addimm64 XSP XSP (coqint_of_camlint64 !current_function_stacksize) + | Pcvtx2w rd -> + (* no code generated, the upper 32 bits of rd will be ignored *) + () + | Pbuiltin (ef,args,res) -> + begin match ef with + | EF_builtin (name,sg) -> + expand_builtin_inline (camlstring_of_coqstring name) args res + | EF_vload chunk -> + expand_builtin_vload chunk args res + | EF_vstore chunk -> + expand_builtin_vstore chunk args + | EF_annot_val (kind,txt,targ) -> + expand_annot_val kind txt targ args res + | EF_memcpy(sz, al) -> + expand_builtin_memcpy (Z.to_int sz) (Z.to_int al) args + | EF_annot _ | EF_debug _ | EF_inline_asm _ -> + emit instr + | _ -> + assert false + end + | _ -> + emit instr + +let int_reg_to_dwarf r = 0 (* TODO *) + +let float_reg_to_dwarf r = 0 (* TODO *) + +let preg_to_dwarf = function + | IR r -> int_reg_to_dwarf r + | FR r -> float_reg_to_dwarf r + | _ -> assert false + +let expand_function id fn = + try + set_current_function fn; + expand id (* sp= *) 2 preg_to_dwarf expand_instruction fn.fn_code; + Errors.OK (get_current_function ()) + with Error s -> + Errors.Error (Errors.msg (coqstring_of_camlstring s)) + +let expand_fundef id = function + | Internal f -> + begin match expand_function id f with + | Errors.OK tf -> Errors.OK (Internal tf) + | Errors.Error msg -> Errors.Error msg + end + | External ef -> + Errors.OK (External ef) + +let expand_program (p: Asm.program) : Asm.program Errors.res = + AST.transform_partial_program2 expand_fundef (fun id v -> Errors.OK v) p -- cgit