(* *********************************************************************)
(*                                                                     *)
(*              The Compcert verified compiler                         *)
(*                                                                     *)
(*          Xavier Leroy, Collège de France and INRIA Paris            *)
(*                                                                     *)
(*  Copyright Institut National de Recherche en Informatique et en     *)
(*  Automatique.  All rights reserved.  This file is distributed       *)
(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
(*                                                                     *)
(* *********************************************************************)

(* Expanding built-ins and some pseudo-instructions by rewriting
   the AArch64 assembly code. *)

open Asm
open Asmexpandaux
open AST
open Camlcoq

module Ptrofs = Integers.Ptrofs

exception Error of string

(* Useful constants *)

let _0  = Z.zero
let _1  = Z.one
let _2  = Z.of_sint 2
let _4  = Z.of_sint 4
let _8  = Z.of_sint 8
let _16 = Z.of_sint 16
let _m1 = Z.of_sint (-1)

(* Emit instruction sequences that set or offset a register by a constant. *)

let expand_loadimm32 (dst: ireg) n =
  List.iter emit (Asmgen.Asmgen_expand.loadimm32 dst n [])

let expand_addimm64 (dst: iregsp) (src: iregsp) n =
  List.iter emit (Asmgen.Asmgen_expand.addimm64 dst src n [])

let expand_storeptr (src: ireg) (base: iregsp) ofs =
  List.iter emit (Asmgen.Asmgen_expand.storeptr src base ofs [])

(* Handling of varargs *)

(* Determine the number of int registers, FP registers, and stack
   locations used to pass the fixed parameters. *)

let align n a = (n + a - 1) land (-a)

let typesize = function
  | Tint | Tany32 | Tsingle -> 4
  | Tlong | Tany64 | Tfloat -> 8

let reserve_stack stk ty =
  match Archi.abi with
  | Archi.AAPCS64 -> stk + 8
  | Archi.Apple -> align stk (typesize ty) + typesize ty

let rec next_arg_locations ir fr stk = function
  | [] ->
      (ir, fr, stk)
  | (Tint | Tlong | Tany32 | Tany64 as ty) :: l ->
      if ir < 8
      then next_arg_locations (ir + 1) fr stk l
      else next_arg_locations ir fr (reserve_stack stk ty) l
  | (Tfloat | Tsingle as ty) :: l ->
      if fr < 8
      then next_arg_locations ir (fr + 1) stk l
      else next_arg_locations ir fr (reserve_stack stk ty) l

(* Allocate memory on the stack and use it to save the registers
   used for parameter passing.  As an optimization, do not save
   the registers used to pass the fixed parameters. *)

let int_param_regs = [| X0; X1; X2; X3; X4; X5; X6; X7 |]
let float_param_regs = [| D0; D1; D2; D3; D4; D5; D6; D7 |]

let size_save_register_area = 8*8 + 8*16

let save_parameter_registers ir fr =
  emit (Psubimm(X, XSP, XSP, Z.of_uint size_save_register_area));
  let i = ref ir in
  while !i < 8 do
    let pos = 8*16 + !i*8 in
    if !i land 1 = 0 then begin
      emit (Pstpx(int_param_regs.(!i), int_param_regs.(!i + 1),
                  Mint64, Mint64, ADimm(XSP, Z.of_uint pos)));
      i := !i + 2
    end else begin
      emit (Pstrx(int_param_regs.(!i), ADimm(XSP, Z.of_uint pos)));
      i := !i + 1
    end
  done;
  for i = fr to 7 do
    let pos = i*16 in
    emit (Pstrd(float_param_regs.(i), ADimm(XSP, Z.of_uint pos)))
  done
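(* Worked example (an illustrative sketch, not used by the compiler;
   the binding below is hypothetical): for a function whose fixed
   parameters are (int, double, long), the fixed arguments consume
   x0, d0 and x1, so [next_arg_locations] reports two used integer
   registers, one used FP register, and no stack space.
   [save_parameter_registers 2 1] would then spill only x2..x7 and
   d1..d7 into the 192-byte save area. *)
let _example_fixed_arg_locations : int * int * int =
  next_arg_locations 0 0 0 [Tint; Tfloat; Tlong]
  (* evaluates to (2, 1, 0) *)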
let current_function_stacksize = ref 0L

(* Initialize a va_list as per va_start.
   Register r points to the following struct:

   typedef struct __va_list {
     void *__stack;             // next stack parameter
     void *__gr_top;            // top of the save area for int regs
     void *__vr_top;            // top of the save area for float regs
     int __gr_offs;             // offset from gr_top to next int reg
     int __vr_offs;             // offset from vr_top to next FP reg
   }
*)

let expand_builtin_va_start_aapcs64 r =
  let (ir, fr, stk) =
    next_arg_locations 0 0 0 (get_current_function_args ()) in
  let stack_ofs = Int64.(add !current_function_stacksize (of_int stk))
  and gr_top_ofs = !current_function_stacksize
  and vr_top_ofs = Int64.(sub !current_function_stacksize 64L)
  and gr_offs = - ((8 - ir) * 8)
  and vr_offs = - ((8 - fr) * 16) in
  (* va->__stack = sp + stack_ofs *)
  expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 stack_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 0L)));
  (* va->__gr_top = sp + gr_top_ofs *)
  if gr_top_ofs <> stack_ofs then
    expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 gr_top_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 8L)));
  (* va->__vr_top = sp + vr_top_ofs *)
  expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 vr_top_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 16L)));
  (* va->__gr_offs = gr_offs *)
  expand_loadimm32 X16 (coqint_of_camlint (Int32.of_int gr_offs));
  emit (Pstrw(X16, ADimm(RR1 r, coqint_of_camlint64 24L)));
  (* va->__vr_offs = vr_offs *)
  expand_loadimm32 X16 (coqint_of_camlint (Int32.of_int vr_offs));
  emit (Pstrw(X16, ADimm(RR1 r, coqint_of_camlint64 28L)))

(* On macOS, va_list is just a pointer (char * ) and all variadic
   arguments are passed on the stack. *)

let expand_builtin_va_start_apple r =
  let (ir, fr, stk) =
    next_arg_locations 0 0 0 (get_current_function_args ()) in
  let stk = align stk 8 in
  let stack_ofs = Int64.(add !current_function_stacksize (of_int stk)) in
  (* *va = sp + stack_ofs *)
  expand_addimm64 (RR1 X16) XSP (coqint_of_camlint64 stack_ofs);
  emit (Pstrx(X16, ADimm(RR1 r, coqint_of_camlint64 0L)))

let expand_builtin_va_start r =
  if not (is_current_function_variadic ()) then
    invalid_arg "Fatal error: va_start used in non-vararg function";
  match Archi.abi with
  | Archi.AAPCS64 -> expand_builtin_va_start_aapcs64 r
  | Archi.Apple   -> expand_builtin_va_start_apple r
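(* For example (an illustrative sketch under assumed frame sizes):
   under AAPCS64, for a variadic function with no fixed arguments and
   a total frame of 256 bytes (including the 192-byte register save
   area), va_start(va) with va in register r expands to stores of
     va.__stack   = sp + 256       va.__gr_top  = sp + 256
     va.__vr_top  = sp + 192       va.__gr_offs = -64
     va.__vr_offs = -128
   all computed through the temporary register x16; since __stack and
   __gr_top coincide here, x16 is reused for the second store. *)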
(* Handling of annotations *)

let expand_annot_val kind txt targ args res =
  emit (Pbuiltin (EF_annot(kind,txt,[targ]), args, BR_none));
  match args, res with
  | [BA(DR(IR src))], BR(DR(IR dst)) ->
      if dst <> src then emit (Pmov (dst, src))
  | [BA(DR(FR src))], BR(DR(FR dst)) ->
      if dst <> src then emit (Pfmov (dst, src))
  | _, _ ->
      raise (Error "ill-formed __builtin_annot_val")

(* Handling of memcpy *)

(* We assume unaligned memory accesses are efficient.  Hence we use
   memory accesses as wide as we can, up to 16 bytes.
   Temporary registers used: x15 x16 x17 x29 x30. *)

let offset_in_range ofs =
  (* The 512 upper bound comes from ldp/stp.  Single-register
     load/store instructions support bigger offsets. *)
  let ofs = Z.to_int64 ofs in 0L <= ofs && ofs < 512L

let memcpy_small_arg sz arg tmp =
  match arg with
  | BA (DR(IR r)) ->
      (r, _0)
  | BA_addrstack ofs ->
      if offset_in_range ofs
      && offset_in_range (Ptrofs.add ofs (Ptrofs.repr (Z.of_uint sz)))
      && Int64.rem (Z.to_int64 ofs) 8L = 0L
      then (XSP, ofs)
      else begin expand_addimm64 (RR1 tmp) XSP ofs; (RR1 tmp, _0) end
  | _ ->
      assert false

let expand_builtin_memcpy_small sz al src dst =
  let tsrc = if dst <> BA (DR(IR(RR1 X17))) then X17 else X29 in
  let tdst = if src <> BA (DR(IR(RR1 X29))) then X29 else X17 in
  let (rsrc, osrc) = memcpy_small_arg sz src tsrc in
  let (rdst, odst) = memcpy_small_arg sz dst tdst in
  let rec copy osrc odst sz =
    if sz >= 16 then begin
      emit (Pldpx(X16, X30, Mint64, Mint64, ADimm(rsrc, osrc)));
      emit (Pstpx(X16, X30, Mint64, Mint64, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _16) (Ptrofs.add odst _16) (sz - 16)
    end
    else if sz >= 8 then begin
      emit (Pldrx(X16, ADimm(rsrc, osrc)));
      emit (Pstrx(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _8) (Ptrofs.add odst _8) (sz - 8)
    end
    else if sz >= 4 then begin
      emit (Pldrw(X16, ADimm(rsrc, osrc)));
      emit (Pstrw(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _4) (Ptrofs.add odst _4) (sz - 4)
    end
    else if sz >= 2 then begin
      emit (Pldrh(W, X16, ADimm(rsrc, osrc)));
      emit (Pstrh(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _2) (Ptrofs.add odst _2) (sz - 2)
    end
    else if sz >= 1 then begin
      emit (Pldrb(W, X16, ADimm(rsrc, osrc)));
      emit (Pstrb(X16, ADimm(rdst, odst)));
      copy (Ptrofs.add osrc _1) (Ptrofs.add odst _1) (sz - 1)
    end
  in copy osrc odst sz

let memcpy_big_arg arg tmp =
  match arg with
  | BA (DR(IR r)) -> emit (Pmov(RR1 tmp, r))
  | BA_addrstack ofs -> expand_addimm64 (RR1 tmp) XSP ofs
  | _ -> assert false

let expand_builtin_memcpy_big sz al src dst =
  assert (sz >= 16);
  memcpy_big_arg src X30;
  memcpy_big_arg dst X29;
  let lbl = new_label () in
  expand_loadimm32 X15 (Z.of_uint (sz / 16));
  emit (Plabel lbl);
  emit (Pldpx(X16, X17, Mint64, Mint64, ADpostincr(RR1 X30, _16)));
  emit (Pstpx(X16, X17, Mint64, Mint64, ADpostincr(RR1 X29, _16)));
  emit (Psubimm(W, RR1 X15, RR1 X15, _1));
  emit (Pcbnz(W, X15, lbl));
  if sz mod 16 >= 8 then begin
    emit (Pldrx(X16, ADpostincr(RR1 X30, _8)));
    emit (Pstrx(X16, ADpostincr(RR1 X29, _8)))
  end;
  if sz mod 8 >= 4 then begin
    emit (Pldrw(X16, ADpostincr(RR1 X30, _4)));
    emit (Pstrw(X16, ADpostincr(RR1 X29, _4)))
  end;
  if sz mod 4 >= 2 then begin
    emit (Pldrh(W, X16, ADpostincr(RR1 X30, _2)));
    emit (Pstrh(X16, ADpostincr(RR1 X29, _2)))
  end;
  if sz mod 2 >= 1 then begin
    emit (Pldrb(W, X16, ADpostincr(RR1 X30, _1)));
    emit (Pstrb(X16, ADpostincr(RR1 X29, _1)))
  end

let expand_builtin_memcpy sz al args =
  let (dst, src) =
    match args with [d; s] -> (d, s) | _ -> assert false in
  if sz < 64
  then expand_builtin_memcpy_small sz al src dst
  else expand_builtin_memcpy_big sz al src dst
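(* Illustrative sketch (the helper below is hypothetical and unused by
   the compiler): the sequence of access widths, in bytes, that the
   [copy] loop of [expand_builtin_memcpy_small] generates for a given
   size.  For instance, [_access_widths 11] = [8; 2; 1], i.e. one ldr/str
   pair, one ldrh/strh pair, and one ldrb/strb pair. *)
let rec _access_widths sz =
  if sz >= 16 then 16 :: _access_widths (sz - 16)      (* ldp/stp *)
  else if sz >= 8 then 8 :: _access_widths (sz - 8)    (* ldr/str (x) *)
  else if sz >= 4 then 4 :: _access_widths (sz - 4)    (* ldr/str (w) *)
  else if sz >= 2 then 2 :: _access_widths (sz - 2)    (* ldrh/strh *)
  else if sz >= 1 then [1]                             (* ldrb/strb *)
  else []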
(* Handling of volatile reads and writes *)

let expand_builtin_vload_common chunk base ofs res =
  let addr = ADimm(base, ofs) in
  match chunk, res with
  | Mint8unsigned, BR(DR(IR(RR1 res))) ->
      emit (Pldrb(W, res, addr))
  | Mint8signed, BR(DR(IR(RR1 res))) ->
      emit (Pldrsb(W, res, addr))
  | Mint16unsigned, BR(DR(IR(RR1 res))) ->
      emit (Pldrh(W, res, addr))
  | Mint16signed, BR(DR(IR(RR1 res))) ->
      emit (Pldrsh(W, res, addr))
  | Mint32, BR(DR(IR(RR1 res))) ->
      emit (Pldrw(res, addr))
  | Mint64, BR(DR(IR(RR1 res))) ->
      emit (Pldrx(res, addr))
  | Mfloat32, BR(DR(FR res)) ->
      emit (Pldrs(res, addr))
  | Mfloat64, BR(DR(FR res)) ->
      emit (Pldrd(res, addr))
  | _ ->
      assert false

let expand_builtin_vload chunk args res =
  match args with
  | [BA(DR(IR addr))] ->
      expand_builtin_vload_common chunk addr _0 res
  | [BA_addrstack ofs] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk))
      then expand_builtin_vload_common chunk XSP ofs res
      else begin
        expand_addimm64 (RR1 X16) XSP ofs; (* X16 <- SP + ofs *)
        expand_builtin_vload_common chunk (RR1 X16) _0 res
      end
  | [BA_addptr(BA(DR(IR addr)), BA_long ofs)] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk))
      then expand_builtin_vload_common chunk addr ofs res
      else begin
        expand_addimm64 (RR1 X16) addr ofs; (* X16 <- addr + ofs *)
        expand_builtin_vload_common chunk (RR1 X16) _0 res
      end
  | _ ->
      assert false

let expand_builtin_vstore_common chunk base ofs src =
  let addr = ADimm(base, ofs) in
  match chunk, src with
  | (Mint8signed | Mint8unsigned), BA(DR(IR(RR1 src))) ->
      emit (Pstrb(src, addr))
  | (Mint16signed | Mint16unsigned), BA(DR(IR(RR1 src))) ->
      emit (Pstrh(src, addr))
  | Mint32, BA(DR(IR(RR1 src))) ->
      emit (Pstrw(src, addr))
  | Mint64, BA(DR(IR(RR1 src))) ->
      emit (Pstrx(src, addr))
  | Mfloat32, BA(DR(FR src)) ->
      emit (Pstrs(src, addr))
  | Mfloat64, BA(DR(FR src)) ->
      emit (Pstrd(src, addr))
  | _ ->
      assert false

let expand_builtin_vstore chunk args =
  match args with
  | [BA(DR(IR addr)); src] ->
      expand_builtin_vstore_common chunk addr _0 src
  | [BA_addrstack ofs; src] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk))
      then expand_builtin_vstore_common chunk XSP ofs src
      else begin
        expand_addimm64 (RR1 X16) XSP ofs; (* X16 <- SP + ofs *)
        expand_builtin_vstore_common chunk (RR1 X16) _0 src
      end
  | [BA_addptr(BA(DR(IR addr)), BA_long ofs); src] ->
      if offset_in_range (Z.add ofs (Memdata.size_chunk chunk))
      then expand_builtin_vstore_common chunk addr ofs src
      else begin
        expand_addimm64 (RR1 X16) addr ofs; (* X16 <- addr + ofs *)
        expand_builtin_vstore_common chunk (RR1 X16) _0 src
      end
  | _ ->
      assert false
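(* For example (an illustrative sketch; say the result lands in w0): a
   volatile int load at sp + 1024 does not fit the 512-byte window
   checked by [offset_in_range], so it expands to
     add  x16, sp, #1024
     ldr  w0, [x16, #0]
   whereas a volatile int load at sp + 16 is a single
     ldr  w0, [sp, #16]
*)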
(* Handling of compiler-inlined builtins *)

let expand_builtin_inline name args res =
  match name, args, res with
  (* Synchronization *)
  | "__builtin_membar", [], _ ->
      ()
  (* No operation *)
  | "__builtin_nop", [], _ ->
      emit Pnop
  (* Optimization hint *)
  | "__builtin_unreachable", [], _ ->
      ()
  (* Byte swap *)
  | ("__builtin_bswap" | "__builtin_bswap32"),
    [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Prev(W, res, a1))
  | "__builtin_bswap64", [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Prev(X, res, a1))
  | "__builtin_bswap16", [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Prev16(W, res, a1));
      emit (Pandimm(W, res, RR0 res, Z.of_uint 0xFFFF))
  (* Count leading zeros, leading sign bits, trailing zeros *)
  | "__builtin_clz", [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Pclz(W, res, a1))
  | ("__builtin_clzl" | "__builtin_clzll"),
    [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Pclz(X, res, a1))
  | "__builtin_cls", [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Pcls(W, res, a1))
  | ("__builtin_clsl" | "__builtin_clsll"),
    [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Pcls(X, res, a1))
  | "__builtin_ctz", [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Prbit(W, res, a1));
      emit (Pclz(W, res, res))
  | ("__builtin_ctzl" | "__builtin_ctzll"),
    [BA(DR(IR(RR1 a1)))], BR(DR(IR(RR1 res))) ->
      emit (Prbit(X, res, a1));
      emit (Pclz(X, res, res))
  (* Float arithmetic *)
  | "__builtin_fabs", [BA(DR(FR a1))], BR(DR(FR res)) ->
      emit (Pfabs(D, res, a1))
  | ("__builtin_fsqrt" | "__builtin_sqrt"), [BA(DR(FR a1))], BR(DR(FR res)) ->
      emit (Pfsqrt(D, res, a1))
  | "__builtin_fmadd",
    [BA(DR(FR a1)); BA(DR(FR a2)); BA(DR(FR a3))], BR(DR(FR res)) ->
      emit (Pfmadd(D, res, a1, a2, a3))
  | "__builtin_fmsub",
    [BA(DR(FR a1)); BA(DR(FR a2)); BA(DR(FR a3))], BR(DR(FR res)) ->
      emit (Pfmsub(D, res, a1, a2, a3))
  | "__builtin_fnmadd",
    [BA(DR(FR a1)); BA(DR(FR a2)); BA(DR(FR a3))], BR(DR(FR res)) ->
      emit (Pfnmadd(D, res, a1, a2, a3))
  | "__builtin_fnmsub",
    [BA(DR(FR a1)); BA(DR(FR a2)); BA(DR(FR a3))], BR(DR(FR res)) ->
      emit (Pfnmsub(D, res, a1, a2, a3))
  | "__builtin_fmax", [BA(DR(FR a1)); BA(DR(FR a2))], BR(DR(FR res)) ->
      emit (Pfmax (D, res, a1, a2))
  | "__builtin_fmin", [BA(DR(FR a1)); BA(DR(FR a2))], BR(DR(FR res)) ->
      emit (Pfmin (D, res, a1, a2))
  (* Vararg *)
  | "__builtin_va_start", [BA(DR(IR(RR1 a)))], _ ->
      expand_builtin_va_start a
  (* Catch-all *)
  | _ ->
      raise (Error ("unrecognized builtin " ^ name))

(* Expansion of instructions *)

let expand_instruction instr =
  match instr with
  | Pallocframe (sz, ofs) ->
      emit (Pmov (RR1 X29, XSP));
      if is_current_function_variadic() && Archi.abi = Archi.AAPCS64 then begin
        let (ir, fr, _) =
          next_arg_locations 0 0 0 (get_current_function_args ()) in
        save_parameter_registers ir fr;
        current_function_stacksize :=
          Int64.(add (Z.to_int64 sz) (of_int size_save_register_area))
      end else begin
        current_function_stacksize := Z.to_int64 sz
      end;
      expand_addimm64 XSP XSP (Ptrofs.repr (Z.neg sz));
      expand_storeptr X29 XSP ofs
  | Pfreeframe (sz, ofs) ->
      expand_addimm64 XSP XSP (coqint_of_camlint64 !current_function_stacksize)
  | Pcvtx2w rd ->
      (* no code generated, the upper 32 bits of rd will be ignored *)
      ()
  | Pbuiltin (ef,args,res) ->
      begin match ef with
      | EF_builtin (name,sg) ->
          expand_builtin_inline (camlstring_of_coqstring name) args res
      | EF_vload chunk ->
          expand_builtin_vload chunk args res
      | EF_vstore chunk ->
          expand_builtin_vstore chunk args
      | EF_annot_val (kind,txt,targ) ->
          expand_annot_val kind txt targ args res
      | EF_memcpy(sz, al) ->
          expand_builtin_memcpy (Z.to_int sz) (Z.to_int al) args
      | EF_annot _ | EF_debug _ | EF_inline_asm _ | EF_profiling _ ->
          emit instr
      | _ ->
          assert false
      end
  | _ ->
      emit instr

let int_reg_to_dwarf = function
  | X0 -> 0   | X1 -> 1   | X2 -> 2   | X3 -> 3   | X4 -> 4
  | X5 -> 5   | X6 -> 6   | X7 -> 7   | X8 -> 8   | X9 -> 9
  | X10 -> 10 | X11 -> 11 | X12 -> 12 | X13 -> 13 | X14 -> 14
  | X15 -> 15 | X16 -> 16 | X17 -> 17 | X18 -> 18 | X19 -> 19
  | X20 -> 20 | X21 -> 21 | X22 -> 22 | X23 -> 23 | X24 -> 24
  | X25 -> 25 | X26 -> 26 | X27 -> 27 | X28 -> 28 | X29 -> 29
  | X30 -> 30

let float_reg_to_dwarf = function
  | D0 -> 64  | D1 -> 65  | D2 -> 66  | D3 -> 67  | D4 -> 68
  | D5 -> 69  | D6 -> 70  | D7 -> 71  | D8 -> 72  | D9 -> 73
  | D10 -> 74 | D11 -> 75 | D12 -> 76 | D13 -> 77 | D14 -> 78
  | D15 -> 79 | D16 -> 80 | D17 -> 81 | D18 -> 82 | D19 -> 83
  | D20 -> 84 | D21 -> 85 | D22 -> 86 | D23 -> 87 | D24 -> 88
  | D25 -> 89 | D26 -> 90 | D27 -> 91 | D28 -> 92 | D29 -> 93
  | D30 -> 94 | D31 -> 95

let preg_to_dwarf = function
  | DR(IR(RR1 r)) -> int_reg_to_dwarf r
  | DR(FR r) -> float_reg_to_dwarf r
  | DR(IR(XSP)) -> 31
  | _ -> assert false

let expand_function id fn =
  try
    set_current_function fn;
    expand id (* sp= *) 31 preg_to_dwarf expand_instruction fn.fn_code;
    Errors.OK (get_current_function ())
  with Error s ->
    Errors.Error (Errors.msg (coqstring_of_camlstring s))

let expand_fundef id = function
  | Internal f ->
      begin match expand_function id f with
      | Errors.OK tf -> Errors.OK (Internal tf)
      | Errors.Error msg -> Errors.Error msg
      end
  | External ef ->
      Errors.OK (External ef)

let expand_program (p: Asm.program) : Asm.program Errors.res =
  AST.transform_partial_program2 expand_fundef (fun id v -> Errors.OK v) p
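(* Closing example (an illustrative sketch of the pseudo-instruction
   expansion above): for a non-variadic function with a 32-byte frame
   whose back link is stored at offset 0, Pallocframe expands to
     mov  x29, sp
     sub  sp, sp, #32
     str  x29, [sp, #0]
   and the matching Pfreeframe to the single instruction
     add  sp, sp, #32
*)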