(* *********************************************************************)
(*                                                                     *)
(*              The Compcert verified compiler                         *)
(*                                                                     *)
(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
(*                                                                     *)
(*  Copyright Institut National de Recherche en Informatique et en     *)
(*  Automatique.  All rights reserved.  This file is distributed       *)
(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
(*                                                                     *)
(* *********************************************************************)

(* Register allocation by coloring of an interference graph *)

(* The algorithm in a nutshell:

   - Split live ranges
   - Convert from RTL to XTL
   - Eliminate dead code
   - Repeat:
       . Construct interference graph
       . Color interference graph using IRC algorithm
       . Check for variables that were spilled and must be in registers
       . If none, convert to LTL and exit.
       . If some, insert spill and reload instructions and try again
     End Repeat
*)

open Printf
open Clflags
open Camlcoq
open Datatypes
open Coqlib
open Maps
open AST
open Kildall
open Op
open Machregs
open Locations
open Conventions1
open Conventions
open IRC
open XTL

(* Detection of 2-address operations *)

let is_two_address op args =
  if two_address_op op then
    match args with
    | [] -> assert false
    | arg1 :: argl -> Some(arg1, argl)
  else None

(* For tracing *)

let destination_alloctrace : string option ref = ref None
let pp = ref stdout

let init_trace () =
  if !option_dalloctrace && !pp == stdout then begin
    match !destination_alloctrace with
    | None -> ()  (* should not happen *)
    | Some f -> pp := open_out f
  end


(**************** Initial conversion from RTL to XTL **************)

let vreg tyenv r = V(r, tyenv r)

let vregs tyenv rl = List.map (vreg tyenv) rl

let constrain_reg v c =
  match c with
  | None -> v
  | Some mr -> L(R mr)

let rec constrain_regs vl cl =
  match vl, cl with
  | [], _ -> []
  | v1 :: vl', [] -> vl
  | v1 :: vl', Some mr1 :: cl' -> L(R mr1) :: constrain_regs vl' cl'
  | v1 :: vl', None :: cl' -> v1 :: constrain_regs vl' cl'

let move v1 v2 k =
  if v1 = v2 then
    k
  else if XTL.is_stack_reg v1 then begin
    let t = new_temp (typeof v2) in Xmove(v1, t) :: Xmove(t, v2) :: k
  end else if XTL.is_stack_reg v2 then begin
    let t = new_temp (typeof v1) in Xmove(v1, t) :: Xmove(t, v2) :: k
  end else
    Xmove(v1, v2) :: k

let rec movelist vl1 vl2 k =
  match vl1, vl2 with
  | [], [] -> k
  | v1 :: vl1, v2 :: vl2 -> move v1 v2 (movelist vl1 vl2 k)
  | _, _ -> assert false

let parmove_regs2locs tyenv srcs dsts k =
  assert (List.length srcs = List.length dsts);
  let rec expand srcs' dsts' rl ll =
    match rl, ll with
    | [], [] ->
        begin match srcs', dsts' with
        | [], [] -> k
        | [src], [dst] -> move src dst k
        | _, _ -> Xparmove(srcs', dsts', new_temp Tint, new_temp Tfloat) :: k
        end
    | r :: rl, One l :: ll ->
        let ty = tyenv r in
        expand (V(r, ty) :: srcs') (L l :: dsts') rl ll
    | r :: rl, Twolong(l1, l2) :: ll ->
        assert (tyenv r = Tlong);
        if Archi.splitlong then
          expand (V(r, Tint) :: V(twin_reg r, Tint) :: srcs')
                 (L l1 :: L l2 :: dsts')
                 rl ll
        else
          Xop(Ohighlong, [V(r, Tlong)], L l1) ::
          Xop(Olowlong,  [V(r, Tlong)], L l2) ::
          expand srcs' dsts' rl ll
    | _, _ ->
        assert false in
  expand [] [] srcs dsts

let parmove_locs2regs tyenv srcs dsts k =
  assert (List.length srcs = List.length dsts);
  let rec expand srcs' dsts' ll rl =
    match ll, rl with
    | [], [] ->
        begin match srcs', dsts' with
        | [], [] -> k
        | [src], [dst] -> move src dst k
        | _, _ -> Xparmove(srcs', dsts', new_temp Tint, new_temp Tfloat) :: k
        end
    | One l :: ll, r :: rl ->
        let ty = tyenv r in
        expand (L l :: srcs') (V(r, ty) :: dsts') ll rl
    | Twolong(l1, l2) :: ll, r :: rl ->
        assert (tyenv r = Tlong);
        if Archi.splitlong then
          expand (L l1 :: L l2 :: srcs')
                 (V(r, Tint) :: V(twin_reg r, Tint) :: dsts')
                 ll rl
        else
          Xop(Omakelong, [L l1; L l2], V(r, Tlong)) ::
          expand srcs' dsts' ll rl
    | _, _ ->
        assert false in
  expand [] [] srcs dsts

let rec convert_builtin_arg tyenv = function
  | BA r ->
      let ty = tyenv r in
      if Archi.splitlong && ty = Tlong
      then BA_splitlong(BA(V(r, Tint)), BA(V(twin_reg r, Tint)))
      else BA(V(r, ty))
  | BA_int n -> BA_int n
  | BA_long n -> BA_long n
  | BA_float n -> BA_float n
  | BA_single n -> BA_single n
  | BA_loadstack(chunk, ofs) -> BA_loadstack(chunk, ofs)
  | BA_addrstack(ofs) -> BA_addrstack(ofs)
  | BA_loadglobal(chunk, id, ofs) -> BA_loadglobal(chunk, id, ofs)
  | BA_addrglobal(id, ofs) -> BA_addrglobal(id, ofs)
  | BA_splitlong(hi, lo) ->
      BA_splitlong(convert_builtin_arg tyenv hi, convert_builtin_arg tyenv lo)
  | BA_addptr(a1, a2) ->
      BA_addptr(convert_builtin_arg tyenv a1, convert_builtin_arg tyenv a2)

let convert_builtin_res tyenv = function
  | BR r ->
      let ty = tyenv r in
      if Archi.splitlong && ty = Tlong
      then BR_splitlong(BR(V(r, Tint)), BR(V(twin_reg r, Tint)))
      else BR(V(r, ty))
  | BR_none -> BR_none
  | BR_splitlong _ -> assert false

let rec constrain_builtin_arg a cl =
  match a, cl with
  | BA x, None :: cl' -> (a, cl')
  | BA x, Some mr :: cl' -> (BA (L(R mr)), cl')
  | BA_splitlong(hi, lo), _ ->
      let (hi', cl1) = constrain_builtin_arg hi cl in
      let (lo', cl2) = constrain_builtin_arg lo cl1 in
      (BA_splitlong(hi', lo'), cl2)
  | BA_addptr(a1, a2), _ ->
      let (a1', cl1) = constrain_builtin_arg a1 cl in
      let (a2', cl2) = constrain_builtin_arg a2 cl1 in
      (BA_addptr(a1', a2'), cl2)
  | _, _ -> (a, cl)

let rec constrain_builtin_args al cl =
  match al with
  | [] -> ([], cl)
  | a :: al ->
      let (a', cl1) = constrain_builtin_arg a cl in
      let (al', cl2) = constrain_builtin_args al cl1 in
      (a' :: al', cl2)

let rec constrain_builtin_res a cl =
  match a, cl with
  | BR x, None :: cl' -> (a, cl')
  | BR x, Some mr :: cl' -> (BR (L(R mr)), cl')
  | BR_splitlong(hi, lo), _ ->
      let (hi', cl1) = constrain_builtin_res hi cl in
      let (lo', cl2) = constrain_builtin_res lo cl1 in
      (BR_splitlong(hi', lo'), cl2)
  | _, _ -> (a, cl)

(* Return the XTL basic block corresponding to the given RTL instruction.
   Move and parallel move instructions are introduced to honor calling
   conventions and register constraints on some operations.
   64-bit integer variables are split in two 32-bit halves
   if [Archi.splitlong] is true. *)

let block_of_RTL_instr funsig tyenv = function
  | RTL.Inop s ->
      [Xbranch s]
  | RTL.Iop(Omove, [arg], res, s) ->
      if Archi.splitlong && tyenv arg = Tlong then
        [Xmove(V(arg, Tint), V(res, Tint));
         Xmove(V(twin_reg arg, Tint), V(twin_reg res, Tint));
         Xbranch s]
      else
        [Xmove(vreg tyenv arg, vreg tyenv res); Xbranch s]
  | RTL.Iop(Omakelong, [arg1; arg2], res, s) when Archi.splitlong ->
      [Xmove(V(arg1, Tint), V(res, Tint));
       Xmove(V(arg2, Tint), V(twin_reg res, Tint));
       Xbranch s]
  | RTL.Iop(Olowlong, [arg], res, s) when Archi.splitlong ->
      [Xmove(V(twin_reg arg, Tint), V(res, Tint)); Xbranch s]
  | RTL.Iop(Ohighlong, [arg], res, s) when Archi.splitlong ->
      [Xmove(V(arg, Tint), V(res, Tint)); Xbranch s]
  | RTL.Iop(op, args, res, s) ->
      let (cargs, cres) = mregs_for_operation op in
      let args1 = vregs tyenv args and res1 = vreg tyenv res in
      let args2 = constrain_regs args1 cargs and res2 = constrain_reg res1 cres in
      let (args3, res3) =
        match is_two_address op args2 with
        | None ->
            (args2, res2)
        | Some(arg, args2') ->
            if arg = res2 || not (List.mem res2 args2') then
              (args2, res2)
            else
              let t = new_temp (tyenv res) in (t :: args2', t) in
      movelist args1 args3 (Xop(op, args3, res3) :: move res3 res1 [Xbranch s])
  | RTL.Iload(trap, chunk, addr, args, dst, s) ->
      if Archi.splitlong && chunk = Mint64 then begin
        match offset_addressing addr (coqint_of_camlint 4l) with
        | None -> assert false
        | Some addr' ->
            [Xload(trap, Mint32, addr, vregs tyenv args,
                   V((if Archi.big_endian then dst else twin_reg dst), Tint));
             Xload(trap, Mint32, addr', vregs tyenv args,
                   V((if Archi.big_endian then twin_reg dst else dst), Tint));
             Xbranch s]
      end else
        [Xload(trap, chunk, addr, vregs tyenv args, vreg tyenv dst); Xbranch s]
  | RTL.Istore(chunk, addr, args, src, s) ->
      if Archi.splitlong && chunk = Mint64 then begin
        match offset_addressing addr (coqint_of_camlint 4l) with
        | None -> assert false
        | Some addr' ->
            [Xstore(Mint32, addr, vregs tyenv args,
                   V((if Archi.big_endian then src else twin_reg src), Tint));
             Xstore(Mint32, addr', vregs tyenv args,
                   V((if Archi.big_endian then twin_reg src else src), Tint));
             Xbranch s]
      end else
        [Xstore(chunk, addr, vregs tyenv args, vreg tyenv src); Xbranch s]
  | RTL.Icall(sg, ros, args, res, s) ->
      let args' = loc_arguments sg
      and res' = [map_rpair (fun r -> R r) (loc_result sg)] in
      parmove_regs2locs tyenv args args'
       (Xcall(sg, sum_left_map (vreg tyenv) ros,
              vlocpairs args', vlocpairs res') ::
         parmove_locs2regs tyenv res' [res]
           [Xbranch s])
  | RTL.Itailcall(sg, ros, args) ->
      let args' = loc_arguments sg in
      parmove_regs2locs tyenv args args'
        [Xtailcall(sg, sum_left_map (vreg tyenv) ros, vlocpairs args')]
  | RTL.Ibuiltin(ef, args, res, s) ->
      let (cargs, cres) = mregs_for_builtin ef in
      let args1 = List.map (convert_builtin_arg tyenv) args
      and res1 = convert_builtin_res tyenv res in
      let (args2, _) = constrain_builtin_args args1 cargs
      and (res2, _)  = constrain_builtin_res res1 cres in
      movelist (params_of_builtin_args args1) (params_of_builtin_args args2)
         (Xbuiltin(ef, args2, res2) ::
            movelist (params_of_builtin_res res2) (params_of_builtin_res res1)
               [Xbranch s])
  | RTL.Icond(cond, args, s1, s2, i) ->
      [Xcond(cond, vregs tyenv args, s1, s2, i)]
  | RTL.Ijumptable(arg, tbl) ->
      [Xjumptable(vreg tyenv arg, tbl)]
  | RTL.Ireturn None ->
      [Xreturn []]
  | RTL.Ireturn (Some arg) ->
      let args' = [map_rpair (fun r -> R r) (loc_result funsig)] in
      parmove_regs2locs tyenv [arg] args' [Xreturn (vlocpairs args')]

(* One above the [pc] nodes of the given RTL function *)

let next_pc f =
  PTree.fold
    (fun npc pc i -> if P.lt pc npc then npc else P.succ pc)
    f.RTL.fn_code P.one

(* Translate an RTL function to an XTL function *)

let function_of_RTL_function f tyenv =
  let xc = PTree.map1 (block_of_RTL_instr f.RTL.fn_sig tyenv) f.RTL.fn_code in
  (* Add moves for function parameters *)
  let pc_entrypoint = next_pc f in
  let b_entrypoint =
    parmove_locs2regs tyenv
              (loc_parameters f.RTL.fn_sig)
              f.RTL.fn_params
              [Xbranch f.RTL.fn_entrypoint] in
  { fn_sig = f.RTL.fn_sig;
    fn_stacksize = f.RTL.fn_stacksize;
    fn_entrypoint = pc_entrypoint;
    fn_code = PTree.set pc_entrypoint b_entrypoint xc }


(***************** Liveness analysis *****************)

let vset_removelist vl after = List.fold_right VSet.remove vl after
let vset_addlist vl after = List.fold_right VSet.add vl after

let vset_addros vos after =
  match vos with Coq_inl v -> VSet.add v after | Coq_inr id -> after

let rec vset_addarg a after =
  match a with
  | BA v -> VSet.add v after
  | BA_splitlong(hi, lo) -> vset_addarg hi (vset_addarg lo after)
  | BA_addptr(a1, a2) -> vset_addarg a1 (vset_addarg a2 after)
  | _ -> after

let vset_addargs al after = List.fold_right vset_addarg al after

let rec vset_removeres r after =
  match r with
  | BR v -> VSet.remove v after
  | BR_none -> after
  | BR_splitlong(hi, lo) -> vset_removeres hi (vset_removeres lo after)

let live_before instr after =
  match instr with
  | Xmove(src, dst) | Xspill(src, dst) | Xreload(src, dst) ->
      if VSet.mem dst after || XTL.is_stack_reg src
      then VSet.add src (VSet.remove dst after)
      else after
  | Xparmove(srcs, dsts, itmp, ftmp) ->
      vset_addlist srcs (vset_removelist dsts after)
  | Xop(op, args, res) ->
      if VSet.mem res after
      then vset_addlist args (VSet.remove res after)
      else after
  | Xload(trap, chunk, addr, args, dst) ->
      if VSet.mem dst after
      then vset_addlist args (VSet.remove dst after)
      else after
  | Xstore(chunk, addr, args, src) ->
      vset_addlist args (VSet.add src after)
  | Xcall(sg, ros, args, res) ->
      vset_addlist args (vset_addros ros (vset_removelist res after))
  | Xtailcall(sg, ros, args) ->
      vset_addlist args (vset_addros ros VSet.empty)
  | Xbuiltin(EF_debug _, args, res) ->
      vset_removeres res after
  | Xbuiltin(ef, args, res) ->
      vset_addargs args (vset_removeres res after)
  | Xbranch s ->
      after
  | Xcond(cond, args, s1, s2, _) ->
      List.fold_right VSet.add args after
  | Xjumptable(arg, tbl) ->
      VSet.add arg after
  | Xreturn args ->
      vset_addlist args VSet.empty

let rec live_before_block blk after =
  match blk with
  | [] -> after
  | instr :: blk -> live_before instr (live_before_block blk after)

let transfer_live f pc after =
  match PTree.get pc f.fn_code with
  | None -> VSet.empty
  | Some blk -> live_before_block blk after

module VSetLat = struct
  type t = VSet.t
  let beq = VSet.equal
  let bot = VSet.empty
  let lub = VSet.union
end

module Liveness_Solver = Backward_Dataflow_Solver(VSetLat)(NodeSetBackward)

let liveness_analysis f =
  match Liveness_Solver.fixpoint f.fn_code successors_block (transfer_live f) with
  | None -> assert false
  | Some lv -> lv

(* Pair the instructions of a block with their live-before sets *)

let pair_block_live blk after =
  let rec pair_rec accu after = function
  | [] -> accu
  | instr :: blk ->
      let before = live_before instr after in
      pair_rec ((instr, before) :: accu) before blk in
  pair_rec [] after (List.rev blk)


(**************** Dead code elimination **********************)

(* Eliminate pure instructions whose results are not used later. *)
(* Also: remove dead registers from debug annotations. *)

let rec dce_parmove srcs dsts after =
  match srcs, dsts with
  | [], [] -> [], []
  | src1 :: srcs, dst1 :: dsts ->
      let (srcs', dsts') = dce_parmove srcs dsts after in
      if VSet.mem dst1 after || XTL.is_stack_reg src1
      then (src1 :: srcs', dst1 :: dsts')
      else (srcs', dsts')
  | _, _ -> assert false

let rec keep_builtin_arg after = function
  | BA v -> VSet.mem v after
  | BA_splitlong(a1, a2) | BA_addptr(a1, a2) ->
      keep_builtin_arg after a1 && keep_builtin_arg after a2
  | _ -> true

let dce_instr instr after k =
  match instr with
  | Xmove(src, dst) ->
      if VSet.mem dst after || XTL.is_stack_reg src
      then instr :: k
      else k
  | Xparmove(srcs, dsts, itmp, ftmp) ->
      begin match dce_parmove srcs dsts after with
      | ([], []) -> k
      | ([src], [dst]) -> Xmove(src, dst) :: k
      | (srcs', dsts') -> Xparmove(srcs', dsts', itmp, ftmp) :: k
      end
  | Xop(op, args, res) ->
      if VSet.mem res after
      then instr :: k
      else k
  | Xload(trap, chunk, addr, args, dst) ->
      if VSet.mem dst after
      then instr :: k
      else k
  | Xbuiltin(EF_debug _ as ef, args, res) ->
      let across = vset_removeres res after in
      Xbuiltin(ef, List.filter (keep_builtin_arg across) args, res) :: k
  | _ ->
      instr :: k

let rec dce_block blk after =
  match blk with
  | [] -> (after, [])
  | instr :: blk' ->
      let (after', tblk') = dce_block blk' after in
      (live_before instr after', dce_instr instr after' tblk')

let dead_code_elimination f liveness =
  { f with fn_code =
      PTree.map (fun pc blk -> snd(dce_block blk (PMap.get pc liveness)))
                 f.fn_code }


(*********************** Spill costs ****************************)

(* Estimate spill costs and count other statistics for every variable.
   Variables that must not be spilled are given infinite costs. *)

let spill_costs f =

  let costs = ref PTree.empty in

  let get_stats r =
    match PTree.get r !costs with
    | Some st -> st
    | None ->
        let st = {cost = 0; usedefs = 0} in
        costs := PTree.set r st !costs;
        st in

  let charge amount uses v =
    match v with
    | L l -> ()
    | V(r, ty) ->
        let st = get_stats r in
        if st.cost < 0 then
          (* the variable must be spilled, don't change its cost *)
          assert (amount < max_int)
        else begin
          (* saturating addition *)
          let c1 = st.cost + amount in
          let c2 = if c1 >= 0 then c1 else max_int (* overflow *) in
          st.cost <- c2
        end;
        st.usedefs <- st.usedefs + uses in

  let charge_list amount uses vl =
    List.iter (charge amount uses) vl in

  let charge_ros amount ros =
    match ros with Coq_inl v -> charge amount 1 v | Coq_inr id -> () in

  let force_stack_allocation v =
    match v with
    | L l -> ()
    | V(r, ty) ->
        let st = get_stats r in
        assert (st.cost < max_int);
        st.cost <- (-1) in

  let charge_instr = function
    | Xmove(src, dst) ->
        if XTL.is_stack_reg src then
          force_stack_allocation dst
        else if XTL.is_stack_reg dst then
          force_stack_allocation src
        else begin
          charge 1 1 src; charge 1 1 dst
        end
    | Xreload(src, dst) ->
        charge 1 1 src; charge max_int 1 dst
        (* dest must not be spilled! *)
    | Xspill(src, dst) ->
        charge max_int 1 src; charge 1 1 dst
        (* source must not be spilled! *)
    | Xparmove(srcs, dsts, itmp, ftmp) ->
        charge_list 1 1 srcs; charge_list 1 1 dsts;
        charge max_int 0 itmp; charge max_int 0 ftmp
        (* temps must not be spilled *)
    | Xop(op, args, res) ->
        charge_list 10 1 args; charge 10 1 res
    | Xload(trap, chunk, addr, args, dst) ->
        charge_list 10 1 args; charge 10 1 dst
    | Xstore(chunk, addr, args, src) ->
        charge_list 10 1 args; charge 10 1 src
    | Xcall(sg, vos, args, res) ->
        charge_ros 10 vos
    | Xtailcall(sg, vos, args) ->
        charge_ros 10 vos
    | Xbuiltin(ef, args, res) ->
        begin match ef with
        | EF_annot _ | EF_debug _ ->
            ()
        | EF_vstore _ | EF_memcpy _ ->
            (* result is not used but should not be spilled *)
            charge_list 10 1 (params_of_builtin_args args);
            charge_list max_int 0 (params_of_builtin_res res)
        | EF_annot_val _ ->
            (* like a move *)
            charge_list 1 1 (params_of_builtin_args args);
            charge_list 1 1 (params_of_builtin_res res)
        | _ ->
            charge_list 10 1 (params_of_builtin_args args);
            charge_list 10 1 (params_of_builtin_res res)
        end
    | Xbranch _ -> ()
    | Xcond(cond, args, _, _, _) ->
        charge_list 10 1 args
    | Xjumptable(arg, _) ->
        charge 10 1 arg
    | Xreturn optarg ->
        () in

  let charge_block blk = List.iter charge_instr blk in

  PTree.fold1
    (fun () blk -> charge_block blk)
    f.fn_code ();
  if !option_dalloctrace then begin
    fprintf !pp "------------------ Unspillable variables --------------\n\n";
    PTree.fold
      (fun () r st ->
        if st.cost = max_int then fprintf !pp "x%d " (P.to_int r))
      !costs ();
    fprintf !pp "\n\n"
  end;
  (* Result is cost function: pseudoreg -> stats *)
  get_stats


(********* Construction and coloring of the interference graph **************)

let add_interfs_def g res live =
  VSet.iter (fun v -> if v <> res then IRC.add_interf g v res) live

let add_interfs_move g src dst live =
  VSet.iter (fun v -> if v <> src && v <> dst then IRC.add_interf g v dst) live

let add_interfs_destroyed g live mregs =
  List.iter
    (fun mr -> VSet.iter (IRC.add_interf g (L (R mr))) live)
    mregs

let add_interfs_caller_save g live =
  VSet.iter
    (fun v ->
       let tv = typeof v in
       List.iter
         (fun mr ->
            if not (is_callee_save mr && subtype tv (callee_save_type mr))
            then IRC.add_interf g (L (R mr)) v)
         all_mregs)
    live

let add_interfs_live g live v =
  VSet.iter (fun v' -> IRC.add_interf g v v') live

let add_interfs_list g v vl =
  List.iter (IRC.add_interf g v) vl

let add_interfs_list_mreg g vl mr =
  List.iter (fun v -> IRC.add_interf g v (L (R mr))) vl

let rec add_interfs_pairwise g = function
  | [] -> ()
  | v1 :: vl -> add_interfs_list g v1 vl; add_interfs_pairwise g vl

let add_interfs_instr g instr live =
  match instr with
  | Xmove(src, dst) | Xspill(src, dst) | Xreload(src, dst) ->
      IRC.add_pref g src dst;
      add_interfs_move g src dst live;
      (* Reloads from incoming slots can occur when some 64-bit
         parameters are split and passed as two 32-bit stack locations. *)
      begin match src with
      | L(Locations.S(Incoming, _, _)) ->
          add_interfs_def g (vmreg temp_for_parent_frame) live
      | _ -> ()
      end
  | Xparmove(srcs, dsts, itmp, ftmp) ->
      List.iter2 (IRC.add_pref g) srcs dsts;
      (* Interferences with live across *)
      let across = vset_removelist dsts live in
      List.iter (add_interfs_live g across) dsts;
      add_interfs_live g across itmp; add_interfs_live g across ftmp;
      (* All destinations must be pairwise different *)
      add_interfs_pairwise g dsts;
      (* The temporaries must be different from sources and dests *)
      add_interfs_list g itmp srcs; add_interfs_list g itmp dsts;
      add_interfs_list g ftmp srcs; add_interfs_list g ftmp dsts;
      (* Take into account destroyed reg when accessing Incoming param *)
      if List.exists (function (L(Locations.S(Incoming, _, _))) -> true | _ -> false) srcs
      then begin
         add_interfs_list g (vmreg temp_for_parent_frame) dsts;
         add_interfs_live g across (vmreg temp_for_parent_frame)
      end
  | Xop(op, args, res) ->
      begin match is_two_address op args with
      | None ->
          add_interfs_def g res live
      | Some(arg1, argl) ->
          (* Treat as "res := arg1; res := op(res, argl)" *)
          add_interfs_def g res live;
          IRC.add_pref g arg1 res;
          add_interfs_move g arg1 res
            (vset_addlist (res :: argl) (VSet.remove res live))
      end;
      add_interfs_destroyed g (VSet.remove res live) (destroyed_by_op op)
  | Xload(trap, chunk, addr, args, dst) ->
      add_interfs_def g dst live;
      add_interfs_destroyed g (VSet.remove dst live)
                              (destroyed_by_load chunk addr)
  | Xstore(chunk, addr, args, src) ->
      add_interfs_destroyed g live (destroyed_by_store chunk addr)
  | Xcall(sg, vos, args, res) ->
      begin match vos with
      | Coq_inl v ->  List.iter (fun r -> IRC.add_interf g (vmreg r) v) destroyed_at_indirect_call
      | _ -> () end;
      add_interfs_caller_save g (vset_removelist res live)
  | Xtailcall(sg, Coq_inl v, args) ->
      List.iter (fun r -> IRC.add_interf g (vmreg r) v) (int_callee_save_regs @ destroyed_at_indirect_call)
  | Xtailcall(sg, Coq_inr id, args) ->
      ()
  | Xbuiltin(ef, args, res) ->
      (* Interferences with live across *)
      let across = vset_removeres res live in
      let vres = params_of_builtin_res res in
      List.iter (add_interfs_live g across) vres;
      (* All results must be pairwise different *)
      add_interfs_pairwise g vres;
      add_interfs_destroyed g across (destroyed_by_builtin ef);
      begin match ef, args, res with
      | EF_annot_val _, [BA arg], BR res ->
          (* like a move *)
          IRC.add_pref g arg res
      | EF_inline_asm(txt, sg, clob), _, _ ->
          let vargs = params_of_builtin_args args in
          (* clobbered regs interfere with res and args for GCC compatibility *)
          List.iter (fun c ->
            match Machregs.register_by_name c with
            | None -> ()
            | Some mr ->
                add_interfs_list_mreg g vargs mr;
                add_interfs_list_mreg g vres mr)
            clob
      | _ -> ()
      end
  | Xbranch s ->
      ()
  | Xcond(cond, args, s1, s2, _) ->
      add_interfs_destroyed g live (destroyed_by_cond cond)
  | Xjumptable(arg, tbl) ->
      add_interfs_destroyed g live destroyed_by_jumptable
  | Xreturn optarg ->
      ()

let rec add_interfs_block g blk live =
  match blk with
  | [] -> live
  | instr :: blk' ->
      let live' = add_interfs_block g blk' live in
      add_interfs_instr g instr live';
      live_before instr live'

let find_coloring f liveness =
  (*type_function f;  (* for debugging *)*)
  let g = IRC.init (spill_costs f) in
  PTree.fold
    (fun () pc blk -> ignore (add_interfs_block g blk (PMap.get pc liveness)))
    f.fn_code ();
  add_interfs_destroyed g
    (transfer_live f f.fn_entrypoint (PMap.get f.fn_entrypoint liveness))
    destroyed_at_function_entry;
  IRC.coloring g


(*********** Determination of variables that need spill code insertion *****)

let is_reg alloc v =
  match alloc v with R _ -> true | Locations.S _ -> false

let add_tospill alloc v ts =
  match alloc v with R _ -> ts | Locations.S _ -> VSet.add v ts

let addlist_tospill alloc vl ts =
  List.fold_right (add_tospill alloc) vl ts

let addros_tospill alloc ros ts =
  match ros with Coq_inl v -> add_tospill alloc v ts | Coq_inr s -> ts

let tospill_instr alloc instr ts =
  match instr with
  | Xmove(src, dst) ->
      if is_reg alloc src || is_reg alloc dst || alloc src = alloc dst
      then ts
      else VSet.add src (VSet.add dst ts)
  | Xreload(src, dst) ->
      if not (is_reg alloc dst) then begin
        printf "Error: %a was spilled\n" PrintXTL.var dst;
        assert false
      end;
      ts
  | Xspill(src, dst) ->
      if not (is_reg alloc src) then begin
        printf "Error: %a was spilled\n" PrintXTL.var src;
        assert false
      end;
      ts
  | Xparmove(srcs, dsts, itmp, ftmp) ->
      assert (is_reg alloc itmp && is_reg alloc ftmp);
      ts
  | Xop(op, args, res) ->
      addlist_tospill alloc args (add_tospill alloc res ts)
  | Xload(trap, chunk, addr, args, dst) ->
      addlist_tospill alloc args (add_tospill alloc dst ts)
  | Xstore(chunk, addr, args, src) ->
      addlist_tospill alloc args (add_tospill alloc src ts)
  | Xcall(sg, vos, args, res) ->
      addros_tospill alloc vos ts
  | Xtailcall(sg, vos, args) ->
      addros_tospill alloc vos ts
  | Xbuiltin((EF_annot _ | EF_debug _), _, _) ->
      ts
  | Xbuiltin(ef, args, res) ->
      addlist_tospill alloc (params_of_builtin_args args)
         (addlist_tospill alloc (params_of_builtin_res res) ts)
  | Xbranch s ->
      ts
  | Xcond(cond, args, s1, s2, _) ->
      addlist_tospill alloc args ts
  | Xjumptable(arg, tbl) ->
      add_tospill alloc arg ts
  | Xreturn optarg ->
      ts

let rec tospill_block alloc blk ts =
  match blk with
  | [] -> ts
  | instr :: blk' -> tospill_block alloc blk' (tospill_instr alloc instr ts)

let tospill_function f alloc =
  PTree.fold1
    (fun ts blk -> tospill_block alloc blk ts)
    f.fn_code VSet.empty


(********************* Spilling ***********************)

(* We follow a semi-naive spilling strategy.  By default, we spill at
  every definition of a variable that could not be allocated a register,
  and we reload before every use.  However, we also maintain a list of
  equations of the form [spilled-var = temp] that keep track of
  variables that were recently spilled or reloaded.  Based on these
  equations, we can avoid reloading a spilled variable if its value
  is still available in a temporary register. *)

let rec find_reg_containing v = function
  | [] -> None
  | (var, temp, date) :: eqs ->
      if var = v then Some temp else find_reg_containing v eqs

let add v t eqs = (v, t, 0) :: eqs

let kill x eqs =
  List.filter (fun (v, t, date) -> v <> x && t <> x) eqs

let reload_var tospill eqs v =
  if not (VSet.mem v tospill) then
    (v, [], eqs)
  else
    match find_reg_containing v eqs with
    | Some t ->
        (*printf "Reusing %a for %a@ @." PrintXTL.var t PrintXTL.var v;*)
        (t, [], eqs)
    | None ->
        let t = new_temp (typeof v) in (t, [Xreload(v, t)], add v t eqs)

let rec reload_vars tospill eqs vl =
  match vl with
  | [] -> ([], [], eqs)
  | v1 :: vs ->
      let (t1, c1, eqs1) = reload_var tospill eqs v1 in
      let (ts, cs, eqs2) = reload_vars tospill eqs1 vs in
      (t1 :: ts, c1 @ cs, eqs2)

let rec reload_arg tospill eqs = function
  | BA v ->
      let (t, c1, eqs1) = reload_var tospill eqs v in
      (BA t, c1, eqs1)
  | BA_splitlong(hi, lo) ->
      let (hi', c1, eqs1) = reload_arg tospill eqs hi in
      let (lo', c2, eqs2) = reload_arg tospill eqs1 lo in
      (BA_splitlong(hi', lo'), c1 @ c2, eqs2)
  | BA_addptr(a1, a2) ->
      let (a1', c1, eqs1) = reload_arg tospill eqs a1 in
      let (a2', c2, eqs2) = reload_arg tospill eqs1 a2 in
      (BA_addptr(a1', a2'), c1 @ c2, eqs2)
  | a -> (a, [], eqs)

let rec reload_args tospill eqs = function
  | [] -> ([], [], eqs)
  | a1 :: al ->
      let (t1, c1, eqs1) = reload_arg tospill eqs a1 in
      let (tl, cl, eqs2) = reload_args tospill eqs1 al in
      (t1 :: tl, c1 @ cl, eqs2)

let save_var tospill eqs v =
  if not (VSet.mem v tospill) then
    (v, [], kill v eqs)
  else begin
    let t = new_temp (typeof v) in
    (t, [Xspill(t, v)], add v t (kill v eqs))
  end

let rec save_res tospill eqs = function
  | BR v ->
      let (t, c1, eqs1) = save_var tospill eqs v in
      (BR t, c1, eqs1)
  | BR_none ->
      (BR_none, [], eqs)
  | BR_splitlong(hi, lo) ->
      let (hi', c1, eqs1) = save_res tospill eqs hi in
      let (lo', c2, eqs2) = save_res tospill eqs1 lo in
      (BR_splitlong(hi', lo'), c1 @ c2, eqs2)

(* Trimming equations when we have too many or when they are too old.
   The goal is to limit the live range of unspillable temporaries.
   By setting [max_age] to zero, we can effectively deactivate
   the reuse strategy and fall back to a naive "reload at every use"
   strategy. *)

let max_age = ref 3
let max_num_eqs = ref 3

let rec trim count eqs =
  if count <= 0 then [] else
    match eqs with
    | [] -> []
    | (v, t, date) :: eqs' ->
        if date <= !max_age
        then (v, t, date + 1) :: trim (count - 1) eqs'
        else []

(* Insertion of spill and reload instructions. *)

let spill_instr tospill eqs instr =
  let eqs = trim !max_num_eqs eqs in
  match instr with
  | Xmove(src, dst) ->
      if VSet.mem src tospill && VSet.mem dst tospill then begin
        let (src', c1, eqs1) = reload_var tospill eqs src in
        (c1 @ [Xspill(src', dst)], add dst src' (kill dst eqs1))
      end else begin
        ([instr], kill dst eqs)
      end
  | Xreload(src, dst) ->
      assert false
  | Xspill(src, dst) ->
      assert false
  | Xparmove(srcs, dsts, itmp, ftmp) ->
      ([instr], List.fold_right kill dsts eqs)
  | Xop(op, args, res) ->
      begin match is_two_address op args with
      | None ->
          let (args', c1, eqs1) = reload_vars tospill eqs args in
          let (res', c2, eqs2) = save_var tospill eqs1 res in
          (c1 @ Xop(op, args', res') :: c2, eqs2)
      | Some(arg1, argl) ->
          begin match VSet.mem res tospill, VSet.mem arg1 tospill with
          | false, false ->
              let (argl', c1, eqs1) = reload_vars tospill eqs argl in
              (c1 @ [Xop(op, arg1 :: argl', res)], kill res eqs1)
          | true, false ->
              let tmp = new_temp (typeof res) in
              let (argl', c1, eqs1) = reload_vars tospill eqs argl in
              (c1 @ [Xmove(arg1, tmp); Xop(op, tmp :: argl', tmp); Xspill(tmp, res)],
               add res tmp (kill res eqs1))
          | false, true ->
              let eqs1 = add arg1 res (kill res eqs) in
              let (argl', c1, eqs2) = reload_vars tospill eqs1 argl in
              (* PR#113, PR#122: [Xreload] here causes [res] to become
                 unspillable in the future.  This is too strong, hence
                 the [Xmove].  Alternatively, this [false, true]
                 case could be removed and the [true, true] case
                 below used instead. *)
              (Xmove(arg1, res) :: c1 @ [Xop(op, res :: argl', res)],
               kill res eqs2)
          | true, true ->
              let tmp = new_temp (typeof res) in
              let eqs1 = add arg1 tmp eqs in
              let (argl', c1, eqs2) = reload_vars tospill eqs1 argl in
              (Xreload(arg1, tmp) :: c1 @ [Xop(op, tmp :: argl', tmp); Xspill(tmp, res)],
               add res tmp (kill tmp (kill res eqs2)))
          end
      end
  | Xload(trap, chunk, addr, args, dst) ->
      let (args', c1, eqs1) = reload_vars tospill eqs args in
      let (dst', c2, eqs2) = save_var tospill eqs1 dst in
      (c1 @ Xload(trap, chunk, addr, args', dst') :: c2, eqs2)
  | Xstore(chunk, addr, args, src) ->
      let (args', c1, eqs1) = reload_vars tospill eqs args in
      let (src', c2, eqs2) = reload_var tospill eqs1 src in
      (c1 @ c2 @ [Xstore(chunk, addr, args', src')], eqs2)
  | Xcall(sg, Coq_inl v, args, res) ->
      let (v', c1, eqs1) = reload_var tospill eqs v in
      (c1 @ [Xcall(sg, Coq_inl v', args, res)], [])
  | Xcall(sg, Coq_inr id, args, res) ->
      ([instr], [])
  | Xtailcall(sg, Coq_inl v, args) ->
      let (v', c1, eqs1) = reload_var tospill eqs v in
      (c1 @ [Xtailcall(sg, Coq_inl v', args)], [])
  | Xtailcall(sg, Coq_inr id, args) ->
      ([instr], [])
  | Xbuiltin((EF_annot _ | EF_debug _), args, res) ->
      ([instr], eqs)
  | Xbuiltin(ef, args, res) ->
      let (args', c1, eqs1) = reload_args tospill eqs args in
      let (res', c2, eqs2) = save_res tospill eqs1 res in
      (c1 @ Xbuiltin(ef, args', res') :: c2, eqs2)
  | Xbranch s ->
      ([instr], eqs)
  | Xcond(cond, args, s1, s2, i) ->
     let (args', c1, eqs1) = reload_vars tospill eqs args in
     (c1 @ [Xcond(cond, args', s1, s2, i)], eqs1)
  | Xjumptable(arg, tbl) ->
      let (arg', c1, eqs1) = reload_var tospill eqs arg in
      (c1 @ [Xjumptable(arg', tbl)], eqs1)
  | Xreturn optarg ->
      ([instr], [])

let rec spill_block tospill pc blk eqs =
  match blk with
  | [] -> ([], eqs)
  | instr :: blk' ->
      let (c1, eqs1) = spill_instr tospill eqs instr in
      let (c2, eqs2) = spill_block tospill pc blk' eqs1 in
      (c1 @ c2, eqs2)

(*
let spill_block tospill pc blk eqs =
  printf "@[<hov 2>spill_block: at %ld: " (camlint_of_positive pc);
  List.iter (fun (x,y,d) -> printf "@ %a=%a" PrintXTL.var x PrintXTL.var y) eqs;
  printf "@]@.";
  spill_block tospill pc blk eqs
*)

let spill_function f tospill round =
  max_num_eqs := 3;
  max_age := (if round <= 10 then 3 else if round <= 20 then 1 else 0);
  transform_basic_blocks (spill_block tospill) [] f


(***************** Generation of LTL from XTL ***********************)

(** Apply a register allocation to an XTL function, producing an LTL function.
  Raise [Bad_LTL] if some pseudoregisters were mapped to stack locations
  while machine registers were expected, or in other words if spilling
  and reloading code must be inserted. *)

exception Bad_LTL

let mreg_of alloc v = match alloc v with R mr -> mr | Locations.S _ -> raise Bad_LTL

let mregs_of alloc vl = List.map (mreg_of alloc) vl

let mros_of alloc vos = sum_left_map (mreg_of alloc) vos

let make_move src dst k =
  match src, dst with
  | R rsrc, R rdst ->
      if rsrc = rdst then k else LTL.Lop(Omove, [rsrc], rdst) :: k
  | R rsrc, Locations.S(sl, ofs, ty) ->
      LTL.Lsetstack(rsrc, sl, ofs, ty) :: k
  | Locations.S(sl, ofs, ty), R rdst ->
      LTL.Lgetstack(sl, ofs, ty, rdst) :: k
  | Locations.S _, Locations.S _ ->
      if src = dst then k else raise Bad_LTL

type parmove_status = To_move | Being_moved | Moved

let make_parmove srcs dsts itmp ftmp k =
  let src = Array.of_list srcs
  and dst = Array.of_list dsts in
  let n = Array.length src in
  assert (Array.length dst = n);
  let status = Array.make n To_move in
  let temp_for cls =
    match cls with 0 -> itmp | 1 -> ftmp | _ -> assert false in
  let code = ref [] in
  let add_move s d =
    match s, d with
    | R rs, R rd ->
        code := LTL.Lop(Omove, [rs], rd) :: !code
    | R rs, Locations.S(sl, ofs, ty) ->
        code := LTL.Lsetstack(rs, sl, ofs, ty) :: !code
    | Locations.S(sl, ofs, ty), R rd ->
        code := LTL.Lgetstack(sl, ofs, ty, rd) :: !code
    | Locations.S(sls, ofss, tys), Locations.S(sld, ofsd, tyd) ->
        let tmp = temp_for (Machregsaux.class_of_type tys) in
        (* code will be reversed at the end *)
        code := LTL.Lsetstack(tmp, sld, ofsd, tyd) ::
                LTL.Lgetstack(sls, ofss, tys, tmp) :: !code
    in
  let rec move_one i =
    if src.(i) <> dst.(i) then begin
      status.(i) <- Being_moved;
      for j = 0 to n - 1 do
        if src.(j) = dst.(i) then
          match status.(j) with
          | To_move ->
              move_one j
          | Being_moved ->
              let tmp = R (temp_for (class_of_loc src.(j))) in
              add_move src.(j) tmp;
              src.(j) <- tmp
          | Moved ->
              ()
      done;
      add_move src.(i) dst.(i);
      status.(i) <- Moved
    end in
  for i = 0 to n - 1 do
    if status.(i) = To_move then move_one i
  done;
  List.rev_append !code k

let transl_instr alloc instr k =
  match instr with
  | Xmove(src, dst) | Xreload(src, dst) | Xspill(src, dst) ->
      make_move (alloc src) (alloc dst) k
  | Xparmove(srcs, dsts, itmp, ftmp) ->
      make_parmove (List.map alloc srcs) (List.map alloc dsts)
                   (mreg_of alloc itmp) (mreg_of alloc ftmp) k
  | Xop(op, args, res) ->
      let rargs = mregs_of alloc args
      and rres  = mreg_of alloc res in
      begin match is_two_address op rargs with
      | None ->
          LTL.Lop(op, rargs, rres) :: k
      | Some(rarg1, rargl) ->
          if rarg1 = rres then
            LTL.Lop(op, rargs, rres) :: k
          else
            LTL.Lop(Omove, [rarg1], rres) ::
            LTL.Lop(op, rres :: rargl, rres) :: k
      end
  | Xload(trap, chunk, addr, args, dst) ->
      LTL.Lload(trap, chunk, addr, mregs_of alloc args, mreg_of alloc dst) :: k
  | Xstore(chunk, addr, args, src) ->
      LTL.Lstore(chunk, addr, mregs_of alloc args, mreg_of alloc src) :: k
  | Xcall(sg, vos, args, res) ->
      LTL.Lcall(sg, mros_of alloc vos) :: k
  | Xtailcall(sg, vos, args) ->
      LTL.Ltailcall(sg, mros_of alloc vos) :: []
  | Xbuiltin(ef, args, res) ->
      LTL.Lbuiltin(ef, List.map (AST.map_builtin_arg alloc) args,
                       AST.map_builtin_res (mreg_of alloc) res) :: k
  | Xbranch s ->
      LTL.Lbranch s :: []
  | Xcond(cond, args, s1, s2, i) ->
      LTL.Lcond(cond, mregs_of alloc args, s1, s2, i) :: []
  | Xjumptable(arg, tbl) ->
      LTL.Ljumptable(mreg_of alloc arg, tbl) :: []
  | Xreturn optarg ->
      LTL.Lreturn :: []

let rec transl_block alloc blk =
  match blk with
  | [] -> []
  | instr :: blk' -> transl_instr alloc instr (transl_block alloc blk')

let transl_function fn alloc =
  { LTL.fn_sig = fn.fn_sig;
    LTL.fn_stacksize = fn.fn_stacksize;
    LTL.fn_entrypoint = fn.fn_entrypoint;
    LTL.fn_code = PTree.map1 (transl_block alloc) fn.fn_code
  }


(******************* All together *********************)

exception Timeout

let rec first_round f liveness =
  let alloc = find_coloring f liveness in
  if !option_dalloctrace then begin
    fprintf !pp "-------------- After initial register allocation\n\n";
    PrintXTL.print_function !pp ~alloc: alloc ~live: liveness f
  end;
  let ts = tospill_function f alloc in
  if VSet.is_empty ts then success f alloc else more_rounds f ts 1

and more_rounds f ts count =
  if count >= 40 then raise Timeout;
  let f' = spill_function f ts count in
  let liveness = liveness_analysis f' in
  let alloc = find_coloring f' liveness in
  if !option_dalloctrace then begin
    fprintf !pp "-------------- After register allocation (round %d)\n\n" count;
    PrintXTL.print_function !pp ~alloc: alloc ~live: liveness f'
  end;
  let ts' = tospill_function f' alloc in
  if VSet.is_empty ts'
  then success f' alloc
  else begin
    if !option_dalloctrace then begin
      fprintf !pp "--- Remain to be spilled:\n";
      VSet.iter (fun v -> fprintf !pp "%a " PrintXTL.var v) ts';
      fprintf !pp "\n\n"
    end;
    more_rounds f (VSet.union ts ts') (count + 1)
  end

and success f alloc =
  let f' = transl_function f alloc in
  if !option_dalloctrace then begin
    fprintf !pp "-------------- Candidate allocation\n\n";
    PrintLTL.print_function !pp (intern_string "f") f'
  end;
  f'


let regalloc f =
  init_trace();
  reset_temps();
  let f1 = Splitting.rename_function f in
  match RTLtyping.type_function f1 with
  | Errors.Error msg ->
      Errors.Error(Errors.MSG (coqstring_of_camlstring "RTL code after splitting is ill-typed:") :: msg)
  | Errors.OK tyenv ->
      let f2 = function_of_RTL_function f1 tyenv in
      let liveness = liveness_analysis f2 in
      let f3 = dead_code_elimination f2 liveness in
      if !option_dalloctrace then begin
        fprintf !pp "-------------- Initial XTL\n\n";
        PrintXTL.print_function !pp f3
      end;
      try
        Errors.OK(first_round f3 liveness)
      with
      | Timeout ->
          Errors.Error(Errors.msg (coqstring_of_camlstring "spilling fails to converge"))
      | Type_error_at pc ->
          Errors.Error [Errors.MSG(coqstring_of_camlstring "ill-typed XTL code at PC ");
                 Errors.POS pc]
      | Bad_LTL ->
          Errors.Error(Errors.msg (coqstring_of_camlstring "bad LTL after spilling"))