diff options
Diffstat (limited to 'powerpc/Asmexpand.ml')
-rw-r--r-- | powerpc/Asmexpand.ml | 525 |
1 files changed, 525 insertions, 0 deletions
diff --git a/powerpc/Asmexpand.ml b/powerpc/Asmexpand.ml new file mode 100644 index 00000000..243a4d93 --- /dev/null +++ b/powerpc/Asmexpand.ml @@ -0,0 +1,525 @@ +(* *********************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Xavier Leroy, INRIA Paris-Rocquencourt *) +(* *) +(* Copyright Institut National de Recherche en Informatique et en *) +(* Automatique. All rights reserved. This file is distributed *) +(* under the terms of the INRIA Non-Commercial License Agreement. *) +(* *) +(* *********************************************************************) + +(* Expanding built-ins and some pseudo-instructions by rewriting + of the PPC assembly code. *) + +open Datatypes +open Camlcoq +open Integers +open AST +open Memdata +open Asm + +(* Buffering the expanded code *) + +let current_code = ref ([]: instruction list) + +let emit i = current_code := i :: !current_code + +let emit_loadimm r n = + List.iter emit (Asmgen.loadimm r n []) + +let emit_addimm rd rs n = + List.iter emit (Asmgen.addimm rd rs n []) + +let get_code () = + let c = List.rev !current_code in current_code := []; c + +(* Generation of fresh labels *) + +let dummy_function = { fn_code = []; fn_sig = signature_main } +let current_function = ref dummy_function +let next_label = ref (None : label option) + +let new_label () = + let lbl = + match !next_label with + | Some l -> l + | None -> + (* on-demand computation of the next available label *) + List.fold_left + (fun next instr -> + match instr with + | Plabel l -> if P.lt l next then next else P.succ l + | _ -> next) + P.one (!current_function).fn_code + in + next_label := Some (P.succ lbl); + lbl + +let set_current_function f = + current_function := f; next_label := None + +(* Useful constants *) + +let _0 = Integers.Int.zero +let _1 = Integers.Int.one +let _2 = coqint_of_camlint 2l +let _4 = coqint_of_camlint 4l +let _6 = coqint_of_camlint 6l +let _8 = coqint_of_camlint 8l +let _m4 = coqint_of_camlint (-4l) +let _m8 = coqint_of_camlint (-8l) + +(* Handling of annotations *) + +let expand_annot_val txt targ args res = + emit (Pannot(EF_annot(txt, [AA_arg targ]), List.map (fun r -> APreg r) args)); + begin match args, res with + | [IR src], [IR dst] -> + if dst <> src then emit (Pmr(dst, src)) + | [FR src], [FR dst] -> + if dst <> src then emit (Pfmr(dst, src)) + | _, _ -> + assert false + end + +(* Handling of memcpy *) + +(* On the PowerPC, unaligned accesses to 16- and 32-bit integers are + fast, but unaligned accesses to 64-bit floats can be slow + (not so much on G5, but clearly so on Power7). + So, use 64-bit accesses only if alignment >= 4. + Note that lfd and stfd cannot trap on ill-formed floats. *) + +let expand_builtin_memcpy_small sz al src dst = + let rec copy ofs sz = + if sz >= 8 && al >= 4 && !Clflags.option_ffpu then begin + emit (Plfd(FPR13, Cint ofs, src)); + emit (Pstfd(FPR13, Cint ofs, dst)); + copy (Int.add ofs _8) (sz - 8) + end else if sz >= 4 then begin + emit (Plwz(GPR0, Cint ofs, src)); + emit (Pstw(GPR0, Cint ofs, dst)); + copy (Int.add ofs _4) (sz - 4) + end else if sz >= 2 then begin + emit (Plhz(GPR0, Cint ofs, src)); + emit (Psth(GPR0, Cint ofs, dst)); + copy (Int.add ofs _2) (sz - 2) + end else if sz >= 1 then begin + emit (Plbz(GPR0, Cint ofs, src)); + emit (Pstb(GPR0, Cint ofs, dst)); + copy (Int.add ofs _1) (sz - 1) + end in + copy _0 sz + +let expand_builtin_memcpy_big sz al src dst = + assert (sz >= 4); + emit_loadimm GPR0 (Z.of_uint (sz / 4)); + emit (Pmtctr GPR0); + let (s,d) = if dst <> GPR11 then (GPR11, GPR12) else (GPR12, GPR11) in + emit (Paddi(s, src, Cint _m4)); + emit (Paddi(d, dst, Cint _m4)); + let lbl = new_label() in + emit (Plabel lbl); + emit (Plwzu(GPR0, Cint _4, s)); + emit (Pstwu(GPR0, Cint _4, d)); + emit (Pbdnz lbl); + (* s and d lag behind by 4 bytes *) + match sz land 3 with + | 1 -> emit (Plbz(GPR0, Cint _4, s)); + emit (Pstb(GPR0, Cint _4, d)) + | 2 -> emit (Plhz(GPR0, Cint _4, s)); + emit (Psth(GPR0, Cint _4, d)) + | 3 -> emit (Plhz(GPR0, Cint _4, s)); + emit (Psth(GPR0, Cint _4, d)); + emit (Plbz(GPR0, Cint _6, s)); + emit (Pstb(GPR0, Cint _6, d)) + | _ -> () + +let expand_builtin_memcpy sz al args = + let (dst, src) = + match args with [IR d; IR s] -> (d, s) | _ -> assert false in + if sz <= (if !Clflags.option_ffpu && al >= 4 + then if !Clflags.option_Osize then 35 else 51 + else if !Clflags.option_Osize then 19 else 27) + then expand_builtin_memcpy_small sz al src dst + else expand_builtin_memcpy_big sz al src dst + +(* Handling of volatile reads and writes *) + +let expand_builtin_vload_common chunk base offset res = + match chunk, res with + | Mint8unsigned, IR res -> + emit (Plbz(res, offset, base)) + | Mint8signed, IR res -> + emit (Plbz(res, offset, base)); + emit (Pextsb(res, res)) + | Mint16unsigned, IR res -> + emit (Plhz(res, offset, base)) + | Mint16signed, IR res -> + emit (Plha(res, offset, base)) + | (Mint32 | Many32), IR res -> + emit (Plwz(res, offset, base)) + | Mfloat32, FR res -> + emit (Plfs(res, offset, base)) + | (Mfloat64 | Many64), FR res -> + emit (Plfd(res, offset, base)) + (* Mint64 is special-cased below *) + | _ -> + assert false + +let expand_builtin_vload chunk args res = + begin match args, res with + | [IR addr], [res] when chunk <> Mint64 -> + expand_builtin_vload_common chunk addr (Cint _0) res + | [IR addr], [IR res1; IR res2] when chunk = Mint64 -> + if addr <> res1 then begin + emit (Plwz(res1, Cint _0, addr)); + emit (Plwz(res2, Cint _4, addr)) + end else begin + emit (Plwz(res2, Cint _4, addr)); + emit (Plwz(res1, Cint _0, addr)) + end + | _ -> + assert false + end + +let expand_builtin_vload_global chunk id ofs args res = + begin match res with + | [res] when chunk <> Mint64 -> + emit (Paddis(GPR11, GPR0, Csymbol_high(id, ofs))); + expand_builtin_vload_common chunk GPR11 (Csymbol_low(id, ofs)) res + | [IR res1; IR res2] when chunk = Mint64 -> + emit (Paddis(res1, GPR0, Csymbol_high(id, ofs))); + emit (Plwz(res1, Csymbol_low(id, ofs), res1)); + let ofs = Int.add ofs _4 in + emit (Paddis(res2, GPR0, Csymbol_high(id, ofs))); + emit (Plwz(res2, Csymbol_low(id, ofs), res2)) + | _ -> + assert false + end + +let expand_builtin_vload_sda chunk id ofs args res = + begin match res with + | [res] when chunk <> Mint64 -> + expand_builtin_vload_common chunk GPR0 (Csymbol_sda(id, ofs)) res + | [IR res1; IR res2] when chunk = Mint64 -> + emit (Plwz(res1, Csymbol_sda(id, ofs), GPR0)); + let ofs = Int.add ofs _4 in + emit (Plwz(res2, Csymbol_sda(id, ofs), GPR0)) + | _ -> + assert false + end + +let expand_builtin_vstore_common chunk base offset src = + match chunk, src with + | (Mint8signed | Mint8unsigned), IR src -> + emit (Pstb(src, offset, base)) + | (Mint16signed | Mint16unsigned), IR src -> + emit (Psth(src, offset, base)) + | (Mint32 | Many32), IR src -> + emit (Pstw(src, offset, base)) + | Mfloat32, FR src -> + emit (Pstfs(src, offset, base)) + | (Mfloat64 | Many64), FR src -> + emit (Pstfd(src, offset, base)) + (* Mint64 is special-cased below *) + | _ -> + assert false + +let expand_builtin_vstore chunk args = + begin match args with + | [IR addr; src] when chunk <> Mint64 -> + expand_builtin_vstore_common chunk addr (Cint _0) src + | [IR addr; IR src1; IR src2] when chunk = Mint64 -> + emit (Pstw(src1, Cint _0, addr)); + emit (Pstw(src2, Cint _4, addr)) + | _ -> + assert false + end + +let expand_builtin_vstore_global chunk id ofs args = + begin match args with + | [src] when chunk <> Mint64 -> + let tmp = if src = IR GPR11 then GPR12 else GPR11 in + emit (Paddis(tmp, GPR0, Csymbol_high(id, ofs))); + expand_builtin_vstore_common chunk tmp (Csymbol_low(id, ofs)) src + | [IR src1; IR src2] when chunk = Mint64 -> + let tmp = + if not (List.mem GPR12 [src1; src2]) then GPR12 else + if not (List.mem GPR11 [src1; src2]) then GPR11 else GPR10 in + emit (Paddis(tmp, GPR0, Csymbol_high(id, ofs))); + emit (Pstw(src1, Csymbol_low(id, ofs), tmp)); + let ofs = Int.add ofs _4 in + emit (Paddis(tmp, GPR0, Csymbol_high(id, ofs))); + emit (Pstw(src2, Csymbol_low(id, ofs), tmp)) + | _ -> + assert false + end + +let expand_builtin_vstore_sda chunk id ofs args = + begin match args with + | [src] when chunk <> Mint64 -> + expand_builtin_vstore_common chunk GPR0 (Csymbol_sda(id, ofs)) src + | [IR src1; IR src2] when chunk = Mint64 -> + emit (Pstw(src1, Csymbol_sda(id, ofs), GPR0)); + emit (Pstw(src2, Csymbol_sda(id, ofs), GPR0)) + | _ -> + assert false + end + +(* Handling of varargs *) + +let current_function_stacksize = ref 0l + +let align n a = (n + a - 1) land (-a) + +let rec next_arg_locations ir fr ofs = function + | [] -> + (ir, fr, ofs) + | (Tint | Tany32) :: l -> + if ir < 8 + then next_arg_locations (ir + 1) fr ofs l + else next_arg_locations ir fr (ofs + 4) l + | (Tfloat | Tsingle | Tany64) :: l -> + if fr < 8 + then next_arg_locations ir (fr + 1) ofs l + else next_arg_locations ir fr (align ofs 8 + 8) l + | Tlong :: l -> + if ir < 7 + then next_arg_locations (align ir 2 + 2) fr ofs l + else next_arg_locations ir fr (align ofs 8 + 8) l + +let expand_builtin_va_start r = + if not (!current_function).fn_sig.sig_cc.cc_vararg then + invalid_arg "Fatal error: va_start used in non-vararg function"; + let (ir, fr, ofs) = + next_arg_locations 0 0 0 (!current_function).fn_sig.sig_args in + emit_loadimm GPR0 (Z.of_uint ir); + emit (Pstb(GPR0, Cint _0, r)); + emit_loadimm GPR0 (Z.of_uint fr); + emit (Pstb(GPR0, Cint _1, r)); + emit_addimm GPR0 GPR1 (coqint_of_camlint + Int32.(add (add !current_function_stacksize 8l) + (of_int ofs))); + emit (Pstw(GPR0, Cint _4, r)); + emit_addimm GPR0 GPR1 (coqint_of_camlint + Int32.(sub !current_function_stacksize 96l)); + emit (Pstw(GPR0, Cint _8, r)) + +(* Auxiliary for 64-bit integer arithmetic built-ins. They expand to + two instructions, one computing the low 32 bits of the result, + followed by another computing the high 32 bits. In cases where + the first instruction would overwrite arguments to the second + instruction, we must go through GPR0 to hold the low 32 bits of the result. +*) + +let expand_int64_arith conflict rl fn = + if conflict then (fn GPR0; emit (Pmr(rl, GPR0))) else fn rl + +(* Handling of compiler-inlined builtins *) + +let expand_builtin_inline name args res = + (* Can use as temporaries: GPR0, FPR13 *) + match name, args, res with + (* Integer arithmetic *) + | "__builtin_mulhw", [IR a1; IR a2], [IR res] -> + emit (Pmulhw(res, a1, a2)) + | "__builtin_mulhwu", [IR a1; IR a2], [IR res] -> + emit (Pmulhwu(res, a1, a2)) + | "__builtin_cntlz", [IR a1], [IR res] -> + emit (Pcntlz(res, a1)) + | ("__builtin_bswap" | "__builtin_bswap32"), [IR a1], [IR res] -> + emit (Pstwu(a1, Cint _m8, GPR1)); + emit (Pcfi_adjust _8); + emit (Plwbrx(res, GPR0, GPR1)); + emit (Paddi(GPR1, GPR1, Cint _8)); + emit (Pcfi_adjust _m8) + | "__builtin_bswap16", [IR a1], [IR res] -> + emit (Prlwinm(GPR0, a1, _8, coqint_of_camlint 0x0000FF00l)); + emit (Prlwinm(res, a1, coqint_of_camlint 24l, + coqint_of_camlint 0x000000FFl)); + emit (Por(res, GPR0, res)) + (* Float arithmetic *) + | "__builtin_fmadd", [FR a1; FR a2; FR a3], [FR res] -> + emit (Pfmadd(res, a1, a2, a3)) + | "__builtin_fmsub", [FR a1; FR a2; FR a3], [FR res] -> + emit (Pfmsub(res, a1, a2, a3)) + | "__builtin_fnmadd", [FR a1; FR a2; FR a3], [FR res] -> + emit (Pfnmadd(res, a1, a2, a3)) + | "__builtin_fnmsub", [FR a1; FR a2; FR a3], [FR res] -> + emit (Pfnmsub(res, a1, a2, a3)) + | "__builtin_fabs", [FR a1], [FR res] -> + emit (Pfabs(res, a1)) + | "__builtin_fsqrt", [FR a1], [FR res] -> + emit (Pfsqrt(res, a1)) + | "__builtin_frsqrte", [FR a1], [FR res] -> + emit (Pfrsqrte(res, a1)) + | "__builtin_fres", [FR a1], [FR res] -> + emit (Pfres(res, a1)) + | "__builtin_fsel", [FR a1; FR a2; FR a3], [FR res] -> + emit (Pfsel(res, a1, a2, a3)) + | "__builtin_fcti", [FR a1], [IR res] -> + emit (Pfctiw(FPR13, a1)); + emit (Pstfdu(FPR13, Cint _m8, GPR1)); + emit (Pcfi_adjust _8); + emit (Plwz(res, Cint _4, GPR1)); + emit (Paddi(GPR1, GPR1, Cint _8)); + emit (Pcfi_adjust _m8) + (* 64-bit integer arithmetic *) + | "__builtin_negl", [IR ah; IR al], [IR rh; IR rl] -> + expand_int64_arith (rl = ah) rl (fun rl' -> + emit (Psubfic(rl', al, Cint _0)); + emit (Psubfze(rh, ah))) + | "__builtin_addl", [IR ah; IR al; IR bh; IR bl], [IR rh; IR rl] -> + expand_int64_arith (rl = ah || rl = bh) rl (fun rl' -> + emit (Paddc(rl', al, bl)); + emit (Padde(rh, ah, bh))) + | "__builtin_subl", [IR ah; IR al; IR bh; IR bl], [IR rh; IR rl] -> + expand_int64_arith (rl = ah || rl = bh) rl (fun rl' -> + emit (Psubfc(rl', bl, al)); + emit (Psubfe(rh, bh, ah))) + | "__builtin_mull", [IR a; IR b], [IR rh; IR rl] -> + expand_int64_arith (rl = a || rl = b) rl (fun rl' -> + emit (Pmullw(rl, a, b)); + emit (Pmulhwu(rh, a, b))) + (* Memory accesses *) + | "__builtin_read16_reversed", [IR a1], [IR res] -> + emit (Plhbrx(res, GPR0, a1)) + | "__builtin_read32_reversed", [IR a1], [IR res] -> + emit (Plwbrx(res, GPR0, a1)) + | "__builtin_write16_reversed", [IR a1; IR a2], _ -> + emit (Psthbrx(a2, GPR0, a1)) + | "__builtin_write32_reversed", [IR a1; IR a2], _ -> + emit (Pstwbrx(a2, GPR0, a1)) + (* Synchronization *) + | "__builtin_eieio", [], _ -> + emit (Peieio) + | "__builtin_sync", [], _ -> + emit (Psync) + | "__builtin_isync", [], _ -> + emit (Pisync) + | "__builtin_trap", [], _ -> + emit (Ptrap) + (* Vararg stuff *) + | "__builtin_va_start", [IR a], _ -> + expand_builtin_va_start a + (* Catch-all *) + | _ -> + invalid_arg ("unrecognized builtin " ^ name) + +(* Calls to variadic functions: condition bit 6 must be set + if at least one argument is a float; clear otherwise. + Note that variadic functions cannot have arguments of type Tsingle. *) + +let set_cr6 sg = + if sg.sig_cc.cc_vararg then begin + if List.mem Tfloat sg.sig_args + then emit (Pcreqv(CRbit_6, CRbit_6, CRbit_6)) + else emit (Pcrxor(CRbit_6, CRbit_6, CRbit_6)) + end + +(* Expand instructions *) + +let num_crbit = function + | CRbit_0 -> 0 + | CRbit_1 -> 1 + | CRbit_2 -> 2 + | CRbit_3 -> 3 + | CRbit_6 -> 6 + +let expand_instruction instr = + match instr with + | Pallocframe(sz, ofs) -> + let variadic = (!current_function).fn_sig.sig_cc.cc_vararg in + let sz = camlint_of_coqint sz in + assert (ofs = Int.zero); + let sz = if variadic then Int32.add sz 96l else sz in + let adj = Int32.neg sz in + if adj >= -0x8000l then + emit (Pstwu(GPR1, Cint(coqint_of_camlint adj), GPR1)) + else begin + emit_loadimm GPR0 (coqint_of_camlint adj); + emit (Pstwxu(GPR1, GPR1, GPR0)) + end; + emit (Pcfi_adjust (coqint_of_camlint sz)); + if variadic then begin + emit (Pmflr GPR0); + emit (Pbl(intern_string "__compcert_va_saveregs", + {sig_args = []; sig_res = None; sig_cc = cc_default})); + emit (Pmtlr GPR0) + end; + current_function_stacksize := sz + | Pbctr sg | Pbctrl sg | Pbl(_, sg) | Pbs(_, sg) -> + set_cr6 sg; + emit instr + | Pfreeframe(sz, ofs) -> + let variadic = (!current_function).fn_sig.sig_cc.cc_vararg in + let sz = camlint_of_coqint sz in + let sz = if variadic then Int32.add sz 96l else sz in + if sz < 0x8000l then + emit (Paddi(GPR1, GPR1, Cint(coqint_of_camlint sz))) + else + emit (Plwz(GPR1, Cint ofs, GPR1)) + | Pfcti(r1, r2) -> + emit (Pfctiwz(FPR13, r2)); + emit (Pstfdu(FPR13, Cint _m8, GPR1)); + emit (Pcfi_adjust _8); + emit (Plwz(r1, Cint _4, GPR1)); + emit (Paddi(GPR1, GPR1, Cint _8)); + emit (Pcfi_adjust _m8) + | Pfmake(rd, r1, r2) -> + emit (Pstwu(r1, Cint _m8, GPR1)); + emit (Pcfi_adjust _8); + emit (Pstw(r2, Cint _4, GPR1)); + emit (Plfd(rd, Cint _0, GPR1)); + emit (Paddi(GPR1, GPR1, Cint _8)); + emit (Pcfi_adjust _m8); + | Pfxdp(r1, r2) -> + if r1 <> r2 then emit(Pfmr(r1, r2)) + | Pmfcrbit(r1, bit) -> + emit (Pmfcr r1); + emit (Prlwinm(r1, r1, Z.of_uint (1 + num_crbit bit), _1)) + | Pbuiltin(ef, args, res) -> + begin match ef with + | EF_builtin(name, sg) -> + expand_builtin_inline (extern_atom name) args res + | EF_vload chunk -> + expand_builtin_vload chunk args res + | EF_vstore chunk -> + expand_builtin_vstore chunk args + | EF_vload_global(chunk, id, ofs) -> + if symbol_is_small_data id ofs + then expand_builtin_vload_sda chunk id ofs args res + else expand_builtin_vload_global chunk id ofs args res + | EF_vstore_global(chunk, id, ofs) -> + if symbol_is_small_data id ofs + then expand_builtin_vstore_sda chunk id ofs args + else expand_builtin_vstore_global chunk id ofs args + | EF_memcpy(sz, al) -> + expand_builtin_memcpy (Z.to_int sz) (Z.to_int al) args + | EF_annot_val(txt, targ) -> + expand_annot_val txt targ args res + | EF_inline_asm txt -> + emit instr + | _ -> + assert false + end + | _ -> + emit instr + +let expand_function fn = + set_current_function fn; + current_code := []; + List.iter expand_instruction fn.fn_code; + let c = get_code() in + set_current_function dummy_function; + { fn with fn_code = c } + +let expand_fundef = function + | Internal f -> Internal (expand_function f) + | External ef -> External ef + +let expand_program (p: Asm.program) : Asm.program = + AST.transform_program expand_fundef p |