From 994c6c34182606385140e5695e33c90507ce59ee Mon Sep 17 00:00:00 2001 From: Xavier Leroy Date: Mon, 19 Sep 2022 16:37:17 +0200 Subject: Support C11 Unicode string literals and character constants (#452) * Support C11 Unicode string literals and character constants * Add tests for C11 string literals and character constants * Better error message for ill-formed universal character names E.g. \u followed by fewer than 4 hex digits, or \U followed by fewer than 8 hex digits. * Add new warning `invalid-utf8` for byte sequences that are not valid UTF8. The warning is activated but not fatal by default. * Warn on uses of C11 Unicode character constants and string literals This uses the `c11-extensions` warning, which is off by default. * Support preprocessing option -finput-charset= for GNU toolchains --- cfrontend/C2C.ml | 197 ++++++++++++++++++-------------------- cparser/C.mli | 2 +- cparser/Cabs.v | 17 +++- cparser/Ceval.ml | 6 +- cparser/Cprint.ml | 2 +- cparser/Cutil.ml | 4 +- cparser/Diagnostics.ml | 3 + cparser/Diagnostics.mli | 1 + cparser/Elab.ml | 128 ++++++++++++++++--------- cparser/Elab.mli | 2 +- cparser/Lexer.mll | 180 ++++++++++++++++++++++++++-------- cparser/Parser.vy | 20 ++-- cparser/pre_parser.mly | 2 +- driver/Frontend.ml | 5 +- test/regression/Makefile | 2 +- test/regression/Results/charlit | 27 ++++++ test/regression/Results/stringlit | 15 +++ test/regression/charlit.c | 50 ++++++++++ test/regression/stringlit.c | 41 ++++++++ 19 files changed, 490 insertions(+), 214 deletions(-) create mode 100644 test/regression/Results/charlit create mode 100644 test/regression/Results/stringlit create mode 100644 test/regression/charlit.c create mode 100644 test/regression/stringlit.c diff --git a/cfrontend/C2C.ml b/cfrontend/C2C.ml index b0dc8e8a..89b9139c 100644 --- a/cfrontend/C2C.ml +++ b/cfrontend/C2C.ml @@ -287,105 +287,6 @@ let attributes = [ ] -(** ** Functions used to handle string literals *) - -let stringNum = ref 0 (* number of next global for string literals *) -let stringTable : (string, AST.ident) Hashtbl.t = Hashtbl.create 47 -let wstringTable : (int64 list, AST.ident) Hashtbl.t = Hashtbl.create 47 - -let is_C_string s = not (String.contains s '\000') - -let name_for_string_literal s = - try - Hashtbl.find stringTable s - with Not_found -> - incr stringNum; - let name = Printf.sprintf "__stringlit_%d" !stringNum in - let id = intern_string name in - let mergeable = if is_C_string s then 1 else 0 in - Hashtbl.add decl_atom id - { a_storage = C.Storage_static; - a_alignment = Some 1; - a_size = Some (Int64.of_int (String.length s + 1)); - a_sections = [Sections.for_stringlit mergeable]; - a_access = Sections.Access_default; - a_inline = No_specifier; - a_loc = Cutil.no_loc }; - Hashtbl.add stringTable s id; - id - -let typeStringLiteral s = - let sg = if Machine.((!config).char_signed) then Signed else Unsigned in - Tarray(Tint(I8, sg, noattr), Z.of_uint (String.length s + 1), noattr) - -let global_for_string s id = - let init = ref [] in - let add_char c = - init := AST.Init_int8(Z.of_uint(Char.code c)) :: !init in - add_char '\000'; - for i = String.length s - 1 downto 0 do add_char s.[i] done; - AST.(id, Gvar { gvar_info = typeStringLiteral s; gvar_init = !init; - gvar_readonly = true; gvar_volatile = false}) - -let is_C_wide_string s = not (List.mem 0L s) - -let name_for_wide_string_literal s = - try - Hashtbl.find wstringTable s - with Not_found -> - incr stringNum; - let name = Printf.sprintf "__stringlit_%d" !stringNum in - let id = intern_string name in - let wchar_size = Machine.((!config).sizeof_wchar) in - let mergeable = if is_C_wide_string s then wchar_size else 0 in - Hashtbl.add decl_atom id - { a_storage = C.Storage_static; - a_alignment = Some wchar_size; - a_size = Some (Int64.(mul (of_int (List.length s + 1)) - (of_int wchar_size))); - a_sections = [Sections.for_stringlit mergeable]; - a_access = Sections.Access_default; - a_inline = No_specifier; - a_loc = Cutil.no_loc }; - Hashtbl.add wstringTable s id; - id - -let typeWideStringLiteral s = - let sz = - match Machine.((!config).sizeof_wchar) with - | 2 -> I16 - | 4 -> I32 - | _ -> assert false in - let sg = - if Machine.((!config).wchar_signed) then Signed else Unsigned in - Tarray(Tint(sz, sg, noattr), Z.of_uint (List.length s + 1), noattr) - -let global_for_wide_string s id = - let init = ref [] in - let init_of_char = - match Machine.((!config).sizeof_wchar) with - | 2 -> (fun z -> AST.Init_int16 z) - | 4 -> (fun z -> AST.Init_int32 z) - | _ -> assert false in - let add_char c = - init := init_of_char(Z.of_uint64 c) :: !init in - List.iter add_char s; - add_char 0L; - AST.(id, Gvar { gvar_info = typeWideStringLiteral s; - gvar_init = List.rev !init; - gvar_readonly = true; gvar_volatile = false}) - -let globals_for_strings globs = - let globs1 = - Hashtbl.fold - (fun s id l -> global_for_wide_string s id :: l) - wstringTable globs in - let globs2 = - Hashtbl.fold - (fun s id l -> global_for_string s id :: l) - stringTable globs1 in - globs2 - (** ** Handling of inlined memcpy functions *) let constant_size_t a = @@ -667,6 +568,98 @@ let is_int64 env ty = | C.TEnum(_, _) -> false | _ -> assert false +(** String literals *) + +let stringNum = ref 0 (* number of next global for string literals *) +let stringTable : (string, AST.ident) Hashtbl.t = Hashtbl.create 47 +let wstringTable : (int64 list * ikind, AST.ident) Hashtbl.t = Hashtbl.create 47 + +let is_C_string s = not (String.contains s '\000') + +let name_for_string_literal s = + try + Hashtbl.find stringTable s + with Not_found -> + incr stringNum; + let name = Printf.sprintf "__stringlit_%d" !stringNum in + let id = intern_string name in + let mergeable = if is_C_string s then 1 else 0 in + Hashtbl.add decl_atom id + { a_storage = C.Storage_static; + a_alignment = Some 1; + a_size = Some (Int64.of_int (String.length s + 1)); + a_sections = [Sections.for_stringlit mergeable]; + a_access = Sections.Access_default; + a_inline = No_specifier; + a_loc = Cutil.no_loc }; + Hashtbl.add stringTable s id; + id + +let typeStringLiteral s = + let sg = if Machine.((!config).char_signed) then Signed else Unsigned in + Tarray(Tint(I8, sg, noattr), Z.of_uint (String.length s + 1), noattr) + +let global_for_string s id = + let init = ref [] in + let add_char c = + init := AST.Init_int8(Z.of_uint(Char.code c)) :: !init in + add_char '\000'; + for i = String.length s - 1 downto 0 do add_char s.[i] done; + AST.(id, Gvar { gvar_info = typeStringLiteral s; gvar_init = !init; + gvar_readonly = true; gvar_volatile = false}) + +let is_C_wide_string s = not (List.mem 0L s) + +let name_for_wide_string_literal s ik = + try + Hashtbl.find wstringTable (s, ik) + with Not_found -> + incr stringNum; + let name = Printf.sprintf "__stringlit_%d" !stringNum in + let id = intern_string name in + let wchar_size = Cutil.sizeof_ikind ik in + let mergeable = if is_C_wide_string s then wchar_size else 0 in + Hashtbl.add decl_atom id + { a_storage = C.Storage_static; + a_alignment = Some wchar_size; + a_size = Some (Int64.(mul (of_int (List.length s + 1)) + (of_int wchar_size))); + a_sections = [Sections.for_stringlit mergeable]; + a_access = Sections.Access_default; + a_inline = No_specifier; + a_loc = Cutil.no_loc }; + Hashtbl.add wstringTable (s, ik) id; + id + +let typeWideStringLiteral s ik = + Tarray(convertIkind ik noattr, Z.of_uint (List.length s + 1), noattr) + +let global_for_wide_string (s, ik) id = + let init = ref [] in + let init_of_char = + match Cutil.sizeof_ikind ik with + | 2 -> (fun z -> AST.Init_int16 z) + | 4 -> (fun z -> AST.Init_int32 z) + | _ -> assert false in + let add_char c = + init := init_of_char(Z.of_uint64 c) :: !init in + List.iter add_char s; + add_char 0L; + AST.(id, Gvar { gvar_info = typeWideStringLiteral s ik; + gvar_init = List.rev !init; + gvar_readonly = true; gvar_volatile = false}) + +let globals_for_strings globs = + let globs1 = + Hashtbl.fold + (fun s id l -> global_for_wide_string s id :: l) + wstringTable globs in + let globs2 = + Hashtbl.fold + (fun s id l -> global_for_string s id :: l) + stringTable globs1 in + globs2 + (** Floating point constants *) let z_of_str hex str fst = @@ -1000,9 +993,9 @@ and convertLvalue env e = | C.EConst(C.CStr s) -> let ty = typeStringLiteral s in Evar(name_for_string_literal s, ty) - | C.EConst(C.CWStr s) -> - let ty = typeWideStringLiteral s in - Evar(name_for_wide_string_literal s, ty) + | C.EConst(C.CWStr(s, ik)) -> + let ty = typeWideStringLiteral s ik in + Evar(name_for_wide_string_literal s ik, ty) | _ -> error "illegal lvalue"; ezero diff --git a/cparser/C.mli b/cparser/C.mli index 763a9277..1388fab9 100644 --- a/cparser/C.mli +++ b/cparser/C.mli @@ -64,7 +64,7 @@ type constant = | CInt of int64 * ikind * string (* as it appeared in the source *) | CFloat of float_cst * fkind | CStr of string - | CWStr of int64 list + | CWStr of int64 list * ikind | CEnum of ident * int64 (* enum tag, integer value *) (** Attributes *) diff --git a/cparser/Cabs.v b/cparser/Cabs.v index bf8c8c74..59cd30ab 100644 --- a/cparser/Cabs.v +++ b/cparser/Cabs.v @@ -31,6 +31,13 @@ Record floatInfo := { suffix_FI:option string }. +Inductive encoding := + | EncNone (* no prefix *) + | EncWide (* 'L' prefix *) + | EncU16 (* 'u' prefix *) + | EncU32 (* 'U' prefix *) + | EncUTF8. (* 'u8' prefix (strings only) *) + Inductive structOrUnion := | STRUCT | UNION. @@ -152,8 +159,8 @@ with constant := the source code. *) | CONST_INT : string -> constant | CONST_FLOAT : floatInfo -> constant - | CONST_CHAR : bool -> list char_code -> constant - | CONST_STRING : bool -> list char_code -> constant + | CONST_CHAR : encoding -> list char_code -> constant + | CONST_STRING : encoding -> list char_code -> constant with init_expression := | NO_INIT @@ -194,9 +201,9 @@ Definition generic_assoc := (option type_name * expression)%type. (* GCC extended asm *) Inductive asm_operand := -| ASMOPERAND: option string -> bool -> list char_code -> expression -> asm_operand. +| ASMOPERAND: option string -> encoding -> list char_code -> expression -> asm_operand. -Definition asm_flag := (bool * list char_code)%type. +Definition asm_flag := (encoding * list char_code)%type. (* ** Declaration definition (at toplevel) @@ -227,7 +234,7 @@ with statement := | DEFAULT : statement -> loc -> statement | LABEL : string -> statement -> loc -> statement | GOTO : string -> loc -> statement - | ASM : list cvspec -> bool -> list char_code -> list asm_operand -> list asm_operand -> list asm_flag -> loc -> statement + | ASM : list cvspec -> encoding -> list char_code -> list asm_operand -> list asm_operand -> list asm_flag -> loc -> statement | DEFINITION : definition -> statement (*definition or declaration of a variable or type*) with for_clause := diff --git a/cparser/Ceval.ml b/cparser/Ceval.ml index 14f61e06..0800e25b 100644 --- a/cparser/Ceval.ml +++ b/cparser/Ceval.ml @@ -72,7 +72,7 @@ let normalize_int n ik = type value = | I of int64 | S of string - | WS of int64 list + | WS of int64 list * ikind let boolean_value v = match v with @@ -83,7 +83,7 @@ let constant = function | CInt(v, ik, _) -> I (normalize_int v ik) | CFloat(v, fk) -> raise Notconst | CStr s -> S s - | CWStr s -> WS s + | CWStr(s, ik) -> WS(s, ik) | CEnum(id, v) -> I v let is_signed env ty = @@ -274,7 +274,7 @@ let constant_expr env ty e = | TInt(ik, _), I n -> Some(CInt(n, ik, "")) | TPtr(_, _), I n -> Some(CInt(n, ptr_t_ikind (), "")) | (TArray(_, _, _) | TPtr(_, _)), S s -> Some(CStr s) - | (TArray(_, _, _) | TPtr(_, _)), WS s -> Some(CWStr s) + | (TArray(_, _, _) | TPtr(_, _)), WS(s, ik) -> Some(CWStr(s, ik)) | TEnum(_, _), I n -> Some(CInt(n, enum_ikind, "")) | _ -> None with Notconst -> None diff --git a/cparser/Cprint.ml b/cparser/Cprint.ml index caa4fa66..05448784 100644 --- a/cparser/Cprint.ml +++ b/cparser/Cprint.ml @@ -75,7 +75,7 @@ let const pp = function else fprintf pp "\\%03o" (Char.code c) done; fprintf pp "\"" - | CWStr l -> + | CWStr(l, _) -> fprintf pp "L\""; List.iter (fun c -> diff --git a/cparser/Cutil.ml b/cparser/Cutil.ml index 6fd12323..f226d51b 100644 --- a/cparser/Cutil.ml +++ b/cparser/Cutil.ml @@ -1024,9 +1024,9 @@ let type_of_constant = function | CStr s -> let size = Int64.of_int (String.length s + 1) in TArray(TInt(IChar,[]), Some size, []) - | CWStr s -> + | CWStr(s, ik) -> let size = Int64.of_int (List.length s + 1) in - TArray(TInt(wchar_ikind(), []), Some size, []) + TArray(TInt(ik, []), Some size, []) | CEnum(_, _) -> TInt(IInt, []) (* Check that a C expression is a lvalue *) diff --git a/cparser/Diagnostics.ml b/cparser/Diagnostics.ml index 483b0376..8a8a0c17 100644 --- a/cparser/Diagnostics.ml +++ b/cparser/Diagnostics.ml @@ -104,6 +104,7 @@ type warning_type = | Tentative_incomplete_static | Reduced_alignment | Non_linear_cond_expr + | Invalid_UTF8 (* List of all warnings with default status. "true" means the warning is active by default. @@ -140,6 +141,7 @@ let all_warnings = (Tentative_incomplete_static, false); (Reduced_alignment, false); (Non_linear_cond_expr, false); + (Invalid_UTF8, true); ] (* List of active warnings *) @@ -182,6 +184,7 @@ let string_of_warning = function | Tentative_incomplete_static -> "tentative-incomplete-static" | Reduced_alignment -> "reduced-alignment" | Non_linear_cond_expr -> "non-linear-cond-expr" + | Invalid_UTF8 -> "invalid-utf8" (* Activate the given warning *) let activate_warning w () = diff --git a/cparser/Diagnostics.mli b/cparser/Diagnostics.mli index 1210353f..47727707 100644 --- a/cparser/Diagnostics.mli +++ b/cparser/Diagnostics.mli @@ -57,6 +57,7 @@ type warning_type = | Tentative_incomplete_static (** static tentative definition with incomplete type *) | Reduced_alignment (** alignment reduction *) | Non_linear_cond_expr (** condition that cannot be linearized *) + | Invalid_UTF8 (** invalid UTF-8 encoding *) val warning : (string * int) -> warning_type -> ('a, Format.formatter, unit, unit, unit, unit) format6 -> 'a (** [warning (f,c) w fmt arg1 ... argN] formats the arguments [arg1] to [argN] as warining according to diff --git a/cparser/Elab.ml b/cparser/Elab.ml index 01e745c3..ec79634a 100644 --- a/cparser/Elab.ml +++ b/cparser/Elab.ml @@ -395,48 +395,75 @@ let elab_float_constant f = in (v, ty) -let elab_char_constant loc wide chars = +let check_char_range loc ikind chars = + let max = Int64.shift_left 1L (sizeof_ikind ikind * 8) in + List.iter + (fun c -> + if c >= max then error loc "escape sequence 0x%LX is out of range" c) + chars + +let ikind_of_encoding = function + | EncNone -> IChar + | EncWide -> wchar_ikind() + | EncU16 -> IUShort + | EncU32 -> IUInt + | EncUTF8 -> IChar + +let elab_char_constant loc enc chars = let len = List.length chars in - let nbits = if wide then 8 * !config.sizeof_wchar else 8 in - let max_digit = Int64.shift_left 1L nbits in - (* Treat multi-character constants as a number in base 2^nbits. - It must fit in type int for a normal constant and in type wchar_t - for a wide constant. *) + (* We support multi-character constants for EncNone character literals only. + We treat them as big-endian numbers in base 256. *) let v = - if len > (if wide then 1 else !config.sizeof_int) then begin - error loc "%d-character constant too long for its type" len; - 0L - end else - List.fold_left - (fun acc d -> - if d < 0L || d >= max_digit then - error loc "escape sequence is out of range (code 0x%LX)" d; - Int64.add (Int64.shift_left acc nbits) d) - 0L chars in - (* C99 6.4.4.4 items 10 and 11: - single-character constant -> represent at type char + match chars, enc with + | [], _ -> error loc "empty character constant"; 0L + | [c], _ -> c + | _, EncNone -> + if len > !config.sizeof_int then begin + error loc "%d-character constant too long, overflows its type" len; + 0L + end else begin + check_char_range loc IUChar chars; + List.fold_left + (fun acc d -> Int64.(add (shift_left acc 8) d)) + 0L chars + end + | _, _ -> + error loc "%d-character constant not supported" len; 0L in + (* C11 6.4.4.4 items 10 and 11: + normal single-character constant -> represent at type char multi-character constant -> represent at type int - wide character constant -> represent at type wchar_t *) - Ceval.normalize_int v - (if wide then wchar_ikind() else if len = 1 then IChar else IInt) - -let elab_string_literal loc wide chars = - let nbits = if wide then 8 * !config.sizeof_wchar else 8 in - let char_max = Int64.shift_left 1L nbits in - List.iter - (fun c -> - if c < 0L || c >= char_max - then error loc "escape sequence is out of range (code 0x%LX)" c) - chars; - if wide then - CWStr chars - else begin - let res = Bytes.create (List.length chars) in - List.iteri - (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c))) - chars; - CStr (Bytes.to_string res) - end + L character constant -> represent at type wchar_t + u character constant -> represent at type char16_t + U character constant -> represent at type char32_t *) + let ik = + if enc = EncNone && len > 1 then IInt else ikind_of_encoding enc in + let v' = Ceval.normalize_int v ik in + if v' <> v then + warning loc Constant_conversion + "overflow in character constant, changes value from %Ld to %Ld" v v'; + v' + +let elab_string_literal loc enc chars = + let ik = ikind_of_encoding enc in + check_char_range loc ik chars; + match enc with + | EncNone | EncUTF8 -> + let res = Bytes.create (List.length chars) in + List.iteri + (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c))) + chars; + CStr (Bytes.to_string res) + | EncWide | EncU16 | EncU32 -> + CWStr(chars, ik) + +let warn_C11_literals loc enc kind = + let warn enc = + warning loc Celeven_extension "'%s' %s are a C11 extension" enc kind in + match enc with + | EncNone | EncWide -> () + | EncUTF8 -> warn "u8" + | EncU16 -> warn "u" + | EncU32 -> warn "U" let elab_constant loc = function | CONST_INT s -> @@ -445,14 +472,22 @@ let elab_constant loc = function | CONST_FLOAT f -> let (v, fk) = elab_float_constant f in CFloat(v, fk) - | CONST_CHAR(wide, s) -> - let ikind = if wide then wchar_ikind () else IInt in - CInt(elab_char_constant loc wide s, ikind, "") + | CONST_CHAR(enc, s) -> + warn_C11_literals loc enc "character constants"; + let ikind = + match enc with + | EncNone -> IInt + | EncWide -> wchar_ikind () + | EncU16 -> IUShort + | EncU32 -> IUInt + | EncUTF8 -> assert false in + CInt(elab_char_constant loc enc s, ikind, "") | CONST_STRING(wide, s) -> + warn_C11_literals loc wide "string literals"; elab_string_literal loc wide s -let elab_simple_string loc wide chars = - match elab_string_literal loc wide chars with +let elab_simple_string loc enc chars = + match elab_string_literal loc enc chars with | CStr s -> s | _ -> error loc "cannot use wide string literal in 'asm'"; "" @@ -1586,6 +1621,7 @@ and elab_item zi item il = | COMPOUND_INIT [_, SINGLE_INIT(CONSTANT (CONST_STRING(w, s)))]), TArray(ty_elt, sz, _) when is_integer_type env ty_elt -> + warn_C11_literals loc w "string literals"; begin match elab_string_literal loc w s, unroll env ty_elt with | CStr s, TInt((IChar | ISChar | IUChar), _) -> if not (I.index_below (Int64.of_int(String.length s - 1)) sz) then @@ -1594,12 +1630,12 @@ and elab_item zi item il = | CStr _, _ -> error loc "initialization of an array of non-char elements with a string literal"; elab_list zi il false - | CWStr s, TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(wchar_ikind(), [])) -> + | CWStr(s, ik), TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(ik, [])) -> if not (I.index_below (Int64.of_int(List.length s - 1)) sz) then warning loc Unnamed "initializer string for array of wide chars %s is too long" (I.name zi); elab_list (I.set zi (init_int_array_wstring sz s)) il false | CWStr _, _ -> - error loc "initialization of an array of non-wchar_t elements with a wide string literal"; + error loc "type mismatch between array destination and wide string literal"; elab_list zi il false | _ -> assert false end diff --git a/cparser/Elab.mli b/cparser/Elab.mli index bca4f74d..537056fa 100644 --- a/cparser/Elab.mli +++ b/cparser/Elab.mli @@ -21,6 +21,6 @@ val elab_file : Cabs.definition list -> C.program val elab_int_constant : Cabs.loc -> string -> int64 * C.ikind val elab_float_constant : Cabs.floatInfo -> C.float_cst * C.fkind -val elab_char_constant : Cabs.loc -> bool -> int64 list -> int64 +val elab_char_constant : Cabs.loc -> Cabs.encoding -> int64 list -> int64 (* These auxiliary functions are exported so that they can be reused in other projects that deal with C-style source languages. *) diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll index 9f7fba1e..2efa4216 100644 --- a/cparser/Lexer.mll +++ b/cparser/Lexer.mll @@ -3,6 +3,7 @@ (* The Compcert verified compiler *) (* *) (* Jacques-Henri Jourdan, INRIA Paris-Rocquencourt *) +(* Xavier Leroy, Collège de France and Inria *) (* *) (* Copyright Institut National de Recherche en Informatique et en *) (* Automatique. All rights reserved. This file is distributed *) @@ -144,9 +145,9 @@ let error lb fmt = Diagnostics.error (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) fmt -let warning lb fmt = +let warning lb kind fmt = Diagnostics.warning - (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) Diagnostics.Unnamed fmt + (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) kind fmt (* Simple character escapes *) @@ -160,6 +161,89 @@ let convert_escape = function | 't' -> 9L (* horizontal tab *) | 'v' -> 11L (* vertical tab *) | c -> Int64.of_int (Char.code c) + +(* Encodings for character and string literals *) + +let encoding_of = function + | "" -> Cabs.EncNone + | "L" -> Cabs.EncWide + | "u" -> Cabs.EncU16 + | "U" -> Cabs.EncU32 + | "u8" -> Cabs.EncUTF8 + | _ -> assert false + +let combine_encodings loc e1 e2 = + if e1 = Cabs.EncNone then e2 + else if e2 = Cabs.EncNone then e1 + else if e1 = e2 then e1 + else Diagnostics.fatal_error + Cabs.(loc.filename, loc.lineno) + "unsupported non-standard concatenation of string literals" + +(* Handling of characters and escapes in string and char constants *) + +type chr = Chr of int | Esc of int64 + +let check_utf8 lexbuf min x = + if x > 0x10FFFF || (x >= 0xD800 && x <= 0xDFFF) then + warning lexbuf Diagnostics.Invalid_UTF8 "Wrong Unicode value U+%X" x; + if x < min then + warning lexbuf Diagnostics.Invalid_UTF8 + "Overlong UTF-8 encoding for Unicode value U+%X" x; + Chr x + +let check_universal_character lexbuf x = + if x > 0x10FFFF + || x >= 0xD800 && x <= 0xDFFF + || x < 0xA0 && x <> 0x24 && x <> 0x40 && x <> 0x60 + then begin + error lexbuf "Wrong universal character name U+%X" x; Chr 0 + end else + Chr x + +let add_char_utf8 x accu = + if x <= 0x007F then + Int64.of_int x :: accu + else if x <= 0x07FF then + Int64.of_int (0x80 lor (x land 0x3F)) :: + Int64.of_int (0xC0 lor (x lsr 6)) :: + accu + else if x <= 0xFFFF then + Int64.of_int (0x80 lor (x land 0x3F)) :: + Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) :: + Int64.of_int (0xE0 lor (x lsr 12)) :: + accu + else + Int64.of_int (0x80 lor (x land 0x3F)) :: + Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) :: + Int64.of_int (0x80 lor ((x lsr 12) land 0x3F)) :: + Int64.of_int (0xF0 lor (x lsr 18)) :: + accu + +let add_char_utf16 x accu = + if x <= 0xFFFF then + Int64.of_int x :: accu + else begin + let x = x - 0x10000 in + Int64.of_int (0xDC00 lor (x land 0x3FF)) :: + Int64.of_int (0xD800 lor (x lsr 10)) :: + accu + end + +let add_char enc c accu = + match c, enc with + | Esc x, _ -> (* Escapes are never encoded *) + x :: accu + | Chr x, (Cabs.EncNone | Cabs.EncUTF8) -> (* Characters are encoded in UTF8 *) + add_char_utf8 x accu + | Chr x, Cabs.EncU16 -> (* Characters are encoded in UTF16 *) + add_char_utf16 x accu + | Chr x, Cabs.EncU32 -> (* Characters are not encoded *) + Int64.of_int x :: accu + | Chr x, Cabs.EncWide -> (* Depends on size of wchar_t *) + if Machine.(!config.sizeof_wchar) = 2 + then add_char_utf16 x accu + else Int64.of_int x :: accu } (* Identifiers *) @@ -249,11 +333,6 @@ let octal_escape_sequence = | octal_digit octal_digit | octal_digit octal_digit octal_digit) as n) let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n) -let escape_sequence = - simple_escape_sequence - | octal_escape_sequence - | hexadecimal_escape_sequence - | universal_character_name rule initial = parse | '\n' { new_line lexbuf; initial_linebegin lexbuf } @@ -283,16 +362,13 @@ rule initial = parse currentLoc lexbuf)} | preprocessing_number as s { error lexbuf "invalid numerical constant '%s'@ These characters form a preprocessor number, but not a constant" s; CONSTANT (Cabs.CONST_INT "0", currentLoc lexbuf) } - | "'" { let l = char_literal lexbuf.lex_start_p [] lexbuf in - CONSTANT (Cabs.CONST_CHAR(false, l), - currentLoc lexbuf) } - | "L'" { let l = char_literal lexbuf.lex_start_p [] lexbuf in - CONSTANT (Cabs.CONST_CHAR(true, l), - currentLoc lexbuf) } - | "\"" { let l = string_literal lexbuf.lex_start_p [] lexbuf in - STRING_LITERAL(false, l, currentLoc lexbuf) } - | "L\"" { let l = string_literal lexbuf.lex_start_p [] lexbuf in - STRING_LITERAL(true, l, currentLoc lexbuf) } + | (""|"L"|"u"|"U") as e "'" { let enc = encoding_of e in + let l = char_literal lexbuf.lex_start_p [] lexbuf in + CONSTANT (Cabs.CONST_CHAR(enc, l), currentLoc lexbuf) } + | (""|"L"|"u"|"U"|"u8") as e "\"" + { let enc = encoding_of e in + let l = string_literal lexbuf.lex_start_p enc [] lexbuf in + STRING_LITERAL(enc, l, currentLoc lexbuf) } | "..." { ELLIPSIS(currentLoc lexbuf) } | "+=" { ADD_ASSIGN(currentLoc lexbuf) } | "-=" { SUB_ASSIGN(currentLoc lexbuf) } @@ -357,39 +433,62 @@ and initial_linebegin = parse and char = parse | universal_character_name { try - Int64.of_string ("0x" ^ n) + check_universal_character lexbuf (int_of_string ("0x" ^ n)) with Failure _ -> error lexbuf "overflow in universal character name"; - 0L + Chr 0 } | hexadecimal_escape_sequence { try - Int64.of_string ("0x" ^ n) + Esc (Int64.of_string ("0x" ^ n)) with Failure _ -> error lexbuf "overflow in hexadecimal escape sequence"; - 0L + Esc 0L } | octal_escape_sequence - { Int64.of_string ("0o" ^ n) } + { Esc (Int64.of_string ("0o" ^ n)) } | simple_escape_sequence - { convert_escape c } + { Esc (convert_escape c) } + | "\\u" | "\\U" + { error lexbuf "incomplete universal character name"; + Chr 0 } | '\\' (_ as c) { error lexbuf "incorrect escape sequence '\\%c'" c; - Int64.of_int (Char.code c) } + Esc (Int64.of_int (Char.code c)) } + | ['\x00'-'\x7F'] as c1 + { Chr (Char.code c1) } + | (['\xC0'-'\xDF'] as c1) (['\x80'-'\xBF'] as c2) + { check_utf8 lexbuf 0x80 + ( (Char.code c1 land 0b00011111) lsl 6 + + (Char.code c2 land 0b00111111)) } + | (['\xE0'-'\xEF'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) + { check_utf8 lexbuf 0x800 + ( (Char.code c1 land 0b00001111) lsl 12 + + (Char.code c2 land 0b00111111) lsl 6 + + (Char.code c3 land 0b00111111) ) } + | (['\xF0'-'\xF7'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) (['\x80'-'\xBF'] as c4) + { check_utf8 lexbuf 0x10000 + ( (Char.code c1 land 0b00000111) lsl 18 + + (Char.code c2 land 0b00111111) lsl 12 + + (Char.code c3 land 0b00111111) lsl 6 + + (Char.code c4 land 0b00111111) ) } | _ as c - { Int64.of_int (Char.code c) } + { warning lexbuf Diagnostics.Invalid_UTF8 + "Invalid UTF8 encoding: byte 0x%02x" (Char.code c); + Esc (Int64.of_int (Char.code c)) (* re-encode as-is *) + } and char_literal startp accu = parse | '\'' { lexbuf.lex_start_p <- startp; List.rev accu } | '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" } - | "" { let c = char lexbuf in char_literal startp (c :: accu) lexbuf } + | "" { let c = char lexbuf in char_literal startp (add_char Cabs.EncU32 c accu) lexbuf } -and string_literal startp accu = parse +and string_literal startp enc accu = parse | '\"' { lexbuf.lex_start_p <- startp; List.rev accu } | '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" } - | "" { let c = char lexbuf in string_literal startp (c :: accu) lexbuf } + | "" { let c = char lexbuf in string_literal startp enc (add_char enc c accu) lexbuf } (* We assume gcc -E syntax but try to tolerate variations. *) and hash = parse @@ -402,7 +501,8 @@ and hash = parse try int_of_string n with Failure _ -> - warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum + warning lexbuf Diagnostics.Unnamed "invalid line number"; + lexbuf.lex_curr_p.pos_lnum in lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with @@ -417,7 +517,7 @@ and hash = parse ([^ '\n']* as s) '\n' { new_line lexbuf; PRAGMA (s, currentLoc lexbuf) } | [^ '\n']* '\n' - { warning lexbuf "unrecognized '#' line"; + { warning lexbuf Diagnostics.Unnamed "unrecognized '#' line"; new_line lexbuf; initial_linebegin lexbuf } | [^ '\n']* eof { fatal_error lexbuf "unexpected end of file" } @@ -583,21 +683,21 @@ and singleline_comment = parse | Pre_parser.STAR loc -> loop (Parser.STAR loc) | Pre_parser.STATIC loc -> loop (Parser.STATIC loc) | Pre_parser.STATIC_ASSERT loc -> loop (Parser.STATIC_ASSERT loc) - | Pre_parser.STRING_LITERAL (wide, str, loc) -> + | Pre_parser.STRING_LITERAL (enc, str, loc) -> (* Merge consecutive string literals *) - let rec doConcat wide str = + let rec doConcat enc str = match Queue.peek tokens with - | Pre_parser.STRING_LITERAL (wide', str', loc) -> + | Pre_parser.STRING_LITERAL (enc', str', loc') -> ignore (Queue.pop tokens); - let (wide'', str'') = doConcat wide' str' in + let (enc'', str'') = doConcat enc' str' in if str'' <> [] - then (wide || wide'', str @ str'') - else (wide, str) - | _ -> (wide, str) - | exception Queue.Empty -> (wide, str) + then (combine_encodings loc enc enc'', str @ str'') + else (enc, str) + | _ -> (enc, str) + | exception Queue.Empty -> (enc, str) in - let (wide', str') = doConcat wide str in - loop (Parser.STRING_LITERAL ((wide', str'), loc)) + let (enc', str') = doConcat enc str in + loop (Parser.STRING_LITERAL ((enc', str'), loc)) | Pre_parser.STRUCT loc -> loop (Parser.STRUCT loc) | Pre_parser.SUB_ASSIGN loc -> loop (Parser.SUB_ASSIGN loc) | Pre_parser.SWITCH loc -> loop (Parser.SWITCH loc) diff --git a/cparser/Parser.vy b/cparser/Parser.vy index 8d7cd055..12337367 100644 --- a/cparser/Parser.vy +++ b/cparser/Parser.vy @@ -23,7 +23,7 @@ Require Cabs. %token VAR_NAME TYPEDEF_NAME OTHER_NAME %token PRAGMA -%token STRING_LITERAL +%token STRING_LITERAL %token CONSTANT %token SIZEOF PTR INC DEC LEFT RIGHT LEQ GEQ EQEQ EQ NEQ LT GT ANDAND BARBAR PLUS MINUS STAR TILDE BANG SLASH PERCENT HAT BAR QUESTION @@ -124,8 +124,8 @@ primary_expression: | cst = CONSTANT { (Cabs.CONSTANT (fst cst), snd cst) } | str = STRING_LITERAL - { let '((wide, chars), loc) := str in - (Cabs.CONSTANT (Cabs.CONST_STRING wide chars), loc) } + { let '((enc, chars), loc) := str in + (Cabs.CONSTANT (Cabs.CONST_STRING enc chars), loc) } | loc = LPAREN expr = expression RPAREN { (fst expr, loc)} | sel = generic_selection @@ -786,8 +786,8 @@ designator: static_assert_declaration: | loc = STATIC_ASSERT LPAREN expr = constant_expression COMMA str = STRING_LITERAL RPAREN SEMICOLON - { let '((wide, chars), locs) := str in - (expr, (Cabs.CONST_STRING wide chars, locs), loc) } + { let '((enc, chars), locs) := str in + (expr, (Cabs.CONST_STRING enc chars, locs), loc) } (* 6.8 *) statement_dangerous: @@ -922,9 +922,9 @@ jump_statement: asm_statement: | loc = ASM attr = asm_attributes LPAREN template = STRING_LITERAL args = asm_arguments RPAREN SEMICOLON - { let '(wide, chars, _) := template in + { let '(enc, chars, _) := template in let '(outputs, inputs, flags) := args in - Cabs.ASM attr wide chars outputs inputs flags loc } + Cabs.ASM attr enc chars outputs inputs flags loc } asm_attributes: | /* empty */ @@ -954,7 +954,7 @@ asm_operands_ne: asm_operand: | n = asm_op_name cstr = STRING_LITERAL LPAREN e = expression RPAREN - { let '(wide, s, loc) := cstr in Cabs.ASMOPERAND n wide s (fst e) } + { let '(enc, s, loc) := cstr in Cabs.ASMOPERAND n enc s (fst e) } asm_op_name: | /* empty */ { None } @@ -962,9 +962,9 @@ asm_op_name: asm_flags: | f = STRING_LITERAL - { let '(wide, s, loc) := f in (wide, s) :: nil } + { let '(enc, s, loc) := f in (enc, s) :: nil } | f = STRING_LITERAL COMMA fl = asm_flags - { let '(wide, s, loc) := f in (wide, s) :: fl } + { let '(enc, s, loc) := f in (enc, s) :: fl } (* 6.9 *) translation_unit_file: diff --git a/cparser/pre_parser.mly b/cparser/pre_parser.mly index ad294398..00ca0ade 100644 --- a/cparser/pre_parser.mly +++ b/cparser/pre_parser.mly @@ -47,7 +47,7 @@ %token VAR_NAME TYPEDEF_NAME %token CONSTANT -%token STRING_LITERAL +%token STRING_LITERAL %token PRAGMA %token SIZEOF PTR INC DEC LEFT RIGHT LEQ GEQ EQEQ EQ NEQ LT GT diff --git a/driver/Frontend.ml b/driver/Frontend.ml index 6133291e..4ee5c712 100644 --- a/driver/Frontend.ml +++ b/driver/Frontend.ml @@ -154,7 +154,8 @@ let gnu_prepro_actions = [ Exact "-iquote", String (gnu_prepro_opt_key "-iquote"); Exact "-P", Self gnu_prepro_opt; Exact "-C", Self gnu_prepro_opt; - Exact "-CC", Self gnu_prepro_opt;] + Exact "-CC", Self gnu_prepro_opt; + Prefix "-finput-charset=", Self gnu_prepro_opt] let prepro_actions = [ (* Preprocessing options *) @@ -198,6 +199,8 @@ let gnu_prepro_help = -C Do not discard comments -CC Do not discard comments, including during macro expansion + -finput-charset= + Set the input character set, used for reading source files. |} let prepro_help = {|Preprocessing options: diff --git a/test/regression/Makefile b/test/regression/Makefile index daee05bc..53719900 100644 --- a/test/regression/Makefile +++ b/test/regression/Makefile @@ -16,7 +16,7 @@ TESTS=int32 int64 floats floats-basics floats-lit \ funct3 expr5 struct7 struct8 struct11 struct12 casts1 casts2 char1 \ sizeof1 sizeof2 binops bool for1 for2 switch switch2 compound \ decl1 bitfields9 ptrs3 \ - parsing krfun ifconv generic + parsing krfun ifconv generic stringlit charlit # Can run, but only in compiled mode, and have reference output in Results diff --git a/test/regression/Results/charlit b/test/regression/Results/charlit new file mode 100644 index 00000000..3954a0c8 --- /dev/null +++ b/test/regression/Results/charlit @@ -0,0 +1,27 @@ +c1: 61 +c2: 61 +c3: 61 +c4: 61 +d1: fe +d2: fe +d3: fe +d4: fe +e1: 34 +e2: 1234 +e3: 1234 +e4: 1234 +f1: e9 +f2: e9 +f3: e9 +f4: e9 +g1: 2b +g2: 732b +g3: 732b +g4: 732b +h1: 4c +h2: f34c +h3: 1f34c +h4: 1f34c +m1: 6162 +m2: 1020304 +m3: e9e8 diff --git a/test/regression/Results/stringlit b/test/regression/Results/stringlit new file mode 100644 index 00000000..d6967ccc --- /dev/null +++ b/test/regression/Results/stringlit @@ -0,0 +1,15 @@ +s1: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +s2: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +s3: size 12, contents 61 e9 732b d83c df4c +s4: size 20, contents 61 e9 732b 1f34c +s5: size 20, contents 61 e9 732b 1f34c +t1: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +t2: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c +t3: size 12, contents 61 e9 732b d83c df4c +t4: size 20, contents 61 e9 732b 1f34c +t5: size 20, contents 61 e9 732b 1f34c +e1: size 4, contents 61 e9 e8 +e2: size 4, contents 61 e9 e8 +e3: size 10, contents 61 e9 e8 732b +e4: size 24, contents 61 e9 e8 732b 1f34c +e5: size 24, contents 61 e9 e8 732b 1f34c diff --git a/test/regression/charlit.c b/test/regression/charlit.c new file mode 100644 index 00000000..5a7e0916 --- /dev/null +++ b/test/regression/charlit.c @@ -0,0 +1,50 @@ +#include +#include +#include + +unsigned char c1 = 'a'; +char16_t c2 = u'a';; +char32_t c3 = U'a';; +wchar_t c4 = L'a';; + +unsigned char d1 = '\xFE'; +char16_t d2 = u'\xFE';; +char32_t d3 = U'\xFE';; +wchar_t d4 = L'\xFE';; + +unsigned char e1 = '\x1234'; // warning but no error +char16_t e2 = u'\x1234'; +char32_t e3 = U'\x1234'; +wchar_t e4 = L'\x1234'; + +unsigned char f1 = 'é'; // CompCert tolerance +char16_t f2 = u'é'; +char32_t f3 = U'é'; +wchar_t f4 = L'é'; + +unsigned char g1 = '猫'; // CompCert tolerance + warning +char16_t g2 = u'猫'; +char32_t g3 = U'猫'; +wchar_t g4 = L'猫'; + +unsigned char h1 = '🍌'; // CompCert tolerance + warning +char16_t h2 = u'🍌'; // CompCert tolerance + warning +char32_t h3 = U'🍌'; +wchar_t h4 = L'🍌'; + +int m1 = 'ab'; +int m2 = '\x01\x02\x03\x04'; +int m3 = 'éè'; // CompCert tolerance + +#define PRINT(x) printf("%s: %x\n", #x, x) + +int main() +{ + PRINT(c1); PRINT(c2); PRINT(c3); PRINT(c4); + PRINT(d1); PRINT(d2); PRINT(d3); PRINT(d4); + PRINT(e1); PRINT(e2); PRINT(e3); PRINT(e4); + PRINT(f1); PRINT(f2); PRINT(f3); PRINT(f4); + PRINT(g1); PRINT(g2); PRINT(g3); PRINT(g4); + PRINT(h1); PRINT(h2); PRINT(h3); PRINT(h4); + PRINT(m1); PRINT(m2); PRINT(m3); +} diff --git a/test/regression/stringlit.c b/test/regression/stringlit.c new file mode 100644 index 00000000..155f8ebb --- /dev/null +++ b/test/regression/stringlit.c @@ -0,0 +1,41 @@ +#include +#include +#include + +/* Without escapes nor universal character names */ + +unsigned char s1[] = "aé猫🍌"; +unsigned char s2[] = u8"aé猫🍌"; +char16_t s3[] = u"aé猫🍌"; +char32_t s4[] = U"aé猫🍌"; +wchar_t s5[] = L"aé猫🍌"; + +/* With universal character names */ + +unsigned char t1[] = "a\u00e9\u732B\U0001F34C"; +unsigned char t2[] = u8"a\u00e9\u732B\U0001F34C"; +char16_t t3[] = u"a\u00e9\u732B\U0001F34C"; +char32_t t4[] = U"a\u00e9\u732B\U0001F34C"; +wchar_t t5[] = L"a\u00e9\u732B\U0001F34C"; + +/* With numerical escapes */ + +unsigned char e1[] = "a\xe9\350"; +unsigned char e2[] = u8"a\xe9\350"; +char16_t e3[] = u"a\xe9\350\x732B"; +char32_t e4[] = U"a\xe9\350\x732B\x0001F34C"; +wchar_t e5[] = L"a\xe9\350\x732B\x0001F34C"; + +#define PRINT(x) \ + printf("%s: size %u, contents", #x, (int) sizeof(x)); \ + for (int i = 0; x[i] != 0; i++) printf(" %x", x[i]); \ + printf("\n") + +int main() +{ + PRINT(s1); PRINT(s2); PRINT(s3); PRINT(s4); PRINT(s5); + PRINT(t1); PRINT(t2); PRINT(t3); PRINT(t4); PRINT(t5); + PRINT(e1); PRINT(e2); PRINT(e3); PRINT(e4); PRINT(e5); + return 0; +} + -- cgit