diff options
author | Xavier Leroy <xavierleroy@users.noreply.github.com> | 2022-09-19 16:37:17 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-19 16:37:17 +0200 |
commit | 994c6c34182606385140e5695e33c90507ce59ee (patch) | |
tree | e9291d64997dd3e2ac8660c0e6fbe1b9a597799e /cparser/Elab.ml | |
parent | 103aa7074a9dd3b1bcb2864d52c89292a2ab7bff (diff) | |
download | compcert-994c6c34182606385140e5695e33c90507ce59ee.tar.gz compcert-994c6c34182606385140e5695e33c90507ce59ee.zip |
Support C11 Unicode string literals and character constants (#452)
* Support C11 Unicode string literals and character constants
* Add tests for C11 string literals and character constants
* Better error message for ill-formed universal character names
E.g. \u followed by fewer than 4 hex digits, or \U followed by fewer than 8 hex digits.
* Add new warning `invalid-utf8` for byte sequences that are not valid UTF8.
The warning is activated but not fatal by default.
* Warn on uses of C11 Unicode character constants and string literals
This uses the `c11-extensions` warning, which is off by default.
* Support preprocessing option -finput-charset= for GNU toolchains
Diffstat (limited to 'cparser/Elab.ml')
-rw-r--r-- | cparser/Elab.ml | 128 |
1 files changed, 82 insertions, 46 deletions
diff --git a/cparser/Elab.ml b/cparser/Elab.ml index 01e745c3..ec79634a 100644 --- a/cparser/Elab.ml +++ b/cparser/Elab.ml @@ -395,48 +395,75 @@ let elab_float_constant f = in (v, ty) -let elab_char_constant loc wide chars = +let check_char_range loc ikind chars = + let max = Int64.shift_left 1L (sizeof_ikind ikind * 8) in + List.iter + (fun c -> + if c >= max then error loc "escape sequence 0x%LX is out of range" c) + chars + +let ikind_of_encoding = function + | EncNone -> IChar + | EncWide -> wchar_ikind() + | EncU16 -> IUShort + | EncU32 -> IUInt + | EncUTF8 -> IChar + +let elab_char_constant loc enc chars = let len = List.length chars in - let nbits = if wide then 8 * !config.sizeof_wchar else 8 in - let max_digit = Int64.shift_left 1L nbits in - (* Treat multi-character constants as a number in base 2^nbits. - It must fit in type int for a normal constant and in type wchar_t - for a wide constant. *) + (* We support multi-character constants for EncNone character literals only. + We treat them as big-endian numbers in base 256. *) let v = - if len > (if wide then 1 else !config.sizeof_int) then begin - error loc "%d-character constant too long for its type" len; - 0L - end else - List.fold_left - (fun acc d -> - if d < 0L || d >= max_digit then - error loc "escape sequence is out of range (code 0x%LX)" d; - Int64.add (Int64.shift_left acc nbits) d) - 0L chars in - (* C99 6.4.4.4 items 10 and 11: - single-character constant -> represent at type char + match chars, enc with + | [], _ -> error loc "empty character constant"; 0L + | [c], _ -> c + | _, EncNone -> + if len > !config.sizeof_int then begin + error loc "%d-character constant too long, overflows its type" len; + 0L + end else begin + check_char_range loc IUChar chars; + List.fold_left + (fun acc d -> Int64.(add (shift_left acc 8) d)) + 0L chars + end + | _, _ -> + error loc "%d-character constant not supported" len; 0L in + (* C11 6.4.4.4 items 10 and 11: + normal single-character constant -> represent at type char multi-character constant -> represent at type int - wide character constant -> represent at type wchar_t *) - Ceval.normalize_int v - (if wide then wchar_ikind() else if len = 1 then IChar else IInt) - -let elab_string_literal loc wide chars = - let nbits = if wide then 8 * !config.sizeof_wchar else 8 in - let char_max = Int64.shift_left 1L nbits in - List.iter - (fun c -> - if c < 0L || c >= char_max - then error loc "escape sequence is out of range (code 0x%LX)" c) - chars; - if wide then - CWStr chars - else begin - let res = Bytes.create (List.length chars) in - List.iteri - (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c))) - chars; - CStr (Bytes.to_string res) - end + L character constant -> represent at type wchar_t + u character constant -> represent at type char16_t + U character constant -> represent at type char32_t *) + let ik = + if enc = EncNone && len > 1 then IInt else ikind_of_encoding enc in + let v' = Ceval.normalize_int v ik in + if v' <> v then + warning loc Constant_conversion + "overflow in character constant, changes value from %Ld to %Ld" v v'; + v' + +let elab_string_literal loc enc chars = + let ik = ikind_of_encoding enc in + check_char_range loc ik chars; + match enc with + | EncNone | EncUTF8 -> + let res = Bytes.create (List.length chars) in + List.iteri + (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c))) + chars; + CStr (Bytes.to_string res) + | EncWide | EncU16 | EncU32 -> + CWStr(chars, ik) + +let warn_C11_literals loc enc kind = + let warn enc = + warning loc Celeven_extension "'%s' %s are a C11 extension" enc kind in + match enc with + | EncNone | EncWide -> () + | EncUTF8 -> warn "u8" + | EncU16 -> warn "u" + | EncU32 -> warn "U" let elab_constant loc = function | CONST_INT s -> @@ -445,14 +472,22 @@ let elab_constant loc = function | CONST_FLOAT f -> let (v, fk) = elab_float_constant f in CFloat(v, fk) - | CONST_CHAR(wide, s) -> - let ikind = if wide then wchar_ikind () else IInt in - CInt(elab_char_constant loc wide s, ikind, "") + | CONST_CHAR(enc, s) -> + warn_C11_literals loc enc "character constants"; + let ikind = + match enc with + | EncNone -> IInt + | EncWide -> wchar_ikind () + | EncU16 -> IUShort + | EncU32 -> IUInt + | EncUTF8 -> assert false in + CInt(elab_char_constant loc enc s, ikind, "") | CONST_STRING(wide, s) -> + warn_C11_literals loc wide "string literals"; elab_string_literal loc wide s -let elab_simple_string loc wide chars = - match elab_string_literal loc wide chars with +let elab_simple_string loc enc chars = + match elab_string_literal loc enc chars with | CStr s -> s | _ -> error loc "cannot use wide string literal in 'asm'"; "" @@ -1586,6 +1621,7 @@ and elab_item zi item il = | COMPOUND_INIT [_, SINGLE_INIT(CONSTANT (CONST_STRING(w, s)))]), TArray(ty_elt, sz, _) when is_integer_type env ty_elt -> + warn_C11_literals loc w "string literals"; begin match elab_string_literal loc w s, unroll env ty_elt with | CStr s, TInt((IChar | ISChar | IUChar), _) -> if not (I.index_below (Int64.of_int(String.length s - 1)) sz) then @@ -1594,12 +1630,12 @@ and elab_item zi item il = | CStr _, _ -> error loc "initialization of an array of non-char elements with a string literal"; elab_list zi il false - | CWStr s, TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(wchar_ikind(), [])) -> + | CWStr(s, ik), TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(ik, [])) -> if not (I.index_below (Int64.of_int(List.length s - 1)) sz) then warning loc Unnamed "initializer string for array of wide chars %s is too long" (I.name zi); elab_list (I.set zi (init_int_array_wstring sz s)) il false | CWStr _, _ -> - error loc "initialization of an array of non-wchar_t elements with a wide string literal"; + error loc "type mismatch between array destination and wide string literal"; elab_list zi il false | _ -> assert false end |