aboutsummaryrefslogtreecommitdiffstats
path: root/cparser/Elab.ml
diff options
context:
space:
mode:
authorXavier Leroy <xavierleroy@users.noreply.github.com>2022-09-19 16:37:17 +0200
committerGitHub <noreply@github.com>2022-09-19 16:37:17 +0200
commit994c6c34182606385140e5695e33c90507ce59ee (patch)
treee9291d64997dd3e2ac8660c0e6fbe1b9a597799e /cparser/Elab.ml
parent103aa7074a9dd3b1bcb2864d52c89292a2ab7bff (diff)
downloadcompcert-994c6c34182606385140e5695e33c90507ce59ee.tar.gz
compcert-994c6c34182606385140e5695e33c90507ce59ee.zip
Support C11 Unicode string literals and character constants (#452)
* Support C11 Unicode string literals and character constants * Add tests for C11 string literals and character constants * Better error message for ill-formed universal character names E.g. \u followed by fewer than 4 hex digits, or \U followed by fewer than 8 hex digits. * Add new warning `invalid-utf8` for byte sequences that are not valid UTF8. The warning is activated but not fatal by default. * Warn on uses of C11 Unicode character constants and string literals This uses the `c11-extensions` warning, which is off by default. * Support preprocessing option -finput-charset= for GNU toolchains
Diffstat (limited to 'cparser/Elab.ml')
-rw-r--r--cparser/Elab.ml128
1 files changed, 82 insertions, 46 deletions
diff --git a/cparser/Elab.ml b/cparser/Elab.ml
index 01e745c3..ec79634a 100644
--- a/cparser/Elab.ml
+++ b/cparser/Elab.ml
@@ -395,48 +395,75 @@ let elab_float_constant f =
in
(v, ty)
-let elab_char_constant loc wide chars =
+let check_char_range loc ikind chars =
+ let max = Int64.shift_left 1L (sizeof_ikind ikind * 8) in
+ List.iter
+ (fun c ->
+ if c >= max then error loc "escape sequence 0x%LX is out of range" c)
+ chars
+
+let ikind_of_encoding = function
+ | EncNone -> IChar
+ | EncWide -> wchar_ikind()
+ | EncU16 -> IUShort
+ | EncU32 -> IUInt
+ | EncUTF8 -> IChar
+
+let elab_char_constant loc enc chars =
let len = List.length chars in
- let nbits = if wide then 8 * !config.sizeof_wchar else 8 in
- let max_digit = Int64.shift_left 1L nbits in
- (* Treat multi-character constants as a number in base 2^nbits.
- It must fit in type int for a normal constant and in type wchar_t
- for a wide constant. *)
+ (* We support multi-character constants for EncNone character literals only.
+ We treat them as big-endian numbers in base 256. *)
let v =
- if len > (if wide then 1 else !config.sizeof_int) then begin
- error loc "%d-character constant too long for its type" len;
- 0L
- end else
- List.fold_left
- (fun acc d ->
- if d < 0L || d >= max_digit then
- error loc "escape sequence is out of range (code 0x%LX)" d;
- Int64.add (Int64.shift_left acc nbits) d)
- 0L chars in
- (* C99 6.4.4.4 items 10 and 11:
- single-character constant -> represent at type char
+ match chars, enc with
+ | [], _ -> error loc "empty character constant"; 0L
+ | [c], _ -> c
+ | _, EncNone ->
+ if len > !config.sizeof_int then begin
+ error loc "%d-character constant too long, overflows its type" len;
+ 0L
+ end else begin
+ check_char_range loc IUChar chars;
+ List.fold_left
+ (fun acc d -> Int64.(add (shift_left acc 8) d))
+ 0L chars
+ end
+ | _, _ ->
+ error loc "%d-character constant not supported" len; 0L in
+ (* C11 6.4.4.4 items 10 and 11:
+ normal single-character constant -> represent at type char
multi-character constant -> represent at type int
- wide character constant -> represent at type wchar_t *)
- Ceval.normalize_int v
- (if wide then wchar_ikind() else if len = 1 then IChar else IInt)
-
-let elab_string_literal loc wide chars =
- let nbits = if wide then 8 * !config.sizeof_wchar else 8 in
- let char_max = Int64.shift_left 1L nbits in
- List.iter
- (fun c ->
- if c < 0L || c >= char_max
- then error loc "escape sequence is out of range (code 0x%LX)" c)
- chars;
- if wide then
- CWStr chars
- else begin
- let res = Bytes.create (List.length chars) in
- List.iteri
- (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c)))
- chars;
- CStr (Bytes.to_string res)
- end
+ L character constant -> represent at type wchar_t
+ u character constant -> represent at type char16_t
+ U character constant -> represent at type char32_t *)
+ let ik =
+ if enc = EncNone && len > 1 then IInt else ikind_of_encoding enc in
+ let v' = Ceval.normalize_int v ik in
+ if v' <> v then
+ warning loc Constant_conversion
+ "overflow in character constant, changes value from %Ld to %Ld" v v';
+ v'
+
+let elab_string_literal loc enc chars =
+ let ik = ikind_of_encoding enc in
+ check_char_range loc ik chars;
+ match enc with
+ | EncNone | EncUTF8 ->
+ let res = Bytes.create (List.length chars) in
+ List.iteri
+ (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c)))
+ chars;
+ CStr (Bytes.to_string res)
+ | EncWide | EncU16 | EncU32 ->
+ CWStr(chars, ik)
+
+let warn_C11_literals loc enc kind =
+ let warn enc =
+ warning loc Celeven_extension "'%s' %s are a C11 extension" enc kind in
+ match enc with
+ | EncNone | EncWide -> ()
+ | EncUTF8 -> warn "u8"
+ | EncU16 -> warn "u"
+ | EncU32 -> warn "U"
let elab_constant loc = function
| CONST_INT s ->
@@ -445,14 +472,22 @@ let elab_constant loc = function
| CONST_FLOAT f ->
let (v, fk) = elab_float_constant f in
CFloat(v, fk)
- | CONST_CHAR(wide, s) ->
- let ikind = if wide then wchar_ikind () else IInt in
- CInt(elab_char_constant loc wide s, ikind, "")
+ | CONST_CHAR(enc, s) ->
+ warn_C11_literals loc enc "character constants";
+ let ikind =
+ match enc with
+ | EncNone -> IInt
+ | EncWide -> wchar_ikind ()
+ | EncU16 -> IUShort
+ | EncU32 -> IUInt
+ | EncUTF8 -> assert false in
+ CInt(elab_char_constant loc enc s, ikind, "")
| CONST_STRING(wide, s) ->
+ warn_C11_literals loc wide "string literals";
elab_string_literal loc wide s
-let elab_simple_string loc wide chars =
- match elab_string_literal loc wide chars with
+let elab_simple_string loc enc chars =
+ match elab_string_literal loc enc chars with
| CStr s -> s
| _ -> error loc "cannot use wide string literal in 'asm'"; ""
@@ -1586,6 +1621,7 @@ and elab_item zi item il =
| COMPOUND_INIT [_, SINGLE_INIT(CONSTANT (CONST_STRING(w, s)))]),
TArray(ty_elt, sz, _)
when is_integer_type env ty_elt ->
+ warn_C11_literals loc w "string literals";
begin match elab_string_literal loc w s, unroll env ty_elt with
| CStr s, TInt((IChar | ISChar | IUChar), _) ->
if not (I.index_below (Int64.of_int(String.length s - 1)) sz) then
@@ -1594,12 +1630,12 @@ and elab_item zi item il =
| CStr _, _ ->
error loc "initialization of an array of non-char elements with a string literal";
elab_list zi il false
- | CWStr s, TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(wchar_ikind(), [])) ->
+ | CWStr(s, ik), TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(ik, [])) ->
if not (I.index_below (Int64.of_int(List.length s - 1)) sz) then
warning loc Unnamed "initializer string for array of wide chars %s is too long" (I.name zi);
elab_list (I.set zi (init_int_array_wstring sz s)) il false
| CWStr _, _ ->
- error loc "initialization of an array of non-wchar_t elements with a wide string literal";
+ error loc "type mismatch between array destination and wide string literal";
elab_list zi il false
| _ -> assert false
end