From 994c6c34182606385140e5695e33c90507ce59ee Mon Sep 17 00:00:00 2001
From: Xavier Leroy <xavierleroy@users.noreply.github.com>
Date: Mon, 19 Sep 2022 16:37:17 +0200
Subject: Support C11 Unicode string literals and character constants (#452)

* Support C11 Unicode string literals and character constants

* Add tests for C11 string literals and character constants

* Better error message for ill-formed universal character names
E.g. \u followed by fewer than 4 hex digits, or \U followed by fewer than 8 hex digits.

* Add new warning `invalid-utf8` for byte sequences that are not valid UTF8.
The warning is activated but not fatal by default.

* Warn on uses of C11 Unicode character constants and string literals
This uses the `c11-extensions` warning, which is off by default.

* Support preprocessing option -finput-charset= for GNU toolchains
---
 cfrontend/C2C.ml                  | 197 ++++++++++++++++++--------------------
 cparser/C.mli                     |   2 +-
 cparser/Cabs.v                    |  17 +++-
 cparser/Ceval.ml                  |   6 +-
 cparser/Cprint.ml                 |   2 +-
 cparser/Cutil.ml                  |   4 +-
 cparser/Diagnostics.ml            |   3 +
 cparser/Diagnostics.mli           |   1 +
 cparser/Elab.ml                   | 128 ++++++++++++++++---------
 cparser/Elab.mli                  |   2 +-
 cparser/Lexer.mll                 | 180 ++++++++++++++++++++++++++--------
 cparser/Parser.vy                 |  20 ++--
 cparser/pre_parser.mly            |   2 +-
 driver/Frontend.ml                |   5 +-
 test/regression/Makefile          |   2 +-
 test/regression/Results/charlit   |  27 ++++++
 test/regression/Results/stringlit |  15 +++
 test/regression/charlit.c         |  50 ++++++++++
 test/regression/stringlit.c       |  41 ++++++++
 19 files changed, 490 insertions(+), 214 deletions(-)
 create mode 100644 test/regression/Results/charlit
 create mode 100644 test/regression/Results/stringlit
 create mode 100644 test/regression/charlit.c
 create mode 100644 test/regression/stringlit.c

diff --git a/cfrontend/C2C.ml b/cfrontend/C2C.ml
index b0dc8e8a..89b9139c 100644
--- a/cfrontend/C2C.ml
+++ b/cfrontend/C2C.ml
@@ -287,105 +287,6 @@ let attributes = [
 ]
 
 
-(** ** Functions used to handle string literals *)
-
-let stringNum = ref 0   (* number of next global for string literals *)
-let stringTable : (string, AST.ident) Hashtbl.t = Hashtbl.create 47
-let wstringTable : (int64 list, AST.ident) Hashtbl.t = Hashtbl.create 47
-
-let is_C_string s = not (String.contains s '\000')
-
-let name_for_string_literal s =
-  try
-    Hashtbl.find stringTable s
-  with Not_found ->
-    incr stringNum;
-    let name = Printf.sprintf "__stringlit_%d" !stringNum in
-    let id = intern_string name in
-    let mergeable = if is_C_string s then 1 else 0 in
-    Hashtbl.add decl_atom id
-      { a_storage = C.Storage_static;
-        a_alignment = Some 1;
-        a_size = Some (Int64.of_int (String.length s + 1));
-        a_sections = [Sections.for_stringlit mergeable];
-        a_access = Sections.Access_default;
-        a_inline = No_specifier;
-        a_loc = Cutil.no_loc };
-    Hashtbl.add stringTable s id;
-    id
-
-let typeStringLiteral s =
-  let sg = if Machine.((!config).char_signed) then Signed else Unsigned in
-  Tarray(Tint(I8, sg, noattr), Z.of_uint (String.length s + 1), noattr)
-
-let global_for_string s id =
-  let init = ref [] in
-  let add_char c =
-    init := AST.Init_int8(Z.of_uint(Char.code c)) :: !init in
-  add_char '\000';
-  for i = String.length s - 1 downto 0 do add_char s.[i] done;
-  AST.(id, Gvar { gvar_info = typeStringLiteral s;  gvar_init = !init;
-                  gvar_readonly = true;  gvar_volatile = false})
-
-let is_C_wide_string s = not (List.mem 0L s)
-
-let name_for_wide_string_literal s =
-  try
-    Hashtbl.find wstringTable s
-  with Not_found ->
-    incr stringNum;
-    let name = Printf.sprintf "__stringlit_%d" !stringNum in
-    let id = intern_string name in
-    let wchar_size = Machine.((!config).sizeof_wchar) in
-    let mergeable = if is_C_wide_string s then wchar_size else 0 in
-    Hashtbl.add decl_atom id
-      { a_storage = C.Storage_static;
-        a_alignment = Some wchar_size;
-        a_size = Some (Int64.(mul (of_int (List.length s + 1))
-                                  (of_int wchar_size)));
-        a_sections = [Sections.for_stringlit mergeable];
-        a_access = Sections.Access_default;
-        a_inline = No_specifier;
-        a_loc = Cutil.no_loc };
-    Hashtbl.add wstringTable s id;
-    id
-
-let typeWideStringLiteral s =
-  let sz =
-    match Machine.((!config).sizeof_wchar) with
-    | 2 -> I16
-    | 4 -> I32
-    | _ -> assert false in
-  let sg =
-    if Machine.((!config).wchar_signed) then Signed else Unsigned in
-  Tarray(Tint(sz, sg, noattr), Z.of_uint (List.length s + 1), noattr)
-
-let global_for_wide_string s id =
-  let init = ref [] in
-  let init_of_char =
-    match Machine.((!config).sizeof_wchar) with
-    | 2 -> (fun z -> AST.Init_int16 z)
-    | 4 -> (fun z -> AST.Init_int32 z)
-    | _ -> assert false in
-  let add_char c =
-    init := init_of_char(Z.of_uint64 c) :: !init in
-  List.iter add_char s;
-  add_char 0L;
-  AST.(id, Gvar { gvar_info = typeWideStringLiteral s;
-                  gvar_init = List.rev !init;
-                  gvar_readonly = true; gvar_volatile = false})
-
-let globals_for_strings globs =
-  let globs1 =
-    Hashtbl.fold
-      (fun s id l -> global_for_wide_string s id :: l)
-      wstringTable globs in
-  let globs2 =
-    Hashtbl.fold
-      (fun s id l -> global_for_string s id :: l)
-      stringTable globs1 in
-  globs2
-
 (** ** Handling of inlined memcpy functions *)
 
 let constant_size_t a =
@@ -667,6 +568,98 @@ let is_int64 env ty =
   | C.TEnum(_, _) -> false
   | _ -> assert false
 
+(** String literals *)
+
+let stringNum = ref 0   (* number of next global for string literals *)
+let stringTable : (string, AST.ident) Hashtbl.t = Hashtbl.create 47
+let wstringTable : (int64 list * ikind, AST.ident) Hashtbl.t = Hashtbl.create 47
+
+let is_C_string s = not (String.contains s '\000')
+
+let name_for_string_literal s =
+  try
+    Hashtbl.find stringTable s
+  with Not_found ->
+    incr stringNum;
+    let name = Printf.sprintf "__stringlit_%d" !stringNum in
+    let id = intern_string name in
+    let mergeable = if is_C_string s then 1 else 0 in
+    Hashtbl.add decl_atom id
+      { a_storage = C.Storage_static;
+        a_alignment = Some 1;
+        a_size = Some (Int64.of_int (String.length s + 1));
+        a_sections = [Sections.for_stringlit mergeable];
+        a_access = Sections.Access_default;
+        a_inline = No_specifier;
+        a_loc = Cutil.no_loc };
+    Hashtbl.add stringTable s id;
+    id
+
+let typeStringLiteral s =
+  let sg = if Machine.((!config).char_signed) then Signed else Unsigned in
+  Tarray(Tint(I8, sg, noattr), Z.of_uint (String.length s + 1), noattr)
+
+let global_for_string s id =
+  let init = ref [] in
+  let add_char c =
+    init := AST.Init_int8(Z.of_uint(Char.code c)) :: !init in
+  add_char '\000';
+  for i = String.length s - 1 downto 0 do add_char s.[i] done;
+  AST.(id, Gvar { gvar_info = typeStringLiteral s;  gvar_init = !init;
+                  gvar_readonly = true;  gvar_volatile = false})
+
+let is_C_wide_string s = not (List.mem 0L s)
+
+let name_for_wide_string_literal s ik =
+  try
+    Hashtbl.find wstringTable (s, ik)
+  with Not_found ->
+    incr stringNum;
+    let name = Printf.sprintf "__stringlit_%d" !stringNum in
+    let id = intern_string name in
+    let wchar_size = Cutil.sizeof_ikind ik in
+    let mergeable = if is_C_wide_string s then wchar_size else 0 in
+    Hashtbl.add decl_atom id
+      { a_storage = C.Storage_static;
+        a_alignment = Some wchar_size;
+        a_size = Some (Int64.(mul (of_int (List.length s + 1))
+                                  (of_int wchar_size)));
+        a_sections = [Sections.for_stringlit mergeable];
+        a_access = Sections.Access_default;
+        a_inline = No_specifier;
+        a_loc = Cutil.no_loc };
+    Hashtbl.add wstringTable (s, ik) id;
+    id
+
+let typeWideStringLiteral s ik =
+  Tarray(convertIkind ik noattr, Z.of_uint (List.length s + 1), noattr)
+
+let global_for_wide_string (s, ik) id =
+  let init = ref [] in
+  let init_of_char =
+    match Cutil.sizeof_ikind ik with
+    | 2 -> (fun z -> AST.Init_int16 z)
+    | 4 -> (fun z -> AST.Init_int32 z)
+    | _ -> assert false in
+  let add_char c =
+    init := init_of_char(Z.of_uint64 c) :: !init in
+  List.iter add_char s;
+  add_char 0L;
+  AST.(id, Gvar { gvar_info = typeWideStringLiteral s ik;
+                  gvar_init = List.rev !init;
+                  gvar_readonly = true; gvar_volatile = false})
+
+let globals_for_strings globs =
+  let globs1 =
+    Hashtbl.fold
+      (fun s id l -> global_for_wide_string s id :: l)
+      wstringTable globs in
+  let globs2 =
+    Hashtbl.fold
+      (fun s id l -> global_for_string s id :: l)
+      stringTable globs1 in
+  globs2
+
 (** Floating point constants *)
 
 let z_of_str hex str fst =
@@ -1000,9 +993,9 @@ and convertLvalue env e =
   | C.EConst(C.CStr s) ->
       let ty = typeStringLiteral s in
       Evar(name_for_string_literal s, ty)
-  | C.EConst(C.CWStr s) ->
-      let ty = typeWideStringLiteral s in
-      Evar(name_for_wide_string_literal s, ty)
+  | C.EConst(C.CWStr(s, ik)) ->
+      let ty = typeWideStringLiteral s ik in
+      Evar(name_for_wide_string_literal s ik, ty)
   | _ ->
       error "illegal lvalue"; ezero
 
diff --git a/cparser/C.mli b/cparser/C.mli
index 763a9277..1388fab9 100644
--- a/cparser/C.mli
+++ b/cparser/C.mli
@@ -64,7 +64,7 @@ type constant =
   | CInt of int64 * ikind * string      (* as it appeared in the source *)
   | CFloat of float_cst * fkind
   | CStr of string
-  | CWStr of int64 list
+  | CWStr of int64 list * ikind
   | CEnum of ident * int64              (* enum tag, integer value *)
 
 (** Attributes *)
diff --git a/cparser/Cabs.v b/cparser/Cabs.v
index bf8c8c74..59cd30ab 100644
--- a/cparser/Cabs.v
+++ b/cparser/Cabs.v
@@ -31,6 +31,13 @@ Record floatInfo := {
   suffix_FI:option string
 }.
 
+Inductive encoding :=
+  | EncNone                             (* no prefix *)
+  | EncWide                             (* 'L' prefix *)
+  | EncU16                              (* 'u' prefix *)
+  | EncU32                              (* 'U' prefix *)
+  | EncUTF8.                            (* 'u8' prefix (strings only) *)
+
 Inductive structOrUnion :=
   | STRUCT | UNION.
 
@@ -152,8 +159,8 @@ with constant :=
      the source code. *)
   | CONST_INT : string -> constant
   | CONST_FLOAT : floatInfo -> constant
-  | CONST_CHAR : bool -> list char_code -> constant
-  | CONST_STRING : bool -> list char_code -> constant
+  | CONST_CHAR : encoding -> list char_code -> constant
+  | CONST_STRING : encoding -> list char_code -> constant
 
 with init_expression :=
   | NO_INIT
@@ -194,9 +201,9 @@ Definition generic_assoc := (option type_name * expression)%type.
 
 (* GCC extended asm *)
 Inductive asm_operand :=
-| ASMOPERAND: option string -> bool -> list char_code -> expression -> asm_operand.
+| ASMOPERAND: option string -> encoding -> list char_code -> expression -> asm_operand.
 
-Definition asm_flag := (bool * list char_code)%type.
+Definition asm_flag := (encoding * list char_code)%type.
 
 (*
 ** Declaration definition (at toplevel)
@@ -227,7 +234,7 @@ with statement :=
  | DEFAULT : statement -> loc -> statement
  | LABEL : string -> statement -> loc -> statement
  | GOTO : string -> loc -> statement
- | ASM : list cvspec -> bool -> list char_code -> list asm_operand -> list asm_operand -> list asm_flag -> loc -> statement
+ | ASM : list cvspec -> encoding -> list char_code -> list asm_operand -> list asm_operand -> list asm_flag -> loc -> statement
  | DEFINITION : definition -> statement (*definition or declaration of a variable or type*)
 
 with for_clause :=
diff --git a/cparser/Ceval.ml b/cparser/Ceval.ml
index 14f61e06..0800e25b 100644
--- a/cparser/Ceval.ml
+++ b/cparser/Ceval.ml
@@ -72,7 +72,7 @@ let normalize_int n ik =
 type value =
   | I of int64
   | S of string
-  | WS of int64 list
+  | WS of int64 list * ikind
 
 let boolean_value v =
   match v with
@@ -83,7 +83,7 @@ let constant = function
   | CInt(v, ik, _) -> I (normalize_int v ik)
   | CFloat(v, fk) -> raise Notconst
   | CStr s -> S s
-  | CWStr s -> WS s
+  | CWStr(s, ik) -> WS(s, ik)
   | CEnum(id, v) -> I v
 
 let is_signed env ty =
@@ -274,7 +274,7 @@ let constant_expr env ty e =
     | TInt(ik, _), I n -> Some(CInt(n, ik, ""))
     | TPtr(_, _), I n -> Some(CInt(n, ptr_t_ikind (), ""))
     | (TArray(_, _, _) | TPtr(_, _)), S s -> Some(CStr s)
-    | (TArray(_, _, _) | TPtr(_, _)), WS s -> Some(CWStr s)
+    | (TArray(_, _, _) | TPtr(_, _)), WS(s, ik) -> Some(CWStr(s, ik))
     | TEnum(_, _), I n -> Some(CInt(n, enum_ikind, ""))
     | _   -> None
   with Notconst -> None
diff --git a/cparser/Cprint.ml b/cparser/Cprint.ml
index caa4fa66..05448784 100644
--- a/cparser/Cprint.ml
+++ b/cparser/Cprint.ml
@@ -75,7 +75,7 @@ let const pp = function
             else fprintf pp "\\%03o" (Char.code c)
       done;
       fprintf pp "\""
-  | CWStr l ->
+  | CWStr(l, _) ->
       fprintf pp "L\"";
       List.iter
         (fun c ->
diff --git a/cparser/Cutil.ml b/cparser/Cutil.ml
index 6fd12323..f226d51b 100644
--- a/cparser/Cutil.ml
+++ b/cparser/Cutil.ml
@@ -1024,9 +1024,9 @@ let type_of_constant = function
   | CStr s ->
     let size = Int64.of_int (String.length s + 1) in
     TArray(TInt(IChar,[]), Some size, [])
-  | CWStr s ->
+  | CWStr(s, ik) ->
     let size = Int64.of_int (List.length s + 1) in
-    TArray(TInt(wchar_ikind(), []), Some size, [])
+    TArray(TInt(ik, []), Some size, [])
   | CEnum(_, _) -> TInt(IInt, [])
 
 (* Check that a C expression is a lvalue *)
diff --git a/cparser/Diagnostics.ml b/cparser/Diagnostics.ml
index 483b0376..8a8a0c17 100644
--- a/cparser/Diagnostics.ml
+++ b/cparser/Diagnostics.ml
@@ -104,6 +104,7 @@ type warning_type =
   | Tentative_incomplete_static
   | Reduced_alignment
   | Non_linear_cond_expr
+  | Invalid_UTF8
 
 (* List of all warnings with default status.
    "true" means the warning is active by default.
@@ -140,6 +141,7 @@ let all_warnings =
     (Tentative_incomplete_static, false);
     (Reduced_alignment, false);
     (Non_linear_cond_expr, false);
+    (Invalid_UTF8, true);
   ]
 
 (* List of active warnings *)
@@ -182,6 +184,7 @@ let string_of_warning = function
   | Tentative_incomplete_static -> "tentative-incomplete-static"
   | Reduced_alignment -> "reduced-alignment"
   | Non_linear_cond_expr -> "non-linear-cond-expr"
+  | Invalid_UTF8 -> "invalid-utf8"
 
 (* Activate the given warning *)
 let activate_warning w () =
diff --git a/cparser/Diagnostics.mli b/cparser/Diagnostics.mli
index 1210353f..47727707 100644
--- a/cparser/Diagnostics.mli
+++ b/cparser/Diagnostics.mli
@@ -57,6 +57,7 @@ type warning_type =
   | Tentative_incomplete_static    (** static tentative definition with incomplete type *)
   | Reduced_alignment              (** alignment reduction *)
   | Non_linear_cond_expr           (** condition that cannot be linearized *)
+  | Invalid_UTF8                   (** invalid UTF-8 encoding *)
 
 val warning  : (string * int) -> warning_type -> ('a, Format.formatter, unit, unit, unit, unit) format6 -> 'a
 (** [warning (f,c) w fmt arg1 ... argN] formats the arguments [arg1] to [argN] as warining according to
diff --git a/cparser/Elab.ml b/cparser/Elab.ml
index 01e745c3..ec79634a 100644
--- a/cparser/Elab.ml
+++ b/cparser/Elab.ml
@@ -395,48 +395,75 @@ let elab_float_constant f =
   in
   (v, ty)
 
-let elab_char_constant loc wide chars =
+let check_char_range loc ikind chars =
+  let max = Int64.shift_left 1L (sizeof_ikind ikind * 8) in
+  List.iter
+    (fun c ->
+      if c >= max then error loc "escape sequence 0x%LX is out of range" c)
+    chars
+
+let ikind_of_encoding = function
+  | EncNone -> IChar
+  | EncWide -> wchar_ikind()
+  | EncU16 -> IUShort
+  | EncU32 -> IUInt
+  | EncUTF8 -> IChar
+
+let elab_char_constant loc enc chars =
   let len = List.length chars in
-  let nbits = if wide then 8 * !config.sizeof_wchar else 8 in
-  let max_digit = Int64.shift_left 1L nbits in
-  (* Treat multi-character constants as a number in base 2^nbits.
-     It must fit in type int for a normal constant and in type wchar_t
-     for a wide constant. *)
+  (* We support multi-character constants for EncNone character literals only.
+     We treat them as big-endian numbers in base 256. *)
   let v =
-    if len > (if wide then 1 else !config.sizeof_int) then begin
-      error loc "%d-character constant too long for its type" len;
-      0L
-    end else
-      List.fold_left
-        (fun acc d ->
-          if d < 0L || d >= max_digit then
-            error loc "escape sequence is out of range (code 0x%LX)" d;
-          Int64.add (Int64.shift_left acc nbits) d)
-        0L chars in
-  (* C99 6.4.4.4 items 10 and 11:
-       single-character constant -> represent at type char
+    match chars, enc with
+    | [], _ -> error loc "empty character constant"; 0L
+    | [c], _ -> c
+    | _, EncNone ->
+      if len > !config.sizeof_int then begin
+        error loc "%d-character constant too long, overflows its type" len;
+        0L
+      end else begin
+        check_char_range loc IUChar chars;
+        List.fold_left
+          (fun acc d -> Int64.(add (shift_left acc 8) d))
+          0L chars
+      end
+    | _, _ ->
+      error loc "%d-character constant not supported" len; 0L in
+  (* C11 6.4.4.4 items 10 and 11:
+       normal single-character constant -> represent at type char
        multi-character constant -> represent at type int
-       wide character constant -> represent at type wchar_t *)
-  Ceval.normalize_int v
-    (if wide then wchar_ikind() else if len = 1 then IChar else IInt)
-
-let elab_string_literal loc wide chars =
-  let nbits = if wide then 8 * !config.sizeof_wchar else 8 in
-  let char_max = Int64.shift_left 1L nbits in
-  List.iter
-    (fun c ->
-      if c < 0L || c >= char_max
-      then error loc "escape sequence is out of range (code 0x%LX)" c)
-    chars;
-  if wide then
-    CWStr chars
-  else begin
-    let res = Bytes.create (List.length chars) in
-    List.iteri
-      (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c)))
-      chars;
-    CStr (Bytes.to_string res)
-  end
+       L character constant -> represent at type wchar_t
+       u character constant -> represent at type char16_t
+       U character constant -> represent at type char32_t *)
+  let ik =
+    if enc = EncNone && len > 1 then IInt else ikind_of_encoding enc in
+  let v' = Ceval.normalize_int v ik in
+  if v' <> v then
+    warning loc Constant_conversion
+      "overflow in character constant, changes value from %Ld to %Ld" v v';
+  v'
+
+let elab_string_literal loc enc chars =
+  let ik = ikind_of_encoding enc in
+  check_char_range loc ik chars;
+  match enc with
+  | EncNone | EncUTF8 ->
+      let res = Bytes.create (List.length chars) in
+      List.iteri
+        (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c)))
+        chars;
+      CStr (Bytes.to_string res)
+  | EncWide | EncU16 | EncU32 -> 
+      CWStr(chars, ik)
+
+let warn_C11_literals loc enc kind =
+  let warn enc =
+    warning loc Celeven_extension "'%s' %s are a C11 extension" enc kind in
+  match enc with
+  | EncNone | EncWide -> ()
+  | EncUTF8 -> warn "u8"
+  | EncU16 -> warn "u"
+  | EncU32 -> warn "U"
 
 let elab_constant loc = function
   | CONST_INT s ->
@@ -445,14 +472,22 @@ let elab_constant loc = function
   | CONST_FLOAT f ->
       let (v, fk) = elab_float_constant f in
       CFloat(v, fk)
-  | CONST_CHAR(wide, s) ->
-      let ikind = if wide then wchar_ikind () else IInt in
-      CInt(elab_char_constant loc wide s, ikind, "")
+  | CONST_CHAR(enc, s) ->
+      warn_C11_literals loc enc "character constants";
+      let ikind =
+        match enc with
+        | EncNone -> IInt
+        | EncWide -> wchar_ikind ()
+        | EncU16 -> IUShort
+        | EncU32 -> IUInt
+        | EncUTF8 -> assert false in
+      CInt(elab_char_constant loc enc s, ikind, "")
   | CONST_STRING(wide, s) ->
+      warn_C11_literals loc wide "string literals";
       elab_string_literal loc wide s
 
-let elab_simple_string loc wide chars =
-  match elab_string_literal loc wide chars with
+let elab_simple_string loc enc chars =
+  match elab_string_literal loc enc chars with
   | CStr s -> s
   | _ -> error loc "cannot use wide string literal in 'asm'"; ""
 
@@ -1586,6 +1621,7 @@ and elab_item zi item il =
      | COMPOUND_INIT [_, SINGLE_INIT(CONSTANT (CONST_STRING(w, s)))]),
     TArray(ty_elt, sz, _)
     when is_integer_type env ty_elt ->
+      warn_C11_literals loc w "string literals";
       begin match elab_string_literal loc w s, unroll env ty_elt with
       | CStr s, TInt((IChar | ISChar | IUChar), _) ->
           if not (I.index_below (Int64.of_int(String.length s - 1)) sz) then
@@ -1594,12 +1630,12 @@ and elab_item zi item il =
       | CStr _, _ ->
           error loc "initialization of an array of non-char elements with a string literal";
           elab_list zi il false
-      | CWStr s, TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(wchar_ikind(), [])) ->
+      | CWStr(s, ik), TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(ik, [])) ->
           if not (I.index_below (Int64.of_int(List.length s - 1)) sz) then
             warning loc Unnamed "initializer string for array of wide chars %s is too long" (I.name zi);
           elab_list (I.set zi (init_int_array_wstring sz s)) il false
       | CWStr _, _ ->
-          error loc "initialization of an array of non-wchar_t elements with a wide string literal";
+          error loc "type mismatch between array destination and wide string literal";
           elab_list zi il false
       | _ -> assert false
       end
diff --git a/cparser/Elab.mli b/cparser/Elab.mli
index bca4f74d..537056fa 100644
--- a/cparser/Elab.mli
+++ b/cparser/Elab.mli
@@ -21,6 +21,6 @@ val elab_file : Cabs.definition list -> C.program
 
 val elab_int_constant : Cabs.loc -> string -> int64 * C.ikind
 val elab_float_constant : Cabs.floatInfo -> C.float_cst * C.fkind
-val elab_char_constant : Cabs.loc -> bool -> int64 list -> int64
+val elab_char_constant : Cabs.loc -> Cabs.encoding -> int64 list -> int64
   (* These auxiliary functions are exported so that they can be reused
      in other projects that deal with C-style source languages. *)
diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll
index 9f7fba1e..2efa4216 100644
--- a/cparser/Lexer.mll
+++ b/cparser/Lexer.mll
@@ -3,6 +3,7 @@
 (*              The Compcert verified compiler                         *)
 (*                                                                     *)
 (*          Jacques-Henri Jourdan, INRIA Paris-Rocquencourt            *)
+(*             Xavier Leroy, Collège de France and Inria               *)
 (*                                                                     *)
 (*  Copyright Institut National de Recherche en Informatique et en     *)
 (*  Automatique.  All rights reserved.  This file is distributed       *)
@@ -144,9 +145,9 @@ let error lb fmt =
   Diagnostics.error
     (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) fmt
 
-let warning lb fmt =
+let warning lb kind fmt =
   Diagnostics.warning
-      (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) Diagnostics.Unnamed fmt
+      (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) kind fmt
 
 (* Simple character escapes *)
 
@@ -160,6 +161,89 @@ let convert_escape = function
   | 't' -> 9L  (* horizontal tab *)
   | 'v' -> 11L (* vertical tab *)
   | c   -> Int64.of_int (Char.code c)
+
+(* Encodings for character and string literals *)
+
+let encoding_of = function
+  | "" -> Cabs.EncNone
+  | "L" -> Cabs.EncWide
+  | "u" -> Cabs.EncU16
+  | "U" -> Cabs.EncU32
+  | "u8" -> Cabs.EncUTF8
+  | _ -> assert false
+
+let combine_encodings loc e1 e2 =
+  if e1 = Cabs.EncNone then e2
+  else if e2 = Cabs.EncNone then e1
+  else if e1 = e2 then e1
+  else Diagnostics.fatal_error
+           Cabs.(loc.filename, loc.lineno)
+           "unsupported non-standard concatenation of string literals"
+
+(* Handling of characters and escapes in string and char constants *)
+
+type chr = Chr of int | Esc of int64
+
+let check_utf8 lexbuf min x =
+  if x > 0x10FFFF || (x >= 0xD800 && x <= 0xDFFF) then
+    warning lexbuf Diagnostics.Invalid_UTF8 "Wrong Unicode value U+%X" x;
+  if x < min then
+    warning lexbuf Diagnostics.Invalid_UTF8
+            "Overlong UTF-8 encoding for Unicode value U+%X" x;
+  Chr x
+
+let check_universal_character lexbuf x =
+  if x > 0x10FFFF
+  || x >= 0xD800 && x <= 0xDFFF
+  || x < 0xA0 && x <> 0x24 && x <> 0x40 && x <> 0x60
+  then begin
+    error lexbuf "Wrong universal character name U+%X" x; Chr 0
+  end else
+    Chr x
+
+let add_char_utf8 x accu =
+  if x <= 0x007F then
+    Int64.of_int x :: accu
+  else if x <= 0x07FF then
+    Int64.of_int (0x80 lor (x land 0x3F)) ::
+    Int64.of_int (0xC0 lor (x lsr 6)) ::
+    accu
+  else if x <= 0xFFFF then
+    Int64.of_int (0x80 lor (x land 0x3F)) ::
+    Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) ::
+    Int64.of_int (0xE0 lor (x lsr 12)) ::
+    accu
+  else
+    Int64.of_int (0x80 lor (x land 0x3F)) ::
+    Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) ::
+    Int64.of_int (0x80 lor ((x lsr 12) land 0x3F)) ::
+    Int64.of_int (0xF0 lor (x lsr 18)) ::
+    accu
+
+let add_char_utf16 x accu =
+  if x <= 0xFFFF then
+    Int64.of_int x :: accu
+  else begin
+    let x = x - 0x10000 in
+    Int64.of_int (0xDC00 lor (x land 0x3FF)) ::
+    Int64.of_int (0xD800 lor (x lsr 10)) ::
+    accu
+  end
+
+let add_char enc c accu =
+  match c, enc with
+  | Esc x, _ -> (* Escapes are never encoded *)
+      x :: accu
+  | Chr x, (Cabs.EncNone | Cabs.EncUTF8) -> (* Characters are encoded in UTF8 *)
+      add_char_utf8 x accu
+  | Chr x, Cabs.EncU16 -> (* Characters are encoded in UTF16 *)
+      add_char_utf16 x accu
+  | Chr x, Cabs.EncU32 -> (* Characters are not encoded *)
+      Int64.of_int x :: accu
+  | Chr x, Cabs.EncWide -> (* Depends on size of wchar_t *)
+      if Machine.(!config.sizeof_wchar) = 2
+      then add_char_utf16 x accu
+      else Int64.of_int x :: accu
 }
 
 (* Identifiers *)
@@ -249,11 +333,6 @@ let octal_escape_sequence =
          | octal_digit octal_digit
          | octal_digit octal_digit octal_digit) as n)
 let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n)
-let escape_sequence =
-    simple_escape_sequence
-  | octal_escape_sequence
-  | hexadecimal_escape_sequence
-  | universal_character_name
 
 rule initial = parse
   | '\n'                          { new_line lexbuf; initial_linebegin lexbuf }
@@ -283,16 +362,13 @@ rule initial = parse
                                       currentLoc lexbuf)}
   | preprocessing_number as s     { error lexbuf "invalid numerical constant '%s'@ These characters form a preprocessor number, but not a constant" s;
                                     CONSTANT (Cabs.CONST_INT "0", currentLoc lexbuf) }
-  | "'"                           { let l = char_literal lexbuf.lex_start_p [] lexbuf in
-                                    CONSTANT (Cabs.CONST_CHAR(false, l),
-                                              currentLoc lexbuf) }
-  | "L'"                          { let l = char_literal lexbuf.lex_start_p [] lexbuf in
-                                    CONSTANT (Cabs.CONST_CHAR(true, l),
-                                              currentLoc lexbuf) }
-  | "\""                          { let l = string_literal lexbuf.lex_start_p [] lexbuf in
-                                    STRING_LITERAL(false, l, currentLoc lexbuf) }
-  | "L\""                         { let l = string_literal lexbuf.lex_start_p [] lexbuf in
-                                    STRING_LITERAL(true, l, currentLoc lexbuf) }
+  | (""|"L"|"u"|"U") as e "'"     { let enc = encoding_of e in
+                                    let l = char_literal lexbuf.lex_start_p [] lexbuf in
+                                    CONSTANT (Cabs.CONST_CHAR(enc, l), currentLoc lexbuf) }
+  | (""|"L"|"u"|"U"|"u8") as e "\""
+                                  { let enc = encoding_of e in
+                                    let l = string_literal lexbuf.lex_start_p enc [] lexbuf in
+                                    STRING_LITERAL(enc, l, currentLoc lexbuf) }
   | "..."                         { ELLIPSIS(currentLoc lexbuf) }
   | "+="                          { ADD_ASSIGN(currentLoc lexbuf) }
   | "-="                          { SUB_ASSIGN(currentLoc lexbuf) }
@@ -357,39 +433,62 @@ and initial_linebegin = parse
 and char = parse
   | universal_character_name
       { try
-          Int64.of_string ("0x" ^ n)
+          check_universal_character lexbuf (int_of_string ("0x" ^ n))
         with Failure _ ->
           error lexbuf "overflow in universal character name";
-          0L
+          Chr 0
       }
   | hexadecimal_escape_sequence
       { try
-          Int64.of_string ("0x" ^ n)
+          Esc (Int64.of_string ("0x" ^ n))
         with Failure _ ->
           error lexbuf "overflow in hexadecimal escape sequence";
-          0L
+          Esc 0L
       }
   | octal_escape_sequence
-      { Int64.of_string  ("0o" ^ n) }
+      { Esc (Int64.of_string  ("0o" ^ n)) }
   | simple_escape_sequence
-      { convert_escape c }
+      { Esc (convert_escape c) }
+  | "\\u" | "\\U"
+      { error lexbuf "incomplete universal character name";
+        Chr 0 }
   | '\\' (_ as c)
       { error lexbuf "incorrect escape sequence '\\%c'" c;
-        Int64.of_int (Char.code c) }
+        Esc (Int64.of_int (Char.code c)) }
+  | ['\x00'-'\x7F'] as c1
+      { Chr (Char.code c1) }
+  | (['\xC0'-'\xDF'] as c1) (['\x80'-'\xBF'] as c2)
+      { check_utf8 lexbuf 0x80
+          ( (Char.code c1 land 0b00011111) lsl 6
+          + (Char.code c2 land 0b00111111)) }
+  | (['\xE0'-'\xEF'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3)
+      { check_utf8 lexbuf 0x800
+          ( (Char.code c1 land 0b00001111) lsl 12
+          + (Char.code c2 land 0b00111111) lsl 6
+          + (Char.code c3 land 0b00111111) ) }
+  | (['\xF0'-'\xF7'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) (['\x80'-'\xBF'] as c4)
+     { check_utf8 lexbuf 0x10000
+          ( (Char.code c1 land 0b00000111) lsl 18
+          + (Char.code c2 land 0b00111111) lsl 12
+          + (Char.code c3 land 0b00111111) lsl 6
+          + (Char.code c4 land 0b00111111) ) }
   | _ as c
-      { Int64.of_int (Char.code c) }
+     { warning lexbuf Diagnostics.Invalid_UTF8
+               "Invalid UTF8 encoding: byte 0x%02x" (Char.code c);
+       Esc (Int64.of_int (Char.code c)) (* re-encode as-is *)
+     }
 
 and char_literal startp accu = parse
   | '\''       { lexbuf.lex_start_p <- startp;
                  List.rev accu }
   | '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" }
-  | ""         { let c = char lexbuf in char_literal startp (c :: accu) lexbuf }
+  | ""         { let c = char lexbuf in char_literal startp (add_char Cabs.EncU32 c accu) lexbuf }
 
-and string_literal startp accu = parse
+and string_literal startp enc accu = parse
   | '\"'       { lexbuf.lex_start_p <- startp;
                  List.rev accu }
   | '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" }
-  | ""         { let c = char lexbuf in string_literal startp (c :: accu) lexbuf }
+  | ""         { let c = char lexbuf in string_literal startp enc (add_char enc c accu) lexbuf }
 
 (* We assume gcc -E syntax but try to tolerate variations. *)
 and hash = parse
@@ -402,7 +501,8 @@ and hash = parse
           try
             int_of_string n
           with Failure _ ->
-            warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum
+            warning lexbuf Diagnostics.Unnamed "invalid line number";
+            lexbuf.lex_curr_p.pos_lnum
         in
         lexbuf.lex_curr_p <- {
           lexbuf.lex_curr_p with
@@ -417,7 +517,7 @@ and hash = parse
     ([^ '\n']* as s) '\n'
       { new_line lexbuf; PRAGMA (s, currentLoc lexbuf) }
   | [^ '\n']* '\n'
-      { warning lexbuf "unrecognized '#' line";
+      { warning lexbuf Diagnostics.Unnamed "unrecognized '#' line";
         new_line lexbuf; initial_linebegin lexbuf }
   | [^ '\n']* eof
       { fatal_error lexbuf "unexpected end of file" }
@@ -583,21 +683,21 @@ and singleline_comment = parse
       | Pre_parser.STAR loc -> loop (Parser.STAR loc)
       | Pre_parser.STATIC loc -> loop (Parser.STATIC loc)
       | Pre_parser.STATIC_ASSERT loc -> loop (Parser.STATIC_ASSERT loc)
-      | Pre_parser.STRING_LITERAL (wide, str, loc) ->
+      | Pre_parser.STRING_LITERAL (enc, str, loc) ->
           (* Merge consecutive string literals *)
-          let rec doConcat wide str =
+          let rec doConcat enc str =
             match Queue.peek tokens with
-            | Pre_parser.STRING_LITERAL (wide', str', loc) ->
+            | Pre_parser.STRING_LITERAL (enc', str', loc') ->
                ignore (Queue.pop tokens);
-               let (wide'', str'') = doConcat wide' str' in
+               let (enc'', str'') = doConcat enc' str' in
                if str'' <> []
-               then (wide || wide'', str @ str'')
-               else (wide, str)
-            | _ -> (wide, str)
-            | exception Queue.Empty -> (wide, str)
+               then (combine_encodings loc enc enc'', str @ str'')
+               else (enc, str)
+            | _ -> (enc, str)
+            | exception Queue.Empty -> (enc, str)
           in
-          let (wide', str') = doConcat wide str in
-          loop (Parser.STRING_LITERAL ((wide', str'), loc))
+          let (enc', str') = doConcat enc str in
+          loop (Parser.STRING_LITERAL ((enc', str'), loc))
       | Pre_parser.STRUCT loc -> loop (Parser.STRUCT loc)
       | Pre_parser.SUB_ASSIGN loc -> loop (Parser.SUB_ASSIGN loc)
       | Pre_parser.SWITCH loc -> loop (Parser.SWITCH loc)
diff --git a/cparser/Parser.vy b/cparser/Parser.vy
index 8d7cd055..12337367 100644
--- a/cparser/Parser.vy
+++ b/cparser/Parser.vy
@@ -23,7 +23,7 @@ Require Cabs.
 
 %token<Cabs.string * Cabs.loc> VAR_NAME TYPEDEF_NAME OTHER_NAME
 %token<Cabs.string * Cabs.loc> PRAGMA
-%token<bool * list Cabs.char_code * Cabs.loc> STRING_LITERAL
+%token<Cabs.encoding * list Cabs.char_code * Cabs.loc> STRING_LITERAL
 %token<Cabs.constant * Cabs.loc> CONSTANT
 %token<Cabs.loc> SIZEOF PTR INC DEC LEFT RIGHT LEQ GEQ EQEQ EQ NEQ LT GT
   ANDAND BARBAR PLUS MINUS STAR TILDE BANG SLASH PERCENT HAT BAR QUESTION
@@ -124,8 +124,8 @@ primary_expression:
 | cst = CONSTANT
     { (Cabs.CONSTANT (fst cst), snd cst) }
 | str = STRING_LITERAL
-    { let '((wide, chars), loc) := str in
-      (Cabs.CONSTANT (Cabs.CONST_STRING wide chars), loc) }
+    { let '((enc, chars), loc) := str in
+      (Cabs.CONSTANT (Cabs.CONST_STRING enc chars), loc) }
 | loc = LPAREN expr = expression RPAREN
     { (fst expr, loc)}
 | sel = generic_selection
@@ -786,8 +786,8 @@ designator:
 static_assert_declaration:
 | loc = STATIC_ASSERT LPAREN expr = constant_expression
                         COMMA str = STRING_LITERAL RPAREN SEMICOLON
-    { let '((wide, chars), locs) := str in
-      (expr, (Cabs.CONST_STRING wide chars, locs), loc) }
+    { let '((enc, chars), locs) := str in
+      (expr, (Cabs.CONST_STRING enc chars, locs), loc) }
 
 (* 6.8 *)
 statement_dangerous:
@@ -922,9 +922,9 @@ jump_statement:
 asm_statement:
 | loc = ASM attr = asm_attributes LPAREN template = STRING_LITERAL args = asm_arguments
   RPAREN SEMICOLON
-    { let '(wide, chars, _) := template in
+    { let '(enc, chars, _) := template in
       let '(outputs, inputs, flags) := args in
-      Cabs.ASM attr wide chars outputs inputs flags loc }
+      Cabs.ASM attr enc chars outputs inputs flags loc }
 
 asm_attributes:
 | /* empty */
@@ -954,7 +954,7 @@ asm_operands_ne:
 
 asm_operand:
 | n = asm_op_name cstr = STRING_LITERAL LPAREN e = expression RPAREN
-    { let '(wide, s, loc) := cstr in Cabs.ASMOPERAND n wide s (fst e) }
+    { let '(enc, s, loc) := cstr in Cabs.ASMOPERAND n enc s (fst e) }
 
 asm_op_name:
 | /* empty */                         { None }
@@ -962,9 +962,9 @@ asm_op_name:
 
 asm_flags:
 | f = STRING_LITERAL
-    { let '(wide, s, loc) := f in (wide, s) :: nil }
+    { let '(enc, s, loc) := f in (enc, s) :: nil }
 | f = STRING_LITERAL COMMA fl = asm_flags
-    { let '(wide, s, loc) := f in (wide, s) :: fl }
+    { let '(enc, s, loc) := f in (enc, s) :: fl }
 
 (* 6.9 *)
 translation_unit_file:
diff --git a/cparser/pre_parser.mly b/cparser/pre_parser.mly
index ad294398..00ca0ade 100644
--- a/cparser/pre_parser.mly
+++ b/cparser/pre_parser.mly
@@ -47,7 +47,7 @@
 %token<string * Pre_parser_aux.identifier_type ref * Cabs.loc>
   VAR_NAME TYPEDEF_NAME
 %token<Cabs.constant * Cabs.loc> CONSTANT
-%token<bool * int64 list * Cabs.loc> STRING_LITERAL
+%token<Cabs.encoding * int64 list * Cabs.loc> STRING_LITERAL
 %token<string * Cabs.loc> PRAGMA
 
 %token<Cabs.loc> SIZEOF PTR INC DEC LEFT RIGHT LEQ GEQ EQEQ EQ NEQ LT GT
diff --git a/driver/Frontend.ml b/driver/Frontend.ml
index 6133291e..4ee5c712 100644
--- a/driver/Frontend.ml
+++ b/driver/Frontend.ml
@@ -154,7 +154,8 @@ let gnu_prepro_actions = [
   Exact "-iquote", String (gnu_prepro_opt_key "-iquote");
   Exact "-P", Self gnu_prepro_opt;
   Exact "-C", Self gnu_prepro_opt;
-  Exact "-CC", Self gnu_prepro_opt;]
+  Exact "-CC", Self gnu_prepro_opt;
+  Prefix "-finput-charset=", Self gnu_prepro_opt]
 
 let prepro_actions = [
   (* Preprocessing options *)
@@ -198,6 +199,8 @@ let gnu_prepro_help =
   -C              Do not discard comments
   -CC             Do not discard comments, including during macro
                   expansion
+  -finput-charset=<charset>
+                  Set the input character set, used for reading source files.
 |}
 
 let prepro_help = {|Preprocessing options:
diff --git a/test/regression/Makefile b/test/regression/Makefile
index daee05bc..53719900 100644
--- a/test/regression/Makefile
+++ b/test/regression/Makefile
@@ -16,7 +16,7 @@ TESTS=int32 int64 floats floats-basics floats-lit \
   funct3 expr5 struct7 struct8 struct11 struct12 casts1 casts2 char1 \
   sizeof1 sizeof2 binops bool for1 for2 switch switch2 compound \
   decl1 bitfields9 ptrs3 \
-  parsing krfun ifconv generic
+  parsing krfun ifconv generic stringlit charlit
 
 # Can run, but only in compiled mode, and have reference output in Results
 
diff --git a/test/regression/Results/charlit b/test/regression/Results/charlit
new file mode 100644
index 00000000..3954a0c8
--- /dev/null
+++ b/test/regression/Results/charlit
@@ -0,0 +1,27 @@
+c1: 61
+c2: 61
+c3: 61
+c4: 61
+d1: fe
+d2: fe
+d3: fe
+d4: fe
+e1: 34
+e2: 1234
+e3: 1234
+e4: 1234
+f1: e9
+f2: e9
+f3: e9
+f4: e9
+g1: 2b
+g2: 732b
+g3: 732b
+g4: 732b
+h1: 4c
+h2: f34c
+h3: 1f34c
+h4: 1f34c
+m1: 6162
+m2: 1020304
+m3: e9e8
diff --git a/test/regression/Results/stringlit b/test/regression/Results/stringlit
new file mode 100644
index 00000000..d6967ccc
--- /dev/null
+++ b/test/regression/Results/stringlit
@@ -0,0 +1,15 @@
+s1: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c
+s2: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c
+s3: size 12, contents 61 e9 732b d83c df4c
+s4: size 20, contents 61 e9 732b 1f34c
+s5: size 20, contents 61 e9 732b 1f34c
+t1: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c
+t2: size 11, contents 61 c3 a9 e7 8c ab f0 9f 8d 8c
+t3: size 12, contents 61 e9 732b d83c df4c
+t4: size 20, contents 61 e9 732b 1f34c
+t5: size 20, contents 61 e9 732b 1f34c
+e1: size 4, contents 61 e9 e8
+e2: size 4, contents 61 e9 e8
+e3: size 10, contents 61 e9 e8 732b
+e4: size 24, contents 61 e9 e8 732b 1f34c
+e5: size 24, contents 61 e9 e8 732b 1f34c
diff --git a/test/regression/charlit.c b/test/regression/charlit.c
new file mode 100644
index 00000000..5a7e0916
--- /dev/null
+++ b/test/regression/charlit.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <wchar.h>
+#include <uchar.h>
+
+unsigned char c1 = 'a';
+char16_t c2 = u'a';;
+char32_t c3 = U'a';;
+wchar_t c4 = L'a';;
+
+unsigned char d1 = '\xFE';
+char16_t d2 = u'\xFE';;
+char32_t d3 = U'\xFE';;
+wchar_t d4 = L'\xFE';;
+
+unsigned char e1 = '\x1234'; // warning but no error
+char16_t e2 = u'\x1234';
+char32_t e3 = U'\x1234';
+wchar_t e4 = L'\x1234';
+
+unsigned char f1 = 'é';      // CompCert tolerance
+char16_t f2 = u'é';
+char32_t f3 = U'é';
+wchar_t f4 = L'é';
+
+unsigned char g1 = '猫';     // CompCert tolerance + warning
+char16_t g2 = u'猫';
+char32_t g3 = U'猫';
+wchar_t g4 = L'猫';
+
+unsigned char h1 = '🍌';     // CompCert tolerance + warning
+char16_t h2 = u'🍌';         // CompCert tolerance + warning
+char32_t h3 = U'🍌';
+wchar_t h4 = L'🍌';
+
+int m1 = 'ab';
+int m2 = '\x01\x02\x03\x04';
+int m3 = 'éè';              // CompCert tolerance
+
+#define PRINT(x) printf("%s: %x\n", #x, x)
+
+int main()
+{
+  PRINT(c1); PRINT(c2); PRINT(c3); PRINT(c4);
+  PRINT(d1); PRINT(d2); PRINT(d3); PRINT(d4);
+  PRINT(e1); PRINT(e2); PRINT(e3); PRINT(e4);
+  PRINT(f1); PRINT(f2); PRINT(f3); PRINT(f4);
+  PRINT(g1); PRINT(g2); PRINT(g3); PRINT(g4);
+  PRINT(h1); PRINT(h2); PRINT(h3); PRINT(h4);
+  PRINT(m1); PRINT(m2); PRINT(m3);
+}
diff --git a/test/regression/stringlit.c b/test/regression/stringlit.c
new file mode 100644
index 00000000..155f8ebb
--- /dev/null
+++ b/test/regression/stringlit.c
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <wchar.h>
+#include <uchar.h>
+
+/* Without escapes nor universal character names */
+
+unsigned char s1[] = "aé猫🍌";
+unsigned char s2[] = u8"aé猫🍌";
+char16_t s3[] = u"aé猫🍌";
+char32_t s4[] = U"aé猫🍌";
+wchar_t s5[] = L"aé猫🍌";
+
+/* With universal character names */
+
+unsigned char t1[] = "a\u00e9\u732B\U0001F34C";
+unsigned char t2[] = u8"a\u00e9\u732B\U0001F34C";
+char16_t t3[] = u"a\u00e9\u732B\U0001F34C";
+char32_t t4[] = U"a\u00e9\u732B\U0001F34C";
+wchar_t t5[] = L"a\u00e9\u732B\U0001F34C";
+
+/* With numerical escapes */
+
+unsigned char e1[] = "a\xe9\350";
+unsigned char e2[] = u8"a\xe9\350";
+char16_t e3[] = u"a\xe9\350\x732B";
+char32_t e4[] = U"a\xe9\350\x732B\x0001F34C";
+wchar_t e5[] = L"a\xe9\350\x732B\x0001F34C";
+
+#define PRINT(x) \
+  printf("%s: size %u, contents", #x, (int) sizeof(x)); \
+  for (int i = 0; x[i] != 0; i++) printf(" %x", x[i]); \
+  printf("\n")
+
+int main()
+{
+  PRINT(s1); PRINT(s2); PRINT(s3); PRINT(s4); PRINT(s5);
+  PRINT(t1); PRINT(t2); PRINT(t3); PRINT(t4); PRINT(t5);
+  PRINT(e1); PRINT(e2); PRINT(e3); PRINT(e4); PRINT(e5);
+  return 0;
+}
+
-- 
cgit