12 files changed, 257 insertions, 110 deletions
diff --git a/cparser/C.mli b/cparser/C.mli
index 763a9277..1388fab9 100644
--- a/cparser/C.mli
+++ b/cparser/C.mli
@@ -64,7 +64,7 @@ type constant =
   | CInt of int64 * ikind * string      (* as it appeared in the source *)
   | CFloat of float_cst * fkind
   | CStr of string
-  | CWStr of int64 list
+  | CWStr of int64 list * ikind
   | CEnum of ident * int64              (* enum tag, integer value *)
 
 (** Attributes *)
diff --git a/cparser/Cabs.v b/cparser/Cabs.v
index bf8c8c74..59cd30ab 100644
--- a/cparser/Cabs.v
+++ b/cparser/Cabs.v
@@ -31,6 +31,13 @@ Record floatInfo := {
   suffix_FI:option string
 }.
 
+Inductive encoding :=
+  | EncNone                             (* no prefix *)
+  | EncWide                             (* 'L' prefix *)
+  | EncU16                              (* 'u' prefix *)
+  | EncU32                              (* 'U' prefix *)
+  | EncUTF8.                            (* 'u8' prefix (strings only) *)
+
 Inductive structOrUnion :=
   | STRUCT | UNION.
 
@@ -152,8 +159,8 @@ with constant :=
      the source code. *)
   | CONST_INT : string -> constant
   | CONST_FLOAT : floatInfo -> constant
-  | CONST_CHAR : bool -> list char_code -> constant
-  | CONST_STRING : bool -> list char_code -> constant
+  | CONST_CHAR : encoding -> list char_code -> constant
+  | CONST_STRING : encoding -> list char_code -> constant
 
 with init_expression :=
   | NO_INIT
@@ -194,9 +201,9 @@ Definition generic_assoc := (option type_name * expression)%type.
 
 (* GCC extended asm *)
 Inductive asm_operand :=
-| ASMOPERAND: option string -> bool -> list char_code -> expression -> asm_operand.
+| ASMOPERAND: option string -> encoding -> list char_code -> expression -> asm_operand.
 
-Definition asm_flag := (bool * list char_code)%type.
+Definition asm_flag := (encoding * list char_code)%type.
 
 (*
 ** Declaration definition (at toplevel)
@@ -227,7 +234,7 @@ with statement :=
  | DEFAULT : statement -> loc -> statement
  | LABEL : string -> statement -> loc -> statement
  | GOTO : string -> loc -> statement
- | ASM : list cvspec -> bool -> list char_code -> list asm_operand -> list asm_operand -> list asm_flag -> loc -> statement
+ | ASM : list cvspec -> encoding -> list char_code -> list asm_operand -> list asm_operand -> list asm_flag -> loc -> statement
  | DEFINITION : definition -> statement (*definition or declaration of a variable or type*)
 
 with for_clause :=
diff --git a/cparser/Ceval.ml b/cparser/Ceval.ml
index 14f61e06..0800e25b 100644
--- a/cparser/Ceval.ml
+++ b/cparser/Ceval.ml
@@ -72,7 +72,7 @@ let normalize_int n ik =
 type value =
   | I of int64
   | S of string
-  | WS of int64 list
+  | WS of int64 list * ikind
 
 let boolean_value v =
   match v with
@@ -83,7 +83,7 @@ let constant = function
   | CInt(v, ik, _) -> I (normalize_int v ik)
   | CFloat(v, fk) -> raise Notconst
   | CStr s -> S s
-  | CWStr s -> WS s
+  | CWStr(s, ik) -> WS(s, ik)
   | CEnum(id, v) -> I v
 
 let is_signed env ty =
@@ -274,7 +274,7 @@ let constant_expr env ty e =
     | TInt(ik, _), I n -> Some(CInt(n, ik, ""))
     | TPtr(_, _), I n -> Some(CInt(n, ptr_t_ikind (), ""))
     | (TArray(_, _, _) | TPtr(_, _)), S s -> Some(CStr s)
-    | (TArray(_, _, _) | TPtr(_, _)), WS s -> Some(CWStr s)
+    | (TArray(_, _, _) | TPtr(_, _)), WS(s, ik) -> Some(CWStr(s, ik))
     | TEnum(_, _), I n -> Some(CInt(n, enum_ikind, ""))
     | _   -> None
   with Notconst -> None
diff --git a/cparser/Cprint.ml b/cparser/Cprint.ml
index caa4fa66..05448784 100644
--- a/cparser/Cprint.ml
+++ b/cparser/Cprint.ml
@@ -75,7 +75,7 @@ let const pp = function
             else fprintf pp "\\%03o" (Char.code c)
       done;
       fprintf pp "\""
-  | CWStr l ->
+  | CWStr(l, _) ->
       fprintf pp "L\"";
       List.iter
         (fun c ->
diff --git a/cparser/Cutil.ml b/cparser/Cutil.ml
index 6fd12323..f226d51b 100644
--- a/cparser/Cutil.ml
+++ b/cparser/Cutil.ml
@@ -1024,9 +1024,9 @@ let type_of_constant = function
   | CStr s ->
     let size = Int64.of_int (String.length s + 1) in
     TArray(TInt(IChar,[]), Some size, [])
-  | CWStr s ->
+  | CWStr(s, ik) ->
     let size = Int64.of_int (List.length s + 1) in
-    TArray(TInt(wchar_ikind(), []), Some size, [])
+    TArray(TInt(ik, []), Some size, [])
   | CEnum(_, _) -> TInt(IInt, [])
 
 (* Check that a C expression is a lvalue *)
diff --git a/cparser/Diagnostics.ml b/cparser/Diagnostics.ml
index 483b0376..8a8a0c17 100644
--- a/cparser/Diagnostics.ml
+++ b/cparser/Diagnostics.ml
@@ -104,6 +104,7 @@ type warning_type =
   | Tentative_incomplete_static
   | Reduced_alignment
   | Non_linear_cond_expr
+  | Invalid_UTF8
 
 (* List of all warnings with default status.
    "true" means the warning is active by default.
@@ -140,6 +141,7 @@ let all_warnings =
     (Tentative_incomplete_static, false);
     (Reduced_alignment, false);
     (Non_linear_cond_expr, false);
+    (Invalid_UTF8, true);
   ]
 
 (* List of active warnings *)
@@ -182,6 +184,7 @@ let string_of_warning = function
   | Tentative_incomplete_static -> "tentative-incomplete-static"
   | Reduced_alignment -> "reduced-alignment"
   | Non_linear_cond_expr -> "non-linear-cond-expr"
+  | Invalid_UTF8 -> "invalid-utf8"
 
 (* Activate the given warning *)
 let activate_warning w () =
diff --git a/cparser/Diagnostics.mli b/cparser/Diagnostics.mli
index 1210353f..47727707 100644
--- a/cparser/Diagnostics.mli
+++ b/cparser/Diagnostics.mli
@@ -57,6 +57,7 @@ type warning_type =
   | Tentative_incomplete_static    (** static tentative definition with incomplete type *)
   | Reduced_alignment              (** alignment reduction *)
   | Non_linear_cond_expr           (** condition that cannot be linearized *)
+  | Invalid_UTF8                   (** invalid UTF-8 encoding *)
 
 val warning  : (string * int) -> warning_type -> ('a, Format.formatter, unit, unit, unit, unit) format6 -> 'a
 (** [warning (f,c) w fmt arg1 ... argN] formats the arguments [arg1] to [argN] as warining according to
diff --git a/cparser/Elab.ml b/cparser/Elab.ml
index 01e745c3..ec79634a 100644
--- a/cparser/Elab.ml
+++ b/cparser/Elab.ml
@@ -395,48 +395,75 @@ let elab_float_constant f =
   in
   (v, ty)
 
-let elab_char_constant loc wide chars =
+let check_char_range loc ikind chars =
+  let max = Int64.shift_left 1L (sizeof_ikind ikind * 8) in
+  List.iter
+    (fun c ->
+      if c >= max then error loc "escape sequence 0x%LX is out of range" c)
+    chars
+
+let ikind_of_encoding = function
+  | EncNone -> IChar
+  | EncWide -> wchar_ikind()
+  | EncU16 -> IUShort
+  | EncU32 -> IUInt
+  | EncUTF8 -> IChar
+
+let elab_char_constant loc enc chars =
   let len = List.length chars in
-  let nbits = if wide then 8 * !config.sizeof_wchar else 8 in
-  let max_digit = Int64.shift_left 1L nbits in
-  (* Treat multi-character constants as a number in base 2^nbits.
-     It must fit in type int for a normal constant and in type wchar_t
-     for a wide constant. *)
+  (* We support multi-character constants for EncNone character literals only.
+     We treat them as big-endian numbers in base 256. *)
   let v =
-    if len > (if wide then 1 else !config.sizeof_int) then begin
-      error loc "%d-character constant too long for its type" len;
-      0L
-    end else
-      List.fold_left
-        (fun acc d ->
-          if d < 0L || d >= max_digit then
-            error loc "escape sequence is out of range (code 0x%LX)" d;
-          Int64.add (Int64.shift_left acc nbits) d)
-        0L chars in
-  (* C99 6.4.4.4 items 10 and 11:
-       single-character constant -> represent at type char
+    match chars, enc with
+    | [], _ -> error loc "empty character constant"; 0L
+    | [c], _ -> c
+    | _, EncNone ->
+      if len > !config.sizeof_int then begin
+        error loc "%d-character constant too long, overflows its type" len;
+        0L
+      end else begin
+        check_char_range loc IUChar chars;
+        List.fold_left
+          (fun acc d -> Int64.(add (shift_left acc 8) d))
+          0L chars
+      end
+    | _, _ ->
+      error loc "%d-character constant not supported" len; 0L in
+  (* C11 6.4.4.4 items 10 and 11:
+       normal single-character constant -> represent at type char
        multi-character constant -> represent at type int
-       wide character constant -> represent at type wchar_t *)
-  Ceval.normalize_int v
-    (if wide then wchar_ikind() else if len = 1 then IChar else IInt)
-
-let elab_string_literal loc wide chars =
-  let nbits = if wide then 8 * !config.sizeof_wchar else 8 in
-  let char_max = Int64.shift_left 1L nbits in
-  List.iter
-    (fun c ->
-      if c < 0L || c >= char_max
-      then error loc "escape sequence is out of range (code 0x%LX)" c)
-    chars;
-  if wide then
-    CWStr chars
-  else begin
-    let res = Bytes.create (List.length chars) in
-    List.iteri
-      (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c)))
-      chars;
-    CStr (Bytes.to_string res)
-  end
+       L character constant -> represent at type wchar_t
+       u character constant -> represent at type char16_t
+       U character constant -> represent at type char32_t *)
+  let ik =
+    if enc = EncNone && len > 1 then IInt else ikind_of_encoding enc in
+  let v' = Ceval.normalize_int v ik in
+  if v' <> v then
+    warning loc Constant_conversion
+      "overflow in character constant, changes value from %Ld to %Ld" v v';
+  v'
+
+let elab_string_literal loc enc chars =
+  let ik = ikind_of_encoding enc in
+  check_char_range loc ik chars;
+  match enc with
+  | EncNone | EncUTF8 ->
+      let res = Bytes.create (List.length chars) in
+      List.iteri
+        (fun i c -> Bytes.set res i (Char.unsafe_chr (Int64.to_int c)))
+        chars;
+      CStr (Bytes.to_string res)
+  | EncWide | EncU16 | EncU32 -> 
+      CWStr(chars, ik)
+
+let warn_C11_literals loc enc kind =
+  let warn enc =
+    warning loc Celeven_extension "'%s' %s are a C11 extension" enc kind in
+  match enc with
+  | EncNone | EncWide -> ()
+  | EncUTF8 -> warn "u8"
+  | EncU16 -> warn "u"
+  | EncU32 -> warn "U"
 
 let elab_constant loc = function
   | CONST_INT s ->
@@ -445,14 +472,22 @@ let elab_constant loc = function
   | CONST_FLOAT f ->
       let (v, fk) = elab_float_constant f in
       CFloat(v, fk)
-  | CONST_CHAR(wide, s) ->
-      let ikind = if wide then wchar_ikind () else IInt in
-      CInt(elab_char_constant loc wide s, ikind, "")
+  | CONST_CHAR(enc, s) ->
+      warn_C11_literals loc enc "character constants";
+      let ikind =
+        match enc with
+        | EncNone -> IInt
+        | EncWide -> wchar_ikind ()
+        | EncU16 -> IUShort
+        | EncU32 -> IUInt
+        | EncUTF8 -> assert false in
+      CInt(elab_char_constant loc enc s, ikind, "")
   | CONST_STRING(wide, s) ->
+      warn_C11_literals loc wide "string literals";
       elab_string_literal loc wide s
 
-let elab_simple_string loc wide chars =
-  match elab_string_literal loc wide chars with
+let elab_simple_string loc enc chars =
+  match elab_string_literal loc enc chars with
   | CStr s -> s
   | _ -> error loc "cannot use wide string literal in 'asm'"; ""
 
@@ -1586,6 +1621,7 @@ and elab_item zi item il =
      | COMPOUND_INIT [_, SINGLE_INIT(CONSTANT (CONST_STRING(w, s)))]),
     TArray(ty_elt, sz, _)
     when is_integer_type env ty_elt ->
+      warn_C11_literals loc w "string literals";
       begin match elab_string_literal loc w s, unroll env ty_elt with
       | CStr s, TInt((IChar | ISChar | IUChar), _) ->
           if not (I.index_below (Int64.of_int(String.length s - 1)) sz) then
@@ -1594,12 +1630,12 @@ and elab_item zi item il =
       | CStr _, _ ->
           error loc "initialization of an array of non-char elements with a string literal";
           elab_list zi il false
-      | CWStr s, TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(wchar_ikind(), [])) ->
+      | CWStr(s, ik), TInt(_, _) when compatible_types AttrIgnoreTop env ty_elt (TInt(ik, [])) ->
           if not (I.index_below (Int64.of_int(List.length s - 1)) sz) then
             warning loc Unnamed "initializer string for array of wide chars %s is too long" (I.name zi);
           elab_list (I.set zi (init_int_array_wstring sz s)) il false
       | CWStr _, _ ->
-          error loc "initialization of an array of non-wchar_t elements with a wide string literal";
+          error loc "type mismatch between array destination and wide string literal";
           elab_list zi il false
       | _ -> assert false
       end
diff --git a/cparser/Elab.mli b/cparser/Elab.mli
index bca4f74d..537056fa 100644
--- a/cparser/Elab.mli
+++ b/cparser/Elab.mli
@@ -21,6 +21,6 @@ val elab_file : Cabs.definition list -> C.program
 
 val elab_int_constant : Cabs.loc -> string -> int64 * C.ikind
 val elab_float_constant : Cabs.floatInfo -> C.float_cst * C.fkind
-val elab_char_constant : Cabs.loc -> bool -> int64 list -> int64
+val elab_char_constant : Cabs.loc -> Cabs.encoding -> int64 list -> int64
   (* These auxiliary functions are exported so that they can be reused
      in other projects that deal with C-style source languages. *)
diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll
index 9f7fba1e..2efa4216 100644
--- a/cparser/Lexer.mll
+++ b/cparser/Lexer.mll
@@ -3,6 +3,7 @@
 (*              The Compcert verified compiler                         *)
 (*                                                                     *)
 (*          Jacques-Henri Jourdan, INRIA Paris-Rocquencourt            *)
+(*             Xavier Leroy, Collège de France and Inria               *)
 (*                                                                     *)
 (*  Copyright Institut National de Recherche en Informatique et en     *)
 (*  Automatique.  All rights reserved.  This file is distributed       *)
@@ -144,9 +145,9 @@ let error lb fmt =
   Diagnostics.error
     (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) fmt
 
-let warning lb fmt =
+let warning lb kind fmt =
   Diagnostics.warning
-      (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) Diagnostics.Unnamed fmt
+      (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) kind fmt
 
 (* Simple character escapes *)
 
@@ -160,6 +161,89 @@ let convert_escape = function
   | 't' -> 9L  (* horizontal tab *)
   | 'v' -> 11L (* vertical tab *)
   | c   -> Int64.of_int (Char.code c)
+
+(* Encodings for character and string literals *)
+
+let encoding_of = function
+  | "" -> Cabs.EncNone
+  | "L" -> Cabs.EncWide
+  | "u" -> Cabs.EncU16
+  | "U" -> Cabs.EncU32
+  | "u8" -> Cabs.EncUTF8
+  | _ -> assert false
+
+let combine_encodings loc e1 e2 =
+  if e1 = Cabs.EncNone then e2
+  else if e2 = Cabs.EncNone then e1
+  else if e1 = e2 then e1
+  else Diagnostics.fatal_error
+           Cabs.(loc.filename, loc.lineno)
+           "unsupported non-standard concatenation of string literals"
+
+(* Handling of characters and escapes in string and char constants *)
+
+type chr = Chr of int | Esc of int64
+
+let check_utf8 lexbuf min x =
+  if x > 0x10FFFF || (x >= 0xD800 && x <= 0xDFFF) then
+    warning lexbuf Diagnostics.Invalid_UTF8 "Wrong Unicode value U+%X" x;
+  if x < min then
+    warning lexbuf Diagnostics.Invalid_UTF8
+            "Overlong UTF-8 encoding for Unicode value U+%X" x;
+  Chr x
+
+let check_universal_character lexbuf x =
+  if x > 0x10FFFF
+  || x >= 0xD800 && x <= 0xDFFF
+  || x < 0xA0 && x <> 0x24 && x <> 0x40 && x <> 0x60
+  then begin
+    error lexbuf "Wrong universal character name U+%X" x; Chr 0
+  end else
+    Chr x
+
+let add_char_utf8 x accu =
+  if x <= 0x007F then
+    Int64.of_int x :: accu
+  else if x <= 0x07FF then
+    Int64.of_int (0x80 lor (x land 0x3F)) ::
+    Int64.of_int (0xC0 lor (x lsr 6)) ::
+    accu
+  else if x <= 0xFFFF then
+    Int64.of_int (0x80 lor (x land 0x3F)) ::
+    Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) ::
+    Int64.of_int (0xE0 lor (x lsr 12)) ::
+    accu
+  else
+    Int64.of_int (0x80 lor (x land 0x3F)) ::
+    Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) ::
+    Int64.of_int (0x80 lor ((x lsr 12) land 0x3F)) ::
+    Int64.of_int (0xF0 lor (x lsr 18)) ::
+    accu
+
+let add_char_utf16 x accu =
+  if x <= 0xFFFF then
+    Int64.of_int x :: accu
+  else begin
+    let x = x - 0x10000 in
+    Int64.of_int (0xDC00 lor (x land 0x3FF)) ::
+    Int64.of_int (0xD800 lor (x lsr 10)) ::
+    accu
+  end
+
+let add_char enc c accu =
+  match c, enc with
+  | Esc x, _ -> (* Escapes are never encoded *)
+      x :: accu
+  | Chr x, (Cabs.EncNone | Cabs.EncUTF8) -> (* Characters are encoded in UTF8 *)
+      add_char_utf8 x accu
+  | Chr x, Cabs.EncU16 -> (* Characters are encoded in UTF16 *)
+      add_char_utf16 x accu
+  | Chr x, Cabs.EncU32 -> (* Characters are not encoded *)
+      Int64.of_int x :: accu
+  | Chr x, Cabs.EncWide -> (* Depends on size of wchar_t *)
+      if Machine.(!config.sizeof_wchar) = 2
+      then add_char_utf16 x accu
+      else Int64.of_int x :: accu
 }
 
 (* Identifiers *)
@@ -249,11 +333,6 @@ let octal_escape_sequence =
          | octal_digit octal_digit
          | octal_digit octal_digit octal_digit) as n)
 let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n)
-let escape_sequence =
-    simple_escape_sequence
-  | octal_escape_sequence
-  | hexadecimal_escape_sequence
-  | universal_character_name
 
 rule initial = parse
   | '\n'                          { new_line lexbuf; initial_linebegin lexbuf }
@@ -283,16 +362,13 @@ rule initial = parse
                                       currentLoc lexbuf)}
   | preprocessing_number as s     { error lexbuf "invalid numerical constant '%s'@ These characters form a preprocessor number, but not a constant" s;
                                     CONSTANT (Cabs.CONST_INT "0", currentLoc lexbuf) }
-  | "'"                           { let l = char_literal lexbuf.lex_start_p [] lexbuf in
-                                    CONSTANT (Cabs.CONST_CHAR(false, l),
-                                              currentLoc lexbuf) }
-  | "L'"                          { let l = char_literal lexbuf.lex_start_p [] lexbuf in
-                                    CONSTANT (Cabs.CONST_CHAR(true, l),
-                                              currentLoc lexbuf) }
-  | "\""                          { let l = string_literal lexbuf.lex_start_p [] lexbuf in
-                                    STRING_LITERAL(false, l, currentLoc lexbuf) }
-  | "L\""                         { let l = string_literal lexbuf.lex_start_p [] lexbuf in
-                                    STRING_LITERAL(true, l, currentLoc lexbuf) }
+  | (""|"L"|"u"|"U") as e "'"     { let enc = encoding_of e in
+                                    let l = char_literal lexbuf.lex_start_p [] lexbuf in
+                                    CONSTANT (Cabs.CONST_CHAR(enc, l), currentLoc lexbuf) }
+  | (""|"L"|"u"|"U"|"u8") as e "\""
+                                  { let enc = encoding_of e in
+                                    let l = string_literal lexbuf.lex_start_p enc [] lexbuf in
+                                    STRING_LITERAL(enc, l, currentLoc lexbuf) }
   | "..."                         { ELLIPSIS(currentLoc lexbuf) }
   | "+="                          { ADD_ASSIGN(currentLoc lexbuf) }
   | "-="                          { SUB_ASSIGN(currentLoc lexbuf) }
@@ -357,39 +433,62 @@ and initial_linebegin = parse
 and char = parse
   | universal_character_name
       { try
-          Int64.of_string ("0x" ^ n)
+          check_universal_character lexbuf (int_of_string ("0x" ^ n))
         with Failure _ ->
           error lexbuf "overflow in universal character name";
-          0L
+          Chr 0
       }
   | hexadecimal_escape_sequence
       { try
-          Int64.of_string ("0x" ^ n)
+          Esc (Int64.of_string ("0x" ^ n))
         with Failure _ ->
           error lexbuf "overflow in hexadecimal escape sequence";
-          0L
+          Esc 0L
       }
   | octal_escape_sequence
-      { Int64.of_string  ("0o" ^ n) }
+      { Esc (Int64.of_string  ("0o" ^ n)) }
   | simple_escape_sequence
-      { convert_escape c }
+      { Esc (convert_escape c) }
+  | "\\u" | "\\U"
+      { error lexbuf "incomplete universal character name";
+        Chr 0 }
   | '\\' (_ as c)
       { error lexbuf "incorrect escape sequence '\\%c'" c;
-        Int64.of_int (Char.code c) }
+        Esc (Int64.of_int (Char.code c)) }
+  | ['\x00'-'\x7F'] as c1
+      { Chr (Char.code c1) }
+  | (['\xC0'-'\xDF'] as c1) (['\x80'-'\xBF'] as c2)
+      { check_utf8 lexbuf 0x80
+          ( (Char.code c1 land 0b00011111) lsl 6
+          + (Char.code c2 land 0b00111111)) }
+  | (['\xE0'-'\xEF'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3)
+      { check_utf8 lexbuf 0x800
+          ( (Char.code c1 land 0b00001111) lsl 12
+          + (Char.code c2 land 0b00111111) lsl 6
+          + (Char.code c3 land 0b00111111) ) }
+  | (['\xF0'-'\xF7'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) (['\x80'-'\xBF'] as c4)
+     { check_utf8 lexbuf 0x10000
+          ( (Char.code c1 land 0b00000111) lsl 18
+          + (Char.code c2 land 0b00111111) lsl 12
+          + (Char.code c3 land 0b00111111) lsl 6
+          + (Char.code c4 land 0b00111111) ) }
   | _ as c
-      { Int64.of_int (Char.code c) }
+     { warning lexbuf Diagnostics.Invalid_UTF8
+               "Invalid UTF8 encoding: byte 0x%02x" (Char.code c);
+       Esc (Int64.of_int (Char.code c)) (* re-encode as-is *)
+     }
 
 and char_literal startp accu = parse
   | '\''       { lexbuf.lex_start_p <- startp;
                  List.rev accu }
   | '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" }
-  | ""         { let c = char lexbuf in char_literal startp (c :: accu) lexbuf }
+  | ""         { let c = char lexbuf in char_literal startp (add_char Cabs.EncU32 c accu) lexbuf }
 
-and string_literal startp accu = parse
+and string_literal startp enc accu = parse
   | '\"'       { lexbuf.lex_start_p <- startp;
                  List.rev accu }
   | '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" }
-  | ""         { let c = char lexbuf in string_literal startp (c :: accu) lexbuf }
+  | ""         { let c = char lexbuf in string_literal startp enc (add_char enc c accu) lexbuf }
 
 (* We assume gcc -E syntax but try to tolerate variations. *)
 and hash = parse
@@ -402,7 +501,8 @@ and hash = parse
           try
             int_of_string n
           with Failure _ ->
-            warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum
+            warning lexbuf Diagnostics.Unnamed "invalid line number";
+            lexbuf.lex_curr_p.pos_lnum
         in
         lexbuf.lex_curr_p <- {
           lexbuf.lex_curr_p with
@@ -417,7 +517,7 @@ and hash = parse
     ([^ '\n']* as s) '\n'
       { new_line lexbuf; PRAGMA (s, currentLoc lexbuf) }
   | [^ '\n']* '\n'
-      { warning lexbuf "unrecognized '#' line";
+      { warning lexbuf Diagnostics.Unnamed "unrecognized '#' line";
         new_line lexbuf; initial_linebegin lexbuf }
   | [^ '\n']* eof
       { fatal_error lexbuf "unexpected end of file" }
@@ -583,21 +683,21 @@ and singleline_comment = parse
       | Pre_parser.STAR loc -> loop (Parser.STAR loc)
       | Pre_parser.STATIC loc -> loop (Parser.STATIC loc)
       | Pre_parser.STATIC_ASSERT loc -> loop (Parser.STATIC_ASSERT loc)
-      | Pre_parser.STRING_LITERAL (wide, str, loc) ->
+      | Pre_parser.STRING_LITERAL (enc, str, loc) ->
           (* Merge consecutive string literals *)
-          let rec doConcat wide str =
+          let rec doConcat enc str =
             match Queue.peek tokens with
-            | Pre_parser.STRING_LITERAL (wide', str', loc) ->
+            | Pre_parser.STRING_LITERAL (enc', str', loc') ->
                ignore (Queue.pop tokens);
-               let (wide'', str'') = doConcat wide' str' in
+               let (enc'', str'') = doConcat enc' str' in
                if str'' <> []
-               then (wide || wide'', str @ str'')
-               else (wide, str)
-            | _ -> (wide, str)
-            | exception Queue.Empty -> (wide, str)
+               then (combine_encodings loc enc enc'', str @ str'')
+               else (enc, str)
+            | _ -> (enc, str)
+            | exception Queue.Empty -> (enc, str)
           in
-          let (wide', str') = doConcat wide str in
-          loop (Parser.STRING_LITERAL ((wide', str'), loc))
+          let (enc', str') = doConcat enc str in
+          loop (Parser.STRING_LITERAL ((enc', str'), loc))
       | Pre_parser.STRUCT loc -> loop (Parser.STRUCT loc)
       | Pre_parser.SUB_ASSIGN loc -> loop (Parser.SUB_ASSIGN loc)
       | Pre_parser.SWITCH loc -> loop (Parser.SWITCH loc)
diff --git a/cparser/Parser.vy b/cparser/Parser.vy
index 8d7cd055..12337367 100644
--- a/cparser/Parser.vy
+++ b/cparser/Parser.vy
@@ -23,7 +23,7 @@ Require Cabs.
 
 %token<Cabs.string * Cabs.loc> VAR_NAME TYPEDEF_NAME OTHER_NAME
 %token<Cabs.string * Cabs.loc> PRAGMA
-%token<bool * list Cabs.char_code * Cabs.loc> STRING_LITERAL
+%token<Cabs.encoding * list Cabs.char_code * Cabs.loc> STRING_LITERAL
 %token<Cabs.constant * Cabs.loc> CONSTANT
 %token<Cabs.loc> SIZEOF PTR INC DEC LEFT RIGHT LEQ GEQ EQEQ EQ NEQ LT GT
   ANDAND BARBAR PLUS MINUS STAR TILDE BANG SLASH PERCENT HAT BAR QUESTION
@@ -124,8 +124,8 @@ primary_expression:
 | cst = CONSTANT
     { (Cabs.CONSTANT (fst cst), snd cst) }
 | str = STRING_LITERAL
-    { let '((wide, chars), loc) := str in
-      (Cabs.CONSTANT (Cabs.CONST_STRING wide chars), loc) }
+    { let '((enc, chars), loc) := str in
+      (Cabs.CONSTANT (Cabs.CONST_STRING enc chars), loc) }
 | loc = LPAREN expr = expression RPAREN
     { (fst expr, loc)}
 | sel = generic_selection
@@ -786,8 +786,8 @@ designator:
 static_assert_declaration:
 | loc = STATIC_ASSERT LPAREN expr = constant_expression
                         COMMA str = STRING_LITERAL RPAREN SEMICOLON
-    { let '((wide, chars), locs) := str in
-      (expr, (Cabs.CONST_STRING wide chars, locs), loc) }
+    { let '((enc, chars), locs) := str in
+      (expr, (Cabs.CONST_STRING enc chars, locs), loc) }
 
 (* 6.8 *)
 statement_dangerous:
@@ -922,9 +922,9 @@ jump_statement:
 asm_statement:
 | loc = ASM attr = asm_attributes LPAREN template = STRING_LITERAL args = asm_arguments
   RPAREN SEMICOLON
-    { let '(wide, chars, _) := template in
+    { let '(enc, chars, _) := template in
       let '(outputs, inputs, flags) := args in
-      Cabs.ASM attr wide chars outputs inputs flags loc }
+      Cabs.ASM attr enc chars outputs inputs flags loc }
 
 asm_attributes:
 | /* empty */
@@ -954,7 +954,7 @@ asm_operands_ne:
 
 asm_operand:
 | n = asm_op_name cstr = STRING_LITERAL LPAREN e = expression RPAREN
-    { let '(wide, s, loc) := cstr in Cabs.ASMOPERAND n wide s (fst e) }
+    { let '(enc, s, loc) := cstr in Cabs.ASMOPERAND n enc s (fst e) }
 
 asm_op_name:
 | /* empty */                         { None }
@@ -962,9 +962,9 @@ asm_op_name:
 
 asm_flags:
 | f = STRING_LITERAL
-    { let '(wide, s, loc) := f in (wide, s) :: nil }
+    { let '(enc, s, loc) := f in (enc, s) :: nil }
 | f = STRING_LITERAL COMMA fl = asm_flags
-    { let '(wide, s, loc) := f in (wide, s) :: fl }
+    { let '(enc, s, loc) := f in (enc, s) :: fl }
 
 (* 6.9 *)
 translation_unit_file:
diff --git a/cparser/pre_parser.mly b/cparser/pre_parser.mly
index ad294398..00ca0ade 100644
--- a/cparser/pre_parser.mly
+++ b/cparser/pre_parser.mly
@@ -47,7 +47,7 @@
 %token<string * Pre_parser_aux.identifier_type ref * Cabs.loc>
   VAR_NAME TYPEDEF_NAME
 %token<Cabs.constant * Cabs.loc> CONSTANT
-%token<bool * int64 list * Cabs.loc> STRING_LITERAL
+%token<Cabs.encoding * int64 list * Cabs.loc> STRING_LITERAL
 %token<string * Cabs.loc> PRAGMA
 
 %token<Cabs.loc> SIZEOF PTR INC DEC LEFT RIGHT LEQ GEQ EQEQ EQ NEQ LT GT