aboutsummaryrefslogtreecommitdiffstats
path: root/cparser/Lexer.mll
diff options
context:
space:
mode:
Diffstat (limited to 'cparser/Lexer.mll')
-rw-r--r--cparser/Lexer.mll180
1 files changed, 140 insertions, 40 deletions
diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll
index 9f7fba1e..2efa4216 100644
--- a/cparser/Lexer.mll
+++ b/cparser/Lexer.mll
@@ -3,6 +3,7 @@
(* The Compcert verified compiler *)
(* *)
(* Jacques-Henri Jourdan, INRIA Paris-Rocquencourt *)
+(* Xavier Leroy, Collège de France and Inria *)
(* *)
(* Copyright Institut National de Recherche en Informatique et en *)
(* Automatique. All rights reserved. This file is distributed *)
@@ -144,9 +145,9 @@ let error lb fmt =
Diagnostics.error
(lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) fmt
-let warning lb fmt =
+let warning lb kind fmt =
Diagnostics.warning
- (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) Diagnostics.Unnamed fmt
+ (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) kind fmt
(* Simple character escapes *)
@@ -160,6 +161,89 @@ let convert_escape = function
| 't' -> 9L (* horizontal tab *)
| 'v' -> 11L (* vertical tab *)
| c -> Int64.of_int (Char.code c)
+
+(* Encodings for character and string literals *)
+
+let encoding_of = function
+ | "" -> Cabs.EncNone
+ | "L" -> Cabs.EncWide
+ | "u" -> Cabs.EncU16
+ | "U" -> Cabs.EncU32
+ | "u8" -> Cabs.EncUTF8
+ | _ -> assert false
+
+let combine_encodings loc e1 e2 =
+ if e1 = Cabs.EncNone then e2
+ else if e2 = Cabs.EncNone then e1
+ else if e1 = e2 then e1
+ else Diagnostics.fatal_error
+ Cabs.(loc.filename, loc.lineno)
+ "unsupported non-standard concatenation of string literals"
+
+(* Handling of characters and escapes in string and char constants *)
+
+type chr = Chr of int | Esc of int64
+
+let check_utf8 lexbuf min x =
+ if x > 0x10FFFF || (x >= 0xD800 && x <= 0xDFFF) then
+ warning lexbuf Diagnostics.Invalid_UTF8 "Wrong Unicode value U+%X" x;
+ if x < min then
+ warning lexbuf Diagnostics.Invalid_UTF8
+ "Overlong UTF-8 encoding for Unicode value U+%X" x;
+ Chr x
+
+let check_universal_character lexbuf x =
+ if x > 0x10FFFF
+ || x >= 0xD800 && x <= 0xDFFF
+ || x < 0xA0 && x <> 0x24 && x <> 0x40 && x <> 0x60
+ then begin
+ error lexbuf "Wrong universal character name U+%X" x; Chr 0
+ end else
+ Chr x
+
+let add_char_utf8 x accu =
+ if x <= 0x007F then
+ Int64.of_int x :: accu
+ else if x <= 0x07FF then
+ Int64.of_int (0x80 lor (x land 0x3F)) ::
+ Int64.of_int (0xC0 lor (x lsr 6)) ::
+ accu
+ else if x <= 0xFFFF then
+ Int64.of_int (0x80 lor (x land 0x3F)) ::
+ Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) ::
+ Int64.of_int (0xE0 lor (x lsr 12)) ::
+ accu
+ else
+ Int64.of_int (0x80 lor (x land 0x3F)) ::
+ Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) ::
+ Int64.of_int (0x80 lor ((x lsr 12) land 0x3F)) ::
+ Int64.of_int (0xF0 lor (x lsr 18)) ::
+ accu
+
+let add_char_utf16 x accu =
+ if x <= 0xFFFF then
+ Int64.of_int x :: accu
+ else begin
+ let x = x - 0x10000 in
+ Int64.of_int (0xDC00 lor (x land 0x3FF)) ::
+ Int64.of_int (0xD800 lor (x lsr 10)) ::
+ accu
+ end
+
+let add_char enc c accu =
+ match c, enc with
+ | Esc x, _ -> (* Escapes are never encoded *)
+ x :: accu
+ | Chr x, (Cabs.EncNone | Cabs.EncUTF8) -> (* Characters are encoded in UTF8 *)
+ add_char_utf8 x accu
+ | Chr x, Cabs.EncU16 -> (* Characters are encoded in UTF16 *)
+ add_char_utf16 x accu
+ | Chr x, Cabs.EncU32 -> (* Characters are not encoded *)
+ Int64.of_int x :: accu
+ | Chr x, Cabs.EncWide -> (* Depends on size of wchar_t *)
+ if Machine.(!config.sizeof_wchar) = 2
+ then add_char_utf16 x accu
+ else Int64.of_int x :: accu
}
(* Identifiers *)
@@ -249,11 +333,6 @@ let octal_escape_sequence =
| octal_digit octal_digit
| octal_digit octal_digit octal_digit) as n)
let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n)
-let escape_sequence =
- simple_escape_sequence
- | octal_escape_sequence
- | hexadecimal_escape_sequence
- | universal_character_name
rule initial = parse
| '\n' { new_line lexbuf; initial_linebegin lexbuf }
@@ -283,16 +362,13 @@ rule initial = parse
currentLoc lexbuf)}
| preprocessing_number as s { error lexbuf "invalid numerical constant '%s'@ These characters form a preprocessor number, but not a constant" s;
CONSTANT (Cabs.CONST_INT "0", currentLoc lexbuf) }
- | "'" { let l = char_literal lexbuf.lex_start_p [] lexbuf in
- CONSTANT (Cabs.CONST_CHAR(false, l),
- currentLoc lexbuf) }
- | "L'" { let l = char_literal lexbuf.lex_start_p [] lexbuf in
- CONSTANT (Cabs.CONST_CHAR(true, l),
- currentLoc lexbuf) }
- | "\"" { let l = string_literal lexbuf.lex_start_p [] lexbuf in
- STRING_LITERAL(false, l, currentLoc lexbuf) }
- | "L\"" { let l = string_literal lexbuf.lex_start_p [] lexbuf in
- STRING_LITERAL(true, l, currentLoc lexbuf) }
+ | (""|"L"|"u"|"U") as e "'" { let enc = encoding_of e in
+ let l = char_literal lexbuf.lex_start_p [] lexbuf in
+ CONSTANT (Cabs.CONST_CHAR(enc, l), currentLoc lexbuf) }
+ | (""|"L"|"u"|"U"|"u8") as e "\""
+ { let enc = encoding_of e in
+ let l = string_literal lexbuf.lex_start_p enc [] lexbuf in
+ STRING_LITERAL(enc, l, currentLoc lexbuf) }
| "..." { ELLIPSIS(currentLoc lexbuf) }
| "+=" { ADD_ASSIGN(currentLoc lexbuf) }
| "-=" { SUB_ASSIGN(currentLoc lexbuf) }
@@ -357,39 +433,62 @@ and initial_linebegin = parse
and char = parse
| universal_character_name
{ try
- Int64.of_string ("0x" ^ n)
+ check_universal_character lexbuf (int_of_string ("0x" ^ n))
with Failure _ ->
error lexbuf "overflow in universal character name";
- 0L
+ Chr 0
}
| hexadecimal_escape_sequence
{ try
- Int64.of_string ("0x" ^ n)
+ Esc (Int64.of_string ("0x" ^ n))
with Failure _ ->
error lexbuf "overflow in hexadecimal escape sequence";
- 0L
+ Esc 0L
}
| octal_escape_sequence
- { Int64.of_string ("0o" ^ n) }
+ { Esc (Int64.of_string ("0o" ^ n)) }
| simple_escape_sequence
- { convert_escape c }
+ { Esc (convert_escape c) }
+ | "\\u" | "\\U"
+ { error lexbuf "incomplete universal character name";
+ Chr 0 }
| '\\' (_ as c)
{ error lexbuf "incorrect escape sequence '\\%c'" c;
- Int64.of_int (Char.code c) }
+ Esc (Int64.of_int (Char.code c)) }
+ | ['\x00'-'\x7F'] as c1
+ { Chr (Char.code c1) }
+ | (['\xC0'-'\xDF'] as c1) (['\x80'-'\xBF'] as c2)
+ { check_utf8 lexbuf 0x80
+ ( (Char.code c1 land 0b00011111) lsl 6
+ + (Char.code c2 land 0b00111111)) }
+ | (['\xE0'-'\xEF'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3)
+ { check_utf8 lexbuf 0x800
+ ( (Char.code c1 land 0b00001111) lsl 12
+ + (Char.code c2 land 0b00111111) lsl 6
+ + (Char.code c3 land 0b00111111) ) }
+ | (['\xF0'-'\xF7'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) (['\x80'-'\xBF'] as c4)
+ { check_utf8 lexbuf 0x10000
+ ( (Char.code c1 land 0b00000111) lsl 18
+ + (Char.code c2 land 0b00111111) lsl 12
+ + (Char.code c3 land 0b00111111) lsl 6
+ + (Char.code c4 land 0b00111111) ) }
| _ as c
- { Int64.of_int (Char.code c) }
+ { warning lexbuf Diagnostics.Invalid_UTF8
+ "Invalid UTF8 encoding: byte 0x%02x" (Char.code c);
+ Esc (Int64.of_int (Char.code c)) (* re-encode as-is *)
+ }
and char_literal startp accu = parse
| '\'' { lexbuf.lex_start_p <- startp;
List.rev accu }
| '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" }
- | "" { let c = char lexbuf in char_literal startp (c :: accu) lexbuf }
+ | "" { let c = char lexbuf in char_literal startp (add_char Cabs.EncU32 c accu) lexbuf }
-and string_literal startp accu = parse
+and string_literal startp enc accu = parse
| '\"' { lexbuf.lex_start_p <- startp;
List.rev accu }
| '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" }
- | "" { let c = char lexbuf in string_literal startp (c :: accu) lexbuf }
+ | "" { let c = char lexbuf in string_literal startp enc (add_char enc c accu) lexbuf }
(* We assume gcc -E syntax but try to tolerate variations. *)
and hash = parse
@@ -402,7 +501,8 @@ and hash = parse
try
int_of_string n
with Failure _ ->
- warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum
+ warning lexbuf Diagnostics.Unnamed "invalid line number";
+ lexbuf.lex_curr_p.pos_lnum
in
lexbuf.lex_curr_p <- {
lexbuf.lex_curr_p with
@@ -417,7 +517,7 @@ and hash = parse
([^ '\n']* as s) '\n'
{ new_line lexbuf; PRAGMA (s, currentLoc lexbuf) }
| [^ '\n']* '\n'
- { warning lexbuf "unrecognized '#' line";
+ { warning lexbuf Diagnostics.Unnamed "unrecognized '#' line";
new_line lexbuf; initial_linebegin lexbuf }
| [^ '\n']* eof
{ fatal_error lexbuf "unexpected end of file" }
@@ -583,21 +683,21 @@ and singleline_comment = parse
| Pre_parser.STAR loc -> loop (Parser.STAR loc)
| Pre_parser.STATIC loc -> loop (Parser.STATIC loc)
| Pre_parser.STATIC_ASSERT loc -> loop (Parser.STATIC_ASSERT loc)
- | Pre_parser.STRING_LITERAL (wide, str, loc) ->
+ | Pre_parser.STRING_LITERAL (enc, str, loc) ->
(* Merge consecutive string literals *)
- let rec doConcat wide str =
+ let rec doConcat enc str =
match Queue.peek tokens with
- | Pre_parser.STRING_LITERAL (wide', str', loc) ->
+ | Pre_parser.STRING_LITERAL (enc', str', loc') ->
ignore (Queue.pop tokens);
- let (wide'', str'') = doConcat wide' str' in
+ let (enc'', str'') = doConcat enc' str' in
if str'' <> []
- then (wide || wide'', str @ str'')
- else (wide, str)
- | _ -> (wide, str)
- | exception Queue.Empty -> (wide, str)
+ then (combine_encodings loc enc enc'', str @ str'')
+ else (enc, str)
+ | _ -> (enc, str)
+ | exception Queue.Empty -> (enc, str)
in
- let (wide', str') = doConcat wide str in
- loop (Parser.STRING_LITERAL ((wide', str'), loc))
+ let (enc', str') = doConcat enc str in
+ loop (Parser.STRING_LITERAL ((enc', str'), loc))
| Pre_parser.STRUCT loc -> loop (Parser.STRUCT loc)
| Pre_parser.SUB_ASSIGN loc -> loop (Parser.SUB_ASSIGN loc)
| Pre_parser.SWITCH loc -> loop (Parser.SWITCH loc)