diff options
Diffstat (limited to 'cparser/Lexer.mll')
-rw-r--r-- | cparser/Lexer.mll | 180 |
1 files changed, 140 insertions, 40 deletions
diff --git a/cparser/Lexer.mll b/cparser/Lexer.mll index 9f7fba1e..2efa4216 100644 --- a/cparser/Lexer.mll +++ b/cparser/Lexer.mll @@ -3,6 +3,7 @@ (* The Compcert verified compiler *) (* *) (* Jacques-Henri Jourdan, INRIA Paris-Rocquencourt *) +(* Xavier Leroy, Collège de France and Inria *) (* *) (* Copyright Institut National de Recherche en Informatique et en *) (* Automatique. All rights reserved. This file is distributed *) @@ -144,9 +145,9 @@ let error lb fmt = Diagnostics.error (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) fmt -let warning lb fmt = +let warning lb kind fmt = Diagnostics.warning - (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) Diagnostics.Unnamed fmt + (lb.lex_curr_p.pos_fname,lb.lex_curr_p.pos_lnum) kind fmt (* Simple character escapes *) @@ -160,6 +161,89 @@ let convert_escape = function | 't' -> 9L (* horizontal tab *) | 'v' -> 11L (* vertical tab *) | c -> Int64.of_int (Char.code c) + +(* Encodings for character and string literals *) + +let encoding_of = function + | "" -> Cabs.EncNone + | "L" -> Cabs.EncWide + | "u" -> Cabs.EncU16 + | "U" -> Cabs.EncU32 + | "u8" -> Cabs.EncUTF8 + | _ -> assert false + +let combine_encodings loc e1 e2 = + if e1 = Cabs.EncNone then e2 + else if e2 = Cabs.EncNone then e1 + else if e1 = e2 then e1 + else Diagnostics.fatal_error + Cabs.(loc.filename, loc.lineno) + "unsupported non-standard concatenation of string literals" + +(* Handling of characters and escapes in string and char constants *) + +type chr = Chr of int | Esc of int64 + +let check_utf8 lexbuf min x = + if x > 0x10FFFF || (x >= 0xD800 && x <= 0xDFFF) then + warning lexbuf Diagnostics.Invalid_UTF8 "Wrong Unicode value U+%X" x; + if x < min then + warning lexbuf Diagnostics.Invalid_UTF8 + "Overlong UTF-8 encoding for Unicode value U+%X" x; + Chr x + +let check_universal_character lexbuf x = + if x > 0x10FFFF + || x >= 0xD800 && x <= 0xDFFF + || x < 0xA0 && x <> 0x24 && x <> 0x40 && x <> 0x60 + then begin + error lexbuf "Wrong universal character name U+%X" x; Chr 0 + end else + Chr x + +let add_char_utf8 x accu = + if x <= 0x007F then + Int64.of_int x :: accu + else if x <= 0x07FF then + Int64.of_int (0x80 lor (x land 0x3F)) :: + Int64.of_int (0xC0 lor (x lsr 6)) :: + accu + else if x <= 0xFFFF then + Int64.of_int (0x80 lor (x land 0x3F)) :: + Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) :: + Int64.of_int (0xE0 lor (x lsr 12)) :: + accu + else + Int64.of_int (0x80 lor (x land 0x3F)) :: + Int64.of_int (0x80 lor ((x lsr 6) land 0x3F)) :: + Int64.of_int (0x80 lor ((x lsr 12) land 0x3F)) :: + Int64.of_int (0xF0 lor (x lsr 18)) :: + accu + +let add_char_utf16 x accu = + if x <= 0xFFFF then + Int64.of_int x :: accu + else begin + let x = x - 0x10000 in + Int64.of_int (0xDC00 lor (x land 0x3FF)) :: + Int64.of_int (0xD800 lor (x lsr 10)) :: + accu + end + +let add_char enc c accu = + match c, enc with + | Esc x, _ -> (* Escapes are never encoded *) + x :: accu + | Chr x, (Cabs.EncNone | Cabs.EncUTF8) -> (* Characters are encoded in UTF8 *) + add_char_utf8 x accu + | Chr x, Cabs.EncU16 -> (* Characters are encoded in UTF16 *) + add_char_utf16 x accu + | Chr x, Cabs.EncU32 -> (* Characters are not encoded *) + Int64.of_int x :: accu + | Chr x, Cabs.EncWide -> (* Depends on size of wchar_t *) + if Machine.(!config.sizeof_wchar) = 2 + then add_char_utf16 x accu + else Int64.of_int x :: accu } (* Identifiers *) @@ -249,11 +333,6 @@ let octal_escape_sequence = | octal_digit octal_digit | octal_digit octal_digit octal_digit) as n) let hexadecimal_escape_sequence = "\\x" (hexadecimal_digit+ as n) -let escape_sequence = - simple_escape_sequence - | octal_escape_sequence - | hexadecimal_escape_sequence - | universal_character_name rule initial = parse | '\n' { new_line lexbuf; initial_linebegin lexbuf } @@ -283,16 +362,13 @@ rule initial = parse currentLoc lexbuf)} | preprocessing_number as s { error lexbuf "invalid numerical constant '%s'@ These characters form a preprocessor number, but not a constant" s; CONSTANT (Cabs.CONST_INT "0", currentLoc lexbuf) } - | "'" { let l = char_literal lexbuf.lex_start_p [] lexbuf in - CONSTANT (Cabs.CONST_CHAR(false, l), - currentLoc lexbuf) } - | "L'" { let l = char_literal lexbuf.lex_start_p [] lexbuf in - CONSTANT (Cabs.CONST_CHAR(true, l), - currentLoc lexbuf) } - | "\"" { let l = string_literal lexbuf.lex_start_p [] lexbuf in - STRING_LITERAL(false, l, currentLoc lexbuf) } - | "L\"" { let l = string_literal lexbuf.lex_start_p [] lexbuf in - STRING_LITERAL(true, l, currentLoc lexbuf) } + | (""|"L"|"u"|"U") as e "'" { let enc = encoding_of e in + let l = char_literal lexbuf.lex_start_p [] lexbuf in + CONSTANT (Cabs.CONST_CHAR(enc, l), currentLoc lexbuf) } + | (""|"L"|"u"|"U"|"u8") as e "\"" + { let enc = encoding_of e in + let l = string_literal lexbuf.lex_start_p enc [] lexbuf in + STRING_LITERAL(enc, l, currentLoc lexbuf) } | "..." { ELLIPSIS(currentLoc lexbuf) } | "+=" { ADD_ASSIGN(currentLoc lexbuf) } | "-=" { SUB_ASSIGN(currentLoc lexbuf) } @@ -357,39 +433,62 @@ and initial_linebegin = parse and char = parse | universal_character_name { try - Int64.of_string ("0x" ^ n) + check_universal_character lexbuf (int_of_string ("0x" ^ n)) with Failure _ -> error lexbuf "overflow in universal character name"; - 0L + Chr 0 } | hexadecimal_escape_sequence { try - Int64.of_string ("0x" ^ n) + Esc (Int64.of_string ("0x" ^ n)) with Failure _ -> error lexbuf "overflow in hexadecimal escape sequence"; - 0L + Esc 0L } | octal_escape_sequence - { Int64.of_string ("0o" ^ n) } + { Esc (Int64.of_string ("0o" ^ n)) } | simple_escape_sequence - { convert_escape c } + { Esc (convert_escape c) } + | "\\u" | "\\U" + { error lexbuf "incomplete universal character name"; + Chr 0 } | '\\' (_ as c) { error lexbuf "incorrect escape sequence '\\%c'" c; - Int64.of_int (Char.code c) } + Esc (Int64.of_int (Char.code c)) } + | ['\x00'-'\x7F'] as c1 + { Chr (Char.code c1) } + | (['\xC0'-'\xDF'] as c1) (['\x80'-'\xBF'] as c2) + { check_utf8 lexbuf 0x80 + ( (Char.code c1 land 0b00011111) lsl 6 + + (Char.code c2 land 0b00111111)) } + | (['\xE0'-'\xEF'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) + { check_utf8 lexbuf 0x800 + ( (Char.code c1 land 0b00001111) lsl 12 + + (Char.code c2 land 0b00111111) lsl 6 + + (Char.code c3 land 0b00111111) ) } + | (['\xF0'-'\xF7'] as c1) (['\x80'-'\xBF'] as c2) (['\x80'-'\xBF'] as c3) (['\x80'-'\xBF'] as c4) + { check_utf8 lexbuf 0x10000 + ( (Char.code c1 land 0b00000111) lsl 18 + + (Char.code c2 land 0b00111111) lsl 12 + + (Char.code c3 land 0b00111111) lsl 6 + + (Char.code c4 land 0b00111111) ) } | _ as c - { Int64.of_int (Char.code c) } + { warning lexbuf Diagnostics.Invalid_UTF8 + "Invalid UTF8 encoding: byte 0x%02x" (Char.code c); + Esc (Int64.of_int (Char.code c)) (* re-encode as-is *) + } and char_literal startp accu = parse | '\'' { lexbuf.lex_start_p <- startp; List.rev accu } | '\n' | eof { fatal_error lexbuf "missing terminating \"'\" character" } - | "" { let c = char lexbuf in char_literal startp (c :: accu) lexbuf } + | "" { let c = char lexbuf in char_literal startp (add_char Cabs.EncU32 c accu) lexbuf } -and string_literal startp accu = parse +and string_literal startp enc accu = parse | '\"' { lexbuf.lex_start_p <- startp; List.rev accu } | '\n' | eof { fatal_error lexbuf "missing terminating '\"' character" } - | "" { let c = char lexbuf in string_literal startp (c :: accu) lexbuf } + | "" { let c = char lexbuf in string_literal startp enc (add_char enc c accu) lexbuf } (* We assume gcc -E syntax but try to tolerate variations. *) and hash = parse @@ -402,7 +501,8 @@ and hash = parse try int_of_string n with Failure _ -> - warning lexbuf "invalid line number"; lexbuf.lex_curr_p.pos_lnum + warning lexbuf Diagnostics.Unnamed "invalid line number"; + lexbuf.lex_curr_p.pos_lnum in lexbuf.lex_curr_p <- { lexbuf.lex_curr_p with @@ -417,7 +517,7 @@ and hash = parse ([^ '\n']* as s) '\n' { new_line lexbuf; PRAGMA (s, currentLoc lexbuf) } | [^ '\n']* '\n' - { warning lexbuf "unrecognized '#' line"; + { warning lexbuf Diagnostics.Unnamed "unrecognized '#' line"; new_line lexbuf; initial_linebegin lexbuf } | [^ '\n']* eof { fatal_error lexbuf "unexpected end of file" } @@ -583,21 +683,21 @@ and singleline_comment = parse | Pre_parser.STAR loc -> loop (Parser.STAR loc) | Pre_parser.STATIC loc -> loop (Parser.STATIC loc) | Pre_parser.STATIC_ASSERT loc -> loop (Parser.STATIC_ASSERT loc) - | Pre_parser.STRING_LITERAL (wide, str, loc) -> + | Pre_parser.STRING_LITERAL (enc, str, loc) -> (* Merge consecutive string literals *) - let rec doConcat wide str = + let rec doConcat enc str = match Queue.peek tokens with - | Pre_parser.STRING_LITERAL (wide', str', loc) -> + | Pre_parser.STRING_LITERAL (enc', str', loc') -> ignore (Queue.pop tokens); - let (wide'', str'') = doConcat wide' str' in + let (enc'', str'') = doConcat enc' str' in if str'' <> [] - then (wide || wide'', str @ str'') - else (wide, str) - | _ -> (wide, str) - | exception Queue.Empty -> (wide, str) + then (combine_encodings loc enc enc'', str @ str'') + else (enc, str) + | _ -> (enc, str) + | exception Queue.Empty -> (enc, str) in - let (wide', str') = doConcat wide str in - loop (Parser.STRING_LITERAL ((wide', str'), loc)) + let (enc', str') = doConcat enc str in + loop (Parser.STRING_LITERAL ((enc', str'), loc)) | Pre_parser.STRUCT loc -> loop (Parser.STRUCT loc) | Pre_parser.SUB_ASSIGN loc -> loop (Parser.SUB_ASSIGN loc) | Pre_parser.SWITCH loc -> loop (Parser.SWITCH loc) |