From 16fc8b07cb8dbbabad0b665b9114925f4349cf38 Mon Sep 17 00:00:00 2001 From: xleroy Date: Thu, 5 Jun 2014 06:50:10 +0000 Subject: Cleaner, more resilient parsing of pragmas. git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2507 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e --- cfrontend/CPragmas.ml | 80 ++++++++++++++++----------------------------------- lib/Tokenize.mli | 33 +++++++++++++++++++++ lib/Tokenize.mll | 45 +++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 55 deletions(-) create mode 100644 lib/Tokenize.mli create mode 100644 lib/Tokenize.mll diff --git a/cfrontend/CPragmas.ml b/cfrontend/CPragmas.ml index 3c0c9f15..c0746322 100644 --- a/cfrontend/CPragmas.ml +++ b/cfrontend/CPragmas.ml @@ -41,10 +41,12 @@ let process_section_pragma classname istring ustring addrmode accmode = let re_c_ident = Str.regexp "[A-Za-z_][A-Za-z_0-9]*$" let process_use_section_pragma classname id = - if not (Str.string_match re_c_ident id 0) then - C2C.error (sprintf "bad identifier `%s' in #pragma use_section" id); - if not (Sections.use_section_for (intern_string id) classname) then - C2C.error (sprintf "unknown section name `%s'" classname) + if id = "," || id = ";" then () else begin + if not (Str.string_match re_c_ident id 0) then + C2C.error (sprintf "bad identifier `%s' in #pragma use_section" id); + if not (Sections.use_section_for (intern_string id) classname) then + C2C.error (sprintf "unknown section name `%s'" classname) + end (* #pragma reserve_register *) @@ -58,59 +60,27 @@ let process_reserve_register_pragma name = else C2C.error "cannot reserve this register (not a callee-save)" -(* Parsing of pragmas using regexps *) - -let re_start_pragma_section = Str.regexp "section\\b" - -let re_pragma_section = Str.regexp( - "section[ \t]+" -^ "\\([A-Za-z_][A-Za-z_0-9]*\\)[ \t]+" (* class_name *) -^ "\"\\([^\"]*\\)\"?[ \t]*" (* istring *) -^ "\"\\([^\"]*\\)\"?[ \t]*" (* ustring *) -^ "\\([a-zA-Z-]+\\)?[ \t]*" (* addressing mode *) -^ "\\([RWXON]*\\)" (* access mode *) -) - -let re_start_pragma_use_section = Str.regexp "use_section\\b" - -let re_pragma_use_section = Str.regexp - "use_section[ \t]+\ - \\([A-Za-z_][A-Za-z_0-9]*\\)[ \t]+\ - \\(.*\\)$" - -let re_split_idents = Str.regexp "[ \t,]+" - -let re_start_pragma_reserve_register = Str.regexp "reserve_register\\b" - -let re_pragma_reserve_register = Str.regexp - "reserve_register[ \t]+\\([A-Za-z0-9]+\\)" +(* Parsing of pragmas *) let process_pragma name = - if Str.string_match re_pragma_section name 0 then begin - process_section_pragma - (Str.matched_group 1 name) (* classname *) - (Str.matched_group 2 name) (* istring *) - (Str.matched_group 3 name) (* ustring *) - (Str.matched_group 4 name) (* addrmode *) - (Str.matched_group 5 name); (* accmode *) - true - end else if Str.string_match re_start_pragma_section name 0 then - (C2C.error "ill-formed `section' pragma"; true) - else if Str.string_match re_pragma_use_section name 0 then begin - let classname = Str.matched_group 1 name - and idents = Str.matched_group 2 name in - let identlist = Str.split re_split_idents idents in - if identlist = [] then C2C.warning "vacuous `use_section' pragma"; - List.iter (process_use_section_pragma classname) identlist; - true - end else if Str.string_match re_start_pragma_use_section name 0 then begin - C2C.error "ill-formed `use_section' pragma"; true - end else if Str.string_match re_pragma_reserve_register name 0 then begin - process_reserve_register_pragma (Str.matched_group 1 name); true - end else if Str.string_match re_start_pragma_reserve_register name 0 then begin - C2C.error "ill-formed `reserve_register' pragma"; true - end else - false + match Tokenize.string name with + | ["section"; classname; istring; ustring; addrmode; accmode] -> + process_section_pragma classname istring ustring addrmode accmode; + true + | "section" :: _ -> + C2C.error "ill-formed `section' pragma"; true + | "use_section" :: classname :: identlist -> + if identlist = [] then C2C.warning "vacuous `use_section' pragma"; + List.iter (process_use_section_pragma classname) identlist; + true + | "use_section" :: _ -> + C2C.error "ill-formed `use_section' pragma"; true + | ["reserve_register"; reg] -> + process_reserve_register_pragma reg; true + | "reserve_register" :: _ -> + C2C.error "ill-formed `reserve_register' pragma"; true + | _ -> + false let initialize () = C2C.process_pragma_hook := process_pragma diff --git a/lib/Tokenize.mli b/lib/Tokenize.mli new file mode 100644 index 00000000..a9f22c4d --- /dev/null +++ b/lib/Tokenize.mli @@ -0,0 +1,33 @@ +(* *********************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Xavier Leroy, INRIA Paris-Rocquencourt *) +(* *) +(* Copyright Institut National de Recherche en Informatique et en *) +(* Automatique. All rights reserved. This file is distributed *) +(* under the terms of the GNU General Public License as published by *) +(* the Free Software Foundation, either version 2 of the License, or *) +(* (at your option) any later version. This file is also distributed *) +(* under the terms of the INRIA Non-Commercial License Agreement. *) +(* *) +(* *********************************************************************) + +(* Parse a string as a list of tokens *) + +val string: string -> string list + (** [Tokenize.string s] decomposes [s] into a list of tokens. + Whitespace separates tokens. The following substrings + constitute tokens: + - A string enclosed in double quotes. Within the string, + the escape sequences '\t' '\n' '\"' and '\\' are recognized. + The token value is the contents of the string without the + enclosing double quotes. + - A string enclosed in single quotes. No escape sequences are + recognized. The token value is the contents of the string without the + enclosing single quotes. + - A sequence of letters, digits, or the [_], [$], [-] and [.] + characters. [-] and [.] cannot appear as the first character. + - Any other non-whitespace character is treated as a separate token + of length 1. + *) diff --git a/lib/Tokenize.mll b/lib/Tokenize.mll new file mode 100644 index 00000000..422068b1 --- /dev/null +++ b/lib/Tokenize.mll @@ -0,0 +1,45 @@ +(* *********************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Xavier Leroy, INRIA Paris-Rocquencourt *) +(* *) +(* Copyright Institut National de Recherche en Informatique et en *) +(* Automatique. All rights reserved. This file is distributed *) +(* under the terms of the GNU General Public License as published by *) +(* the Free Software Foundation, either version 2 of the License, or *) +(* (at your option) any later version. This file is also distributed *) +(* under the terms of the INRIA Non-Commercial License Agreement. *) +(* *) +(* *********************************************************************) + +(* Parse a string as a list of tokens *) + +let identstart = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' ] +let identcont = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' '-' '.' ] + +rule tokenize acc = parse + | eof { List.rev acc } + | [' ' '\t' '\n'] + { tokenize acc lexbuf } + | "\"" { tok_dquote acc (Buffer.create 16) lexbuf } + | "'" { tok_squote acc (Buffer.create 16) lexbuf } + | (identstart identcont*) as s + { tokenize (s :: acc) lexbuf } + | _ as c { tokenize (String.make 1 c :: acc) lexbuf } + +and tok_dquote acc buf = parse + | "\"" | eof { tokenize (Buffer.contents buf :: acc) lexbuf } + | "\\t" { Buffer.add_char buf '\t'; tok_dquote acc buf lexbuf } + | "\\n" { Buffer.add_char buf '\n'; tok_dquote acc buf lexbuf } + | "\\" ([ '\\' '\"' ] as c) + { Buffer.add_char buf c; tok_dquote acc buf lexbuf } + | _ as c { Buffer.add_char buf c; tok_dquote acc buf lexbuf } + +and tok_squote acc buf = parse + | "\'" | eof { tokenize (Buffer.contents buf :: acc) lexbuf } + | _ as c { Buffer.add_char buf c; tok_squote acc buf lexbuf } + +{ +let string s = + tokenize [] (Lexing.from_string s) +} -- cgit