From 16fc8b07cb8dbbabad0b665b9114925f4349cf38 Mon Sep 17 00:00:00 2001
From: xleroy <xleroy@fca1b0fc-160b-0410-b1d3-a4f43f01ea2e>
Date: Thu, 5 Jun 2014 06:50:10 +0000
Subject: Cleaner, more resilient parsing of pragmas.

git-svn-id: https://yquem.inria.fr/compcert/svn/compcert/trunk@2507 fca1b0fc-160b-0410-b1d3-a4f43f01ea2e
---
 cfrontend/CPragmas.ml | 80 ++++++++++++++++-----------------------------------
 lib/Tokenize.mli      | 33 +++++++++++++++++++++
 lib/Tokenize.mll      | 45 +++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+), 55 deletions(-)
 create mode 100644 lib/Tokenize.mli
 create mode 100644 lib/Tokenize.mll

diff --git a/cfrontend/CPragmas.ml b/cfrontend/CPragmas.ml
index 3c0c9f15..c0746322 100644
--- a/cfrontend/CPragmas.ml
+++ b/cfrontend/CPragmas.ml
@@ -41,10 +41,12 @@ let process_section_pragma classname istring ustring addrmode accmode =
 let re_c_ident = Str.regexp "[A-Za-z_][A-Za-z_0-9]*$"
 
 let process_use_section_pragma classname id =
-  if not (Str.string_match re_c_ident id 0) then
-    C2C.error (sprintf "bad identifier `%s' in #pragma use_section" id);
-  if not (Sections.use_section_for (intern_string id) classname) then
-    C2C.error (sprintf "unknown section name `%s'" classname)
+  if id = "," || id = ";" then () else begin
+    if not (Str.string_match re_c_ident id 0) then
+      C2C.error (sprintf "bad identifier `%s' in #pragma use_section" id);
+    if not (Sections.use_section_for (intern_string id) classname) then
+      C2C.error (sprintf "unknown section name `%s'" classname)
+  end
 
 (* #pragma reserve_register *)
 
@@ -58,59 +60,27 @@ let process_reserve_register_pragma name =
       else
         C2C.error "cannot reserve this register (not a callee-save)"
 
-(* Parsing of pragmas using regexps *)
-
-let re_start_pragma_section = Str.regexp "section\\b"
-
-let re_pragma_section = Str.regexp(
-  "section[ \t]+"
-^ "\\([A-Za-z_][A-Za-z_0-9]*\\)[ \t]+"  (* class_name *)
-^ "\"\\([^\"]*\\)\"?[ \t]*"             (* istring *)
-^ "\"\\([^\"]*\\)\"?[ \t]*"             (* ustring *)
-^ "\\([a-zA-Z-]+\\)?[ \t]*"             (* addressing mode *)
-^ "\\([RWXON]*\\)"                      (* access mode *)
-)
-
-let re_start_pragma_use_section = Str.regexp "use_section\\b"
-
-let re_pragma_use_section = Str.regexp
-  "use_section[ \t]+\
-   \\([A-Za-z_][A-Za-z_0-9]*\\)[ \t]+\
-   \\(.*\\)$"
-
-let re_split_idents = Str.regexp "[ \t,]+"
-
-let re_start_pragma_reserve_register = Str.regexp "reserve_register\\b"
-
-let re_pragma_reserve_register = Str.regexp
-  "reserve_register[ \t]+\\([A-Za-z0-9]+\\)"
+(* Parsing of pragmas *)
 
 let process_pragma name =
-  if Str.string_match re_pragma_section name 0 then begin
-    process_section_pragma
-      (Str.matched_group 1 name) (* classname *)
-      (Str.matched_group 2 name) (* istring *)
-      (Str.matched_group 3 name) (* ustring *)
-      (Str.matched_group 4 name) (* addrmode *)
-      (Str.matched_group 5 name); (* accmode *)
-    true
-  end else if Str.string_match re_start_pragma_section name 0 then
-    (C2C.error "ill-formed `section' pragma"; true)
- else if Str.string_match re_pragma_use_section name 0 then begin
-    let classname = Str.matched_group 1 name
-    and idents = Str.matched_group 2 name in
-    let identlist = Str.split re_split_idents idents in
-    if identlist = [] then C2C.warning "vacuous `use_section' pragma";
-    List.iter (process_use_section_pragma classname) identlist;
-    true
-  end else if Str.string_match re_start_pragma_use_section name 0 then begin
-    C2C.error "ill-formed `use_section' pragma"; true
-  end else if Str.string_match re_pragma_reserve_register name 0 then begin
-    process_reserve_register_pragma (Str.matched_group 1 name); true
-  end else if Str.string_match re_start_pragma_reserve_register name 0 then begin
-    C2C.error "ill-formed `reserve_register' pragma"; true
-  end else
-    false
+  match Tokenize.string name with
+  | ["section"; classname; istring; ustring; addrmode; accmode] ->
+      process_section_pragma classname istring ustring addrmode accmode;
+      true
+  | "section" :: _ ->
+      C2C.error "ill-formed `section' pragma"; true
+  | "use_section" :: classname :: identlist ->
+      if identlist = [] then C2C.warning "vacuous `use_section' pragma";
+      List.iter (process_use_section_pragma classname) identlist;
+      true
+  | "use_section" :: _ ->
+      C2C.error "ill-formed `use_section' pragma"; true
+  | ["reserve_register"; reg] ->
+      process_reserve_register_pragma reg; true
+  | "reserve_register" :: _ ->
+      C2C.error "ill-formed `reserve_register' pragma"; true
+  | _ ->
+      false
 
 let initialize () =
   C2C.process_pragma_hook := process_pragma
diff --git a/lib/Tokenize.mli b/lib/Tokenize.mli
new file mode 100644
index 00000000..a9f22c4d
--- /dev/null
+++ b/lib/Tokenize.mli
@@ -0,0 +1,33 @@
+(* *********************************************************************)
+(*                                                                     *)
+(*              The Compcert verified compiler                         *)
+(*                                                                     *)
+(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*                                                                     *)
+(*  Copyright Institut National de Recherche en Informatique et en     *)
+(*  Automatique.  All rights reserved.  This file is distributed       *)
+(*  under the terms of the GNU General Public License as published by  *)
+(*  the Free Software Foundation, either version 2 of the License, or  *)
+(*  (at your option) any later version.  This file is also distributed *)
+(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
+(*                                                                     *)
+(* *********************************************************************)
+
+(* Parse a string as a list of tokens *)
+
+val string: string -> string list
+  (** [Tokenize.string s] decomposes [s] into a list of tokens.
+      Whitespace separates tokens.  The following substrings
+      constitute tokens:
+    - A string enclosed in double quotes.  Within the string,
+      the escape sequences '\t' '\n' '\"' and '\\' are recognized.
+      The token value is the contents of the string without the
+      enclosing double quotes.
+    - A string enclosed in single quotes.  No escape sequences are
+      recognized.  The token value is the contents of the string without the
+      enclosing single quotes.
+    - A sequence of letters, digits, or the [_], [$], [-] and [.]
+      characters.  [-] and [.] cannot appear as the first character.
+    - Any other non-whitespace character is treated as a separate token
+      of length 1.
+  *)
diff --git a/lib/Tokenize.mll b/lib/Tokenize.mll
new file mode 100644
index 00000000..422068b1
--- /dev/null
+++ b/lib/Tokenize.mll
@@ -0,0 +1,45 @@
+(* *********************************************************************)
+(*                                                                     *)
+(*              The Compcert verified compiler                         *)
+(*                                                                     *)
+(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*                                                                     *)
+(*  Copyright Institut National de Recherche en Informatique et en     *)
+(*  Automatique.  All rights reserved.  This file is distributed       *)
+(*  under the terms of the GNU General Public License as published by  *)
+(*  the Free Software Foundation, either version 2 of the License, or  *)
+(*  (at your option) any later version.  This file is also distributed *)
+(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
+(*                                                                     *)
+(* *********************************************************************)
+
+(* Parse a string as a list of tokens *)
+
+let identstart = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' ]
+let identcont  = [ '0'-'9' 'A'-'Z' 'a'-'z' '$' '_' '-' '.' ]
+
+rule tokenize acc = parse
+  | eof                 { List.rev acc }
+  | [' ' '\t' '\n'] +   { tokenize acc lexbuf }
+  | "\""                { tok_dquote acc (Buffer.create 16) lexbuf }
+  | "'"                 { tok_squote acc (Buffer.create 16) lexbuf }
+  | (identstart identcont*) as s
+                        { tokenize (s :: acc) lexbuf }
+  | _ as c              { tokenize (String.make 1 c :: acc) lexbuf }
+
+and tok_dquote acc buf = parse
+  | "\"" | eof          { tokenize (Buffer.contents buf :: acc) lexbuf }
+  | "\\t"               { Buffer.add_char buf '\t'; tok_dquote acc buf lexbuf }
+  | "\\n"               { Buffer.add_char buf '\n'; tok_dquote acc buf lexbuf }
+  | "\\" ([ '\\' '\"' ] as c)
+                        { Buffer.add_char buf c; tok_dquote acc buf lexbuf }
+  | _ as c              { Buffer.add_char buf c; tok_dquote acc buf lexbuf }
+
+and tok_squote acc buf = parse
+  | "\'" | eof          { tokenize (Buffer.contents buf :: acc) lexbuf }
+  | _ as c              { Buffer.add_char buf c; tok_squote acc buf lexbuf }
+
+{
+let string s =
+  tokenize [] (Lexing.from_string s)
+}
-- 
cgit