From f8be3f5f2937b053b9cb75ada7937a6c1b20f019 Mon Sep 17 00:00:00 2001 From: François Pottier Date: Fri, 23 Oct 2015 13:34:43 +0200 Subject: Install the new system for reporting syntax errors. This requires the development version of Menhir, to be released soon. In summary: handcrafted.messages is new. It contains a mapping of erroneous sentences to error messages, together with a lot of comments. Makefile.extr is new. It contains a rule to generate cparser/pre_parser_messages.ml based on this mapping. cparser/ErrorReports.{ml,mli} are new. They construct syntax error messages, based on the compiled mapping. cparser/Lexer.mll is modified. The last two tokens that have been read are stored in a buffer. ErrorReports is called to construct a syntax error message. cparser/GNUmakefile is new. It offers several commands for working on the pre-parser. cparser/deLexer.ml is new. It is a script (it is not linked into CompCert). It translates the symbolic name of a token to an example of this token in concrete C syntax. It is used by [make -C cparser concrete] to produce the .c files in tests/generated/. cparser/tests/generated/Makefile is new. It runs ccomp, clang and gcc on each of the generated C files, so as to allow a comparison of the error messages. --- cparser/ErrorReports.ml | 239 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 cparser/ErrorReports.ml (limited to 'cparser/ErrorReports.ml') diff --git a/cparser/ErrorReports.ml b/cparser/ErrorReports.ml new file mode 100644 index 00000000..445b8412 --- /dev/null +++ b/cparser/ErrorReports.ml @@ -0,0 +1,239 @@ +open Lexing +open Pre_parser.MenhirInterpreter +module S = MenhirLib.General (* Streams *) + +(* -------------------------------------------------------------------------- *) + +(* There are places where we may hit an internal error and we would like to fail + abruptly because "this cannot happen". Yet, it is safer when shipping to + silently cover up for our internal error. Thus, we typically use an idiom of + the form [if debug then assert false else ]. *) + +let debug = true + +(* -------------------------------------------------------------------------- *) + +(* The parser keeps track of the last two tokens in a two-place buffer. *) + +type 'a buffer = +| Zero +| One of 'a +| Two of 'a * (* most recent: *) 'a + +(* [push buffer x] pushes [x] into [buffer], causing the buffer to slide. *) + +let update buffer x : _ buffer = + match buffer, x with + | Zero, _ -> + One x + | One x1, x2 + | Two (_, x1), x2 -> + Two (x1, x2) + +(* [show f buffer] prints the contents of the buffer. The function [f] is + used to print an element. *) + +let show f buffer : string = + match buffer with + | Zero -> + (* The buffer cannot be empty. If we have read no tokens, we + cannot have detected a syntax error. *) + if debug then assert false else "" + | One invalid -> + (* It is unlikely, but possible, that we have read just one token. *) + Printf.sprintf "before '%s'" (f invalid) + | Two (valid, invalid) -> + (* In the most likely case, we have read two tokens. *) + Printf.sprintf "after '%s' and before '%s'" (f valid) (f invalid) + +(* [last buffer] returns the last element of the buffer (that is, the + invalid token). *) + +let last buffer = + match buffer with + | Zero -> + (* The buffer cannot be empty. If we have read no tokens, we + cannot have detected a syntax error. *) + assert false + | One invalid + | Two (_, invalid) -> + invalid + +(* -------------------------------------------------------------------------- *) + +(* [extract text (pos1, pos2)] extracts the sub-string of [text] delimited + by the positions [pos1] and [pos2]. *) + +let extract text (pos1, pos2) : string = + let ofs1 = pos1.pos_cnum + and ofs2 = pos2.pos_cnum in + let len = ofs2 - ofs1 in + try + String.sub text ofs1 len + with Invalid_argument _ -> + (* In principle, this should not happen, but if it does, let's make this + a non-fatal error. *) + if debug then assert false else "???" + +(* -------------------------------------------------------------------------- *) + +(* [sanitize text] eliminates any special characters from the text [text]. + They are (arbitrarily) replaced with a single dot character. *) + +let sanitize text = + String.map (fun c -> + if Char.code c < 32 || Char.code c >= 127 then '.' else c + ) text + +(* -------------------------------------------------------------------------- *) + +(* [shorten k text] limits the length of [text] to [2k+3] characters. If the + text is too long, a fragment in the middle is replaced with an ellipsis. *) + +let shorten k text = + let n = String.length text in + if n <= 2 * k + 3 then + text + else + String.sub text 0 k ^ + "..." ^ + String.sub text (n - k) k + +(* -------------------------------------------------------------------------- *) + +(* [stack checkpoint] extracts the parser's stack out of a checkpoint. *) + +let stack checkpoint = + match checkpoint with + | HandlingError env -> + stack env + | _ -> + assert false (* this cannot happen, I promise *) + +(* -------------------------------------------------------------------------- *) + +(* [state checkpoint] extracts the number of the current state out of a + parser checkpoint. *) + +let state checkpoint : int = + match Lazy.force (stack checkpoint) with + | S.Nil -> + (* Hmm... The parser is in its initial state. Its number is + usually 0. This is a BIG HACK. TEMPORARY *) + 0 + | S.Cons (Element (s, _, _, _), _) -> + number s + +(* -------------------------------------------------------------------------- *) + +(* TEMPORARY move to MenhirLib.General *) + +let rec drop n (xs : 'a S.stream) : 'a S.stream = + match n, xs with + | 0, _ + | _, lazy (S.Nil) -> + xs + | _, lazy (S.Cons (_, xs)) -> + drop (n - 1) xs + +(* -------------------------------------------------------------------------- *) + +(* [element checkpoint i] returns the [i]-th cell of the parser stack. The index + [i] is 0-based. [i] should (ideally) be within bounds; we raise [Not_found] + if it isn't. *) + +let element checkpoint i : element = + match Lazy.force (drop i (stack checkpoint)) with + | S.Nil -> + (* [i] is out of range. This could happen if the handwritten error + messages are out of sync with the grammar, or if a mistake was + made. We fail in a non-fatal way. *) + raise Not_found + | S.Cons (e, _) -> + e + +(* -------------------------------------------------------------------------- *) + +(* [range text e] converts the stack element [e] to the fragment of the source + text that corresponds to this stack element. The fragment is placed within + single quotes and shortened if it is too long. We also ensure that it does + not contain any special characters. *) + +let width = 30 + +let range text (e : element) : string = + (* Extract the start and positions of this stack element. *) + let Element (_, _, pos1, pos2) = e in + (* Get the underlying source text fragment. *) + let fragment = extract text (pos1, pos2) in + (* Sanitize it and limit its length. Enclose it in single quotes. *) + "'" ^ shorten width (sanitize fragment) ^ "'" + +(* -------------------------------------------------------------------------- *) + +(* We allow an error message to contain the special form $i, where is a 0-based + index into the stack. We replace it with the fragment of the source text that + corresponds to this stack entry. *) + +let fragment text checkpoint message = + try + let i = int_of_string (Str.matched_group 1 message) in + range text (element checkpoint i) + with + | Failure _ -> + (* In principle, this should not happen, but if it does, let's cover up + for our internal error. *) + if debug then assert false else "???" + | Not_found -> + (* In principle, this should not happen, but if it does, let's cover up + for our internal error. *) + if debug then assert false else "???" + +let fragments text checkpoint (message : string) : string = + Str.global_substitute + (Str.regexp "\\$\\([0-9]+\\)") + (fragment text checkpoint) + message + +(* -------------------------------------------------------------------------- *) + +(* [report text buffer checkpoint] constructs an error message. The C source + code must be stored in the string [text]. The start and end positions of the + last two tokens that were read must be stored in [buffer]. The parser state + (i.e., the automaton's state and stack) must be recorded in the checkpoint + [checkpoint]. *) + +(* The start and end positions of the invalid token are [lexbuf.lex_start_p] + and [lexbuf.lex_curr_p], since this is the last token that was read. But + we need not care about that here. *) + +let report text buffer checkpoint : string = + (* Extract the position where the error occurred, that is, the start + position of the invalid token. We display it as a filename, a (1-based) + line number, and a (0-based) column number. *) + let (pos, _) = last buffer in + (* Construct a readable description of where the error occurred, that is, + after which token and before which token. *) + let where = show (extract text) buffer in + (* Find out in which state the parser failed. *) + let s : int = state checkpoint in + (* Choose an error message, based on the state number [s]. + Then, customize it, based on dynamic information. *) + let message = try + Pre_parser_messages.message s |> + fragments text checkpoint + with Not_found -> + (* If the state number cannot be found -- which, in principle, + should not happen, since our list of erroneous states is + supposed to be complete! -- produce a generic message. *) + Printf.sprintf "This is an unknown syntax error (%d).\n\ + Please report this problem to the compiler vendor.\n" s + in + (* Construct the full error message. *) + Printf.sprintf "%s:%d:%d: syntax error %s.\n%s" + pos.pos_fname + pos.pos_lnum + (pos.pos_cnum - pos.pos_bol) + where + message + -- cgit