diff options
Diffstat (limited to 'lex')
-rw-r--r-- | lex/cset.ml | 5 | ||||
-rw-r--r-- | lex/cset.mli | 5 | ||||
-rw-r--r-- | lex/lexer.mli | 4 | ||||
-rw-r--r-- | lex/lexer.mll | 108 | ||||
-rw-r--r-- | lex/lexgen.ml | 2 | ||||
-rw-r--r-- | lex/main.ml | 20 | ||||
-rw-r--r-- | lex/parser.mly | 13 |
7 files changed, 109 insertions, 48 deletions
diff --git a/lex/cset.ml b/lex/cset.ml index 84c2a77142..ec68ee1c8b 100644 --- a/lex/cset.ml +++ b/lex/cset.ml @@ -11,6 +11,11 @@ (* *) (***********************************************************************) +(* $Id$ *) + + +exception Bad + type t = (int * int) list diff --git a/lex/cset.mli b/lex/cset.mli index 0ebcac0e5f..fc2c9930c3 100644 --- a/lex/cset.mli +++ b/lex/cset.mli @@ -11,13 +11,18 @@ (* *) (***********************************************************************) +(* $Id$ *) + (* Set of characters encoded as list of intervals *) type t +exception Bad val empty : t val is_empty : t -> bool val all_chars : t +exception Bad + val all_chars_eof : t val eof : t val singleton : int -> t diff --git a/lex/lexer.mli b/lex/lexer.mli index 569a5b266f..be34674eb3 100644 --- a/lex/lexer.mli +++ b/lex/lexer.mli @@ -14,7 +14,9 @@ val main: Lexing.lexbuf -> Parser.token -exception Lexical_error of string * int * int +exception Lexical_error of string * string * int * int +(*n val line_num: int ref val line_start_pos: int ref +*) diff --git a/lex/lexer.mll b/lex/lexer.mll index 5249ca8bc0..fa1d15bda6 100644 --- a/lex/lexer.mll +++ b/lex/lexer.mll @@ -25,7 +25,7 @@ and comment_depth = ref 0 let in_pattern () = !brace_depth = 0 && !comment_depth = 0 -exception Lexical_error of string * int * int +exception Lexical_error of string * string * int * int let string_buff = Buffer.create 256 @@ -42,24 +42,32 @@ let char_for_backslash = function | 'r' -> '\r' | c -> c - -let line_num = ref 1 -let line_start_pos = ref 0 +let raise_lexical_error lexbuf msg = + let p = Lexing.lexeme_start_p lexbuf in + raise (Lexical_error (msg, + p.Lexing.pos_fname, + p.Lexing.pos_lnum, + p.Lexing.pos_cnum - p.Lexing.pos_bol + 1)) +;; let handle_lexical_error fn lexbuf = - let line = !line_num - and column = Lexing.lexeme_start lexbuf - !line_start_pos + 1 in + let p = Lexing.lexeme_start_p lexbuf in + let line = p.Lexing.pos_lnum + and column = p.Lexing.pos_cnum - p.Lexing.pos_bol + 1 + and file = p.Lexing.pos_fname + in try fn lexbuf - with Lexical_error (msg, 0, 0) -> - raise(Lexical_error(msg, line, column)) + with Lexical_error (msg, "", 0, 0) -> + raise(Lexical_error(msg, file, line, column)) let get_input_name () = Sys.argv.(Array.length Sys.argv - 1) let warning lexbuf msg = + let p = Lexing.lexeme_start_p lexbuf in Printf.eprintf "ocamllex warning:\nFile \"%s\", line %d, character %d: %s.\n" - (get_input_name ()) !line_num - (Lexing.lexeme_start lexbuf - !line_start_pos+1) msg; + p.Lexing.pos_fname p.Lexing.pos_lnum + (p.Lexing.pos_cnum - p.Lexing.pos_bol + 1) msg; flush stderr let decimal_code c d u = @@ -78,6 +86,27 @@ let char_for_hexadecimal_code d u = in Char.chr (val1 * 16 + val2) +let incr_loc lexbuf delta = + let pos = lexbuf.Lexing.lex_curr_p in + lexbuf.Lexing.lex_curr_p <- { pos with + Lexing.pos_lnum = pos.Lexing.pos_lnum + 1; + Lexing.pos_bol = pos.Lexing.pos_cnum - delta; + } +;; + +let update_loc lexbuf opt_file line = + let pos = lexbuf.Lexing.lex_curr_p in + let new_file = match opt_file with + | None -> pos.Lexing.pos_fname + | Some f -> f + in + lexbuf.Lexing.lex_curr_p <- { pos with + Lexing.pos_fname = new_file; + Lexing.pos_lnum = line; + Lexing.pos_bol = pos.Lexing.pos_cnum; + } +;; + } let identstart = @@ -91,9 +120,14 @@ rule main = parse [' ' '\013' '\009' '\012' ] + { main lexbuf } | '\010' - { line_start_pos := Lexing.lexeme_end lexbuf; - incr line_num; + { incr_loc lexbuf 0; main lexbuf } + | "#" [' ' '\t']* (['0'-'9']+ as num) [' ' '\t']* + ('\"' ([^ '\010' '\013' '\"']* as name) '\"')? + [^ '\010' '\013']* '\010' + { update_loc lexbuf name (int_of_string num); + main lexbuf + } | "(*" { comment_depth := 1; handle_lexical_error comment lexbuf; @@ -121,25 +155,22 @@ rule main = parse | "'" '\\' (['0'-'9'] as c) (['0'-'9'] as d) (['0'-'9'] as u)"'" { let v = decimal_code c d u in if v > 255 then - raise - (Lexical_error - (Printf.sprintf "illegal escape sequence \\%c%c%c" c d u, - !line_num, Lexing.lexeme_start lexbuf - !line_start_pos+1)) + raise_lexical_error lexbuf + (Printf.sprintf "illegal escape sequence \\%c%c%c" c d u) else Tchar v } | "'" '\\' 'x' (['0'-'9' 'a'-'f' 'A'-'F'] as d) (['0'-'9' 'a'-'f' 'A'-'F'] as u) "'" { Tchar(Char.code(char_for_hexadecimal_code d u)) } | "'" '\\' (_ as c) - { raise - (Lexical_error - (Printf.sprintf "illegal escape sequence \\%c" c, - !line_num, Lexing.lexeme_start lexbuf - !line_start_pos+1)) + { raise_lexical_error lexbuf + (Printf.sprintf "illegal escape sequence \\%c" c) } | '{' - { let n1 = Lexing.lexeme_end lexbuf - and l1 = !line_num - and s1 = !line_start_pos in + { let p = Lexing.lexeme_end_p lexbuf in + let n1 = p.Lexing.pos_cnum + and l1 = p.Lexing.pos_lnum + and s1 = p.Lexing.pos_bol in brace_depth := 1; let n2 = handle_lexical_error action lexbuf in Taction({start_pos = n1; end_pos = n2; @@ -155,20 +186,20 @@ rule main = parse | ')' { Trparen } | '^' { Tcaret } | '-' { Tdash } + | '#' { Tsharp } | eof { Tend } | _ - { raise(Lexical_error - ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf), - !line_num, Lexing.lexeme_start lexbuf - !line_start_pos+1)) } + { raise_lexical_error lexbuf + ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf)) + } (* String parsing comes from the compiler lexer *) and string = parse '"' { () } - | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] * - { line_start_pos := Lexing.lexeme_end lexbuf; - incr line_num; + | '\\' ("\010" | "\013" | "\013\010") ([' ' '\009'] * as spaces) + { incr_loc lexbuf (String.length spaces); string lexbuf } | '\\' (backslash_escapes as c) { store_string_char(char_for_backslash c); @@ -192,11 +223,10 @@ and string = parse store_string_char c ; string lexbuf } | eof - { raise(Lexical_error("unterminated string", 0, 0)) } + { raise(Lexical_error("unterminated string", "", 0, 0)) } | '\010' { store_string_char '\010'; - line_start_pos := Lexing.lexeme_end lexbuf; - incr line_num; + incr_loc lexbuf 0; string lexbuf } | _ as c { store_string_char c; @@ -223,10 +253,9 @@ and comment = parse { skip_char lexbuf ; comment lexbuf } | eof - { raise(Lexical_error("unterminated comment", 0, 0)) } + { raise(Lexical_error("unterminated comment", "", 0, 0)) } | '\010' - { line_start_pos := Lexing.lexeme_end lexbuf; - incr line_num; + { incr_loc lexbuf 0; comment lexbuf } | _ { comment lexbuf } @@ -251,18 +280,17 @@ and action = parse comment lexbuf; action lexbuf } | eof - { raise (Lexical_error("unterminated action", 0, 0)) } + { raise (Lexical_error("unterminated action", "", 0, 0)) } | '\010' - { line_start_pos := Lexing.lexeme_end lexbuf; - incr line_num; + { incr_loc lexbuf 0; action lexbuf } | _ { action lexbuf } and skip_char = parse | '\\'? '\010' "'" - { line_start_pos := Lexing.lexeme_end lexbuf; - incr line_num } + { incr_loc lexbuf 1; + } | [^ '\\' '\''] "'" (* regular character *) (* one character and numeric escape sequences *) | '\\' _ "'" diff --git a/lex/lexgen.ml b/lex/lexgen.ml index 050c00b4cc..8d665e77ee 100644 --- a/lex/lexgen.ml +++ b/lex/lexgen.ml @@ -155,7 +155,7 @@ let rec do_find_opt = function let opt1,all1 = do_find_opt e1 and opt2,all2 = do_find_opt e2 in StringSet.union - (stringset_delta opt1 opt2) + (StringSet.union opt1 opt2) (stringset_delta all1 all2), StringSet.union all1 all2 | Repetition e -> diff --git a/lex/main.ml b/lex/main.ml index d97820151f..03b9ac91ef 100644 --- a/lex/main.ml +++ b/lex/main.ml @@ -55,6 +55,9 @@ let main () = let oc = open_out dest_name in let tr = Common.open_tracker dest_name oc in let lexbuf = Lexing.from_channel ic in + lexbuf.Lexing.lex_curr_p <- + {Lexing.pos_fname = source_name; Lexing.pos_lnum = 1; + Lexing.pos_bol = 0; Lexing.pos_cnum = 0}; try let def = Parser.lexer_definition Lexer.main lexbuf in let (entries, transitions) = Lexgen.make_dfa def.entrypoints in @@ -76,15 +79,22 @@ let main () = Common.close_tracker tr; Sys.remove dest_name; begin match exn with - Parsing.Parse_error -> + | Cset.Bad -> + let p = Lexing.lexeme_start_p lexbuf in + Printf.fprintf stderr + "File \"%s\", line %d, character %d: character set expected.\n" + p.Lexing.pos_fname p.Lexing.pos_lnum + (p.Lexing.pos_cnum - p.Lexing.pos_bol) + | Parsing.Parse_error -> + let p = Lexing.lexeme_start_p lexbuf in Printf.fprintf stderr "File \"%s\", line %d, character %d: syntax error.\n" - source_name !Lexer.line_num - (Lexing.lexeme_start lexbuf - !Lexer.line_start_pos) - | Lexer.Lexical_error(msg, line, col) -> + p.Lexing.pos_fname p.Lexing.pos_lnum + (p.Lexing.pos_cnum - p.Lexing.pos_bol) + | Lexer.Lexical_error(msg, file, line, col) -> Printf.fprintf stderr "File \"%s\", line %d, character %d: %s.\n" - source_name line col msg + file line col msg | Lexgen.Memory_overflow -> Printf.fprintf stderr "File \"%s\":\n Position memory overflow, too many bindings\n" diff --git a/lex/parser.mly b/lex/parser.mly index a1921309b2..8f6ff70525 100644 --- a/lex/parser.mly +++ b/lex/parser.mly @@ -40,6 +40,10 @@ let rec remove_as = function | Alternative (e1, e2) -> Alternative (remove_as e1, remove_as e2) | Repetition e -> Repetition (remove_as e) +let as_cset = function + | Characters s -> s + | _ -> raise Cset.Bad + %} %token <string> Tident @@ -47,9 +51,10 @@ let rec remove_as = function %token <string> Tstring %token <Syntax.location> Taction %token Trule Tparse Tparse_shortest Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket -%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash Tlet Tas +%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash Tlet Tas Tsharp %right Tas +%left Tsharp %left Tor %nonassoc CONCAT %nonassoc Tmaybe Tstar Tplus @@ -131,6 +136,12 @@ regexp: { Alternative(Epsilon, $1) } | regexp Tplus { Sequence(Repetition (remove_as $1), $1) } + | regexp Tsharp regexp + { + let s1 = as_cset $1 + and s2 = as_cset $3 in + Characters (Cset.diff s1 s2) + } | regexp Tor regexp { Alternative($1,$3) } | regexp regexp %prec CONCAT |