summaryrefslogtreecommitdiff
path: root/lex
diff options
context:
space:
mode:
Diffstat (limited to 'lex')
-rw-r--r--lex/.depend7
-rw-r--r--lex/Makefile51
-rw-r--r--lex/lexer.mli3
-rw-r--r--lex/lexer.mll159
-rw-r--r--lex/lexgen.ml203
-rw-r--r--lex/main.ml48
-rw-r--r--lex/output.ml146
-rw-r--r--lex/parser.mly120
-rw-r--r--lex/syntax.ml26
9 files changed, 763 insertions, 0 deletions
diff --git a/lex/.depend b/lex/.depend
new file mode 100644
index 0000000000..0379bb77b3
--- /dev/null
+++ b/lex/.depend
@@ -0,0 +1,7 @@
+lexer.cmi: parser.cmi
+parser.cmi: syntax.cmo
+lexer.cmo: lexer.cmi parser.cmi syntax.cmo
+lexgen.cmo: syntax.cmo
+main.cmo: lexgen.cmo parser.cmi output.cmo syntax.cmo lexer.cmi
+output.cmo: syntax.cmo
+parser.cmo: parser.cmi syntax.cmo
diff --git a/lex/Makefile b/lex/Makefile
new file mode 100644
index 0000000000..4104fe77c2
--- /dev/null
+++ b/lex/Makefile
@@ -0,0 +1,51 @@
+# The lexer generator
+
+CAMLC=../boot/camlrun ../boot/camlc -I ../boot
+COMPFLAGS=
+LINKFLAGS=
+CAMLYACC=../boot/camlyacc
+YACCFLAGS=
+CAMLLEX=../boot/camlrun ../boot/camllex
+CAMLDEP=../tools/camldep
+DEPFLAGS=
+
+OBJS=syntax.cmo parser.cmo lexer.cmo lexgen.cmo output.cmo main.cmo
+
+all: camllex
+
+camllex: $(OBJS)
+ $(CAMLC) $(LINKFLAGS) -o camllex $(OBJS)
+
+clean::
+ rm -f camllex
+ rm -f *.cmo *.cmi camllex
+
+parser.ml parser.mli: parser.mly
+ $(CAMLYACC) $(YACCFLAGS) parser.mly
+
+clean::
+ rm -f parser.ml parser.mli
+
+beforedepend:: parser.ml parser.mli
+
+lexer.ml: lexer.mll
+ $(CAMLLEX) lexer.mll
+
+clean::
+ rm -f lexer.ml
+
+beforedepend:: lexer.ml
+
+.SUFFIXES:
+.SUFFIXES: .ml .cmo .mli .cmi
+
+.ml.cmo:
+ $(CAMLC) -c $(COMPFLAGS) $<
+
+.mli.cmi:
+ $(CAMLC) -c $(COMPFLAGS) $<
+
+depend: beforedepend
+ $(CAMLDEP) *.mli *.ml > .depend
+
+include .depend
diff --git a/lex/lexer.mli b/lex/lexer.mli
new file mode 100644
index 0000000000..6e0b4a5073
--- /dev/null
+++ b/lex/lexer.mli
@@ -0,0 +1,3 @@
+val main: Lexing.lexbuf -> Parser.token
+
+exception Lexical_error of string
diff --git a/lex/lexer.mll b/lex/lexer.mll
new file mode 100644
index 0000000000..41d05c421a
--- /dev/null
+++ b/lex/lexer.mll
@@ -0,0 +1,159 @@
+(* The lexical analyzer for lexer definitions. Bootstrapped! *)
+
+{
+open Syntax
+open Parser
+
+(* Auxiliaries for the lexical analyzer *)
+
+let brace_depth = ref 0
+and comment_depth = ref 0
+
+exception Lexical_error of string
+
+let initial_string_buffer = String.create 256
+let string_buff = ref initial_string_buffer
+let string_index = ref 0
+
+let reset_string_buffer () =
+ string_buff := initial_string_buffer;
+ string_index := 0
+
+let store_string_char c =
+ (if !string_index >= String.length (!string_buff) then
+ let new_buff = String.create (String.length (!string_buff) * 2) in
+ String.blit new_buff 0 (!string_buff) 0 (String.length (!string_buff));
+ string_buff := new_buff;
+ ());
+ !string_buff.[!string_index] <- c;
+ incr string_index
+
+let get_stored_string () =
+ let s = String.sub (!string_buff) 0 (!string_index) in
+ string_buff := initial_string_buffer;
+ s
+
+let char_for_backslash = function
+ 'n' -> '\010' (* '\n' when bootstrapped *)
+ | 't' -> '\009' (* '\t' *)
+ | 'b' -> '\008' (* '\b' *)
+ | 'r' -> '\013' (* '\r' *)
+ | c -> c
+
+let char_for_decimal_code lexbuf i =
+ Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
+ 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
+ (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48))
+
+}
+
+rule main = parse
+ [' ' '\010' '\013' '\009' ] +
+ { main lexbuf }
+ | "(*"
+ { comment_depth := 1;
+ comment lexbuf;
+ main lexbuf }
+ | ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '\'' '_' '0'-'9'] *
+ { match Lexing.lexeme lexbuf with
+ "rule" -> Trule
+ | "parse" -> Tparse
+ | "and" -> Tand
+ | "eof" -> Teof
+ | s -> Tident s }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ Tstring(get_stored_string()) }
+ | "`" [^ '\\'] "`"
+ { Tchar(Lexing.lexeme_char lexbuf 1) }
+ | "`" '\\' ['\\' '`' 'n' 't' 'b' 'r'] "`"
+ { Tchar(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
+ | "`" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "`"
+ { Tchar(char_for_decimal_code lexbuf 2) }
+ | "'" [^ '\\'] "'"
+ { Tchar(Lexing.lexeme_char lexbuf 1) }
+ | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
+ { Tchar(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
+ | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
+ { Tchar(char_for_decimal_code lexbuf 2) }
+ | '{'
+ { let n1 = Lexing.lexeme_end lexbuf in
+ brace_depth := 1;
+ let n2 = action lexbuf in
+ Taction(Location(n1, n2)) }
+ | '=' { Tequal }
+ | '|' { Tor }
+ | '_' { Tunderscore }
+ | "eof" { Teof }
+ | '[' { Tlbracket }
+ | ']' { Trbracket }
+ | '*' { Tstar }
+ | '?' { Tmaybe }
+ | '+' { Tplus }
+ | '(' { Tlparen }
+ | ')' { Trparen }
+ | '^' { Tcaret }
+ | '-' { Tdash }
+ | eof { Tend }
+ | _
+ { raise(Lexical_error
+ ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf))) }
+
+and action = parse
+ '{'
+ { incr brace_depth;
+ action lexbuf }
+ | '}'
+ { decr brace_depth;
+ if !brace_depth == 0 then Lexing.lexeme_start lexbuf else action lexbuf }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ reset_string_buffer();
+ action lexbuf }
+ | "'{'"
+ { action lexbuf }
+ | "'{'"
+ { action lexbuf }
+ | "(*"
+ { comment_depth := 1;
+ comment lexbuf;
+ action lexbuf }
+ | eof
+ { raise (Lexical_error "unterminated action") }
+ | _
+ { action lexbuf }
+
+and string = parse
+ '"'
+ { () }
+ | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] +
+ { string lexbuf }
+ | '\\' ['\\' '"' 'n' 't' 'b' 'r']
+ { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
+ string lexbuf }
+ | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
+ { store_string_char(char_for_decimal_code lexbuf 1);
+ string lexbuf }
+ | eof
+ { raise(Lexical_error "unterminated string") }
+ | _
+ { store_string_char(Lexing.lexeme_char lexbuf 0);
+ string lexbuf }
+
+and comment = parse
+ "(*"
+ { incr comment_depth; comment lexbuf }
+ | "*)"
+ { decr comment_depth;
+ if !comment_depth == 0 then () else comment lexbuf }
+ | '"'
+ { reset_string_buffer();
+ string lexbuf;
+ reset_string_buffer();
+ comment lexbuf }
+ | eof
+ { raise(Lexical_error "unterminated comment") }
+ | _
+ { comment lexbuf }
diff --git a/lex/lexgen.ml b/lex/lexgen.ml
new file mode 100644
index 0000000000..6870419470
--- /dev/null
+++ b/lex/lexgen.ml
@@ -0,0 +1,203 @@
+(* Compiling a lexer definition *)
+
+open Syntax
+
+(* Deep abstract syntax for regular expressions *)
+
+type regexp =
+ Empty
+ | Chars of int
+ | Action of int
+ | Seq of regexp * regexp
+ | Alt of regexp * regexp
+ | Star of regexp
+
+(* From shallow to deep syntax *)
+
+let chars = ref ([] : char list list)
+let chars_count = ref 0
+let actions = ref ([] : (int * location) list)
+let actions_count = ref 0
+
+let rec encode_regexp = function
+ Epsilon -> Empty
+ | Characters cl ->
+ let n = !chars_count in
+ chars := cl :: !chars;
+ incr chars_count;
+ Chars(n)
+ | Sequence(r1,r2) ->
+ Seq(encode_regexp r1, encode_regexp r2)
+ | Alternative(r1,r2) ->
+ Alt(encode_regexp r1, encode_regexp r2)
+ | Repetition r ->
+ Star (encode_regexp r)
+
+let encode_casedef =
+ List.fold_left
+ (fun reg (expr,act) ->
+ let act_num = !actions_count in
+ incr actions_count;
+ actions := (act_num, act) :: !actions;
+ Alt(reg, Seq(encode_regexp expr, Action act_num)))
+ Empty
+
+let encode_lexdef (Lexdef(_, ld)) =
+ chars := [];
+ chars_count := 0;
+ actions := [];
+ actions_count := 0;
+ let name_regexp_list =
+ List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in
+ let chr = Array.of_list (List.rev !chars)
+ and act = !actions in
+ chars := [];
+ actions := [];
+ (chr, name_regexp_list, act)
+
+(* To generate directly a NFA from a regular expression.
+ Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)
+
+type transition =
+ OnChars of int
+ | ToAction of int
+
+let rec merge_trans s1 s2 =
+ match (s1, s2) with
+ ([], _) -> s2
+ | (_, []) -> s1
+ | ((OnChars n1 as t1) :: r1, (OnChars n2 as t2) :: r2) ->
+ if n1 == n2 then t1 :: merge_trans r1 r2 else
+ if n1 < n2 then t1 :: merge_trans r1 s2 else
+ t2 :: merge_trans s1 r2
+ | ((ToAction n1 as t1) :: r1, (ToAction n2 as t2) :: r2) ->
+ if n1 == n2 then t1 :: merge_trans r1 r2 else
+ if n1 < n2 then t1 :: merge_trans r1 s2 else
+ t2 :: merge_trans s1 r2
+ | ((OnChars n1 as t1) :: r1, (ToAction n2 as t2) :: r2) ->
+ t1 :: merge_trans r1 s2
+ | ((ToAction n1 as t1) :: r1, (OnChars n2 as t2) :: r2) ->
+ t2 :: merge_trans s1 r2
+
+let rec nullable = function
+ Empty -> true
+ | Chars _ -> false
+ | Action _ -> false
+ | Seq(r1,r2) -> nullable r1 & nullable r2
+ | Alt(r1,r2) -> nullable r1 or nullable r2
+ | Star r -> true
+
+let rec firstpos = function
+ Empty -> []
+ | Chars pos -> [OnChars pos]
+ | Action act -> [ToAction act]
+ | Seq(r1,r2) -> if nullable r1
+ then merge_trans (firstpos r1) (firstpos r2)
+ else firstpos r1
+ | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2)
+ | Star r -> firstpos r
+
+let rec lastpos = function
+ Empty -> []
+ | Chars pos -> [OnChars pos]
+ | Action act -> [ToAction act]
+ | Seq(r1,r2) -> if nullable r2
+ then merge_trans (lastpos r1) (lastpos r2)
+ else lastpos r2
+ | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2)
+ | Star r -> lastpos r
+
+let followpos size name_regexp_list =
+ let v = Array.new size [] in
+ let fill_pos first = function
+ OnChars pos -> v.(pos) <- merge_trans first v.(pos); ()
+ | ToAction _ -> () in
+ let rec fill = function
+ Seq(r1,r2) ->
+ fill r1; fill r2;
+ List.iter (fill_pos (firstpos r2)) (lastpos r1)
+ | Alt(r1,r2) ->
+ fill r1; fill r2
+ | Star r ->
+ fill r;
+ List.iter (fill_pos (firstpos r)) (lastpos r)
+ | _ -> () in
+ List.iter (fun (name, regexp) -> fill regexp) name_regexp_list;
+ v
+
+let no_action = 32767
+
+let split_trans_set = List.fold_left
+ (fun (act, pos_set as act_pos_set) ->
+ function OnChars pos -> (act, pos :: pos_set)
+ | ToAction act1 -> if act1 < act then (act1, pos_set)
+ else act_pos_set)
+ (no_action, [])
+
+let memory = (Hashtbl.new 131 : (transition list, int) Hashtbl.t)
+and todo = ref ([] : (transition list * int) list)
+and next = ref 0
+
+let reset_state_mem () =
+ Hashtbl.clear memory; todo := []; next := 0; ()
+
+let get_state st =
+ try
+ Hashtbl.find memory st
+ with Not_found ->
+ let nbr = !next in
+ incr next;
+ Hashtbl.add memory st nbr;
+ todo := (st, nbr) :: !todo;
+ nbr
+
+let rec map_on_states f =
+ match !todo with
+ [] -> []
+ | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f
+
+let number_of_states () =
+ !next
+
+let goto_state = function
+ [] -> Backtrack
+ | ps -> Goto (get_state ps)
+
+let transition_from chars follow pos_set =
+ let tr = Array.new 256 []
+ and shift = Array.new 256 Backtrack in
+ List.iter
+ (fun pos ->
+ List.iter
+ (fun c ->
+ tr.(Char.code c) <-
+ merge_trans tr.(Char.code c) follow.(pos))
+ chars.(pos))
+ pos_set;
+ for i = 0 to 255 do
+ shift.(i) <- goto_state tr.(i)
+ done;
+ shift
+
+let translate_state chars follow state =
+ match split_trans_set state with
+ n, [] -> Perform n
+ | n, ps -> Shift( (if n == no_action then No_remember else Remember n),
+ transition_from chars follow ps)
+
+let make_dfa lexdef =
+ let (chars, name_regexp_list, actions) =
+ encode_lexdef lexdef in
+ let follow =
+ followpos (Array.length chars) name_regexp_list in
+ reset_state_mem();
+ let initial_states =
+ List.map (fun (name, regexp) -> (name, get_state(firstpos regexp)))
+ name_regexp_list in
+ let states =
+ map_on_states (translate_state chars follow) in
+ let v =
+ Array.new (number_of_states()) (Perform 0) in
+ List.iter (fun (auto, i) -> v.(i) <- auto) states;
+ reset_state_mem();
+ (initial_states, v, actions)
diff --git a/lex/main.ml b/lex/main.ml
new file mode 100644
index 0000000000..aaefe487dd
--- /dev/null
+++ b/lex/main.ml
@@ -0,0 +1,48 @@
+(* The lexer generator. Command-line parsing. *)
+
+open Syntax
+open Lexgen
+open Output
+
+let main () =
+ if Array.length Sys.argv != 2 then begin
+ prerr_endline "Usage: camllex <input file>";
+ exit 2
+ end;
+ let source_name = Sys.argv.(1) in
+ let dest_name =
+ if Filename.check_suffix source_name ".mll" then
+ Filename.chop_suffix source_name ".mll" ^ ".ml"
+ else
+ source_name ^ ".ml" in
+ ic := open_in_bin source_name;
+ oc := open_out dest_name;
+ let lexbuf =
+ Lexing.from_channel !ic in
+ let (Lexdef(header,_) as def) =
+ try
+ Parser.lexer_definition Lexer.main lexbuf
+ with exn ->
+ close_out !oc;
+ Sys.remove dest_name;
+ begin match exn with
+ Parsing.Parse_error ->
+ prerr_string "Syntax error around char ";
+ prerr_int (Lexing.lexeme_start lexbuf);
+ prerr_endline "."
+ | Lexer.Lexical_error s ->
+ prerr_string "Lexical error around char ";
+ prerr_int (Lexing.lexeme_start lexbuf);
+ prerr_string ": ";
+ prerr_string s;
+ prerr_endline "."
+ | _ -> raise exn
+ end;
+ exit 2 in
+ let ((init, states, acts) as dfa) = make_dfa def in
+ output_lexdef header dfa;
+ close_in !ic;
+ close_out !oc
+
+let _ = Printexc.catch main (); exit 0
+
diff --git a/lex/output.ml b/lex/output.ml
new file mode 100644
index 0000000000..b3ca459c1b
--- /dev/null
+++ b/lex/output.ml
@@ -0,0 +1,146 @@
+(* Generating a DFA as a set of mutually recursive functions *)
+
+open Syntax
+
+let ic = ref stdin
+and oc = ref stdout
+
+(* 1- Generating the actions *)
+
+let copy_buffer = String.create 1024
+
+let copy_chunk (Location(start,stop)) =
+ let rec copy s =
+ if s <= 0 then () else
+ let n = if s < 1024 then s else 1024 in
+ let m = input !ic copy_buffer 0 n in
+ output !oc copy_buffer 0 m;
+ copy (s - m)
+ in
+ seek_in !ic start;
+ copy (stop - start)
+
+let output_action (i,act) =
+ output_string !oc ("action_" ^ string_of_int i ^ " lexbuf = (\n");
+ copy_chunk act;
+ output_string !oc ")\nand ";
+ ()
+
+(* 2- Generating the states *)
+
+let states = ref ([||] : automata array)
+
+let enumerate_vect v =
+ let rec enum env pos =
+ if pos >= Array.length v then env else
+ try
+ let pl = List.assoc v.(pos) env in
+ pl := pos :: !pl; enum env (succ pos)
+ with Not_found ->
+ enum ((v.(pos), ref [pos]) :: env) (succ pos) in
+ Sort.list
+ (fun (e1, pl1) (e2, pl2) -> List.length !pl1 >= List.length !pl2)
+ (enum [] 0)
+
+let output_move = function
+ Backtrack ->
+ output_string !oc "backtrack lexbuf"
+ | Goto dest ->
+ match !states.(dest) with
+ Perform act_num ->
+ output_string !oc ("action_" ^ string_of_int act_num ^ " lexbuf")
+ | _ ->
+ output_string !oc ("state_" ^ string_of_int dest ^ " lexbuf")
+
+let output_char_for_read oc = function
+ '\'' -> output_string oc "\\'"
+ | '\\' -> output_string oc "\\\\"
+ | '\n' -> output_string oc "\\n"
+ | '\t' -> output_string oc "\\t"
+ | c ->
+ let n = Char.code c in
+ if n >= 32 & n < 127 then
+ output_char oc c
+ else begin
+ output_char oc '\\';
+ output_char oc (Char.chr (48 + n / 100));
+ output_char oc (Char.chr (48 + (n / 10) mod 10));
+ output_char oc (Char.chr (48 + n mod 10))
+ end
+
+let rec output_chars = function
+ [] ->
+ failwith "output_chars"
+ | [c] ->
+ output_string !oc "'";
+ output_char_for_read !oc (Char.chr c);
+ output_string !oc "'"
+ | c::cl ->
+ output_string !oc "'";
+ output_char_for_read !oc (Char.chr c);
+ output_string !oc "'|";
+ output_chars cl
+
+let output_one_trans (dest, chars) =
+ output_chars !chars;
+ output_string !oc " -> ";
+ output_move dest;
+ output_string !oc "\n | ";
+ ()
+
+let output_all_trans trans =
+ output_string !oc " match get_next_char lexbuf with\n ";
+ match enumerate_vect trans with
+ [] ->
+ failwith "output_all_trans"
+ | (default, _) :: rest ->
+ List.iter output_one_trans rest;
+ output_string !oc "_ -> ";
+ output_move default;
+ output_string !oc "\nand ";
+ ()
+
+let output_state state_num = function
+ Perform i ->
+ ()
+ | Shift(what_to_do, moves) ->
+ output_string !oc
+ ("state_" ^ string_of_int state_num ^ " lexbuf =\n");
+ begin match what_to_do with
+ No_remember -> ()
+ | Remember i ->
+ output_string !oc " lexbuf.lex_last_pos <- lexbuf.lex_curr_pos;\n";
+ output_string !oc (" lexbuf.lex_last_action <- Obj.magic action_" ^
+ string_of_int i ^ ";\n")
+ end;
+ output_all_trans moves
+
+(* 3- Generating the entry points *)
+
+let rec output_entries = function
+ [] -> failwith "output_entries"
+ | (name,state_num) :: rest ->
+ output_string !oc (name ^ " lexbuf =\n");
+ output_string !oc " start_lexing lexbuf;\n";
+ output_string !oc (" state_" ^ string_of_int state_num ^ " lexbuf\n");
+ match rest with
+ [] -> output_string !oc "\n"; ()
+ | _ -> output_string !oc "\nand "; output_entries rest
+
+(* All together *)
+
+let output_lexdef header (initial_st, st, actions) =
+ print_int (Array.length st); print_string " states, ";
+ print_int (List.length actions); print_string " actions.";
+ print_newline();
+ output_string !oc "open Obj\nopen Lexing\n\n";
+ copy_chunk header;
+ output_string !oc "\nlet rec ";
+ states := st;
+ List.iter output_action actions;
+ for i = 0 to Array.length st - 1 do
+ output_state i st.(i)
+ done;
+ output_entries initial_st
+
+
diff --git a/lex/parser.mly b/lex/parser.mly
new file mode 100644
index 0000000000..e8851df101
--- /dev/null
+++ b/lex/parser.mly
@@ -0,0 +1,120 @@
+/* The grammar for lexer definitions */
+
+%{
+open Syntax
+
+(* Auxiliaries for the parser. *)
+
+let regexp_for_string s =
+ let rec re_string n =
+ if n >= String.length s then Epsilon
+ else if succ n = String.length s then Characters([s.[n]])
+ else Sequence(Characters([s.[n]]), re_string (succ n))
+ in re_string 0
+
+let char_class c1 c2 =
+ let rec class n =
+ if n > (Char.code c2) then [] else (Char.chr n) :: class(succ n)
+ in class (Char.code c1)
+
+let all_chars = char_class (Char.chr 1) (Char.chr 255)
+
+let rec subtract l1 l2 =
+ match l1 with
+ [] -> []
+ | a::r -> if List.mem a l2 then subtract r l2 else a :: subtract r l2
+%}
+
+%token <string> Tident
+%token <char> Tchar
+%token <string> Tstring
+%token <Syntax.location> Taction
+%token Trule Tparse Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket
+%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash
+
+%left Tor
+%left CONCAT
+%nonassoc Tmaybe
+%left Tstar
+%left Tplus
+
+%start lexer_definition
+%type <Syntax.lexer_definition> lexer_definition
+
+%%
+
+lexer_definition:
+ header Trule definition other_definitions Tend
+ { Lexdef($1, $3::(List.rev $4)) }
+;
+header:
+ Taction
+ { $1 }
+ |
+ { Location(0,0) }
+;
+other_definitions:
+ other_definitions Tand definition
+ { $3::$1 }
+ |
+ { [] }
+;
+definition:
+ Tident Tequal entry
+ { ($1,$3) }
+;
+entry:
+ Tparse case rest_of_entry
+ { $2::List.rev $3 }
+;
+rest_of_entry:
+ rest_of_entry Tor case
+ { $3::$1 }
+ |
+ { [] }
+;
+case:
+ regexp Taction
+ { ($1,$2) }
+;
+regexp:
+ Tunderscore
+ { Characters all_chars }
+ | Teof
+ { Characters ['\000'] }
+ | Tchar
+ { Characters [$1] }
+ | Tstring
+ { regexp_for_string $1 }
+ | Tlbracket char_class Trbracket
+ { Characters $2 }
+ | regexp Tstar
+ { Repetition $1 }
+ | regexp Tmaybe
+ { Alternative($1, Epsilon) }
+ | regexp Tplus
+ { Sequence($1, Repetition $1) }
+ | regexp Tor regexp
+ { Alternative($1,$3) }
+ | regexp regexp %prec CONCAT
+ { Sequence($1,$2) }
+ | Tlparen regexp Trparen
+ { $2 }
+;
+char_class:
+ Tcaret char_class1
+ { subtract all_chars $2 }
+ | char_class1
+ { $1 }
+;
+char_class1:
+ Tchar Tdash Tchar
+ { char_class $1 $3 }
+ | Tchar
+ { [$1] }
+ | char_class1 char_class1 %prec CONCAT
+ { $1 @ $2 }
+;
+
+%%
+
diff --git a/lex/syntax.ml b/lex/syntax.ml
new file mode 100644
index 0000000000..f692e6f625
--- /dev/null
+++ b/lex/syntax.ml
@@ -0,0 +1,26 @@
+(* The shallow abstract syntax *)
+
+type location =
+ Location of int * int
+
+type regular_expression =
+ Epsilon
+ | Characters of char list
+ | Sequence of regular_expression * regular_expression
+ | Alternative of regular_expression * regular_expression
+ | Repetition of regular_expression
+
+type lexer_definition =
+ Lexdef of location * (string * (regular_expression * location) list) list
+
+(* Representation of automata *)
+
+type automata =
+ Perform of int
+ | Shift of automata_trans * automata_move array
+and automata_trans =
+ No_remember
+ | Remember of int
+and automata_move =
+ Backtrack
+ | Goto of int