9 files changed, 763 insertions, 0 deletions
diff --git a/lex/.depend b/lex/.depend
new file mode 100644
index 0000000000..0379bb77b3
--- /dev/null
+++ b/lex/.depend
@@ -0,0 +1,7 @@
+lexer.cmi: parser.cmi 
+parser.cmi: syntax.cmo 
+lexer.cmo: lexer.cmi parser.cmi syntax.cmo 
+lexgen.cmo: syntax.cmo 
+main.cmo: lexgen.cmo parser.cmi output.cmo syntax.cmo lexer.cmi 
+output.cmo: syntax.cmo 
+parser.cmo: parser.cmi syntax.cmo 
diff --git a/lex/Makefile b/lex/Makefile
new file mode 100644
index 0000000000..4104fe77c2
--- /dev/null
+++ b/lex/Makefile
@@ -0,0 +1,51 @@
+# The lexer generator
+
+CAMLC=../boot/camlrun ../boot/camlc -I ../boot
+COMPFLAGS=
+LINKFLAGS=
+CAMLYACC=../boot/camlyacc
+YACCFLAGS=
+CAMLLEX=../boot/camlrun ../boot/camllex
+CAMLDEP=../tools/camldep
+DEPFLAGS=
+
+OBJS=syntax.cmo parser.cmo lexer.cmo lexgen.cmo output.cmo main.cmo
+
+all: camllex
+
+camllex: $(OBJS)
+	$(CAMLC) $(LINKFLAGS) -o camllex $(OBJS)
+
+clean::
+	rm -f camllex
+	rm -f *.cmo *.cmi camllex
+
+parser.ml parser.mli: parser.mly
+	$(CAMLYACC) $(YACCFLAGS) parser.mly
+
+clean::
+	rm -f parser.ml parser.mli
+
+beforedepend:: parser.ml parser.mli
+
+lexer.ml: lexer.mll
+	$(CAMLLEX) lexer.mll
+
+clean::
+	rm -f lexer.ml
+
+beforedepend:: lexer.ml
+
+.SUFFIXES:
+.SUFFIXES: .ml .cmo .mli .cmi
+
+.ml.cmo:
+	$(CAMLC) -c $(COMPFLAGS) $<
+
+.mli.cmi:
+	$(CAMLC) -c $(COMPFLAGS) $<
+
+depend: beforedepend
+	$(CAMLDEP) *.mli *.ml > .depend
+
+include .depend
diff --git a/lex/lexer.mli b/lex/lexer.mli
new file mode 100644
index 0000000000..6e0b4a5073
--- /dev/null
+++ b/lex/lexer.mli
@@ -0,0 +1,3 @@
+val main: Lexing.lexbuf -> Parser.token
+
+exception Lexical_error of string
diff --git a/lex/lexer.mll b/lex/lexer.mll
new file mode 100644
index 0000000000..41d05c421a
--- /dev/null
+++ b/lex/lexer.mll
@@ -0,0 +1,159 @@
+(* The lexical analyzer for lexer definitions. Bootstrapped! *)
+
+{
+open Syntax
+open Parser
+
+(* Auxiliaries for the lexical analyzer *)
+
+let brace_depth = ref 0
+and comment_depth = ref 0
+
+exception Lexical_error of string
+
+let initial_string_buffer = String.create 256
+let string_buff = ref initial_string_buffer
+let string_index = ref 0
+
+let reset_string_buffer () =
+  string_buff := initial_string_buffer;
+  string_index := 0
+
+let store_string_char c =
+  (if !string_index >= String.length (!string_buff) then
+    let new_buff = String.create (String.length (!string_buff) * 2) in
+      String.blit new_buff 0 (!string_buff) 0 (String.length (!string_buff));
+      string_buff := new_buff;
+      ());
+  !string_buff.[!string_index] <- c;
+  incr string_index
+
+let get_stored_string () =
+  let s = String.sub (!string_buff) 0 (!string_index) in
+    string_buff := initial_string_buffer;
+    s
+
+let char_for_backslash = function
+    'n' -> '\010' (* '\n' when bootstrapped *)
+  | 't' -> '\009' (* '\t' *)
+  | 'b' -> '\008' (* '\b' *)
+  | 'r' -> '\013' (* '\r' *)
+  | c   -> c
+
+let char_for_decimal_code lexbuf i =
+  Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
+               10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
+                    (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48))
+
+}
+
+rule main = parse
+    [' ' '\010' '\013' '\009' ] + 
+    { main lexbuf }
+  | "(*" 
+    { comment_depth := 1;
+      comment lexbuf;
+      main lexbuf }
+  | ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '\'' '_' '0'-'9'] *
+    { match Lexing.lexeme lexbuf with
+        "rule" -> Trule
+      | "parse" -> Tparse
+      | "and" -> Tand
+      | "eof" -> Teof
+      | s -> Tident s }
+  | '"' 
+    { reset_string_buffer();
+      string lexbuf;
+      Tstring(get_stored_string()) }
+  | "`" [^ '\\'] "`" 
+    { Tchar(Lexing.lexeme_char lexbuf 1) }
+  | "`" '\\' ['\\' '`' 'n' 't' 'b' 'r'] "`" 
+    { Tchar(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
+  | "`" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "`" 
+    { Tchar(char_for_decimal_code lexbuf 2) }
+  | "'" [^ '\\'] "'" 
+    { Tchar(Lexing.lexeme_char lexbuf 1) }
+  | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" 
+    { Tchar(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
+  | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" 
+    { Tchar(char_for_decimal_code lexbuf 2) }
+  | '{' 
+    { let n1 = Lexing.lexeme_end lexbuf in
+        brace_depth := 1;
+        let n2 = action lexbuf in
+          Taction(Location(n1, n2)) }
+  | '='  { Tequal }
+  | '|'  { Tor }
+  | '_'  { Tunderscore }
+  | "eof"  { Teof }
+  | '['  { Tlbracket }
+  | ']'  { Trbracket }
+  | '*'  { Tstar }
+  | '?'  { Tmaybe }
+  | '+'  { Tplus }
+  | '('  { Tlparen }
+  | ')'  { Trparen }
+  | '^'  { Tcaret }
+  | '-'  { Tdash }
+  | eof  { Tend }
+  | _
+    { raise(Lexical_error
+             ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf))) }
+
+and action = parse
+    '{' 
+    { incr brace_depth;
+      action lexbuf }
+  | '}' 
+    { decr brace_depth;
+      if !brace_depth == 0 then Lexing.lexeme_start lexbuf else action lexbuf }
+  | '"' 
+    { reset_string_buffer();
+      string lexbuf;
+      reset_string_buffer();
+      action lexbuf }
+  | "'{'"
+    { action lexbuf }
+  | "'{'"
+    { action lexbuf }
+  | "(*" 
+    { comment_depth := 1;
+      comment lexbuf;
+      action lexbuf }
+  | eof 
+    { raise (Lexical_error "unterminated action") }
+  | _ 
+    { action lexbuf }
+      
+and string = parse
+    '"' 
+    { () }
+  | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] +
+    { string lexbuf }
+  | '\\' ['\\' '"' 'n' 't' 'b' 'r'] 
+    { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
+      string lexbuf }
+  | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] 
+    { store_string_char(char_for_decimal_code lexbuf 1);
+      string lexbuf }
+  | eof 
+    { raise(Lexical_error "unterminated string") }
+  | _ 
+    { store_string_char(Lexing.lexeme_char lexbuf 0);
+      string lexbuf }
+
+and comment = parse
+    "(*" 
+    { incr comment_depth; comment lexbuf }
+  | "*)" 
+    { decr comment_depth;
+      if !comment_depth == 0 then () else comment lexbuf }
+  | '"' 
+    { reset_string_buffer();
+      string lexbuf;
+      reset_string_buffer();
+      comment lexbuf }
+  | eof 
+    { raise(Lexical_error "unterminated comment") }
+  | _ 
+    { comment lexbuf }
diff --git a/lex/lexgen.ml b/lex/lexgen.ml
new file mode 100644
index 0000000000..6870419470
--- /dev/null
+++ b/lex/lexgen.ml
@@ -0,0 +1,203 @@
+(* Compiling a lexer definition *)
+
+open Syntax
+
+(* Deep abstract syntax for regular expressions *)
+
+type regexp =
+    Empty
+  | Chars of int
+  | Action of int
+  | Seq of regexp * regexp
+  | Alt of regexp * regexp
+  | Star of regexp
+
+(* From shallow to deep syntax *)
+
+let chars = ref ([] : char list list)
+let chars_count = ref 0
+let actions = ref ([] : (int * location) list)
+let actions_count = ref 0
+
+let rec encode_regexp = function
+    Epsilon -> Empty
+  | Characters cl ->
+      let n = !chars_count in
+      chars := cl :: !chars;
+      incr chars_count;
+      Chars(n)
+  | Sequence(r1,r2) ->
+      Seq(encode_regexp r1, encode_regexp r2)
+  | Alternative(r1,r2) ->
+      Alt(encode_regexp r1, encode_regexp r2)
+  | Repetition r ->
+      Star (encode_regexp r)
+
+let encode_casedef =
+  List.fold_left
+   (fun reg (expr,act) ->
+     let act_num = !actions_count in
+     incr actions_count;
+     actions := (act_num, act) :: !actions;
+     Alt(reg, Seq(encode_regexp expr, Action act_num)))
+  Empty
+
+let encode_lexdef (Lexdef(_, ld)) =
+  chars := [];
+  chars_count := 0;
+  actions := [];
+  actions_count := 0;
+  let name_regexp_list =
+    List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in
+  let chr = Array.of_list (List.rev !chars)
+  and act = !actions in
+  chars := [];
+  actions := [];
+  (chr, name_regexp_list, act)
+
+(* To generate directly a NFA from a regular expression.
+   Confer Aho-Sethi-Ullman, dragon book, chap. 3 *)
+
+type transition =
+    OnChars of int
+  | ToAction of int
+
+let rec merge_trans s1 s2 =
+  match (s1, s2) with
+    ([], _) -> s2
+  | (_, []) -> s1
+  | ((OnChars n1 as t1) :: r1, (OnChars n2 as t2) :: r2) ->
+      if n1 == n2 then t1 :: merge_trans r1 r2 else
+      if n1 <  n2 then t1 :: merge_trans r1 s2 else
+                       t2 :: merge_trans s1 r2
+  | ((ToAction n1 as t1) :: r1, (ToAction n2 as t2) :: r2) ->
+      if n1 == n2 then t1 :: merge_trans r1 r2 else
+      if n1 <  n2 then t1 :: merge_trans r1 s2 else
+                       t2 :: merge_trans s1 r2
+  | ((OnChars n1 as t1) :: r1, (ToAction n2 as t2) :: r2) ->
+      t1 :: merge_trans r1 s2
+  | ((ToAction n1 as t1) :: r1, (OnChars n2 as t2) :: r2) ->
+      t2 :: merge_trans s1 r2
+
+let rec nullable = function
+    Empty      -> true
+  | Chars _    -> false
+  | Action _   -> false
+  | Seq(r1,r2) -> nullable r1 & nullable r2
+  | Alt(r1,r2) -> nullable r1 or nullable r2
+  | Star r     -> true
+
+let rec firstpos = function
+    Empty      -> []
+  | Chars pos  -> [OnChars pos]
+  | Action act -> [ToAction act]
+  | Seq(r1,r2) -> if nullable r1
+                  then merge_trans (firstpos r1) (firstpos r2)
+                  else firstpos r1
+  | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2)
+  | Star r     -> firstpos r
+
+let rec lastpos = function
+    Empty      -> []
+  | Chars pos  -> [OnChars pos]
+  | Action act -> [ToAction act]
+  | Seq(r1,r2) -> if nullable r2
+                  then merge_trans (lastpos r1) (lastpos r2)
+                  else lastpos r2
+  | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2)
+  | Star r     -> lastpos r
+
+let followpos size name_regexp_list =
+  let v = Array.new size [] in
+    let fill_pos first = function
+        OnChars pos -> v.(pos) <- merge_trans first v.(pos); ()
+      | ToAction _  -> () in
+    let rec fill = function
+        Seq(r1,r2) ->
+          fill r1; fill r2;
+          List.iter (fill_pos (firstpos r2)) (lastpos r1)
+      | Alt(r1,r2) ->
+          fill r1; fill r2
+      | Star r ->
+          fill r;
+          List.iter (fill_pos (firstpos r)) (lastpos r)
+      | _ -> () in
+    List.iter (fun (name, regexp) -> fill regexp) name_regexp_list;
+    v
+
+let no_action = 32767
+
+let split_trans_set = List.fold_left
+  (fun (act, pos_set as act_pos_set) ->
+     function OnChars pos   -> (act, pos :: pos_set)
+         |    ToAction act1 -> if act1 < act then (act1, pos_set)
+                                             else act_pos_set)
+  (no_action, [])
+
+let memory  = (Hashtbl.new 131 : (transition list, int) Hashtbl.t)
+and todo    = ref ([] : (transition list * int) list)
+and next    = ref 0
+
+let reset_state_mem () =
+  Hashtbl.clear memory; todo := []; next := 0; ()
+
+let get_state st = 
+  try
+    Hashtbl.find memory st
+  with Not_found ->
+    let nbr = !next in
+    incr next;
+    Hashtbl.add memory st nbr;
+    todo := (st, nbr) :: !todo;
+    nbr
+
+let rec map_on_states f =
+  match !todo with
+    []  -> []
+  | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f
+
+let number_of_states () =
+  !next
+
+let goto_state = function
+    [] -> Backtrack
+  | ps -> Goto (get_state ps)
+
+let transition_from chars follow pos_set = 
+  let tr = Array.new 256 []
+  and shift = Array.new 256 Backtrack in
+    List.iter
+      (fun pos ->
+        List.iter
+          (fun c ->
+             tr.(Char.code c) <-
+               merge_trans tr.(Char.code c) follow.(pos))
+          chars.(pos))
+      pos_set;
+    for i = 0 to 255 do
+      shift.(i) <- goto_state tr.(i)
+    done;
+    shift
+
+let translate_state chars follow state =
+  match split_trans_set state with
+    n, [] -> Perform n
+  | n, ps -> Shift( (if n == no_action then No_remember else Remember n),
+                    transition_from chars follow ps)
+
+let make_dfa lexdef =
+  let (chars, name_regexp_list, actions) =
+    encode_lexdef lexdef in
+  let follow =
+    followpos (Array.length chars) name_regexp_list in
+  reset_state_mem();
+  let initial_states =
+    List.map (fun (name, regexp) -> (name, get_state(firstpos regexp)))
+        name_regexp_list in
+  let states =
+    map_on_states (translate_state chars follow) in
+  let v =
+    Array.new (number_of_states()) (Perform 0) in
+  List.iter (fun (auto, i) -> v.(i) <- auto) states;
+  reset_state_mem();
+  (initial_states, v, actions)
diff --git a/lex/main.ml b/lex/main.ml
new file mode 100644
index 0000000000..aaefe487dd
--- /dev/null
+++ b/lex/main.ml
@@ -0,0 +1,48 @@
+(* The lexer generator. Command-line parsing. *)
+
+open Syntax
+open Lexgen
+open Output
+
+let main () =
+  if Array.length Sys.argv != 2 then begin
+    prerr_endline "Usage: camllex <input file>";
+    exit 2
+  end;
+  let source_name = Sys.argv.(1) in
+  let dest_name =
+    if Filename.check_suffix source_name ".mll" then
+      Filename.chop_suffix source_name ".mll" ^ ".ml"
+    else
+      source_name ^ ".ml" in
+  ic := open_in_bin source_name;
+  oc := open_out dest_name;
+  let lexbuf =
+    Lexing.from_channel !ic in
+  let (Lexdef(header,_) as def) =
+    try
+      Parser.lexer_definition Lexer.main lexbuf
+    with exn ->
+      close_out !oc;
+      Sys.remove dest_name;
+      begin match exn with
+        Parsing.Parse_error ->
+          prerr_string "Syntax error around char ";
+          prerr_int (Lexing.lexeme_start lexbuf);
+          prerr_endline "."
+      | Lexer.Lexical_error s ->
+          prerr_string "Lexical error around char ";
+          prerr_int (Lexing.lexeme_start lexbuf);
+          prerr_string ": ";
+          prerr_string s;
+          prerr_endline "."
+      | _ -> raise exn
+      end;
+      exit 2 in
+  let ((init, states, acts) as dfa) = make_dfa def in
+  output_lexdef header dfa;
+  close_in !ic;
+  close_out !oc
+
+let _ = Printexc.catch main (); exit 0
+
diff --git a/lex/output.ml b/lex/output.ml
new file mode 100644
index 0000000000..b3ca459c1b
--- /dev/null
+++ b/lex/output.ml
@@ -0,0 +1,146 @@
+(* Generating a DFA as a set of mutually recursive functions *)
+
+open Syntax
+
+let ic = ref stdin
+and oc = ref stdout
+
+(* 1- Generating the actions *)
+
+let copy_buffer = String.create 1024
+
+let copy_chunk (Location(start,stop)) =
+  let rec copy s =
+    if s <= 0 then () else
+      let n = if s < 1024 then s else 1024 in
+      let m = input !ic copy_buffer 0 n in
+        output !oc copy_buffer 0 m;
+        copy (s - m)
+  in
+    seek_in !ic start;
+    copy (stop - start)
+
+let output_action (i,act) =
+  output_string !oc ("action_" ^ string_of_int i ^ " lexbuf = (\n");
+  copy_chunk act;
+  output_string !oc ")\nand ";
+  ()
+
+(* 2- Generating the states *)
+
+let states = ref ([||] : automata array)
+
+let enumerate_vect v =
+  let rec enum env pos =
+    if pos >= Array.length v then env else
+      try
+        let pl = List.assoc v.(pos) env in
+          pl := pos :: !pl; enum env (succ pos)
+        with Not_found ->
+          enum ((v.(pos), ref [pos]) :: env) (succ pos) in
+    Sort.list
+      (fun (e1, pl1) (e2, pl2) -> List.length !pl1 >= List.length !pl2)
+      (enum [] 0)
+
+let output_move = function
+    Backtrack ->
+      output_string !oc "backtrack lexbuf"
+  | Goto dest ->
+      match !states.(dest) with
+        Perform act_num ->
+          output_string !oc ("action_" ^ string_of_int act_num ^ " lexbuf")
+      | _ ->
+          output_string !oc ("state_" ^ string_of_int dest ^ " lexbuf")
+
+let output_char_for_read oc = function
+    '\''  -> output_string oc "\\'"
+  | '\\' -> output_string oc "\\\\"
+  | '\n' -> output_string oc "\\n"
+  | '\t' -> output_string oc "\\t"
+  | c ->
+      let n = Char.code c in
+      if n >= 32 & n < 127 then
+        output_char oc c
+      else begin
+        output_char oc '\\';
+        output_char oc (Char.chr (48 + n / 100));
+        output_char oc (Char.chr (48 + (n / 10) mod 10));
+        output_char oc (Char.chr (48 + n mod 10))
+      end
+
+let rec output_chars = function
+    [] ->
+      failwith "output_chars"
+  | [c] ->
+      output_string !oc "'";
+      output_char_for_read !oc (Char.chr c);
+      output_string !oc "'"
+  | c::cl ->
+      output_string !oc "'";
+      output_char_for_read !oc (Char.chr c);
+      output_string !oc "'|";
+      output_chars cl
+
+let output_one_trans (dest, chars) =
+  output_chars !chars;
+  output_string !oc " -> ";
+  output_move dest;
+  output_string !oc "\n |  ";
+  ()
+
+let output_all_trans trans =
+  output_string !oc "  match get_next_char lexbuf with\n    ";
+  match enumerate_vect trans with
+    [] ->
+      failwith "output_all_trans"
+  | (default, _) :: rest ->
+      List.iter output_one_trans rest;
+      output_string !oc "_ -> ";
+      output_move default;
+      output_string !oc "\nand ";
+      ()
+
+let output_state state_num = function
+    Perform i ->
+      ()
+  | Shift(what_to_do, moves) ->
+      output_string !oc
+        ("state_"  ^ string_of_int state_num ^ " lexbuf =\n");
+      begin match what_to_do with
+        No_remember -> ()
+      | Remember i ->
+          output_string !oc "  lexbuf.lex_last_pos <- lexbuf.lex_curr_pos;\n";
+          output_string !oc ("  lexbuf.lex_last_action <- Obj.magic action_" ^
+                             string_of_int i ^ ";\n")
+      end;
+      output_all_trans moves
+
+(* 3- Generating the entry points *)
+          
+let rec output_entries = function
+    [] -> failwith "output_entries"
+  | (name,state_num) :: rest ->
+      output_string !oc (name ^ " lexbuf =\n");
+      output_string !oc "  start_lexing lexbuf;\n";
+      output_string !oc ("  state_" ^ string_of_int state_num ^ " lexbuf\n");
+      match rest with
+        [] -> output_string !oc "\n"; ()
+      | _  -> output_string !oc "\nand "; output_entries rest
+
+(* All together *)
+
+let output_lexdef header (initial_st, st, actions) =
+  print_int (Array.length st); print_string " states, ";
+  print_int (List.length actions); print_string " actions.";
+  print_newline();
+  output_string !oc "open Obj\nopen Lexing\n\n";
+  copy_chunk header;
+  output_string !oc "\nlet rec ";
+  states := st;
+  List.iter output_action actions;
+  for i = 0 to Array.length st - 1 do
+    output_state i st.(i)
+  done;
+  output_entries initial_st
+
+
diff --git a/lex/parser.mly b/lex/parser.mly
new file mode 100644
index 0000000000..e8851df101
--- /dev/null
+++ b/lex/parser.mly
@@ -0,0 +1,120 @@
+/* The grammar for lexer definitions */
+
+%{
+open Syntax
+
+(* Auxiliaries for the parser. *)
+
+let regexp_for_string s =
+  let rec re_string n =
+    if n >= String.length s then Epsilon
+    else if succ n = String.length s then Characters([s.[n]])
+    else Sequence(Characters([s.[n]]), re_string (succ n))
+  in re_string 0
+
+let char_class c1 c2 =
+  let rec class n =
+    if n > (Char.code c2) then [] else (Char.chr n) :: class(succ n)
+  in class (Char.code c1)
+
+let all_chars = char_class (Char.chr 1) (Char.chr 255)
+
+let rec subtract l1 l2 =
+  match l1 with
+    [] -> []
+  | a::r -> if List.mem a l2 then subtract r l2 else a :: subtract r l2
+%}
+
+%token <string> Tident
+%token <char> Tchar
+%token <string> Tstring
+%token <Syntax.location> Taction
+%token Trule Tparse Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket
+%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash
+
+%left Tor
+%left CONCAT
+%nonassoc Tmaybe
+%left Tstar
+%left Tplus
+
+%start lexer_definition
+%type <Syntax.lexer_definition> lexer_definition
+
+%%
+
+lexer_definition:
+    header Trule definition other_definitions Tend
+        { Lexdef($1, $3::(List.rev $4)) }
+;
+header:
+    Taction
+        { $1 }
+  |
+        { Location(0,0) }
+;
+other_definitions:
+    other_definitions Tand definition
+        { $3::$1 }
+  |     
+        { [] }
+;
+definition:
+    Tident Tequal entry
+        { ($1,$3) }
+;
+entry:
+    Tparse case rest_of_entry
+        { $2::List.rev $3 }
+;
+rest_of_entry:
+    rest_of_entry Tor case
+        { $3::$1 }
+  |
+        { [] }
+;
+case:
+    regexp Taction
+        { ($1,$2) }
+;
+regexp:
+    Tunderscore
+        { Characters all_chars }
+  | Teof
+        { Characters ['\000'] }
+  | Tchar
+        { Characters [$1] }
+  | Tstring
+        { regexp_for_string $1 }
+  | Tlbracket char_class Trbracket
+        { Characters $2 }
+  | regexp Tstar
+        { Repetition $1 }
+  | regexp Tmaybe
+        { Alternative($1, Epsilon) }
+  | regexp Tplus
+        { Sequence($1, Repetition $1) }
+  | regexp Tor regexp
+        { Alternative($1,$3) }
+  | regexp regexp %prec CONCAT
+        { Sequence($1,$2) }
+  | Tlparen regexp Trparen
+        { $2 }
+;
+char_class:
+    Tcaret char_class1
+        { subtract all_chars $2 }
+  | char_class1
+        { $1 }
+;
+char_class1:
+    Tchar Tdash Tchar
+        { char_class $1 $3 }
+  | Tchar
+        { [$1] }
+  | char_class1 char_class1 %prec CONCAT
+        { $1 @ $2 }
+;
+
+%%
+
diff --git a/lex/syntax.ml b/lex/syntax.ml
new file mode 100644
index 0000000000..f692e6f625
--- /dev/null
+++ b/lex/syntax.ml
@@ -0,0 +1,26 @@
+(* The shallow abstract syntax *)
+
+type location =
+    Location of int * int
+
+type regular_expression =
+    Epsilon
+  | Characters of char list
+  | Sequence of regular_expression * regular_expression
+  | Alternative of regular_expression * regular_expression
+  | Repetition of regular_expression
+
+type lexer_definition =
+    Lexdef of location * (string * (regular_expression * location) list) list
+
+(* Representation of automata *)
+
+type automata =
+    Perform of int
+  | Shift of automata_trans * automata_move array
+and automata_trans =
+    No_remember
+  | Remember of int
+and automata_move =
+    Backtrack
+  | Goto of int