diff options
Diffstat (limited to 'lex')
-rw-r--r-- | lex/.depend | 7 | ||||
-rw-r--r-- | lex/Makefile | 51 | ||||
-rw-r--r-- | lex/lexer.mli | 3 | ||||
-rw-r--r-- | lex/lexer.mll | 159 | ||||
-rw-r--r-- | lex/lexgen.ml | 203 | ||||
-rw-r--r-- | lex/main.ml | 48 | ||||
-rw-r--r-- | lex/output.ml | 146 | ||||
-rw-r--r-- | lex/parser.mly | 120 | ||||
-rw-r--r-- | lex/syntax.ml | 26 |
9 files changed, 763 insertions, 0 deletions
diff --git a/lex/.depend b/lex/.depend new file mode 100644 index 0000000000..0379bb77b3 --- /dev/null +++ b/lex/.depend @@ -0,0 +1,7 @@ +lexer.cmi: parser.cmi +parser.cmi: syntax.cmo +lexer.cmo: lexer.cmi parser.cmi syntax.cmo +lexgen.cmo: syntax.cmo +main.cmo: lexgen.cmo parser.cmi output.cmo syntax.cmo lexer.cmi +output.cmo: syntax.cmo +parser.cmo: parser.cmi syntax.cmo diff --git a/lex/Makefile b/lex/Makefile new file mode 100644 index 0000000000..4104fe77c2 --- /dev/null +++ b/lex/Makefile @@ -0,0 +1,51 @@ +# The lexer generator + +CAMLC=../boot/camlrun ../boot/camlc -I ../boot +COMPFLAGS= +LINKFLAGS= +CAMLYACC=../boot/camlyacc +YACCFLAGS= +CAMLLEX=../boot/camlrun ../boot/camllex +CAMLDEP=../tools/camldep +DEPFLAGS= + +OBJS=syntax.cmo parser.cmo lexer.cmo lexgen.cmo output.cmo main.cmo + +all: camllex + +camllex: $(OBJS) + $(CAMLC) $(LINKFLAGS) -o camllex $(OBJS) + +clean:: + rm -f camllex + rm -f *.cmo *.cmi camllex + +parser.ml parser.mli: parser.mly + $(CAMLYACC) $(YACCFLAGS) parser.mly + +clean:: + rm -f parser.ml parser.mli + +beforedepend:: parser.ml parser.mli + +lexer.ml: lexer.mll + $(CAMLLEX) lexer.mll + +clean:: + rm -f lexer.ml + +beforedepend:: lexer.ml + +.SUFFIXES: +.SUFFIXES: .ml .cmo .mli .cmi + +.ml.cmo: + $(CAMLC) -c $(COMPFLAGS) $< + +.mli.cmi: + $(CAMLC) -c $(COMPFLAGS) $< + +depend: beforedepend + $(CAMLDEP) *.mli *.ml > .depend + +include .depend diff --git a/lex/lexer.mli b/lex/lexer.mli new file mode 100644 index 0000000000..6e0b4a5073 --- /dev/null +++ b/lex/lexer.mli @@ -0,0 +1,3 @@ +val main: Lexing.lexbuf -> Parser.token + +exception Lexical_error of string diff --git a/lex/lexer.mll b/lex/lexer.mll new file mode 100644 index 0000000000..41d05c421a --- /dev/null +++ b/lex/lexer.mll @@ -0,0 +1,159 @@ +(* The lexical analyzer for lexer definitions. Bootstrapped! *) + +{ +open Syntax +open Parser + +(* Auxiliaries for the lexical analyzer *) + +let brace_depth = ref 0 +and comment_depth = ref 0 + +exception Lexical_error of string + +let initial_string_buffer = String.create 256 +let string_buff = ref initial_string_buffer +let string_index = ref 0 + +let reset_string_buffer () = + string_buff := initial_string_buffer; + string_index := 0 + +let store_string_char c = + (if !string_index >= String.length (!string_buff) then + let new_buff = String.create (String.length (!string_buff) * 2) in + String.blit new_buff 0 (!string_buff) 0 (String.length (!string_buff)); + string_buff := new_buff; + ()); + !string_buff.[!string_index] <- c; + incr string_index + +let get_stored_string () = + let s = String.sub (!string_buff) 0 (!string_index) in + string_buff := initial_string_buffer; + s + +let char_for_backslash = function + 'n' -> '\010' (* '\n' when bootstrapped *) + | 't' -> '\009' (* '\t' *) + | 'b' -> '\008' (* '\b' *) + | 'r' -> '\013' (* '\r' *) + | c -> c + +let char_for_decimal_code lexbuf i = + Char.chr(100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48)) + +} + +rule main = parse + [' ' '\010' '\013' '\009' ] + + { main lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + main lexbuf } + | ['A'-'Z' 'a'-'z'] ['A'-'Z' 'a'-'z' '\'' '_' '0'-'9'] * + { match Lexing.lexeme lexbuf with + "rule" -> Trule + | "parse" -> Tparse + | "and" -> Tand + | "eof" -> Teof + | s -> Tident s } + | '"' + { reset_string_buffer(); + string lexbuf; + Tstring(get_stored_string()) } + | "`" [^ '\\'] "`" + { Tchar(Lexing.lexeme_char lexbuf 1) } + | "`" '\\' ['\\' '`' 'n' 't' 'b' 'r'] "`" + { Tchar(char_for_backslash (Lexing.lexeme_char lexbuf 2)) } + | "`" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "`" + { Tchar(char_for_decimal_code lexbuf 2) } + | "'" [^ '\\'] "'" + { Tchar(Lexing.lexeme_char lexbuf 1) } + | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { Tchar(char_for_backslash (Lexing.lexeme_char lexbuf 2)) } + | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { Tchar(char_for_decimal_code lexbuf 2) } + | '{' + { let n1 = Lexing.lexeme_end lexbuf in + brace_depth := 1; + let n2 = action lexbuf in + Taction(Location(n1, n2)) } + | '=' { Tequal } + | '|' { Tor } + | '_' { Tunderscore } + | "eof" { Teof } + | '[' { Tlbracket } + | ']' { Trbracket } + | '*' { Tstar } + | '?' { Tmaybe } + | '+' { Tplus } + | '(' { Tlparen } + | ')' { Trparen } + | '^' { Tcaret } + | '-' { Tdash } + | eof { Tend } + | _ + { raise(Lexical_error + ("illegal character " ^ String.escaped(Lexing.lexeme lexbuf))) } + +and action = parse + '{' + { incr brace_depth; + action lexbuf } + | '}' + { decr brace_depth; + if !brace_depth == 0 then Lexing.lexeme_start lexbuf else action lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + action lexbuf } + | "'{'" + { action lexbuf } + | "'{'" + { action lexbuf } + | "(*" + { comment_depth := 1; + comment lexbuf; + action lexbuf } + | eof + { raise (Lexical_error "unterminated action") } + | _ + { action lexbuf } + +and string = parse + '"' + { () } + | '\\' [' ' '\010' '\013' '\009' '\026' '\012'] + + { string lexbuf } + | '\\' ['\\' '"' 'n' 't' 'b' 'r'] + { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); + string lexbuf } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] + { store_string_char(char_for_decimal_code lexbuf 1); + string lexbuf } + | eof + { raise(Lexical_error "unterminated string") } + | _ + { store_string_char(Lexing.lexeme_char lexbuf 0); + string lexbuf } + +and comment = parse + "(*" + { incr comment_depth; comment lexbuf } + | "*)" + { decr comment_depth; + if !comment_depth == 0 then () else comment lexbuf } + | '"' + { reset_string_buffer(); + string lexbuf; + reset_string_buffer(); + comment lexbuf } + | eof + { raise(Lexical_error "unterminated comment") } + | _ + { comment lexbuf } diff --git a/lex/lexgen.ml b/lex/lexgen.ml new file mode 100644 index 0000000000..6870419470 --- /dev/null +++ b/lex/lexgen.ml @@ -0,0 +1,203 @@ +(* Compiling a lexer definition *) + +open Syntax + +(* Deep abstract syntax for regular expressions *) + +type regexp = + Empty + | Chars of int + | Action of int + | Seq of regexp * regexp + | Alt of regexp * regexp + | Star of regexp + +(* From shallow to deep syntax *) + +let chars = ref ([] : char list list) +let chars_count = ref 0 +let actions = ref ([] : (int * location) list) +let actions_count = ref 0 + +let rec encode_regexp = function + Epsilon -> Empty + | Characters cl -> + let n = !chars_count in + chars := cl :: !chars; + incr chars_count; + Chars(n) + | Sequence(r1,r2) -> + Seq(encode_regexp r1, encode_regexp r2) + | Alternative(r1,r2) -> + Alt(encode_regexp r1, encode_regexp r2) + | Repetition r -> + Star (encode_regexp r) + +let encode_casedef = + List.fold_left + (fun reg (expr,act) -> + let act_num = !actions_count in + incr actions_count; + actions := (act_num, act) :: !actions; + Alt(reg, Seq(encode_regexp expr, Action act_num))) + Empty + +let encode_lexdef (Lexdef(_, ld)) = + chars := []; + chars_count := 0; + actions := []; + actions_count := 0; + let name_regexp_list = + List.map (fun (name, casedef) -> (name, encode_casedef casedef)) ld in + let chr = Array.of_list (List.rev !chars) + and act = !actions in + chars := []; + actions := []; + (chr, name_regexp_list, act) + +(* To generate directly a NFA from a regular expression. + Confer Aho-Sethi-Ullman, dragon book, chap. 3 *) + +type transition = + OnChars of int + | ToAction of int + +let rec merge_trans s1 s2 = + match (s1, s2) with + ([], _) -> s2 + | (_, []) -> s1 + | ((OnChars n1 as t1) :: r1, (OnChars n2 as t2) :: r2) -> + if n1 == n2 then t1 :: merge_trans r1 r2 else + if n1 < n2 then t1 :: merge_trans r1 s2 else + t2 :: merge_trans s1 r2 + | ((ToAction n1 as t1) :: r1, (ToAction n2 as t2) :: r2) -> + if n1 == n2 then t1 :: merge_trans r1 r2 else + if n1 < n2 then t1 :: merge_trans r1 s2 else + t2 :: merge_trans s1 r2 + | ((OnChars n1 as t1) :: r1, (ToAction n2 as t2) :: r2) -> + t1 :: merge_trans r1 s2 + | ((ToAction n1 as t1) :: r1, (OnChars n2 as t2) :: r2) -> + t2 :: merge_trans s1 r2 + +let rec nullable = function + Empty -> true + | Chars _ -> false + | Action _ -> false + | Seq(r1,r2) -> nullable r1 & nullable r2 + | Alt(r1,r2) -> nullable r1 or nullable r2 + | Star r -> true + +let rec firstpos = function + Empty -> [] + | Chars pos -> [OnChars pos] + | Action act -> [ToAction act] + | Seq(r1,r2) -> if nullable r1 + then merge_trans (firstpos r1) (firstpos r2) + else firstpos r1 + | Alt(r1,r2) -> merge_trans (firstpos r1) (firstpos r2) + | Star r -> firstpos r + +let rec lastpos = function + Empty -> [] + | Chars pos -> [OnChars pos] + | Action act -> [ToAction act] + | Seq(r1,r2) -> if nullable r2 + then merge_trans (lastpos r1) (lastpos r2) + else lastpos r2 + | Alt(r1,r2) -> merge_trans (lastpos r1) (lastpos r2) + | Star r -> lastpos r + +let followpos size name_regexp_list = + let v = Array.new size [] in + let fill_pos first = function + OnChars pos -> v.(pos) <- merge_trans first v.(pos); () + | ToAction _ -> () in + let rec fill = function + Seq(r1,r2) -> + fill r1; fill r2; + List.iter (fill_pos (firstpos r2)) (lastpos r1) + | Alt(r1,r2) -> + fill r1; fill r2 + | Star r -> + fill r; + List.iter (fill_pos (firstpos r)) (lastpos r) + | _ -> () in + List.iter (fun (name, regexp) -> fill regexp) name_regexp_list; + v + +let no_action = 32767 + +let split_trans_set = List.fold_left + (fun (act, pos_set as act_pos_set) -> + function OnChars pos -> (act, pos :: pos_set) + | ToAction act1 -> if act1 < act then (act1, pos_set) + else act_pos_set) + (no_action, []) + +let memory = (Hashtbl.new 131 : (transition list, int) Hashtbl.t) +and todo = ref ([] : (transition list * int) list) +and next = ref 0 + +let reset_state_mem () = + Hashtbl.clear memory; todo := []; next := 0; () + +let get_state st = + try + Hashtbl.find memory st + with Not_found -> + let nbr = !next in + incr next; + Hashtbl.add memory st nbr; + todo := (st, nbr) :: !todo; + nbr + +let rec map_on_states f = + match !todo with + [] -> [] + | (st,i)::r -> todo := r; let res = f st in (res,i) :: map_on_states f + +let number_of_states () = + !next + +let goto_state = function + [] -> Backtrack + | ps -> Goto (get_state ps) + +let transition_from chars follow pos_set = + let tr = Array.new 256 [] + and shift = Array.new 256 Backtrack in + List.iter + (fun pos -> + List.iter + (fun c -> + tr.(Char.code c) <- + merge_trans tr.(Char.code c) follow.(pos)) + chars.(pos)) + pos_set; + for i = 0 to 255 do + shift.(i) <- goto_state tr.(i) + done; + shift + +let translate_state chars follow state = + match split_trans_set state with + n, [] -> Perform n + | n, ps -> Shift( (if n == no_action then No_remember else Remember n), + transition_from chars follow ps) + +let make_dfa lexdef = + let (chars, name_regexp_list, actions) = + encode_lexdef lexdef in + let follow = + followpos (Array.length chars) name_regexp_list in + reset_state_mem(); + let initial_states = + List.map (fun (name, regexp) -> (name, get_state(firstpos regexp))) + name_regexp_list in + let states = + map_on_states (translate_state chars follow) in + let v = + Array.new (number_of_states()) (Perform 0) in + List.iter (fun (auto, i) -> v.(i) <- auto) states; + reset_state_mem(); + (initial_states, v, actions) diff --git a/lex/main.ml b/lex/main.ml new file mode 100644 index 0000000000..aaefe487dd --- /dev/null +++ b/lex/main.ml @@ -0,0 +1,48 @@ +(* The lexer generator. Command-line parsing. *) + +open Syntax +open Lexgen +open Output + +let main () = + if Array.length Sys.argv != 2 then begin + prerr_endline "Usage: camllex <input file>"; + exit 2 + end; + let source_name = Sys.argv.(1) in + let dest_name = + if Filename.check_suffix source_name ".mll" then + Filename.chop_suffix source_name ".mll" ^ ".ml" + else + source_name ^ ".ml" in + ic := open_in_bin source_name; + oc := open_out dest_name; + let lexbuf = + Lexing.from_channel !ic in + let (Lexdef(header,_) as def) = + try + Parser.lexer_definition Lexer.main lexbuf + with exn -> + close_out !oc; + Sys.remove dest_name; + begin match exn with + Parsing.Parse_error -> + prerr_string "Syntax error around char "; + prerr_int (Lexing.lexeme_start lexbuf); + prerr_endline "." + | Lexer.Lexical_error s -> + prerr_string "Lexical error around char "; + prerr_int (Lexing.lexeme_start lexbuf); + prerr_string ": "; + prerr_string s; + prerr_endline "." + | _ -> raise exn + end; + exit 2 in + let ((init, states, acts) as dfa) = make_dfa def in + output_lexdef header dfa; + close_in !ic; + close_out !oc + +let _ = Printexc.catch main (); exit 0 + diff --git a/lex/output.ml b/lex/output.ml new file mode 100644 index 0000000000..b3ca459c1b --- /dev/null +++ b/lex/output.ml @@ -0,0 +1,146 @@ +(* Generating a DFA as a set of mutually recursive functions *) + +open Syntax + +let ic = ref stdin +and oc = ref stdout + +(* 1- Generating the actions *) + +let copy_buffer = String.create 1024 + +let copy_chunk (Location(start,stop)) = + let rec copy s = + if s <= 0 then () else + let n = if s < 1024 then s else 1024 in + let m = input !ic copy_buffer 0 n in + output !oc copy_buffer 0 m; + copy (s - m) + in + seek_in !ic start; + copy (stop - start) + +let output_action (i,act) = + output_string !oc ("action_" ^ string_of_int i ^ " lexbuf = (\n"); + copy_chunk act; + output_string !oc ")\nand "; + () + +(* 2- Generating the states *) + +let states = ref ([||] : automata array) + +let enumerate_vect v = + let rec enum env pos = + if pos >= Array.length v then env else + try + let pl = List.assoc v.(pos) env in + pl := pos :: !pl; enum env (succ pos) + with Not_found -> + enum ((v.(pos), ref [pos]) :: env) (succ pos) in + Sort.list + (fun (e1, pl1) (e2, pl2) -> List.length !pl1 >= List.length !pl2) + (enum [] 0) + +let output_move = function + Backtrack -> + output_string !oc "backtrack lexbuf" + | Goto dest -> + match !states.(dest) with + Perform act_num -> + output_string !oc ("action_" ^ string_of_int act_num ^ " lexbuf") + | _ -> + output_string !oc ("state_" ^ string_of_int dest ^ " lexbuf") + +let output_char_for_read oc = function + '\'' -> output_string oc "\\'" + | '\\' -> output_string oc "\\\\" + | '\n' -> output_string oc "\\n" + | '\t' -> output_string oc "\\t" + | c -> + let n = Char.code c in + if n >= 32 & n < 127 then + output_char oc c + else begin + output_char oc '\\'; + output_char oc (Char.chr (48 + n / 100)); + output_char oc (Char.chr (48 + (n / 10) mod 10)); + output_char oc (Char.chr (48 + n mod 10)) + end + +let rec output_chars = function + [] -> + failwith "output_chars" + | [c] -> + output_string !oc "'"; + output_char_for_read !oc (Char.chr c); + output_string !oc "'" + | c::cl -> + output_string !oc "'"; + output_char_for_read !oc (Char.chr c); + output_string !oc "'|"; + output_chars cl + +let output_one_trans (dest, chars) = + output_chars !chars; + output_string !oc " -> "; + output_move dest; + output_string !oc "\n | "; + () + +let output_all_trans trans = + output_string !oc " match get_next_char lexbuf with\n "; + match enumerate_vect trans with + [] -> + failwith "output_all_trans" + | (default, _) :: rest -> + List.iter output_one_trans rest; + output_string !oc "_ -> "; + output_move default; + output_string !oc "\nand "; + () + +let output_state state_num = function + Perform i -> + () + | Shift(what_to_do, moves) -> + output_string !oc + ("state_" ^ string_of_int state_num ^ " lexbuf =\n"); + begin match what_to_do with + No_remember -> () + | Remember i -> + output_string !oc " lexbuf.lex_last_pos <- lexbuf.lex_curr_pos;\n"; + output_string !oc (" lexbuf.lex_last_action <- Obj.magic action_" ^ + string_of_int i ^ ";\n") + end; + output_all_trans moves + +(* 3- Generating the entry points *) + +let rec output_entries = function + [] -> failwith "output_entries" + | (name,state_num) :: rest -> + output_string !oc (name ^ " lexbuf =\n"); + output_string !oc " start_lexing lexbuf;\n"; + output_string !oc (" state_" ^ string_of_int state_num ^ " lexbuf\n"); + match rest with + [] -> output_string !oc "\n"; () + | _ -> output_string !oc "\nand "; output_entries rest + +(* All together *) + +let output_lexdef header (initial_st, st, actions) = + print_int (Array.length st); print_string " states, "; + print_int (List.length actions); print_string " actions."; + print_newline(); + output_string !oc "open Obj\nopen Lexing\n\n"; + copy_chunk header; + output_string !oc "\nlet rec "; + states := st; + List.iter output_action actions; + for i = 0 to Array.length st - 1 do + output_state i st.(i) + done; + output_entries initial_st + + diff --git a/lex/parser.mly b/lex/parser.mly new file mode 100644 index 0000000000..e8851df101 --- /dev/null +++ b/lex/parser.mly @@ -0,0 +1,120 @@ +/* The grammar for lexer definitions */ + +%{ +open Syntax + +(* Auxiliaries for the parser. *) + +let regexp_for_string s = + let rec re_string n = + if n >= String.length s then Epsilon + else if succ n = String.length s then Characters([s.[n]]) + else Sequence(Characters([s.[n]]), re_string (succ n)) + in re_string 0 + +let char_class c1 c2 = + let rec class n = + if n > (Char.code c2) then [] else (Char.chr n) :: class(succ n) + in class (Char.code c1) + +let all_chars = char_class (Char.chr 1) (Char.chr 255) + +let rec subtract l1 l2 = + match l1 with + [] -> [] + | a::r -> if List.mem a l2 then subtract r l2 else a :: subtract r l2 +%} + +%token <string> Tident +%token <char> Tchar +%token <string> Tstring +%token <Syntax.location> Taction +%token Trule Tparse Tand Tequal Tend Tor Tunderscore Teof Tlbracket Trbracket +%token Tstar Tmaybe Tplus Tlparen Trparen Tcaret Tdash + +%left Tor +%left CONCAT +%nonassoc Tmaybe +%left Tstar +%left Tplus + +%start lexer_definition +%type <Syntax.lexer_definition> lexer_definition + +%% + +lexer_definition: + header Trule definition other_definitions Tend + { Lexdef($1, $3::(List.rev $4)) } +; +header: + Taction + { $1 } + | + { Location(0,0) } +; +other_definitions: + other_definitions Tand definition + { $3::$1 } + | + { [] } +; +definition: + Tident Tequal entry + { ($1,$3) } +; +entry: + Tparse case rest_of_entry + { $2::List.rev $3 } +; +rest_of_entry: + rest_of_entry Tor case + { $3::$1 } + | + { [] } +; +case: + regexp Taction + { ($1,$2) } +; +regexp: + Tunderscore + { Characters all_chars } + | Teof + { Characters ['\000'] } + | Tchar + { Characters [$1] } + | Tstring + { regexp_for_string $1 } + | Tlbracket char_class Trbracket + { Characters $2 } + | regexp Tstar + { Repetition $1 } + | regexp Tmaybe + { Alternative($1, Epsilon) } + | regexp Tplus + { Sequence($1, Repetition $1) } + | regexp Tor regexp + { Alternative($1,$3) } + | regexp regexp %prec CONCAT + { Sequence($1,$2) } + | Tlparen regexp Trparen + { $2 } +; +char_class: + Tcaret char_class1 + { subtract all_chars $2 } + | char_class1 + { $1 } +; +char_class1: + Tchar Tdash Tchar + { char_class $1 $3 } + | Tchar + { [$1] } + | char_class1 char_class1 %prec CONCAT + { $1 @ $2 } +; + +%% + diff --git a/lex/syntax.ml b/lex/syntax.ml new file mode 100644 index 0000000000..f692e6f625 --- /dev/null +++ b/lex/syntax.ml @@ -0,0 +1,26 @@ +(* The shallow abstract syntax *) + +type location = + Location of int * int + +type regular_expression = + Epsilon + | Characters of char list + | Sequence of regular_expression * regular_expression + | Alternative of regular_expression * regular_expression + | Repetition of regular_expression + +type lexer_definition = + Lexdef of location * (string * (regular_expression * location) list) list + +(* Representation of automata *) + +type automata = + Perform of int + | Shift of automata_trans * automata_move array +and automata_trans = + No_remember + | Remember of int +and automata_move = + Backtrack + | Goto of int |