[project @ 1998-02-02 17:27:26 by simonm]

Library re-organisation: All libraries now live under ghc/lib, which has the following structure: ghc/lib/std -- all prelude files (libHS.a) ghc/lib/std/cbits ghc/lib/exts -- standard Hugs/GHC extensions (libHSexts.a) -- available with '-fglasgow-exts' ghc/lib/posix -- POSIX library (libHSposix.a) ghc/lib/posix/cbits -- available with '-syslib posix' ghc/lib/misc -- used to be hslibs/ghc (libHSmisc.a) ghc/lib/misc/cbits -- available with '-syslib misc' ghc/lib/concurrent -- Concurrent libraries (libHSconc.a) -- available with '-concurrent' Also, several non-standard prelude modules had their names changed to begin with 'Prel' to reduce namespace pollution. Addr ==> PrelAddr (Addr interface available in 'exts') ArrBase ==> PrelArr CCall ==> PrelCCall (CCall interface available in 'exts') ConcBase ==> PrelConc GHCerr ==> PrelErr Foreign ==> PrelForeign (Foreign interface available in 'exts') GHC ==> PrelGHC IOHandle ==> PrelHandle IOBase ==> PrelIOBase GHCmain ==> PrelMain STBase ==> PrelST Unsafe ==> PrelUnsafe UnsafeST ==> PrelUnsafeST
author: simonm <unknown> 1998-02-02 17:35:59 +0000
committer: simonm <unknown> 1998-02-02 17:35:59 +0000
commit: 28139aea50376444d56f43f0914291348a51a7e7 (patch)
tree: 595c378188638ef16462972c1e7fcdb8409c7f16 /ghc/lib/misc/Regex.lhs
parent: 98a1ebecb6d22d793b1d9f8e1d24ecbb5a2d130f (diff)
download: haskell-28139aea50376444d56f43f0914291348a51a7e7.tar.gz
1 files changed, 367 insertions, 0 deletions
diff --git a/ghc/lib/misc/Regex.lhs b/ghc/lib/misc/Regex.lhs
new file mode 100644
index 0000000000..2153b623cd
--- /dev/null
+++ b/ghc/lib/misc/Regex.lhs
@@ -0,0 +1,367 @@
+\section[regex]{Haskell binding to the GNU regex library}
+
+What follows is a straightforward binding to the functions
+provided by the GNU regex library (the GNU group of functions with Perl
+like syntax)
+
+\begin{code}
+{-# OPTIONS -#include "cbits/ghcRegex.h" #-}
+
+module Regex (
+	 PatBuffer(..),
+	 re_compile_pattern,
+	 re_match,
+	 re_search,
+	 re_match2,
+	 re_search2,
+	 
+	 REmatch(..)
+    ) where
+
+import GlaExts
+import CCall
+import PackedString
+import Array		( array, bounds, (!) )
+import PrelArr 		( MutableByteArray(..), Array(..) )
+import PrelGHC	 	( MutableByteArray# )
+import Char 		( ord )
+import Foreign
+
+\end{code}
+
+First, the higher level matching structure that the functions herein
+return:
+\begin{code}
+--
+-- GroupBounds hold the interval where a group
+-- matched inside a string, e.g.
+--
+-- matching "reg(exp)" "a regexp" returns the pair (5,7) for the
+-- (exp) group. (PackedString indices start from 0)
+
+type GroupBounds = (Int, Int)
+
+data REmatch
+ = REmatch (Array Int GroupBounds)  -- for $1, ... $n
+	   GroupBounds		    -- for $` (everything before match)
+	   GroupBounds		    -- for $& (entire matched string)
+	   GroupBounds		    -- for $' (everything after)
+	   GroupBounds		    -- for $+ (matched by last bracket)
+\end{code}
+
+Prior to any matching (or searching), the regular expression
+have to compiled into an internal form, the pattern buffer.
+Represent the pattern buffer as a Haskell heap object:
+
+\begin{code}
+data PatBuffer = PatBuffer# (MutableByteArray# RealWorld)
+instance CCallable   PatBuffer
+instance CReturnable PatBuffer
+
+createPatBuffer :: Bool -> IO PatBuffer
+
+createPatBuffer insensitive
+ =  _casm_ ``%r = (int)sizeof(struct re_pattern_buffer);'' >>= \ sz ->
+    stToIO (newCharArray (0,sz))	>>= \ (MutableByteArray _ pbuf#) ->
+    let
+	 pbuf = PatBuffer# pbuf#
+    in
+    (if insensitive then
+       {-
+	 See comment re: fastmap below
+       -}
+       ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ tmap ->
+       {-
+         Set up the translate table so that any lowercase
+         char. gets mapped to an uppercase one. Beacuse quoting
+         inside CAsmStrings is Problematic, we pass in the ordinal values
+         of 'a','z' and 'A'
+       -}
+       _casm_ ``{ int i;
+
+		  for(i=0; i<256; i++)
+		     ((char *)%0)[i] = (char)i;
+	          for(i=(int)%1;i <=(int)%2;i++)
+		     ((char *)%0)[i] = i - ((int)%1 - (int)%3);
+	          }'' tmap (ord 'a') (ord 'z') (ord 'A') 	>>
+       _casm_ ``((struct re_pattern_buffer *)%0)->translate = %1; '' pbuf tmap
+     else
+       _casm_ ``((struct re_pattern_buffer *)%0)->translate = 0; '' pbuf) >>
+    {-
+      Use a fastmap to speed things up, would like to have the fastmap
+      in the Haskell heap, but it will get GCed before we can say regexp,
+      as the reference to it is buried inside a ByteArray :-(
+    -}
+    ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ fmap ->
+    _casm_ `` ((struct re_pattern_buffer *)%0)->fastmap   = %1; '' pbuf fmap >>
+    {-
+      We want the compiler of the pattern to alloc. memory
+      for the pattern.
+    -}
+    _casm_ `` ((struct re_pattern_buffer *)%0)->buffer    = 0; '' pbuf >>
+    _casm_ `` ((struct re_pattern_buffer *)%0)->allocated = 0; '' pbuf >>
+    return pbuf
+\end{code}
+
+@re_compile_pattern@ converts a regular expression into a pattern buffer,
+GNU style.
+
+Q: should we lift the syntax bits configuration up to the Haskell
+programmer level ?
+
+\begin{code}
+re_compile_pattern :: PackedString     -- pattern to compile
+		   -> Bool             -- True <=> assume single-line mode
+		   -> Bool             -- True <=> case-insensitive
+		   -> IO PatBuffer
+
+re_compile_pattern str single_line_mode insensitive
+ = createPatBuffer insensitive	>>= \ pbuf ->
+   (if single_line_mode then	-- match a multi-line buffer
+       _casm_ ``re_syntax_options = RE_PERL_SINGLELINE_SYNTAX;''
+    else
+       _casm_ ``re_syntax_options = RE_PERL_MULTILINE_SYNTAX;'') >>
+
+   _casm_ ``  (int)re_compile_pattern((char *)%0,
+   					(int)%1,
+					(struct re_pattern_buffer *)%2);''
+		(unpackPS str) (lengthPS str) pbuf	>>= \ () ->
+   --
+   -- No checking for how the compilation of the pattern went yet.
+   --
+   return pbuf
+\end{code}
+
+Got a match?
+
+Each call to re_match uses a new re_registers structures, so we need
+to ask the regex library to allocate enough memory to store the
+registers in each time.  That's what the line '... REGS_UNALLOCATED'
+is all about.
+
+\begin{code}
+re_match :: PatBuffer     -- compiled regexp
+	 -> PackedString  -- string to match
+	 -> Int           -- start position
+	 -> Bool          -- True <=> record results in registers
+	 -> IO (Maybe REmatch)
+
+re_match pbuf str start reg
+ = ((if reg then  -- record result of match in registers
+      _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+     else
+      _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr)  >>= \ regs ->
+   _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED;
+	    %r=(int)re_match((struct re_pattern_buffer *)%0,
+ 			      (char *)%1,
+			      (int)%2,
+			      (int)%3,
+			      (struct re_registers *)%4);'' pbuf
+							     (unpackPS str)
+							     (lengthPS str)
+							     start
+							     regs	>>= \ match_res ->
+  if match_res == (-2) then
+	error "re_match: Internal error"
+  else if match_res < 0 then
+     _casm_ ``free((struct re_registers *)%0); '' regs >>
+     return Nothing
+  else
+     build_re_match start (lengthPS str) regs	>>= \ arr ->
+     _casm_ ``free(((struct re_registers *)%0)->start);
+              free(((struct re_registers *)%0)->end);
+              free((struct re_registers *)%0); '' regs  >>
+     return (Just arr)
+\end{code}
+
+Matching on 2 strings is useful when you're dealing with multiple
+buffers, which is something that could prove useful for PackedStrings,
+as we don't want to stuff the contents of a file into one massive heap
+chunk, but load (smaller chunks) on demand.
+
+\begin{code}
+re_match2 :: PatBuffer
+	  -> PackedString
+	  -> PackedString
+	  -> Int
+	  -> Int
+	  -> Bool
+	  -> IO (Maybe REmatch)
+
+re_match2 pbuf str1 str2 start stop reg
+ = ((if reg then  -- record result of match in registers
+      _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+     else
+      _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr)	>>= \ regs ->
+   _casm_ ``%r=(int)re_match_2((struct re_pattern_buffer *)%0,
+ 			        (char *)%1,
+			        (int)%2,
+ 			        (char *)%3,
+			        (int)%4,
+			        (int)%5,
+			        (struct re_registers *)%6,
+			        (int)%7);'' pbuf
+					     (unpackPS str1)
+					     (lengthPS str1)
+					     (unpackPS str2)
+					     (lengthPS str2)
+					     start
+					     regs
+					     stop    >>= \ match_res ->
+  if match_res == (-2) then
+	error "re_match2: Internal error"
+  else if match_res < 0 then
+     _casm_ ``free((struct re_registers *)%0); '' regs >>
+     return Nothing
+  else
+     build_re_match start stop regs	>>= \ arr ->
+     _casm_ ``free((struct re_registers *)%0); '' regs  >>
+     return (Just arr)
+\end{code}
+
+Find all the matches in a string:
+\begin{code}
+re_search :: PatBuffer		-- the compiled regexp
+	  -> PackedString	-- the string to search
+	  -> Int		-- start index
+	  -> Int		-- stop index
+	  -> Bool		-- record result of match in registers 
+	  -> IO (Maybe REmatch)
+
+re_search pbuf str start range reg
+ = (if reg then  -- record result of match in registers
+      _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+    else
+      _casm_ ``%r = (struct re_registers *)NULL;'')	>>= \ regs ->
+   _casm_ ``%r=(int)re_search((struct re_pattern_buffer *)%0,
+ 			       (char *)%1,
+			       (int)%2,
+			       (int)%3,
+			       (int)%4,
+			       (struct re_registers *)%5);'' pbuf
+							     (unpackPS str)
+							     (lengthPS str)
+							     start
+							     range
+							     regs	>>= \ match_res ->
+  if match_res== (-1) then
+     _casm_ `` free((struct re_registers *)%0); '' regs >>
+     return Nothing
+  else
+     let
+      (st,en) = if range > start then 
+		   (start,range)
+	        else
+		   (range,start)
+     in
+      build_re_match st en regs					     >>= \ arr ->
+      _casm_ ``free((struct re_registers *)%0); '' regs >>
+      return (Just arr)
+\end{code}
+
+Double buffer search:
+\begin{code}
+re_search2 :: PatBuffer
+	   -> PackedString
+	   -> PackedString
+	   -> Int
+	   -> Int
+	   -> Int
+	   -> Bool
+	   -> IO (Maybe REmatch)
+
+re_search2 pbuf str1 str2 start range stop reg
+
+ = (if reg then  -- record result of match in registers
+      _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+    else
+      _casm_ ``%r = (struct re_registers *)NULL;'')	>>= \ regs ->
+   _casm_ ``%r=(int)re_search_2((struct re_pattern_buffer *)%0,
+ 			         (char *)%1,
+			         (int)%2,
+ 			         (char *)%3,
+			         (int)%4,
+			         (int)%5,
+			         (int)%6,
+			         (struct re_registers *)%7,
+			         (int)%8);'' pbuf
+					      (unpackPS str1)
+					      (lengthPS str1)
+					      (unpackPS str2)
+					      (lengthPS str2)
+					      start
+					      range
+					      regs
+					      stop    >>= \ match_res ->
+  if match_res== (-1) then
+     _casm_ `` free((struct re_registers *)%0); '' regs >>
+     return Nothing
+  else
+     let
+      (st,en) = if range > start then 
+		   (start,range)
+	        else
+		   (range,start)
+     in
+      build_re_match st en regs					   >>= \ arr ->
+      _casm_ `` free((struct re_registers *)%0); '' regs >>
+      return (Just arr)
+\end{code}
+
+\begin{code}
+build_re_match :: Int
+	       -> Int
+	       -> Addr 
+	       -> IO REmatch
+
+build_re_match str_start str_end regs
+ = _casm_ ``%r=(int)(*(struct re_registers *)%0).num_regs;'' regs  >>= \ len ->
+   match_reg_to_array regs len	>>= \ (match_start,match_end,arr) ->
+   let
+    (1,x) = bounds arr
+
+    bef  = (str_start,match_start)  -- $'
+    aft  = (match_end,str_end)      -- $`
+    lst  = arr!x		    -- $+
+    mtch = (match_start,match_end)  -- $&
+   in
+    return (REmatch arr
+			  bef
+			  mtch
+			  aft
+			  lst)
+   where
+    match_reg_to_array regs len
+     = trundleIO regs (0,[]) len  >>= \ (no,ls) ->
+       let
+        (st,end,ls')
+         = case ls of
+             [] -> (0,0,[])
+	     [(a,b)] -> (a,b,ls)
+             ((a,b):xs) -> (a,b,xs)
+       in        
+        return 
+	   (st,
+	    end,
+	    array (1,max 1 (no-1)) 
+		  [ (i, x) | (i,x) <- zip [1..] ls'])
+
+    trundleIO :: Addr 
+	     -> (Int,[(Int,Int)])
+	     -> Int 
+	     -> IO (Int,[(Int,Int)])
+
+    trundleIO regs (i,acc) len
+     | i==len = return (i,reverse acc)
+     | otherwise	  
+       = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1];'' regs i >>= \ start ->
+         _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1];''   regs i >>= \ end ->
+	 let
+	  acc' = (start,end):acc
+	 in
+	  if (start == (-1)) && (end == (-1)) then
+	     return (i,reverse acc)
+	  else
+	     trundleIO regs (i+1,acc') len
+\end{code}
+
author	simonm <unknown>	1998-02-02 17:35:59 +0000
committer	simonm <unknown>	1998-02-02 17:35:59 +0000
commit	28139aea50376444d56f43f0914291348a51a7e7 (patch)
tree	595c378188638ef16462972c1e7fcdb8409c7f16 /ghc/lib/misc/Regex.lhs
parent	98a1ebecb6d22d793b1d9f8e1d24ecbb5a2d130f (diff)
download	haskell-28139aea50376444d56f43f0914291348a51a7e7.tar.gz