summaryrefslogtreecommitdiff
path: root/ghc/lib/misc/Regex.lhs
diff options
context:
space:
mode:
authorsimonm <unknown>1998-02-02 17:35:59 +0000
committersimonm <unknown>1998-02-02 17:35:59 +0000
commit28139aea50376444d56f43f0914291348a51a7e7 (patch)
tree595c378188638ef16462972c1e7fcdb8409c7f16 /ghc/lib/misc/Regex.lhs
parent98a1ebecb6d22d793b1d9f8e1d24ecbb5a2d130f (diff)
downloadhaskell-28139aea50376444d56f43f0914291348a51a7e7.tar.gz
[project @ 1998-02-02 17:27:26 by simonm]
Library re-organisation: All libraries now live under ghc/lib, which has the following structure: ghc/lib/std -- all prelude files (libHS.a) ghc/lib/std/cbits ghc/lib/exts -- standard Hugs/GHC extensions (libHSexts.a) -- available with '-fglasgow-exts' ghc/lib/posix -- POSIX library (libHSposix.a) ghc/lib/posix/cbits -- available with '-syslib posix' ghc/lib/misc -- used to be hslibs/ghc (libHSmisc.a) ghc/lib/misc/cbits -- available with '-syslib misc' ghc/lib/concurrent -- Concurrent libraries (libHSconc.a) -- available with '-concurrent' Also, several non-standard prelude modules had their names changed to begin with 'Prel' to reduce namespace pollution. Addr ==> PrelAddr (Addr interface available in 'exts') ArrBase ==> PrelArr CCall ==> PrelCCall (CCall interface available in 'exts') ConcBase ==> PrelConc GHCerr ==> PrelErr Foreign ==> PrelForeign (Foreign interface available in 'exts') GHC ==> PrelGHC IOHandle ==> PrelHandle IOBase ==> PrelIOBase GHCmain ==> PrelMain STBase ==> PrelST Unsafe ==> PrelUnsafe UnsafeST ==> PrelUnsafeST
Diffstat (limited to 'ghc/lib/misc/Regex.lhs')
-rw-r--r--ghc/lib/misc/Regex.lhs367
1 files changed, 367 insertions, 0 deletions
diff --git a/ghc/lib/misc/Regex.lhs b/ghc/lib/misc/Regex.lhs
new file mode 100644
index 0000000000..2153b623cd
--- /dev/null
+++ b/ghc/lib/misc/Regex.lhs
@@ -0,0 +1,367 @@
+\section[regex]{Haskell binding to the GNU regex library}
+
+What follows is a straightforward binding to the functions
+provided by the GNU regex library (the GNU group of functions with Perl
+like syntax)
+
+\begin{code}
+{-# OPTIONS -#include "cbits/ghcRegex.h" #-}
+
+module Regex (
+ PatBuffer(..),
+ re_compile_pattern,
+ re_match,
+ re_search,
+ re_match2,
+ re_search2,
+
+ REmatch(..)
+ ) where
+
+import GlaExts
+import CCall
+import PackedString
+import Array ( array, bounds, (!) )
+import PrelArr ( MutableByteArray(..), Array(..) )
+import PrelGHC ( MutableByteArray# )
+import Char ( ord )
+import Foreign
+
+\end{code}
+
+First, the higher level matching structure that the functions herein
+return:
+\begin{code}
+--
+-- GroupBounds hold the interval where a group
+-- matched inside a string, e.g.
+--
+-- matching "reg(exp)" "a regexp" returns the pair (5,7) for the
+-- (exp) group. (PackedString indices start from 0)
+
+type GroupBounds = (Int, Int)
+
+data REmatch
+ = REmatch (Array Int GroupBounds) -- for $1, ... $n
+ GroupBounds -- for $` (everything before match)
+ GroupBounds -- for $& (entire matched string)
+ GroupBounds -- for $' (everything after)
+ GroupBounds -- for $+ (matched by last bracket)
+\end{code}
+
+Prior to any matching (or searching), the regular expression
+have to compiled into an internal form, the pattern buffer.
+Represent the pattern buffer as a Haskell heap object:
+
+\begin{code}
+data PatBuffer = PatBuffer# (MutableByteArray# RealWorld)
+instance CCallable PatBuffer
+instance CReturnable PatBuffer
+
+createPatBuffer :: Bool -> IO PatBuffer
+
+createPatBuffer insensitive
+ = _casm_ ``%r = (int)sizeof(struct re_pattern_buffer);'' >>= \ sz ->
+ stToIO (newCharArray (0,sz)) >>= \ (MutableByteArray _ pbuf#) ->
+ let
+ pbuf = PatBuffer# pbuf#
+ in
+ (if insensitive then
+ {-
+ See comment re: fastmap below
+ -}
+ ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ tmap ->
+ {-
+ Set up the translate table so that any lowercase
+ char. gets mapped to an uppercase one. Beacuse quoting
+ inside CAsmStrings is Problematic, we pass in the ordinal values
+ of 'a','z' and 'A'
+ -}
+ _casm_ ``{ int i;
+
+ for(i=0; i<256; i++)
+ ((char *)%0)[i] = (char)i;
+ for(i=(int)%1;i <=(int)%2;i++)
+ ((char *)%0)[i] = i - ((int)%1 - (int)%3);
+ }'' tmap (ord 'a') (ord 'z') (ord 'A') >>
+ _casm_ ``((struct re_pattern_buffer *)%0)->translate = %1; '' pbuf tmap
+ else
+ _casm_ ``((struct re_pattern_buffer *)%0)->translate = 0; '' pbuf) >>
+ {-
+ Use a fastmap to speed things up, would like to have the fastmap
+ in the Haskell heap, but it will get GCed before we can say regexp,
+ as the reference to it is buried inside a ByteArray :-(
+ -}
+ ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ fmap ->
+ _casm_ `` ((struct re_pattern_buffer *)%0)->fastmap = %1; '' pbuf fmap >>
+ {-
+ We want the compiler of the pattern to alloc. memory
+ for the pattern.
+ -}
+ _casm_ `` ((struct re_pattern_buffer *)%0)->buffer = 0; '' pbuf >>
+ _casm_ `` ((struct re_pattern_buffer *)%0)->allocated = 0; '' pbuf >>
+ return pbuf
+\end{code}
+
+@re_compile_pattern@ converts a regular expression into a pattern buffer,
+GNU style.
+
+Q: should we lift the syntax bits configuration up to the Haskell
+programmer level ?
+
+\begin{code}
+re_compile_pattern :: PackedString -- pattern to compile
+ -> Bool -- True <=> assume single-line mode
+ -> Bool -- True <=> case-insensitive
+ -> IO PatBuffer
+
+re_compile_pattern str single_line_mode insensitive
+ = createPatBuffer insensitive >>= \ pbuf ->
+ (if single_line_mode then -- match a multi-line buffer
+ _casm_ ``re_syntax_options = RE_PERL_SINGLELINE_SYNTAX;''
+ else
+ _casm_ ``re_syntax_options = RE_PERL_MULTILINE_SYNTAX;'') >>
+
+ _casm_ `` (int)re_compile_pattern((char *)%0,
+ (int)%1,
+ (struct re_pattern_buffer *)%2);''
+ (unpackPS str) (lengthPS str) pbuf >>= \ () ->
+ --
+ -- No checking for how the compilation of the pattern went yet.
+ --
+ return pbuf
+\end{code}
+
+Got a match?
+
+Each call to re_match uses a new re_registers structures, so we need
+to ask the regex library to allocate enough memory to store the
+registers in each time. That's what the line '... REGS_UNALLOCATED'
+is all about.
+
+\begin{code}
+re_match :: PatBuffer -- compiled regexp
+ -> PackedString -- string to match
+ -> Int -- start position
+ -> Bool -- True <=> record results in registers
+ -> IO (Maybe REmatch)
+
+re_match pbuf str start reg
+ = ((if reg then -- record result of match in registers
+ _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+ else
+ _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs ->
+ _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED;
+ %r=(int)re_match((struct re_pattern_buffer *)%0,
+ (char *)%1,
+ (int)%2,
+ (int)%3,
+ (struct re_registers *)%4);'' pbuf
+ (unpackPS str)
+ (lengthPS str)
+ start
+ regs >>= \ match_res ->
+ if match_res == (-2) then
+ error "re_match: Internal error"
+ else if match_res < 0 then
+ _casm_ ``free((struct re_registers *)%0); '' regs >>
+ return Nothing
+ else
+ build_re_match start (lengthPS str) regs >>= \ arr ->
+ _casm_ ``free(((struct re_registers *)%0)->start);
+ free(((struct re_registers *)%0)->end);
+ free((struct re_registers *)%0); '' regs >>
+ return (Just arr)
+\end{code}
+
+Matching on 2 strings is useful when you're dealing with multiple
+buffers, which is something that could prove useful for PackedStrings,
+as we don't want to stuff the contents of a file into one massive heap
+chunk, but load (smaller chunks) on demand.
+
+\begin{code}
+re_match2 :: PatBuffer
+ -> PackedString
+ -> PackedString
+ -> Int
+ -> Int
+ -> Bool
+ -> IO (Maybe REmatch)
+
+re_match2 pbuf str1 str2 start stop reg
+ = ((if reg then -- record result of match in registers
+ _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+ else
+ _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs ->
+ _casm_ ``%r=(int)re_match_2((struct re_pattern_buffer *)%0,
+ (char *)%1,
+ (int)%2,
+ (char *)%3,
+ (int)%4,
+ (int)%5,
+ (struct re_registers *)%6,
+ (int)%7);'' pbuf
+ (unpackPS str1)
+ (lengthPS str1)
+ (unpackPS str2)
+ (lengthPS str2)
+ start
+ regs
+ stop >>= \ match_res ->
+ if match_res == (-2) then
+ error "re_match2: Internal error"
+ else if match_res < 0 then
+ _casm_ ``free((struct re_registers *)%0); '' regs >>
+ return Nothing
+ else
+ build_re_match start stop regs >>= \ arr ->
+ _casm_ ``free((struct re_registers *)%0); '' regs >>
+ return (Just arr)
+\end{code}
+
+Find all the matches in a string:
+\begin{code}
+re_search :: PatBuffer -- the compiled regexp
+ -> PackedString -- the string to search
+ -> Int -- start index
+ -> Int -- stop index
+ -> Bool -- record result of match in registers
+ -> IO (Maybe REmatch)
+
+re_search pbuf str start range reg
+ = (if reg then -- record result of match in registers
+ _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+ else
+ _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs ->
+ _casm_ ``%r=(int)re_search((struct re_pattern_buffer *)%0,
+ (char *)%1,
+ (int)%2,
+ (int)%3,
+ (int)%4,
+ (struct re_registers *)%5);'' pbuf
+ (unpackPS str)
+ (lengthPS str)
+ start
+ range
+ regs >>= \ match_res ->
+ if match_res== (-1) then
+ _casm_ `` free((struct re_registers *)%0); '' regs >>
+ return Nothing
+ else
+ let
+ (st,en) = if range > start then
+ (start,range)
+ else
+ (range,start)
+ in
+ build_re_match st en regs >>= \ arr ->
+ _casm_ ``free((struct re_registers *)%0); '' regs >>
+ return (Just arr)
+\end{code}
+
+Double buffer search:
+\begin{code}
+re_search2 :: PatBuffer
+ -> PackedString
+ -> PackedString
+ -> Int
+ -> Int
+ -> Int
+ -> Bool
+ -> IO (Maybe REmatch)
+
+re_search2 pbuf str1 str2 start range stop reg
+
+ = (if reg then -- record result of match in registers
+ _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
+ else
+ _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs ->
+ _casm_ ``%r=(int)re_search_2((struct re_pattern_buffer *)%0,
+ (char *)%1,
+ (int)%2,
+ (char *)%3,
+ (int)%4,
+ (int)%5,
+ (int)%6,
+ (struct re_registers *)%7,
+ (int)%8);'' pbuf
+ (unpackPS str1)
+ (lengthPS str1)
+ (unpackPS str2)
+ (lengthPS str2)
+ start
+ range
+ regs
+ stop >>= \ match_res ->
+ if match_res== (-1) then
+ _casm_ `` free((struct re_registers *)%0); '' regs >>
+ return Nothing
+ else
+ let
+ (st,en) = if range > start then
+ (start,range)
+ else
+ (range,start)
+ in
+ build_re_match st en regs >>= \ arr ->
+ _casm_ `` free((struct re_registers *)%0); '' regs >>
+ return (Just arr)
+\end{code}
+
+\begin{code}
+build_re_match :: Int
+ -> Int
+ -> Addr
+ -> IO REmatch
+
+build_re_match str_start str_end regs
+ = _casm_ ``%r=(int)(*(struct re_registers *)%0).num_regs;'' regs >>= \ len ->
+ match_reg_to_array regs len >>= \ (match_start,match_end,arr) ->
+ let
+ (1,x) = bounds arr
+
+ bef = (str_start,match_start) -- $'
+ aft = (match_end,str_end) -- $`
+ lst = arr!x -- $+
+ mtch = (match_start,match_end) -- $&
+ in
+ return (REmatch arr
+ bef
+ mtch
+ aft
+ lst)
+ where
+ match_reg_to_array regs len
+ = trundleIO regs (0,[]) len >>= \ (no,ls) ->
+ let
+ (st,end,ls')
+ = case ls of
+ [] -> (0,0,[])
+ [(a,b)] -> (a,b,ls)
+ ((a,b):xs) -> (a,b,xs)
+ in
+ return
+ (st,
+ end,
+ array (1,max 1 (no-1))
+ [ (i, x) | (i,x) <- zip [1..] ls'])
+
+ trundleIO :: Addr
+ -> (Int,[(Int,Int)])
+ -> Int
+ -> IO (Int,[(Int,Int)])
+
+ trundleIO regs (i,acc) len
+ | i==len = return (i,reverse acc)
+ | otherwise
+ = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1];'' regs i >>= \ start ->
+ _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1];'' regs i >>= \ end ->
+ let
+ acc' = (start,end):acc
+ in
+ if (start == (-1)) && (end == (-1)) then
+ return (i,reverse acc)
+ else
+ trundleIO regs (i+1,acc') len
+\end{code}
+