diff options
author | simonm <unknown> | 1998-02-02 17:35:59 +0000 |
---|---|---|
committer | simonm <unknown> | 1998-02-02 17:35:59 +0000 |
commit | 28139aea50376444d56f43f0914291348a51a7e7 (patch) | |
tree | 595c378188638ef16462972c1e7fcdb8409c7f16 /ghc/lib/misc/Regex.lhs | |
parent | 98a1ebecb6d22d793b1d9f8e1d24ecbb5a2d130f (diff) | |
download | haskell-28139aea50376444d56f43f0914291348a51a7e7.tar.gz |
[project @ 1998-02-02 17:27:26 by simonm]
Library re-organisation:
All libraries now live under ghc/lib, which has the following structure:
ghc/lib/std -- all prelude files (libHS.a)
ghc/lib/std/cbits
ghc/lib/exts -- standard Hugs/GHC extensions (libHSexts.a)
-- available with '-fglasgow-exts'
ghc/lib/posix -- POSIX library (libHSposix.a)
ghc/lib/posix/cbits -- available with '-syslib posix'
ghc/lib/misc -- used to be hslibs/ghc (libHSmisc.a)
ghc/lib/misc/cbits -- available with '-syslib misc'
ghc/lib/concurrent -- Concurrent libraries (libHSconc.a)
-- available with '-concurrent'
Also, several non-standard prelude modules had their names changed to begin
with 'Prel' to reduce namespace pollution.
Addr ==> PrelAddr (Addr interface available in 'exts')
ArrBase ==> PrelArr
CCall ==> PrelCCall (CCall interface available in 'exts')
ConcBase ==> PrelConc
GHCerr ==> PrelErr
Foreign ==> PrelForeign (Foreign interface available in 'exts')
GHC ==> PrelGHC
IOHandle ==> PrelHandle
IOBase ==> PrelIOBase
GHCmain ==> PrelMain
STBase ==> PrelST
Unsafe ==> PrelUnsafe
UnsafeST ==> PrelUnsafeST
Diffstat (limited to 'ghc/lib/misc/Regex.lhs')
-rw-r--r-- | ghc/lib/misc/Regex.lhs | 367 |
1 files changed, 367 insertions, 0 deletions
diff --git a/ghc/lib/misc/Regex.lhs b/ghc/lib/misc/Regex.lhs new file mode 100644 index 0000000000..2153b623cd --- /dev/null +++ b/ghc/lib/misc/Regex.lhs @@ -0,0 +1,367 @@ +\section[regex]{Haskell binding to the GNU regex library} + +What follows is a straightforward binding to the functions +provided by the GNU regex library (the GNU group of functions with Perl +like syntax) + +\begin{code} +{-# OPTIONS -#include "cbits/ghcRegex.h" #-} + +module Regex ( + PatBuffer(..), + re_compile_pattern, + re_match, + re_search, + re_match2, + re_search2, + + REmatch(..) + ) where + +import GlaExts +import CCall +import PackedString +import Array ( array, bounds, (!) ) +import PrelArr ( MutableByteArray(..), Array(..) ) +import PrelGHC ( MutableByteArray# ) +import Char ( ord ) +import Foreign + +\end{code} + +First, the higher level matching structure that the functions herein +return: +\begin{code} +-- +-- GroupBounds hold the interval where a group +-- matched inside a string, e.g. +-- +-- matching "reg(exp)" "a regexp" returns the pair (5,7) for the +-- (exp) group. (PackedString indices start from 0) + +type GroupBounds = (Int, Int) + +data REmatch + = REmatch (Array Int GroupBounds) -- for $1, ... $n + GroupBounds -- for $` (everything before match) + GroupBounds -- for $& (entire matched string) + GroupBounds -- for $' (everything after) + GroupBounds -- for $+ (matched by last bracket) +\end{code} + +Prior to any matching (or searching), the regular expression +have to compiled into an internal form, the pattern buffer. +Represent the pattern buffer as a Haskell heap object: + +\begin{code} +data PatBuffer = PatBuffer# (MutableByteArray# RealWorld) +instance CCallable PatBuffer +instance CReturnable PatBuffer + +createPatBuffer :: Bool -> IO PatBuffer + +createPatBuffer insensitive + = _casm_ ``%r = (int)sizeof(struct re_pattern_buffer);'' >>= \ sz -> + stToIO (newCharArray (0,sz)) >>= \ (MutableByteArray _ pbuf#) -> + let + pbuf = PatBuffer# pbuf# + in + (if insensitive then + {- + See comment re: fastmap below + -} + ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ tmap -> + {- + Set up the translate table so that any lowercase + char. gets mapped to an uppercase one. Beacuse quoting + inside CAsmStrings is Problematic, we pass in the ordinal values + of 'a','z' and 'A' + -} + _casm_ ``{ int i; + + for(i=0; i<256; i++) + ((char *)%0)[i] = (char)i; + for(i=(int)%1;i <=(int)%2;i++) + ((char *)%0)[i] = i - ((int)%1 - (int)%3); + }'' tmap (ord 'a') (ord 'z') (ord 'A') >> + _casm_ ``((struct re_pattern_buffer *)%0)->translate = %1; '' pbuf tmap + else + _casm_ ``((struct re_pattern_buffer *)%0)->translate = 0; '' pbuf) >> + {- + Use a fastmap to speed things up, would like to have the fastmap + in the Haskell heap, but it will get GCed before we can say regexp, + as the reference to it is buried inside a ByteArray :-( + -} + ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ fmap -> + _casm_ `` ((struct re_pattern_buffer *)%0)->fastmap = %1; '' pbuf fmap >> + {- + We want the compiler of the pattern to alloc. memory + for the pattern. + -} + _casm_ `` ((struct re_pattern_buffer *)%0)->buffer = 0; '' pbuf >> + _casm_ `` ((struct re_pattern_buffer *)%0)->allocated = 0; '' pbuf >> + return pbuf +\end{code} + +@re_compile_pattern@ converts a regular expression into a pattern buffer, +GNU style. + +Q: should we lift the syntax bits configuration up to the Haskell +programmer level ? + +\begin{code} +re_compile_pattern :: PackedString -- pattern to compile + -> Bool -- True <=> assume single-line mode + -> Bool -- True <=> case-insensitive + -> IO PatBuffer + +re_compile_pattern str single_line_mode insensitive + = createPatBuffer insensitive >>= \ pbuf -> + (if single_line_mode then -- match a multi-line buffer + _casm_ ``re_syntax_options = RE_PERL_SINGLELINE_SYNTAX;'' + else + _casm_ ``re_syntax_options = RE_PERL_MULTILINE_SYNTAX;'') >> + + _casm_ `` (int)re_compile_pattern((char *)%0, + (int)%1, + (struct re_pattern_buffer *)%2);'' + (unpackPS str) (lengthPS str) pbuf >>= \ () -> + -- + -- No checking for how the compilation of the pattern went yet. + -- + return pbuf +\end{code} + +Got a match? + +Each call to re_match uses a new re_registers structures, so we need +to ask the regex library to allocate enough memory to store the +registers in each time. That's what the line '... REGS_UNALLOCATED' +is all about. + +\begin{code} +re_match :: PatBuffer -- compiled regexp + -> PackedString -- string to match + -> Int -- start position + -> Bool -- True <=> record results in registers + -> IO (Maybe REmatch) + +re_match pbuf str start reg + = ((if reg then -- record result of match in registers + _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' + else + _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs -> + _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED; + %r=(int)re_match((struct re_pattern_buffer *)%0, + (char *)%1, + (int)%2, + (int)%3, + (struct re_registers *)%4);'' pbuf + (unpackPS str) + (lengthPS str) + start + regs >>= \ match_res -> + if match_res == (-2) then + error "re_match: Internal error" + else if match_res < 0 then + _casm_ ``free((struct re_registers *)%0); '' regs >> + return Nothing + else + build_re_match start (lengthPS str) regs >>= \ arr -> + _casm_ ``free(((struct re_registers *)%0)->start); + free(((struct re_registers *)%0)->end); + free((struct re_registers *)%0); '' regs >> + return (Just arr) +\end{code} + +Matching on 2 strings is useful when you're dealing with multiple +buffers, which is something that could prove useful for PackedStrings, +as we don't want to stuff the contents of a file into one massive heap +chunk, but load (smaller chunks) on demand. + +\begin{code} +re_match2 :: PatBuffer + -> PackedString + -> PackedString + -> Int + -> Int + -> Bool + -> IO (Maybe REmatch) + +re_match2 pbuf str1 str2 start stop reg + = ((if reg then -- record result of match in registers + _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' + else + _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs -> + _casm_ ``%r=(int)re_match_2((struct re_pattern_buffer *)%0, + (char *)%1, + (int)%2, + (char *)%3, + (int)%4, + (int)%5, + (struct re_registers *)%6, + (int)%7);'' pbuf + (unpackPS str1) + (lengthPS str1) + (unpackPS str2) + (lengthPS str2) + start + regs + stop >>= \ match_res -> + if match_res == (-2) then + error "re_match2: Internal error" + else if match_res < 0 then + _casm_ ``free((struct re_registers *)%0); '' regs >> + return Nothing + else + build_re_match start stop regs >>= \ arr -> + _casm_ ``free((struct re_registers *)%0); '' regs >> + return (Just arr) +\end{code} + +Find all the matches in a string: +\begin{code} +re_search :: PatBuffer -- the compiled regexp + -> PackedString -- the string to search + -> Int -- start index + -> Int -- stop index + -> Bool -- record result of match in registers + -> IO (Maybe REmatch) + +re_search pbuf str start range reg + = (if reg then -- record result of match in registers + _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' + else + _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs -> + _casm_ ``%r=(int)re_search((struct re_pattern_buffer *)%0, + (char *)%1, + (int)%2, + (int)%3, + (int)%4, + (struct re_registers *)%5);'' pbuf + (unpackPS str) + (lengthPS str) + start + range + regs >>= \ match_res -> + if match_res== (-1) then + _casm_ `` free((struct re_registers *)%0); '' regs >> + return Nothing + else + let + (st,en) = if range > start then + (start,range) + else + (range,start) + in + build_re_match st en regs >>= \ arr -> + _casm_ ``free((struct re_registers *)%0); '' regs >> + return (Just arr) +\end{code} + +Double buffer search: +\begin{code} +re_search2 :: PatBuffer + -> PackedString + -> PackedString + -> Int + -> Int + -> Int + -> Bool + -> IO (Maybe REmatch) + +re_search2 pbuf str1 str2 start range stop reg + + = (if reg then -- record result of match in registers + _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' + else + _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs -> + _casm_ ``%r=(int)re_search_2((struct re_pattern_buffer *)%0, + (char *)%1, + (int)%2, + (char *)%3, + (int)%4, + (int)%5, + (int)%6, + (struct re_registers *)%7, + (int)%8);'' pbuf + (unpackPS str1) + (lengthPS str1) + (unpackPS str2) + (lengthPS str2) + start + range + regs + stop >>= \ match_res -> + if match_res== (-1) then + _casm_ `` free((struct re_registers *)%0); '' regs >> + return Nothing + else + let + (st,en) = if range > start then + (start,range) + else + (range,start) + in + build_re_match st en regs >>= \ arr -> + _casm_ `` free((struct re_registers *)%0); '' regs >> + return (Just arr) +\end{code} + +\begin{code} +build_re_match :: Int + -> Int + -> Addr + -> IO REmatch + +build_re_match str_start str_end regs + = _casm_ ``%r=(int)(*(struct re_registers *)%0).num_regs;'' regs >>= \ len -> + match_reg_to_array regs len >>= \ (match_start,match_end,arr) -> + let + (1,x) = bounds arr + + bef = (str_start,match_start) -- $' + aft = (match_end,str_end) -- $` + lst = arr!x -- $+ + mtch = (match_start,match_end) -- $& + in + return (REmatch arr + bef + mtch + aft + lst) + where + match_reg_to_array regs len + = trundleIO regs (0,[]) len >>= \ (no,ls) -> + let + (st,end,ls') + = case ls of + [] -> (0,0,[]) + [(a,b)] -> (a,b,ls) + ((a,b):xs) -> (a,b,xs) + in + return + (st, + end, + array (1,max 1 (no-1)) + [ (i, x) | (i,x) <- zip [1..] ls']) + + trundleIO :: Addr + -> (Int,[(Int,Int)]) + -> Int + -> IO (Int,[(Int,Int)]) + + trundleIO regs (i,acc) len + | i==len = return (i,reverse acc) + | otherwise + = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1];'' regs i >>= \ start -> + _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1];'' regs i >>= \ end -> + let + acc' = (start,end):acc + in + if (start == (-1)) && (end == (-1)) then + return (i,reverse acc) + else + trundleIO regs (i+1,acc') len +\end{code} + |