1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
|
\section[match]{PackedString functions for matching}
This module provides regular expression matching and substitution
at the PackedString level. It is built on top of the GNU Regex
library modified to handle perl regular expression syntax.
For a complete description of the perl syntax, do `man perlre`
or have a gander in (Programming|Learning) Perl. Here's
a short summary:
^ matches the beginning of line
$ matches end of line
\b matches word boundary
\B matches non-word boundary
\w matches a word(alpha-numeric) character
\W matches a non-word character
\d matches a digit
\D matches a non-digit
\s matches whitespace
\S matches non-whitespace
\A matches beginning of buffer
\Z matches end-of-buffer
. matches any (bar newline in single-line mode)
+ matches 1 or more times
* matches 0 or more times
? matches 0 or 1
{n,m} matches >=n and <=m atoms
{n,} matches at least n times
{n} matches n times
[..] matches any character member of char class.
(..) if pattern inside parens match, then the ith group is bound
to the matched string
\digit matches whatever the ith group matched.
Backslashed letters
\n newline
\r carriage return
\t tab
\f formfeed
\v vertical tab
\a alarm bell
\e escape
\begin{code}
module MatchPS
(
matchPS,
searchPS,
substPS,
replacePS,
match2PS,
search2PS,
getMatchesNo,
getMatchedGroup,
getWholeMatch,
getLastMatch,
getAfterMatch,
findPS,
rfindPS,
chopPS,
matchPrefixPS,
REmatch(..)
) where
import GlaExts
import PackedString
import Array ((!), bounds)
import Char ( isDigit, ord )
import PrelBase ( Char(..) )
import Regex
\end{code}
\subsection[ps-matching]{PackedString matching}
Posix matching, returning an array of the the intervals that
the individual groups matched within the string.
\begin{code}
matchPS :: PackedString -- reg. exp
-> PackedString -- string to match
-> [Char] -- flags
-> Maybe REmatch
matchPS reg str flags
= let
insensitive = 'i' `elem` flags
mode = 's' `elem` flags
in
unsafePerformIO (do
pat <- re_compile_pattern reg mode insensitive
re_match pat str 0 True)
match2PS :: PackedString -- reg. exp
-> PackedString -- string1 to match
-> PackedString -- string2 to match
-> [Char] -- flags
-> Maybe REmatch
match2PS reg str1 str2 flags
= let
insensitive = 'i' `elem` flags
mode = 's' `elem` flags
len1 = lengthPS str1
len2 = lengthPS str2
in
unsafePerformIO (do
pat <- re_compile_pattern reg mode insensitive
re_match2 pat str1 str2 0 (len1+len2) True)
\end{code}
PackedString front-end to searching with GNU Regex
\begin{code}
searchPS :: PackedString -- reg. exp
-> PackedString -- string to match
-> [Char] -- flags
-> Maybe REmatch
searchPS reg str flags
= let
insensitive = 'i' `elem` flags
mode = 's' `elem` flags
in
unsafePerformIO (do
pat <- re_compile_pattern reg mode insensitive
re_search pat str
0
(lengthPS str)
True)
search2PS :: PackedString -- reg. exp
-> PackedString -- string to match
-> PackedString -- string to match
-> [Char] -- flags
-> Maybe REmatch
search2PS reg str1 str2 flags
= let
insensitive = 'i' `elem` flags
mode = 's' `elem` flags
len1 = lengthPS str1
len2 = lengthPS str2
len = len1+len2
in
unsafePerformIO (do
pat <- re_compile_pattern reg mode insensitive
re_search2 pat
str1
str2
0
len
len
True)
\end{code}
@substrPS s st end@ cuts out the chunk in \tr{s} between \tr{st} and \tr{end}, inclusive.
The \tr{Regex} registers represent substrings by storing the start and the end point plus
one( st==end => empty string) , so we use @chunkPS@ instead.
\begin{code}
chunkPS :: PackedString
-> (Int,Int)
-> PackedString
chunkPS str (st,end)
= if st==end then
nilPS
else
substrPS str st (max 0 (end-1))
\end{code}
Perl-like match and substitute
\begin{code}
substPS :: PackedString -- reg. exp
-> PackedString -- replacement
-> [Char] -- flags
-> PackedString -- string
-> PackedString
substPS rexp repl flags pstr = search pstr
where
global = 'g' `elem` flags
case_insensitive = 'i' `elem` flags
mode = 's' `elem` flags -- single-line mode
pat = unsafePerformIO (
re_compile_pattern rexp mode case_insensitive)
search str
= let
search_res
= unsafePerformIO (re_search pat str 0 (lengthPS str) True)
in
case search_res of
Nothing -> str
Just matcher@(REmatch _ before match after _) ->
let
(st,en) = match
prefix = chunkPS str before
suffix
| global && (st /= en) = search (dropPS en str)
| otherwise = chunkPS str after
in
concatPS [prefix,
replace matcher repl str,
suffix]
replace :: REmatch
-> PackedString
-> PackedString
-> PackedString
replace (REmatch arr (_,b_end) match after lst)
replacement
str
= concatPS (reverse acc) -- ToDo: write a `reversed' version of concatPS
where
(_,b) = bounds arr
acc = replace' [] replacement False
single :: Char -> PackedString
single x = consPS x nilPS
replace' :: [PackedString]
-> PackedString
-> Bool
-> [PackedString]
replace' acc repl escaped
| nullPS repl = acc
| otherwise =
let
x = headPS repl
x# = case x of { C# c -> c }
xs = tailPS repl
in
case x# of
'\\'# ->
if escaped then
replace' acc xs True
else
replace' ((single x):acc) xs (not escaped)
'$'# ->
if (not escaped) then
let
x' = headPS xs
xs' = tailPS xs
ith_ival = arr!num
(num,xs_num) = getNumber ((ord x') - ord '0') xs'
in
if (isDigit x') && (num<=b) then
replace' ((chunkPS str ith_ival):acc) xs_num escaped
else if x' == '&' then
replace' ((chunkPS str match):acc) xs' escaped
else if x' == '+' then
replace' ((chunkPS str lst):acc) xs' escaped
else if x' == '`' then
replace' ((chunkPS str (0,b_end)):acc) xs' escaped
else if x' == '\'' then
replace' ((chunkPS str after):acc) xs' escaped
else -- ignore
replace' acc xs escaped
else
replace' ((single x):acc) xs False
_ -> if escaped then
(case x# of
'n'# -> -- newline
replace' ((single '\n'):acc)
'f'# -> -- formfeed
replace' ((single '\f'):acc)
'r'# -> -- carriage return
replace' ((single '\r'):acc)
't'# -> -- (horiz) tab
replace' ((single '\t'):acc)
'v'# -> -- vertical tab
replace' ((single '\v'):acc)
'a'# -> -- alarm bell
replace' ((single '\a'):acc)
'e'# -> -- escape
replace' ((single '\033'):acc)
_ ->
replace' ((single x):acc)) xs False
else
replace' ((single x):acc) xs False
getNumber :: Int -> PackedString -> (Int,PackedString)
getNumber acc ps
= if nullPS ps then
(acc,ps)
else
let
x = headPS ps
xs = tailPS ps
in
if (isDigit x) then
getNumber (acc*10+(ord x - ord '0')) xs
else
(acc,ps)
\end{code}
Just like substPS, but no prefix and suffix.
\begin{code}
replacePS :: PackedString -- reg. exp
-> PackedString -- replacement
-> [Char] -- flags
-> PackedString -- string
-> PackedString
replacePS rexp
repl
flags
str
= search str
where
case_insensitive = 'i' `elem` flags
mode = 's' `elem` flags -- single-line mode
pat = unsafePerformIO (
re_compile_pattern rexp mode case_insensitive)
search str
= let
search_res
= unsafePerformIO (re_search pat str 0 (lengthPS str) True)
in
case search_res of
Nothing -> str
Just matcher@(REmatch arr _ match _ lst) ->
replace matcher repl str
\end{code}
Picking matched groups out of string
\begin{code}
getMatchesNo :: REmatch
-> Int
getMatchesNo (REmatch arr _ _ _ _)
= snd (bounds arr)
getMatchedGroup :: REmatch
-> Int
-> PackedString
-> PackedString
getMatchedGroup (REmatch arr bef mtch _ lst) nth str
| (nth >= 1) && (nth <= grps) = chunkPS str (arr!nth)
| otherwise = error "getMatchedGroup: group out of range"
where
(1,grps) = bounds arr
getWholeMatch :: REmatch -> PackedString -> PackedString
getWholeMatch (REmatch _ _ mtch _ _) str
= chunkPS str mtch
getLastMatch :: REmatch
-> PackedString
-> PackedString
getLastMatch (REmatch _ _ _ _ lst) str
= chunkPS str lst
getAfterMatch :: REmatch
-> PackedString
-> PackedString
getAfterMatch (REmatch _ _ _ aft _) str
= chunkPS str aft
\end{code}
More or less straight translation of a brute-force string matching
function written in C. (Sedgewick ch. 18)
This is intended to provide much the same facilities as index/rindex in perl.
\begin{code}
findPS :: PackedString
-> PackedString
-> Maybe Int
findPS str substr
= let
m = lengthPS substr
n = lengthPS str
loop i j
| j>=m || i>=n = if j==m then (Just (i-m)) else Nothing
| otherwise
= inner_loop i j
inner_loop i j
= if j<m && i<n && (indexPS str i /= indexPS substr j) then
inner_loop (i-j+1) 0
else
loop (i+1) (j+1)
in
loop 0 0
rfindPS :: PackedString
-> PackedString
-> Maybe Int
rfindPS str substr
= let
m = lengthPS substr - 1
n = lengthPS str - 1
loop i j
| j<0 || i<0 = if j<0 then (Just (i+1)) else Nothing
| otherwise
= inner_loop i j
inner_loop i j
= if j>=0 && i>=0 && (indexPS str i /= indexPS substr j) then
inner_loop (i+(m-j)-1) m
else
loop (i-1) (j-1)
in
loop n m
\end{code}
\begin{code}
chopPS :: PackedString -> PackedString
chopPS str = if nullPS str then
nilPS
else
chunkPS str (0,lengthPS str-1)
\end{code}
Tries to match as much as possible of strA starting from the beginning of strB
(handy when matching fancy literals in parsers)
\begin{code}
matchPrefixPS :: PackedString
-> PackedString
-> Int
matchPrefixPS pref str
= matchPrefixPS' pref str 0
where
matchPrefixPS' pref str n
= if (nullPS pref) || (nullPS str) then
n
else if (headPS pref) == (headPS str) then
matchPrefixPS' (tailPS pref) (tailPS str) (n+1)
else
n
\end{code}
|