summaryrefslogtreecommitdiff
path: root/examples/inv_regex.py
blob: 8a0c1bb7af8f6cb0bfe8d2e195651fea3c68ed90 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
#
# original file: https://raw.githubusercontent.com/pyparsing/pyparsing/pyparsing_3.0.9/examples/invRegex.py
#
# Copyright 2008, Paul McGuire
#
# pyparsing script to expand a regular expression into all possible matching strings
# Supports:
# - {n} and {m,n} repetition, but not unbounded + or * repetition
# - ? optional elements
# - [] character ranges
# - () grouping
# - | alternation
#
__all__ = ["count", "invert"]

from pyparsing import (
    Literal,
    one_of,
    Empty,
    printables,
    ParserElement,
    Combine,
    SkipTo,
    infix_notation,
    ParseFatalException,
    Word,
    nums,
    OpAssoc,
    Suppress,
    ParseResults,
    srange,
)

ParserElement.enablePackrat()


class CharacterRangeEmitter:
    def __init__(self, chars):
        # remove duplicate chars in character range, but preserve original order
        seen = set()
        self.charset = "".join(seen.add(c) or c for c in chars if c not in seen)

    def __str__(self):
        return "[" + self.charset + "]"

    def __repr__(self):
        return "[" + self.charset + "]"

    def make_generator(self):
        def gen_chars():
            yield from self.charset

        return gen_chars


class OptionalEmitter:
    def __init__(self, expr):
        self.expr = expr

    def make_generator(self):
        def optional_gen():
            yield ""
            yield from self.expr.make_generator()()

        return optional_gen


class DotEmitter:
    def make_generator(self):
        def dot_gen():
            yield from printables

        return dot_gen


class GroupEmitter:
    def __init__(self, exprs):
        self.exprs = ParseResults(exprs)

    def make_generator(self):
        def group_gen():
            def recurse_list(elist):
                if len(elist) == 1:
                    yield from elist[0].make_generator()()
                else:
                    for s in elist[0].make_generator()():
                        for s2 in recurse_list(elist[1:]):
                            yield s + s2

            if self.exprs:
                yield from recurse_list(self.exprs)

        return group_gen


class AlternativeEmitter:
    def __init__(self, exprs):
        self.exprs = exprs

    def make_generator(self):
        def alt_gen():
            for e in self.exprs:
                yield from e.make_generator()()

        return alt_gen


class LiteralEmitter:
    def __init__(self, lit):
        self.lit = lit

    def __str__(self):
        return "Lit:" + self.lit

    def __repr__(self):
        return "Lit:" + self.lit

    def make_generator(self):
        def lit_gen():
            yield self.lit

        return lit_gen


def handle_range(toks):
    return CharacterRangeEmitter(srange(toks[0]))


def handle_repetition(toks):
    toks = toks[0]
    if toks[1] in "*+":
        raise ParseFatalException("", 0, "unbounded repetition operators not supported")
    if toks[1] == "?":
        return OptionalEmitter(toks[0])
    if "count" in toks:
        return GroupEmitter([toks[0]] * int(toks.count))
    if "minCount" in toks:
        mincount = int(toks.minCount)
        maxcount = int(toks.maxCount)
        optcount = maxcount - mincount
        if optcount:
            opt = OptionalEmitter(toks[0])
            for i in range(1, optcount):
                opt = OptionalEmitter(GroupEmitter([toks[0], opt]))
            return GroupEmitter([toks[0]] * mincount + [opt])
        else:
            return [toks[0]] * mincount


def handle_literal(toks):
    lit = ""
    for t in toks:
        if t[0] == "\\":
            if t[1] == "t":
                lit += "\t"
            else:
                lit += t[1]
        else:
            lit += t
    return LiteralEmitter(lit)


def handle_macro(toks):
    macro_char = toks[0][1]
    if macro_char == "d":
        return CharacterRangeEmitter("0123456789")
    elif macro_char == "w":
        return CharacterRangeEmitter(srange("[A-Za-z0-9_]"))
    elif macro_char == "s":
        return LiteralEmitter(" ")
    else:
        raise ParseFatalException(
            "", 0, "unsupported macro character (" + macro_char + ")"
        )


def handle_sequence(toks):
    return GroupEmitter(toks[0])


def handle_dot():
    return CharacterRangeEmitter(printables)


def handle_alternative(toks):
    return AlternativeEmitter(toks[0])


_parser = None


def parser():
    global _parser
    if _parser is None:
        ParserElement.set_default_whitespace_chars("")
        lbrack, rbrack, lbrace, rbrace, lparen, rparen, colon, qmark = Literal.using_each(
            "[]{}():?"
        )

        re_macro = Combine("\\" + one_of("d w s"))
        escaped_char = ~re_macro + Combine("\\" + one_of(list(printables)))
        re_literal_char = (
            "".join(c for c in printables if c not in r"\[]{}().*?+|") + " \t"
        )

        re_range = Combine(lbrack + SkipTo(rbrack, ignore=escaped_char) + rbrack) # type: ignore 
        re_literal = escaped_char | one_of(list(re_literal_char))
        re_non_capture_group = Suppress("?:")
        re_dot = Literal(".")
        repetition = (
            (lbrace + Word(nums)("count") + rbrace)
            | (lbrace + Word(nums)("minCount") + "," + Word(nums)("maxCount") + rbrace)
            | one_of(list("*+?"))
        )

        re_range.add_parse_action(handle_range)
        re_literal.add_parse_action(handle_literal)
        re_macro.add_parse_action(handle_macro)
        re_dot.add_parse_action(handle_dot)

        re_term = re_literal | re_range | re_macro | re_dot | re_non_capture_group
        re_expr = infix_notation(
            re_term,
            [
                (repetition, 1, OpAssoc.LEFT, handle_repetition),
                (Empty(), 2, OpAssoc.LEFT, handle_sequence),
                (Suppress("|"), 2, OpAssoc.LEFT, handle_alternative),
            ],
        )
        _parser = re_expr

    return _parser


def count(gen):
    """Simple function to count the number of elements returned by a generator."""
    return sum(1 for _ in gen)


def invert(regex):
    r"""
    Call this routine as a generator to return all the strings that
    match the input regular expression.
        for s in invert(r"[A-Z]{3}\d{3}"):
            print s
    """
    invre = GroupEmitter(parser().parseString(regex)).make_generator()
    return invre()


def main():
    tests = r"""
    [A-EA]
    [A-D]*
    [A-D]{3}
    X[A-C]{3}Y
    X[A-C]{3}\(
    X\d
    foobar\d\d
    foobar{2}
    foobar{2,9}
    fooba[rz]{2}
    (foobar){2}
    ([01]\d)|(2[0-5])
    (?:[01]\d)|(2[0-5])
    ([01]\d\d)|(2[0-4]\d)|(25[0-5])
    [A-C]{1,2}
    [A-C]{0,3}
    [A-C]\s[A-C]\s[A-C]
    [A-C]\s?[A-C][A-C]
    [A-C]\s([A-C][A-C])
    [A-C]\s([A-C][A-C])?
    [A-C]{2}\d{2}
    @|TH[12]
    @(@|TH[12])?
    @(@|TH[12]|AL[12]|SP[123]|TB(1[0-9]?|20?|[3-9]))?
    @(@|TH[12]|AL[12]|SP[123]|TB(1[0-9]?|20?|[3-9])|OH(1[0-9]?|2[0-9]?|30?|[4-9]))?
    (([ECMP]|HA|AK)[SD]|HS)T
    [A-CV]{2}
    A[cglmrstu]|B[aehikr]?|C[adeflmorsu]?|D[bsy]|E[rsu]|F[emr]?|G[ade]|H[efgos]?|I[nr]?|Kr?|L[airu]|M[dgnot]|N[abdeiop]?|Os?|P[abdmortu]?|R[abefghnu]|S[bcegimnr]?|T[abcehilm]|Uu[bhopqst]|U|V|W|Xe|Yb?|Z[nr]
    (a|b)|(x|y)
    (a|b) (x|y)
    [ABCDEFG](?:#|##|b|bb)?(?:maj|min|m|sus|aug|dim)?[0-9]?(?:/[ABCDEFG](?:#|##|b|bb)?)?
    (Fri|Mon|S(atur|un)|T(hur|ue)s|Wednes)day
    A(pril|ugust)|((Dec|Nov|Sept)em|Octo)ber|(Febr|Jan)uary|Ju(ly|ne)|Ma(rch|y)
    """.splitlines()

    for t in tests:
        t = t.strip()
        if not t:
            continue
        print("-" * 50)
        print(t)
        try:
            num = count(invert(t))
            print(num)
            maxprint = 30
            for s in invert(t):
                print(s)
                maxprint -= 1
                if not maxprint:
                    break
        except ParseFatalException as pfe:
            print(pfe.msg)
            print("")
            continue
        print("")


if __name__ == "__main__":
    main()