# rdflib/plugins/stores/regexmatching.py

"""
This wrapper intercepts calls through the store interface which make use of
the REGEXTerm class to represent matches by REGEX instead of literal
comparison.

Implemented for stores that don't support this and essentially
provides the support by replacing the REGEXTerms by wildcards (None) and
matching against the results from the store it's wrapping.
"""

from rdflib.store import Store
from rdflib.graph import Graph


import re

# Flags describing how a store performs REGEX matching.
# NOTE(review): neither constant is referenced by the code visible in this
# module; presumably they are consumed elsewhere -- confirm before changing
# their values.

# Store is capable of doing its own REGEX matching
NATIVE_REGEX = 0
# Store uses Python's re module internally for REGEX matching
PYTHON_REGEX = 1


class REGEXTerm(str):
    """
    A string subclass that marks a term slot as a REGEX pattern.

    REGEXTerm can be used in any term slot and is interpreted as a request to
    perform a REGEX match (not a string comparison). The string value is the
    pattern text; it is compiled once at construction time and stored in
    ``compiledExpr`` so that matching does not recompile it.
    """

    def __init__(self, expr):
        """
        Pre-compile ``expr`` with :func:`re.compile`.

        Raises ``re.error`` if ``expr`` is not a valid regular expression.
        """
        self.compiledExpr = re.compile(expr)

    def __reduce__(self):
        """
        Return the ``(callable, args)`` pair used by pickle / copy.

        The pattern text itself is passed back to the constructor so that the
        reconstructed term carries the same expression (and a freshly
        compiled ``compiledExpr``). Reducing to an empty string would yield a
        term whose regex matches every value.
        """
        # str(self) yields a plain ``str`` holding the pattern text.
        return (REGEXTerm, (str(self),))


def regexCompareQuad(quad, regexQuad):
    """
    Check a concrete quad against a quad that may contain REGEXTerms.

    Only the first four positions are inspected. A position is tested only
    when the corresponding entry of ``regexQuad`` is a REGEXTerm; its
    pre-compiled expression is then applied with ``match`` (anchored at the
    start of the value) to the entry of ``quad`` at the same position.
    Entries of ``regexQuad`` that are not REGEXTerms are ignored.

    Returns False as soon as one REGEXTerm fails to match, True otherwise.
    """
    for position in (0, 1, 2, 3):
        pattern = regexQuad[position]
        if not isinstance(pattern, REGEXTerm):
            # Not a REGEX slot: nothing to verify at this position.
            continue
        if pattern.compiledExpr.match(quad[position]) is None:
            return False
    return True


class REGEXMatching(Store):
    """
    Store wrapper that adds REGEXTerm support on top of another store.

    Queries and removals containing REGEXTerms are sent to the wrapped store
    with each REGEXTerm replaced by a wildcard (``None``); the results are
    then filtered here against the compiled expressions. All other calls are
    delegated to the wrapped store.
    """

    def __init__(self, storage):
        """Wrap ``storage`` and mirror its capability flags."""
        self.storage = storage
        self.context_aware = storage.context_aware
        # NOTE: this store can't be formula_aware as it doesn't have enough
        # info to reverse the removal of a quoted statement.
        # NOTE(review): despite the note above, the flag is copied from the
        # wrapped store -- confirm which behaviour is intended.
        self.formula_aware = storage.formula_aware
        self.transaction_aware = storage.transaction_aware

    @staticmethod
    def _has_regex_term(triple, context):
        """Return True if any triple term or the context identifier is a
        REGEXTerm."""
        if any(isinstance(term, REGEXTerm) for term in triple):
            return True
        return (context is not None and
                isinstance(context.identifier, REGEXTerm))

    @staticmethod
    def _wildcard_query(triple, context):
        """Return ``(pattern, context)`` with every REGEX slot replaced by
        ``None``.

        Non-REGEX terms are passed through untouched, including falsy ones
        (e.g. an empty string), so that they keep constraining the query.
        """
        pattern = tuple(
            None if isinstance(term, REGEXTerm) else term for term in triple)
        if context is not None and isinstance(context.identifier, REGEXTerm):
            context = None
        return pattern, context

    def open(self, configuration, create=True):
        """Open the wrapped store and return its result."""
        return self.storage.open(configuration, create)

    def close(self, commit_pending_transaction=False):
        """Close the wrapped store, forwarding ``commit_pending_transaction``
        so the caller's choice is honoured."""
        self.storage.close(
            commit_pending_transaction=commit_pending_transaction)

    def destroy(self, configuration):
        """Destroy the wrapped store."""
        self.storage.destroy(configuration)

    def add(self, triple, context, quoted=False):
        """Add ``triple`` to the wrapped store unchanged."""
        (subject, predicate, object_) = triple
        self.storage.add((subject, predicate, object_), context, quoted)

    def remove(self, triple, context=None):
        """
        Remove triples matching ``triple`` (and ``context``).

        Without REGEXTerms the call is delegated directly. Otherwise the
        wrapped store is queried with wildcards in the REGEX slots, each
        returned (triple, context identifier) quad is checked with
        ``regexCompareQuad``, and the matching quads are removed one by one.
        """
        (subject, predicate, object_) = triple
        terms = (subject, predicate, object_)
        if not self._has_regex_term(terms, context):
            self.storage.remove(terms, context)
            return

        # One or more of the terms is a REGEX expression, so we must
        # replace it / them with wildcard(s) and match after we query.
        pattern, query_ctx = self._wildcard_query(terms, context)
        ctx_term = context.identifier if context is not None else None
        regex_quad = (subject, predicate, object_, ctx_term)

        # Matches are collected before any removal is issued (presumably so
        # the store is not modified while its results are being iterated).
        to_remove = []
        for (s1, p1, o1), cg in self.storage.triples(pattern, query_ctx):
            for ctx in cg:
                ctx_id = ctx.identifier
                if regexCompareQuad((s1, p1, o1, ctx_id), regex_quad):
                    to_remove.append((s1, p1, o1, ctx_id))
        for s1, p1, o1, ctx_id in to_remove:
            target = Graph(self, ctx_id) if ctx_id is not None else None
            self.storage.remove((s1, p1, o1), target)

    def triples(self, triple, context=None):
        """
        Yield ``(triple, contexts)`` pairs matching ``triple``/``context``.

        Without REGEXTerms the wrapped store's results are yielded as-is.
        Otherwise the wrapped store is queried with wildcards in the REGEX
        slots; a result is yielded only if its terms match the REGEXTerms
        and at least one of its contexts matches the context REGEX (when
        one was given). The yielded contexts are restricted to the matching
        ones.
        """
        (subject, predicate, object_) = triple
        terms = (subject, predicate, object_)
        if not self._has_regex_term(terms, context):
            for (s1, p1, o1), cg in self.storage.triples(terms, context):
                yield (s1, p1, o1), cg
            return

        # One or more of the terms is a REGEX expression, so we must
        # replace it / them with wildcard(s) and match after we query.
        pattern, query_ctx = self._wildcard_query(terms, context)
        ctx_regex = None
        if context is not None and isinstance(context.identifier, REGEXTerm):
            ctx_regex = context.identifier.compiledExpr
        # Contexts are filtered separately below, hence the None in slot 4.
        regex_quad = (subject, predicate, object_, None)

        for (s1, p1, o1), cg in self.storage.triples(pattern, query_ctx):
            matching = [
                ctx for ctx in cg
                if ctx_regex is None or ctx_regex.match(ctx.identifier)]
            if matching and regexCompareQuad((s1, p1, o1, None), regex_quad):
                yield (s1, p1, o1), (ctx for ctx in matching)

    def __len__(self, context=None):
        """Return the wrapped store's length for ``context``."""
        # NOTE: If the context is a REGEX this could be an expensive
        # proposition
        return self.storage.__len__(context)

    def contexts(self, triple=None):
        """Yield the contexts reported by the wrapped store for ``triple``."""
        # NOTE: There is no way to control REGEX matching for this method at
        # this level as it only returns the contexts, not the matching
        # triples.
        for ctx in self.storage.contexts(triple):
            yield ctx

    def remove_context(self, identifier):
        """Remove every triple of the context ``identifier`` from the wrapped
        store."""
        # NOTE(review): ``identifier`` is passed straight through as the
        # context argument, whereas ``remove`` above expects an object with
        # an ``.identifier`` attribute -- confirm what the wrapped store
        # expects here.
        self.storage.remove((None, None, None), identifier)

    def bind(self, prefix, namespace):
        """Bind ``prefix`` to ``namespace`` in the wrapped store."""
        self.storage.bind(prefix, namespace)

    def prefix(self, namespace):
        """Return the wrapped store's prefix for ``namespace``."""
        return self.storage.prefix(namespace)

    def namespace(self, prefix):
        """Return the wrapped store's namespace for ``prefix``."""
        return self.storage.namespace(prefix)

    def namespaces(self):
        """Return the wrapped store's namespace bindings."""
        return self.storage.namespaces()

    def commit(self):
        """Commit on the wrapped store."""
        self.storage.commit()

    def rollback(self):
        """Roll back on the wrapped store."""
        self.storage.rollback()