diff options
| author | Nicholas Car <nicholas.car@surroundaustralia.com> | 2020-08-27 13:13:45 +1000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-08-27 13:13:45 +1000 |
| commit | 3afffcd19d3a5d240e83b3a59b53e3ee1120c165 (patch) | |
| tree | 42ba0191f0a8f645cbc5b60aefd8a3cbfc383a8b /rdflib/plugins | |
| parent | 3e42f5eea742563cdeab7d655fe55f7d0e25ea16 (diff) | |
| parent | 94295389204175783c2f369c2826f0ba55a2d42c (diff) | |
| download | rdflib-improve_graph_parse.tar.gz | |
Merge branch 'master' into improve_graph_parse (branch: improve_graph_parse)
Diffstat (limited to 'rdflib/plugins')
| -rwxr-xr-x | rdflib/plugins/parsers/notation3.py | 102 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/nquads.py | 11 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/nt.py | 33 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/ntriples.py | 122 | ||||
| -rw-r--r-- | rdflib/plugins/parsers/trig.py | 12 |
5 files changed, 143 insertions, 137 deletions
diff --git a/rdflib/plugins/parsers/notation3.py b/rdflib/plugins/parsers/notation3.py index c427f153..d866977d 100755 --- a/rdflib/plugins/parsers/notation3.py +++ b/rdflib/plugins/parsers/notation3.py @@ -139,10 +139,13 @@ def join(here, there): return here + frag # join('mid:foo@example', '../foo') bzzt - if here[bcolonl + 1: bcolonl + 2] != "/": - raise ValueError("Base <%s> has no slash after " "colon - with relative '%s'." % (here, there)) + if here[bcolonl + 1 : bcolonl + 2] != "/": + raise ValueError( + "Base <%s> has no slash after " + "colon - with relative '%s'." % (here, there) + ) - if here[bcolonl + 1: bcolonl + 3] == "//": + if here[bcolonl + 1 : bcolonl + 3] == "//": bpath = here.find("/", bcolonl + 3) else: bpath = bcolonl + 1 @@ -502,14 +505,14 @@ class SinkParser: """ assert tok[0] not in _notNameChars # not for punctuation - if argstr[i: i + 1] == "@": + if argstr[i : i + 1] == "@": i = i + 1 else: if tok not in self.keywords: return -1 # No, this has neither keywords declaration nor "@" if ( - argstr[i: i + len(tok)] == tok + argstr[i : i + len(tok)] == tok and (argstr[i + len(tok)] in _notKeywordsChars) or (colon and argstr[i + len(tok)] == ":") ): @@ -526,7 +529,7 @@ class SinkParser: assert tok[0] not in _notNameChars # not for punctuation - if argstr[i: i + len(tok)].lower() == tok.lower() and ( + if argstr[i : i + len(tok)].lower() == tok.lower() and ( argstr[i + len(tok)] in _notQNameChars ): i = i + len(tok) @@ -794,23 +797,23 @@ class SinkParser: res.append(("->", RDF_type)) return j - if argstr[i: i + 2] == "<=": + if argstr[i : i + 2] == "<=": if self.turtle: self.BadSyntax(argstr, i, "Found '<=' in Turtle mode. 
") res.append(("<-", self._store.newSymbol(Logic_NS + "implies"))) return i + 2 - if argstr[i: i + 1] == "=": + if argstr[i : i + 1] == "=": if self.turtle: self.BadSyntax(argstr, i, "Found '=' in Turtle mode") - if argstr[i + 1: i + 2] == ">": + if argstr[i + 1 : i + 2] == ">": res.append(("->", self._store.newSymbol(Logic_NS + "implies"))) return i + 2 res.append(("->", DAML_sameAs)) return i + 1 - if argstr[i: i + 2] == ":=": + if argstr[i : i + 2] == ":=": if self.turtle: self.BadSyntax(argstr, i, "Found ':=' in Turtle mode") @@ -823,7 +826,7 @@ class SinkParser: res.append(("->", r[0])) return j - if argstr[i: i + 2] == ">-" or argstr[i: i + 2] == "<-": + if argstr[i : i + 2] == ">-" or argstr[i : i + 2] == "<-": self.BadSyntax(argstr, j, ">- ... -> syntax is obsolete.") return -1 @@ -844,8 +847,8 @@ class SinkParser: if j < 0: return j # nope - while argstr[j: j + 1] in "!^": # no spaces, must follow exactly (?) - ch = argstr[j: j + 1] + while argstr[j : j + 1] in "!^": # no spaces, must follow exactly (?) + ch = argstr[j : j + 1] subj = res.pop() obj = self.blankNode(uri=self.here(j)) j = self.node(argstr, j + 1, res) @@ -879,7 +882,7 @@ class SinkParser: if j < 0: return j # eof i = j - ch = argstr[i: i + 1] # Quick 1-character checks first: + ch = argstr[i : i + 1] # Quick 1-character checks first: if ch == "[": bnodeID = self.here(i) @@ -887,7 +890,7 @@ class SinkParser: if j < 0: self.BadSyntax(argstr, i, "EOF after '['") # Hack for "is" binding name to anon node - if argstr[j: j + 1] == "=": + if argstr[j : j + 1] == "=": if self.turtle: self.BadSyntax( argstr, j, "Found '[=' or '[ =' when in turtle mode." 
@@ -905,7 +908,7 @@ class SinkParser: self.BadSyntax( argstr, i, "EOF when objectList expected after [ = " ) - if argstr[j: j + 1] == ";": + if argstr[j : j + 1] == ";": j = j + 1 else: self.BadSyntax(argstr, i, "objectList expected after [= ") @@ -922,7 +925,7 @@ class SinkParser: self.BadSyntax( argstr, i, "EOF when ']' expected after [ <propertyList>" ) - if argstr[j: j + 1] != "]": + if argstr[j : j + 1] != "]": self.BadSyntax(argstr, j, "']' expected") res.append(subj) return j + 1 @@ -931,7 +934,7 @@ class SinkParser: # if self.turtle: # self.BadSyntax(argstr, i, # "found '{' while in Turtle mode, Formulas not supported!") - ch2 = argstr[i + 1: i + 2] + ch2 = argstr[i + 1 : i + 2] if ch2 == "$": # a set i += 1 @@ -942,12 +945,12 @@ class SinkParser: i = self.skipSpace(argstr, j) if i < 0: self.BadSyntax(argstr, i, "needed '$}', found end.") - if argstr[i: i + 2] == "$}": + if argstr[i : i + 2] == "$}": j = i + 2 break if not first_run: - if argstr[i: i + 1] == ",": + if argstr[i : i + 1] == ",": i += 1 else: self.BadSyntax(argstr, i, "expected: ','") @@ -982,7 +985,7 @@ class SinkParser: if i < 0: self.BadSyntax(argstr, i, "needed '}', found end.") - if argstr[i: i + 1] == "}": + if argstr[i : i + 1] == "}": j = i + 1 break @@ -1001,7 +1004,7 @@ class SinkParser: if ch == "(": thing_type = self._store.newList - ch2 = argstr[i + 1: i + 2] + ch2 = argstr[i + 1 : i + 2] if ch2 == "$": thing_type = self._store.newSet i += 1 @@ -1012,7 +1015,7 @@ class SinkParser: i = self.skipSpace(argstr, j) if i < 0: self.BadSyntax(argstr, i, "needed ')', found end.") - if argstr[i: i + 1] == ")": + if argstr[i : i + 1] == ")": j = i + 1 break @@ -1065,7 +1068,7 @@ class SinkParser: break i = j + 1 - if argstr[j: j + 2] == ":-": + if argstr[j : j + 2] == ":-": if self.turtle: self.BadSyntax(argstr, j, "Found in ':-' in Turtle mode") i = j + 2 @@ -1095,7 +1098,7 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: self.BadSyntax(argstr, j, "EOF found in list of objects") 
- if argstr[i: i + 1] != ";": + if argstr[i : i + 1] != ";": return i i = i + 1 # skip semicolon and continue @@ -1116,7 +1119,7 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: return j # eof - ch = argstr[j: j + 1] + ch = argstr[j : j + 1] if ch != ",": if ch != ".": return -1 @@ -1133,7 +1136,7 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: self.BadSyntax(argstr, j, "EOF found after object") - if argstr[j: j + 1] != ",": + if argstr[j : j + 1] != ",": return j # Found something else! i = self.object(argstr, j + 1, res) if i < 0: @@ -1143,11 +1146,11 @@ class SinkParser: j = self.skipSpace(argstr, i) if j < 0: return j # eof - if argstr[j: j + 1] == ".": + if argstr[j : j + 1] == ".": return j + 1 # skip - if argstr[j: j + 1] == "}": + if argstr[j : j + 1] == "}": return j # don't skip it - if argstr[j: j + 1] == "]": + if argstr[j : j + 1] == "]": return j self.BadSyntax(argstr, j, "expected '.' or '}' or ']' at end of statement") @@ -1212,7 +1215,7 @@ class SinkParser: assert ( ":" in uref ), "With no base URI, cannot deal with relative URIs" - if argstr[i - 1: i] == "#" and not uref[-1:] == "#": + if argstr[i - 1 : i] == "#" and not uref[-1:] == "#": uref = uref + "#" # She meant it! Weirdness in urlparse? symb = self._store.newSymbol(uref) if symb in self._variables: @@ -1261,7 +1264,7 @@ class SinkParser: if j < 0: return -1 - if argstr[j: j + 1] != "?": + if argstr[j : j + 1] != "?": return -1 j = j + 1 i = j @@ -1419,7 +1422,7 @@ class SinkParser: i = j if argstr[i] in self.string_delimiters: - if argstr[i: i + 3] == argstr[i] * 3: + if argstr[i : i + 3] == argstr[i] * 3: delim = argstr[i] * 3 else: delim = argstr[i] @@ -1467,7 +1470,7 @@ class SinkParser: # return -1 ## or fall through? 
if argstr[i] in self.string_delimiters: - if argstr[i: i + 3] == argstr[i] * 3: + if argstr[i : i + 3] == argstr[i] * 3: delim = argstr[i] * 3 else: delim = argstr[i] @@ -1476,7 +1479,7 @@ class SinkParser: dt = None j, s = self.strconst(argstr, i, delim) lang = None - if argstr[j: j + 1] == "@": # Language? + if argstr[j : j + 1] == "@": # Language? m = langcode.match(argstr, j + 1) if m is None: raise BadSyntax( @@ -1487,9 +1490,9 @@ class SinkParser: "Bad language code syntax on string " + "literal, after @", ) i = m.end() - lang = argstr[j + 1: i] + lang = argstr[j + 1 : i] j = i - if argstr[j: j + 2] == "^^": + if argstr[j : j + 2] == "^^": res2 = [] j = self.uri_ref2(argstr, j + 2, res2) # Read datatype URI dt = res2[0] @@ -1522,15 +1525,15 @@ class SinkParser: if ( delim == delim3 ): # done when delim is """ or ''' and, respectively ... - if argstr[j: j + 5] == delim5: # ... we have "" or '' before + if argstr[j : j + 5] == delim5: # ... we have "" or '' before i = j + 5 ustr = ustr + delim2 return i, ustr - if argstr[j: j + 4] == delim4: # ... we have " or ' before + if argstr[j : j + 4] == delim4: # ... we have " or ' before i = j + 4 ustr = ustr + delim1 return i, ustr - if argstr[j: j + 3] == delim3: # current " or ' is part of delim + if argstr[j : j + 3] == delim3: # current " or ' is part of delim i = j + 3 return i, ustr @@ -1542,8 +1545,8 @@ class SinkParser: m = interesting.search(argstr, j) # was argstr[j:]. # Note for pos param to work, MUST be compiled ... re bug? 
assert m, "Quote expected in string at ^ in %s^%s" % ( - argstr[j - 20: j], - argstr[j: j + 20], + argstr[j - 20 : j], + argstr[j : j + 20], ) # at least need a quote i = m.start() @@ -1589,7 +1592,7 @@ class SinkParser: elif ch == "\\": j = i + 1 - ch = argstr[j: j + 1] # Will be empty if string ends + ch = argstr[j : j + 1] # Will be empty if string ends if not ch: raise BadSyntax( self._thisDoc, @@ -1620,14 +1623,14 @@ class SinkParser: self._thisDoc, startline, argstr, i, "unterminated string literal(3)" ) try: - return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i: i + n]) + return i + n, reg.sub(unicodeExpand, "\\" + prefix + argstr[i : i + n]) except: raise BadSyntax( self._thisDoc, startline, argstr, i, - "bad string literal hex escape: " + argstr[i: i + n], + "bad string literal hex escape: " + argstr[i : i + n], ) def uEscape(self, argstr, i, startline): @@ -1672,7 +1675,7 @@ class BadSyntax(SyntaxError): self._why, pre, argstr[st:i], - argstr[i: i + 60], + argstr[i : i + 60], post, ) @@ -1896,8 +1899,11 @@ class TurtleParser(Parser): baseURI = graph.absolutize(source.getPublicId() or source.getSystemId() or "") p = SinkParser(sink, baseURI=baseURI, turtle=turtle) - - p.loadStream(source.getByteStream()) + # N3 parser prefers str stream + stream = source.getCharacterStream() + if not stream: + stream = source.getByteStream() + p.loadStream(stream) for prefix, namespace in p._bindings.items(): graph.bind(prefix, namespace) diff --git a/rdflib/plugins/parsers/nquads.py b/rdflib/plugins/parsers/nquads.py index a3bfbc6e..2a3a9136 100644 --- a/rdflib/plugins/parsers/nquads.py +++ b/rdflib/plugins/parsers/nquads.py @@ -31,7 +31,7 @@ from codecs import getreader from rdflib import ConjunctiveGraph # Build up from the NTriples parser: -from rdflib.plugins.parsers.ntriples import NTriplesParser +from rdflib.plugins.parsers.ntriples import W3CNTriplesParser from rdflib.plugins.parsers.ntriples import ParseError from rdflib.plugins.parsers.ntriples import 
r_tail from rdflib.plugins.parsers.ntriples import r_wspace @@ -39,7 +39,7 @@ from rdflib.plugins.parsers.ntriples import r_wspace __all__ = ["NQuadsParser"] -class NQuadsParser(NTriplesParser): +class NQuadsParser(W3CNTriplesParser): def parse(self, inputsource, sink, bnode_context=None, **kwargs): """ Parse inputsource as an N-Quads file. @@ -57,13 +57,14 @@ class NQuadsParser(NTriplesParser): ) self.sink = ConjunctiveGraph(store=sink.store, identifier=sink.identifier) - source = inputsource.getByteStream() + source = inputsource.getCharacterStream() + if not source: + source = inputsource.getByteStream() + source = getreader("utf-8")(source) if not hasattr(source, "read"): raise ParseError("Item to parse must be a file-like object.") - source = getreader("utf-8")(source) - self.file = source self.buffer = "" while True: diff --git a/rdflib/plugins/parsers/nt.py b/rdflib/plugins/parsers/nt.py deleted file mode 100644 index c37a1aa0..00000000 --- a/rdflib/plugins/parsers/nt.py +++ /dev/null @@ -1,33 +0,0 @@ -from rdflib.parser import Parser -from rdflib.plugins.parsers.ntriples import NTriplesParser - -__all__ = ["NTSink", "NTParser"] - - -class NTSink(object): - def __init__(self, graph): - self.graph = graph - - def triple(self, s, p, o): - self.graph.add((s, p, o)) - - -class NTParser(Parser): - """parser for the ntriples format, often stored with the .nt extension - - See http://www.w3.org/TR/rdf-testcases/#ntriples""" - - def parse(self, source, sink, **kwargs): - ''' - Parse the NT format - - :type source: `rdflib.parser.InputSource` - :param source: the source of NT-formatted data - :type sink: `rdflib.graph.Graph` - :param sink: where to send parsed triples - :param kwargs: Additional arguments to pass to `.NTriplesParser.parse` - ''' - f = source.getByteStream() # TODO getCharacterStream? 
- parser = NTriplesParser(NTSink(sink)) - parser.parse(f, **kwargs) - f.close() diff --git a/rdflib/plugins/parsers/ntriples.py b/rdflib/plugins/parsers/ntriples.py index 33a4a4e6..d43a240c 100644 --- a/rdflib/plugins/parsers/ntriples.py +++ b/rdflib/plugins/parsers/ntriples.py @@ -1,9 +1,6 @@ -#!/usr/bin/env python -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +#!/usr/bin/env python3 -__doc__ = """ +__doc__ = """\ N-Triples Parser License: GPL 2, W3C, BSD, or MIT Author: Sean B. Palmer, inamidst.com @@ -15,14 +12,13 @@ import codecs from rdflib.term import URIRef as URI from rdflib.term import BNode as bNode from rdflib.term import Literal - - -from rdflib.compat import cast_bytes from rdflib.compat import decodeUnicodeEscape +from rdflib.exceptions import ParserError as ParseError +from rdflib.parser import Parser -from io import BytesIO +from io import StringIO, TextIOBase, BytesIO -__all__ = ["unquote", "uriquote", "Sink", "NTriplesParser"] +__all__ = ["unquote", "uriquote", "W3CNTriplesParser", "NTGraphSink", "NTParser"] uriref = r'<([^:]+:[^\s"<>]*)>' literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"' @@ -40,15 +36,7 @@ bufsiz = 2048 validate = False -class Node(str): - pass - - -class ParseError(Exception): - pass - - -class Sink(object): +class DummySink(object): def __init__(self): self.length = 0 @@ -78,7 +66,7 @@ def unquote(s): while s: m = r_safe.match(s) if m: - s = s[m.end():] + s = s[m.end() :] result.append(m.group(1)) continue @@ -90,7 +78,7 @@ def unquote(s): m = r_uniquot.match(s) if m: - s = s[m.end():] + s = s[m.end() :] u, U = m.groups() codepoint = int(u or U, 16) if codepoint > 0x10FFFF: @@ -113,11 +101,10 @@ def uriquote(uri): return r_hibyte.sub(lambda m: "%%%02X" % ord(m.group(1)), uri) -class NTriplesParser(object): +class W3CNTriplesParser(object): """An N-Triples Parser. 
- + This is a legacy-style Triples parser for NTriples provided by W3C Usage:: - p = NTriplesParser(sink=MySink()) sink = p.parse(f) # file; use parsestring for a string @@ -127,6 +114,8 @@ class NTriplesParser(object): `NTriplesParser`. """ + __slots__ = ("_bnode_ids", "sink", "buffer", "file", "line") + def __init__(self, sink=None, bnode_context=None): if bnode_context is not None: self._bnode_ids = bnode_context @@ -136,7 +125,11 @@ class NTriplesParser(object): if sink is not None: self.sink = sink else: - self.sink = Sink() + self.sink = DummySink() + + self.buffer = None + self.file = None + self.line = "" def parse(self, f, bnode_context=None): """ @@ -150,10 +143,13 @@ class NTriplesParser(object): passed in to define a distinct context for a given call to `parse`. """ + if not hasattr(f, "read"): raise ParseError("Item to parse must be a file-like object.") - # since N-Triples 1.1 files can and should be utf-8 encoded - f = codecs.getreader("utf-8")(f) + + if not hasattr(f, "encoding") and not hasattr(f, "charbuffer"): + # someone still using a bytestream here? 
+ f = codecs.getreader("utf-8")(f) self.file = f self.buffer = "" @@ -164,16 +160,17 @@ class NTriplesParser(object): try: self.parseline(bnode_context=bnode_context) except ParseError: - raise ParseError("Invalid line: %r" % self.line) + raise ParseError("Invalid line: {}".format(self.line)) return self.sink def parsestring(self, s, **kwargs): """Parse s as an N-Triples string.""" - if not isinstance(s, str): + if not isinstance(s, (str, bytes, bytearray)): raise ParseError("Item to parse must be a string instance.") - f = BytesIO() - f.write(cast_bytes(s)) - f.seek(0) + if isinstance(s, (bytes, bytearray)): + f = codecs.getreader("utf-8")(BytesIO(s)) + else: + f = StringIO(s) self.parse(f, **kwargs) def readline(self): @@ -189,7 +186,7 @@ class NTriplesParser(object): while True: m = r_line.match(self.buffer) if m: # the more likely prospect - self.buffer = self.buffer[m.end():] + self.buffer = self.buffer[m.end() :] return m.group(1) else: buffer = self.file.read(bufsiz) @@ -211,12 +208,12 @@ class NTriplesParser(object): predicate = self.predicate() self.eat(r_wspaces) - object = self.object(bnode_context) + object_ = self.object(bnode_context) self.eat(r_tail) if self.line: - raise ParseError("Trailing garbage") - self.sink.triple(subject, predicate, object) + raise ParseError("Trailing garbage: {}".format(self.line)) + self.sink.triple(subject, predicate, object_) def peek(self, token): return self.line.startswith(token) @@ -227,7 +224,7 @@ class NTriplesParser(object): # print(dir(pattern)) # print repr(self.line), type(self.line) raise ParseError("Failed to eat %s at %s" % (pattern.pattern, self.line)) - self.line = self.line[m.end():] + self.line = self.line[m.end() :] return m def subject(self, bnode_context=None): @@ -295,13 +292,44 @@ class NTriplesParser(object): return False -# # Obsolete, unused -# def parseURI(uri): -# import urllib -# parser = NTriplesParser() -# u = urllib.urlopen(uri) -# sink = parser.parse(u) -# u.close() -# # for triple in 
sink: -# # print triple -# print 'Length of input:', sink.length +class NTGraphSink(object): + __slots__ = ("g",) + + def __init__(self, graph): + self.g = graph + + def triple(self, s, p, o): + self.g.add((s, p, o)) + + +class NTParser(Parser): + """parser for the ntriples format, often stored with the .nt extension + + See http://www.w3.org/TR/rdf-testcases/#ntriples""" + + __slots__ = set() + + @classmethod + def parse(cls, source, sink, **kwargs): + """ + Parse the NT format + + :type source: `rdflib.parser.InputSource` + :param source: the source of NT-formatted data + :type sink: `rdflib.graph.Graph` + :param sink: where to send parsed triples + :param kwargs: Additional arguments to pass to `.NTriplesParser.parse` + """ + f = source.getCharacterStream() + if not f: + b = source.getByteStream() + # TextIOBase includes: StringIO and TextIOWrapper + if isinstance(b, TextIOBase): + # f is not really a ByteStream, but a CharacterStream + f = b + else: + # since N-Triples 1.1 files can and should be utf-8 encoded + f = codecs.getreader("utf-8")(b) + parser = W3CNTriplesParser(NTGraphSink(sink)) + parser.parse(f, **kwargs) + f.close() diff --git a/rdflib/plugins/parsers/trig.py b/rdflib/plugins/parsers/trig.py index 8f270de0..938fb259 100644 --- a/rdflib/plugins/parsers/trig.py +++ b/rdflib/plugins/parsers/trig.py @@ -82,7 +82,7 @@ class TrigSinkParser(SinkParser): if j < 0: self.BadSyntax(argstr, i, "EOF found when expected graph") - if argstr[j: j + 1] == "=": # optional = for legacy support + if argstr[j : j + 1] == "=": # optional = for legacy support i = self.skipSpace(argstr, j + 1) if i < 0: @@ -90,7 +90,7 @@ class TrigSinkParser(SinkParser): else: i = j - if argstr[i: i + 1] != "{": + if argstr[i : i + 1] != "{": return -1 # the node wasn't part of a graph j = i + 1 @@ -106,7 +106,7 @@ class TrigSinkParser(SinkParser): if i < 0: self.BadSyntax(argstr, i, "needed '}', found end.") - if argstr[i: i + 1] == "}": + if argstr[i : i + 1] == "}": j = i + 1 break 
@@ -153,7 +153,11 @@ class TrigParser(Parser): ) p = TrigSinkParser(sink, baseURI=baseURI, turtle=True) - p.loadStream(source.getByteStream()) + stream = source.getCharacterStream() # try to get str stream first + if not stream: + # fallback to get the bytes stream + stream = source.getByteStream() + p.loadStream(stream) for prefix, namespace in p._bindings.items(): conj_graph.bind(prefix, namespace) |
