summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Natanael Arndt <arndtn@gmail.com> 2021-03-13 09:22:50 +0100
committer GitHub <noreply@github.com> 2021-03-13 09:22:50 +0100
commit 1b77d030131b3261b397bf500bc892ac0cdcaa84 (patch)
tree f518d3e5990d50e84744aac7bd9378d48bd94205
parent 83b3e99e6e4ed4cd6825773c24280373537b2ca9 (diff)
parent 9653eefb1a51de751c44dab3c072cf190a85844e (diff)
download rdflib-1b77d030131b3261b397bf500bc892ac0cdcaa84.tar.gz
Merge pull request #1272 from white-gecko/feature/speedupN3
speedup notation3/turtle parser
-rwxr-xr-x rdflib/plugins/parsers/notation3.py 288
1 files changed, 151 insertions, 137 deletions
diff --git a/rdflib/plugins/parsers/notation3.py b/rdflib/plugins/parsers/notation3.py
index ad90e67b..3cf19ace 100755
--- a/rdflib/plugins/parsers/notation3.py
+++ b/rdflib/plugins/parsers/notation3.py
@@ -137,7 +137,7 @@ def join(here, there):
return here + frag
# join('mid:foo@example', '../foo') bzzt
- if here[bcolonl + 1 : bcolonl + 2] != "/":
+ if here[bcolonl + 1] != "/":
raise ValueError(
"Base <%s> has no slash after "
"colon - with relative '%s'." % (here, there)
@@ -303,14 +303,15 @@ option_noregen = 0 # If set, do not regenerate genids on output
# characters. The XML spec switched to assuming unknown things were name
# characters.
# _namechars = string.lowercase + string.uppercase + string.digits + '_-'
-_notQNameChars = "\t\r\n !\"#$&'()*,+/;<=>?@[\\]^`{|}~" # else valid qname :-/
-_notKeywordsChars = _notQNameChars + "."
-_notNameChars = _notQNameChars + ":" # Assume anything else valid name :-/
+_notQNameChars = set("\t\r\n !\"#$&'()*,+/;<=>?@[\\]^`{|}~") # else valid qname :-/
+_notKeywordsChars = _notQNameChars | {"."}
+_notNameChars = _notQNameChars | {":"} # Assume anything else valid name :-/
_rdfns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-hexChars = "ABCDEFabcdef0123456789"
-escapeChars = "(_~.-!$&'()*+,;=/?#@%)" # valid for \ escapes in localnames
-
+hexChars = set("ABCDEFabcdef0123456789")
+escapeChars = set("(_~.-!$&'()*+,;=/?#@%)") # valid for \ escapes in localnames
+numberChars = set("0123456789-")
+numberCharsPlus = numberChars | {"+", "."}
def unicodeExpand(m):
try:
@@ -503,19 +504,19 @@ class SinkParser:
"""
assert tok[0] not in _notNameChars # not for punctuation
- if argstr[i : i + 1] == "@":
- i = i + 1
+ if argstr[i] == "@":
+ i += 1
else:
if tok not in self.keywords:
return -1 # No, this has neither keywords declaration nor "@"
+ i_plus_len_tok = i + len(tok)
if (
- argstr[i : i + len(tok)] == tok
- and (argstr[i + len(tok)] in _notKeywordsChars)
- or (colon and argstr[i + len(tok)] == ":")
+ argstr[i : i_plus_len_tok] == tok
+ and (argstr[i_plus_len_tok] in _notKeywordsChars)
+ or (colon and argstr[i_plus_len_tok] == ":")
):
- i = i + len(tok)
- return i
+ return i_plus_len_tok
else:
return -1
@@ -527,10 +528,11 @@ class SinkParser:
assert tok[0] not in _notNameChars # not for punctuation
- if argstr[i : i + len(tok)].lower() == tok.lower() and (
- argstr[i + len(tok)] in _notQNameChars
+ len_tok = len(tok)
+ if argstr[i : i + len_tok].lower() == tok.lower() and (
+ argstr[i + len_tok] in _notQNameChars
):
- i = i + len(tok)
+ i += len_tok
return i
else:
return -1
@@ -802,10 +804,10 @@ class SinkParser:
res.append(("<-", self._store.newSymbol(Logic_NS + "implies")))
return i + 2
- if argstr[i : i + 1] == "=":
+ if argstr[i] == "=":
if self.turtle:
self.BadSyntax(argstr, i, "Found '=' in Turtle mode")
- if argstr[i + 1 : i + 2] == ">":
+ if argstr[i + 1] == ">":
res.append(("->", self._store.newSymbol(Logic_NS + "implies")))
return i + 2
res.append(("->", DAML_sameAs))
@@ -845,8 +847,8 @@ class SinkParser:
if j < 0:
return j # nope
- while argstr[j : j + 1] in "!^": # no spaces, must follow exactly (?)
- ch = argstr[j : j + 1]
+ while argstr[j] in {"!", "^"}: # no spaces, must follow exactly (?)
+ ch = argstr[j]
subj = res.pop()
obj = self.blankNode(uri=self.here(j))
j = self.node(argstr, j + 1, res)
@@ -880,7 +882,7 @@ class SinkParser:
if j < 0:
return j # eof
i = j
- ch = argstr[i : i + 1] # Quick 1-character checks first:
+ ch = argstr[i] # Quick 1-character checks first:
if ch == "[":
bnodeID = self.here(i)
@@ -888,7 +890,7 @@ class SinkParser:
if j < 0:
self.BadSyntax(argstr, i, "EOF after '['")
# Hack for "is" binding name to anon node
- if argstr[j : j + 1] == "=":
+ if argstr[j] == "=":
if self.turtle:
self.BadSyntax(
argstr, j, "Found '[=' or '[ =' when in turtle mode."
@@ -906,8 +908,8 @@ class SinkParser:
self.BadSyntax(
argstr, i, "EOF when objectList expected after [ = "
)
- if argstr[j : j + 1] == ";":
- j = j + 1
+ if argstr[j] == ";":
+ j += 1
else:
self.BadSyntax(argstr, i, "objectList expected after [= ")
@@ -923,7 +925,7 @@ class SinkParser:
self.BadSyntax(
argstr, i, "EOF when ']' expected after [ <propertyList>"
)
- if argstr[j : j + 1] != "]":
+ if argstr[j] != "]":
self.BadSyntax(argstr, j, "']' expected")
res.append(subj)
return j + 1
@@ -932,7 +934,7 @@ class SinkParser:
# if self.turtle:
# self.BadSyntax(argstr, i,
# "found '{' while in Turtle mode, Formulas not supported!")
- ch2 = argstr[i + 1 : i + 2]
+ ch2 = argstr[i + 1]
if ch2 == "$":
# a set
i += 1
@@ -948,7 +950,7 @@ class SinkParser:
break
if not first_run:
- if argstr[i : i + 1] == ",":
+ if argstr[i] == ",":
i += 1
else:
self.BadSyntax(argstr, i, "expected: ','")
@@ -983,7 +985,7 @@ class SinkParser:
if i < 0:
self.BadSyntax(argstr, i, "needed '}', found end.")
- if argstr[i : i + 1] == "}":
+ if argstr[i] == "}":
j = i + 1
break
@@ -1002,7 +1004,7 @@ class SinkParser:
if ch == "(":
thing_type = self._store.newList
- ch2 = argstr[i + 1 : i + 2]
+ ch2 = argstr[i + 1]
if ch2 == "$":
thing_type = self._store.newSet
i += 1
@@ -1013,7 +1015,7 @@ class SinkParser:
i = self.skipSpace(argstr, j)
if i < 0:
self.BadSyntax(argstr, i, "needed ')', found end.")
- if argstr[i : i + 1] == ")":
+ if argstr[i] == ")":
j = i + 1
break
@@ -1096,9 +1098,9 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
self.BadSyntax(argstr, j, "EOF found in list of objects")
- if argstr[i : i + 1] != ";":
+ if argstr[i] != ";":
return i
- i = i + 1 # skip semicolon and continue
+ i += 1 # skip semicolon and continue
def commaSeparatedList(self, argstr, j, res, what):
"""return value: -1 bad syntax; >1 new position in argstr
@@ -1117,7 +1119,7 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
return j # eof
- ch = argstr[j : j + 1]
+ ch = argstr[j]
if ch != ",":
if ch != ".":
return -1
@@ -1134,7 +1136,7 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
self.BadSyntax(argstr, j, "EOF found after object")
- if argstr[j : j + 1] != ",":
+ if argstr[j] != ",":
return j # Found something else!
i = self.object(argstr, j + 1, res)
if i < 0:
@@ -1144,11 +1146,12 @@ class SinkParser:
j = self.skipSpace(argstr, i)
if j < 0:
return j # eof
- if argstr[j : j + 1] == ".":
+ ch = argstr[j]
+ if ch == ".":
return j + 1 # skip
- if argstr[j : j + 1] == "}":
+ if ch == "}":
return j # don't skip it
- if argstr[j : j + 1] == "]":
+ if ch == "]":
return j
self.BadSyntax(argstr, j, "expected '.' or '}' or ']' at end of statement")
@@ -1178,10 +1181,7 @@ class SinkParser:
else:
self.BadSyntax(argstr, i, 'Prefix "%s:" not bound' % (pfx))
symb = self._store.newSymbol(ns + ln)
- if symb in self._variables:
- res.append(self._variables[symb])
- else:
- res.append(symb) # @@@ "#" CONVENTION
+ res.append(self._variables.get(symb, symb))
return j
i = self.skipSpace(argstr, i)
@@ -1197,31 +1197,26 @@ class SinkParser:
return -1
elif argstr[i] == "<":
- i = i + 1
- st = i
- while i < len(argstr):
- if argstr[i] == ">":
- uref = argstr[st:i] # the join should dealt with "":
-
- # expand unicode escapes
- uref = unicodeEscape8.sub(unicodeExpand, uref)
- uref = unicodeEscape4.sub(unicodeExpand, uref)
-
- if self._baseURI:
- uref = join(self._baseURI, uref) # was: uripath.join
- else:
- assert (
- ":" in uref
- ), "With no base URI, cannot deal with relative URIs"
- if argstr[i - 1 : i] == "#" and not uref[-1:] == "#":
- uref = uref + "#" # She meant it! Weirdness in urlparse?
- symb = self._store.newSymbol(uref)
- if symb in self._variables:
- res.append(self._variables[symb])
- else:
- res.append(symb)
- return i + 1
- i = i + 1
+ st = i + 1
+ i = argstr.find(">", st)
+ if i >= 0:
+ uref = argstr[st:i] # the join should dealt with "":
+
+ # expand unicode escapes
+ uref = unicodeEscape8.sub(unicodeExpand, uref)
+ uref = unicodeEscape4.sub(unicodeExpand, uref)
+
+ if self._baseURI:
+ uref = join(self._baseURI, uref) # was: uripath.join
+ else:
+ assert (
+ ":" in uref
+ ), "With no base URI, cannot deal with relative URIs"
+ if argstr[i - 1] == "#" and not uref[-1:] == "#":
+ uref += "#" # She meant it! Weirdness in urlparse?
+ symb = self._store.newSymbol(uref)
+ res.append(self._variables.get(symb, symb))
+ return i + 1
self.BadSyntax(argstr, j, "unterminated URI reference")
elif self.keywordsSet:
@@ -1239,20 +1234,31 @@ class SinkParser:
def skipSpace(self, argstr, i):
"""Skip white space, newlines and comments.
return -1 if EOF, else position of first non-ws character"""
+
+ # Most common case is a non-commented line starting with few spaces and tabs.
+ try:
+ while True:
+ ch = argstr[i]
+ if ch in {" ", "\t"}:
+ i += 1
+ continue
+ elif ch not in {"#", "\r", "\n"}:
+ return i
+ break
+ except IndexError:
+ return -1
+
while 1:
m = eol.match(argstr, i)
if m is None:
break
- self.lines = self.lines + 1
- i = m.end() # Point to first character unmatched
- self.startOfLine = i
+ self.lines += 1
+ self.startOfLine = i = m.end() # Point to first character unmatched
m = ws.match(argstr, i)
if m is not None:
i = m.end()
m = eof.match(argstr, i)
- if m is not None:
- return -1
- return i
+ return i if m is None else -1
def variable(self, argstr, i, res):
""" ?abc -> variable(:abc)
@@ -1262,14 +1268,15 @@ class SinkParser:
if j < 0:
return -1
- if argstr[j : j + 1] != "?":
+ if argstr[j] != "?":
return -1
- j = j + 1
+ j += 1
i = j
- if argstr[j] in "0123456789-":
+ if argstr[j] in numberChars:
self.BadSyntax(argstr, j, "Varible name can't start with '%s'" % argstr[j])
- while i < len(argstr) and argstr[i] not in _notKeywordsChars:
- i = i + 1
+ len_argstr = len(argstr)
+ while i < len_argstr and argstr[i] not in _notKeywordsChars:
+ i += 1
if self._parentContext is None:
varURI = self._store.newSymbol(self._baseURI + "#" + argstr[j:i])
if varURI not in self._variables:
@@ -1297,11 +1304,12 @@ class SinkParser:
if j < 0:
return -1
- if argstr[j] in "0123456789-" or argstr[j] in _notKeywordsChars:
+ if argstr[j] in numberChars or argstr[j] in _notKeywordsChars:
return -1
i = j
- while i < len(argstr) and argstr[i] not in _notKeywordsChars:
- i = i + 1
+ len_argstr = len(argstr)
+ while i < len_argstr and argstr[i] not in _notKeywordsChars:
+ i += 1
res.append(argstr[j:i])
return i
@@ -1317,29 +1325,29 @@ class SinkParser:
return -1
c = argstr[i]
- if c in "0123456789-+.":
+ if c in numberCharsPlus:
return -1
+ len_argstr = len(argstr)
if c not in _notNameChars:
- ln = c
- i = i + 1
- while i < len(argstr):
- c = argstr[i]
- if c not in _notNameChars:
- ln = ln + c
- i = i + 1
- else:
- break
+ j = i
+ i += 1
+
+ try:
+ while argstr[i] not in _notNameChars:
+ i += 1
+ except IndexError:
+ pass # Very rare.
if argstr[i - 1] == ".": # qname cannot end with "."
- ln = ln[:-1]
- if not ln:
- return -1
i -= 1
+ if i == j:
+ return -1
+ ln = argstr[j:i]
else: # First character is non-alpha
ln = "" # Was: None - TBL (why? useful?)
- if i < len(argstr) and argstr[i] == ":":
+ if i < len_argstr and argstr[i] == ":":
pfx = ln
# bnodes names have different rules
if pfx == "_":
@@ -1347,18 +1355,18 @@ class SinkParser:
else:
allowedChars = _notQNameChars
- i = i + 1
+ i += 1
lastslash = False
- # start = i # TODO first char .
+ start = i
ln = ""
- while i < len(argstr):
+ while i < len_argstr:
c = argstr[i]
- if not lastslash and c == "\\":
+ if c == "\\" and not lastslash: # Very rare.
lastslash = True
- i += 1
-
- elif lastslash or c not in allowedChars:
-
+ if start < i:
+ ln += argstr[start:i]
+ start = i + 1
+ elif c not in allowedChars or lastslash: # Most common case is "a-zA-Z"
if lastslash:
if c not in escapeChars:
raise BadSyntax(
@@ -1368,7 +1376,7 @@ class SinkParser:
i,
"illegal escape " + c,
)
- elif c == "%":
+ elif c == "%": # Very rare.
if (
argstr[i + 1] not in hexChars
or argstr[i + 2] not in hexChars
@@ -1380,12 +1388,10 @@ class SinkParser:
i,
"illegal hex escape " + c,
)
-
- ln = ln + c
- i = i + 1
lastslash = False
else:
break
+ i += 1
if lastslash:
raise BadSyntax(
@@ -1394,11 +1400,13 @@ class SinkParser:
if argstr[i - 1] == ".":
# localname cannot end in .
- ln = ln[:-1]
- if not ln:
+ if len(ln) == 0 and start == i:
return -1
i -= 1
+ if start < i:
+ ln += argstr[start:i]
+
res.append((pfx, ln))
return i
@@ -1419,12 +1427,15 @@ class SinkParser:
else:
i = j
- if argstr[i] in self.string_delimiters:
- if argstr[i : i + 3] == argstr[i] * 3:
- delim = argstr[i] * 3
+ ch = argstr[i]
+ if ch in self.string_delimiters:
+ ch_three = ch * 3
+ if argstr[i : i + 3] == ch_three:
+ delim = ch_three
+ i += 3
else:
- delim = argstr[i]
- i = i + len(delim)
+ delim = ch
+ i += 1
j, s = self.strconst(argstr, i, delim)
@@ -1446,7 +1457,7 @@ class SinkParser:
i = j
ch = argstr[i]
- if ch in "-+0987654321.":
+ if ch in numberCharsPlus:
m = exponent_syntax.match(argstr, i)
if m:
j = m.end()
@@ -1467,17 +1478,19 @@ class SinkParser:
# return -1 ## or fall through?
- if argstr[i] in self.string_delimiters:
- if argstr[i : i + 3] == argstr[i] * 3:
- delim = argstr[i] * 3
+ ch_three = ch * 3
+ if ch in self.string_delimiters:
+ if argstr[i : i + 3] == ch_three:
+ delim = ch_three
+ i += 3
else:
- delim = argstr[i]
- i = i + len(delim)
+ delim = ch
+ i += 1
dt = None
j, s = self.strconst(argstr, i, delim)
lang = None
- if argstr[j : j + 1] == "@": # Language?
+ if argstr[j] == "@": # Language?
m = langcode.match(argstr, j + 1)
if m is None:
raise BadSyntax(
@@ -1515,7 +1528,8 @@ class SinkParser:
j = i
ustr = "" # Empty unicode string
startline = self.lines # Remember where for error messages
- while j < len(argstr):
+ len_argstr = len(argstr)
+ while j < len_argstr:
if argstr[j] == delim1:
if delim == delim1: # done when delim is " or '
i = j + 1
@@ -1525,19 +1539,19 @@ class SinkParser:
): # done when delim is """ or ''' and, respectively ...
if argstr[j : j + 5] == delim5: # ... we have "" or '' before
i = j + 5
- ustr = ustr + delim2
+ ustr += delim2
return i, ustr
if argstr[j : j + 4] == delim4: # ... we have " or ' before
i = j + 4
- ustr = ustr + delim1
+ ustr += delim1
return i, ustr
if argstr[j : j + 3] == delim3: # current " or ' is part of delim
i = j + 3
return i, ustr
# we are inside of the string and current char is " or '
- j = j + 1
- ustr = ustr + delim1
+ j += 1
+ ustr += delim1
continue
m = interesting.search(argstr, j) # was argstr[j:].
@@ -1549,7 +1563,7 @@ class SinkParser:
i = m.start()
try:
- ustr = ustr + argstr[j:i]
+ ustr += argstr[j:i]
except UnicodeError:
err = ""
for c in argstr[j:i]:
@@ -1570,11 +1584,11 @@ class SinkParser:
if ch == delim1:
j = i
continue
- elif ch in ('"', "'") and ch != delim1:
- ustr = ustr + ch
+ elif ch in {'"', "'"} and ch != delim1:
+ ustr += ch
j = i + 1
continue
- elif ch in "\r\n":
+ elif ch in {"\r", "\n"}:
if delim == delim1:
raise BadSyntax(
self._thisDoc,
@@ -1583,14 +1597,14 @@ class SinkParser:
i,
"newline found in string literal",
)
- self.lines = self.lines + 1
- ustr = ustr + ch
+ self.lines += 1
+ ustr += ch
j = i + 1
self.startOfLine = j
elif ch == "\\":
j = i + 1
- ch = argstr[j : j + 1] # Will be empty if string ends
+ ch = argstr[j] # NOTE(review): unlike the old slice, indexing raises IndexError (not "") if string ends
if not ch:
raise BadSyntax(
self._thisDoc,
@@ -1602,14 +1616,14 @@ class SinkParser:
k = "abfrtvn\\\"'".find(ch)
if k >= 0:
uch = "\a\b\f\r\t\v\n\\\"'"[k]
- ustr = ustr + uch
- j = j + 1
+ ustr += uch
+ j += 1
elif ch == "u":
j, ch = self.uEscape(argstr, j + 1, startline)
- ustr = ustr + ch
+ ustr += ch
elif ch == "U":
j, ch = self.UEscape(argstr, j + 1, startline)
- ustr = ustr + ch
+ ustr += ch
else:
self.BadSyntax(argstr, i, "bad escape")