diff options
Diffstat (limited to 'Lib/sre_parse.py')
| -rw-r--r-- | Lib/sre_parse.py | 37 |
1 files changed, 30 insertions, 7 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index a53735b07d..db01e844b4 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -13,6 +13,7 @@ # XXX: show string offset and offending character for all errors from sre_constants import * +import unicodedata SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -264,19 +265,19 @@ class Tokenizer: result += c self.__next() return result - def getuntil(self, terminator): + def getuntil(self, terminator, name): result = '' while True: c = self.next self.__next() if c is None: if not result: - raise self.error("missing group name") + raise self.error("missing " + name) raise self.error("missing %s, unterminated name" % terminator, len(result)) if c == terminator: if not result: - raise self.error("missing group name", 1) + raise self.error("missing " + name, 1) break result += c return result @@ -322,6 +323,17 @@ def _class_escape(source, escape): c = int(escape[2:], 16) chr(c) # raise ValueError for invalid code return LITERAL, c + elif c == "N" and source.istext: + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except KeyError: + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) + return LITERAL, c elif c in OCTDIGITS: # octal escape (up to three digits) escape += source.getwhile(2, OCTDIGITS) @@ -370,6 +382,17 @@ def _escape(source, escape, state): c = int(escape[2:], 16) chr(c) # raise ValueError for invalid code return LITERAL, c + elif c == "N" and source.istext: + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except KeyError: + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) + return LITERAL, c elif c == "0": # octal escape escape += source.getwhile(2, OCTDIGITS) @@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False): # python extensions if sourcematch("<"): # named group: skip forward to end of name - name = source.getuntil(">") + name = source.getuntil(">", "group name") if not name.isidentifier(): msg = "bad character in group name %r" % name raise source.error(msg, len(name) + 1) elif sourcematch("="): # named backreference - name = source.getuntil(")") + name = source.getuntil(")", "group name") if not name.isidentifier(): msg = "bad character in group name %r" % name raise source.error(msg, len(name) + 1) @@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False): elif char == "(": # conditional backreference group - condname = source.getuntil(")") + condname = source.getuntil(")", "group name") if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: @@ -977,7 +1000,7 @@ def parse_template(source, pattern): name = "" if not s.match("<"): raise s.error("missing <") - name = s.getuntil(">") + name = s.getuntil(">", "group name") if name.isidentifier(): try: index = groupindex[name] |
