diff options
Diffstat (limited to 'Lib/HTMLParser.py')
-rw-r--r-- | Lib/HTMLParser.py | 21 |
1 files changed, 12 insertions, 9 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 7cee47a7c5..884d2a53c5 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?') + r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name @@ -99,7 +99,7 @@ class HTMLParser(markupbase.ParserBase): markupbase.ParserBase.reset(self) def feed(self, data): - """Feed data to the parser. + r"""Feed data to the parser. Call this as often as you want, with as little or as much text as you want (may include '\n'). @@ -367,13 +367,16 @@ class HTMLParser(markupbase.ParserBase): return s def replaceEntities(s): s = s.groups()[0] - if s[0] == "#": - s = s[1:] - if s[0] in ['x','X']: - c = int(s[1:], 16) - else: - c = int(s) - return unichr(c) + try: + if s[0] == "#": + s = s[1:] + if s[0] in ['x','X']: + c = int(s[1:], 16) + else: + c = int(s) + return unichr(c) + except ValueError: + return '&#'+s+';' else: # Cannot use name2codepoint directly, because HTMLParser supports apos, # which is not part of HTML 4 |