summary | refs | log | tree | commit | diff
path: root/Lib/HTMLParser.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/HTMLParser.py')
-rw-r--r-- Lib/HTMLParser.py 21
1 files changed, 12 insertions, 9 deletions
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
index 7cee47a7c5..884d2a53c5 100644
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -26,7 +26,7 @@ commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
@@ -99,7 +99,7 @@ class HTMLParser(markupbase.ParserBase):
markupbase.ParserBase.reset(self)
def feed(self, data):
- """Feed data to the parser.
+ r"""Feed data to the parser.
Call this as often as you want, with as little or as much text
as you want (may include '\n').
@@ -367,13 +367,16 @@ class HTMLParser(markupbase.ParserBase):
return s
def replaceEntities(s):
s = s.groups()[0]
- if s[0] == "#":
- s = s[1:]
- if s[0] in ['x','X']:
- c = int(s[1:], 16)
- else:
- c = int(s)
- return unichr(c)
+ try:
+ if s[0] == "#":
+ s = s[1:]
+ if s[0] in ['x','X']:
+ c = int(s[1:], 16)
+ else:
+ c = int(s)
+ return unichr(c)
+ except ValueError:
+ return '&#'+s+';'
else:
# Cannot use name2codepoint directly, because HTMLParser supports apos,
# which is not part of HTML 4