From 7cf613dc77302fb9a2a6533878aba7296276e12c Mon Sep 17 00:00:00 2001 From: Fred Drake Date: Tue, 4 Sep 2001 16:26:03 +0000 Subject: HTMLParser is allowed to be more strict than sgmllib, so let's not change their basic behavior: When parsing something that cannot possibly be valid in either HTML or XHTML, raise an exception. --- Lib/HTMLParser.py | 47 ++++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) (limited to 'Lib/HTMLParser.py') diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 584046d81e..df8383ecb1 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -269,17 +269,18 @@ class HTMLParser: return -1 # in practice, this should look like: ((name|stringlit) S*)+ '>' n = len(rawdata) - decltype = None - extrachars = "" + decltype, j = self.scan_name(j, i) + if j < 0: + return j + if decltype.lower() != "doctype": + raise HTMLParseError("unknown declaration: '%s'" % decltype, + self.getpos()) while j < n: c = rawdata[j] if c == ">": # end of declaration syntax data = rawdata[i+2:j] - if decltype == "doctype": - self.handle_decl(data) - else: - self.unknown_decl(data) + self.handle_decl(data) return j + 1 if c in "\"'": m = declstringlit.match(rawdata, j) @@ -287,30 +288,15 @@ class HTMLParser: return -1 # incomplete j = m.end() elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": - m = declname.match(rawdata, j) - if not m: - return -1 # incomplete - j = m.end() - if decltype is None: - decltype = m.group(0).rstrip().lower() - if decltype != "doctype": - extrachars = "=" + name, j = self.scan_name(j, i) elif c == "[" and decltype == "doctype": j = self.parse_doctype_subset(j + 1, i) - if j < 0: - return j - elif c in extrachars: - j = j + 1 - while j < n and rawdata[j] in string.whitespace: - j = j + 1 - if j == n: - # end of buffer while in declaration - return -1 else: raise HTMLParseError( "unexpected char in declaration: %s" % `rawdata[j]`, self.getpos()) - decltype = decltype or '' + if j < 0: + return j return -1 # incomplete # Internal -- scan past the internal subset in a