From fb38c76e0f15e15d08e4635a24719cc120809191 Mon Sep 17 00:00:00 2001
From: Fred Drake <fdrake@acm.org>
Date: Mon, 16 Jul 2001 18:30:35 +0000
Subject: In CDATA mode, make sure entity-reference syntax is not interpreted;
 entity references are not allowed in that mode.

Do a better job of scanning <!DOCTYPE ...> declarations; based on the
code in HTMLParser.py.
---
 Lib/sgmllib.py | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 5ff9f70c96..3422980834 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@@ -5,7 +5,8 @@
 # XXX There should be a way to distinguish between PCDATA (parsed
 # character data -- the normal case), RCDATA (replaceable character
 # data -- only char and entity references and end tags are special)
-# and CDATA (character data -- only end tags are special).
+# and CDATA (character data -- only end tags are special).  RCDATA is
+# not supported at all.
 
 
 import re
@@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
 special = re.compile('<![^<>]*>')
 commentopen = re.compile('<!--')
 commentclose = re.compile(r'--\s*>')
+declopen = re.compile('<!')
+declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
+declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -160,6 +164,10 @@ class SGMLParser:
                     i = k
                     continue
             elif rawdata[i] == '&':
+                if self.literal:
+                    self.handle_data(rawdata[i])
+                    i = i+1
+                    continue
                 match = charref.match(rawdata, i)
                 if match:
                     name = match.group(1)
@@ -210,11 +218,20 @@ class SGMLParser:
 
     # Internal -- parse declaration.
     def parse_declaration(self, i):
+        # This is some sort of declaration; in "HTML as
+        # deployed," this should only be the document type
+        # declaration ("<!DOCTYPE html...>").
         rawdata = self.rawdata
         j = i + 2
+        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+        if rawdata[j:j+1] in ("-", ""):
+            # Start of comment followed by buffer boundary,
+            # or just a buffer boundary.
+            return -1
+        # in practice, this should look like: ((name|stringlit) S*)+ '>'
         n = len(rawdata)
         while j < n:
-            c = rawdata[j:j+1]
+            c = rawdata[j]
             if c == ">":
                 # end of declaration syntax
                 self.handle_decl(rawdata[i+2:j])
@@ -222,15 +239,16 @@ class SGMLParser:
             if c in "\"'":
                 m = declstringlit.match(rawdata, j)
                 if not m:
-                    # incomplete or an error?
-                    return -1
+                    return -1 # incomplete
                 j = m.end()
-            else:
-                m = decldata.match(rawdata, j)
+            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
+                m = declname.match(rawdata, j)
                 if not m:
-                    # incomplete or an error?
-                    return -1
+                    return -1 # incomplete
                 j = m.end()
+            else:
+                raise SGMLParseError(
+                    "unexpected char in declaration: %s" % `rawdata[j]`)
         # end of buffer between tokens
         return -1
 
-- 
cgit v1.2.1