New version of xmllib from Sjoerd.

The main incompatibility is that the error reporting method is now called as parser.syntax_error(msg) instead of parser.syntax_error(lineno, msg). This new version also has some code to deal with the <?xml?> and <!DOCTYPE> tags at the start of an XML document. The documentation has been updated, and a small test module has been created.
author: Guido van Rossum <guido@python.org> 1998-01-29 14:55:24 +0000
committer: Guido van Rossum <guido@python.org> 1998-01-29 14:55:24 +0000
commit: 02505e48508deac4ae835ee833e0a05788c580d0 (patch)
tree: a09b54a85345b9169fff589db26d6e93e4a5be19 /Lib/xmllib.py
parent: 44f5c75f430c92384137c4bef0c0a69dce02ee0b (diff)
download: cpython-git-02505e48508deac4ae835ee833e0a05788c580d0.tar.gz
1 files changed, 162 insertions, 57 deletions
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index 7b2a76a7c6..9f6e23ecca 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -15,7 +15,7 @@ incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
 			   '<([a-zA-Z_:][^<>]*|'
 			      '/([a-zA-Z_:][^<>]*)?|'
 			      '![^<>]*|'
-			      '\?[^<>]*)?')
+			      r'\?[^<>]*)?')
 
 ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
 entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
@@ -28,11 +28,12 @@ endtagopen = re.compile('</')
 starttagend = re.compile(_opS + '(?P<slash>/?)>')
 endbracket = re.compile('>')
 tagfind = re.compile(_Name)
-cdataopen = re.compile('<!\[CDATA\[')
-cdataclose = re.compile('\]\]>')
+cdataopen = re.compile(r'<!\[CDATA\[')
+cdataclose = re.compile(r'\]\]>')
+doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S)
 special = re.compile('<!(?P<special>[^<>]*)>')
-procopen = re.compile('<\?(?P<proc>' + _Name + ')' + _S)
-procclose = re.compile('\?>')
+procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
+procclose = re.compile(_opS + r'\?>')
 commentopen = re.compile('<!--')
 commentclose = re.compile('-->')
 doubledash = re.compile('--')
@@ -63,10 +64,12 @@ class XMLParser:
     def reset(self):
 	self.rawdata = ''
 	self.stack = []
-	self.lasttag = '???'
 	self.nomoretags = 0
 	self.literal = 0
 	self.lineno = 1
+	self.__at_start = 1
+	self.__seen_doctype = None
+	self.__seen_starttag = 0
 
     # For derived classes only -- enter literal mode (CDATA) till EOF
     def setnomoretags(self):
@@ -98,8 +101,7 @@ class XMLParser:
 		newdata.append(data[i:])
 		return string.join(newdata, '')
 	    if data[res.end(0) - 1] != ';':
-		self.syntax_error(self.lineno,
-				  '; missing after entity/char reference')
+		self.syntax_error("`;' missing after entity/char reference")
 	    newdata.append(data[i:res.start(0)])
 	    str = res.group(1)
 	    if str[0] == '#':
@@ -123,6 +125,8 @@ class XMLParser:
 	i = 0
 	n = len(rawdata)
 	while i < n:
+	    if i > 0:
+		self.__at_start = 0
 	    if self.nomoretags:
 		data = rawdata[i:n]
 		self.handle_data(data)
@@ -135,6 +139,7 @@ class XMLParser:
 	    else:
 		    j = n
 	    if i < j:
+		self.__at_start = 0
 		data = rawdata[i:j]
 		self.handle_data(data)
 		self.lineno = self.lineno + string.count(data, '\n')
@@ -150,6 +155,7 @@ class XMLParser:
 			continue
 		    k = self.parse_starttag(i)
 		    if k < 0: break
+		    self.__seen_starttag = 1
 		    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 		    i = k
 		    continue
@@ -180,11 +186,29 @@ class XMLParser:
 		    continue
 		res = procopen.match(rawdata, i)
 		if res:
-		    k = self.parse_proc(i, res)
+		    k = self.parse_proc(i)
 		    if k < 0: break
 		    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
 		    i = k
 		    continue
+		res = doctype.match(rawdata, i)
+		if res:
+		    if self.literal:
+			data = rawdata[i]
+			self.handle_data(data)
+			self.lineno = self.lineno + string.count(data, '\n')
+			i = i+1
+			continue
+		    if self.__seen_doctype:
+			self.syntax_error('multiple DOCTYPE elements')
+		    if self.__seen_starttag:
+			self.syntax_error('DOCTYPE not at beginning of document')
+		    k = self.parse_doctype(res)
+		    if k < 0: break
+		    self.__seen_doctype = res.group('name')
+		    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
+		    i = k
+		    continue
 		res = special.match(rawdata, i)
 		if res:
 		    if self.literal:
@@ -202,7 +226,7 @@ class XMLParser:
 		if res is not None:
 		    i = res.end(0)
 		    if rawdata[i-1] != ';':
-			self.syntax_error(self.lineno, '; missing in charref')
+			self.syntax_error("`;' missing in charref")
 			i = i-1
 		    self.handle_charref(res.group('char')[:-1])
 		    self.lineno = self.lineno + string.count(res.group(0), '\n')
@@ -211,7 +235,7 @@ class XMLParser:
 		if res is not None:
 		    i = res.end(0)
 		    if rawdata[i-1] != ';':
-			self.syntax_error(self.lineno, '; missing in entityref')
+			self.syntax_error("`;' missing in entityref")
 			i = i-1
 		    self.handle_entityref(res.group('name'))
 		    self.lineno = self.lineno + string.count(res.group(0), '\n')
@@ -230,7 +254,7 @@ class XMLParser:
 	    j = res.end(0)
 	    if j == n:
 		break # Really incomplete
-	    self.syntax_error(self.lineno, 'bogus < or &')
+	    self.syntax_error("bogus `<' or `&'")
 	    data = res.group(0)
 	    self.handle_data(data)
 	    self.lineno = self.lineno + string.count(data, '\n')
@@ -242,7 +266,11 @@ class XMLParser:
 	    self.lineno = self.lineno + string.count(data, '\n')
 	    i = n
 	self.rawdata = rawdata[i:]
-	# XXX if end: check for empty stack
+	if end:
+	    if self.stack:
+		self.syntax_error('missing end tags')
+		while self.stack:
+		    self.finish_endtag(self.stack[-1])
 
     # Internal -- parse comment, return length or -1 if not terminated
     def parse_comment(self, i):
@@ -254,11 +282,30 @@ class XMLParser:
 	    return -1
 	# doubledash search will succeed because it's a subset of commentclose
 	if doubledash.search(rawdata, i+4).start(0) < res.start(0):
-	    self.syntax_error(self.lineno, "`--' inside comment")
+	    self.syntax_error("`--' inside comment")
 	self.handle_comment(rawdata[i+4: res.start(0)])
 	return res.end(0)
 
-    # Internal -- handle CDATA tag, return lenth or -1 if not terminated
+    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated
+    def parse_doctype(self, res):
+	rawdata = self.rawdata
+	n = len(rawdata)
+	name = res.group('name')
+	j = k = res.end(0)
+	level = 0
+	while k < n:
+	    c = rawdata[k]
+	    if c == '<':
+		level = level + 1
+	    elif c == '>':
+		if level == 0:
+		    self.handle_doctype(name, rawdata[j:k])
+		    return k+1
+		level = level - 1
+	    k = k+1
+	return -1
+
+    # Internal -- handle CDATA tag, return length or -1 if not terminated
     def parse_cdata(self, i):
 	rawdata = self.rawdata
 	if rawdata[i:i+9] <> '<![CDATA[':
@@ -269,60 +316,98 @@ class XMLParser:
 	self.handle_cdata(rawdata[i+9:res.start(0)])
 	return res.end(0)
 
-    def parse_proc(self, i, res):
+    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None}
+    # Internal -- handle a processing instruction tag
+    def parse_proc(self, i):
 	rawdata = self.rawdata
-	if not res:
-	    raise RuntimeError, 'unexpected call to parse_proc'
-	name = res.group('proc')
-	res = procclose.search(rawdata, res.end(0))
-	if not res:
-	    return -1
-	self.handle_proc(name, rawdata[res.pos:res.start(0)])
-	return res.end(0)
-
-    # Internal -- handle starttag, return length or -1 if not terminated
-    def parse_starttag(self, i):
-	rawdata = self.rawdata
-	# i points to start of tag
-	end = endbracket.search(rawdata, i+1)
+	end = procclose.search(rawdata, i)
 	if not end:
 	    return -1
 	j = end.start(0)
-	# Now parse the data between i+1 and j into a tag and attrs
-	attrdict = {}
-	res = tagfind.match(rawdata, i+1)
+	res = tagfind.match(rawdata, i+2)
 	if not res:
-	    raise RuntimeError, 'unexpected call to parse_starttag'
+	    raise RuntimeError, 'unexpected call to parse_proc'
 	k = res.end(0)
-	tag = res.group(0)
-	if hasattr(self, tag + '_attributes'):
-	    attrlist = getattr(self, tag + '_attributes')
-	else:
-	    attrlist = None
-	self.lasttag = tag
+	name = res.group(0)
+	if name == 'xml':
+	    if self.__at_start:
+		attrdict, k = self.parse_attributes('xml', k, j,
+						    self.__xml_attributes)
+		if k != j:
+		    self.syntax_error('garbage at end of <?xml?>')
+		if attrdict['version'] != '1.0':
+		    self.syntax_error('only XML version 1.0 supported')
+		self.handle_xml(attrdict.get('encoding', None),
+				attrdict['standalone'])
+		return end.end(0)
+	    else:
+		self.syntax_error("<?xml?> tag not at start of document")
+	self.handle_proc(name, rawdata[k:j])
+	return end.end(0)
+
+    # Internal -- parse attributes between i and j
+    def parse_attributes(self, tag, k, j, attributes = None):
+	rawdata = self.rawdata
+	# Now parse the data between k and j into a tag and attrs
+	attrdict = {}
+	try:
+	    # convert attributes list to dictionary
+	    d = {}
+	    for a in attributes:
+		d[a] = None
+	    attributes = d
+	except TypeError:
+	    pass
 	while k < j:
 	    res = attrfind.match(rawdata, k)
 	    if not res: break
 	    attrname, attrvalue = res.group('name', 'value')
 	    if attrvalue is None:
-		self.syntax_error(self.lineno, 'no attribute value specified')
+		self.syntax_error('no attribute value specified')
 		attrvalue = attrname
 	    elif attrvalue[:1] == "'" == attrvalue[-1:] or \
 		 attrvalue[:1] == '"' == attrvalue[-1:]:
 		attrvalue = attrvalue[1:-1]
 	    else:
-		self.syntax_error(self.lineno, 'attribute value not quoted')
-	    if attrlist is not None and attrname not in attrlist:
-		self.syntax_error(self.lineno,
-				  'unknown attribute %s of element %s' %
+		self.syntax_error('attribute value not quoted')
+	    if attributes is not None and not attributes.has_key(attrname):
+		self.syntax_error('unknown attribute %s of element %s' %
 				  (attrname, tag))
 	    if attrdict.has_key(attrname):
-		self.syntax_error(self.lineno, 'attribute specified twice')
+		self.syntax_error('attribute specified twice')
 	    attrdict[attrname] = self.translate_references(attrvalue)
 	    k = res.end(0)
+	if attributes is not None:
+	    # fill in with default attributes
+	    for key, val in attributes.items():
+		if val is not None and not attrdict.has_key(key):
+		    attrdict[key] = val
+	return attrdict, k
+
+    # Internal -- handle starttag, return length or -1 if not terminated
+    def parse_starttag(self, i):
+	rawdata = self.rawdata
+	# i points to start of tag
+	end = endbracket.search(rawdata, i+1)
+	if not end:
+	    return -1
+	j = end.start(0)
+	res = tagfind.match(rawdata, i+1)
+	if not res:
+	    raise RuntimeError, 'unexpected call to parse_starttag'
+	k = res.end(0)
+	tag = res.group(0)
+	if not self.__seen_starttag and self.__seen_doctype:
+	    if tag != self.__seen_doctype:
+		self.syntax_error('starttag does not match DOCTYPE')
+	if hasattr(self, tag + '_attributes'):
+	    attributes = getattr(self, tag + '_attributes')
+	else:
+	    attributes = None
+	attrdict, k = self.parse_attributes(tag, k, j, attributes)
 	res = starttagend.match(rawdata, k)
 	if not res:
-	    self.syntax_error(self.lineno, 'garbage in start tag')
+	    self.syntax_error('garbage in start tag')
 	self.finish_starttag(tag, attrdict)
 	if res and res.group('slash') == '/':
 	    self.finish_endtag(tag)
@@ -336,7 +421,7 @@ class XMLParser:
 	    return -1
 	res = tagfind.match(rawdata, i+2)
 	if not res:
-	    self.syntax_error(self.lineno, 'no name specified in end tag')
+	    self.syntax_error('no name specified in end tag')
 	    tag = ''
 	    k = i+2
 	else:
@@ -346,7 +431,7 @@ class XMLParser:
 	    # check that there is only white space at end of tag
 	    res = space.match(rawdata, k)
 	    if res is None or res.end(0) != end.start(0):
-		self.syntax_error(self.lineno, 'garbage in end tag')
+		self.syntax_error('garbage in end tag')
 	self.finish_endtag(tag)
 	return end.end(0)
 
@@ -366,12 +451,14 @@ class XMLParser:
     # Internal -- finish processing of end tag
     def finish_endtag(self, tag):
 	if not tag:
+	    self.syntax_error('name-less end tag')
 	    found = len(self.stack) - 1
 	    if found < 0:
 		self.unknown_endtag(tag)
 		return
 	else:
 	    if tag not in self.stack:
+		self.syntax_error('unopened end tag')
 		try:
 		    method = getattr(self, 'end_' + tag)
 		except AttributeError:
@@ -379,8 +466,11 @@ class XMLParser:
 		return
 	    found = len(self.stack)
 	    for i in range(found):
-		if self.stack[i] == tag: found = i
+		if self.stack[i] == tag:
+		    found = i
 	while len(self.stack) > found:
+	    if found < len(self.stack) - 1:
+		self.syntax_error('missing close tag for %s' % self.stack[-1])
 	    tag = self.stack[-1]
 	    try:
 		method = getattr(self, 'end_' + tag)
@@ -392,6 +482,14 @@ class XMLParser:
 		self.unknown_endtag(tag)
 	    del self.stack[-1]
 
+    # Overridable -- handle xml processing instruction
+    def handle_xml(self, encoding, standalone):
+	pass
+
+    # Overridable -- handle DOCTYPE
+    def handle_doctype(self, tag, data):
+	pass
+
     # Overridable -- handle start tag
     def handle_starttag(self, tag, method, attrs):
 	method(attrs)
@@ -416,8 +514,7 @@ class XMLParser:
 	self.handle_data(chr(n))
 
     # Definition of entities -- derived classes may override
-    entitydefs = \
-	    {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
+    entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}
 
     # Example -- handle entity reference, no need to override
     def handle_entityref(self, name):
@@ -449,8 +546,8 @@ class XMLParser:
 	pass
 
     # Example -- handle relatively harmless syntax errors, could be overridden
-    def syntax_error(self, lineno, message):
-	raise RuntimeError, 'Syntax error at line %d: %s' % (lineno, message)
+    def syntax_error(self, message):
+	raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
 
     # To be overridden -- handlers for unknown objects
     def unknown_starttag(self, tag, attrs): pass
@@ -465,6 +562,14 @@ class TestXMLParser(XMLParser):
 	self.testdata = ""
 	XMLParser.__init__(self, verbose)
 
+    def handle_xml(self, encoding, standalone):
+	self.flush()
+	print 'xml: encoding =',encoding,'standalone =',standalone
+
+    def handle_doctype(self, tag, data):
+	self.flush()
+	print 'DOCTYPE:',tag, `data`
+
     def handle_data(self, data):
 	self.testdata = self.testdata + data
 	if len(`self.testdata`) >= 70:
@@ -495,8 +600,8 @@ class TestXMLParser(XMLParser):
 	    r = r[:32] + '...' + r[-32:]
 	print 'comment:', r
 
-    def syntax_error(self, lineno, message):
-	print 'error at line %d:' % lineno, message
+    def syntax_error(self, message):
+	print 'error at line %d:' % self.lineno, message
 
     def unknown_starttag(self, tag, attrs):
 	self.flush()
@@ -504,7 +609,7 @@ class TestXMLParser(XMLParser):
 	    print 'start tag: <' + tag + '>'
 	else:
 	    print 'start tag: <' + tag,
-	    for name, value in attrs:
+	    for name, value in attrs.items():
 		print name + '=' + '"' + value + '"',
 	    print '>'
author	Guido van Rossum <guido@python.org>	1998-01-29 14:55:24 +0000
committer	Guido van Rossum <guido@python.org>	1998-01-29 14:55:24 +0000
commit	02505e48508deac4ae835ee833e0a05788c580d0 (patch)
tree	a09b54a85345b9169fff589db26d6e93e4a5be19 /Lib/xmllib.py
parent	44f5c75f430c92384137c4bef0c0a69dce02ee0b (diff)
download	cpython-git-02505e48508deac4ae835ee833e0a05788c580d0.tar.gz