diff options
| author | Guido van Rossum <guido@python.org> | 1998-01-29 14:55:24 +0000 | 
|---|---|---|
| committer | Guido van Rossum <guido@python.org> | 1998-01-29 14:55:24 +0000 | 
| commit | 02505e48508deac4ae835ee833e0a05788c580d0 (patch) | |
| tree | a09b54a85345b9169fff589db26d6e93e4a5be19 /Lib/xmllib.py | |
| parent | 44f5c75f430c92384137c4bef0c0a69dce02ee0b (diff) | |
| download | cpython-git-02505e48508deac4ae835ee833e0a05788c580d0.tar.gz | |
New version of xmllib from Sjoerd.
The main incompatibility is that the error reporting method is now
called as
 parser.syntax_error(msg)
instead of
 parser.syntax_error(lineno, msg)
This new version also has some code to deal with the <?xml?> and
<!DOCTYPE> tags at the start of an XML document.
The documentation has been updated, and a small test module has been
created.
Diffstat (limited to 'Lib/xmllib.py')
| -rw-r--r-- | Lib/xmllib.py | 219 | 
1 files changed, 162 insertions, 57 deletions
diff --git a/Lib/xmllib.py b/Lib/xmllib.py index 7b2a76a7c6..9f6e23ecca 100644 --- a/Lib/xmllib.py +++ b/Lib/xmllib.py @@ -15,7 +15,7 @@ incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'  			   '<([a-zA-Z_:][^<>]*|'  			      '/([a-zA-Z_:][^<>]*)?|'  			      '![^<>]*|' -			      '\?[^<>]*)?') +			      r'\?[^<>]*)?')  ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')  entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]') @@ -28,11 +28,12 @@ endtagopen = re.compile('</')  starttagend = re.compile(_opS + '(?P<slash>/?)>')  endbracket = re.compile('>')  tagfind = re.compile(_Name) -cdataopen = re.compile('<!\[CDATA\[') -cdataclose = re.compile('\]\]>') +cdataopen = re.compile(r'<!\[CDATA\[') +cdataclose = re.compile(r'\]\]>') +doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S)  special = re.compile('<!(?P<special>[^<>]*)>') -procopen = re.compile('<\?(?P<proc>' + _Name + ')' + _S) -procclose = re.compile('\?>') +procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S) +procclose = re.compile(_opS + r'\?>')  commentopen = re.compile('<!--')  commentclose = re.compile('-->')  doubledash = re.compile('--') @@ -63,10 +64,12 @@ class XMLParser:      def reset(self):  	self.rawdata = ''  	self.stack = [] -	self.lasttag = '???'  	self.nomoretags = 0  	self.literal = 0  	self.lineno = 1 +	self.__at_start = 1 +	self.__seen_doctype = None +	self.__seen_starttag = 0      # For derived classes only -- enter literal mode (CDATA) till EOF      def setnomoretags(self): @@ -98,8 +101,7 @@ class XMLParser:  		newdata.append(data[i:])  		return string.join(newdata, '')  	    if data[res.end(0) - 1] != ';': -		self.syntax_error(self.lineno, -				  '; missing after entity/char reference') +		self.syntax_error("`;' missing after entity/char reference")  	    newdata.append(data[i:res.start(0)])  	    str = res.group(1)  	    if str[0] == '#': @@ -123,6 +125,8 @@ class XMLParser:  	i = 0  	n = len(rawdata)  	while i < n: +	    if i > 0: +		self.__at_start = 0  	    if self.nomoretags:  		data = rawdata[i:n]  		self.handle_data(data) @@ -135,6 +139,7 @@ class XMLParser:  	    else:  		    j = n  	    if i < j: +		self.__at_start = 0  		data = rawdata[i:j]  		self.handle_data(data)  		self.lineno = self.lineno + string.count(data, '\n') @@ -150,6 +155,7 @@ class XMLParser:  			continue  		    k = self.parse_starttag(i)  		    if k < 0: break +		    self.__seen_starttag = 1  		    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')  		    i = k  		    continue @@ -180,11 +186,29 @@ class XMLParser:  		    continue  		res = procopen.match(rawdata, i)  		if res: -		    k = self.parse_proc(i, res) +		    k = self.parse_proc(i)  		    if k < 0: break  		    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')  		    i = k  		    continue +		res = doctype.match(rawdata, i) +		if res: +		    if self.literal: +			data = rawdata[i] +			self.handle_data(data) +			self.lineno = self.lineno + string.count(data, '\n') +			i = i+1 +			continue +		    if self.__seen_doctype: +			self.syntax_error('multiple DOCTYPE elements') +		    if self.__seen_starttag: +			self.syntax_error('DOCTYPE not at beginning of document') +		    k = self.parse_doctype(res) +		    if k < 0: break +		    self.__seen_doctype = res.group('name') +		    self.lineno = self.lineno + string.count(rawdata[i:k], '\n') +		    i = k +		    continue  		res = special.match(rawdata, i)  		if res:  		    if self.literal: @@ -202,7 +226,7 @@ class XMLParser:  		if res is not None:  		    i = res.end(0)  		    if rawdata[i-1] != ';': -			self.syntax_error(self.lineno, '; missing in charref') +			self.syntax_error("`;' missing in charref")  			i = i-1  		    self.handle_charref(res.group('char')[:-1])  		    self.lineno = self.lineno + string.count(res.group(0), '\n') @@ -211,7 +235,7 @@ class XMLParser:  		if res is not None:  		    i = res.end(0)  		    if rawdata[i-1] != ';': -			self.syntax_error(self.lineno, '; missing in entityref') +			self.syntax_error("`;' missing in entityref")  			i = i-1  		    self.handle_entityref(res.group('name'))  		    self.lineno = self.lineno + string.count(res.group(0), '\n') @@ -230,7 +254,7 @@ class XMLParser:  	    j = res.end(0)  	    if j == n:  		break # Really incomplete -	    self.syntax_error(self.lineno, 'bogus < or &') +	    self.syntax_error("bogus `<' or `&'")  	    data = res.group(0)  	    self.handle_data(data)  	    self.lineno = self.lineno + string.count(data, '\n') @@ -242,7 +266,11 @@ class XMLParser:  	    self.lineno = self.lineno + string.count(data, '\n')  	    i = n  	self.rawdata = rawdata[i:] -	# XXX if end: check for empty stack +	if end: +	    if self.stack: +		self.syntax_error('missing end tags') +		while self.stack: +		    self.finish_endtag(self.stack[-1])      # Internal -- parse comment, return length or -1 if not terminated      def parse_comment(self, i): @@ -254,11 +282,30 @@ class XMLParser:  	    return -1  	# doubledash search will succeed because it's a subset of commentclose  	if doubledash.search(rawdata, i+4).start(0) < res.start(0): -	    self.syntax_error(self.lineno, "`--' inside comment") +	    self.syntax_error("`--' inside comment")  	self.handle_comment(rawdata[i+4: res.start(0)])  	return res.end(0) -    # Internal -- handle CDATA tag, return lenth or -1 if not terminated +    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated +    def parse_doctype(self, res): +	rawdata = self.rawdata +	n = len(rawdata) +	name = res.group('name') +	j = k = res.end(0) +	level = 0 +	while k < n: +	    c = rawdata[k] +	    if c == '<': +		level = level + 1 +	    elif c == '>': +		if level == 0: +		    self.handle_doctype(name, rawdata[j:k]) +		    return k+1 +		level = level - 1 +	    k = k+1 +	return -1 + +    # Internal -- handle CDATA tag, return length or -1 if not terminated      def parse_cdata(self, i):  	rawdata = self.rawdata  	if rawdata[i:i+9] <> '<![CDATA[': @@ -269,60 +316,98 @@ class XMLParser:  	self.handle_cdata(rawdata[i+9:res.start(0)])  	return res.end(0) -    def parse_proc(self, i, res): +    __xml_attributes = {'version': '1.0', 'standalone': 'no', 'encoding': None} +    # Internal -- handle a processing instruction tag +    def parse_proc(self, i):  	rawdata = self.rawdata -	if not res: -	    raise RuntimeError, 'unexpected call to parse_proc' -	name = res.group('proc') -	res = procclose.search(rawdata, res.end(0)) -	if not res: -	    return -1 -	self.handle_proc(name, rawdata[res.pos:res.start(0)]) -	return res.end(0) - -    # Internal -- handle starttag, return length or -1 if not terminated -    def parse_starttag(self, i): -	rawdata = self.rawdata -	# i points to start of tag -	end = endbracket.search(rawdata, i+1) +	end = procclose.search(rawdata, i)  	if not end:  	    return -1  	j = end.start(0) -	# Now parse the data between i+1 and j into a tag and attrs -	attrdict = {} -	res = tagfind.match(rawdata, i+1) +	res = tagfind.match(rawdata, i+2)  	if not res: -	    raise RuntimeError, 'unexpected call to parse_starttag' +	    raise RuntimeError, 'unexpected call to parse_proc'  	k = res.end(0) -	tag = res.group(0) -	if hasattr(self, tag + '_attributes'): -	    attrlist = getattr(self, tag + '_attributes') -	else: -	    attrlist = None -	self.lasttag = tag +	name = res.group(0) +	if name == 'xml': +	    if self.__at_start: +		attrdict, k = self.parse_attributes('xml', k, j, +						    self.__xml_attributes) +		if k != j: +		    self.syntax_error('garbage at end of <?xml?>') +		if attrdict['version'] != '1.0': +		    self.syntax_error('only XML version 1.0 supported') +		self.handle_xml(attrdict.get('encoding', None), +				attrdict['standalone']) +		return end.end(0) +	    else: +		self.syntax_error("<?xml?> tag not at start of document") +	self.handle_proc(name, rawdata[k:j]) +	return end.end(0) + +    # Internal -- parse attributes between i and j +    def parse_attributes(self, tag, k, j, attributes = None): +	rawdata = self.rawdata +	# Now parse the data between k and j into a tag and attrs +	attrdict = {} +	try: +	    # convert attributes list to dictionary +	    d = {} +	    for a in attributes: +		d[a] = None +	    attributes = d +	except TypeError: +	    pass  	while k < j:  	    res = attrfind.match(rawdata, k)  	    if not res: break  	    attrname, attrvalue = res.group('name', 'value')  	    if attrvalue is None: -		self.syntax_error(self.lineno, 'no attribute value specified') +		self.syntax_error('no attribute value specified')  		attrvalue = attrname  	    elif attrvalue[:1] == "'" == attrvalue[-1:] or \  		 attrvalue[:1] == '"' == attrvalue[-1:]:  		attrvalue = attrvalue[1:-1]  	    else: -		self.syntax_error(self.lineno, 'attribute value not quoted') -	    if attrlist is not None and attrname not in attrlist: -		self.syntax_error(self.lineno, -				  'unknown attribute %s of element %s' % +		self.syntax_error('attribute value not quoted') +	    if attributes is not None and not attributes.has_key(attrname): +		self.syntax_error('unknown attribute %s of element %s' %  				  (attrname, tag))  	    if attrdict.has_key(attrname): -		self.syntax_error(self.lineno, 'attribute specified twice') +		self.syntax_error('attribute specified twice')  	    attrdict[attrname] = self.translate_references(attrvalue)  	    k = res.end(0) +	if attributes is not None: +	    # fill in with default attributes +	    for key, val in attributes.items(): +		if val is not None and not attrdict.has_key(key): +		    attrdict[key] = val +	return attrdict, k + +    # Internal -- handle starttag, return length or -1 if not terminated +    def parse_starttag(self, i): +	rawdata = self.rawdata +	# i points to start of tag +	end = endbracket.search(rawdata, i+1) +	if not end: +	    return -1 +	j = end.start(0) +	res = tagfind.match(rawdata, i+1) +	if not res: +	    raise RuntimeError, 'unexpected call to parse_starttag' +	k = res.end(0) +	tag = res.group(0) +	if not self.__seen_starttag and self.__seen_doctype: +	    if tag != self.__seen_doctype: +		self.syntax_error('starttag does not match DOCTYPE') +	if hasattr(self, tag + '_attributes'): +	    attributes = getattr(self, tag + '_attributes') +	else: +	    attributes = None +	attrdict, k = self.parse_attributes(tag, k, j, attributes)  	res = starttagend.match(rawdata, k)  	if not res: -	    self.syntax_error(self.lineno, 'garbage in start tag') +	    self.syntax_error('garbage in start tag')  	self.finish_starttag(tag, attrdict)  	if res and res.group('slash') == '/':  	    self.finish_endtag(tag) @@ -336,7 +421,7 @@ class XMLParser:  	    return -1  	res = tagfind.match(rawdata, i+2)  	if not res: -	    self.syntax_error(self.lineno, 'no name specified in end tag') +	    self.syntax_error('no name specified in end tag')  	    tag = ''  	    k = i+2  	else: @@ -346,7 +431,7 @@ class XMLParser:  	    # check that there is only white space at end of tag  	    res = space.match(rawdata, k)  	    if res is None or res.end(0) != end.start(0): -		self.syntax_error(self.lineno, 'garbage in end tag') +		self.syntax_error('garbage in end tag')  	self.finish_endtag(tag)  	return end.end(0) @@ -366,12 +451,14 @@ class XMLParser:      # Internal -- finish processing of end tag      def finish_endtag(self, tag):  	if not tag: +	    self.syntax_error('name-less end tag')  	    found = len(self.stack) - 1  	    if found < 0:  		self.unknown_endtag(tag)  		return  	else:  	    if tag not in self.stack: +		self.syntax_error('unopened end tag')  		try:  		    method = getattr(self, 'end_' + tag)  		except AttributeError: @@ -379,8 +466,11 @@ class XMLParser:  		return  	    found = len(self.stack)  	    for i in range(found): -		if self.stack[i] == tag: found = i +		if self.stack[i] == tag: +		    found = i  	while len(self.stack) > found: +	    if found < len(self.stack) - 1: +		self.syntax_error('missing close tag for %s' % self.stack[-1])  	    tag = self.stack[-1]  	    try:  		method = getattr(self, 'end_' + tag) @@ -392,6 +482,14 @@ class XMLParser:  		self.unknown_endtag(tag)  	    del self.stack[-1] +    # Overridable -- handle xml processing instruction +    def handle_xml(self, encoding, standalone): +	pass + +    # Overridable -- handle DOCTYPE +    def handle_doctype(self, tag, data): +	pass +      # Overridable -- handle start tag      def handle_starttag(self, tag, method, attrs):  	method(attrs) @@ -416,8 +514,7 @@ class XMLParser:  	self.handle_data(chr(n))      # Definition of entities -- derived classes may override -    entitydefs = \ -	    {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} +    entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}      # Example -- handle entity reference, no need to override      def handle_entityref(self, name): @@ -449,8 +546,8 @@ class XMLParser:  	pass      # Example -- handle relatively harmless syntax errors, could be overridden -    def syntax_error(self, lineno, message): -	raise RuntimeError, 'Syntax error at line %d: %s' % (lineno, message) +    def syntax_error(self, message): +	raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)      # To be overridden -- handlers for unknown objects      def unknown_starttag(self, tag, attrs): pass @@ -465,6 +562,14 @@ class TestXMLParser(XMLParser):  	self.testdata = ""  	XMLParser.__init__(self, verbose) +    def handle_xml(self, encoding, standalone): +	self.flush() +	print 'xml: encoding =',encoding,'standalone =',standalone + +    def handle_doctype(self, tag, data): +	self.flush() +	print 'DOCTYPE:',tag, `data` +      def handle_data(self, data):  	self.testdata = self.testdata + data  	if len(`self.testdata`) >= 70: @@ -495,8 +600,8 @@ class TestXMLParser(XMLParser):  	    r = r[:32] + '...' + r[-32:]  	print 'comment:', r -    def syntax_error(self, lineno, message): -	print 'error at line %d:' % lineno, message +    def syntax_error(self, message): +	print 'error at line %d:' % self.lineno, message      def unknown_starttag(self, tag, attrs):  	self.flush() @@ -504,7 +609,7 @@ class TestXMLParser(XMLParser):  	    print 'start tag: <' + tag + '>'  	else:  	    print 'start tag: <' + tag, -	    for name, value in attrs: +	    for name, value in attrs.items():  		print name + '=' + '"' + value + '"',  	    print '>'  | 
