summaryrefslogtreecommitdiff
path: root/lisp/xml.el
diff options
context:
space:
mode:
authorChong Yidong <cyd@gnu.org>2012-07-03 00:21:54 +0800
committerChong Yidong <cyd@gnu.org>2012-07-03 00:21:54 +0800
commita7aef6f5c6e22b167ea0234ab84c0308201d681b (patch)
treefd09ca2bd61cdc2e8b8e222b73578c669fa2e354 /lisp/xml.el
parent2b5208f18115bd0f364c11cbdc8124878158927a (diff)
downloademacs-a7aef6f5c6e22b167ea0234ab84c0308201d681b.tar.gz
* lisp/xml.el: Handle entity and character reference expansion correctly.
(xml-default-ns): New variable. (xml-entity-alist): Use XML spec definitions for lt and amp. (xml-parse-region): Make first two arguments optional. Discard text properties. (xml-parse-tag-1): New function, spun off from xml-parse-tag. All callers changed. (xml-parse-tag): Call xml-parse-tag-1. For backward compatibility, this function should not modify buffer contents. (xml-parse-tag-1): Fix opening-tag regexp. (xml-parse-string): Rewrite, handling entity and character references properly. (xml--entity-replacement-text): Signal an error if a parameter entity is undefined. * test/automated/xml-parse-tests.el (xml-parse-tests--data): More testcases.
Diffstat (limited to 'lisp/xml.el')
-rw-r--r--lisp/xml.el372
1 files changed, 184 insertions, 188 deletions
diff --git a/lisp/xml.el b/lisp/xml.el
index 5c1d2390a23..a3e279b41bd 100644
--- a/lisp/xml.el
+++ b/lisp/xml.el
@@ -80,22 +80,23 @@
;; a worthwhile tradeoff especially since we're usually parsing files
;; instead of hand-crafted XML.
-;;*******************************************************************
-;;**
-;;** Macros to parse the list
-;;**
-;;*******************************************************************
+;;; Macros to parse the list
(defconst xml-undefined-entity "?"
"What to substitute for undefined entities")
+(defconst xml-default-ns '(("" . "")
+ ("xml" . "http://www.w3.org/XML/1998/namespace")
+ ("xmlns" . "http://www.w3.org/2000/xmlns/"))
+ "Alist mapping default XML namespaces to their URIs.")
+
(defvar xml-entity-alist
- '(("lt" . "<")
+ '(("lt" . "&#60;")
("gt" . ">")
("apos" . "'")
("quot" . "\"")
- ("amp" . "&"))
- "Alist of defined XML entities.")
+ ("amp" . "&#38;"))
+ "Alist mapping XML entities to their replacement text.")
(defvar xml-parameter-entity-alist nil
"Alist of defined XML parametric entities.")
@@ -156,11 +157,7 @@ An empty string is returned if the attribute was not found.
See also `xml-get-attribute-or-nil'."
(or (xml-get-attribute-or-nil node attribute) ""))
-;;*******************************************************************
-;;**
-;;** Creating the list
-;;**
-;;*******************************************************************
+;;; Creating the list
;;;###autoload
(defun xml-parse-file (file &optional parse-dtd parse-ns)
@@ -299,8 +296,10 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
;;;###autoload
-(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
+(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns)
"Parse the region from BEG to END in BUFFER.
+If BEG is nil, it defaults to `point-min'.
+If END is nil, it defaults to `point-max'.
If BUFFER is nil, it defaults to the current buffer.
Returns the XML list for the region, or raises an error if the region
is not well-formed XML.
@@ -312,7 +311,7 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
(unless buffer
(setq buffer (current-buffer)))
(with-temp-buffer
- (insert-buffer-substring buffer beg end)
+ (insert-buffer-substring-no-properties buffer beg end)
(xml--parse-buffer parse-dtd parse-ns)))
(defun xml--parse-buffer (parse-dtd parse-ns)
@@ -327,7 +326,7 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
(if (search-forward "<" nil t)
(progn
(forward-char -1)
- (setq result (xml-parse-tag parse-dtd parse-ns))
+ (setq result (xml-parse-tag-1 parse-dtd parse-ns))
(cond
((null result)
;; Not looking at an xml start tag.
@@ -379,8 +378,7 @@ specify that the name shouldn't be given a namespace."
(xml-parameter-entity-alist xml-parameter-entity-alist)
children)
(while (not (eobp))
- (let ((bit (xml-parse-tag
- parse-dtd parse-ns)))
+ (let ((bit (xml-parse-tag-1 parse-dtd parse-ns)))
(if children
(setq children (append (list bit) children))
(if (stringp bit)
@@ -392,30 +390,32 @@ specify that the name shouldn't be given a namespace."
"Parse the tag at point.
If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
returned as the first element in the list.
-If PARSE-NS is non-nil, then QNAMES are expanded.
-Returns one of:
+If PARSE-NS is non-nil, expand QNAMES; if the value of PARSE-NS
+is a list, use it as an alist mapping namespaces to URIs.
+
+Return one of:
- a list : the matching node
- nil : the point is not looking at a tag.
- a pair : the first element is the DTD, the second is the node."
+ (let ((buf (current-buffer))
+ (pos (point)))
+ (with-temp-buffer
+ (insert-buffer-substring-no-properties buf pos)
+ (goto-char (point-min))
+ (xml-parse-tag-1 parse-dtd parse-ns))))
+
+(defun xml-parse-tag-1 (&optional parse-dtd parse-ns)
+ "Like `xml-parse-tag', but possibly modify the buffer while working."
(let ((xml-validating-parser (or parse-dtd xml-validating-parser))
- (xml-ns (if (consp parse-ns)
- parse-ns
- (if parse-ns
- (list
- ;; Default for empty prefix is no namespace
- (cons "" "")
- ;; "xml" namespace
- (cons "xml" "http://www.w3.org/XML/1998/namespace")
- ;; We need to seed the xmlns namespace
- (cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
+ (xml-ns (cond ((consp parse-ns) parse-ns)
+ (parse-ns xml-default-ns))))
(cond
- ;; Processing instructions (like the <?xml version="1.0"?> tag at the
- ;; beginning of a document).
+ ;; Processing instructions, like <?xml version="1.0"?>.
((looking-at "<\\?")
(search-forward "?>")
(skip-syntax-forward " ")
- (xml-parse-tag parse-dtd xml-ns))
- ;; Character data (CDATA) sections, in which no tag should be interpreted
+ (xml-parse-tag-1 parse-dtd xml-ns))
+ ;; Character data (CDATA) sections, in which no tag should be interpreted
((looking-at "<!\\[CDATA\\[")
(let ((pos (match-end 0)))
(unless (search-forward "]]>" nil t)
@@ -423,33 +423,32 @@ Returns one of:
(concat
(buffer-substring-no-properties pos (match-beginning 0))
(xml-parse-string))))
- ;; DTD for the document
+ ;; DTD for the document
((looking-at "<!DOCTYPE[ \t\n\r]")
(let ((dtd (xml-parse-dtd parse-ns)))
(skip-syntax-forward " ")
(if xml-validating-parser
- (cons dtd (xml-parse-tag nil xml-ns))
- (xml-parse-tag nil xml-ns))))
- ;; skip comments
+ (cons dtd (xml-parse-tag-1 nil xml-ns))
+ (xml-parse-tag-1 nil xml-ns))))
+ ;; skip comments
((looking-at "<!--")
(search-forward "-->")
+ ;; FIXME: This loses the skipped-over spaces.
(skip-syntax-forward " ")
(unless (eobp)
(let ((xml-sub-parser t))
- (xml-parse-tag parse-dtd xml-ns))))
- ;; end tag
+ (xml-parse-tag-1 parse-dtd xml-ns))))
+ ;; end tag
((looking-at "</")
'())
- ;; opening tag
- ((looking-at "<\\([^/>[:space:]]+\\)")
+ ;; opening tag
+ ((looking-at (eval-when-compile (concat "<\\(" xml-name-re "\\)")))
(goto-char (match-end 1))
-
;; Parse this node
(let* ((node-name (match-string-no-properties 1))
;; Parse the attribute list.
(attrs (xml-parse-attlist xml-ns))
children)
-
;; add the xmlns:* attrs to our cache
(when (consp xml-ns)
(dolist (attr attrs)
@@ -458,70 +457,114 @@ Returns one of:
(caar attr)))
(push (cons (cdar attr) (cdr attr))
xml-ns))))
-
(setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
+ (cond
+ ;; is this an empty element ?
+ ((looking-at "/>")
+ (forward-char 2)
+ (nreverse children))
+ ;; is this a valid start tag ?
+ ((eq (char-after) ?>)
+ (forward-char 1)
+ ;; Now check that we have the right end-tag.
+ (let ((end (concat "</" node-name "\\s-*>")))
+ (while (not (looking-at end))
+ (cond
+ ((eobp)
+ (error "XML: (Not Well-Formed) End of buffer while reading element `%s'"
+ node-name))
+ ((looking-at "</")
+ (forward-char 2)
+ (error "XML: (Not Well-Formed) Invalid end tag `%s' (expecting `%s')"
+ (let ((pos (point)))
+ (buffer-substring pos (if (re-search-forward "\\s-*>" nil t)
+ (match-beginning 0)
+ (point-max))))
+ node-name))
+ ;; Read a sub-element and push it onto CHILDREN.
+ ((= (char-after) ?<)
+ (let ((tag (xml-parse-tag-1 nil xml-ns)))
+ (when tag
+ (push tag children))))
+ ;; Read some character data.
+ (t
+ (let ((expansion (xml-parse-string)))
+ (push (if (stringp (car children))
+ ;; If two strings were separated by a
+ ;; comment, concat them.
+ (concat (pop children) expansion)
+ expansion)
+ children)))))
+ ;; Move point past the end-tag.
+ (goto-char (match-end 0))
+ (nreverse children)))
+ ;; Otherwise this was an invalid start tag (expected ">" not found.)
+ (t
+ (error "XML: (Well-Formed) Couldn't parse tag: %s"
+ (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
- ;; is this an empty element ?
- (if (looking-at "/>")
- (progn
- (forward-char 2)
- (nreverse children))
-
- ;; is this a valid start tag ?
- (if (eq (char-after) ?>)
- (progn
- (forward-char 1)
- ;; Now check that we have the right end-tag. Note that this
- ;; one might contain spaces after the tag name
- (let ((end (concat "</" node-name "\\s-*>")))
- (while (not (looking-at end))
- (cond
- ((looking-at "</")
- (error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
- node-name (point)))
- ((= (char-after) ?<)
- (let ((tag (xml-parse-tag nil xml-ns)))
- (when tag
- (push tag children))))
- (t
- (let ((expansion (xml-parse-string)))
- (setq children
- (if (stringp expansion)
- (if (stringp (car children))
- ;; The two strings were separated by a comment.
- (setq children (append (list (concat (car children) expansion))
- (cdr children)))
- (setq children (append (list expansion) children)))
- (setq children (append expansion children))))))))
-
- (goto-char (match-end 0))
- (nreverse children)))
- ;; This was an invalid start tag (Expected ">", but didn't see it.)
- (error "XML: (Well-Formed) Couldn't parse tag: %s"
- (buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
- (t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
- (unless xml-sub-parser ; Usually, we error out.
+ ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
+ (t
+ (unless xml-sub-parser ; Usually, we error out.
(error "XML: (Well-Formed) Invalid character"))
-
;; However, if we're parsing incrementally, then we need to deal
;; with stray CDATA.
(xml-parse-string)))))
(defun xml-parse-string ()
- "Parse the next whatever. Could be a string, or an element."
- (let* ((pos (point))
- (string (progn (skip-chars-forward "^<")
- (buffer-substring-no-properties pos (point)))))
- ;; Clean up the string. As per XML specifications, the XML
- ;; processor should always pass the whole string to the
- ;; application. But \r's should be replaced:
- ;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
- (setq pos 0)
- (while (string-match "\r\n?" string pos)
- (setq string (replace-match "\n" t t string))
- (setq pos (1+ (match-beginning 0))))
-
- (xml-substitute-special string)))
+ "Parse character data at point, and return it as a string.
+Leave point at the start of the next thing to parse. This
+function can modify the buffer by expanding entity and character
+references."
+ (let ((start (point))
+ ref val)
+ (while (and (not (eobp))
+ (not (looking-at "<")))
+ ;; Find the next < or & character.
+ (skip-chars-forward "^<&")
+ (when (eq (char-after) ?&)
+ ;; If we find an entity or character reference, expand it.
+ (unless (looking-at (eval-when-compile
+ (concat "&\\(?:#\\([0-9]+\\)\\|#x\\([0-9a-fA-F]+\\)\\|\\("
+ xml-name-re "\\)\\);")))
+ (error "XML: (Not Well-Formed) Invalid entity reference"))
+ ;; For a character reference, the next entity or character
+ ;; reference must be after the replacement. [4.6] "Numerical
+ ;; character references are expanded immediately when
+ ;; recognized and MUST be treated as character data."
+ (cond ((setq ref (match-string 1))
+ ;; Decimal character reference
+ (setq val (save-match-data
+ (decode-char 'ucs (string-to-number ref))))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Invalid character `%s'" ref))
+ (replace-match (or (string val) xml-undefined-entity) t t))
+ ;; Hexadecimal character reference
+ ((setq ref (match-string 2))
+ (setq val (save-match-data
+ (decode-char 'ucs (string-to-number ref 16))))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Invalid character `x%s'" ref))
+ (replace-match (or (string val) xml-undefined-entity) t t))
+ ;; For an entity reference, search again from the start
+ ;; of the replaced text, since the replacement can
+ ;; contain entity or character references, or markup.
+ ((setq ref (match-string 3))
+ (setq val (assoc ref xml-entity-alist))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Undefined entity `%s'" ref))
+ (replace-match (cdr val) t t)
+ (goto-char (match-beginning 0))))))
+ ;; [2.11] Clean up line breaks.
+ (let ((end-marker (point-marker)))
+ (goto-char start)
+ (while (re-search-forward "\r\n?" end-marker t)
+ (replace-match "\n" t t))
+ (goto-char end-marker)
+ (buffer-substring start (point)))))
(defun xml-parse-attlist (&optional xml-ns)
"Return the attribute-list after point.
@@ -564,15 +607,11 @@ Leave point at the first non-blank character after the tag."
(skip-syntax-forward " "))
(nreverse attlist)))
-;;*******************************************************************
-;;**
-;;** The DTD (document type declaration)
-;;** The following functions know how to skip or parse the DTD of
-;;** a document
-;;**
-;;*******************************************************************
+;;; DTD (document type declaration)
-;; Fixme: This fails at least if the DTD contains conditional sections.
+;; The following functions know how to skip or parse the DTD of a
+;; document. FIXME: it fails at least if the DTD contains conditional
+;; sections.
(defun xml-skip-dtd ()
"Skip the DTD at point.
@@ -789,9 +828,10 @@ references and parameter-entity references."
;; Parameter entity reference
((setq ref (match-string 3 string))
(setq val (assoc ref xml-parameter-entity-alist))
- (if val
- (push (cdr val) children)
- (push (concat "%" ref ";") children))))
+ (and (null val)
+ xml-validating-parser
+ (error "XML: (Validity) Undefined parameter entity `%s'" ref))
+ (push (or (cdr val) xml-undefined-entity) children)))
(setq string remainder)))
(mapconcat 'identity (nreverse (cons string children)) "")))
@@ -828,79 +868,40 @@ references and parameter-entity references."
(t
elem))))
-;;*******************************************************************
-;;**
-;;** Substituting special XML sequences
-;;**
-;;*******************************************************************
+;;; Substituting special XML sequences
(defun xml-substitute-special (string)
- "Return STRING, after substituting entity references."
- ;; This originally made repeated passes through the string from the
- ;; beginning, which isn't correct, since then either "&amp;amp;" or
- ;; "&#38;amp;" won't DTRT.
-
- (let ((point 0)
- children end-point)
- (while (string-match "&\\([^;]*\\);" string point)
- (setq end-point (match-end 0))
- (let* ((this-part (match-string-no-properties 1 string))
- (prev-part (substring string point (match-beginning 0)))
- (entity (assoc this-part xml-entity-alist))
- (expansion
- (cond ((string-match "#\\([0-9]+\\)" this-part)
- (let ((c (decode-char
- 'ucs
- (string-to-number (match-string-no-properties 1 this-part)))))
- (if c (string c))))
- ((string-match "#x\\([[:xdigit:]]+\\)" this-part)
- (let ((c (decode-char
- 'ucs
- (string-to-number (match-string-no-properties 1 this-part) 16))))
- (if c (string c))))
- (entity
- (cdr entity))
- ((eq (length this-part) 0)
- (error "XML: (Not Well-Formed) No entity given"))
- (t
- (if xml-validating-parser
- (error "XML: (Validity) Undefined entity `%s'"
- this-part)
- xml-undefined-entity)))))
-
- (cond ((null children)
- ;; FIXME: If we have an entity that expands into XML, this won't work.
- (setq children
- (concat prev-part expansion)))
- ((stringp children)
- (if (stringp expansion)
- (setq children (concat children prev-part expansion))
- (setq children (list expansion (concat prev-part children)))))
- ((and (stringp expansion)
- (stringp (car children)))
- (setcar children (concat prev-part expansion (car children))))
- ((stringp expansion)
- (setq children (append (concat prev-part expansion)
- children)))
- ((stringp (car children))
- (setcar children (concat (car children) prev-part))
- (setq children (append expansion children)))
- (t
- (setq children (list expansion
- prev-part
- children))))
- (setq point end-point)))
- (cond ((stringp children)
- (concat children (substring string point)))
- ((stringp (car (last children)))
- (concat (car (last children)) (substring string point)))
- ((null children)
- string)
- (t
- (concat (mapconcat 'identity
- (nreverse children)
- "")
- (substring string point))))))
+ "Return STRING, after substituting entity and character references.
+STRING is assumed to occur in an XML attribute value."
+ (let ((ref-re (eval-when-compile
+ (concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\("
+ xml-name-re "\\)\\);")))
+ children)
+ (while (string-match ref-re string)
+ (push (substring string 0 (match-beginning 0)) children)
+ (let* ((remainder (substring string (match-end 0)))
+ (ref (match-string 2 string)))
+ (if ref
+ ;; [4.6] Character references are included as
+ ;; character data.
+ (let ((val (decode-char 'ucs (string-to-number
+ ref (if (match-string 1 string) 16)))))
+ (push (cond (val (string val))
+ (xml-validating-parser
+ (error "XML: (Validity) Undefined character `x%s'" ref))
+ (t xml-undefined-entity))
+ children)
+ (setq string remainder))
+ ;; [4.4.5] Entity references are "included in literal".
+ ;; Note that we don't need do anything special to treat
+ ;; quotes as normal data characters.
+ (setq ref (match-string 3 string))
+ (let ((val (or (cdr (assoc ref xml-entity-alist))
+ (if xml-validating-parser
+ (error "XML: (Validity) Undefined entity `%s'" ref)
+ xml-undefined-entity))))
+ (setq string (concat val remainder))))))
+ (mapconcat 'identity (nreverse (cons string children)) "")))
(defun xml-substitute-numeric-entities (string)
"Substitute SGML numeric entities by their respective utf characters.
@@ -921,12 +922,7 @@ by \"*\"."
string)
nil))
-;;*******************************************************************
-;;**
-;;** Printing a tree.
-;;** This function is intended mainly for debugging purposes.
-;;**
-;;*******************************************************************
+;;; Printing a parse tree (mainly for debugging).
(defun xml-debug-print (xml &optional indent-string)
"Outputs the XML in the current buffer.