summaryrefslogtreecommitdiff
path: root/lisp/url/url-util.el
diff options
context:
space:
mode:
authorChong Yidong <cyd@gnu.org>2012-05-09 16:33:48 +0800
committerChong Yidong <cyd@gnu.org>2012-05-09 16:33:48 +0800
commitce7b18ec41c5102f4af27ec22cf873a75f510630 (patch)
tree7608fccb83f4887f7e95f925b2ee36cd4f78be7d /lisp/url/url-util.el
parent66b03a53a3218479b93d64857a08b6d4cb5a0f7f (diff)
downloademacs-ce7b18ec41c5102f4af27ec22cf873a75f510630.tar.gz
Improve RFC 3986 conformance of url package.
Fix 2012-04-10 change to url.el. * url-http.el (url-http-create-request): Ignore obsolete attributes slot of url-object. * url-parse.el: Improve RFC 3986 conformance. (url-generic-parse-url): Do not populate the ATTRIBUTES slot, since this is not reliable for general RFC 3986 URIs. Keep the whole path and query inside the FILENAME slot. Improve docstring. (url-recreate-url-attributes): Mark as obsolete. (url-recreate-url): Handle missing scheme and userinfo. * url-util.el (url-encode-url): New function for URL quoting. (url-encoding-table, url-host-allowed-chars) (url-path-allowed-chars): New constants. (url--allowed-chars): New helper function. (url-hexify-string): Use them. * url-vars.el (url-nonrelative-link): Make the regexp stricter. * url.el (url-retrieve-internal): Use url-encode-url. Fixes: debbugs:7017
Diffstat (limited to 'lisp/url/url-util.el')
-rw-r--r--lisp/url/url-util.el135
1 files changed, 106 insertions, 29 deletions
diff --git a/lisp/url/url-util.el b/lisp/url/url-util.el
index d12bd5447fa..c8016ef6cdb 100644
--- a/lisp/url/url-util.el
+++ b/lisp/url/url-util.el
@@ -333,40 +333,117 @@ forbidden in URL encoding."
(concat tmp str)))
(defconst url-unreserved-chars
- '(
- ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
+ '(?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z
?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z
?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9
- ?- ?_ ?. ?! ?~ ?* ?' ?\( ?\))
- "A list of characters that are _NOT_ reserved in the URL spec.
-This is taken from RFC 2396.")
+ ?- ?_ ?. ?~)
+ "List of characters that are unreserved in the URL spec.
+This is taken from RFC 3986 (section 2.3).")
+
+(defconst url-encoding-table
+ (let ((vec (make-vector 256 nil)))
+ (dotimes (byte 256)
+ (aset vec byte (format "%%%02x" byte)))
+ vec)
+ "Vector translating bytes to URI-encoded %-sequences.")
+
+(defun url--allowed-chars (char-list)
+ "Return an \"allowed character\" mask (a 256-slot vector).
+The Nth element is non-nil if character N is in CHAR-LIST. The
+result can be passed as the second arg to `url-hexify-string'."
+ (let ((vec (make-vector 256 nil)))
+ (dolist (byte char-list)
+ (ignore-errors (aset vec byte t)))
+ vec))
;;;###autoload
-(defun url-hexify-string (string)
- "Return a new string that is STRING URI-encoded.
-First, STRING is converted to utf-8, if necessary. Then, for each
-character in the utf-8 string, those found in `url-unreserved-chars'
-are left as-is, all others are represented as a three-character
-string: \"%\" followed by two lowercase hex digits."
- ;; To go faster and avoid a lot of consing, we could do:
- ;;
- ;; (defconst url-hexify-table
- ;; (let ((map (make-vector 256 nil)))
- ;; (dotimes (byte 256) (aset map byte
- ;; (if (memq byte url-unreserved-chars)
- ;; (char-to-string byte)
- ;; (format "%%%02x" byte))))
- ;; map))
- ;;
- ;; (mapconcat (curry 'aref url-hexify-table) ...)
+(defun url-hexify-string (string &optional allowed-chars)
+ "URI-encode STRING and return the result.
+If STRING is multibyte, it is first converted to a utf-8 byte
+string. Each byte corresponding to an allowed character is left
+as-is, while all other bytes are converted to a three-character
+string: \"%\" followed by two lowercase hex digits.
+
+The allowed characters are specified by ALLOWED-CHARS. If this
+argument is nil, the list `url-unreserved-chars' determines the
+allowed characters. Otherwise, ALLOWED-CHARS should be a vector
+whose Nth element is non-nil if character N is allowed."
+ (unless allowed-chars
+ (setq allowed-chars (url--allowed-chars url-unreserved-chars)))
(mapconcat (lambda (byte)
- (if (memq byte url-unreserved-chars)
- (char-to-string byte)
- (format "%%%02x" byte)))
- (if (multibyte-string-p string)
- (encode-coding-string string 'utf-8)
- string)
- ""))
+ (if (aref allowed-chars byte)
+ (char-to-string byte)
+ (aref url-encoding-table byte)))
+ (if (multibyte-string-p string)
+ (encode-coding-string string 'utf-8)
+ string)
+ ""))
+
+(defconst url-host-allowed-chars
+ ;; Allow % to avoid re-encoding %-encoded sequences.
+ (url--allowed-chars (append '(?% ?! ?$ ?& ?' ?\( ?\) ?* ?+ ?, ?\; ?=)
+ url-unreserved-chars))
+ "Allowed-character byte mask for the host segment of a URI.
+These characters are specified in RFC 3986, Appendix A.")
+
+(defconst url-path-allowed-chars
+ (let ((vec (copy-sequence url-host-allowed-chars)))
+ (aset vec ?/ t)
+ (aset vec ?: t)
+ (aset vec ?@ t)
+ vec)
+ "Allowed-character byte mask for the path segment of a URI.
+These characters are specified in RFC 3986, Appendix A.")
+
+(defconst url-query-allowed-chars
+ (let ((vec (copy-sequence url-path-allowed-chars)))
+ (aset vec ?? t)
+ vec)
+ "Allowed-character byte mask for the query segment of a URI.
+These characters are specified in RFC 3986, Appendix A.")
+
+;;;###autoload
+(defun url-encode-url (url)
+ "Return a properly URI-encoded version of URL.
+This function also performs URI normalization, e.g. converting
+the scheme to lowercase if it is uppercase. Apart from
+normalization, if URL is already URI-encoded, this function
+should return it unchanged."
+ (if (multibyte-string-p url)
+ (setq url (encode-coding-string url 'utf-8)))
+ (let* ((obj (url-generic-parse-url url))
+ (user (url-user obj))
+ (pass (url-password obj))
+ (host (url-host obj))
+ (file (url-filename obj))
+ (frag (url-target obj))
+ path query)
+ (if user
+ (setf (url-user obj) (url-hexify-string user)))
+ (if pass
+ (setf (url-password obj) (url-hexify-string pass)))
+ (when host
+ ;; No special encoding for IPv6 literals.
+ (unless (string-match "\\`\\[.*\\]\\'" host)
+ (setf (url-host obj)
+ (url-hexify-string host url-host-allowed-chars))))
+ ;; Split FILENAME slot into its PATH and QUERY components, and
+ ;; encode them separately. The PATH component can contain
+ ;; unreserved characters, %-encodings, and /:@!$&'()*+,;=
+ (when file
+ (if (string-match "\\?" file)
+ (setq path (substring file 0 (match-beginning 0))
+ query (substring file (match-end 0)))
+ (setq path file))
+ (setq path (url-hexify-string path url-path-allowed-chars))
+ (if query
+ (setq query (url-hexify-string query url-query-allowed-chars)))
+ (setf (url-filename obj)
+ (if query (concat path "?" query) path)))
+ (if frag
+ (setf (url-target obj)
+ (url-hexify-string frag url-query-allowed-chars)))
+ (url-recreate-url obj)))
;;;###autoload
(defun url-file-extension (fname &optional x)