diff options
author | Chong Yidong <cyd@gnu.org> | 2012-05-09 16:33:48 +0800 |
---|---|---|
committer | Chong Yidong <cyd@gnu.org> | 2012-05-09 16:33:48 +0800 |
commit | ce7b18ec41c5102f4af27ec22cf873a75f510630 (patch) | |
tree | 7608fccb83f4887f7e95f925b2ee36cd4f78be7d /lisp/url/url-util.el | |
parent | 66b03a53a3218479b93d64857a08b6d4cb5a0f7f (diff) | |
download | emacs-ce7b18ec41c5102f4af27ec22cf873a75f510630.tar.gz |
Improve RFC 3986 conformance of url package.
Fix 2012-04-10 change to url.el.
* url-http.el (url-http-create-request): Ignore obsolete
attributes slot of url-object.
* url-parse.el: Improve RFC 3986 conformance.
(url-generic-parse-url): Do not populate the ATTRIBUTES slot,
since this is not reliable for general RFC 3986 URIs. Keep the
whole path and query inside the FILENAME slot. Improve docstring.
(url-recreate-url-attributes): Mark as obsolete.
(url-recreate-url): Handle missing scheme and userinfo.
* url-util.el (url-encode-url): New function for URL quoting.
(url-encoding-table, url-host-allowed-chars)
(url-path-allowed-chars): New constants.
(url--allowed-chars): New helper function.
(url-hexify-string): Use them.
* url-vars.el (url-nonrelative-link): Make the regexp stricter.
* url.el (url-retrieve-internal): Use url-encode-url.
Fixes: debbugs:7017
Diffstat (limited to 'lisp/url/url-util.el')
-rw-r--r-- | lisp/url/url-util.el | 135 |
1 files changed, 106 insertions, 29 deletions
diff --git a/lisp/url/url-util.el b/lisp/url/url-util.el index d12bd5447fa..c8016ef6cdb 100644 --- a/lisp/url/url-util.el +++ b/lisp/url/url-util.el @@ -333,40 +333,117 @@ forbidden in URL encoding." (concat tmp str))) (defconst url-unreserved-chars - '( - ?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z + '(?a ?b ?c ?d ?e ?f ?g ?h ?i ?j ?k ?l ?m ?n ?o ?p ?q ?r ?s ?t ?u ?v ?w ?x ?y ?z ?A ?B ?C ?D ?E ?F ?G ?H ?I ?J ?K ?L ?M ?N ?O ?P ?Q ?R ?S ?T ?U ?V ?W ?X ?Y ?Z ?0 ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 - ?- ?_ ?. ?! ?~ ?* ?' ?\( ?\)) - "A list of characters that are _NOT_ reserved in the URL spec. -This is taken from RFC 2396.") + ?- ?_ ?. ?~) + "List of characters that are unreserved in the URL spec. +This is taken from RFC 3986 (section 2.3).") + +(defconst url-encoding-table + (let ((vec (make-vector 256 nil))) + (dotimes (byte 256) + (aset vec byte (format "%%%02x" byte))) + vec) + "Vector translating bytes to URI-encoded %-sequences.") + +(defun url--allowed-chars (char-list) + "Return an \"allowed character\" mask (a 256-slot vector). +The Nth element is non-nil if character N is in CHAR-LIST. The +result can be passed as the second arg to `url-hexify-string'." + (let ((vec (make-vector 256 nil))) + (dolist (byte char-list) + (ignore-errors (aset vec byte t))) + vec)) ;;;###autoload -(defun url-hexify-string (string) - "Return a new string that is STRING URI-encoded. -First, STRING is converted to utf-8, if necessary. Then, for each -character in the utf-8 string, those found in `url-unreserved-chars' -are left as-is, all others are represented as a three-character -string: \"%\" followed by two lowercase hex digits." - ;; To go faster and avoid a lot of consing, we could do: - ;; - ;; (defconst url-hexify-table - ;; (let ((map (make-vector 256 nil))) - ;; (dotimes (byte 256) (aset map byte - ;; (if (memq byte url-unreserved-chars) - ;; (char-to-string byte) - ;; (format "%%%02x" byte)))) - ;; map)) - ;; - ;; (mapconcat (curry 'aref url-hexify-table) ...) +(defun url-hexify-string (string &optional allowed-chars) + "URI-encode STRING and return the result. +If STRING is multibyte, it is first converted to a utf-8 byte +string. Each byte corresponding to an allowed character is left +as-is, while all other bytes are converted to a three-character +string: \"%\" followed by two lowercase hex digits. + +The allowed characters are specified by ALLOWED-CHARS. If this +argument is nil, the list `url-unreserved-chars' determines the +allowed characters. Otherwise, ALLOWED-CHARS should be a vector +whose Nth element is non-nil if character N is allowed." + (unless allowed-chars + (setq allowed-chars (url--allowed-chars url-unreserved-chars))) (mapconcat (lambda (byte) - (if (memq byte url-unreserved-chars) - (char-to-string byte) - (format "%%%02x" byte))) - (if (multibyte-string-p string) - (encode-coding-string string 'utf-8) - string) - "")) + (if (aref allowed-chars byte) + (char-to-string byte) + (aref url-encoding-table byte))) + (if (multibyte-string-p string) + (encode-coding-string string 'utf-8) + string) + "")) + +(defconst url-host-allowed-chars + ;; Allow % to avoid re-encoding %-encoded sequences. + (url--allowed-chars (append '(?% ?! ?$ ?& ?' ?\( ?\) ?* ?+ ?, ?\; ?=) + url-unreserved-chars)) + "Allowed-character byte mask for the host segment of a URI. +These characters are specified in RFC 3986, Appendix A.") + +(defconst url-path-allowed-chars + (let ((vec (copy-sequence url-host-allowed-chars))) + (aset vec ?/ t) + (aset vec ?: t) + (aset vec ?@ t) + vec) + "Allowed-character byte mask for the path segment of a URI. +These characters are specified in RFC 3986, Appendix A.") + +(defconst url-query-allowed-chars + (let ((vec (copy-sequence url-path-allowed-chars))) + (aset vec ?? t) + vec) + "Allowed-character byte mask for the query segment of a URI. +These characters are specified in RFC 3986, Appendix A.") + +;;;###autoload +(defun url-encode-url (url) + "Return a properly URI-encoded version of URL. +This function also performs URI normalization, e.g. converting +the scheme to lowercase if it is uppercase. Apart from +normalization, if URL is already URI-encoded, this function +should return it unchanged." + (if (multibyte-string-p url) + (setq url (encode-coding-string url 'utf-8))) + (let* ((obj (url-generic-parse-url url)) + (user (url-user obj)) + (pass (url-password obj)) + (host (url-host obj)) + (file (url-filename obj)) + (frag (url-target obj)) + path query) + (if user + (setf (url-user obj) (url-hexify-string user))) + (if pass + (setf (url-password obj) (url-hexify-string pass))) + (when host + ;; No special encoding for IPv6 literals. + (unless (string-match "\\`\\[.*\\]\\'" host) + (setf (url-host obj) + (url-hexify-string host url-host-allowed-chars)))) + ;; Split FILENAME slot into its PATH and QUERY components, and + ;; encode them separately. The PATH component can contain + ;; unreserved characters, %-encodings, and /:@!$&'()*+,;= + (when file + (if (string-match "\\?" file) + (setq path (substring file 0 (match-beginning 0)) + query (substring file (match-end 0))) + (setq path file)) + (setq path (url-hexify-string path url-path-allowed-chars)) + (if query + (setq query (url-hexify-string query url-query-allowed-chars))) + (setf (url-filename obj) + (if query (concat path "?" query) path))) + (if frag + (setf (url-target obj) + (url-hexify-string frag url-query-allowed-chars))) + (url-recreate-url obj))) ;;;###autoload (defun url-file-extension (fname &optional x) |