;;; url-queue.el --- Fetching web pages in parallel   -*- lexical-binding: t -*-

;; Copyright (C) 2011-2016 Free Software Foundation, Inc.

;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
;; Keywords: comm

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.

;;; Commentary:

;; The point of this package is to allow fetching web pages in
;; parallel -- but control the level of parallelism to avoid DoS-ing
;; web servers and Emacs.
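;;
;; A minimal usage sketch (the URL and the callback are illustrative,
;; not part of this package's API beyond `url-queue-retrieve' itself):
;;
;;   (url-queue-retrieve "https://www.gnu.org/"
;;                       (lambda (status)
;;                         (message "Fetch finished: %S" status)))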

;;; Code:

(eval-when-compile (require 'cl-lib))
(require 'browse-url)
(require 'url-parse)

(defcustom url-queue-parallel-processes 6
  "The number of concurrent processes."
  :version "24.1"
  :type 'integer
  :group 'url)

(defcustom url-queue-timeout 5
  "How long to let a job live once it's started (in seconds)."
  :version "24.1"
  :type 'integer
  :group 'url)
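
;; For example, to be gentler on web servers one might lower the
;; parallelism and tolerate slower responses (values illustrative):
;;
;;   (setq url-queue-parallel-processes 2
;;         url-queue-timeout 15)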

;;; Internal variables.

(defvar url-queue nil)
(defvar url-queue-progress-timer nil)

(cl-defstruct url-queue
  url callback cbargs silentp
  buffer start-time pre-triggered
  inhibit-cookiesp)

;;;###autoload
(defun url-queue-retrieve (url callback &optional cbargs silent inhibit-cookies)
  "Retrieve URL asynchronously and call CALLBACK with CBARGS when finished.
This is like `url-retrieve' (which see for details of the arguments),
but with limits on the degree of parallelism.  The variable
`url-queue-parallel-processes' sets the number of concurrent processes.
The variable `url-queue-timeout' sets a timeout."
  (setq url-queue
	(append url-queue
		(list (make-url-queue :url url
				      :callback callback
				      :cbargs cbargs
				      :silentp silent
				      :inhibit-cookiesp inhibit-cookies))))
  (url-queue-setup-runners))
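
;; For example, queueing several fetches that share a callback and
;; pass an extra argument through CBARGS (the URLs and the `tag'
;; value are illustrative):
;;
;;   (dolist (url '("https://www.gnu.org/software/emacs/"
;;                  "https://www.gnu.org/licenses/"))
;;     (url-queue-retrieve url
;;                         (lambda (status tag)
;;                           (message "%s: %S" tag status))
;;                         (list 'demo)
;;                         t))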

;; To ensure asynchronous behavior, we start the required number of
;; queue runners from `run-with-idle-timer'.  So we're basically going
;; through the queue in two ways: 1) synchronously, when a program
;; calls `url-queue-retrieve' (which then starts the required number
;; of queue runners), and 2) at the exit of each job, which does not
;; start any further timers, but just reuses the previous "slot".

(defun url-queue-setup-runners ()
  (let ((running 0)
	waiting)
    (dolist (entry url-queue)
      (cond
       ((or (url-queue-start-time entry)
	    (url-queue-pre-triggered entry))
	(cl-incf running))
       ((not waiting)
	(setq waiting entry))))
    (when (and waiting
	       (< running url-queue-parallel-processes))
      (setf (url-queue-pre-triggered waiting) t)
      ;; We start fetching from this idle timer...
      (run-with-idle-timer 0.01 nil #'url-queue-run-queue)
      ;; And then we set up a separate timer to ensure progress when a
      ;; web server is unresponsive.
      (unless url-queue-progress-timer
        (setq url-queue-progress-timer
              (run-with-idle-timer 1 1 #'url-queue-check-progress))))))

(defun url-queue-run-queue ()
  (url-queue-prune-old-entries)
  (let ((running 0)
	waiting)
    (dolist (entry url-queue)
      (cond
       ((url-queue-start-time entry)
	(cl-incf running))
       ((not waiting)
	(setq waiting entry))))
    (when (and waiting
	       (< running url-queue-parallel-processes))
      (setf (url-queue-start-time waiting) (float-time))
      (url-queue-start-retrieve waiting))))

(defun url-queue-check-progress ()
  (when url-queue-progress-timer
    (if url-queue
        (url-queue-run-queue)
      (cancel-timer url-queue-progress-timer)
      (setq url-queue-progress-timer nil))))

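;; STATUS, as passed to the job's callback, is the plist constructed
;; by `url-retrieve'.  For a failed connection it is expected to look
;; roughly like this (shape inferred from the destructuring below;
;; the message text is illustrative):
;;
;;   (:error (error connection-failed "failed with code 111\n"
;;                  :host "example.invalid" :service 80))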
(defun url-queue-callback-function (status job)
  (setq url-queue (delq job url-queue))
  (when (and (eq (car status) :error)
	     (eq (cadr (cadr status)) 'connection-failed))
    ;; If we get a connection error, then flush all other jobs from
    ;; the host from the queue.  This particularly makes sense if the
    ;; error really is a DNS resolver issue, which happens
    ;; synchronously and totally halts Emacs.
    (url-queue-remove-jobs-from-host
     (plist-get (nthcdr 3 (cadr status)) :host)))
  (url-queue-run-queue)
  (apply (url-queue-callback job) (cons status (url-queue-cbargs job))))

(defun url-queue-remove-jobs-from-host (host)
  (let ((jobs nil))
    (dolist (job url-queue)
      (when (equal (url-host (url-generic-parse-url (url-queue-url job)))
		   host)
	(push job jobs)))
    (dolist (job jobs)
      (url-queue-kill-job job)
      (setq url-queue (delq job url-queue)))))

(defun url-queue-start-retrieve (job)
  (setf (url-queue-buffer job)
	(ignore-errors
	  (let ((url-request-noninteractive t))
	    (url-retrieve (url-queue-url job)
			  #'url-queue-callback-function (list job)
			  (url-queue-silentp job)
			  (url-queue-inhibit-cookiesp job))))))

(defun url-queue-prune-old-entries ()
  (let (dead-jobs)
    (dolist (job url-queue)
      ;; Kill jobs that have lasted longer than the timeout.
      (when (and (url-queue-start-time job)
		 (> (- (float-time) (url-queue-start-time job))
		    url-queue-timeout))
	(push job dead-jobs)))
    (dolist (job dead-jobs)
      (url-queue-kill-job job)
      (setq url-queue (delq job url-queue)))))

(defun url-queue-kill-job (job)
  (when (bufferp (url-queue-buffer job))
    (let (process)
      (while (setq process (get-buffer-process (url-queue-buffer job)))
	(set-process-sentinel process 'ignore)
	(ignore-errors
	  (delete-process process)))))
  ;; Call the callback with an error message to ensure that the caller
  ;; is notified that the job has failed.
  (with-current-buffer
      (if (and (bufferp (url-queue-buffer job))
	       (buffer-live-p (url-queue-buffer job)))
	  ;; Use the (partially filled) process buffer if it exists.
	  (url-queue-buffer job)
	;; If not, just create a new buffer, which will probably be
	;; killed again by the caller.
	(generate-new-buffer " *temp*"))
    (apply (url-queue-callback job)
	   (cons (list :error (list 'error 'url-queue-timeout
				    "Queue timeout exceeded"))
		 (url-queue-cbargs job)))))

(provide 'url-queue)

;;; url-queue.el ends here