summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormstenner <mstenner>2003-10-12 02:03:27 +0000
committermstenner <mstenner>2003-10-12 02:03:27 +0000
commitede1c014b207ba53625a36f43f7dabf85d936936 (patch)
tree0ca30ee83d0aea46f9595ed979c5733a4d96db62
downloadurlgrabber-ede1c014b207ba53625a36f43f7dabf85d936936.tar.gz
Initial revision
-rw-r--r--LICENSE280
-rw-r--r--MANIFEST.in3
-rw-r--r--README19
-rw-r--r--makefile12
-rw-r--r--progress_meter.py161
-rw-r--r--setup.py9
-rw-r--r--urlgrabber.py752
-rw-r--r--urlgrabber/keepalive.py379
8 files changed, 1615 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c7aea18
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,280 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 675 Mass Ave, Cambridge, MA 02139, USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..e1f9744
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,3 @@
+include urlgrabber.py keepalive.py progress_meter.py
+include README
+include LICENSE
diff --git a/README b/README
new file mode 100644
index 0000000..bdc8e86
--- /dev/null
+++ b/README
@@ -0,0 +1,19 @@
+urlgrabber -- A high-level cross-protocol url-grabber
+
+INSTALLATION INSTRUCTIONS
+
+If you want to install urlgrabber on your system, simply open the package
+and run
+
+ python setup.py install
+
+Take a look at the install options by doing
+
+ python setup.py --help
+
+If you are on a Red Hat Linux machine that still uses python 1.5.x by
+default, you may need to run python2 in place of python above.
+
+If you just want to use urlgrabber.py (and/or keepalive.py) in your
+own programs, simply grab the file(s) and put it with the rest of your
+program's modules.
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..bd07f82
--- /dev/null
+++ b/makefile
@@ -0,0 +1,12 @@
+RM = /bin/rm -f
+WEBHOST = teton.dulug.duke.edu
+WEBPATH = /var/www/linuxduke/projects/mini/urlgrabber
+
+dist:
+ python2 setup.py sdist --force-manifest
+ scp dist/* $(WEBHOST):$(WEBPATH)/dist/
+
+clean:
+ $(RM) MANIFEST
+ $(RM) -r dist/
+ $(RM) *.pyc
diff --git a/progress_meter.py b/progress_meter.py
new file mode 100644
index 0000000..506b107
--- /dev/null
+++ b/progress_meter.py
@@ -0,0 +1,161 @@
+import sys
+import time
+
class text_progress_meter:
    """Text-mode download progress meter.

    Draws a single self-overwriting status line (leading '\r') on the
    given file object.  Subclasses can override the _do_start,
    _do_update, and _do_end display hooks to reuse the timing,
    rate-projection, and number-formatting machinery with a different
    front end (e.g. a GUI).
    """

    def __init__(self, fo=sys.stderr):
        # fo: file-like object the meter writes to (needs write/flush)
        self.fo = fo
        self.update_period = 0.3  # minimum seconds between redraws

    def start(self, filename, url, basename, length):
        """Begin tracking a new download.

        filename/url/basename identify the transfer for display;
        length is the expected total size in bytes, or None if unknown.
        """
        self.filename = filename
        self.url = url
        self.basename = basename
        self.length = length
        if length is not None:
            # pre-formatted total size (kept for display-hook subclasses)
            self.flength = self.format_number(length) + 'B'
        self.start_time = time.time()
        self.last_update = 0
        self.read = 0
        # Rate-projection state.  Previously this was only initialized
        # inside project()'s read==0 branch, so a first update() that
        # already had bytes read raised AttributeError.
        self.last_etime = 0
        self.last_read = 0
        self.ave_rate = None
        self._do_start()

    def _do_start(self):
        # display hook - no-op for the plain text meter
        pass

    def end(self):
        """Finish the transfer and draw the final status line."""
        self.now = time.time()
        self._do_end()

    def _do_end(self):
        total_time = self.format_time(self.now - self.start_time)
        total_size = self.format_number(self.read)
        if self.length is None:
            out = '\r%-60.60s %5sB %s ' % \
                  (self.basename, total_size, total_time)
        else:
            bar = '='*25
            out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \
                  (self.basename, 100, bar, total_size, total_time)
        self.fo.write(out)
        self.fo.write('\n')
        self.fo.flush()

    def update(self, read):
        """Report total bytes read so far.

        Redraws at most once per update_period seconds (and on the very
        first call).
        """
        # for a real gui, you probably want to override and put a call
        # to your mainloop iteration function here
        self.read = read  # stored here so self.end() sees the final count
        now = time.time()
        if (now >= self.last_update + self.update_period) or \
               not self.last_update:
            self.now = now
            self._do_update(read)
            self.last_update = now

    def _do_update(self, read):
        etime = self.now - self.start_time  # elapsed time so far
        fetime = self.format_time(etime)
        fread = self.format_number(read)

        if self.length is None:
            out = '\r%-60.60s %5sB %s ' % \
                  (self.basename, fread, fetime)
        else:
            rtime = self.format_time(self.project(etime, read))
            try:
                frac = float(read)/self.length
            except ZeroDivisionError:
                frac = 1.0  # zero-length file counts as complete
            if frac > 1.0:
                frac = 1.0  # server sent more than the declared length
            bar = '='*int(25 * frac)
            out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \
                  (self.basename, frac*100, bar, fread, rtime)
        self.fo.write(out)
        self.fo.flush()

    def project(self, etime, read):
        """Return the rounded projected remaining download time.

        Returns None when nothing has been read yet (no basis for a
        projection) and 0 when no new bytes arrived since last call.
        """
        if read == 0:
            # if we just started this file, all bets are off
            self.last_etime = etime
            self.last_read = 0
            self.ave_rate = None
            return None

        time_diff = etime - self.last_etime
        read_diff = read - self.last_read
        self.last_etime = etime
        self.last_read = read
        try:
            rate = time_diff / read_diff  # seconds per byte (inverse rate)
        except ZeroDivisionError:
            return 0  # should only happen at end of file

        self._get_new_ave_rate(rate)
        remaining_time = self.ave_rate * (self.length - read)
        if remaining_time < 0:
            remaining_time = 0
        return self._round_remaining_time(remaining_time)

    def _get_new_ave_rate(self, rate, epsilon=0.98):
        """Fold a new instantaneous (inverse) rate into the average."""
        if self.ave_rate is None:
            self.ave_rate = rate
        else:
            # calculate a "rolling average" - this balances long-term
            # behavior with short-term fluctuations
            # epsilon = 0.0 --> only consider most recent block
            # epsilon = 1.0 --> only consider first block
            self.ave_rate = (self.ave_rate * epsilon) + (rate * (1-epsilon))

    def _round_remaining_time(self, remaining_time):
        """Round the ETA to coarser units so the display doesn't jitter."""
        i = 1
        while remaining_time > 30:
            i = i * 2
            remaining_time = remaining_time / 2
        remaining_time = int(remaining_time)
        return float(remaining_time * i)

    def format_time(self, seconds):
        """Format a duration as MM:SS; '--:--' for unknown or negative."""
        if seconds is None or seconds < 0:
            return '--:--'
        seconds = int(seconds)
        minutes, seconds = divmod(seconds, 60)
        return '%02i:%02i' % (minutes, seconds)

    def format_number(self, number, SI=0, space=' '):
        """Turn numbers into human-readable metric-like numbers"""
        symbols = ['',   # (none)
                   'k',  # kilo
                   'M',  # mega
                   'G',  # giga
                   'T',  # tera
                   'P',  # peta
                   'E',  # exa
                   'Z',  # zetta
                   'Y']  # yotta

        if SI: step = 1000.0
        else: step = 1024.0

        thresh = 999
        depth = 0

        # scale the number down until it fits in at most three digits
        while number > thresh:
            depth = depth + 1
            number = number / step

        # just in case someone needs more than 1000 yottabytes!
        # NOTE(review): this clamp multiplies by thresh**depth rather
        # than undoing the extra divisions by step - confirm intent
        diff = depth - len(symbols) + 1
        if diff > 0:
            depth = depth - diff
            number = number * thresh**depth

        if isinstance(number, int):
            # an undivided integer prints exactly, with no decimal point
            fmt = '%i%s%s'
        elif number < 9.95:
            # must use 9.95 for proper sizing.  For example, 9.99 will be
            # rounded to 10.0 with the .1f format string (which is too long)
            fmt = '%.1f%s%s'
        else:
            fmt = '%.0f%s%s'

        return fmt % (number, space, symbols[depth])
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..8310997
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,9 @@
# Distutils install script for the urlgrabber package.
# Usage: python setup.py install   (see the README for details)
from distutils.core import setup
setup(name="urlgrabber",
      version="0.2",
      description="high-level cross-protocol url-grabber",
      author="Michael D. Stenner",
      author_email="mstenner@phy.duke.edu",
      url="http://linux.duke.edu/projects/mini/urlgrabber/",
      license="GPL",
      # NOTE(review): lists keepalive as a top-level module, but the
      # commit tree places it at urlgrabber/keepalive.py - confirm
      py_modules=["urlgrabber", "keepalive", "progress_meter"])
diff --git a/urlgrabber.py b/urlgrabber.py
new file mode 100644
index 0000000..156d85a
--- /dev/null
+++ b/urlgrabber.py
@@ -0,0 +1,752 @@
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Library General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+# Copyright 2002-2003 Michael D. Stenner
+
+import os
+import os.path
+import urlparse
+import rfc822
+import time
+import string
+
+DEBUG=0
+VERSION='0.2'
+
+try:
+ from i18n import _
+except ImportError, msg:
+ def _(st): return st
+
+try:
+ from httplib import HTTPException
+except ImportError, msg:
+ HTTPException = None
+
+special_handlers = []
+
+try:
+ import urllib2
+except ImportError, msg:
+ import urllib
+ urllib._urlopener = urllib.FancyURLopener() # make sure it ready now
+ urllib2 = urllib # this way, we can always just do urllib.urlopen()
+ have_urllib2 = 0
+ auth_handler = None
+else:
+ have_urllib2 = 1
+ auth_handler = urllib2.HTTPBasicAuthHandler( \
+ urllib2.HTTPPasswordMgrWithDefaultRealm())
+ special_handlers.append(auth_handler)
+
+try:
+ # This is a convenient way to make keepalive optional.
+ # Just rename the module so it can't be imported.
+ from keepalive import HTTPHandler
+except ImportError, msg:
+ keepalive_handler = None
+else:
+ keepalive_handler = HTTPHandler()
+ special_handlers.append(keepalive_handler)
+
+if have_urllib2:
+ opener = apply(urllib2.build_opener, special_handlers)
+ urllib2.install_opener(opener)
+
def set_user_agent(new_user_agent):
    """Set the User-agent header sent with subsequent requests.

    Updates the header list of the installed urllib2 opener (or of the
    urllib opener when urllib2 is unavailable): an existing User-agent
    entry is replaced in place, otherwise one is appended.
    """
    if have_urllib2:
        headers = opener.addheaders
    else:
        headers = urllib._urlopener.addheaders

    replacement = ('User-agent', new_user_agent)
    for index in range(len(headers)):
        if headers[index][0] == 'User-agent':
            headers[index] = replacement
            break
    else:
        # loop fell through without a break: no User-agent entry yet
        headers.append(replacement)
+
+# the calling application can override this user-agent by calling
+# urlgrabber.set_user_agent
+set_user_agent('urlgrabber/%s' % VERSION)
+
class URLGrabError(IOError):
    """Exception raised for all urlgrab/retrygrab failures.

    URLGrabError error codes:
     -1 - default retry code for retrygrab check functions
      0 - everything looks good (you should never see this)
      1 - malformed url
      2 - local file doesn't exist
      3 - request for non-file local file (dir, etc)
      4 - IOError on fetch
      5 - OSError on fetch
      6 - no content length header when we expected one
      7 - HTTPException
      8 - Exceeded read limit (for urlread)

    Negative codes are reserved for use by functions passed in to
    retrygrab with checkfunc.

    You can use it like this:
      try: urlgrab(url)
      except URLGrabError, e:
         if e.errno == 3: ...
         # or
         print e.strerror
         # or simply
         print e  #### print '[Errno %i] %s' % (e.errno, e.strerror)
    """
    pass
+
def close_all():
    """Close any open keepalive connections.

    A no-op when the optional keepalive handler was not importable.
    """
    if keepalive_handler is not None:
        keepalive_handler.close_all()
+
_throttle = 1.0   # module default throttle (float => fraction of _bandwidth)
_bandwidth = 0    # module default nominal bandwidth, bytes/second
def set_throttle(new_throttle):
    """urlgrab supports throttling via two values: throttle and bandwidth
    Between the two, you can either specify an absolute throttle threshold
    or specify a threshold as a fraction of maximum available bandwidth.

    throttle is a number - if it's an int, it's the bytes/second throttle
    limit.  If it's a float, it is first multiplied by bandwidth.  If
    throttle == 0, throttling is disabled.  If None, the module-level
    default (which can be set with set_throttle) is used.

    bandwidth is the nominal max bandwidth in bytes/second.  If throttle
    is a float and bandwidth == 0, throttling is disabled.  If None,
    the module-level default (which can be set with set_bandwidth) is
    used.

    EXAMPLES:

    Lets say you have a 100 Mbps connection.  This is (about) 10^8 bits
    per second, or 12,500,000 Bytes per second.  You have a number of
    throttling options:

    *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float

       This will limit urlgrab to use half of your available bandwidth.

    *) set_throttle(6250000) # throttle is an int

       This will also limit urlgrab to use half of your available
       bandwidth, regardless of what bandwidth is set to.

    *) set_bandwidth(6250000); set_throttle(1.0) # float

       Use half your bandwidth

    *) set_bandwidth(6250000); set_throttle(2.0) # float

       Use up to 12,500,000 Bytes per second (your nominal max bandwidth)

    *) set_bandwidth(6250000); set_throttle(0) # throttle = 0

       Disable throttling - this is more efficient than a very large
       throttle setting.

    *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0

       Disable throttling - this is the default when the module is loaded.


    SUGGESTED AUTHOR IMPLEMENTATION

    While this is flexible, it's not extremely obvious to the user.  I
    suggest you implement a float throttle as a percent to make the
    distinction between absolute and relative throttling very explicit.

    Also, you may want to convert the units to something more convenient
    than bytes/second, such as kbps or kB/s, etc.
    """
    global _throttle
    _throttle = new_throttle
+
def set_bandwidth(new_bandwidth):
    """Set the module-level nominal max bandwidth in bytes/second.

    The bandwidth value is only consulted when a float throttle is in
    effect; see set_throttle for the full throttling semantics.
    """
    global _bandwidth
    _bandwidth = new_bandwidth

# module-level default progress object; None means no progress display
_progress_obj = None
def set_progress_obj(new_progress_obj):
    """Set the module-level default progress object.

    urlgrab uses this object when the caller does not pass a
    progress_obj explicitly; see urlgrab for the required interface.
    """
    global _progress_obj
    _progress_obj = new_progress_obj
+
+
def retrygrab(url, filename=None, copy_local=0, close_connection=0,
              progress_obj=None, throttle=None, bandwidth=None,
              numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
    """a wrapper function for urlgrab that retries downloads

    The args for retrygrab are the same as urlgrab except for numtries,
    retrycodes, and checkfunc.  You should use keyword arguments for
    these in case new args are added to urlgrab later.  If you use keyword
    args (especially for the retrygrab-specific options) then retrygrab
    will continue to be a drop-in replacement for urlgrab.  Otherwise,
    things may break.

    retrygrab exits just like urlgrab in either case.  Either it
    returns the local filename or it raises an exception.  The
    exception raised will be the one raised MOST RECENTLY by urlgrab.

    retrygrab ONLY retries if URLGrabError is raised.  If urlgrab (or
    checkfunc) raise some other exception, it will be passed up
    immediately.

    numtries
      number of times to retry the grab before bailing.  If this is
      zero, it will retry forever. This was intentional... really,
      it was :)

    retrycodes
      the errorcodes (values of e.errno) for which it should retry.
      See the doc on URLGrabError for more details on this.

    checkfunc
      a function to do additional checks.  This defaults to None,
      which means no additional checking.  The function should simply
      return on a successful check.  It should raise URLGrabError on
      an unsuccessful check.  Raising of any other exception will
      be considered immediate failure and no retries will occur.

      Negative error numbers are reserved for use by these passed in
      functions.  By default, -1 results in a retry, but this can be
      customized with retrycodes.

      If you simply pass in a function, it will be given exactly one
      argument: the local file name as returned by urlgrab.  If you
      need to pass in other arguments, you can do so like this:

        checkfunc=(function, ('arg1', 2), {'kwarg': 3})

      if the downloaded file has filename /tmp/stuff, then this will
      result in this call:

        function('/tmp/stuff', 'arg1', 2, kwarg=3)

      NOTE: both the "args" tuple and "kwargs" dict must be present
      if you use this syntax, but either (or both) can be empty.
    """

    tries = 0
    # normalize checkfunc into (func, args, kwargs); func=None disables it
    if checkfunc is not None:
        if callable(checkfunc):
            func, args, kwargs = checkfunc, (), {}
        else:
            func, args, kwargs = checkfunc
    else:
        func = None

    while 1:
        tries = tries + 1
        if DEBUG: print('TRY #%i: %s' % (tries, url))
        try:
            fname = urlgrab(url, filename, copy_local, close_connection,
                            progress_obj, throttle, bandwidth)
            if func is not None:
                func(fname, *args, **kwargs)
            if DEBUG: print('RESULT = success (%s)' % fname)
            return fname
        except URLGrabError as e:
            if DEBUG: print('EXCEPTION: %s' % e)
            # give up on the last allowed try, or on a non-retryable code
            if tries == numtries or (e.errno not in retrycodes): raise
+
def urlgrab(url, filename=None, copy_local=0, close_connection=0,
            progress_obj=None, throttle=None, bandwidth=None):
    """grab the file at <url> and make a local copy at <filename>

    If filename is none, the basename of the url is used.

    copy_local is ignored except for file:// urls, in which case it
    specifies whether urlgrab should still make a copy of the file, or
    simply point to the existing copy.

    close_connection tells urlgrab to close the connection after
    completion.  This is ignored unless the download happens with the
    http keepalive handler.  Otherwise, the connection is left open
    for further use.

    progress_obj is a class instance that supports the following methods:
      po.start(filename, url, basename, length)
      # length will be None if unknown
      po.update(read) # read == bytes read so far
      po.end()

    throttle is a number - if it's an int, it's the bytes/second throttle
    limit.  If it's a float, it is first multiplied by bandwidth.  If
    throttle == 0, throttling is disabled.  If None, the module-level
    default (which can be set with set_throttle) is used.

    bandwidth is the nominal max bandwidth in bytes/second.  If throttle
    is a float and bandwidth == 0, throttling is disabled.  If None,
    the module-level default (which can be set with set_bandwidth) is
    used.

    urlgrab returns the filename of the local file, which may be different
    from the passed-in filename if copy_local == 0.

    Raises URLGrabError (codes 2/3 for bad local files, 4/5/7 for
    IOError/OSError/HTTPException during the fetch).
    """

    url, parts = _parse_url(url)
    (scheme, host, path, parm, query, frag) = parts

    if filename is None:
        filename = os.path.basename(path)
    if scheme == 'file' and not copy_local:
        # just return the name of the local file - don't make a copy
        # currently we don't do anything with the progress_cb here
        if not os.path.exists(path):
            raise URLGrabError(2, _('Local file does not exist: %s') % (path, ))
        elif not os.path.isfile(path):
            raise URLGrabError(3, _('Not a normal file: %s') % (path, ))
        else:
            return path

    raw_throttle = _get_raw_throttle(throttle, bandwidth)
    # progress_obj=None means "use the module default"; any other false
    # value explicitly disables progress display for this call only
    if progress_obj is None: progress_obj = _progress_obj
    elif not progress_obj: progress_obj = None
    fo, hdr = _do_open(url)

    # download and store the file
    try:
        if progress_obj or raw_throttle:
            if progress_obj:
                # a missing or malformed Content-Length header means the
                # total size is unknown (was a bare except, which also
                # hid unrelated bugs)
                try: length = int(hdr['Content-Length'])
                except (KeyError, ValueError, TypeError): length = None
                progress_obj.start(filename, url, os.path.basename(path), length)
            fo = URLGrabberFileObject(fo, progress_obj, raw_throttle)
        _do_grab(filename, fo, hdr)
        fo.close()

        if close_connection:
            # try and close connection - only the keepalive file object
            # provides close_connection(), so ignore its absence
            try: fo.close_connection()
            except AttributeError: pass
    except IOError as e:
        raise URLGrabError(4, _('IOError: %s') % (e, ))
    except OSError as e:
        raise URLGrabError(5, _('OSError: %s') % (e, ))
    except HTTPException as e:
        # NOTE(review): HTTPException is None when httplib failed to
        # import; this clause assumes httplib was importable - confirm
        raise URLGrabError(7, _('HTTP Error (%s): %s') % \
              (e.__class__.__name__, e))

    return filename
+
def urlopen(url, progress_obj=None, throttle=None, bandwidth=None):
    """open the url and return a file object

    If a progress object or throttle specifications exist, then
    a special file object will be returned that supports them.
    The file object can be treated like any other file object.
    """
    url, parts = _parse_url(url)
    (scheme, host, path, parm, query, frag) = parts
    raw_throttle = _get_raw_throttle(throttle, bandwidth)
    # same convention as urlgrab: None means "use the module default",
    # any other false value explicitly disables the default meter
    if progress_obj is None: progress_obj = _progress_obj
    elif not progress_obj: progress_obj = None
    fo, hdr = _do_open(url)
    if progress_obj or raw_throttle:
        if progress_obj:
            # a missing, None, or malformed Content-Length header just
            # means "length unknown"
            try: length = int(hdr['Content-Length'])
            except (KeyError, TypeError, ValueError): length = None
            progress_obj.start(None, url, os.path.basename(path), length)
        fo = URLGrabberFileObject(fo, progress_obj, raw_throttle)
    return fo
+
def urlread(url, progress_obj=None, throttle=None, bandwidth=None, limit=None):
    """read the url into a string, up to 'limit' bytes

    If the limit is exceeded, an exception will be thrown.  Note that urlread
    is NOT intended to be used as a way of saying "I want the first N bytes"
    but rather 'read the whole file into memory, but don't use too much'
    """
    fo = urlopen(url, progress_obj, throttle, bandwidth)
    if limit is None:
        # no limit: read it all.  (previously this crashed on the
        # default limit=None because of fo.read(None + 1))
        s = fo.read()
    else:
        # read limit+1 bytes so we can tell "exactly limit" apart
        # from "more than limit"
        s = fo.read(limit+1)
    fo.close()
    if limit and len(s) > limit:
        raise URLGrabError(8, _('Exceeded limit (%i): %s') % (limit, url))
    return s
+
class URLGrabberFileObject:
    """This is a file-object wrapper that supports progress objects and
    throttling.

    This exists to solve the following problem: lets say you want to
    drop-in replace a normal open with urlopen.  You want to use a
    progress meter and/or throttling, but how do you do that without
    rewriting your code?  Answer: urlopen will return a wrapped file
    object that does the progress meter and-or throttling internally.
    """

    def __init__(self, fo, progress_obj, raw_throttle):
        self.fo = fo                      # the wrapped file object
        self.raw_throttle = raw_throttle  # max bytes/sec (0 = no throttle)
        self.progress_obj = progress_obj  # progress meter instance (or None)
        self._rbuf = ''                   # read-ahead buffer (for readline)
        self._rbufsize = 1024*8           # how much to read at a time
        self._ttime = time.time()         # time of the last throttle check
        self._tsize = 0                   # bytes read since that check
        self._amount_read = 0             # total bytes read so far
        if progress_obj: progress_obj.update(0)

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.
        Any attribute not found in _this_ object will be searched for
        in self.fo.  This includes methods."""
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError(name)

    def _fill_buffer(self, amt=None):
        """fill the buffer to contain at least 'amt' bytes by reading
        from the underlying file object.  If amt is None, then it will
        read until it gets nothing more.  It updates the progress meter
        and throttles after every self._rbufsize bytes."""
        # the _rbuf test is only in this first 'if' for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt = amt - L
            else:
                return

        # if we've made it here, then we don't have enough in the buffer
        # and we need to read more.

        buf = [self._rbuf]
        bufsize = len(self._rbuf)
        while amt is None or amt:
            # first, delay if necessary for throttling reasons
            if self.raw_throttle:
                diff = self._tsize/self.raw_throttle - \
                       (time.time() - self._ttime)
                if diff > 0: time.sleep(diff)
                self._ttime = time.time()

            # now read some data, up to self._rbufsize
            if amt is None: readamount = self._rbufsize
            else: readamount = min(amt, self._rbufsize)
            new = self.fo.read(readamount)
            newsize = len(new)
            if not newsize: break # no more to read

            if amt: amt = amt - newsize
            buf.append(new)
            bufsize = bufsize + newsize
            self._tsize = newsize
            self._amount_read = self._amount_read + newsize
            if self.progress_obj:
                self.progress_obj.update(self._amount_read)

        # ''.join is equivalent to string.join(buf, '') and works on
        # every python version
        self._rbuf = ''.join(buf)
        return

    def read(self, amt=None):
        """read and return up to 'amt' bytes (all remaining if amt is None)"""
        self._fill_buffer(amt)
        if amt is None:
            s, self._rbuf = self._rbuf, ''
        else:
            s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
        return s

    def readline(self, limit=-1):
        """read one line; if limit >= 0, return at most 'limit' bytes"""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            L = len(self._rbuf)
            self._fill_buffer(L + self._rbufsize)
            if not len(self._rbuf) > L: break
            i = self._rbuf.find('\n', L)

        if i < 0: i = len(self._rbuf)
        else: i = i+1
        # truncate to the limit only when it would shorten the line; the
        # old test (0 <= limit < len(self._rbuf)) could return bytes PAST
        # the newline whenever i <= limit < len(self._rbuf)
        if 0 <= limit < i: i = limit

        s, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return s

    def close(self):
        """finish the progress meter (if any), then close the real file"""
        if self.progress_obj:
            self.progress_obj.end()
        self.fo.close()
+
+def _parse_url(url):
+ """break up the url into its component parts
+
+ This function disassembles a url and
+ 1) "normalizes" it, tidying it up a bit
+ 2) does any authentication stuff it needs to do
+
+ it returns the (cleaned) url and a tuple of component parts
+ """
+ (scheme, host, path, parm, query, frag) = urlparse.urlparse(url)
+ path = os.path.normpath(path)
+ if '@' in host and auth_handler and scheme in ['http', 'https']:
+ try:
+ # should we be using urllib.splituser and splitpasswd instead?
+ user_password, host = string.split(host, '@', 1)
+ user, password = string.split(user_password, ':', 1)
+ except ValueError, e:
+ raise URLGrabError(1, _('Bad URL: %s') % url)
+ if DEBUG: print 'adding HTTP auth: %s, %s' % (user, password)
+ auth_handler.add_password(None, host, user, password)
+
+ parts = (scheme, host, path, parm, query, frag)
+ return urlparse.urlunparse(parts), parts
+
+def _get_raw_throttle(throttle, bandwidth):
+ if throttle == None: throttle = _throttle
+ if throttle <= 0: raw_throttle = 0
+ elif type(throttle) == type(0): raw_throttle = float(throttle)
+ else: # throttle is a float
+ if bandwidth == None: bandwidth = _bandwidth
+ raw_throttle = bandwidth * throttle
+ return raw_throttle
+
def _do_open(url):
    """initiate the connection & get the headers
    return the file object and header object

    All low-level failures (bad url, IO/OS errors, http protocol errors)
    are translated into URLGrabError so callers only need to catch one
    exception type.
    """
    try:
        fo = urllib2.urlopen(url)
        hdr = fo.info()
    except ValueError, e:
        # urllib2 raises ValueError for malformed/unknown url types
        raise URLGrabError(1, _('Bad URL: %s') % (e, ))
    except IOError, e:
        raise URLGrabError(4, _('IOError: %s') % (e, ))
    except OSError, e:
        raise URLGrabError(5, _('OSError: %s') % (e, ))
    except HTTPException, e:
        raise URLGrabError(7, _('HTTP Error (%s): %s') % \
              (e.__class__.__name__, e))

    # OK, this "cute little hack" may have outlived its usefulness.
    # the role of urlgrabber is expanding and we're wanting it to handle
    # things (like cgi output) that this is preventing.  For now, I'm
    # simply going to comment it out and see what breaks.

    # this is a cute little hack - if there isn't a "Content-Length"
    # header then its probably something generated dynamically, such
    # as php, cgi, a directory listing, or an error message.  It is
    # probably not what we want.
    #if have_urllib2 or scheme != 'file':
    #    # urllib does not provide content-length for local files
    #    if not hdr is None and not hdr.has_key('Content-Length'):
    #        raise URLGrabError(6, _('ERROR: Url Return no Content-Length - something is wrong'))

    return fo, hdr
+
+_last_modified_format = '%a, %d %b %Y %H:%M:%S %Z'
+def _do_grab(filename, fo, hdr):
+ """dump the file to filename"""
+ new_fo = open(filename, 'wb')
+ bs = 1024*8
+ size = 0
+
+ block = fo.read(bs)
+ size = size + len(block)
+ while block:
+ new_fo.write(block)
+ block = fo.read(bs)
+ size = size + len(block)
+
+ new_fo.close()
+
+ try:
+ modified_tuple = hdr.getdate_tz('last-modified')
+ modified_stamp = rfc822.mktime_tz(modified_tuple)
+ os.utime(filename, (modified_stamp, modified_stamp))
+ except (TypeError,), e: pass
+
+ return size
+
+#####################################################################
+# TESTING
+def _main_test():
+ import sys
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ set_throttle(1.0)
+ set_bandwidth(32 * 1024)
+ print "throttle: %s, throttle bandwidth: %s B/s" % (_throttle, _bandwidth)
+
+ try: from progress_meter import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ try: name = apply(urlgrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+
+def _speed_test():
+ #### speed test --- see comment below
+ import sys
+
+ full_times = []
+ raw_times = []
+ set_throttle(2**40) # throttle to 1 TB/s :)
+
+ try:
+ from progress_meter import text_progress_meter
+ except ImportError, e:
+ tpm = None
+ print 'not using progress meter'
+ else:
+ tpm = text_progress_meter(fo=open('/dev/null', 'w'))
+
+ # to address concerns that the overhead from the progress meter
+ # and throttling slow things down, we do this little test. Make
+ # sure /tmp/test holds a sanely-sized file (like .2 MB)
+ #
+ # using this test, you get the FULL overhead of the progress
+ # meter and throttling, without the benefit: the meter is directed
+ # to /dev/null and the throttle bandwidth is set EXTREMELY high.
+ #
+ # note: it _is_ even slower to direct the progress meter to a real
+ # tty or file, but I'm just interested in the overhead from _this_
+ # module.
+
+ # get it nicely cached before we start comparing
+ print 'pre-caching'
+ for i in range(100):
+ urlgrab('file:///tmp/test', '/tmp/test2',
+ copy_local=1)
+
+ reps = 1000
+ for i in range(reps):
+ print '\r%4i/%-4i' % (i, reps),
+ sys.stdout.flush()
+ t = time.time()
+ urlgrab('file:///tmp/test', '/tmp/test2',
+ copy_local=1, progress_obj=tpm)
+ full_times.append(1000 * (time.time() - t))
+
+ t = time.time()
+ urlgrab('file:///tmp/test', '/tmp/test2',
+ copy_local=1, progress_obj=None)
+ raw_times.append(1000* (time.time() - t))
+ print '\r'
+
+ full_times.sort()
+ full_mean = 0.0
+ for i in full_times: full_mean = full_mean + i
+ full_mean = full_mean/len(full_times)
+ print '[full] mean: %.3f ms, median: %.3f ms, min: %.3f ms, max: %.3f ms' % \
+ (full_mean, full_times[int(len(full_times)/2)], min(full_times),
+ max(full_times))
+
+ raw_times.sort()
+ raw_mean = 0.0
+ for i in raw_times: raw_mean = raw_mean + i
+ raw_mean = raw_mean/len(raw_times)
+ print '[raw] mean: %.3f ms, median: %.3f ms, min: %.3f ms, max: %.3f ms' % \
+ (raw_mean, raw_times[int(len(raw_times)/2)], min(raw_times),
+ max(raw_times))
+
+ close_all()
+
+def _retry_test():
+ import sys
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ try: from progress_meter import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ global DEBUG
+ #DEBUG = 1
+ def cfunc(filename, hello, there='foo'):
+ print hello, there
+ import random
+ rnum = random.random()
+ if rnum < .5:
+ print 'forcing retry'
+ raise URLGrabError(-1, 'forcing retry')
+ if rnum < .75:
+ print 'forcing failure'
+ raise URLGrabError(-2, 'forcing immediate failure')
+ print 'success'
+ return
+
+ close_all()
+ kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
+ try: name = apply(retrygrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+def _file_object_test(filename=None):
+ import random, cStringIO, sys
+ if filename is None:
+ filename = __file__
+ print 'using file "%s" for comparisons' % filename
+ fo = open(filename)
+ s_input = fo.read()
+ fo.close()
+
+ for testfunc in [_test_file_object_smallread,
+ _test_file_object_readall,
+ _test_file_object_readline,
+ _test_file_object_readlines]:
+ fo_input = cStringIO.StringIO(s_input)
+ fo_output = cStringIO.StringIO()
+ wrapper = URLGrabberFileObject(fo_input, None, 0)
+ print 'testing %-30s ' % testfunc.__name__,
+ testfunc(wrapper, fo_output)
+ s_output = fo_output.getvalue()
+ if s_output == s_input: print 'passed'
+ else: print 'FAILED'
+
+def _test_file_object_smallread(wrapper, fo_output):
+ while 1:
+ s = wrapper.read(23)
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readall(wrapper, fo_output):
+ s = wrapper.read()
+ fo_output.write(s)
+
+def _test_file_object_readline(wrapper, fo_output):
+ while 1:
+ s = wrapper.readline()
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readlines(wrapper, fo_output):
+ li = wrapper.readlines()
+ fo_output.write(string.join(li, ''))
+
if __name__ == '__main__':
    # run the basic command-line grab by default; swap the comments to
    # run one of the other self-tests instead
    _main_test()
    #_speed_test()
    #_retry_test()
    #_file_object_test()
+
diff --git a/urlgrabber/keepalive.py b/urlgrabber/keepalive.py
new file mode 100644
index 0000000..08bddd8
--- /dev/null
+++ b/urlgrabber/keepalive.py
@@ -0,0 +1,379 @@
+#!/usr/bin/python2
+"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive.
+
+>>> import urllib2
+>>> from keepalive import HTTPHandler
+>>> keepalive_handler = HTTPHandler()
+>>> opener = urllib2.build_opener(keepalive_handler)
+>>> urllib2.install_opener(opener)
+>>>
+>>> fo = urllib2.urlopen('http://www.python.org')
+
+To remove the handler, simply re-run build_opener with no arguments, and
+install that opener.
+
+You can explicitly close connections by using the close_connection()
+method of the returned file-like object (described below) or you can
+use the handler methods:
+
+ close_connection(host)
+ close_all()
+ open_connections()
+
+>>> keepalive_handler.close_all()
+
+EXTRA ATTRIBUTES AND METHODS
+
+ Upon a status of 200, the object returned has a few additional
+ attributes and methods, which should not be used if you want to
+ remain consistent with the normal urllib2-returned objects:
+
+ close_connection() - close the connection to the host
+ readlines() - you know, readlines()
+ status - the return status (ie 404)
    reason - English translation of status (i.e. 'File not found')
+
+ If you want the best of both worlds, use this inside an
+ AttributeError-catching try:
+
+ >>> try: status = fo.status
+ >>> except AttributeError: status = None
+
+ Unfortunately, these are ONLY there if status == 200, so it's not
+ easy to distinguish between non-200 responses. The reason is that
+ urllib2 tries to do clever things with error codes 301, 302, 401,
+ and 407, and it wraps the object upon return.
+
+ You can optionally set the module-level global HANDLE_ERRORS to 0,
+ in which case the handler will always return the object directly.
+ If you like the fancy handling of errors, don't do this. If you
+ prefer to see your error codes, then do.
+
+"""
+
+import urllib2
+import httplib
+import socket
+
+VERSION = (0, 1)
+#STRING_VERSION = '.'.join(map(str, VERSION))
+DEBUG = 0
+HANDLE_ERRORS = 1
+
class HTTPHandler(urllib2.HTTPHandler):
    """urllib2-style HTTP handler that keeps connections alive

    Open connections are cached per host ('host:port' string) in
    self._connections and re-used for later requests when possible.
    """
    def __init__(self):
        # map: host spec -> open connection object
        self._connections = {}

    def close_connection(self, host):
        """close connection to <host>
        host is the host:port spec, as in 'www.cnn.com:8080' as passed in.
        no error occurs if there is no connection to that host."""
        self._remove_connection(host, close=1)

    def open_connections(self):
        """return a list of connected hosts"""
        return self._connections.keys()

    def close_all(self):
        """close all open connections"""
        for host, conn in self._connections.items():
            conn.close()
        self._connections = {}

    def _remove_connection(self, host, close=0):
        # drop <host> from the cache, optionally closing its socket too
        if self._connections.has_key(host):
            if close: self._connections[host].close()
            del self._connections[host]

    def _start_connection(self, h, req):
        # send the request line, headers and (for POST) the body on
        # connection <h>; the response is read later, in do_open
        try:
            if req.has_data():
                data = req.get_data()
                h.putrequest('POST', req.get_selector())
                if not req.headers.has_key('Content-type'):
                    h.putheader('Content-type',
                                'application/x-www-form-urlencoded')
                if not req.headers.has_key('Content-length'):
                    h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', req.get_selector())
        except socket.error, err:
            raise urllib2.URLError(err)

        for args in self.parent.addheaders:
            h.putheader(*args)
        for k, v in req.headers.items():
            h.putheader(k, v)
        h.endheaders()
        if req.has_data():
            h.send(data)

    def do_open(self, http_class, req):
        """perform the request, re-using a cached connection when possible

        Falls back to a fresh connection if the cached one has gone stale
        (e.g. the server closed it since last use)."""
        host = req.get_host()
        if not host:
            raise urllib2.URLError('no host given')

        try:
            need_new_connection = 1
            h = self._connections.get(host)
            if not h is None:
                # we have a cached connection for this host - try it
                try:
                    self._start_connection(h, req)
                except socket.error, e:
                    r = None
                else:
                    try: r = h.getresponse()
                    except httplib.ResponseNotReady, e: r = None

                if r is None or r.version == 9:
                    # httplib falls back to assuming HTTP 0.9 if it gets a
                    # bad header back.  This is most likely to happen if
                    # the socket has been closed by the server since we
                    # last used the connection.
                    if DEBUG: print "failed to re-use connection to %s" % host
                    h.close()
                else:
                    if DEBUG: print "re-using connection to %s" % host
                    need_new_connection = 0
            if need_new_connection:
                if DEBUG: print "creating new connection to %s" % host
                h = http_class(host)
                self._connections[host] = h
                self._start_connection(h, req)
                r = h.getresponse()
        except socket.error, err:
            raise urllib2.URLError(err)

        # if not a persistent connection, don't try to reuse it
        if r.will_close: self._remove_connection(host)

        if DEBUG:
            print "STATUS: %s, %s" % (r.status, r.reason)
        # stash handler/host info on the response so its
        # close_connection() method can find us again
        r._handler = self
        r._host = host
        r._url = req.get_full_url()

        if r.status == 200 or not HANDLE_ERRORS:
            return r
        else:
            return self.parent.error('http', req, r, r.status, r.reason, r.msg)

    def http_open(self, req):
        # entry point called by urllib2 for http:// urls
        return self.do_open(HTTPConnection, req)
+
class HTTPResponse(httplib.HTTPResponse):

    # we need to subclass HTTPResponse in order to
    # 1) add readline() and readlines() methods
    # 2) add close_connection() methods
    # 3) add info() and geturl() methods

    # in order to add readline(), read must be modified to deal with a
    # buffer.  example: readline must read a buffer and then spit back
    # one line at a time.  The only real alternative is to read one
    # BYTE at a time (ick).  Once something has been read, it can't be
    # put back (ok, maybe it can, but that's even uglier than this),
    # so if you THEN do a normal read, you must first take stuff from
    # the buffer.

    # the read method wraps the original to accommodate buffering,
    # although read() never adds to the buffer.
    # Both readline and readlines have been stolen with almost no
    # modification from socket.py


    def __init__(self, sock, debuglevel=0, strict=0, method=None):
        if method: # the httplib in python 2.3 uses the method arg
            httplib.HTTPResponse.__init__(self, sock, debuglevel, method)
        else: # 2.2 doesn't
            httplib.HTTPResponse.__init__(self, sock, debuglevel)
        self.fileno = sock.fileno
        self._rbuf = ''          # local read-ahead buffer (for readline)
        self._rbufsize = 8096    # how much to read at a time
        self._handler = None     # inserted by the handler later
        self._host = None        # (same)
        self._url = None         # (same)

    # keep a reference to the unbuffered read so our read() can call it
    _raw_read = httplib.HTTPResponse.read

    def close_connection(self):
        """close this response AND the connection it came from"""
        self.close()
        self._handler._remove_connection(self._host, close=1)

    def info(self):
        """return the response headers, like a urllib2 file object"""
        return self.msg

    def geturl(self):
        """return the url this response was fetched from"""
        return self._url

    def read(self, amt=None):
        # the _rbuf test is only in this first if for speed.  It's not
        # logically necessary
        if self._rbuf and not amt is None:
            L = len(self._rbuf)
            if amt > L:
                amt -= L
            else:
                s = self._rbuf[:amt]
                self._rbuf = self._rbuf[amt:]
                return s

        s = self._rbuf + self._raw_read(amt)
        self._rbuf = ''
        return s

    def readline(self, limit=-1):
        """read one line; if limit >= 0, return at most 'limit' bytes"""
        data = ""
        i = self._rbuf.find('\n')
        while i < 0 and not (0 < limit <= len(self._rbuf)):
            new = self._raw_read(self._rbufsize)
            if not new: break
            i = new.find('\n')
            if i >= 0: i = i + len(self._rbuf)
            self._rbuf = self._rbuf + new
        if i < 0: i = len(self._rbuf)
        else: i = i+1
        # truncate to the limit only when it would shorten the line; the
        # old test (0 <= limit < len(self._rbuf)) could return bytes PAST
        # the newline whenever i <= limit < len(self._rbuf)
        if 0 <= limit < i: i = limit
        data, self._rbuf = self._rbuf[:i], self._rbuf[i:]
        return data

    def readlines(self, sizehint = 0):
        """read lines until EOF (or until at least sizehint bytes are read)"""
        total = 0
        list = []
        while 1:
            line = self.readline()
            if not line: break
            list.append(line)
            total += len(line)
            if sizehint and total >= sizehint:
                break
        return list
+
+
class HTTPConnection(httplib.HTTPConnection):
    # use the modified response class so responses gain readline(),
    # readlines(), info(), geturl() and close_connection()
    response_class = HTTPResponse
+
+#########################################################################
+##### TEST FUNCTIONS
+#########################################################################
+
+def error_handler(url):
+ global HANDLE_ERRORS
+ orig = HANDLE_ERRORS
+ keepalive_handler = HTTPHandler()
+ opener = urllib2.build_opener(keepalive_handler)
+ urllib2.install_opener(opener)
+ pos = {0: 'off', 1: 'on'}
+ for i in (0, 1):
+ print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i)
+ HANDLE_ERRORS = i
+ try:
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ try: status, reason = fo.status, fo.reason
+ except AttributeError: status, reason = None, None
+ except IOError, e:
+ print " EXCEPTION: %s" % e
+ raise
+ else:
+ print " status = %s, reason = %s" % (status, reason)
+ HANDLE_ERRORS = orig
+ hosts = keepalive_handler.open_connections()
+ print "open connections:", ' '.join(hosts)
+ keepalive_handler.close_all()
+
+def continuity(url):
+ import md5
+ format = '%25s: %s'
+
+ # first fetch the file with the normal http handler
+ opener = urllib2.build_opener()
+ urllib2.install_opener(opener)
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ m = md5.new(foo)
+ print format % ('normal urllib', m.hexdigest())
+
+ # now install the keepalive handler and try again
+ opener = urllib2.build_opener(HTTPHandler())
+ urllib2.install_opener(opener)
+
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ m = md5.new(foo)
+ print format % ('keepalive read', m.hexdigest())
+
+ fo = urllib2.urlopen(url)
+ foo = ''
+ while 1:
+ f = fo.readline()
+ if f: foo = foo + f
+ else: break
+ fo.close()
+ m = md5.new(foo)
+ print format % ('keepalive readline', m.hexdigest())
+
+def comp(N, url):
+ print ' making %i connections to:\n %s' % (N, url)
+
+ sys.stdout.write(' first using the normal urllib handlers')
+ # first use normal opener
+ opener = urllib2.build_opener()
+ urllib2.install_opener(opener)
+ t1 = fetch(N, url)
+ print ' TIME: %.3f s' % t1
+
+ sys.stdout.write(' now using the keepalive handler ')
+ # now install the keepalive handler and try again
+ opener = urllib2.build_opener(HTTPHandler())
+ urllib2.install_opener(opener)
+ t2 = fetch(N, url)
+ print ' TIME: %.3f s' % t2
+ print ' improvement factor: %.2f' % (t1/t2, )
+
+def fetch(N, url, delay=0):
+ lens = []
+ starttime = time.time()
+ for i in range(N):
+ if delay and i > 0: time.sleep(delay)
+ fo = urllib2.urlopen(url)
+ foo = fo.read()
+ fo.close()
+ lens.append(len(foo))
+ diff = time.time() - starttime
+
+ j = 0
+ for i in lens[1:]:
+ j = j + 1
+ if not i == lens[0]:
+ print "WARNING: inconsistent length on read %i: %i" % (j, i)
+
+ return diff
+
+def test(url, N=10):
+ print "checking error hander (do this on a non-200)"
+ try: error_handler(url)
+ except IOError, e:
+ print "exiting - exception will prevent further tests"
+ sys.exit()
+ print
+ print "performing continuity test (making sure stuff isn't corrupted)"
+ continuity(url)
+ print
+ print "performing speed comparison"
+ comp(N, url)
+
+if __name__ == '__main__':
+ import time
+ import sys
+ try:
+ N = int(sys.argv[1])
+ url = sys.argv[2]
+ except:
+ print "%s <integer> <url>" % sys.argv[0]
+ else:
+ test(url, N)