diff options
author | mstenner <mstenner> | 2003-10-12 02:03:27 +0000 |
---|---|---|
committer | mstenner <mstenner> | 2003-10-12 02:03:27 +0000 |
commit | ede1c014b207ba53625a36f43f7dabf85d936936 (patch) | |
tree | 0ca30ee83d0aea46f9595ed979c5733a4d96db62 | |
download | urlgrabber-ede1c014b207ba53625a36f43f7dabf85d936936.tar.gz |
Initial revision
-rw-r--r-- | LICENSE | 280 | ||||
-rw-r--r-- | MANIFEST.in | 3 | ||||
-rw-r--r-- | README | 19 | ||||
-rw-r--r-- | makefile | 12 | ||||
-rw-r--r-- | progress_meter.py | 161 | ||||
-rw-r--r-- | setup.py | 9 | ||||
-rw-r--r-- | urlgrabber.py | 752 | ||||
-rw-r--r-- | urlgrabber/keepalive.py | 379 |
8 files changed, 1615 insertions, 0 deletions
@@ -0,0 +1,280 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 675 Mass Ave, Cambridge, MA 02139, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e1f9744 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include urlgrabber.py keepalive.py progress_meter.py +include README +include LICENSE @@ -0,0 +1,19 @@ +urlgrabber -- A high-level cross-protocol url-grabber + +INSTALLATION INSTRUCTIONS + +If you want to install urlgrabber on your system, simply open the package +and run + + python setup.py install + +Take a look at the install options by doing + + python setup.py --help + +If you are on a Red Hat Linux machine that still uses python 1.5.x by +default, you may need to run python2 in place of python above. + +If you just want to use urlgrabber.py (and/or keepalive.py) in your +own programs, simply grab the file(s) and put it with the rest of your +program's modules. 
diff --git a/makefile b/makefile new file mode 100644 index 0000000..bd07f82 --- /dev/null +++ b/makefile @@ -0,0 +1,12 @@ +RM = /bin/rm -f +WEBHOST = teton.dulug.duke.edu +WEBPATH = /var/www/linuxduke/projects/mini/urlgrabber + +dist: + python2 setup.py sdist --force-manifest + scp dist/* $(WEBHOST):$(WEBPATH)/dist/ + +clean: + $(RM) MANIFEST + $(RM) -r dist/ + $(RM) *.pyc diff --git a/progress_meter.py b/progress_meter.py new file mode 100644 index 0000000..506b107 --- /dev/null +++ b/progress_meter.py @@ -0,0 +1,161 @@ +import sys +import time + +class text_progress_meter: + def __init__(self, fo=sys.stderr): + self.fo = fo + self.update_period = 0.3 # seconds + + def start(self, filename, url, basename, length): + self.filename = filename + self.url = url + self.basename = basename + self.length = length + if not length == None: + self.flength = self.format_number(length) + 'B' + self.start_time = time.time() + self.last_update = 0 + self.read = 0 + self._do_start() + + def _do_start(self): + pass + + def end(self): + self.now = time.time() + self._do_end() + + def _do_end(self): + total_time = self.format_time(self.now - self.start_time) + total_size = self.format_number(self.read) + if self.length is None: + out = '\r%-60.60s %5sB %s ' % \ + (self.basename, total_size, total_time) + else: + bar = '='*25 + out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ' % \ + (self.basename, 100, bar, total_size, total_time) + self.fo.write(out) + self.fo.write('\n') + self.fo.flush() + + def update(self, read): + # for a real gui, you probably want to override and put a call + # to your mainloop iteration function here + self.read = read # put this here so it's caught for self.end + now = time.time() + if (now >= self.last_update + self.update_period) or \ + not self.last_update: + self.now = now + self._do_update(read) + self.last_update = now + + def _do_update(self, read): + # elapsed time since last update + etime = self.now - self.start_time + fetime = 
self.format_time(etime) + fread = self.format_number(read) + + #self.length = None + if self.length is None: + out = '\r%-60.60s %5sB %s ' % \ + (self.basename, fread, fetime) + else: + rtime = self.format_time(self.project(etime, read)) + try: frac = float(read)/self.length + except ZeroDivisionError, e: frac = 1.0 + if frac > 1.0: frac = 1.0 + bar = '='*int(25 * frac) + out = '\r%-25.25s %3i%% |%-25.25s| %5sB %8s ETA ' % \ + (self.basename, frac*100, bar, fread, rtime) + self.fo.write(out) + self.fo.flush() + + def project(self, etime, read): + # get projected time for total download + if read == 0: + # if we just started this file, all bets are off + self.last_etime = etime + self.last_read = 0 + self.ave_rate = None + return None + + time_diff = etime - self.last_etime + read_diff = read - self.last_read + self.last_etime = etime + self.last_read = read + try: rate = time_diff / read_diff ## this is actually an inverse-rate + except ZeroDivisionError: return 0 ## should only happen at end of file + + self._get_new_ave_rate(rate) + remaining_time = self.ave_rate * (self.length - read) + if remaining_time < 0: remaining_time = 0 + return self._round_remaining_time(remaining_time) + + def _get_new_ave_rate(self, rate, epsilon=0.98): + if self.ave_rate == None: + self.ave_rate = rate + else: + # calculate a "rolling average" - this balances long-term behavior + # with short-term fluctuations + # epsilon = 0.0 --> only consider most recent block + # epsilon = 1.0 --> only consider first block + self.ave_rate = (self.ave_rate * epsilon) + (rate * (1-epsilon)) + + def _round_remaining_time(self, remaining_time): + # round to further stabilize it + i = 1 + while remaining_time > 30: + i = i * 2 + remaining_time = remaining_time / 2 + remaining_time = int(remaining_time) + return float(remaining_time * i) + + def format_time(self, seconds): + if seconds is None or seconds < 0: + return '--:--' + else: + seconds = int(seconds) + minutes = seconds / 60 + seconds = seconds 
% 60 + return '%02i:%02i' % (minutes, seconds) + + def format_number(self, number, SI=0, space=' '): + """Turn numbers into human-readable metric-like numbers""" + symbols = ['', # (none) + 'k', # kilo + 'M', # mega + 'G', # giga + 'T', # tera + 'P', # peta + 'E', # exa + 'Z', # zetta + 'Y'] # yotta + + if SI: step = 1000.0 + else: step = 1024.0 + + thresh = 999 + depth = 0 + + # we want numbers between + while number > thresh: + depth = depth + 1 + number = number / step + + # just in case someone needs more than 1000 yottabytes! + diff = depth - len(symbols) + 1 + if diff > 0: + depth = depth - diff + number = number * thresh**depth + + if type(number) == type(1) or type(number) == type(1L): + format = '%i%s%s' + elif number < 9.95: + # must use 9.95 for proper sizing. For example, 9.99 will be + # rounded to 10.0 with the .1f format string (which is too long) + format = '%.1f%s%s' + else: + format = '%.0f%s%s' + + return(format % (number, space, symbols[depth])) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8310997 --- /dev/null +++ b/setup.py @@ -0,0 +1,9 @@ +from distutils.core import setup +setup(name="urlgrabber", + version="0.2", + description="high-level cross-protocol url-grabber", + author="Michael D. Stenner", + author_email="mstenner@phy.duke.edu", + url="http://linux.duke.edu/projects/mini/urlgrabber/", + license="GPL", + py_modules=["urlgrabber", "keepalive", "progress_meter"]) diff --git a/urlgrabber.py b/urlgrabber.py new file mode 100644 index 0000000..156d85a --- /dev/null +++ b/urlgrabber.py @@ -0,0 +1,752 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Library General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +# Copyright 2002-2003 Michael D. Stenner + +import os +import os.path +import urlparse +import rfc822 +import time +import string + +DEBUG=0 +VERSION='0.2' + +try: + from i18n import _ +except ImportError, msg: + def _(st): return st + +try: + from httplib import HTTPException +except ImportError, msg: + HTTPException = None + +special_handlers = [] + +try: + import urllib2 +except ImportError, msg: + import urllib + urllib._urlopener = urllib.FancyURLopener() # make sure it ready now + urllib2 = urllib # this way, we can always just do urllib.urlopen() + have_urllib2 = 0 + auth_handler = None +else: + have_urllib2 = 1 + auth_handler = urllib2.HTTPBasicAuthHandler( \ + urllib2.HTTPPasswordMgrWithDefaultRealm()) + special_handlers.append(auth_handler) + +try: + # This is a convenient way to make keepalive optional. + # Just rename the module so it can't be imported. 
+ from keepalive import HTTPHandler +except ImportError, msg: + keepalive_handler = None +else: + keepalive_handler = HTTPHandler() + special_handlers.append(keepalive_handler) + +if have_urllib2: + opener = apply(urllib2.build_opener, special_handlers) + urllib2.install_opener(opener) + +def set_user_agent(new_user_agent): + if have_urllib2: addheaders = opener.addheaders + else: addheaders = urllib._urlopener.addheaders + + new_tuple = ('User-agent', new_user_agent) + + for i in range(len(addheaders)): + if addheaders[i][0] == 'User-agent': + addheaders[i] = new_tuple + break + else: + addheaders.append(new_tuple) + +# the calling application can override this user-agent by calling +# urlgrabber.set_user_agent +set_user_agent('urlgrabber/%s' % VERSION) + +class URLGrabError(IOError): + """ + URLGrabError error codes: + -1 - default retry code for retrygrab check functions + 0 - everything looks good (you should never see this) + 1 - malformed url + 2 - local file doesn't exist + 3 - request for non-file local file (dir, etc) + 4 - IOError on fetch + 5 - OSError on fetch + 6 - no content length header when we expected one + 7 - HTTPException + 8 - Exceeded read limit (for urlread) + + Negative codes are reserved for use by functions passed in to + retrygrab with checkfunc. + + You can use it like this: + try: urlgrab(url) + except URLGrabError, e: + if e.errno == 3: ... + # or + print e.strerror + # or simply + print e #### print '[Errno %i] %s' % (e.errno, e.strerror) + """ + pass + +def close_all(): + """close any open keepalive connections""" + if keepalive_handler: keepalive_handler.close_all() + +_throttle = 1.0 +_bandwidth = 0 +def set_throttle(new_throttle): + """urlgrab supports throttling via two values: throttle and bandwidth + Between the two, you can either specify and absolute throttle threshold + or specify a theshold as a fraction of maximum available bandwidth. + + throttle is a number - if it's an int, it's the bytes/second throttle + limit. 
If it's a float, it is first multiplied by bandwidth. If + throttle == 0, throttling is disabled. If None, the module-level + default (which can be set with set_throttle) is used. + + bandwidth is the nominal max bandwidth in bytes/second. If throttle + is a float and bandwidth == 0, throttling is disabled. If None, + the module-level default (which can be set with set_bandwidth) is + used. + + EXAMPLES: + + Let's say you have a 100 Mbps connection. This is (about) 10^8 bits + per second, or 12,500,000 Bytes per second. You have a number of + throttling options: + + *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float + + This will limit urlgrab to use half of your available bandwidth. + + *) set_throttle(6250000) # throttle is an int + + This will also limit urlgrab to use half of your available + bandwidth, regardless of what bandwidth is set to. + + *) set_bandwidth(6250000); set_throttle(1.0) # float + + Use half your bandwidth + + *) set_bandwidth(6250000); set_throttle(2.0) # float + + Use up to 12,500,000 Bytes per second (your nominal max bandwidth) + + *) set_bandwidth(6250000); set_throttle(0) # throttle = 0 + + Disable throttling - this is more efficient than a very large + throttle setting. + + *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0 + + Disable throttling - this is the default when the module is loaded. + + + SUGGESTED AUTHOR IMPLEMENTATION + + While this is flexible, it's not extremely obvious to the user. I + suggest you implement a float throttle as a percent to make the + distinction between absolute and relative throttling very explicit. + + Also, you may want to convert the units to something more convenient + than bytes/second, such as kbps or kB/s, etc.
+ """ + global _throttle + _throttle = new_throttle + +def set_bandwidth(new_bandwidth): + global _bandwidth + _bandwidth = new_bandwidth + +_progress_obj = None +def set_progress_obj(new_progress_obj): + global _progress_obj + _progress_obj = new_progress_obj + + +def retrygrab(url, filename=None, copy_local=0, close_connection=0, + progress_obj=None, throttle=None, bandwidth=None, + numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None): + """a wrapper function for urlgrab that retries downloads + + The args for retrygrab are the same as urlgrab except for numtries, + retrycodes, and checkfunc. You should use keyword arguments for + both in case new args are added to urlgrab later. If you use keyword + args (especially for the retrygrab-specific options) then retrygrab + will continue to be a drop-in replacement for urlgrab. Otherwise, + things may break. + + retrygrab exits just like urlgrab in either case. Either it + returns the local filename or it raises an exception. The + exception raised will be the one raised MOST RECENTLY by urlgrab. + + retrygrab ONLY retries if URLGrabError is raised. If urlgrab (or + checkfunc) raise some other exception, it will be passed up + immediately. + + numtries + number of times to retry the grab before bailing. If this is + zero, it will retry forever. This was intentional... really, + it was :) + + retrycodes + the errorcodes (values of e.errno) for which it should retry. + See the doc on URLGrabError for more details on this. + + checkfunc + a function to do additional checks. This defaults to None, + which means no additional checking. The function should simply + return on a successful check. It should raise URLGrabError on + and unsuccessful check. Raising of any other exception will + be considered immediate failure and no retries will occur. + + Negative error numbers are reserved for use by these passed in + functions. By default, -1 results in a retry, but this can be + customized with retrycodes. 
+ + If you simply pass in a function, it will be given exactly one + argument: the local file name as returned by urlgrab. If you + need to pass in other arguments, you can do so like this: + + checkfunc=(function, ('arg1', 2), {'kwarg': 3}) + + if the downloaded file as filename /tmp/stuff, then this will + result in this call: + + function('/tmp/stuff', 'arg1', 2, kwarg=3) + + NOTE: both the "args" tuple and "kwargs" dict must be present + if you use this syntax, but either (or both) can be empty. + """ + + tries = 0 + if not checkfunc is None: + if callable(checkfunc): + func, args, kwargs = checkfunc, (), {} + else: + func, args, kwargs = checkfunc + else: + func = None + + while 1: + tries = tries + 1 + if DEBUG: print 'TRY #%i: %s' % (tries, url) + try: + fname = urlgrab(url, filename, copy_local, close_connection, + progress_obj, throttle, bandwidth) + if not func is None: apply(func, (fname, )+args, kwargs) + if DEBUG: print 'RESULT = success (%s)' % fname + return fname + except URLGrabError, e: + if DEBUG: print 'EXCEPTION: %s' % e + if tries == numtries or (e.errno not in retrycodes): raise + +def urlgrab(url, filename=None, copy_local=0, close_connection=0, + progress_obj=None, throttle=None, bandwidth=None): + """grab the file at <url> and make a local copy at <filename> + + If filename is none, the basename of the url is used. + + copy_local is ignored except for file:// urls, in which case it + specifies whether urlgrab should still make a copy of the file, or + simply point to the existing copy. + + close_connection tells urlgrab to close the connection after + completion. This is ignored unless the download happens with the + http keepalive handler. Otherwise, the connection is left open + for further use. 
+ + progress_obj is a class instance that supports the following methods: + po.start(filename, url, basename, length) + # length will be None if unknown + po.update(read) # read == bytes read so far + po.end() + + throttle is a number - if it's an int, it's the bytes/second throttle + limit. If it's a float, it is first multiplied by bandwidth. If + throttle == 0, throttling is disabled. If None, the module-level + default (which can be set with set_throttle) is used. + + bandwidth is the nominal max bandwidth in bytes/second. If throttle + is a float and bandwidth == 0, throttling is disabled. If None, + the module-level default (which can be set with set_bandwidth) is + used. + + urlgrab returns the filename of the local file, which may be different + from the passed-in filename if copy_local == 0. + """ + + url, parts = _parse_url(url) + (scheme, host, path, parm, query, frag) = parts + + if filename == None: + filename = os.path.basename(path) + if scheme == 'file' and not copy_local: + # just return the name of the local file - don't make a copy + # currently we don't do anything with the progress_cb here + if not os.path.exists(path): + raise URLGrabError(2, _('Local file does not exist: %s') % (path, )) + elif not os.path.isfile(path): + raise URLGrabError(3, _('Not a normal file: %s') % (path, )) + else: + return path + + raw_throttle = _get_raw_throttle(throttle, bandwidth) + if progress_obj is None: progress_obj = _progress_obj + elif not progress_obj: progress_obj = None + fo, hdr = _do_open(url) + + # download and store the file + try: + if progress_obj or raw_throttle: + if progress_obj: + try: length = int(hdr['Content-Length']) + except: length = None + progress_obj.start(filename, url, os.path.basename(path), length) + fo = URLGrabberFileObject(fo, progress_obj, raw_throttle) + _do_grab(filename, fo, hdr) + fo.close() + + if close_connection: + # try and close connection + try: fo.close_connection() + except AttributeError: pass + except IOError, e: 
+ raise URLGrabError(4, _('IOError: %s') % (e, )) + except OSError, e: + raise URLGrabError(5, _('OSError: %s') % (e, )) + except HTTPException, e: + raise URLGrabError(7, _('HTTP Error (%s): %s') % \ + (e.__class__.__name__, e)) + + return filename + +def urlopen(url, progress_obj=None, throttle=None, bandwidth=None): + """open the url and return a file object + + If a progress object or throttle specifications exist, then + a special file object will be returned that supports them. + The file object can be treated like any other file object. + """ + url, parts = _parse_url(url) + (scheme, host, path, parm, query, frag) = parts + raw_throttle = _get_raw_throttle(throttle, bandwidth) + if progress_obj is None: progress_obj = _progress_obj + fo, hdr = _do_open(url) + if progress_obj or raw_throttle: + if progress_obj: + try: length = int(hdr['Content-Length']) + except: length = None + progress_obj.start(None, url, os.path.basename(path), length) + fo = URLGrabberFileObject(fo, progress_obj, raw_throttle) + return fo + +def urlread(url, progress_obj=None, throttle=None, bandwidth=None, limit=None): + """read the url into a string, up to 'limit' bytes + + If the limit is exceeded, an exception will be thrown. Note that urlread + is NOT intended to be used as a way of saying "I want the first N bytes" + but rather 'read the whole file into memory, but don't use too much' + """ + fo = urlopen(url, progress_obj, throttle, bandwidth) + s = fo.read(limit+1) + fo.close() + if limit and len(s) > limit: + raise URLGrabError(8, _('Exceeded limit (%i): %s') % (limit, url)) + return s + +class URLGrabberFileObject: + """This is a file-object wrapper that supports progress objects and + throttling. + + This exists to solve the following problem: lets say you want to + drop-in replace a normal open with urlopen. You want to use a + progress meter and/or throttling, but how do you do that without + rewriting your code? 
Answer: urlopen will return a wrapped file + object that does the progress meter and-or throttling internally. + """ + + def __init__(self, fo, progress_obj, raw_throttle): + self.fo = fo + self.raw_throttle = raw_throttle + self.progress_obj = progress_obj + self._rbuf = '' + self._rbufsize = 1024*8 + self._ttime = time.time() + self._tsize = 0 + self._amount_read = 0 + if progress_obj: progress_obj.update(0) + + def __getattr__(self, name): + """This effectively allows us to wrap at the instance level. + Any attribute not found in _this_ object will be searched for + in self.fo. This includes methods.""" + if hasattr(self.fo, name): + return getattr(self.fo, name) + raise AttributeError, name + + def _fill_buffer(self, amt=None): + """fill the buffer to contain at least 'amt' bytes by reading + from the underlying file object. If amt is None, then it will + read until it gets nothing more. It updates the progress meter + and throttles after every self._rbufsize bytes.""" + # the _rbuf test is only in this first 'if' for speed. It's not + # logically necessary + if self._rbuf and not amt is None: + L = len(self._rbuf) + if amt > L: + amt = amt - L + else: + return + + # if we've made it here, then we don't have enough in the buffer + # and we need to read more. 
+ + buf = [self._rbuf] + bufsize = len(self._rbuf) + while amt is None or amt: + # first, delay if necessary for throttling reasons + if self.raw_throttle: + diff = self._tsize/self.raw_throttle - \ + (time.time() - self._ttime) + if diff > 0: time.sleep(diff) + self._ttime = time.time() + + # now read some data, up to self._rbufsize + if amt is None: readamount = self._rbufsize + else: readamount = min(amt, self._rbufsize) + new = self.fo.read(readamount) + newsize = len(new) + if not newsize: break # no more to read + + if amt: amt = amt - newsize + buf.append(new) + bufsize = bufsize + newsize + self._tsize = newsize + self._amount_read = self._amount_read + newsize + if self.progress_obj: + self.progress_obj.update(self._amount_read) + + self._rbuf = string.join(buf, '') + return + + def read(self, amt=None): + self._fill_buffer(amt) + if amt is None: + s, self._rbuf = self._rbuf, '' + else: + s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] + return s + + def readline(self, limit=-1): + i = string.find(self._rbuf, '\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + L = len(self._rbuf) + self._fill_buffer(L + self._rbufsize) + if not len(self._rbuf) > L: break + i = string.find(self._rbuf, '\n', L) + + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + + s, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return s + + def close(self): + if self.progress_obj: + self.progress_obj.end() + self.fo.close() + +def _parse_url(url): + """break up the url into its component parts + + This function disassembles a url and + 1) "normalizes" it, tidying it up a bit + 2) does any authentication stuff it needs to do + + it returns the (cleaned) url and a tuple of component parts + """ + (scheme, host, path, parm, query, frag) = urlparse.urlparse(url) + path = os.path.normpath(path) + if '@' in host and auth_handler and scheme in ['http', 'https']: + try: + # should we be using urllib.splituser and splitpasswd instead? 
+ user_password, host = string.split(host, '@', 1) + user, password = string.split(user_password, ':', 1) + except ValueError, e: + raise URLGrabError(1, _('Bad URL: %s') % url) + if DEBUG: print 'adding HTTP auth: %s, %s' % (user, password) + auth_handler.add_password(None, host, user, password) + + parts = (scheme, host, path, parm, query, frag) + return urlparse.urlunparse(parts), parts + +def _get_raw_throttle(throttle, bandwidth): + if throttle == None: throttle = _throttle + if throttle <= 0: raw_throttle = 0 + elif type(throttle) == type(0): raw_throttle = float(throttle) + else: # throttle is a float + if bandwidth == None: bandwidth = _bandwidth + raw_throttle = bandwidth * throttle + return raw_throttle + +def _do_open(url): + """initiate the connection & get the headers + return the file object and header object + """ + try: + fo = urllib2.urlopen(url) + hdr = fo.info() + except ValueError, e: + raise URLGrabError(1, _('Bad URL: %s') % (e, )) + except IOError, e: + raise URLGrabError(4, _('IOError: %s') % (e, )) + except OSError, e: + raise URLGrabError(5, _('OSError: %s') % (e, )) + except HTTPException, e: + raise URLGrabError(7, _('HTTP Error (%s): %s') % \ + (e.__class__.__name__, e)) + + # OK, this "cute little hack" may have outlived its usefulness. + # the role of urlgrabber is expaning and we're wanting it to handle + # things (like cgi output) that this is preventing. For now, I'm + # simply going to comment it out and see what breaks. + + # this is a cute little hack - if there isn't a "Content-Length" + # header then its probably something generated dynamically, such + # as php, cgi, a directory listing, or an error message. It is + # probably not what we want. 
+ #if have_urllib2 or scheme != 'file': + # # urllib does not provide content-length for local files + # if not hdr is None and not hdr.has_key('Content-Length'): + # raise URLGrabError(6, _('ERROR: Url Return no Content-Length - something is wrong')) + + return fo, hdr + +_last_modified_format = '%a, %d %b %Y %H:%M:%S %Z' +def _do_grab(filename, fo, hdr): + """dump the file to filename""" + new_fo = open(filename, 'wb') + bs = 1024*8 + size = 0 + + block = fo.read(bs) + size = size + len(block) + while block: + new_fo.write(block) + block = fo.read(bs) + size = size + len(block) + + new_fo.close() + + try: + modified_tuple = hdr.getdate_tz('last-modified') + modified_stamp = rfc822.mktime_tz(modified_tuple) + os.utime(filename, (modified_stamp, modified_stamp)) + except (TypeError,), e: pass + + return size + +##################################################################### +# TESTING +def _main_test(): + import sys + try: url, filename = sys.argv[1:3] + except ValueError: + print 'usage:', sys.argv[0], \ + '<url> <filename> [copy_local=0|1] [close_connection=0|1]' + sys.exit() + + kwargs = {} + for a in sys.argv[3:]: + k, v = string.split(a, '=', 1) + kwargs[k] = int(v) + + set_throttle(1.0) + set_bandwidth(32 * 1024) + print "throttle: %s, throttle bandwidth: %s B/s" % (_throttle, _bandwidth) + + try: from progress_meter import text_progress_meter + except ImportError, e: pass + else: kwargs['progress_obj'] = text_progress_meter() + + try: name = apply(urlgrab, (url, filename), kwargs) + except URLGrabError, e: print e + else: print 'LOCAL FILE:', name + + +def _speed_test(): + #### speed test --- see comment below + import sys + + full_times = [] + raw_times = [] + set_throttle(2**40) # throttle to 1 TB/s :) + + try: + from progress_meter import text_progress_meter + except ImportError, e: + tpm = None + print 'not using progress meter' + else: + tpm = text_progress_meter(fo=open('/dev/null', 'w')) + + # to address concerns that the overhead from the 
progress meter + # and throttling slow things down, we do this little test. Make + # sure /tmp/test holds a sanely-sized file (like .2 MB) + # + # using this test, you get the FULL overhead of the progress + # meter and throttling, without the benefit: the meter is directed + # to /dev/null and the throttle bandwidth is set EXTREMELY high. + # + # note: it _is_ even slower to direct the progress meter to a real + # tty or file, but I'm just interested in the overhead from _this_ + # module. + + # get it nicely cached before we start comparing + print 'pre-caching' + for i in range(100): + urlgrab('file:///tmp/test', '/tmp/test2', + copy_local=1) + + reps = 1000 + for i in range(reps): + print '\r%4i/%-4i' % (i, reps), + sys.stdout.flush() + t = time.time() + urlgrab('file:///tmp/test', '/tmp/test2', + copy_local=1, progress_obj=tpm) + full_times.append(1000 * (time.time() - t)) + + t = time.time() + urlgrab('file:///tmp/test', '/tmp/test2', + copy_local=1, progress_obj=None) + raw_times.append(1000* (time.time() - t)) + print '\r' + + full_times.sort() + full_mean = 0.0 + for i in full_times: full_mean = full_mean + i + full_mean = full_mean/len(full_times) + print '[full] mean: %.3f ms, median: %.3f ms, min: %.3f ms, max: %.3f ms' % \ + (full_mean, full_times[int(len(full_times)/2)], min(full_times), + max(full_times)) + + raw_times.sort() + raw_mean = 0.0 + for i in raw_times: raw_mean = raw_mean + i + raw_mean = raw_mean/len(raw_times) + print '[raw] mean: %.3f ms, median: %.3f ms, min: %.3f ms, max: %.3f ms' % \ + (raw_mean, raw_times[int(len(raw_times)/2)], min(raw_times), + max(raw_times)) + + close_all() + +def _retry_test(): + import sys + try: url, filename = sys.argv[1:3] + except ValueError: + print 'usage:', sys.argv[0], \ + '<url> <filename> [copy_local=0|1] [close_connection=0|1]' + sys.exit() + + kwargs = {} + for a in sys.argv[3:]: + k, v = string.split(a, '=', 1) + kwargs[k] = int(v) + + try: from progress_meter import text_progress_meter + except 
ImportError, e: pass + else: kwargs['progress_obj'] = text_progress_meter() + + global DEBUG + #DEBUG = 1 + def cfunc(filename, hello, there='foo'): + print hello, there + import random + rnum = random.random() + if rnum < .5: + print 'forcing retry' + raise URLGrabError(-1, 'forcing retry') + if rnum < .75: + print 'forcing failure' + raise URLGrabError(-2, 'forcing immediate failure') + print 'success' + return + + close_all() + kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'}) + try: name = apply(retrygrab, (url, filename), kwargs) + except URLGrabError, e: print e + else: print 'LOCAL FILE:', name + +def _file_object_test(filename=None): + import random, cStringIO, sys + if filename is None: + filename = __file__ + print 'using file "%s" for comparisons' % filename + fo = open(filename) + s_input = fo.read() + fo.close() + + for testfunc in [_test_file_object_smallread, + _test_file_object_readall, + _test_file_object_readline, + _test_file_object_readlines]: + fo_input = cStringIO.StringIO(s_input) + fo_output = cStringIO.StringIO() + wrapper = URLGrabberFileObject(fo_input, None, 0) + print 'testing %-30s ' % testfunc.__name__, + testfunc(wrapper, fo_output) + s_output = fo_output.getvalue() + if s_output == s_input: print 'passed' + else: print 'FAILED' + +def _test_file_object_smallread(wrapper, fo_output): + while 1: + s = wrapper.read(23) + fo_output.write(s) + if not s: return + +def _test_file_object_readall(wrapper, fo_output): + s = wrapper.read() + fo_output.write(s) + +def _test_file_object_readline(wrapper, fo_output): + while 1: + s = wrapper.readline() + fo_output.write(s) + if not s: return + +def _test_file_object_readlines(wrapper, fo_output): + li = wrapper.readlines() + fo_output.write(string.join(li, '')) + +if __name__ == '__main__': + _main_test() + #_speed_test() + #_retry_test() + #_file_object_test() + diff --git a/urlgrabber/keepalive.py b/urlgrabber/keepalive.py new file mode 100644 index 0000000..08bddd8 --- /dev/null +++ 
b/urlgrabber/keepalive.py @@ -0,0 +1,379 @@ +#!/usr/bin/python2 +"""An HTTP handler for urllib2 that supports HTTP 1.1 and keepalive. + +>>> import urllib2 +>>> from keepalive import HTTPHandler +>>> keepalive_handler = HTTPHandler() +>>> opener = urllib2.build_opener(keepalive_handler) +>>> urllib2.install_opener(opener) +>>> +>>> fo = urllib2.urlopen('http://www.python.org') + +To remove the handler, simply re-run build_opener with no arguments, and +install that opener. + +You can explicitly close connections by using the close_connection() +method of the returned file-like object (described below) or you can +use the handler methods: + + close_connection(host) + close_all() + open_connections() + +>>> keepalive_handler.close_all() + +EXTRA ATTRIBUTES AND METHODS + + Upon a status of 200, the object returned has a few additional + attributes and methods, which should not be used if you want to + remain consistent with the normal urllib2-returned objects: + + close_connection() - close the connection to the host + readlines() - you know, readlines() + status - the return status (ie 404) + reason - english translation of status (ie 'File not found') + + If you want the best of both worlds, use this inside an + AttributeError-catching try: + + >>> try: status = fo.status + >>> except AttributeError: status = None + + Unfortunately, these are ONLY there if status == 200, so it's not + easy to distinguish between non-200 responses. The reason is that + urllib2 tries to do clever things with error codes 301, 302, 401, + and 407, and it wraps the object upon return. + + You can optionally set the module-level global HANDLE_ERRORS to 0, + in which case the handler will always return the object directly. + If you like the fancy handling of errors, don't do this. If you + prefer to see your error codes, then do. 
+ +""" + +import urllib2 +import httplib +import socket + +VERSION = (0, 1) +#STRING_VERSION = '.'.join(map(str, VERSION)) +DEBUG = 0 +HANDLE_ERRORS = 1 + +class HTTPHandler(urllib2.HTTPHandler): + def __init__(self): + self._connections = {} + + def close_connection(self, host): + """close connection to <host> + host is the host:port spec, as in 'www.cnn.com:8080' as passed in. + no error occurs if there is no connection to that host.""" + self._remove_connection(host, close=1) + + def open_connections(self): + """return a list of connected hosts""" + return self._connections.keys() + + def close_all(self): + """close all open connections""" + for host, conn in self._connections.items(): + conn.close() + self._connections = {} + + def _remove_connection(self, host, close=0): + if self._connections.has_key(host): + if close: self._connections[host].close() + del self._connections[host] + + def _start_connection(self, h, req): + try: + if req.has_data(): + data = req.get_data() + h.putrequest('POST', req.get_selector()) + if not req.headers.has_key('Content-type'): + h.putheader('Content-type', + 'application/x-www-form-urlencoded') + if not req.headers.has_key('Content-length'): + h.putheader('Content-length', '%d' % len(data)) + else: + h.putrequest('GET', req.get_selector()) + except socket.error, err: + raise urllib2.URLError(err) + + for args in self.parent.addheaders: + h.putheader(*args) + for k, v in req.headers.items(): + h.putheader(k, v) + h.endheaders() + if req.has_data(): + h.send(data) + + def do_open(self, http_class, req): + host = req.get_host() + if not host: + raise urllib2.URLError('no host given') + + try: + need_new_connection = 1 + h = self._connections.get(host) + if not h is None: + try: + self._start_connection(h, req) + except socket.error, e: + r = None + else: + try: r = h.getresponse() + except httplib.ResponseNotReady, e: r = None + + if r is None or r.version == 9: + # httplib falls back to assuming HTTP 0.9 if it gets a + # bad 
header back. This is most likely to happen if + # the socket has been closed by the server since we + # last used the connection. + if DEBUG: print "failed to re-use connection to %s" % host + h.close() + else: + if DEBUG: print "re-using connection to %s" % host + need_new_connection = 0 + if need_new_connection: + if DEBUG: print "creating new connection to %s" % host + h = http_class(host) + self._connections[host] = h + self._start_connection(h, req) + r = h.getresponse() + except socket.error, err: + raise urllib2.URLError(err) + + # if not a persistent connection, don't try to reuse it + if r.will_close: self._remove_connection(host) + + if DEBUG: + print "STATUS: %s, %s" % (r.status, r.reason) + r._handler = self + r._host = host + r._url = req.get_full_url() + + if r.status == 200 or not HANDLE_ERRORS: + return r + else: + return self.parent.error('http', req, r, r.status, r.reason, r.msg) + + def http_open(self, req): + return self.do_open(HTTPConnection, req) + +class HTTPResponse(httplib.HTTPResponse): + + # we need to subclass HTTPResponse in order to + # 1) add readline() and readlines() methods + # 2) add close_connection() methods + # 3) add info() and geturl() methods + + # in order to add readline(), read must be modified to deal with a + # buffer. example: readline must read a buffer and then spit back + # one line at a time. The only real alternative is to read one + # BYTE at a time (ick). Once something has been read, it can't be + # put back (ok, maybe it can, but that's even uglier than this), + # so if you THEN do a normal read, you must first take stuff from + # the buffer. + + # the read method wraps the original to accomodate buffering, + # although read() never adds to the buffer. 
+ # Both readline and readlines have been stolen with almost no + # modification from socket.py + + + def __init__(self, sock, debuglevel=0, strict=0, method=None): + if method: # the httplib in python 2.3 uses the method arg + httplib.HTTPResponse.__init__(self, sock, debuglevel, method) + else: # 2.2 doesn't + httplib.HTTPResponse.__init__(self, sock, debuglevel) + self.fileno = sock.fileno + self._rbuf = '' + self._rbufsize = 8096 + self._handler = None # inserted by the handler later + self._host = None # (same) + self._url = None # (same) + + _raw_read = httplib.HTTPResponse.read + + def close_connection(self): + self.close() + self._handler._remove_connection(self._host, close=1) + + def info(self): + return self.msg + + def geturl(self): + return self._url + + def read(self, amt=None): + # the _rbuf test is only in this first if for speed. It's not + # logically necessary + if self._rbuf and not amt is None: + L = len(self._rbuf) + if amt > L: + amt -= L + else: + s = self._rbuf[:amt] + self._rbuf = self._rbuf[amt:] + return s + + s = self._rbuf + self._raw_read(amt) + self._rbuf = '' + return s + + def readline(self, limit=-1): + data = "" + i = self._rbuf.find('\n') + while i < 0 and not (0 < limit <= len(self._rbuf)): + new = self._raw_read(self._rbufsize) + if not new: break + i = new.find('\n') + if i >= 0: i = i + len(self._rbuf) + self._rbuf = self._rbuf + new + if i < 0: i = len(self._rbuf) + else: i = i+1 + if 0 <= limit < len(self._rbuf): i = limit + data, self._rbuf = self._rbuf[:i], self._rbuf[i:] + return data + + def readlines(self, sizehint = 0): + total = 0 + list = [] + while 1: + line = self.readline() + if not line: break + list.append(line) + total += len(line) + if sizehint and total >= sizehint: + break + return list + + +class HTTPConnection(httplib.HTTPConnection): + # use the modified response class + response_class = HTTPResponse + +######################################################################### +##### TEST FUNCTIONS 
+######################################################################### + +def error_handler(url): + global HANDLE_ERRORS + orig = HANDLE_ERRORS + keepalive_handler = HTTPHandler() + opener = urllib2.build_opener(keepalive_handler) + urllib2.install_opener(opener) + pos = {0: 'off', 1: 'on'} + for i in (0, 1): + print " fancy error handling %s (HANDLE_ERRORS = %i)" % (pos[i], i) + HANDLE_ERRORS = i + try: + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + try: status, reason = fo.status, fo.reason + except AttributeError: status, reason = None, None + except IOError, e: + print " EXCEPTION: %s" % e + raise + else: + print " status = %s, reason = %s" % (status, reason) + HANDLE_ERRORS = orig + hosts = keepalive_handler.open_connections() + print "open connections:", ' '.join(hosts) + keepalive_handler.close_all() + +def continuity(url): + import md5 + format = '%25s: %s' + + # first fetch the file with the normal http handler + opener = urllib2.build_opener() + urllib2.install_opener(opener) + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + m = md5.new(foo) + print format % ('normal urllib', m.hexdigest()) + + # now install the keepalive handler and try again + opener = urllib2.build_opener(HTTPHandler()) + urllib2.install_opener(opener) + + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + m = md5.new(foo) + print format % ('keepalive read', m.hexdigest()) + + fo = urllib2.urlopen(url) + foo = '' + while 1: + f = fo.readline() + if f: foo = foo + f + else: break + fo.close() + m = md5.new(foo) + print format % ('keepalive readline', m.hexdigest()) + +def comp(N, url): + print ' making %i connections to:\n %s' % (N, url) + + sys.stdout.write(' first using the normal urllib handlers') + # first use normal opener + opener = urllib2.build_opener() + urllib2.install_opener(opener) + t1 = fetch(N, url) + print ' TIME: %.3f s' % t1 + + sys.stdout.write(' now using the keepalive handler ') + # now install the keepalive handler and try again 
+ opener = urllib2.build_opener(HTTPHandler()) + urllib2.install_opener(opener) + t2 = fetch(N, url) + print ' TIME: %.3f s' % t2 + print ' improvement factor: %.2f' % (t1/t2, ) + +def fetch(N, url, delay=0): + lens = [] + starttime = time.time() + for i in range(N): + if delay and i > 0: time.sleep(delay) + fo = urllib2.urlopen(url) + foo = fo.read() + fo.close() + lens.append(len(foo)) + diff = time.time() - starttime + + j = 0 + for i in lens[1:]: + j = j + 1 + if not i == lens[0]: + print "WARNING: inconsistent length on read %i: %i" % (j, i) + + return diff + +def test(url, N=10): + print "checking error hander (do this on a non-200)" + try: error_handler(url) + except IOError, e: + print "exiting - exception will prevent further tests" + sys.exit() + print + print "performing continuity test (making sure stuff isn't corrupted)" + continuity(url) + print + print "performing speed comparison" + comp(N, url) + +if __name__ == '__main__': + import time + import sys + try: + N = int(sys.argv[1]) + url = sys.argv[2] + except: + print "%s <integer> <url>" % sys.argv[0] + else: + test(url, N) |