diff options
| field | value | date |
|---|---|---|
| author | Tim Peters <tim.peters@gmail.com> | 2002-11-04 19:50:11 +0000 |
| committer | Tim Peters <tim.peters@gmail.com> | 2002-11-04 19:50:11 +0000 | 
| commit | fb0ea525d528153838586bf8ece15a45bbf5ddf3 (patch) | |
| tree | 0dbd427c60c6142e6bfd146e998d88c07f1aedc6 | |
| parent | 47ca2bc661e39238a948d1c967756fd0580c3502 (diff) | |
| download | cpython-git-fb0ea525d528153838586bf8ece15a45bbf5ddf3.tar.gz | |
Related to SF patch 618135: gzip.py and files > 2G.
Fixed the signed/unsigned confusions when dealing with files >= 2GB.
4GB is still a hard limitation of the gzip file format, though.
Testing this was a bitch on Win98SE due to frequent system freezes.  It
didn't freeze while running gzip, it kept freezing while trying to *create*
a > 2GB test file!  This wasn't Python's doing.  I don't know of a
reasonable way to test this functionality in regrtest.py, so I'm not
checking in a test case (a test case would necessarily require creating
a 2GB+ file first, using gzip to zip it, using gzip to unzip it again,
and then comparing before-and-after; so >4GB free space would be required,
and a loooong time; I did all this "by hand" once).
Bugfix candidate, I guess.

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | Lib/gzip.py | 52 |
| -rw-r--r-- | Misc/NEWS | 6 |

2 files changed, 39 insertions, 19 deletions
```diff
diff --git a/Lib/gzip.py b/Lib/gzip.py
index 55d448dd1d..8802adb0d9 100644
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -15,12 +15,21 @@ FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
 
 READ, WRITE = 1, 2
 
+def U32(i):
+    """Return i as an unsigned integer, assuming it fits in 32 bits.
+
+    If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
+    """
+    if i < 0:
+        i += 1L << 32
+    return i
+
 def write32(output, value):
     output.write(struct.pack("<l", value))
 
 def write32u(output, value):
-    if value < 0:
-        value = value + 0x100000000L
+    # The L format writes the bit pattern correctly whether signed
+    # or unsigned.
     output.write(struct.pack("<L", value))
 
 def read32(input):
@@ -157,19 +166,21 @@ class GzipFile:
 
         if flag & FEXTRA:
             # Read & discard the extra field, if present
-            xlen=ord(self.fileobj.read(1))
-            xlen=xlen+256*ord(self.fileobj.read(1))
+            xlen = ord(self.fileobj.read(1))
+            xlen = xlen + 256*ord(self.fileobj.read(1))
             self.fileobj.read(xlen)
         if flag & FNAME:
             # Read and discard a null-terminated string containing the filename
             while True:
-                s=self.fileobj.read(1)
-                if not s or s=='\000': break
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
         if flag & FCOMMENT:
             # Read and discard a null-terminated string containing a comment
             while True:
-                s=self.fileobj.read(1)
-                if not s or s=='\000': break
+                s = self.fileobj.read(1)
+                if not s or s=='\000':
+                    break
         if flag & FHCRC:
             self.fileobj.read(2)     # Read & discard the 16-bit header CRC
 
@@ -225,7 +236,8 @@ class GzipFile:
         self.offset -= len(buf)
 
     def _read(self, size=1024):
-        if self.fileobj is None: raise EOFError, "Reached EOF"
+        if self.fileobj is None:
+            raise EOFError, "Reached EOF"
 
         if self._new_member:
             # If the _new_member flag is set, we have to
@@ -286,8 +298,8 @@ class GzipFile:
         # uncompressed data matches the stored values.
         self.fileobj.seek(-8, 1)
         crc32 = read32(self.fileobj)
-        isize = read32(self.fileobj)
-        if crc32%0x100000000L != self.crc%0x100000000L:
+        isize = U32(read32(self.fileobj))   # may exceed 2GB
+        if U32(crc32) != U32(self.crc):
             raise ValueError, "CRC check failed"
         elif isize != self.size:
             raise ValueError, "Incorrect length of data produced"
@@ -296,7 +308,8 @@ class GzipFile:
         if self.mode == WRITE:
             self.fileobj.write(self.compress.flush())
             write32(self.fileobj, self.crc)
-            write32(self.fileobj, self.size)
+            # self.size may exceed 2GB
+            write32u(self.fileobj, self.size)
             self.fileobj = None
         elif self.mode == READ:
             self.fileobj = None
@@ -338,15 +351,16 @@ class GzipFile:
             if offset < self.offset:
                 raise IOError('Negative seek in write mode')
             count = offset - self.offset
-            for i in range(count/1024):
-                self.write(1024*'\0')
-            self.write((count%1024)*'\0')
+            for i in range(count // 1024):
+                self.write(1024 * '\0')
+            self.write((count % 1024) * '\0')
         elif self.mode == READ:
             if offset < self.offset:
                 # for negative seek, rewind and do positive seek
                 self.rewind()
             count = offset - self.offset
-            for i in range(count/1024): self.read(1024)
+            for i in range(count // 1024):
+                self.read(1024)
             self.read(count % 1024)
 
     def readline(self, size=-1):
@@ -379,11 +393,13 @@ class GzipFile:
 
     def readlines(self, sizehint=0):
         # Negative numbers result in reading all the lines
-        if sizehint <= 0: sizehint = sys.maxint
+        if sizehint <= 0:
+            sizehint = sys.maxint
         L = []
         while sizehint > 0:
             line = self.readline()
-            if line == "": break
+            if line == "":
+                break
             L.append(line)
             sizehint = sizehint - len(line)
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -355,6 +355,10 @@ Extension modules
 Library
 -------
 
+- gzip.py now handles files exceeding 2GB.  Note that 4GB is still a
+  fundamental limitation of the underlying gzip file format (it only
+  has 32 bits to record the file size).
+
 - xml.sax.saxutils.unescape has been added, to replace entity references
   with their entity value.
 
@@ -365,7 +369,7 @@ Library
 
 - Various configure methods of Tkinter have been stream-lined, so that
   tag_configure, image_configure, window_configure now return a
-  dictionary when invoked with no argument. 
+  dictionary when invoked with no argument.
 
 - Importing the readline module now no longer has the side effect of
   calling setlocale(LC_CTYPE, "").  The initial "C" locale, or
```
