summary | refs | log | tree | commit | diff
path: root/Lib/zipfile.py
diff options
context:
space:
mode:
author: Jason R. Coombs <jaraco@jaraco.com> 2020-02-11 21:58:47 -0500
committer: GitHub <noreply@github.com> 2020-02-11 21:58:47 -0500
commit: e5bd73632e77dc5ab0cab77e48e94ca5e354be8a (patch)
tree: 714f30a26206aeec8fd331b42ae095b638f216e5 /Lib/zipfile.py
parent: e6be9b59a911626d6597fe148c32f0342bd2bd24 (diff)
download: cpython-git-e5bd73632e77dc5ab0cab77e48e94ca5e354be8a.tar.gz
bpo-39595: Improve zipfile.Path performance (#18406)
* Improve zipfile.Path performance on zipfiles with a large number of entries.
* 📜🤖 Added by blurb_it.
* Add bpo to blurb
* Sync with importlib_metadata 1.5 (6fe70ca)
* Update blurb.
* Remove compatibility code
* Add stubs module, omitted from earlier commit

Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Diffstat (limited to 'Lib/zipfile.py')
-rw-r--r-- Lib/zipfile.py 102
1 file changed, 80 insertions, 22 deletions
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 2da87ef505..4510fac250 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -16,6 +16,8 @@ import struct
import sys
import threading
import time
+import contextlib
+from collections import OrderedDict
try:
import zlib # We may need its compression method
@@ -2159,6 +2161,79 @@ def _ancestry(path):
path, tail = posixpath.split(path)
+class CompleteDirs(ZipFile):
+ """
+ A ZipFile subclass that ensures that implied directories
+ are always included in the namelist.
+ """
+
+ @staticmethod
+ def _implied_dirs(names):
+ parents = itertools.chain.from_iterable(map(_parents, names))
+ # Deduplicate entries in original order
+ implied_dirs = OrderedDict.fromkeys(
+ p + posixpath.sep for p in parents
+ # Cast names to a set for O(1) lookups
+ if p + posixpath.sep not in set(names)
+ )
+ return implied_dirs
+
+ def namelist(self):
+ names = super(CompleteDirs, self).namelist()
+ return names + list(self._implied_dirs(names))
+
+ def _name_set(self):
+ return set(self.namelist())
+
+ def resolve_dir(self, name):
+ """
+ If the name represents a directory, return that name
+ as a directory (with the trailing slash).
+ """
+ names = self._name_set()
+ dirname = name + '/'
+ dir_match = name not in names and dirname in names
+ return dirname if dir_match else name
+
+ @classmethod
+ def make(cls, source):
+ """
+ Given a source (filename or zipfile), return an
+ appropriate CompleteDirs subclass.
+ """
+ if isinstance(source, CompleteDirs):
+ return source
+
+ if not isinstance(source, ZipFile):
+ return cls(source)
+
+ # Only allow for FastLookup when supplied zipfile is read-only
+ if 'r' not in source.mode:
+ cls = CompleteDirs
+
+ res = cls.__new__(cls)
+ vars(res).update(vars(source))
+ return res
+
+
+class FastLookup(CompleteDirs):
+ """
+ ZipFile subclass to ensure implicit
+ dirs exist and are resolved rapidly.
+ """
+ def namelist(self):
+ with contextlib.suppress(AttributeError):
+ return self.__names
+ self.__names = super(FastLookup, self).namelist()
+ return self.__names
+
+ def _name_set(self):
+ with contextlib.suppress(AttributeError):
+ return self.__lookup
+ self.__lookup = super(FastLookup, self)._name_set()
+ return self.__lookup
+
+
class Path:
"""
A pathlib-compatible interface for zip files.
@@ -2227,7 +2302,7 @@ class Path:
__repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"
def __init__(self, root, at=""):
- self.root = root if isinstance(root, ZipFile) else ZipFile(root)
+ self.root = FastLookup.make(root)
self.at = at
@property
@@ -2259,12 +2334,12 @@ class Path:
return not self.is_dir()
def exists(self):
- return self.at in self._names()
+ return self.at in self.root._name_set()
def iterdir(self):
if not self.is_dir():
raise ValueError("Can't listdir a file")
- subs = map(self._next, self._names())
+ subs = map(self._next, self.root.namelist())
return filter(self._is_child, subs)
def __str__(self):
@@ -2275,25 +2350,10 @@ class Path:
def joinpath(self, add):
next = posixpath.join(self.at, add)
- next_dir = posixpath.join(self.at, add, "")
- names = self._names()
- return self._next(next_dir if next not in names and next_dir in names else next)
+ return self._next(self.root.resolve_dir(next))
__truediv__ = joinpath
- @staticmethod
- def _implied_dirs(names):
- return _unique_everseen(
- parent + "/"
- for name in names
- for parent in _parents(name)
- if parent + "/" not in names
- )
-
- @classmethod
- def _add_implied_dirs(cls, names):
- return names + list(cls._implied_dirs(names))
-
@property
def parent(self):
parent_at = posixpath.dirname(self.at.rstrip('/'))
@@ -2301,9 +2361,6 @@ class Path:
parent_at += '/'
return self._next(parent_at)
- def _names(self):
- return self._add_implied_dirs(self.root.namelist())
-
def main(args=None):
import argparse
@@ -2365,5 +2422,6 @@ def main(args=None):
zippath = ''
addToZip(zf, path, zippath)
+
if __name__ == "__main__":
main()