summaryrefslogtreecommitdiff
path: root/Lib
diff options
context:
space:
mode:
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>2020-02-11 19:21:32 -0800
committerGitHub <noreply@github.com>2020-02-11 22:21:32 -0500
commited4d263e8767b0e4c47df99141b500d36ce0275d (patch)
tree8abdb2a3ca1b863c451dbcca0be73b165dc605a1 /Lib
parent190433d8150bf719fa0ba972dbacf2214942f54e (diff)
downloadcpython-git-ed4d263e8767b0e4c47df99141b500d36ce0275d.tar.gz
bpo-39595: Improve zipfile.Path performance (GH-18406) (GH-18472)
* Improve zipfile.Path performance on zipfiles with a large number of entries. * 📜🤖 Added by blurb_it. * Add bpo to blurb * Sync with importlib_metadata 1.5 (6fe70ca) * Update blurb. * Remove compatibility code * Add stubs module, omitted from earlier commit Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com> (cherry picked from commit e5bd73632e77dc5ab0cab77e48e94ca5e354be8a) Co-authored-by: Jason R. Coombs <jaraco@jaraco.com> Co-authored-by: Jason R. Coombs <jaraco@jaraco.com>
Diffstat (limited to 'Lib')
-rw-r--r--Lib/importlib/metadata.py12
-rw-r--r--Lib/test/test_importlib/fixtures.py23
-rw-r--r--Lib/test/test_importlib/stubs.py10
-rw-r--r--Lib/test/test_importlib/test_main.py32
-rw-r--r--Lib/test/test_zipfile.py142
-rw-r--r--Lib/zipfile.py102
6 files changed, 253 insertions, 68 deletions
diff --git a/Lib/importlib/metadata.py b/Lib/importlib/metadata.py
index ae8ecf9b85..831f593277 100644
--- a/Lib/importlib/metadata.py
+++ b/Lib/importlib/metadata.py
@@ -391,6 +391,7 @@ class FastPath:
def __init__(self, root):
self.root = root
+ self.base = os.path.basename(root).lower()
def joinpath(self, child):
return pathlib.Path(self.root, child)
@@ -413,12 +414,11 @@ class FastPath:
)
def is_egg(self, search):
- root_n_low = os.path.split(self.root)[1].lower()
-
+ base = self.base
return (
- root_n_low == search.normalized + '.egg'
- or root_n_low.startswith(search.prefix)
- and root_n_low.endswith('.egg'))
+ base == search.versionless_egg_name
+ or base.startswith(search.prefix)
+ and base.endswith('.egg'))
def search(self, name):
for child in self.children():
@@ -439,6 +439,7 @@ class Prepared:
prefix = ''
suffixes = '.dist-info', '.egg-info'
exact_matches = [''][:0]
+ versionless_egg_name = ''
def __init__(self, name):
self.name = name
@@ -448,6 +449,7 @@ class Prepared:
self.prefix = self.normalized + '-'
self.exact_matches = [
self.normalized + suffix for suffix in self.suffixes]
+ self.versionless_egg_name = self.normalized + '.egg'
class MetadataPathFinder(DistributionFinder):
diff --git a/Lib/test/test_importlib/fixtures.py b/Lib/test/test_importlib/fixtures.py
index 0b4ce18d5a..695c92a786 100644
--- a/Lib/test/test_importlib/fixtures.py
+++ b/Lib/test/test_importlib/fixtures.py
@@ -47,14 +47,28 @@ def tempdir_as_cwd():
yield tmp
-class SiteDir:
+@contextlib.contextmanager
+def install_finder(finder):
+ sys.meta_path.append(finder)
+ try:
+ yield
+ finally:
+ sys.meta_path.remove(finder)
+
+
+class Fixtures:
def setUp(self):
self.fixtures = ExitStack()
self.addCleanup(self.fixtures.close)
+
+
+class SiteDir(Fixtures):
+ def setUp(self):
+ super(SiteDir, self).setUp()
self.site_dir = self.fixtures.enter_context(tempdir())
-class OnSysPath:
+class OnSysPath(Fixtures):
@staticmethod
@contextlib.contextmanager
def add_sys_path(dir):
@@ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()):
def DALS(str):
"Dedent and left-strip"
return textwrap.dedent(str).lstrip()
+
+
+class NullFinder:
+ def find_module(self, name):
+ pass
diff --git a/Lib/test/test_importlib/stubs.py b/Lib/test/test_importlib/stubs.py
new file mode 100644
index 0000000000..e5b011c399
--- /dev/null
+++ b/Lib/test/test_importlib/stubs.py
@@ -0,0 +1,10 @@
+import unittest
+
+
+class fake_filesystem_unittest:
+ """
+ Stubbed version of the pyfakefs module
+ """
+ class TestCase(unittest.TestCase):
+ def setUpPyfakefs(self):
+ self.skipTest("pyfakefs not available")
diff --git a/Lib/test/test_importlib/test_main.py b/Lib/test/test_importlib/test_main.py
index c5f1dbbae3..42a79992ec 100644
--- a/Lib/test/test_importlib/test_main.py
+++ b/Lib/test/test_importlib/test_main.py
@@ -7,6 +7,11 @@ import textwrap
import unittest
import importlib.metadata
+try:
+ import pyfakefs.fake_filesystem_unittest as ffs
+except ImportError:
+ from .stubs import fake_filesystem_unittest as ffs
+
from . import fixtures
from importlib.metadata import (
Distribution, EntryPoint,
@@ -185,6 +190,33 @@ class DirectoryTest(fixtures.OnSysPath, fixtures.SiteDir, unittest.TestCase):
version('foo')
+class MissingSysPath(fixtures.OnSysPath, unittest.TestCase):
+ site_dir = '/does-not-exist'
+
+ def test_discovery(self):
+ """
+ Discovering distributions should succeed even if
+ there is an invalid path on sys.path.
+ """
+ importlib.metadata.distributions()
+
+
+class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase):
+ site_dir = '/access-denied'
+
+ def setUp(self):
+ super(InaccessibleSysPath, self).setUp()
+ self.setUpPyfakefs()
+ self.fs.create_dir(self.site_dir, perm_bits=000)
+
+ def test_discovery(self):
+ """
+ Discovering distributions should succeed even if
+ there is an invalid path on sys.path.
+ """
+ list(importlib.metadata.distributions())
+
+
class TestEntryPoints(unittest.TestCase):
def __init__(self, *args):
super(TestEntryPoints, self).__init__(*args)
diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py
index c65de9202c..61bca8651c 100644
--- a/Lib/test/test_zipfile.py
+++ b/Lib/test/test_zipfile.py
@@ -2683,16 +2683,71 @@ class CommandLineTest(unittest.TestCase):
self.assertEqual(f.read(), zf.read(zi))
+class TestExecutablePrependedZip(unittest.TestCase):
+ """Test our ability to open zip files with an executable prepended."""
+
+ def setUp(self):
+ self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
+ self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
+
+ def _test_zip_works(self, name):
+ # bpo28494 sanity check: ensure is_zipfile works on these.
+ self.assertTrue(zipfile.is_zipfile(name),
+ f'is_zipfile failed on {name}')
+ # Ensure we can operate on these via ZipFile.
+ with zipfile.ZipFile(name) as zipfp:
+ for n in zipfp.namelist():
+ data = zipfp.read(n)
+ self.assertIn(b'FAVORITE_NUMBER', data)
+
+ def test_read_zip_with_exe_prepended(self):
+ self._test_zip_works(self.exe_zip)
+
+ def test_read_zip64_with_exe_prepended(self):
+ self._test_zip_works(self.exe_zip64)
+
+ @unittest.skipUnless(sys.executable, 'sys.executable required.')
+ @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
+ 'Test relies on #!/bin/bash working.')
+ def test_execute_zip2(self):
+ output = subprocess.check_output([self.exe_zip, sys.executable])
+ self.assertIn(b'number in executable: 5', output)
+
+ @unittest.skipUnless(sys.executable, 'sys.executable required.')
+ @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
+ 'Test relies on #!/bin/bash working.')
+ def test_execute_zip64(self):
+ output = subprocess.check_output([self.exe_zip64, sys.executable])
+ self.assertIn(b'number in executable: 5', output)
+
+
# Poor man's technique to consume a (smallish) iterable.
consume = tuple
+# from jaraco.itertools 5.0
+class jaraco:
+ class itertools:
+ class Counter:
+ def __init__(self, i):
+ self.count = 0
+ self._orig_iter = iter(i)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ result = next(self._orig_iter)
+ self.count += 1
+ return result
+
+
def add_dirs(zf):
"""
Given a writable zip file zf, inject directory entries for
any directories implied by the presence of children.
"""
- for name in zipfile.Path._implied_dirs(zf.namelist()):
+ for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()):
zf.writestr(name, b"")
return zf
@@ -2733,44 +2788,6 @@ def build_alpharep_fixture():
return zf
-class TestExecutablePrependedZip(unittest.TestCase):
- """Test our ability to open zip files with an executable prepended."""
-
- def setUp(self):
- self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata')
- self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata')
-
- def _test_zip_works(self, name):
- # bpo-28494 sanity check: ensure is_zipfile works on these.
- self.assertTrue(zipfile.is_zipfile(name),
- f'is_zipfile failed on {name}')
- # Ensure we can operate on these via ZipFile.
- with zipfile.ZipFile(name) as zipfp:
- for n in zipfp.namelist():
- data = zipfp.read(n)
- self.assertIn(b'FAVORITE_NUMBER', data)
-
- def test_read_zip_with_exe_prepended(self):
- self._test_zip_works(self.exe_zip)
-
- def test_read_zip64_with_exe_prepended(self):
- self._test_zip_works(self.exe_zip64)
-
- @unittest.skipUnless(sys.executable, 'sys.executable required.')
- @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
- 'Test relies on #!/bin/bash working.')
- def test_execute_zip2(self):
- output = subprocess.check_output([self.exe_zip, sys.executable])
- self.assertIn(b'number in executable: 5', output)
-
- @unittest.skipUnless(sys.executable, 'sys.executable required.')
- @unittest.skipUnless(os.access('/bin/bash', os.X_OK),
- 'Test relies on #!/bin/bash working.')
- def test_execute_zip64(self):
- output = subprocess.check_output([self.exe_zip64, sys.executable])
- self.assertIn(b'number in executable: 5', output)
-
-
class TestPath(unittest.TestCase):
def setUp(self):
self.fixtures = contextlib.ExitStack()
@@ -2808,6 +2825,14 @@ class TestPath(unittest.TestCase):
i, = h.iterdir()
assert i.is_file()
+ def test_subdir_is_dir(self):
+ for alpharep in self.zipfile_alpharep():
+ root = zipfile.Path(alpharep)
+ assert (root / 'b').is_dir()
+ assert (root / 'b/').is_dir()
+ assert (root / 'g').is_dir()
+ assert (root / 'g/').is_dir()
+
def test_open(self):
for alpharep in self.zipfile_alpharep():
root = zipfile.Path(alpharep)
@@ -2869,6 +2894,45 @@ class TestPath(unittest.TestCase):
root = zipfile.Path(alpharep)
assert (root / 'missing dir/').parent.at == ''
+ def test_mutability(self):
+ """
+ If the underlying zipfile is changed, the Path object should
+ reflect that change.
+ """
+ for alpharep in self.zipfile_alpharep():
+ root = zipfile.Path(alpharep)
+ a, b, g = root.iterdir()
+ alpharep.writestr('foo.txt', 'foo')
+ alpharep.writestr('bar/baz.txt', 'baz')
+ assert any(
+ child.name == 'foo.txt'
+ for child in root.iterdir())
+ assert (root / 'foo.txt').read_text() == 'foo'
+ baz, = (root / 'bar').iterdir()
+ assert baz.read_text() == 'baz'
+
+ HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13
+
+ def huge_zipfile(self):
+ """Create a read-only zipfile with a huge number of entries entries."""
+ strm = io.BytesIO()
+ zf = zipfile.ZipFile(strm, "w")
+ for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)):
+ zf.writestr(entry, entry)
+ zf.mode = 'r'
+ return zf
+
+ def test_joinpath_constant_time(self):
+ """
+ Ensure joinpath on items in zipfile is linear time.
+ """
+ root = zipfile.Path(self.huge_zipfile())
+ entries = jaraco.itertools.Counter(root.iterdir())
+ for entry in entries:
+ entry.joinpath('suffix')
+ # Check the file iterated all items
+ assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES
+
if __name__ == "__main__":
unittest.main()
diff --git a/Lib/zipfile.py b/Lib/zipfile.py
index 8b99c1189b..5dc6516cc4 100644
--- a/Lib/zipfile.py
+++ b/Lib/zipfile.py
@@ -16,6 +16,8 @@ import struct
import sys
import threading
import time
+import contextlib
+from collections import OrderedDict
try:
import zlib # We may need its compression method
@@ -2182,6 +2184,79 @@ def _ancestry(path):
path, tail = posixpath.split(path)
+class CompleteDirs(ZipFile):
+ """
+ A ZipFile subclass that ensures that implied directories
+ are always included in the namelist.
+ """
+
+ @staticmethod
+ def _implied_dirs(names):
+ parents = itertools.chain.from_iterable(map(_parents, names))
+ # Deduplicate entries in original order
+ implied_dirs = OrderedDict.fromkeys(
+ p + posixpath.sep for p in parents
+ # Cast names to a set for O(1) lookups
+ if p + posixpath.sep not in set(names)
+ )
+ return implied_dirs
+
+ def namelist(self):
+ names = super(CompleteDirs, self).namelist()
+ return names + list(self._implied_dirs(names))
+
+ def _name_set(self):
+ return set(self.namelist())
+
+ def resolve_dir(self, name):
+ """
+ If the name represents a directory, return that name
+ as a directory (with the trailing slash).
+ """
+ names = self._name_set()
+ dirname = name + '/'
+ dir_match = name not in names and dirname in names
+ return dirname if dir_match else name
+
+ @classmethod
+ def make(cls, source):
+ """
+ Given a source (filename or zipfile), return an
+ appropriate CompleteDirs subclass.
+ """
+ if isinstance(source, CompleteDirs):
+ return source
+
+ if not isinstance(source, ZipFile):
+ return cls(source)
+
+ # Only allow for FastPath when supplied zipfile is read-only
+ if 'r' not in source.mode:
+ cls = CompleteDirs
+
+ res = cls.__new__(cls)
+ vars(res).update(vars(source))
+ return res
+
+
+class FastLookup(CompleteDirs):
+ """
+ ZipFile subclass to ensure implicit
+ dirs exist and are resolved rapidly.
+ """
+ def namelist(self):
+ with contextlib.suppress(AttributeError):
+ return self.__names
+ self.__names = super(FastLookup, self).namelist()
+ return self.__names
+
+ def _name_set(self):
+ with contextlib.suppress(AttributeError):
+ return self.__lookup
+ self.__lookup = super(FastLookup, self)._name_set()
+ return self.__lookup
+
+
class Path:
"""
A pathlib-compatible interface for zip files.
@@ -2250,7 +2325,7 @@ class Path:
__repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"
def __init__(self, root, at=""):
- self.root = root if isinstance(root, ZipFile) else ZipFile(root)
+ self.root = FastLookup.make(root)
self.at = at
@property
@@ -2282,12 +2357,12 @@ class Path:
return not self.is_dir()
def exists(self):
- return self.at in self._names()
+ return self.at in self.root._name_set()
def iterdir(self):
if not self.is_dir():
raise ValueError("Can't listdir a file")
- subs = map(self._next, self._names())
+ subs = map(self._next, self.root.namelist())
return filter(self._is_child, subs)
def __str__(self):
@@ -2298,25 +2373,10 @@ class Path:
def joinpath(self, add):
next = posixpath.join(self.at, add)
- next_dir = posixpath.join(self.at, add, "")
- names = self._names()
- return self._next(next_dir if next not in names and next_dir in names else next)
+ return self._next(self.root.resolve_dir(next))
__truediv__ = joinpath
- @staticmethod
- def _implied_dirs(names):
- return _unique_everseen(
- parent + "/"
- for name in names
- for parent in _parents(name)
- if parent + "/" not in names
- )
-
- @classmethod
- def _add_implied_dirs(cls, names):
- return names + list(cls._implied_dirs(names))
-
@property
def parent(self):
parent_at = posixpath.dirname(self.at.rstrip('/'))
@@ -2324,9 +2384,6 @@ class Path:
parent_at += '/'
return self._next(parent_at)
- def _names(self):
- return self._add_implied_dirs(self.root.namelist())
-
def main(args=None):
import argparse
@@ -2388,5 +2445,6 @@ def main(args=None):
zippath = ''
addToZip(zf, path, zippath)
+
if __name__ == "__main__":
main()