# Copyright (C) 2010, 2011 Sebastian Thiel (byronimo@gmail.com) and contributors
#
# This module is part of GitDB and is released under
# the New BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Performance tests for object store"""
from __future__ import print_function

from gitdb.test.performance.lib import (
    TestBigRepoR
)

from gitdb import (
    MemoryDB,
    GitDB,
    IStream,
)
from gitdb.typ import str_blob_type
from gitdb.exc import UnsupportedOperation
from gitdb.db.pack import PackedDB
from gitdb.utils.compat import xrange

import sys
import os
from time import time


class TestPackedDBPerformance(TestBigRepoR):

    def test_pack_random_access(self):
        pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))

        # sha lookup
        st = time()
        sha_list = list(pdb.sha_iter())
        elapsed = max(time() - st, 0.001)  # prevent zero division errors on windows
        ns = len(sha_list)
        print("PDB: looked up %i shas by index in %f s ( %f shas/s )"
              % (ns, elapsed, ns / elapsed), file=sys.stderr)

        # sha lookup: best-case and worst-case access
        pdb_pack_info = pdb._pack_info
        st = time()
        for sha in sha_list:
            pdb_pack_info(sha)
        # END for each sha to look up
        elapsed = max(time() - st, 0.001)  # prevent zero division errors on windows
        # discard cache
        del pdb._entities
        pdb.entities()
        print("PDB: looked up %i shas in %i packs in %f s ( %f shas/s )"
              % (ns, len(pdb.entities()), elapsed, ns / elapsed), file=sys.stderr)

        # query info and streams only
        max_items = 10000  # can wait longer when testing memory
        for pdb_fun in (pdb.info, pdb.stream):
            st = time()
            for sha in sha_list[:max_items]:
                pdb_fun(sha)
            elapsed = max(time() - st, 0.001)  # prevent zero division errors on windows
            print("PDB: Obtained %i %s objects by sha in %f s ( %f items/s )"
                  % (max_items, pdb_fun.__name__.upper(), elapsed, max_items / elapsed), file=sys.stderr)
        # END for each function

        # retrieve stream and read all
        max_items = 5000
        pdb_stream = pdb.stream
        total_size = 0
        st = time()
        for sha in sha_list[:max_items]:
            stream = pdb_stream(sha)
            read_len = len(stream.read())
            assert read_len == stream.size
            total_size += stream.size
        elapsed = max(time() - st, 0.001)  # prevent zero division errors on windows
        total_kib = total_size / 1024
        print("PDB: Obtained %i streams by sha and read all bytes "
              "totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )"
              % (max_items, total_kib, total_kib / elapsed, elapsed, max_items / elapsed), file=sys.stderr)
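
    # A minimal sketch (not part of the original suite) of the store/verify
    # round-trip that the next test performs per object. `BytesIO` and the
    # payload are illustrative assumptions; everything else is the gitdb API
    # already imported above:
    #
    #   from io import BytesIO
    #   data = b"example payload"
    #   mdb = MemoryDB()
    #   istream = IStream(str_blob_type, len(data), BytesIO(data))
    #   mdb.store(istream)          # computes the sha and sets istream.binsha
    #   assert mdb.has_object(istream.binsha)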

    def test_loose_correctness(self):
        """Based on the pack(s) of our packed object DB, copy all objects back into
        a loose object db (in memory) and verify them.
        This should help to find dormant issues like this one faster:
        https://github.com/gitpython-developers/GitPython/issues/220

        .. note::
            It doesn't seem this test can find the issue unless the given pack
            contains highly compressed data files, like archives."""
        from gitdb.util import bin_to_hex
        pdb = GitDB(os.path.join(self.gitrepopath, 'objects'))
        mdb = MemoryDB()
        for c, sha in enumerate(pdb.sha_iter()):
            with pdb.stream(sha) as ostream:
                # the issue only showed on larger files which are hardly compressible ...
                if ostream.type != str_blob_type:
                    continue
                istream = IStream(ostream.type, ostream.size, ostream.stream)
                mdb.store(istream)
                assert istream.binsha == sha, "Failed on object %s" % bin_to_hex(sha).decode('ascii')
                # this can fail ... sometimes, so the packs dataset should be huge
                with mdb.stream(sha) as ost2:
                    assert len(ost2.read()) == ostream.size

            if c and c % 1000 == 0:
                print("Verified %i loose object compression/decompression cycles" % c, file=sys.stderr)
            mdb._cache.clear()
        # end for each sha to copy

    def test_correctness(self):
        pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
        # disabled for now as it used to work perfectly; checking big repositories takes a long time
        print("Endurance run: verify streaming of objects (crc and sha)", file=sys.stderr)
        for crc in range(2):
            count = 0
            st = time()
            for entity in pdb.entities():
                with entity:
                    pack_verify = entity.is_valid_stream
                    idx = entity.index()
                    sha_by_index = idx.sha
                    for index in xrange(idx.size()):
                        try:
                            assert pack_verify(sha_by_index(index), use_crc=crc)
                            count += 1
                        except UnsupportedOperation:
                            pass
                        # END ignore old indices
                    # END for each index
            # END for each entity
            elapsed = max(time() - st, 0.001)  # prevent zero division errors on windows
            print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )"
                  % (count, crc, elapsed, count / elapsed), file=sys.stderr)
        # END for each verify mode
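
# A minimal sketch of how these benchmarks might be run directly, assuming a
# pytest installation and that this module lives at its package path; `-s`
# disables output capturing so the timings printed to stderr remain visible:
#
#   python -m pytest gitdb/test/performance/test_pack.py -s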