From f0c05ea8da7932961af162bb30231640b89e40bc Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Tue, 5 Jul 2011 18:13:30 +0200
Subject: util: pick the type of memory manager based on the python version, to
 have optimal results in all cases (at least the ones I can test)

pack: now works properly with a sliding memory manager.

test_packedodb_pure: fixed a very memory-hungry implementation by using an
iterator. This will of course reduce the measured performance a bit, but 750MB
of memory is just a little too much for an ordinary test. Maybe it would be
alright to just reduce the number of items ... but performance isn't a
strength of Python after all.
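
To illustrate, here is a minimal sketch of the access pattern the pack code
now relies on. It only uses the smmap calls that appear in the diff below;
the pack path and offset are made up, and the version check assumes a 2.x
interpreter, just like git/util.py does:

    import sys
    from smmap import StaticWindowMapManager, SlidingWindowMapManager

    # pick the manager type by python version, as git/util.py does below
    if sys.version_info[1] < 6:
        mman = StaticWindowMapManager()
    else:
        mman = SlidingWindowMapManager()

    # a cursor maps only a window of the file at a time, so a pack no
    # longer needs to fit into a single memory map as a whole
    c = mman.make_cursor("objects/pack/pack-example.pack")  # made-up path
    data = c.use_region(offset=12).buffer()  # buffer starting at byte 12
    print len(data), c.file_size()
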
---
 git/pack.py                                    | 32 +++++++++++++++-----------
 git/test/performance/db/test_packedodb_pure.py |  7 +++---
 git/util.py                                    |  7 +++++-
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/git/pack.py b/git/pack.py
index ddd22448..627035fd 100644
--- a/git/pack.py
+++ b/git/pack.py
@@ -73,7 +73,7 @@ __all__ = ('PackIndexFile', 'PackFile', 'PackEntity')
 
 #{ Utilities
 
-def pack_object_at(data, offset, as_stream):
+def pack_object_at(cursor, offset, as_stream):
     """
     :return: Tuple(abs_data_offset, PackInfo|PackStream) an object of the correct
         type according to the type_id of the object.
@@ -83,7 +83,7 @@ def pack_object_at(data, offset, as_stream):
     :param offset: offset into the data at which the object information is located
     :param as_stream: if True, a stream object will be returned that can read
         the data, otherwise you receive an info object only"""
-    data = buffer(data, offset)
+    data = cursor.use_region(offset).buffer()
     type_id, uncomp_size, data_rela_offset = pack_object_header_info(data)
     total_rela_offset = None                # set later, actual offset until data stream begins
     delta_info = None
@@ -269,6 +269,10 @@ class PackIndexFile(LazyMixin):
             # that we can actually write to the location - it could be a read-only
             # alternate for instance
             self._cursor = mman.make_cursor(self._indexpath).use_region()
+            # We will assume that the index will always fully fit into memory!
+            if mman.window_size() > 0 and self._cursor.file_size() > mman.window_size():
+                raise AssertionError("The index file at %s is too large to fit into a mapped window (%i > %i). This is a limitation of the implementation" % (self._indexpath, self._cursor.file_size(), mman.window_size()))
+            #END assert window size
         else:
             # now it's time to initialize everything - if we are here, someone wants
             # to access the fanout table or related properties
@@ -527,13 +531,13 @@ class PackFile(LazyMixin):
 
     def _iter_objects(self, start_offset, as_stream=True):
         """Handle the actual iteration of objects within this pack"""
-        data = self._cursor.map()
-        content_size = len(data) - self.footer_size
+        c = self._cursor
+        content_size = c.file_size() - self.footer_size
         cur_offset = start_offset or self.first_object_offset
         null = NullStream()
         while cur_offset < content_size:
-            data_offset, ostream = pack_object_at(data, cur_offset, True)
+            data_offset, ostream = pack_object_at(c, cur_offset, True)
 
             # scrub the stream to the end - this decompresses the object, but yields
             # the amount of compressed bytes we need to get to the next offset
@@ -562,12 +566,14 @@
     def data(self):
         """
         :return: read-only data of this pack. It provides random access and usually
-            is a memory map"""
-        return self._cursor.map()
+            is a memory map.
+        :note: This method is unsafe as it returns a window into a file which might be larger than the actual window size"""
+        # can use map as we are starting at offset 0. Otherwise we would have to use buffer()
+        return self._cursor.use_region().map()
 
     def checksum(self):
         """:return: 20 byte sha1 hash on all object sha's contained in this file"""
-        return self._cursor.map()[-20:]
+        return self._cursor.use_region(self._cursor.file_size()-20).buffer()[:]
 
     def path(self):
         """:return: path to the packfile"""
@@ -586,9 +592,9 @@
             If the object at offset is no delta, the size of the list is 1.
         :param offset: specifies the first byte of the object within this pack"""
         out = list()
-        data = self._cursor.map()
+        c = self._cursor
         while True:
-            ostream = pack_object_at(data, offset, True)[1]
+            ostream = pack_object_at(c, offset, True)[1]
             out.append(ostream)
             if ostream.type_id == OFS_DELTA:
                 offset = ostream.pack_offset - ostream.delta_info
@@ -610,14 +616,14 @@
         :param offset: byte offset
         :return: OPackInfo instance, the actual type differs depending
             on the type_id attribute"""
-        return pack_object_at(self._cursor.map(), offset or self.first_object_offset, False)[1]
+        return pack_object_at(self._cursor, offset or self.first_object_offset, False)[1]
 
     def stream(self, offset):
         """Retrieve an object at the given file-relative offset as stream along with its information
 
         :param offset: byte offset
         :return: OPackStream instance, the actual type differs depending
             on the type_id attribute"""
-        return pack_object_at(self._cursor.map(), offset or self.first_object_offset, True)[1]
+        return pack_object_at(self._cursor, offset or self.first_object_offset, True)[1]
 
     def stream_iter(self, start_offset=0):
         """
@@ -700,7 +706,7 @@ class PackEntity(LazyMixin):
                 sha = self._index.sha(index)
             # END assure sha is present ( in output )
             offset = self._index.offset(index)
-            type_id, uncomp_size, data_rela_offset = pack_object_header_info(buffer(self._pack._cursor.map(), offset))
+            type_id, uncomp_size, data_rela_offset = pack_object_header_info(self._pack._cursor.use_region(offset).buffer())
             if as_stream:
                 if type_id not in delta_types:
                     packstream = self._pack.stream(offset)
diff --git a/git/test/performance/db/test_packedodb_pure.py b/git/test/performance/db/test_packedodb_pure.py
index 4ea09779..11497d9d 100644
--- a/git/test/performance/db/test_packedodb_pure.py
+++ b/git/test/performance/db/test_packedodb_pure.py
@@ -49,18 +49,17 @@ class TestPurePackedODB(TestPurePackedODBPerformanceBase):
         count = 0
         total_size = 0
         st = time()
-        objs = list()
         for sha in rorepo.sha_iter():
             count += 1
-            objs.append(rorepo.stream(sha))
+            rorepo.stream(sha)
             if count == ni:
                 break
         #END gather objects for pack-writing
         elapsed = time() - st
-        print >> sys.stderr, "PDB Streaming: Got %i streams from %s by sha in in %f s ( %f streams/s )" % (ni, rorepo.__class__.__name__, elapsed, ni / elapsed)
+        print >> sys.stderr, "PDB Streaming: Got %i streams from %s by sha in %f s ( %f streams/s )" % (count, rorepo.__class__.__name__, elapsed, count / elapsed)
 
         st = time()
-        PackEntity.write_pack(objs, ostream.write)
+        PackEntity.write_pack((rorepo.stream(sha) for sha in rorepo.sha_iter()), ostream.write, object_count=ni)
         elapsed = time() - st
         total_kb = ostream.bytes_written() / 1000
         print >> sys.stderr, "PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)" % (total_kb, elapsed, total_kb/elapsed)
diff --git a/git/util.py b/git/util.py
index 0c018447..0e7e4cba 100644
--- a/git/util.py
+++ b/git/util.py
@@ -17,6 +17,7 @@
 import shutil
 import tempfile
 from smmap import (
     StaticWindowMapManager,
+    SlidingWindowMapManager,
     SlidingWindowMapBuffer
     )
 
@@ -72,7 +73,11 @@
 pool = ThreadPool(0)
 
 # initialize our global memory manager instance
 # Use it to free cached (and unused) resources.
-mman = StaticWindowMapManager()
+if sys.version_info[1] < 6:
+    mman = StaticWindowMapManager()
+else:
+    mman = SlidingWindowMapManager()
+#END handle mman
 
 #} END globals
-- 
cgit v1.2.1