-rw-r--r-- | .gitmodules | 3
-rw-r--r-- | lib/git/__init__.py | 21
-rw-r--r-- | lib/git/errors.py | 12
m--------- | lib/git/ext/gitdb | 0
-rw-r--r-- | lib/git/objects/commit.py | 2
-rw-r--r-- | lib/git/odb/__init__.py | 6
-rw-r--r-- | lib/git/odb/db.py | 341
-rw-r--r-- | lib/git/odb/fun.py | 115
-rw-r--r-- | lib/git/odb/stream.py | 445
-rw-r--r-- | lib/git/odb/utils.py | 38
-rw-r--r-- | lib/git/repo.py | 2
-rw-r--r-- | lib/git/utils.py | 35
-rw-r--r-- | test/git/odb/__init__.py | 1
-rw-r--r-- | test/git/odb/lib.py | 60
-rw-r--r-- | test/git/odb/test_db.py | 90
-rw-r--r-- | test/git/odb/test_stream.py | 172
-rw-r--r-- | test/git/odb/test_utils.py | 15
-rw-r--r-- | test/git/performance/test_streams.py | 13
-rw-r--r-- | test/git/test_commit.py | 2
-rw-r--r-- | test/testlib/helper.py | 23 |
20 files changed, 33 insertions, 1363 deletions
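The diffstat summarizes the change: the in-tree `git.odb` package and its tests are deleted, the database implementation is pulled in as the `gitdb` submodule under `lib/git/ext/gitdb`, and `_init_externals()` appends the `ext` directory to `sys.path` when the `git` package is imported. As an illustrative sketch only (not part of this commit), this is roughly what the migration looks like for calling code, assuming the submodule has been initialized with `git submodule update --init` and that `Repo` exposes its database as `repo.odb` as in later 0.3 releases:

```python
# Hypothetical caller-side sketch; the names mirror the hunks in this diff
# (IStream, Repo.odb), but the snippet itself is not part of the commit.
from cStringIO import StringIO

import git                 # importing the package runs _init_externals(),
                           # putting lib/git/ext on sys.path so the gitdb
                           # submodule checkout becomes importable
from gitdb import IStream  # previously: from git.odb import IStream

repo = git.Repo(".")
data = "hello world"

# store() returns the very IStream it was given, with its sha filled in
istream = repo.odb.store(IStream("blob", len(data), StringIO(data)))
print istream.sha
```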
diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..4cdd431a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lib/git/ext/gitdb"] + path = lib/git/ext/gitdb + url = git://gitorious.org/git-python/gitdb.git diff --git a/lib/git/__init__.py b/lib/git/__init__.py index 2f17c55b..6728ba82 100644 --- a/lib/git/__init__.py +++ b/lib/git/__init__.py @@ -5,10 +5,25 @@ # the BSD License: http://www.opensource.org/licenses/bsd-license.php import os +import sys import inspect __version__ = 'git' + +#{ Initialization +def _init_externals(): + """Initialize external projects by putting them into the path""" + sys.path.append(os.path.join(os.path.dirname(__file__), 'ext')) + +#} END initialization + +################# +_init_externals() +################# + +#{ Imports + from git.config import GitConfigParser from git.objects import * from git.refs import * @@ -22,8 +37,8 @@ from git.remote import * from git.index import * from git.utils import LockFile, BlockingLockFile -# odb is NOT imported intentionally - if you really want it, you should get it -# yourself as its part of the core +#} END imports __all__ = [ name for name, obj in locals().items() - if not (name.startswith('_') or inspect.ismodule(obj)) ] + if not (name.startswith('_') or inspect.ismodule(obj)) ] + diff --git a/lib/git/errors.py b/lib/git/errors.py index d8a35e02..6d926500 100644 --- a/lib/git/errors.py +++ b/lib/git/errors.py @@ -10,18 +10,6 @@ Module containing all exceptions thrown througout the git package, class InvalidGitRepositoryError(Exception): """ Thrown if the given repository appears to have an invalid format. """ -class ODBError(Exception): - """All errors thrown by the object database""" - -class InvalidDBRoot(ODBError): - """Thrown if an object database cannot be initialized at the given path""" - -class BadObject(ODBError): - """The object with the given SHA does not exist""" - -class BadObjectType(ODBError): - """The object had an unsupported type""" - class NoSuchPathError(OSError): """ Thrown if a path could not be access by the system. 
""" diff --git a/lib/git/ext/gitdb b/lib/git/ext/gitdb new file mode 160000 +Subproject 10fef8f8e4ee83cf54feadbb5ffb522efec739f diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 9a3c2c95..0d8f2c64 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -9,7 +9,7 @@ import git.diff as diff import git.stats as stats from git.actor import Actor from tree import Tree -from git.odb import IStream +from gitdb import IStream from cStringIO import StringIO import base import utils diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py deleted file mode 100644 index 5789d7eb..00000000 --- a/lib/git/odb/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Initialize the object database module""" - -# default imports -from db import * -from stream import * - diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py deleted file mode 100644 index 5d3cc6a3..00000000 --- a/lib/git/odb/db.py +++ /dev/null @@ -1,341 +0,0 @@ -"""Contains implementations of database retrieveing objects""" -from git.utils import IndexFileSHA1Writer -from git.errors import ( - InvalidDBRoot, - BadObject, - BadObjectType - ) - -from stream import ( - DecompressMemMapReader, - FDCompressedSha1Writer, - Sha1Writer, - OStream, - OInfo - ) - -from utils import ( - ENOENT, - to_hex_sha, - exists, - hex_to_bin, - isdir, - mkdir, - rename, - dirname, - join - ) - -from fun import ( - chunk_size, - loose_object_header_info, - write_object, - stream_copy - ) - -import tempfile -import mmap -import os - - -__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB', - 'CompoundDB', 'ReferenceDB', 'GitObjectDB' ) - -class ObjectDBR(object): - """Defines an interface for object database lookup. - Objects are identified either by hex-sha (40 bytes) or - by sha (20 bytes)""" - - def __contains__(self, sha): - return self.has_obj - - #{ Query Interface - def has_object(self, sha): - """ - :return: True if the object identified by the given 40 byte hexsha or 20 bytes - binary sha is contained in the database - :raise BadObject:""" - raise NotImplementedError("To be implemented in subclass") - - def info(self, sha): - """ :return: OInfo instance - :param sha: 40 bytes hexsha or 20 bytes binary sha - :raise BadObject:""" - raise NotImplementedError("To be implemented in subclass") - - def info_async(self, input_channel): - """Retrieve information of a multitude of objects asynchronously - :param input_channel: Channel yielding the sha's of the objects of interest - :return: Channel yielding OInfo|InvalidOInfo, in any order""" - raise NotImplementedError("To be implemented in subclass") - - def stream(self, sha): - """:return: OStream instance - :param sha: 40 bytes hexsha or 20 bytes binary sha - :raise BadObject:""" - raise NotImplementedError("To be implemented in subclass") - - def stream_async(self, input_channel): - """Retrieve the OStream of multiple objects - :param input_channel: see ``info`` - :param max_threads: see ``ObjectDBW.store`` - :return: Channel yielding OStream|InvalidOStream instances in any order""" - raise NotImplementedError("To be implemented in subclass") - - #} END query interface - -class ObjectDBW(object): - """Defines an interface to create objects in the database""" - - def __init__(self, *args, **kwargs): - self._ostream = None - - #{ Edit Interface - def set_ostream(self, stream): - """Adjusts the stream to which all data should be sent when storing new objects - :param stream: if not None, the stream to use, if None the default stream - will be used. 
- :return: previously installed stream, or None if there was no override - :raise TypeError: if the stream doesn't have the supported functionality""" - cstream = self._ostream - self._ostream = stream - return cstream - - def ostream(self): - """:return: overridden output stream this instance will write to, or None - if it will write to the default stream""" - return self._ostream - - def store(self, istream): - """Create a new object in the database - :return: the input istream object with its sha set to its corresponding value - :param istream: IStream compatible instance. If its sha is already set - to a value, the object will just be stored in the our database format, - in which case the input stream is expected to be in object format ( header + contents ). - :raise IOError: if data could not be written""" - raise NotImplementedError("To be implemented in subclass") - - def store_async(self, input_channel): - """Create multiple new objects in the database asynchronously. The method will - return right away, returning an output channel which receives the results as - they are computed. - - :return: Channel yielding your IStream which served as input, in any order. - The IStreams sha will be set to the sha it received during the process, - or its error attribute will be set to the exception informing about the error. - :param input_channel: Channel yielding IStream instance. - As the same instances will be used in the output channel, you can create a map - between the id(istream) -> istream - :note:As some ODB implementations implement this operation as atomic, they might - abort the whole operation if one item could not be processed. Hence check how - many items have actually been produced.""" - raise NotImplementedError("To be implemented in subclass") - - #} END edit interface - - -class FileDBBase(object): - """Provides basic facilities to retrieve files of interest, including - caching facilities to help mapping hexsha's to objects""" - - def __init__(self, root_path): - """Initialize this instance to look for its files at the given root path - All subsequent operations will be relative to this path - :raise InvalidDBRoot: - :note: The base will not perform any accessablity checking as the base - might not yet be accessible, but become accessible before the first - access.""" - super(FileDBBase, self).__init__() - self._root_path = root_path - - - #{ Interface - def root_path(self): - """:return: path at which this db operates""" - return self._root_path - - def db_path(self, rela_path): - """ - :return: the given relative path relative to our database root, allowing - to pontentially access datafiles""" - return join(self._root_path, rela_path) - #} END interface - - - -class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW): - """A database which operates on loose object files""" - - # CONFIGURATION - # chunks in which data will be copied between streams - stream_chunk_size = chunk_size - - - def __init__(self, root_path): - super(LooseObjectDB, self).__init__(root_path) - self._hexsha_to_file = dict() - # Additional Flags - might be set to 0 after the first failure - # Depending on the root, this might work for some mounts, for others not, which - # is why it is per instance - self._fd_open_flags = getattr(os, 'O_NOATIME', 0) - - #{ Interface - def object_path(self, hexsha): - """ - :return: path at which the object with the given hexsha would be stored, - relative to the database root""" - return join(hexsha[:2], hexsha[2:]) - - def readable_db_object_path(self, hexsha): - """ - 
:return: readable object path to the object identified by hexsha - :raise BadObject: If the object file does not exist""" - try: - return self._hexsha_to_file[hexsha] - except KeyError: - pass - # END ignore cache misses - - # try filesystem - path = self.db_path(self.object_path(hexsha)) - if exists(path): - self._hexsha_to_file[hexsha] = path - return path - # END handle cache - raise BadObject(hexsha) - - #} END interface - - def _map_loose_object(self, sha): - """ - :return: memory map of that file to allow random read access - :raise BadObject: if object could not be located""" - db_path = self.db_path(self.object_path(to_hex_sha(sha))) - try: - fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags) - except OSError,e: - if e.errno != ENOENT: - # try again without noatime - try: - fd = os.open(db_path, os.O_RDONLY) - except OSError: - raise BadObject(to_hex_sha(sha)) - # didn't work because of our flag, don't try it again - self._fd_open_flags = 0 - else: - raise BadObject(to_hex_sha(sha)) - # END handle error - # END exception handling - try: - return mmap.mmap(fd, 0, access=mmap.ACCESS_READ) - finally: - os.close(fd) - # END assure file is closed - - def set_ostream(self, stream): - """:raise TypeError: if the stream does not support the Sha1Writer interface""" - if stream is not None and not isinstance(stream, Sha1Writer): - raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__) - return super(LooseObjectDB, self).set_ostream(stream) - - def info(self, sha): - m = self._map_loose_object(sha) - try: - type, size = loose_object_header_info(m) - return OInfo(sha, type, size) - finally: - m.close() - # END assure release of system resources - - def stream(self, sha): - m = self._map_loose_object(sha) - type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True) - return OStream(sha, type, size, stream) - - def has_object(self, sha): - try: - self.readable_db_object_path(to_hex_sha(sha)) - return True - except BadObject: - return False - # END check existance - - def store(self, istream): - """note: The sha we produce will be hex by nature""" - tmp_path = None - writer = self.ostream() - if writer is None: - # open a tmp file to write the data to - fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path) - writer = FDCompressedSha1Writer(fd) - # END handle custom writer - - try: - try: - if istream.sha is not None: - stream_copy(istream.read, writer.write, istream.size, self.stream_chunk_size) - else: - # write object with header, we have to make a new one - write_object(istream.type, istream.size, istream.read, writer.write, - chunk_size=self.stream_chunk_size) - # END handle direct stream copies - except: - if tmp_path: - os.remove(tmp_path) - raise - # END assure tmpfile removal on error - finally: - if tmp_path: - writer.close() - # END assure target stream is closed - - sha = istream.sha or writer.sha(as_hex=True) - - if tmp_path: - obj_path = self.db_path(self.object_path(sha)) - obj_dir = dirname(obj_path) - if not isdir(obj_dir): - mkdir(obj_dir) - # END handle destination directory - rename(tmp_path, obj_path) - # END handle dry_run - - istream.sha = sha - return istream - - -class PackedDB(FileDBBase, ObjectDBR): - """A database operating on a set of object packs""" - - -class CompoundDB(ObjectDBR): - """A database which delegates calls to sub-databases""" - - -class ReferenceDB(CompoundDB): - """A database consisting of database referred to in a file""" - - -#class GitObjectDB(CompoundDB, ObjectDBW): -class 
GitObjectDB(LooseObjectDB): - """A database representing the default git object store, which includes loose - objects, pack files and an alternates file - - It will create objects only in the loose object database. - :note: for now, we use the git command to do all the lookup, just until he - have packs and the other implementations - """ - def __init__(self, root_path, git): - """Initialize this instance with the root and a git command""" - super(GitObjectDB, self).__init__(root_path) - self._git = git - - def info(self, sha): - t = self._git.get_object_header(sha) - return OInfo(*t) - - def stream(self, sha): - """For now, all lookup is done by git itself""" - t = self._git.stream_object_data(sha) - return OStream(*t) - diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py deleted file mode 100644 index 3321a8ea..00000000 --- a/lib/git/odb/fun.py +++ /dev/null @@ -1,115 +0,0 @@ -"""Contains basic c-functions which usually contain performance critical code -Keeping this code separate from the beginning makes it easier to out-source -it into c later, if required""" - -from git.errors import ( - BadObjectType - ) - -import zlib -decompressobj = zlib.decompressobj - - -# INVARIANTS -type_id_to_type_map = { - 1 : "commit", - 2 : "tree", - 3 : "blob", - 4 : "tag" - } - -# used when dealing with larger streams -chunk_size = 1000*1000 - -__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info', - 'write_object' ) - -#{ Routines - -def is_loose_object(m): - """:return: True the file contained in memory map m appears to be a loose object. - Only the first two bytes are needed""" - b0, b1 = map(ord, m[:2]) - word = (b0 << 8) + b1 - return b0 == 0x78 and (word % 31) == 0 - -def loose_object_header_info(m): - """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the - object as well as its uncompressed size in bytes. - :param m: memory map from which to read the compressed object data""" - decompress_size = 8192 # is used in cgit as well - hdr = decompressobj().decompress(m, decompress_size) - type_name, size = hdr[:hdr.find("\0")].split(" ") - return type_name, int(size) - -def object_header_info(m): - """:return: tuple(type_string, uncompressed_size_in_bytes - :param mmap: mapped memory map. It will be - seeked to the actual start of the object contents, which can be used - to initialize a zlib decompress object. 
- :note: This routine can only handle new-style objects which are assumably contained - in packs - """ - assert not is_loose_object(m), "Use loose_object_header_info instead" - - c = b0 # first byte - i = 1 # next char to read - type_id = (c >> 4) & 7 # numeric type - size = c & 15 # starting size - s = 4 # starting bit-shift size - while c & 0x80: - c = ord(m[i]) - i += 1 - size += (c & 0x7f) << s - s += 7 - # END character loop - - # finally seek the map to the start of the data stream - m.seek(i) - try: - return (type_id_to_type_map[type_id], size) - except KeyError: - # invalid object type - we could try to be smart now and decode part - # of the stream to get the info, problem is that we had trouble finding - # the exact start of the content stream - raise BadObjectType(type_id) - # END handle exceptions - -def write_object(type, size, read, write, chunk_size=chunk_size): - """Write the object as identified by type, size and source_stream into the - target_stream - - :param type: type string of the object - :param size: amount of bytes to write from source_stream - :param read: read method of a stream providing the content data - :param write: write method of the output stream - :param close_target_stream: if True, the target stream will be closed when - the routine exits, even if an error is thrown - :return: The actual amount of bytes written to stream, which includes the header and a trailing newline""" - tbw = 0 # total num bytes written - - # WRITE HEADER: type SP size NULL - tbw += write("%s %i\0" % (type, size)) - tbw += stream_copy(read, write, size, chunk_size) - - return tbw - -def stream_copy(read, write, size, chunk_size): - """Copy a stream up to size bytes using the provided read and write methods, - in chunks of chunk_size - :note: its much like stream_copy utility, but operates just using methods""" - dbw = 0 # num data bytes written - - # WRITE ALL DATA UP TO SIZE - while True: - cs = min(chunk_size, size-dbw) - data_len = write(read(cs)) - dbw += data_len - if data_len < cs or dbw == size: - break - # END check for stream end - # END duplicate data - return dbw - - -#} END routines diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py deleted file mode 100644 index da97cf5b..00000000 --- a/lib/git/odb/stream.py +++ /dev/null @@ -1,445 +0,0 @@ -import zlib -from cStringIO import StringIO -from git.utils import make_sha -import errno - -from utils import ( - to_hex_sha, - to_bin_sha, - write, - close - ) - -__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream', - 'DecompressMemMapReader', 'FDCompressedSha1Writer') - - -# ZLIB configuration -# used when compressing objects - 1 to 9 ( slowest ) -Z_BEST_SPEED = 1 - - -#{ ODB Bases - -class OInfo(tuple): - """Carries information about an object in an ODB, provdiing information - about the sha of the object, the type_string as well as the uncompressed size - in bytes. 
- - It can be accessed using tuple notation and using attribute access notation:: - - assert dbi[0] == dbi.sha - assert dbi[1] == dbi.type - assert dbi[2] == dbi.size - - The type is designed to be as lighteight as possible.""" - __slots__ = tuple() - - def __new__(cls, sha, type, size): - return tuple.__new__(cls, (sha, type, size)) - - def __init__(self, *args): - tuple.__init__(self) - - #{ Interface - @property - def sha(self): - return self[0] - - @property - def type(self): - return self[1] - - @property - def size(self): - return self[2] - #} END interface - - -class OStream(OInfo): - """Base for object streams retrieved from the database, providing additional - information about the stream. - Generally, ODB streams are read-only as objects are immutable""" - __slots__ = tuple() - - def __new__(cls, sha, type, size, stream, *args, **kwargs): - """Helps with the initialization of subclasses""" - return tuple.__new__(cls, (sha, type, size, stream)) - - - def __init__(self, *args, **kwargs): - tuple.__init__(self) - - #{ Stream Reader Interface - - def read(self, size=-1): - return self[3].read(size) - - #} END stream reader interface - - -class IStream(list): - """Represents an input content stream to be fed into the ODB. It is mutable to allow - the ODB to record information about the operations outcome right in this instance. - - It provides interfaces for the OStream and a StreamReader to allow the instance - to blend in without prior conversion. - - The only method your content stream must support is 'read'""" - __slots__ = tuple() - - def __new__(cls, type, size, stream, sha=None): - return list.__new__(cls, (sha, type, size, stream, None)) - - def __init__(self, type, size, stream, sha=None): - list.__init__(self, (sha, type, size, stream, None)) - - #{ Interface - - @property - def hexsha(self): - """:return: our sha, hex encoded, 40 bytes""" - return to_hex_sha(self[0]) - - @property - def binsha(self): - """:return: our sha as binary, 20 bytes""" - return to_bin_sha(self[0]) - - def _error(self): - """:return: the error that occurred when processing the stream, or None""" - return self[4] - - def _set_error(self, exc): - """Set this input stream to the given exc, may be None to reset the error""" - self[4] = exc - - error = property(_error, _set_error) - - #} END interface - - #{ Stream Reader Interface - - def read(self, size=-1): - """Implements a simple stream reader interface, passing the read call on - to our internal stream""" - return self[3].read(size) - - #} END stream reader interface - - #{ interface - - def _set_sha(self, sha): - self[0] = sha - - def _sha(self): - return self[0] - - sha = property(_sha, _set_sha) - - - def _type(self): - return self[1] - - def _set_type(self, type): - self[1] = type - - type = property(_type, _set_type) - - def _size(self): - return self[2] - - def _set_size(self, size): - self[2] = size - - size = property(_size, _set_size) - - def _stream(self): - return self[3] - - def _set_stream(self, stream): - self[3] = stream - - stream = property(_stream, _set_stream) - - #} END odb info interface - - -class InvalidOInfo(tuple): - """Carries information about a sha identifying an object which is invalid in - the queried database. 
The exception attribute provides more information about - the cause of the issue""" - __slots__ = tuple() - - def __new__(cls, sha, exc): - return tuple.__new__(cls, (sha, exc)) - - def __init__(self, sha, exc): - tuple.__init__(self, (sha, exc)) - - @property - def sha(self): - return self[0] - - @property - def error(self): - """:return: exception instance explaining the failure""" - return self[1] - - -class InvalidOStream(InvalidOInfo): - """Carries information about an invalid ODB stream""" - __slots__ = tuple() - -#} END ODB Bases - - -#{ RO Streams - -class DecompressMemMapReader(object): - """Reads data in chunks from a memory map and decompresses it. The client sees - only the uncompressed data, respective file-like read calls are handling on-demand - buffered decompression accordingly - - A constraint on the total size of bytes is activated, simulating - a logical file within a possibly larger physical memory area - - To read efficiently, you clearly don't want to read individual bytes, instead, - read a few kilobytes at least. - - :note: The chunk-size should be carefully selected as it will involve quite a bit - of string copying due to the way the zlib is implemented. Its very wasteful, - hence we try to find a good tradeoff between allocation time and number of - times we actually allocate. An own zlib implementation would be good here - to better support streamed reading - it would only need to keep the mmap - and decompress it into chunks, thats all ... """ - __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close') - - max_read_size = 512*1024 # currently unused - - def __init__(self, m, close_on_deletion, size): - """Initialize with mmap for stream reading - :param m: must be content data - use new if you have object data and no size""" - self._m = m - self._zip = zlib.decompressobj() - self._buf = None # buffer of decompressed bytes - self._buflen = 0 # length of bytes in buffer - self._s = size # size of uncompressed data to read in total - self._br = 0 # num uncompressed bytes read - self._cws = 0 # start byte of compression window - self._cwe = 0 # end byte of compression window - self._close = close_on_deletion # close the memmap on deletion ? - - def __del__(self): - if self._close: - self._m.close() - # END handle resource freeing - - def _parse_header_info(self): - """If this stream contains object data, parse the header info and skip the - stream to a point where each read will yield object content - :return: parsed type_string, size""" - # read header - maxb = 512 # should really be enough, cgit uses 8192 I believe - self._s = maxb - hdr = self.read(maxb) - hdrend = hdr.find("\0") - type, size = hdr[:hdrend].split(" ") - size = int(size) - self._s = size - - # adjust internal state to match actual header length that we ignore - # The buffer will be depleted first on future reads - self._br = 0 - hdrend += 1 # count terminating \0 - self._buf = StringIO(hdr[hdrend:]) - self._buflen = len(hdr) - hdrend - - return type, size - - @classmethod - def new(self, m, close_on_deletion=False): - """Create a new DecompressMemMapReader instance for acting as a read-only stream - This method parses the object header from m and returns the parsed - type and size, as well as the created stream instance. - :param m: memory map on which to oparate. 
It must be object data ( header + contents ) - :param close_on_deletion: if True, the memory map will be closed once we are - being deleted""" - inst = DecompressMemMapReader(m, close_on_deletion, 0) - type, size = inst._parse_header_info() - return type, size, inst - - def read(self, size=-1): - if size < 1: - size = self._s - self._br - else: - size = min(size, self._s - self._br) - # END clamp size - - if size == 0: - return str() - # END handle depletion - - # protect from memory peaks - # If he tries to read large chunks, our memory patterns get really bad - # as we end up copying a possibly huge chunk from our memory map right into - # memory. This might not even be possible. Nonetheless, try to dampen the - # effect a bit by reading in chunks, returning a huge string in the end. - # Our performance now depends on StringIO. This way we don't need two large - # buffers in peak times, but only one large one in the end which is - # the return buffer - # NO: We don't do it - if the user thinks its best, he is right. If he - # has trouble, he will start reading in chunks. According to our tests - # its still faster if we read 10 Mb at once instead of chunking it. - - # if size > self.max_read_size: - # sio = StringIO() - # while size: - # read_size = min(self.max_read_size, size) - # data = self.read(read_size) - # sio.write(data) - # size -= len(data) - # if len(data) < read_size: - # break - # # END data loop - # sio.seek(0) - # return sio.getvalue() - # # END handle maxread - # - # deplete the buffer, then just continue using the decompress object - # which has an own buffer. We just need this to transparently parse the - # header from the zlib stream - dat = str() - if self._buf: - if self._buflen >= size: - # have enough data - dat = self._buf.read(size) - self._buflen -= size - self._br += size - return dat - else: - dat = self._buf.read() # ouch, duplicates data - size -= self._buflen - self._br += self._buflen - - self._buflen = 0 - self._buf = None - # END handle buffer len - # END handle buffer - - # decompress some data - # Abstract: zlib needs to operate on chunks of our memory map ( which may - # be large ), as it will otherwise and always fill in the 'unconsumed_tail' - # attribute which possible reads our whole map to the end, forcing - # everything to be read from disk even though just a portion was requested. - # As this would be a nogo, we workaround it by passing only chunks of data, - # moving the window into the memory map along as we decompress, which keeps - # the tail smaller than our chunk-size. This causes 'only' the chunk to be - # copied once, and another copy of a part of it when it creates the unconsumed - # tail. We have to use it to hand in the appropriate amount of bytes durin g - # the next read. - tail = self._zip.unconsumed_tail - if tail: - # move the window, make it as large as size demands. For code-clarity, - # we just take the chunk from our map again instead of reusing the unconsumed - # tail. The latter one would safe some memory copying, but we could end up - # with not getting enough data uncompressed, so we had to sort that out as well. - # Now we just assume the worst case, hence the data is uncompressed and the window - # needs to be as large as the uncompressed bytes we want to read. 
- self._cws = self._cwe - len(tail) - self._cwe = self._cws + size - else: - cws = self._cws - self._cws = self._cwe - self._cwe = cws + size - # END handle tail - - - # if window is too small, make it larger so zip can decompress something - win_size = self._cwe - self._cws - if win_size < 8: - self._cwe = self._cws + 8 - # END adjust winsize - indata = self._m[self._cws:self._cwe] # another copy ... :( - - # get the actual window end to be sure we don't use it for computations - self._cwe = self._cws + len(indata) - - dcompdat = self._zip.decompress(indata, size) - - self._br += len(dcompdat) - if dat: - dcompdat = dat + dcompdat - - return dcompdat - -#} END RO streams - - -#{ W Streams - -class Sha1Writer(object): - """Simple stream writer which produces a sha whenever you like as it degests - everything it is supposed to write""" - __slots__ = "sha1" - - def __init__(self): - self.sha1 = make_sha("") - - #{ Stream Interface - - def write(self, data): - """:raise IOError: If not all bytes could be written - :return: lenght of incoming data""" - self.sha1.update(data) - return len(data) - - # END stream interface - - #{ Interface - - def sha(self, as_hex = False): - """:return: sha so far - :param as_hex: if True, sha will be hex-encoded, binary otherwise""" - if as_hex: - return self.sha1.hexdigest() - return self.sha1.digest() - - #} END interface - -class FDCompressedSha1Writer(Sha1Writer): - """Digests data written to it, making the sha available, then compress the - data and write it to the file descriptor - :note: operates on raw file descriptors - :note: for this to work, you have to use the close-method of this instance""" - __slots__ = ("fd", "sha1", "zip") - - # default exception - exc = IOError("Failed to write all bytes to filedescriptor") - - def __init__(self, fd): - super(FDCompressedSha1Writer, self).__init__() - self.fd = fd - self.zip = zlib.compressobj(Z_BEST_SPEED) - - #{ Stream Interface - - def write(self, data): - """:raise IOError: If not all bytes could be written - :return: lenght of incoming data""" - self.sha1.update(data) - cdata = self.zip.compress(data) - bytes_written = write(self.fd, cdata) - if bytes_written != len(cdata): - raise self.exc - return len(data) - - def close(self): - remainder = self.zip.flush() - if write(self.fd, remainder) != len(remainder): - raise self.exc - return close(self.fd) - - #} END stream interface - -#} END W streams diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py deleted file mode 100644 index 6863e97b..00000000 --- a/lib/git/odb/utils.py +++ /dev/null @@ -1,38 +0,0 @@ -import binascii -import os -import errno - -#{ Routines - -hex_to_bin = binascii.a2b_hex -bin_to_hex = binascii.b2a_hex - -def to_hex_sha(sha): - """:return: hexified version of sha""" - if len(sha) == 40: - return sha - return bin_to_hex(sha) - -def to_bin_sha(sha): - if len(sha) == 20: - return sha - return hex_to_bin(sha) - -# errors -ENOENT = errno.ENOENT - -# os shortcuts -exists = os.path.exists -mkdir = os.mkdir -isdir = os.path.isdir -rename = os.rename -dirname = os.path.dirname -join = os.path.join -read = os.read -write = os.write -close = os.close - - -#} END Routines - - diff --git a/lib/git/repo.py b/lib/git/repo.py index 78e5f526..4442a79e 100644 --- a/lib/git/repo.py +++ b/lib/git/repo.py @@ -13,7 +13,7 @@ from objects import * from config import GitConfigParser from remote import Remote -from odb import GitObjectDB +from gitdb import GitObjectDB import os import sys diff --git a/lib/git/utils.py b/lib/git/utils.py index 
60a7de48..ab763a10 100644 --- a/lib/git/utils.py +++ b/lib/git/utils.py @@ -9,38 +9,11 @@ import sys import time import tempfile -try: - import hashlib -except ImportError: - import sha +from gitdb.util import ( + stream_copy, + make_sha + ) -def make_sha(source=''): - """ - A python2.4 workaround for the sha/hashlib module fiasco - - Note - From the dulwich project - """ - try: - return hashlib.sha1(source) - except NameError: - sha1 = sha.sha(source) - return sha1 - -def stream_copy(source, destination, chunk_size=512*1024): - """Copy all data from the source stream into the destination stream in chunks - of size chunk_size - :return: amount of bytes written""" - br = 0 - while True: - chunk = source.read(chunk_size) - destination.write(chunk) - br += len(chunk) - if len(chunk) < chunk_size: - break - # END reading output stream - return br - def join_path(a, *p): """Join path tokens together similar to os.path.join, but always use diff --git a/test/git/odb/__init__.py b/test/git/odb/__init__.py deleted file mode 100644 index 8b137891..00000000 --- a/test/git/odb/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/test/git/odb/lib.py b/test/git/odb/lib.py deleted file mode 100644 index d5199748..00000000 --- a/test/git/odb/lib.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Utilities used in ODB testing""" -from git.odb import ( - OStream, - ) -from git.odb.stream import Sha1Writer - -import zlib -from cStringIO import StringIO - -#{ Stream Utilities - -class DummyStream(object): - def __init__(self): - self.was_read = False - self.bytes = 0 - self.closed = False - - def read(self, size): - self.was_read = True - self.bytes = size - - def close(self): - self.closed = True - - def _assert(self): - assert self.was_read - - -class DeriveTest(OStream): - def __init__(self, sha, type, size, stream, *args, **kwargs): - self.myarg = kwargs.pop('myarg') - self.args = args - - def _assert(self): - assert self.args - assert self.myarg - - -class ZippedStoreShaWriter(Sha1Writer): - """Remembers everything someone writes to it""" - __slots__ = ('buf', 'zip') - def __init__(self): - Sha1Writer.__init__(self) - self.buf = StringIO() - self.zip = zlib.compressobj(1) # fastest - - def __getattr__(self, attr): - return getattr(self.buf, attr) - - def write(self, data): - alen = Sha1Writer.write(self, data) - self.buf.write(self.zip.compress(data)) - return alen - - def close(self): - self.buf.write(self.zip.flush()) - - -#} END stream utilitiess - diff --git a/test/git/odb/test_db.py b/test/git/odb/test_db.py deleted file mode 100644 index 35ba8680..00000000 --- a/test/git/odb/test_db.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Test for object db""" -from test.testlib import * -from lib import ZippedStoreShaWriter - -from git.odb import * -from git.odb.stream import Sha1Writer -from git import Blob -from git.errors import BadObject - - -from cStringIO import StringIO -import os - -class TestDB(TestBase): - """Test the different db class implementations""" - - # data - two_lines = "1234\nhello world" - - all_data = (two_lines, ) - - def _assert_object_writing(self, db): - """General tests to verify object writing, compatible to ObjectDBW - :note: requires write access to the database""" - # start in 'dry-run' mode, using a simple sha1 writer - ostreams = (ZippedStoreShaWriter, None) - for ostreamcls in ostreams: - for data in self.all_data: - dry_run = ostreamcls is not None - ostream = None - if ostreamcls is not None: - ostream = ostreamcls() - assert isinstance(ostream, Sha1Writer) - # END create ostream - - 
prev_ostream = db.set_ostream(ostream) - assert type(prev_ostream) in ostreams or prev_ostream in ostreams - - istream = IStream(Blob.type, len(data), StringIO(data)) - - # store returns same istream instance, with new sha set - my_istream = db.store(istream) - sha = istream.sha - assert my_istream is istream - assert db.has_object(sha) != dry_run - assert len(sha) == 40 # for now we require 40 byte shas as default - - # verify data - the slow way, we want to run code - if not dry_run: - info = db.info(sha) - assert Blob.type == info.type - assert info.size == len(data) - - ostream = db.stream(sha) - assert ostream.read() == data - assert ostream.type == Blob.type - assert ostream.size == len(data) - else: - self.failUnlessRaises(BadObject, db.info, sha) - self.failUnlessRaises(BadObject, db.stream, sha) - - # DIRECT STREAM COPY - # our data hase been written in object format to the StringIO - # we pasesd as output stream. No physical database representation - # was created. - # Test direct stream copy of object streams, the result must be - # identical to what we fed in - ostream.seek(0) - istream.stream = ostream - assert istream.sha is not None - prev_sha = istream.sha - - db.set_ostream(ZippedStoreShaWriter()) - db.store(istream) - assert istream.sha == prev_sha - new_ostream = db.ostream() - - # note: only works as long our store write uses the same compression - # level, which is zip - assert ostream.getvalue() == new_ostream.getvalue() - # END for each data set - # END for each dry_run mode - - @with_bare_rw_repo - def test_writing(self, rwrepo): - ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) - - # write data - self._assert_object_writing(ldb) - diff --git a/test/git/odb/test_stream.py b/test/git/odb/test_stream.py deleted file mode 100644 index 020fe6bd..00000000 --- a/test/git/odb/test_stream.py +++ /dev/null @@ -1,172 +0,0 @@ -"""Test for object db""" -from test.testlib import * -from lib import ( - DummyStream, - DeriveTest, - Sha1Writer - ) - -from git.odb import * -from git import Blob -from cStringIO import StringIO -import tempfile -import os -import zlib - - - - -class TestStream(TestBase): - """Test stream classes""" - - data_sizes = (15, 10000, 1000*1024+512) - - def test_streams(self): - # test info - sha = Blob.NULL_HEX_SHA - s = 20 - info = OInfo(sha, Blob.type, s) - assert info.sha == sha - assert info.type == Blob.type - assert info.size == s - - # test ostream - stream = DummyStream() - ostream = OStream(*(info + (stream, ))) - ostream.read(15) - stream._assert() - assert stream.bytes == 15 - ostream.read(20) - assert stream.bytes == 20 - - # derive with own args - DeriveTest(sha, Blob.type, s, stream, 'mine',myarg = 3)._assert() - - # test istream - istream = IStream(Blob.type, s, stream) - assert istream.sha == None - istream.sha = sha - assert istream.sha == sha - - assert len(istream.binsha) == 20 - assert len(istream.hexsha) == 40 - - assert istream.size == s - istream.size = s * 2 - istream.size == s * 2 - assert istream.type == Blob.type - istream.type = "something" - assert istream.type == "something" - assert istream.stream is stream - istream.stream = None - assert istream.stream is None - - assert istream.error is None - istream.error = Exception() - assert isinstance(istream.error, Exception) - - def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None): - """Make stream tests - the orig_stream is seekable, allowing it to be - rewound and reused - :param cdata: the data we expect to read from stream, the contents - 
:param rewind_stream: function called to rewind the stream to make it ready - for reuse""" - ns = 10 - assert len(cdata) > ns-1, "Data must be larger than %i, was %i" % (ns, len(cdata)) - - # read in small steps - ss = len(cdata) / ns - for i in range(ns): - data = stream.read(ss) - chunk = cdata[i*ss:(i+1)*ss] - assert data == chunk - # END for each step - rest = stream.read() - if rest: - assert rest == cdata[-len(rest):] - # END handle rest - - rewind_stream(stream) - - # read everything - rdata = stream.read() - assert rdata == cdata - - def test_decompress_reader(self): - for close_on_deletion in range(2): - for with_size in range(2): - for ds in self.data_sizes: - cdata = make_bytes(ds, randomize=False) - - # zdata = zipped actual data - # cdata = original content data - - # create reader - if with_size: - # need object data - zdata = zlib.compress(make_object(Blob.type, cdata)) - type, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion) - assert size == len(cdata) - assert type == Blob.type - else: - # here we need content data - zdata = zlib.compress(cdata) - reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata)) - assert reader._s == len(cdata) - # END get reader - - def rewind(r): - r._zip = zlib.decompressobj() - r._br = r._cws = r._cwe = 0 - if with_size: - r._parse_header_info() - # END skip header - # END make rewind func - - self._assert_stream_reader(reader, cdata, rewind) - - # put in a dummy stream for closing - dummy = DummyStream() - reader._m = dummy - - assert not dummy.closed - del(reader) - assert dummy.closed == close_on_deletion - #zdi# - # END for each datasize - # END whether size should be used - # END whether stream should be closed when deleted - - def test_sha_writer(self): - writer = Sha1Writer() - assert 2 == writer.write("hi") - assert len(writer.sha(as_hex=1)) == 40 - assert len(writer.sha(as_hex=0)) == 20 - - # make sure it does something ;) - prev_sha = writer.sha() - writer.write("hi again") - assert writer.sha() != prev_sha - - def test_compressed_writer(self): - for ds in self.data_sizes: - fd, path = tempfile.mkstemp() - ostream = FDCompressedSha1Writer(fd) - data = make_bytes(ds, randomize=False) - - # for now, just a single write, code doesn't care about chunking - assert len(data) == ostream.write(data) - ostream.close() - # its closed already - self.failUnlessRaises(OSError, os.close, fd) - - # read everything back, compare to data we zip - fd = os.open(path, os.O_RDONLY) - written_data = os.read(fd, os.path.getsize(path)) - os.close(fd) - assert written_data == zlib.compress(data, 1) # best speed - - os.remove(path) - # END for each os - - diff --git a/test/git/odb/test_utils.py b/test/git/odb/test_utils.py deleted file mode 100644 index 34572b37..00000000 --- a/test/git/odb/test_utils.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Test for object db""" -from test.testlib import * -from git import Blob -from git.odb.utils import ( - to_hex_sha, - to_bin_sha - ) - - -class TestUtils(TestBase): - def test_basics(self): - assert to_hex_sha(Blob.NULL_HEX_SHA) == Blob.NULL_HEX_SHA - assert len(to_bin_sha(Blob.NULL_HEX_SHA)) == 20 - assert to_hex_sha(to_bin_sha(Blob.NULL_HEX_SHA)) == Blob.NULL_HEX_SHA - diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py index 8f600cb3..36d0d29c 100644 --- a/test/git/performance/test_streams.py +++ b/test/git/performance/test_streams.py @@ -1,28 +1,21 @@ """Performance data streaming performance""" from test.testlib import * -from git.odb import * +from 
gitdb import * -from cStringIO import StringIO from time import time import os import sys import stat import subprocess +from gitdb.test.lib import make_memory_file from lib import ( TestBigRepoR ) -def make_memory_file(size_in_bytes, randomize=False): - """:return: tuple(size_of_stream, stream) - :param randomize: try to produce a very random stream""" - d = make_bytes(size_in_bytes, randomize) - return len(d), StringIO(d) - - class TestObjDBPerformance(TestBigRepoR): large_data_size_bytes = 1000*1000*10 # some MiB should do it @@ -30,6 +23,8 @@ class TestObjDBPerformance(TestBigRepoR): @with_bare_rw_repo def test_large_data_streaming(self, rwrepo): + # TODO: This part overlaps with the same file in gitdb.test.performance.test_stream + # It should be shared if possible ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects')) for randomize in range(2): diff --git a/test/git/test_commit.py b/test/git/test_commit.py index e65e2e59..8629e625 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -6,7 +6,7 @@ from test.testlib import * from git import * -from git.odb import IStream +from gitdb import IStream from cStringIO import StringIO import time diff --git a/test/testlib/helper.py b/test/testlib/helper.py index 457cc26c..715427c2 100644 --- a/test/testlib/helper.py +++ b/test/testlib/helper.py @@ -9,8 +9,6 @@ from git import Repo, Remote, GitCommandError from unittest import TestCase import tempfile import shutil -import random -from array import array import cStringIO GIT_REPO = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) @@ -27,24 +25,6 @@ def fixture(name): def absolute_project_path(): return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) -def make_bytes(size_in_bytes, randomize=False): - """:return: string with given size in bytes - :param randomize: try to produce a very random stream""" - actual_size = size_in_bytes / 4 - producer = xrange(actual_size) - if randomize: - producer = list(producer) - random.shuffle(producer) - # END randomize - a = array('i', producer) - return a.tostring() - - -def make_object(type, data): - """:return: bytes resembling an uncompressed object""" - odata = "blob %i\0" % len(data) - return odata + data - #} END routines #{ Adapters @@ -87,8 +67,7 @@ def with_bare_rw_repo(func): it will be removed completely. Use this if you want to make purely index based adjustments, change refs, create - heads, generally operations that do not need a working tree. - """ + heads, generally operations that do not need a working tree.""" def bare_repo_creator(self): repo_dir = tempfile.mktemp("bare_repo") rw_repo = self.rorepo.clone(repo_dir, shared=True, bare=True) |
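For completeness, a hedged sketch of the read side of the interface that moves to gitdb with this commit: per the removed docstrings, `info()` returns an `OInfo` tuple-like object carrying sha, type and size, while `stream()` returns an `OStream` that additionally supports `read()`, mirroring what `test_db.py` exercised before its removal. This continues the sketch above and again assumes an initialized submodule and a `repo.odb` attribute:

```python
# Hypothetical continuation of the earlier sketch; not part of the commit.
sha = istream.sha

info = repo.odb.info(sha)            # OInfo: accessible as .sha, .type, .size
assert info.type == "blob"
assert info.size == len(data)

ostream = repo.odb.stream(sha)       # OStream: same fields plus read()
assert ostream.read() == data
```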
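The removed `write_object()` and `LooseObjectDB.store()` describe the standard loose-object layout git itself uses: a `"<type> <size>\0"` header followed by the content, SHA-1 hashed as a whole, zlib-compressed (level 1, `Z_BEST_SPEED` in the deleted code), and filed under `objects/<sha[:2]>/<sha[2:]>`. A minimal standalone sketch of that format, independent of GitPython or gitdb:

```python
import hashlib
import os
import zlib

def loose_object_sha_and_path(objects_dir, type_name, data):
    """Return the hex sha, on-disk path and compressed bytes a loose object
    of the given type and content would get, following the header + contents
    scheme used by the removed write_object()/LooseObjectDB.store()."""
    raw = "%s %i\0%s" % (type_name, len(data), data)   # header, then contents
    hexsha = hashlib.sha1(raw).hexdigest()
    path = os.path.join(objects_dir, hexsha[:2], hexsha[2:])
    return hexsha, path, zlib.compress(raw, 1)         # 1 == Z_BEST_SPEED

hexsha, path, compressed = loose_object_sha_and_path("objects", "blob", "hello world\n")
# matches `echo "hello world" | git hash-object --stdin`
assert hexsha == "3b18e512dba79e4c8300dd08aeb37f8e728b8dad"
```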