-rw-r--r--  .gitmodules                           |    3
-rw-r--r--  lib/git/__init__.py                   |   21
-rw-r--r--  lib/git/errors.py                     |   12
m---------  lib/git/ext/gitdb                     |    0
-rw-r--r--  lib/git/objects/commit.py             |    2
-rw-r--r--  lib/git/odb/__init__.py               |    6
-rw-r--r--  lib/git/odb/db.py                     |  341
-rw-r--r--  lib/git/odb/fun.py                    |  115
-rw-r--r--  lib/git/odb/stream.py                 |  445
-rw-r--r--  lib/git/odb/utils.py                  |   38
-rw-r--r--  lib/git/repo.py                       |    2
-rw-r--r--  lib/git/utils.py                      |   35
-rw-r--r--  test/git/odb/__init__.py              |    1
-rw-r--r--  test/git/odb/lib.py                   |   60
-rw-r--r--  test/git/odb/test_db.py               |   90
-rw-r--r--  test/git/odb/test_stream.py           |  172
-rw-r--r--  test/git/odb/test_utils.py            |   15
-rw-r--r--  test/git/performance/test_streams.py  |   13
-rw-r--r--  test/git/test_commit.py               |    2
-rw-r--r--  test/testlib/helper.py                |   23
20 files changed, 33 insertions(+), 1363 deletions(-)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..4cdd431a
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib/git/ext/gitdb"]
+ path = lib/git/ext/gitdb
+ url = git://gitorious.org/git-python/gitdb.git
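The gitdb dependency is wired in as a git submodule at lib/git/ext/gitdb, so a fresh clone needs one extra step before the package imports. Below is a minimal sketch that drives that step from Python; running `git submodule update --init` directly in the working tree is equivalent, and the repository path is a placeholder.

import subprocess

def init_gitdb_submodule(repo_dir):
    # Check out the submodules recorded in .gitmodules, i.e. lib/git/ext/gitdb.
    subprocess.check_call(['git', 'submodule', 'update', '--init'], cwd=repo_dir)

init_gitdb_submodule('/path/to/GitPython')  # placeholder path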
diff --git a/lib/git/__init__.py b/lib/git/__init__.py
index 2f17c55b..6728ba82 100644
--- a/lib/git/__init__.py
+++ b/lib/git/__init__.py
@@ -5,10 +5,25 @@
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
import os
+import sys
import inspect
__version__ = 'git'
+
+#{ Initialization
+def _init_externals():
+ """Initialize external projects by putting them into the path"""
+ sys.path.append(os.path.join(os.path.dirname(__file__), 'ext'))
+
+#} END initialization
+
+#################
+_init_externals()
+#################
+
+#{ Imports
+
from git.config import GitConfigParser
from git.objects import *
from git.refs import *
@@ -22,8 +37,8 @@ from git.remote import *
from git.index import *
from git.utils import LockFile, BlockingLockFile
-# odb is NOT imported intentionally - if you really want it, you should get it
-# yourself as its part of the core
+#} END imports
__all__ = [ name for name, obj in locals().items()
- if not (name.startswith('_') or inspect.ismodule(obj)) ]
+ if not (name.startswith('_') or inspect.ismodule(obj)) ]
+
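The new _init_externals hook only extends sys.path so that the submodule checkout under lib/git/ext becomes importable before anything else in the package pulls in gitdb. A small sketch of that effect, with a placeholder path:

import os
import sys

def _init_externals(package_dir):
    # Same idea as in lib/git/__init__.py: make <package_dir>/ext importable.
    sys.path.append(os.path.join(package_dir, 'ext'))

_init_externals('/path/to/GitPython/lib/git')  # placeholder path
import gitdb  # now resolves to the submodule checkout in lib/git/ext/gitdb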
diff --git a/lib/git/errors.py b/lib/git/errors.py
index d8a35e02..6d926500 100644
--- a/lib/git/errors.py
+++ b/lib/git/errors.py
@@ -10,18 +10,6 @@ Module containing all exceptions thrown througout the git package,
class InvalidGitRepositoryError(Exception):
""" Thrown if the given repository appears to have an invalid format. """
-class ODBError(Exception):
- """All errors thrown by the object database"""
-
-class InvalidDBRoot(ODBError):
- """Thrown if an object database cannot be initialized at the given path"""
-
-class BadObject(ODBError):
- """The object with the given SHA does not exist"""
-
-class BadObjectType(ODBError):
- """The object had an unsupported type"""
-
class NoSuchPathError(OSError):
""" Thrown if a path could not be access by the system. """
diff --git a/lib/git/ext/gitdb b/lib/git/ext/gitdb
new file mode 160000
+Subproject commit 10fef8f8e4ee83cf54feadbb5ffb522efec739f
diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py
index 9a3c2c95..0d8f2c64 100644
--- a/lib/git/objects/commit.py
+++ b/lib/git/objects/commit.py
@@ -9,7 +9,7 @@ import git.diff as diff
import git.stats as stats
from git.actor import Actor
from tree import Tree
-from git.odb import IStream
+from gitdb import IStream
from cStringIO import StringIO
import base
import utils
diff --git a/lib/git/odb/__init__.py b/lib/git/odb/__init__.py
deleted file mode 100644
index 5789d7eb..00000000
--- a/lib/git/odb/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-"""Initialize the object database module"""
-
-# default imports
-from db import *
-from stream import *
-
diff --git a/lib/git/odb/db.py b/lib/git/odb/db.py
deleted file mode 100644
index 5d3cc6a3..00000000
--- a/lib/git/odb/db.py
+++ /dev/null
@@ -1,341 +0,0 @@
-"""Contains implementations of database retrieveing objects"""
-from git.utils import IndexFileSHA1Writer
-from git.errors import (
- InvalidDBRoot,
- BadObject,
- BadObjectType
- )
-
-from stream import (
- DecompressMemMapReader,
- FDCompressedSha1Writer,
- Sha1Writer,
- OStream,
- OInfo
- )
-
-from utils import (
- ENOENT,
- to_hex_sha,
- exists,
- hex_to_bin,
- isdir,
- mkdir,
- rename,
- dirname,
- join
- )
-
-from fun import (
- chunk_size,
- loose_object_header_info,
- write_object,
- stream_copy
- )
-
-import tempfile
-import mmap
-import os
-
-
-__all__ = ('ObjectDBR', 'ObjectDBW', 'FileDBBase', 'LooseObjectDB', 'PackedDB',
- 'CompoundDB', 'ReferenceDB', 'GitObjectDB' )
-
-class ObjectDBR(object):
- """Defines an interface for object database lookup.
- Objects are identified either by hex-sha (40 bytes) or
- by sha (20 bytes)"""
-
- def __contains__(self, sha):
- return self.has_obj
-
- #{ Query Interface
- def has_object(self, sha):
- """
- :return: True if the object identified by the given 40 byte hexsha or 20 bytes
- binary sha is contained in the database
- :raise BadObject:"""
- raise NotImplementedError("To be implemented in subclass")
-
- def info(self, sha):
- """ :return: OInfo instance
- :param sha: 40 bytes hexsha or 20 bytes binary sha
- :raise BadObject:"""
- raise NotImplementedError("To be implemented in subclass")
-
- def info_async(self, input_channel):
- """Retrieve information of a multitude of objects asynchronously
- :param input_channel: Channel yielding the sha's of the objects of interest
- :return: Channel yielding OInfo|InvalidOInfo, in any order"""
- raise NotImplementedError("To be implemented in subclass")
-
- def stream(self, sha):
- """:return: OStream instance
- :param sha: 40 bytes hexsha or 20 bytes binary sha
- :raise BadObject:"""
- raise NotImplementedError("To be implemented in subclass")
-
- def stream_async(self, input_channel):
- """Retrieve the OStream of multiple objects
- :param input_channel: see ``info``
- :param max_threads: see ``ObjectDBW.store``
- :return: Channel yielding OStream|InvalidOStream instances in any order"""
- raise NotImplementedError("To be implemented in subclass")
-
- #} END query interface
-
-class ObjectDBW(object):
- """Defines an interface to create objects in the database"""
-
- def __init__(self, *args, **kwargs):
- self._ostream = None
-
- #{ Edit Interface
- def set_ostream(self, stream):
- """Adjusts the stream to which all data should be sent when storing new objects
- :param stream: if not None, the stream to use, if None the default stream
- will be used.
- :return: previously installed stream, or None if there was no override
- :raise TypeError: if the stream doesn't have the supported functionality"""
- cstream = self._ostream
- self._ostream = stream
- return cstream
-
- def ostream(self):
- """:return: overridden output stream this instance will write to, or None
- if it will write to the default stream"""
- return self._ostream
-
- def store(self, istream):
- """Create a new object in the database
- :return: the input istream object with its sha set to its corresponding value
- :param istream: IStream compatible instance. If its sha is already set
- to a value, the object will just be stored in the our database format,
- in which case the input stream is expected to be in object format ( header + contents ).
- :raise IOError: if data could not be written"""
- raise NotImplementedError("To be implemented in subclass")
-
- def store_async(self, input_channel):
- """Create multiple new objects in the database asynchronously. The method will
- return right away, returning an output channel which receives the results as
- they are computed.
-
- :return: Channel yielding your IStream which served as input, in any order.
- The IStreams sha will be set to the sha it received during the process,
- or its error attribute will be set to the exception informing about the error.
- :param input_channel: Channel yielding IStream instance.
- As the same instances will be used in the output channel, you can create a map
- between the id(istream) -> istream
- :note:As some ODB implementations implement this operation as atomic, they might
- abort the whole operation if one item could not be processed. Hence check how
- many items have actually been produced."""
- raise NotImplementedError("To be implemented in subclass")
-
- #} END edit interface
-
-
-class FileDBBase(object):
- """Provides basic facilities to retrieve files of interest, including
- caching facilities to help mapping hexsha's to objects"""
-
- def __init__(self, root_path):
- """Initialize this instance to look for its files at the given root path
- All subsequent operations will be relative to this path
- :raise InvalidDBRoot:
- :note: The base will not perform any accessablity checking as the base
- might not yet be accessible, but become accessible before the first
- access."""
- super(FileDBBase, self).__init__()
- self._root_path = root_path
-
-
- #{ Interface
- def root_path(self):
- """:return: path at which this db operates"""
- return self._root_path
-
- def db_path(self, rela_path):
- """
- :return: the given relative path relative to our database root, allowing
- to pontentially access datafiles"""
- return join(self._root_path, rela_path)
- #} END interface
-
-
-
-class LooseObjectDB(FileDBBase, ObjectDBR, ObjectDBW):
- """A database which operates on loose object files"""
-
- # CONFIGURATION
- # chunks in which data will be copied between streams
- stream_chunk_size = chunk_size
-
-
- def __init__(self, root_path):
- super(LooseObjectDB, self).__init__(root_path)
- self._hexsha_to_file = dict()
- # Additional Flags - might be set to 0 after the first failure
- # Depending on the root, this might work for some mounts, for others not, which
- # is why it is per instance
- self._fd_open_flags = getattr(os, 'O_NOATIME', 0)
-
- #{ Interface
- def object_path(self, hexsha):
- """
- :return: path at which the object with the given hexsha would be stored,
- relative to the database root"""
- return join(hexsha[:2], hexsha[2:])
-
- def readable_db_object_path(self, hexsha):
- """
- :return: readable object path to the object identified by hexsha
- :raise BadObject: If the object file does not exist"""
- try:
- return self._hexsha_to_file[hexsha]
- except KeyError:
- pass
- # END ignore cache misses
-
- # try filesystem
- path = self.db_path(self.object_path(hexsha))
- if exists(path):
- self._hexsha_to_file[hexsha] = path
- return path
- # END handle cache
- raise BadObject(hexsha)
-
- #} END interface
-
- def _map_loose_object(self, sha):
- """
- :return: memory map of that file to allow random read access
- :raise BadObject: if object could not be located"""
- db_path = self.db_path(self.object_path(to_hex_sha(sha)))
- try:
- fd = os.open(db_path, os.O_RDONLY|self._fd_open_flags)
- except OSError,e:
- if e.errno != ENOENT:
- # try again without noatime
- try:
- fd = os.open(db_path, os.O_RDONLY)
- except OSError:
- raise BadObject(to_hex_sha(sha))
- # didn't work because of our flag, don't try it again
- self._fd_open_flags = 0
- else:
- raise BadObject(to_hex_sha(sha))
- # END handle error
- # END exception handling
- try:
- return mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
- finally:
- os.close(fd)
- # END assure file is closed
-
- def set_ostream(self, stream):
- """:raise TypeError: if the stream does not support the Sha1Writer interface"""
- if stream is not None and not isinstance(stream, Sha1Writer):
- raise TypeError("Output stream musst support the %s interface" % Sha1Writer.__name__)
- return super(LooseObjectDB, self).set_ostream(stream)
-
- def info(self, sha):
- m = self._map_loose_object(sha)
- try:
- type, size = loose_object_header_info(m)
- return OInfo(sha, type, size)
- finally:
- m.close()
- # END assure release of system resources
-
- def stream(self, sha):
- m = self._map_loose_object(sha)
- type, size, stream = DecompressMemMapReader.new(m, close_on_deletion = True)
- return OStream(sha, type, size, stream)
-
- def has_object(self, sha):
- try:
- self.readable_db_object_path(to_hex_sha(sha))
- return True
- except BadObject:
- return False
- # END check existance
-
- def store(self, istream):
- """note: The sha we produce will be hex by nature"""
- tmp_path = None
- writer = self.ostream()
- if writer is None:
- # open a tmp file to write the data to
- fd, tmp_path = tempfile.mkstemp(prefix='obj', dir=self._root_path)
- writer = FDCompressedSha1Writer(fd)
- # END handle custom writer
-
- try:
- try:
- if istream.sha is not None:
- stream_copy(istream.read, writer.write, istream.size, self.stream_chunk_size)
- else:
- # write object with header, we have to make a new one
- write_object(istream.type, istream.size, istream.read, writer.write,
- chunk_size=self.stream_chunk_size)
- # END handle direct stream copies
- except:
- if tmp_path:
- os.remove(tmp_path)
- raise
- # END assure tmpfile removal on error
- finally:
- if tmp_path:
- writer.close()
- # END assure target stream is closed
-
- sha = istream.sha or writer.sha(as_hex=True)
-
- if tmp_path:
- obj_path = self.db_path(self.object_path(sha))
- obj_dir = dirname(obj_path)
- if not isdir(obj_dir):
- mkdir(obj_dir)
- # END handle destination directory
- rename(tmp_path, obj_path)
- # END handle dry_run
-
- istream.sha = sha
- return istream
-
-
-class PackedDB(FileDBBase, ObjectDBR):
- """A database operating on a set of object packs"""
-
-
-class CompoundDB(ObjectDBR):
- """A database which delegates calls to sub-databases"""
-
-
-class ReferenceDB(CompoundDB):
- """A database consisting of database referred to in a file"""
-
-
-#class GitObjectDB(CompoundDB, ObjectDBW):
-class GitObjectDB(LooseObjectDB):
- """A database representing the default git object store, which includes loose
- objects, pack files and an alternates file
-
- It will create objects only in the loose object database.
- :note: for now, we use the git command to do all the lookup, just until he
- have packs and the other implementations
- """
- def __init__(self, root_path, git):
- """Initialize this instance with the root and a git command"""
- super(GitObjectDB, self).__init__(root_path)
- self._git = git
-
- def info(self, sha):
- t = self._git.get_object_header(sha)
- return OInfo(*t)
-
- def stream(self, sha):
- """For now, all lookup is done by git itself"""
- t = self._git.stream_object_data(sha)
- return OStream(*t)
-
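Everything deleted above now ships with gitdb; the write path is unchanged in spirit. Here is a sketch of the same store() round trip against a loose object database, based on the interface shown in the removed module; the objects path is a placeholder.

from cStringIO import StringIO
from gitdb import LooseObjectDB, IStream

ldb = LooseObjectDB('/path/to/repo/.git/objects')  # placeholder path
data = '1234\nhello world'
istream = ldb.store(IStream('blob', len(data), StringIO(data)))  # sets istream.sha

assert ldb.has_object(istream.sha)
info = ldb.info(istream.sha)        # OInfo tuple: (sha, type, size)
assert info.type == 'blob' and info.size == len(data)
assert ldb.stream(istream.sha).read() == data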
diff --git a/lib/git/odb/fun.py b/lib/git/odb/fun.py
deleted file mode 100644
index 3321a8ea..00000000
--- a/lib/git/odb/fun.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""Contains basic c-functions which usually contain performance critical code
-Keeping this code separate from the beginning makes it easier to out-source
-it into c later, if required"""
-
-from git.errors import (
- BadObjectType
- )
-
-import zlib
-decompressobj = zlib.decompressobj
-
-
-# INVARIANTS
-type_id_to_type_map = {
- 1 : "commit",
- 2 : "tree",
- 3 : "blob",
- 4 : "tag"
- }
-
-# used when dealing with larger streams
-chunk_size = 1000*1000
-
-__all__ = ('is_loose_object', 'loose_object_header_info', 'object_header_info',
- 'write_object' )
-
-#{ Routines
-
-def is_loose_object(m):
- """:return: True the file contained in memory map m appears to be a loose object.
- Only the first two bytes are needed"""
- b0, b1 = map(ord, m[:2])
- word = (b0 << 8) + b1
- return b0 == 0x78 and (word % 31) == 0
-
-def loose_object_header_info(m):
- """:return: tuple(type_string, uncompressed_size_in_bytes) the type string of the
- object as well as its uncompressed size in bytes.
- :param m: memory map from which to read the compressed object data"""
- decompress_size = 8192 # is used in cgit as well
- hdr = decompressobj().decompress(m, decompress_size)
- type_name, size = hdr[:hdr.find("\0")].split(" ")
- return type_name, int(size)
-
-def object_header_info(m):
- """:return: tuple(type_string, uncompressed_size_in_bytes
- :param mmap: mapped memory map. It will be
- seeked to the actual start of the object contents, which can be used
- to initialize a zlib decompress object.
- :note: This routine can only handle new-style objects which are assumably contained
- in packs
- """
- assert not is_loose_object(m), "Use loose_object_header_info instead"
-
- c = b0 # first byte
- i = 1 # next char to read
- type_id = (c >> 4) & 7 # numeric type
- size = c & 15 # starting size
- s = 4 # starting bit-shift size
- while c & 0x80:
- c = ord(m[i])
- i += 1
- size += (c & 0x7f) << s
- s += 7
- # END character loop
-
- # finally seek the map to the start of the data stream
- m.seek(i)
- try:
- return (type_id_to_type_map[type_id], size)
- except KeyError:
- # invalid object type - we could try to be smart now and decode part
- # of the stream to get the info, problem is that we had trouble finding
- # the exact start of the content stream
- raise BadObjectType(type_id)
- # END handle exceptions
-
-def write_object(type, size, read, write, chunk_size=chunk_size):
- """Write the object as identified by type, size and source_stream into the
- target_stream
-
- :param type: type string of the object
- :param size: amount of bytes to write from source_stream
- :param read: read method of a stream providing the content data
- :param write: write method of the output stream
- :param close_target_stream: if True, the target stream will be closed when
- the routine exits, even if an error is thrown
- :return: The actual amount of bytes written to stream, which includes the header and a trailing newline"""
- tbw = 0 # total num bytes written
-
- # WRITE HEADER: type SP size NULL
- tbw += write("%s %i\0" % (type, size))
- tbw += stream_copy(read, write, size, chunk_size)
-
- return tbw
-
-def stream_copy(read, write, size, chunk_size):
- """Copy a stream up to size bytes using the provided read and write methods,
- in chunks of chunk_size
- :note: its much like stream_copy utility, but operates just using methods"""
- dbw = 0 # num data bytes written
-
- # WRITE ALL DATA UP TO SIZE
- while True:
- cs = min(chunk_size, size-dbw)
- data_len = write(read(cs))
- dbw += data_len
- if data_len < cs or dbw == size:
- break
- # END check for stream end
- # END duplicate data
- return dbw
-
-
-#} END routines
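The removed helpers implement git's loose-object framing: a "<type> <size>\0" header followed by the payload, all zlib-compressed. A self-contained sketch of the same encode/decode pair, mirroring write_object() and loose_object_header_info():

import zlib

def make_loose_object(type_name, data):
    # Header plus contents, compressed, as write_object() fed to the writer.
    return zlib.compress('%s %i\0' % (type_name, len(data)) + data)

def parse_loose_header(compressed):
    # Decompress just enough to read the header, as loose_object_header_info() did.
    hdr = zlib.decompressobj().decompress(compressed, 8192)
    type_name, size = hdr[:hdr.find('\0')].split(' ')
    return type_name, int(size)

assert parse_loose_header(make_loose_object('blob', 'hello')) == ('blob', 5)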
diff --git a/lib/git/odb/stream.py b/lib/git/odb/stream.py
deleted file mode 100644
index da97cf5b..00000000
--- a/lib/git/odb/stream.py
+++ /dev/null
@@ -1,445 +0,0 @@
-import zlib
-from cStringIO import StringIO
-from git.utils import make_sha
-import errno
-
-from utils import (
- to_hex_sha,
- to_bin_sha,
- write,
- close
- )
-
-__all__ = ('OInfo', 'OStream', 'IStream', 'InvalidOInfo', 'InvalidOStream',
- 'DecompressMemMapReader', 'FDCompressedSha1Writer')
-
-
-# ZLIB configuration
-# used when compressing objects - 1 to 9 ( slowest )
-Z_BEST_SPEED = 1
-
-
-#{ ODB Bases
-
-class OInfo(tuple):
- """Carries information about an object in an ODB, provdiing information
- about the sha of the object, the type_string as well as the uncompressed size
- in bytes.
-
- It can be accessed using tuple notation and using attribute access notation::
-
- assert dbi[0] == dbi.sha
- assert dbi[1] == dbi.type
- assert dbi[2] == dbi.size
-
- The type is designed to be as lighteight as possible."""
- __slots__ = tuple()
-
- def __new__(cls, sha, type, size):
- return tuple.__new__(cls, (sha, type, size))
-
- def __init__(self, *args):
- tuple.__init__(self)
-
- #{ Interface
- @property
- def sha(self):
- return self[0]
-
- @property
- def type(self):
- return self[1]
-
- @property
- def size(self):
- return self[2]
- #} END interface
-
-
-class OStream(OInfo):
- """Base for object streams retrieved from the database, providing additional
- information about the stream.
- Generally, ODB streams are read-only as objects are immutable"""
- __slots__ = tuple()
-
- def __new__(cls, sha, type, size, stream, *args, **kwargs):
- """Helps with the initialization of subclasses"""
- return tuple.__new__(cls, (sha, type, size, stream))
-
-
- def __init__(self, *args, **kwargs):
- tuple.__init__(self)
-
- #{ Stream Reader Interface
-
- def read(self, size=-1):
- return self[3].read(size)
-
- #} END stream reader interface
-
-
-class IStream(list):
- """Represents an input content stream to be fed into the ODB. It is mutable to allow
- the ODB to record information about the operations outcome right in this instance.
-
- It provides interfaces for the OStream and a StreamReader to allow the instance
- to blend in without prior conversion.
-
- The only method your content stream must support is 'read'"""
- __slots__ = tuple()
-
- def __new__(cls, type, size, stream, sha=None):
- return list.__new__(cls, (sha, type, size, stream, None))
-
- def __init__(self, type, size, stream, sha=None):
- list.__init__(self, (sha, type, size, stream, None))
-
- #{ Interface
-
- @property
- def hexsha(self):
- """:return: our sha, hex encoded, 40 bytes"""
- return to_hex_sha(self[0])
-
- @property
- def binsha(self):
- """:return: our sha as binary, 20 bytes"""
- return to_bin_sha(self[0])
-
- def _error(self):
- """:return: the error that occurred when processing the stream, or None"""
- return self[4]
-
- def _set_error(self, exc):
- """Set this input stream to the given exc, may be None to reset the error"""
- self[4] = exc
-
- error = property(_error, _set_error)
-
- #} END interface
-
- #{ Stream Reader Interface
-
- def read(self, size=-1):
- """Implements a simple stream reader interface, passing the read call on
- to our internal stream"""
- return self[3].read(size)
-
- #} END stream reader interface
-
- #{ interface
-
- def _set_sha(self, sha):
- self[0] = sha
-
- def _sha(self):
- return self[0]
-
- sha = property(_sha, _set_sha)
-
-
- def _type(self):
- return self[1]
-
- def _set_type(self, type):
- self[1] = type
-
- type = property(_type, _set_type)
-
- def _size(self):
- return self[2]
-
- def _set_size(self, size):
- self[2] = size
-
- size = property(_size, _set_size)
-
- def _stream(self):
- return self[3]
-
- def _set_stream(self, stream):
- self[3] = stream
-
- stream = property(_stream, _set_stream)
-
- #} END odb info interface
-
-
-class InvalidOInfo(tuple):
- """Carries information about a sha identifying an object which is invalid in
- the queried database. The exception attribute provides more information about
- the cause of the issue"""
- __slots__ = tuple()
-
- def __new__(cls, sha, exc):
- return tuple.__new__(cls, (sha, exc))
-
- def __init__(self, sha, exc):
- tuple.__init__(self, (sha, exc))
-
- @property
- def sha(self):
- return self[0]
-
- @property
- def error(self):
- """:return: exception instance explaining the failure"""
- return self[1]
-
-
-class InvalidOStream(InvalidOInfo):
- """Carries information about an invalid ODB stream"""
- __slots__ = tuple()
-
-#} END ODB Bases
-
-
-#{ RO Streams
-
-class DecompressMemMapReader(object):
- """Reads data in chunks from a memory map and decompresses it. The client sees
- only the uncompressed data, respective file-like read calls are handling on-demand
- buffered decompression accordingly
-
- A constraint on the total size of bytes is activated, simulating
- a logical file within a possibly larger physical memory area
-
- To read efficiently, you clearly don't want to read individual bytes, instead,
- read a few kilobytes at least.
-
- :note: The chunk-size should be carefully selected as it will involve quite a bit
- of string copying due to the way the zlib is implemented. Its very wasteful,
- hence we try to find a good tradeoff between allocation time and number of
- times we actually allocate. An own zlib implementation would be good here
- to better support streamed reading - it would only need to keep the mmap
- and decompress it into chunks, thats all ... """
- __slots__ = ('_m', '_zip', '_buf', '_buflen', '_br', '_cws', '_cwe', '_s', '_close')
-
- max_read_size = 512*1024 # currently unused
-
- def __init__(self, m, close_on_deletion, size):
- """Initialize with mmap for stream reading
- :param m: must be content data - use new if you have object data and no size"""
- self._m = m
- self._zip = zlib.decompressobj()
- self._buf = None # buffer of decompressed bytes
- self._buflen = 0 # length of bytes in buffer
- self._s = size # size of uncompressed data to read in total
- self._br = 0 # num uncompressed bytes read
- self._cws = 0 # start byte of compression window
- self._cwe = 0 # end byte of compression window
- self._close = close_on_deletion # close the memmap on deletion ?
-
- def __del__(self):
- if self._close:
- self._m.close()
- # END handle resource freeing
-
- def _parse_header_info(self):
- """If this stream contains object data, parse the header info and skip the
- stream to a point where each read will yield object content
- :return: parsed type_string, size"""
- # read header
- maxb = 512 # should really be enough, cgit uses 8192 I believe
- self._s = maxb
- hdr = self.read(maxb)
- hdrend = hdr.find("\0")
- type, size = hdr[:hdrend].split(" ")
- size = int(size)
- self._s = size
-
- # adjust internal state to match actual header length that we ignore
- # The buffer will be depleted first on future reads
- self._br = 0
- hdrend += 1 # count terminating \0
- self._buf = StringIO(hdr[hdrend:])
- self._buflen = len(hdr) - hdrend
-
- return type, size
-
- @classmethod
- def new(self, m, close_on_deletion=False):
- """Create a new DecompressMemMapReader instance for acting as a read-only stream
- This method parses the object header from m and returns the parsed
- type and size, as well as the created stream instance.
- :param m: memory map on which to oparate. It must be object data ( header + contents )
- :param close_on_deletion: if True, the memory map will be closed once we are
- being deleted"""
- inst = DecompressMemMapReader(m, close_on_deletion, 0)
- type, size = inst._parse_header_info()
- return type, size, inst
-
- def read(self, size=-1):
- if size < 1:
- size = self._s - self._br
- else:
- size = min(size, self._s - self._br)
- # END clamp size
-
- if size == 0:
- return str()
- # END handle depletion
-
- # protect from memory peaks
- # If he tries to read large chunks, our memory patterns get really bad
- # as we end up copying a possibly huge chunk from our memory map right into
- # memory. This might not even be possible. Nonetheless, try to dampen the
- # effect a bit by reading in chunks, returning a huge string in the end.
- # Our performance now depends on StringIO. This way we don't need two large
- # buffers in peak times, but only one large one in the end which is
- # the return buffer
- # NO: We don't do it - if the user thinks its best, he is right. If he
- # has trouble, he will start reading in chunks. According to our tests
- # its still faster if we read 10 Mb at once instead of chunking it.
-
- # if size > self.max_read_size:
- # sio = StringIO()
- # while size:
- # read_size = min(self.max_read_size, size)
- # data = self.read(read_size)
- # sio.write(data)
- # size -= len(data)
- # if len(data) < read_size:
- # break
- # # END data loop
- # sio.seek(0)
- # return sio.getvalue()
- # # END handle maxread
- #
- # deplete the buffer, then just continue using the decompress object
- # which has an own buffer. We just need this to transparently parse the
- # header from the zlib stream
- dat = str()
- if self._buf:
- if self._buflen >= size:
- # have enough data
- dat = self._buf.read(size)
- self._buflen -= size
- self._br += size
- return dat
- else:
- dat = self._buf.read() # ouch, duplicates data
- size -= self._buflen
- self._br += self._buflen
-
- self._buflen = 0
- self._buf = None
- # END handle buffer len
- # END handle buffer
-
- # decompress some data
- # Abstract: zlib needs to operate on chunks of our memory map ( which may
- # be large ), as it will otherwise and always fill in the 'unconsumed_tail'
- # attribute which possible reads our whole map to the end, forcing
- # everything to be read from disk even though just a portion was requested.
- # As this would be a nogo, we workaround it by passing only chunks of data,
- # moving the window into the memory map along as we decompress, which keeps
- # the tail smaller than our chunk-size. This causes 'only' the chunk to be
- # copied once, and another copy of a part of it when it creates the unconsumed
- # tail. We have to use it to hand in the appropriate amount of bytes durin g
- # the next read.
- tail = self._zip.unconsumed_tail
- if tail:
- # move the window, make it as large as size demands. For code-clarity,
- # we just take the chunk from our map again instead of reusing the unconsumed
- # tail. The latter one would safe some memory copying, but we could end up
- # with not getting enough data uncompressed, so we had to sort that out as well.
- # Now we just assume the worst case, hence the data is uncompressed and the window
- # needs to be as large as the uncompressed bytes we want to read.
- self._cws = self._cwe - len(tail)
- self._cwe = self._cws + size
- else:
- cws = self._cws
- self._cws = self._cwe
- self._cwe = cws + size
- # END handle tail
-
-
- # if window is too small, make it larger so zip can decompress something
- win_size = self._cwe - self._cws
- if win_size < 8:
- self._cwe = self._cws + 8
- # END adjust winsize
- indata = self._m[self._cws:self._cwe] # another copy ... :(
-
- # get the actual window end to be sure we don't use it for computations
- self._cwe = self._cws + len(indata)
-
- dcompdat = self._zip.decompress(indata, size)
-
- self._br += len(dcompdat)
- if dat:
- dcompdat = dat + dcompdat
-
- return dcompdat
-
-#} END RO streams
-
-
-#{ W Streams
-
-class Sha1Writer(object):
- """Simple stream writer which produces a sha whenever you like as it degests
- everything it is supposed to write"""
- __slots__ = "sha1"
-
- def __init__(self):
- self.sha1 = make_sha("")
-
- #{ Stream Interface
-
- def write(self, data):
- """:raise IOError: If not all bytes could be written
- :return: lenght of incoming data"""
- self.sha1.update(data)
- return len(data)
-
- # END stream interface
-
- #{ Interface
-
- def sha(self, as_hex = False):
- """:return: sha so far
- :param as_hex: if True, sha will be hex-encoded, binary otherwise"""
- if as_hex:
- return self.sha1.hexdigest()
- return self.sha1.digest()
-
- #} END interface
-
-class FDCompressedSha1Writer(Sha1Writer):
- """Digests data written to it, making the sha available, then compress the
- data and write it to the file descriptor
- :note: operates on raw file descriptors
- :note: for this to work, you have to use the close-method of this instance"""
- __slots__ = ("fd", "sha1", "zip")
-
- # default exception
- exc = IOError("Failed to write all bytes to filedescriptor")
-
- def __init__(self, fd):
- super(FDCompressedSha1Writer, self).__init__()
- self.fd = fd
- self.zip = zlib.compressobj(Z_BEST_SPEED)
-
- #{ Stream Interface
-
- def write(self, data):
- """:raise IOError: If not all bytes could be written
- :return: lenght of incoming data"""
- self.sha1.update(data)
- cdata = self.zip.compress(data)
- bytes_written = write(self.fd, cdata)
- if bytes_written != len(cdata):
- raise self.exc
- return len(data)
-
- def close(self):
- remainder = self.zip.flush()
- if write(self.fd, remainder) != len(remainder):
- raise self.exc
- return close(self.fd)
-
- #} END stream interface
-
-#} END W streams
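Sha1Writer, also moved to gitdb, simply digests whatever passes through it; for a loose object that digest is the object's name. A plain-hashlib sketch of the value it produces:

import hashlib

def loose_object_sha(type_name, data):
    # Hex sha1 over '<type> <size>\0<data>', the id git assigns the object.
    sha1 = hashlib.sha1()
    sha1.update('%s %i\0' % (type_name, len(data)))
    sha1.update(data)
    return sha1.hexdigest()

# The well-known sha of the empty blob:
assert loose_object_sha('blob', '') == 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'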
diff --git a/lib/git/odb/utils.py b/lib/git/odb/utils.py
deleted file mode 100644
index 6863e97b..00000000
--- a/lib/git/odb/utils.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import binascii
-import os
-import errno
-
-#{ Routines
-
-hex_to_bin = binascii.a2b_hex
-bin_to_hex = binascii.b2a_hex
-
-def to_hex_sha(sha):
- """:return: hexified version of sha"""
- if len(sha) == 40:
- return sha
- return bin_to_hex(sha)
-
-def to_bin_sha(sha):
- if len(sha) == 20:
- return sha
- return hex_to_bin(sha)
-
-# errors
-ENOENT = errno.ENOENT
-
-# os shortcuts
-exists = os.path.exists
-mkdir = os.mkdir
-isdir = os.path.isdir
-rename = os.rename
-dirname = os.path.dirname
-join = os.path.join
-read = os.read
-write = os.write
-close = os.close
-
-
-#} END Routines
-
-
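The deleted conversion helpers are thin wrappers over binascii; the same pair is now expected from gitdb.util. A standalone sketch:

import binascii

def to_hex_sha(sha):
    # Accept a 40-byte hex or 20-byte binary sha, return the hex form.
    return sha if len(sha) == 40 else binascii.b2a_hex(sha)

def to_bin_sha(sha):
    # Accept a 40-byte hex or 20-byte binary sha, return the binary form.
    return sha if len(sha) == 20 else binascii.a2b_hex(sha)

null_hex_sha = '0' * 40
assert to_hex_sha(to_bin_sha(null_hex_sha)) == null_hex_sha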
diff --git a/lib/git/repo.py b/lib/git/repo.py
index 78e5f526..4442a79e 100644
--- a/lib/git/repo.py
+++ b/lib/git/repo.py
@@ -13,7 +13,7 @@ from objects import *
from config import GitConfigParser
from remote import Remote
-from odb import GitObjectDB
+from gitdb import GitObjectDB
import os
import sys
diff --git a/lib/git/utils.py b/lib/git/utils.py
index 60a7de48..ab763a10 100644
--- a/lib/git/utils.py
+++ b/lib/git/utils.py
@@ -9,38 +9,11 @@ import sys
import time
import tempfile
-try:
- import hashlib
-except ImportError:
- import sha
+from gitdb.util import (
+ stream_copy,
+ make_sha
+ )
-def make_sha(source=''):
- """
- A python2.4 workaround for the sha/hashlib module fiasco
-
- Note
- From the dulwich project
- """
- try:
- return hashlib.sha1(source)
- except NameError:
- sha1 = sha.sha(source)
- return sha1
-
-def stream_copy(source, destination, chunk_size=512*1024):
- """Copy all data from the source stream into the destination stream in chunks
- of size chunk_size
- :return: amount of bytes written"""
- br = 0
- while True:
- chunk = source.read(chunk_size)
- destination.write(chunk)
- br += len(chunk)
- if len(chunk) < chunk_size:
- break
- # END reading output stream
- return br
-
def join_path(a, *p):
"""Join path tokens together similar to os.path.join, but always use
diff --git a/test/git/odb/__init__.py b/test/git/odb/__init__.py
deleted file mode 100644
index 8b137891..00000000
--- a/test/git/odb/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/test/git/odb/lib.py b/test/git/odb/lib.py
deleted file mode 100644
index d5199748..00000000
--- a/test/git/odb/lib.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Utilities used in ODB testing"""
-from git.odb import (
- OStream,
- )
-from git.odb.stream import Sha1Writer
-
-import zlib
-from cStringIO import StringIO
-
-#{ Stream Utilities
-
-class DummyStream(object):
- def __init__(self):
- self.was_read = False
- self.bytes = 0
- self.closed = False
-
- def read(self, size):
- self.was_read = True
- self.bytes = size
-
- def close(self):
- self.closed = True
-
- def _assert(self):
- assert self.was_read
-
-
-class DeriveTest(OStream):
- def __init__(self, sha, type, size, stream, *args, **kwargs):
- self.myarg = kwargs.pop('myarg')
- self.args = args
-
- def _assert(self):
- assert self.args
- assert self.myarg
-
-
-class ZippedStoreShaWriter(Sha1Writer):
- """Remembers everything someone writes to it"""
- __slots__ = ('buf', 'zip')
- def __init__(self):
- Sha1Writer.__init__(self)
- self.buf = StringIO()
- self.zip = zlib.compressobj(1) # fastest
-
- def __getattr__(self, attr):
- return getattr(self.buf, attr)
-
- def write(self, data):
- alen = Sha1Writer.write(self, data)
- self.buf.write(self.zip.compress(data))
- return alen
-
- def close(self):
- self.buf.write(self.zip.flush())
-
-
-#} END stream utilitiess
-
diff --git a/test/git/odb/test_db.py b/test/git/odb/test_db.py
deleted file mode 100644
index 35ba8680..00000000
--- a/test/git/odb/test_db.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""Test for object db"""
-from test.testlib import *
-from lib import ZippedStoreShaWriter
-
-from git.odb import *
-from git.odb.stream import Sha1Writer
-from git import Blob
-from git.errors import BadObject
-
-
-from cStringIO import StringIO
-import os
-
-class TestDB(TestBase):
- """Test the different db class implementations"""
-
- # data
- two_lines = "1234\nhello world"
-
- all_data = (two_lines, )
-
- def _assert_object_writing(self, db):
- """General tests to verify object writing, compatible to ObjectDBW
- :note: requires write access to the database"""
- # start in 'dry-run' mode, using a simple sha1 writer
- ostreams = (ZippedStoreShaWriter, None)
- for ostreamcls in ostreams:
- for data in self.all_data:
- dry_run = ostreamcls is not None
- ostream = None
- if ostreamcls is not None:
- ostream = ostreamcls()
- assert isinstance(ostream, Sha1Writer)
- # END create ostream
-
- prev_ostream = db.set_ostream(ostream)
- assert type(prev_ostream) in ostreams or prev_ostream in ostreams
-
- istream = IStream(Blob.type, len(data), StringIO(data))
-
- # store returns same istream instance, with new sha set
- my_istream = db.store(istream)
- sha = istream.sha
- assert my_istream is istream
- assert db.has_object(sha) != dry_run
- assert len(sha) == 40 # for now we require 40 byte shas as default
-
- # verify data - the slow way, we want to run code
- if not dry_run:
- info = db.info(sha)
- assert Blob.type == info.type
- assert info.size == len(data)
-
- ostream = db.stream(sha)
- assert ostream.read() == data
- assert ostream.type == Blob.type
- assert ostream.size == len(data)
- else:
- self.failUnlessRaises(BadObject, db.info, sha)
- self.failUnlessRaises(BadObject, db.stream, sha)
-
- # DIRECT STREAM COPY
- # our data hase been written in object format to the StringIO
- # we pasesd as output stream. No physical database representation
- # was created.
- # Test direct stream copy of object streams, the result must be
- # identical to what we fed in
- ostream.seek(0)
- istream.stream = ostream
- assert istream.sha is not None
- prev_sha = istream.sha
-
- db.set_ostream(ZippedStoreShaWriter())
- db.store(istream)
- assert istream.sha == prev_sha
- new_ostream = db.ostream()
-
- # note: only works as long our store write uses the same compression
- # level, which is zip
- assert ostream.getvalue() == new_ostream.getvalue()
- # END for each data set
- # END for each dry_run mode
-
- @with_bare_rw_repo
- def test_writing(self, rwrepo):
- ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects'))
-
- # write data
- self._assert_object_writing(ldb)
-
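The deleted test also covered the "dry run" path: installing a Sha1Writer via set_ostream() computes the sha without persisting anything. A sketch under the assumption that Sha1Writer is importable from gitdb.stream, mirroring the old git.odb.stream layout; paths are placeholders.

from cStringIO import StringIO
from gitdb import LooseObjectDB, IStream
from gitdb.stream import Sha1Writer   # assumed to live where git.odb.stream did

ldb = LooseObjectDB('/path/to/repo/.git/objects')  # placeholder path
ldb.set_ostream(Sha1Writer())                      # divert writes; nothing hits disk
istream = ldb.store(IStream('blob', 5, StringIO('hello')))
assert len(istream.sha) == 40 and not ldb.has_object(istream.sha)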
diff --git a/test/git/odb/test_stream.py b/test/git/odb/test_stream.py
deleted file mode 100644
index 020fe6bd..00000000
--- a/test/git/odb/test_stream.py
+++ /dev/null
@@ -1,172 +0,0 @@
-"""Test for object db"""
-from test.testlib import *
-from lib import (
- DummyStream,
- DeriveTest,
- Sha1Writer
- )
-
-from git.odb import *
-from git import Blob
-from cStringIO import StringIO
-import tempfile
-import os
-import zlib
-
-
-
-
-class TestStream(TestBase):
- """Test stream classes"""
-
- data_sizes = (15, 10000, 1000*1024+512)
-
- def test_streams(self):
- # test info
- sha = Blob.NULL_HEX_SHA
- s = 20
- info = OInfo(sha, Blob.type, s)
- assert info.sha == sha
- assert info.type == Blob.type
- assert info.size == s
-
- # test ostream
- stream = DummyStream()
- ostream = OStream(*(info + (stream, )))
- ostream.read(15)
- stream._assert()
- assert stream.bytes == 15
- ostream.read(20)
- assert stream.bytes == 20
-
- # derive with own args
- DeriveTest(sha, Blob.type, s, stream, 'mine',myarg = 3)._assert()
-
- # test istream
- istream = IStream(Blob.type, s, stream)
- assert istream.sha == None
- istream.sha = sha
- assert istream.sha == sha
-
- assert len(istream.binsha) == 20
- assert len(istream.hexsha) == 40
-
- assert istream.size == s
- istream.size = s * 2
- istream.size == s * 2
- assert istream.type == Blob.type
- istream.type = "something"
- assert istream.type == "something"
- assert istream.stream is stream
- istream.stream = None
- assert istream.stream is None
-
- assert istream.error is None
- istream.error = Exception()
- assert isinstance(istream.error, Exception)
-
- def _assert_stream_reader(self, stream, cdata, rewind_stream=lambda s: None):
- """Make stream tests - the orig_stream is seekable, allowing it to be
- rewound and reused
- :param cdata: the data we expect to read from stream, the contents
- :param rewind_stream: function called to rewind the stream to make it ready
- for reuse"""
- ns = 10
- assert len(cdata) > ns-1, "Data must be larger than %i, was %i" % (ns, len(cdata))
-
- # read in small steps
- ss = len(cdata) / ns
- for i in range(ns):
- data = stream.read(ss)
- chunk = cdata[i*ss:(i+1)*ss]
- assert data == chunk
- # END for each step
- rest = stream.read()
- if rest:
- assert rest == cdata[-len(rest):]
- # END handle rest
-
- rewind_stream(stream)
-
- # read everything
- rdata = stream.read()
- assert rdata == cdata
-
- def test_decompress_reader(self):
- for close_on_deletion in range(2):
- for with_size in range(2):
- for ds in self.data_sizes:
- cdata = make_bytes(ds, randomize=False)
-
- # zdata = zipped actual data
- # cdata = original content data
-
- # create reader
- if with_size:
- # need object data
- zdata = zlib.compress(make_object(Blob.type, cdata))
- type, size, reader = DecompressMemMapReader.new(zdata, close_on_deletion)
- assert size == len(cdata)
- assert type == Blob.type
- else:
- # here we need content data
- zdata = zlib.compress(cdata)
- reader = DecompressMemMapReader(zdata, close_on_deletion, len(cdata))
- assert reader._s == len(cdata)
- # END get reader
-
- def rewind(r):
- r._zip = zlib.decompressobj()
- r._br = r._cws = r._cwe = 0
- if with_size:
- r._parse_header_info()
- # END skip header
- # END make rewind func
-
- self._assert_stream_reader(reader, cdata, rewind)
-
- # put in a dummy stream for closing
- dummy = DummyStream()
- reader._m = dummy
-
- assert not dummy.closed
- del(reader)
- assert dummy.closed == close_on_deletion
- #zdi#
- # END for each datasize
- # END whether size should be used
- # END whether stream should be closed when deleted
-
- def test_sha_writer(self):
- writer = Sha1Writer()
- assert 2 == writer.write("hi")
- assert len(writer.sha(as_hex=1)) == 40
- assert len(writer.sha(as_hex=0)) == 20
-
- # make sure it does something ;)
- prev_sha = writer.sha()
- writer.write("hi again")
- assert writer.sha() != prev_sha
-
- def test_compressed_writer(self):
- for ds in self.data_sizes:
- fd, path = tempfile.mkstemp()
- ostream = FDCompressedSha1Writer(fd)
- data = make_bytes(ds, randomize=False)
-
- # for now, just a single write, code doesn't care about chunking
- assert len(data) == ostream.write(data)
- ostream.close()
- # its closed already
- self.failUnlessRaises(OSError, os.close, fd)
-
- # read everything back, compare to data we zip
- fd = os.open(path, os.O_RDONLY)
- written_data = os.read(fd, os.path.getsize(path))
- os.close(fd)
- assert written_data == zlib.compress(data, 1) # best speed
-
- os.remove(path)
- # END for each os
-
-
diff --git a/test/git/odb/test_utils.py b/test/git/odb/test_utils.py
deleted file mode 100644
index 34572b37..00000000
--- a/test/git/odb/test_utils.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""Test for object db"""
-from test.testlib import *
-from git import Blob
-from git.odb.utils import (
- to_hex_sha,
- to_bin_sha
- )
-
-
-class TestUtils(TestBase):
- def test_basics(self):
- assert to_hex_sha(Blob.NULL_HEX_SHA) == Blob.NULL_HEX_SHA
- assert len(to_bin_sha(Blob.NULL_HEX_SHA)) == 20
- assert to_hex_sha(to_bin_sha(Blob.NULL_HEX_SHA)) == Blob.NULL_HEX_SHA
-
diff --git a/test/git/performance/test_streams.py b/test/git/performance/test_streams.py
index 8f600cb3..36d0d29c 100644
--- a/test/git/performance/test_streams.py
+++ b/test/git/performance/test_streams.py
@@ -1,28 +1,21 @@
"""Performance data streaming performance"""
from test.testlib import *
-from git.odb import *
+from gitdb import *
-from cStringIO import StringIO
from time import time
import os
import sys
import stat
import subprocess
+from gitdb.test.lib import make_memory_file
from lib import (
TestBigRepoR
)
-def make_memory_file(size_in_bytes, randomize=False):
- """:return: tuple(size_of_stream, stream)
- :param randomize: try to produce a very random stream"""
- d = make_bytes(size_in_bytes, randomize)
- return len(d), StringIO(d)
-
-
class TestObjDBPerformance(TestBigRepoR):
large_data_size_bytes = 1000*1000*10 # some MiB should do it
@@ -30,6 +23,8 @@ class TestObjDBPerformance(TestBigRepoR):
@with_bare_rw_repo
def test_large_data_streaming(self, rwrepo):
+ # TODO: This part overlaps with the same file in gitdb.test.performance.test_stream
+ # It should be shared if possible
ldb = LooseObjectDB(os.path.join(rwrepo.git_dir, 'objects'))
for randomize in range(2):
diff --git a/test/git/test_commit.py b/test/git/test_commit.py
index e65e2e59..8629e625 100644
--- a/test/git/test_commit.py
+++ b/test/git/test_commit.py
@@ -6,7 +6,7 @@
from test.testlib import *
from git import *
-from git.odb import IStream
+from gitdb import IStream
from cStringIO import StringIO
import time
diff --git a/test/testlib/helper.py b/test/testlib/helper.py
index 457cc26c..715427c2 100644
--- a/test/testlib/helper.py
+++ b/test/testlib/helper.py
@@ -9,8 +9,6 @@ from git import Repo, Remote, GitCommandError
from unittest import TestCase
import tempfile
import shutil
-import random
-from array import array
import cStringIO
GIT_REPO = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
@@ -27,24 +25,6 @@ def fixture(name):
def absolute_project_path():
return os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
-def make_bytes(size_in_bytes, randomize=False):
- """:return: string with given size in bytes
- :param randomize: try to produce a very random stream"""
- actual_size = size_in_bytes / 4
- producer = xrange(actual_size)
- if randomize:
- producer = list(producer)
- random.shuffle(producer)
- # END randomize
- a = array('i', producer)
- return a.tostring()
-
-
-def make_object(type, data):
- """:return: bytes resembling an uncompressed object"""
- odata = "blob %i\0" % len(data)
- return odata + data
-
#} END routines
#{ Adapters
@@ -87,8 +67,7 @@ def with_bare_rw_repo(func):
it will be removed completely.
Use this if you want to make purely index based adjustments, change refs, create
- heads, generally operations that do not need a working tree.
- """
+ heads, generally operations that do not need a working tree."""
def bare_repo_creator(self):
repo_dir = tempfile.mktemp("bare_repo")
rw_repo = self.rorepo.clone(repo_dir, shared=True, bare=True)
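make_bytes() and make_object() disappear from the test helpers; the performance test above now imports make_memory_file from gitdb.test.lib, which presumably hosts these data generators as well. A standalone sketch of the removed make_bytes() behaviour:

import random
from array import array

def make_bytes(size_in_bytes, randomize=False):
    # Roughly size_in_bytes of data; 'i' items are 4 bytes on common platforms.
    producer = range(size_in_bytes / 4)
    if randomize:
        producer = list(producer)
        random.shuffle(producer)
    return array('i', producer).tostring()

assert len(make_bytes(1024)) == 1024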