From ae5a69f67822d81bbbd8f4af93be68703e730b37 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 2 Jun 2010 16:41:28 +0200 Subject: commit: redesigned revlist and commit parsing, commits are always retrieved from their object information directly. This is faster, and resolves issues with the rev-list format and empty commit messages Adjusted many tests to go with the changes, as they were still mocked. The mock was removed if necessary and replaced by code that actually executes --- lib/git/objects/commit.py | 98 +++++------- test/fixtures/rev_list | 27 +--- test/git/performance/test_commit.py | 8 +- test/git/test_commit.py | 310 ++++++++++++++++++------------------ test/git/test_repo.py | 39 ++--- 5 files changed, 217 insertions(+), 265 deletions(-) diff --git a/lib/git/objects/commit.py b/lib/git/objects/commit.py index 948e9a54..98aca360 100644 --- a/lib/git/objects/commit.py +++ b/lib/git/objects/commit.py @@ -106,13 +106,12 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri return commit.parents def _set_cache_(self, attr): - """ - Called by LazyMixin superclass when the given uninitialized member needs + """ Called by LazyMixin superclass when the given uninitialized member needs to be set. - We set all values at once. - """ + We set all values at once. """ if attr in Commit.__slots__: # read the data in a chunk, its faster - then provide a file wrapper + # Could use self.data, but lets try to get it with less calls hexsha, typename, size, data = self.repo.git.get_object_data(self) self._deserialize(StringIO(data)) else: @@ -181,16 +180,16 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri Returns iterator yielding Commit items """ - options = {'pretty': 'raw', 'as_process' : True } - options.update(kwargs) - + if 'pretty' in kwargs: + raise ValueError("--pretty cannot be used as parsing expects single sha's only") + # END handle pretty args = list() if paths: args.extend(('--', paths)) # END if paths - proc = repo.git.rev_list(rev, args, **options) - return cls._iter_from_process_or_stream(repo, proc, True) + proc = repo.git.rev_list(rev, args, as_process=True, **kwargs) + return cls._iter_from_process_or_stream(repo, proc) def iter_parents(self, paths='', **kwargs): """ @@ -235,35 +234,30 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri return stats.Stats._list_from_string(self.repo, text) @classmethod - def _iter_from_process_or_stream(cls, repo, proc_or_stream, from_rev_list): - """ - Parse out commit information into a list of Commit objects - - ``repo`` - is the Repo - - ``proc`` - git-rev-list process instance (raw format) + def _iter_from_process_or_stream(cls, repo, proc_or_stream): + """Parse out commit information into a list of Commit objects + We expect one-line per commit, and parse the actual commit information directly + from our lighting fast object database - ``from_rev_list`` - If True, the stream was created by rev-list in which case we parse - the message differently - Returns - iterator returning Commit objects - """ + :param proc: git-rev-list process instance - one sha per line + :return: iterator returning Commit objects""" stream = proc_or_stream if not hasattr(stream,'readline'): stream = proc_or_stream.stdout + readline = stream.readline while True: - line = stream.readline() + line = readline() if not line: break - commit_tokens = line.split() - id = commit_tokens[1] - assert commit_tokens[0] == "commit" + sha = line.strip() + if len(sha) > 40: + # split additional information, as returned by bisect for instance + sha, rest = line.split(None, 1) + # END handle extra info - yield Commit(repo, id)._deserialize(stream, from_rev_list) + assert len(sha) == 40, "Invalid line: %s" % sha + yield Commit(repo, sha) # END for each line in stream @@ -386,15 +380,16 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri # for now, this is very inefficient and in fact shouldn't be used like this return super(Commit, self)._serialize(stream) - def _deserialize(self, stream, from_rev_list=False): + def _deserialize(self, stream): """:param from_rev_list: if true, the stream format is coming from the rev-list command Otherwise it is assumed to be a plain data stream from our object""" - self.tree = Tree(self.repo, stream.readline().split()[1], 0, '') + readline = stream.readline + self.tree = Tree(self.repo, readline().split()[1], 0, '') self.parents = list() next_line = None while True: - parent_line = stream.readline() + parent_line = readline() if not parent_line.startswith('parent'): next_line = parent_line break @@ -404,37 +399,24 @@ class Commit(base.Object, Iterable, diff.Diffable, utils.Traversable, utils.Seri self.parents = tuple(self.parents) self.author, self.authored_date, self.author_tz_offset = utils.parse_actor_and_date(next_line) - self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(stream.readline()) + self.committer, self.committed_date, self.committer_tz_offset = utils.parse_actor_and_date(readline()) - # empty line + # now we can have the encoding line, or an empty line followed by the optional + # message. self.encoding = self.default_encoding - enc = stream.readline() - enc.strip() + # read encoding or empty line to separate message + enc = readline() + enc = enc.strip() if enc: self.encoding = enc[enc.find(' ')+1:] - # END parse encoding - - message_lines = list() - if from_rev_list: - while True: - msg_line = stream.readline() - if not msg_line.startswith(' '): - # and forget about this empty marker - # cut the last newline to get rid of the artificial newline added - # by rev-list command. Lets hope its just linux style \n - message_lines[-1] = message_lines[-1][:-1] - break - # END abort message reading - # strip leading 4 spaces - message_lines.append(msg_line[4:]) - # END while there are message lines - self.message = ''.join(message_lines) - else: - # a stream from our data simply gives us the plain message - # The end of our message stream is marked with a newline that we strip - self.message = stream.read()[:-1] - # END message parsing + # now comes the message separator + readline() + # END handle encoding + + # a stream from our data simply gives us the plain message + # The end of our message stream is marked with a newline that we strip + self.message = stream.read()[:-1] return self #} END serializable implementation diff --git a/test/fixtures/rev_list b/test/fixtures/rev_list index 95a1ebff..1a576118 100644 --- a/test/fixtures/rev_list +++ b/test/fixtures/rev_list @@ -1,24 +1,3 @@ -commit 4c8124ffcf4039d292442eeccabdeca5af5c5017 -tree 672eca9b7f9e09c22dcb128c283e8c3c8d7697a4 -parent 634396b2f541a9f2d58b00be1a07f0c358b999b3 -author Tom Preston-Werner 1191999972 -0700 -committer Tom Preston-Werner 1191999972 -0700 - - implement Grit#heads - -commit 634396b2f541a9f2d58b00be1a07f0c358b999b3 -tree b35b4bf642d667fdd613eebcfe4e17efd420fb8a -author Tom Preston-Werner 1191997100 -0700 -committer Tom Preston-Werner 1191997100 -0700 - - initial grit setup - -commit ab25fd8483882c3bda8a458ad2965d2248654335 -tree c20b5ec543bde1e43a931449b196052c06ed8acc -parent 6e64c55896aabb9a7d8e9f8f296f426d21a78c2c -parent 7f874954efb9ba35210445be456c74e037ba6af2 -author Tom Preston-Werner 1182645538 -0700 -committer Tom Preston-Werner 1182645538 -0700 - - Merge branch 'site' - Some other stuff +4c8124ffcf4039d292442eeccabdeca5af5c5017 +634396b2f541a9f2d58b00be1a07f0c358b999b3 +ab25fd8483882c3bda8a458ad2965d2248654335 diff --git a/test/git/performance/test_commit.py b/test/git/performance/test_commit.py index c1f8ce59..b4a9d868 100644 --- a/test/git/performance/test_commit.py +++ b/test/git/performance/test_commit.py @@ -4,12 +4,12 @@ # This module is part of GitPython and is released under # the BSD License: http://www.opensource.org/licenses/bsd-license.php -from test.testlib import * +from lib import * from git import * from time import time import sys -class TestPerformance(TestBase): +class TestPerformance(TestBigRepoReadOnly): # ref with about 100 commits in its history ref_100 = '0.1.6' @@ -48,7 +48,7 @@ class TestPerformance(TestBase): # bound to cat-file parsing performance nc = 0 st = time() - for c in self.rorepo.commit(self.ref_100).traverse(branch_first=False): + for c in self.gitrepo.commit(self.head_sha_2k).traverse(branch_first=False): nc += 1 self._query_commit_info(c) # END for each traversed commit @@ -59,7 +59,7 @@ class TestPerformance(TestBase): # bound to stream parsing performance nc = 0 st = time() - for c in Commit.iter_items(self.rorepo, self.ref_100): + for c in Commit.iter_items(self.gitrepo, self.head_sha_2k): nc += 1 self._query_commit_info(c) # END for each traversed commit diff --git a/test/git/test_commit.py b/test/git/test_commit.py index 28b407ac..ad7a0082 100644 --- a/test/git/test_commit.py +++ b/test/git/test_commit.py @@ -9,169 +9,165 @@ from git import * class TestCommit(TestBase): - def test_bake(self): + def test_bake(self): - commit = Commit(self.rorepo, **{'sha': '2454ae89983a4496a445ce347d7a41c0bb0ea7ae'}) - commit.author # bake + commit = Commit(self.rorepo, '2454ae89983a4496a445ce347d7a41c0bb0ea7ae') + commit.author # bake - assert_equal("Sebastian Thiel", commit.author.name) - assert_equal("byronimo@gmail.com", commit.author.email) - assert commit.author == commit.committer - assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) - assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) - assert commit.message == "Added missing information to docstrings of commit and stats module" + assert_equal("Sebastian Thiel", commit.author.name) + assert_equal("byronimo@gmail.com", commit.author.email) + assert commit.author == commit.committer + assert isinstance(commit.authored_date, int) and isinstance(commit.committed_date, int) + assert isinstance(commit.author_tz_offset, int) and isinstance(commit.committer_tz_offset, int) + assert commit.message == "Added missing information to docstrings of commit and stats module" - def test_stats(self): - commit = Commit(self.rorepo, '33ebe7acec14b25c5f84f35a664803fcab2f7781') - stats = commit.stats - - def check_entries(d): - assert isinstance(d, dict) - for key in ("insertions", "deletions", "lines"): - assert key in d - # END assertion helper - assert stats.files - assert stats.total - - check_entries(stats.total) - assert "files" in stats.total - - for filepath, d in stats.files.items(): - check_entries(d) - # END for each stated file - - # assure data is parsed properly - michael = Actor._from_string("Michael Trier ") - assert commit.author == michael - assert commit.committer == michael - assert commit.authored_date == 1210193388 - assert commit.committed_date == 1210193388 - assert commit.author_tz_offset == 14400, commit.author_tz_offset - assert commit.committer_tz_offset == 14400, commit.committer_tz_offset - assert commit.message == "initial project" - - def test_traversal(self): - start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") - first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781") - p0 = start.parents[0] - p1 = start.parents[1] - p00 = p0.parents[0] - p10 = p1.parents[0] - - # basic branch first, depth first - dfirst = start.traverse(branch_first=False) - bfirst = start.traverse(branch_first=True) - assert dfirst.next() == p0 - assert dfirst.next() == p00 - - assert bfirst.next() == p0 - assert bfirst.next() == p1 - assert bfirst.next() == p00 - assert bfirst.next() == p10 - - # at some point, both iterations should stop - assert list(bfirst)[-1] == first - stoptraverse = self.rorepo.commit("254d04aa3180eb8b8daf7b7ff25f010cd69b4e7d").traverse(as_edge=True) - l = list(stoptraverse) - assert len(l[0]) == 2 - - # ignore self - assert start.traverse(ignore_self=False).next() == start - - # depth - assert len(list(start.traverse(ignore_self=False, depth=0))) == 1 - - # prune - assert start.traverse(branch_first=1, prune=lambda i,d: i==p0).next() == p1 - - # predicate - assert start.traverse(branch_first=1, predicate=lambda i,d: i==p1).next() == p1 - - # traversal should stop when the beginning is reached - self.failUnlessRaises(StopIteration, first.traverse().next) - - # parents of the first commit should be empty ( as the only parent has a null - # sha ) - assert len(first.parents) == 0 - - def test_iteration(self): - # we can iterate commits - all_commits = Commit.list_items(self.rorepo, self.rorepo.head) - assert all_commits - assert all_commits == list(self.rorepo.iter_commits()) - - # this includes merge commits - mcomit = Commit(self.rorepo, 'd884adc80c80300b4cc05321494713904ef1df2d') - assert mcomit in all_commits - - # we can limit the result to paths - ltd_commits = list(self.rorepo.iter_commits(paths='CHANGES')) - assert ltd_commits and len(ltd_commits) < len(all_commits) - - # show commits of multiple paths, resulting in a union of commits - less_ltd_commits = list(Commit.iter_items(self.rorepo, 'master', paths=('CHANGES', 'AUTHORS'))) - assert len(ltd_commits) < len(less_ltd_commits) - - - @patch_object(Git, '_call_process') - def test_rev_list_bisect_all(self, git): - """ - 'git rev-list --bisect-all' returns additional information - in the commit header. This test ensures that we properly parse it. - """ + def test_stats(self): + commit = Commit(self.rorepo, '33ebe7acec14b25c5f84f35a664803fcab2f7781') + stats = commit.stats + + def check_entries(d): + assert isinstance(d, dict) + for key in ("insertions", "deletions", "lines"): + assert key in d + # END assertion helper + assert stats.files + assert stats.total + + check_entries(stats.total) + assert "files" in stats.total + + for filepath, d in stats.files.items(): + check_entries(d) + # END for each stated file + + # assure data is parsed properly + michael = Actor._from_string("Michael Trier ") + assert commit.author == michael + assert commit.committer == michael + assert commit.authored_date == 1210193388 + assert commit.committed_date == 1210193388 + assert commit.author_tz_offset == 14400, commit.author_tz_offset + assert commit.committer_tz_offset == 14400, commit.committer_tz_offset + assert commit.message == "initial project" + + def test_traversal(self): + start = self.rorepo.commit("a4d06724202afccd2b5c54f81bcf2bf26dea7fff") + first = self.rorepo.commit("33ebe7acec14b25c5f84f35a664803fcab2f7781") + p0 = start.parents[0] + p1 = start.parents[1] + p00 = p0.parents[0] + p10 = p1.parents[0] + + # basic branch first, depth first + dfirst = start.traverse(branch_first=False) + bfirst = start.traverse(branch_first=True) + assert dfirst.next() == p0 + assert dfirst.next() == p00 + + assert bfirst.next() == p0 + assert bfirst.next() == p1 + assert bfirst.next() == p00 + assert bfirst.next() == p10 + + # at some point, both iterations should stop + assert list(bfirst)[-1] == first + stoptraverse = self.rorepo.commit("254d04aa3180eb8b8daf7b7ff25f010cd69b4e7d").traverse(as_edge=True) + l = list(stoptraverse) + assert len(l[0]) == 2 + + # ignore self + assert start.traverse(ignore_self=False).next() == start + + # depth + assert len(list(start.traverse(ignore_self=False, depth=0))) == 1 + + # prune + assert start.traverse(branch_first=1, prune=lambda i,d: i==p0).next() == p1 + + # predicate + assert start.traverse(branch_first=1, predicate=lambda i,d: i==p1).next() == p1 + + # traversal should stop when the beginning is reached + self.failUnlessRaises(StopIteration, first.traverse().next) + + # parents of the first commit should be empty ( as the only parent has a null + # sha ) + assert len(first.parents) == 0 + + def test_iteration(self): + # we can iterate commits + all_commits = Commit.list_items(self.rorepo, self.rorepo.head) + assert all_commits + assert all_commits == list(self.rorepo.iter_commits()) + + # this includes merge commits + mcomit = Commit(self.rorepo, 'd884adc80c80300b4cc05321494713904ef1df2d') + assert mcomit in all_commits + + # we can limit the result to paths + ltd_commits = list(self.rorepo.iter_commits(paths='CHANGES')) + assert ltd_commits and len(ltd_commits) < len(all_commits) + + # show commits of multiple paths, resulting in a union of commits + less_ltd_commits = list(Commit.iter_items(self.rorepo, 'master', paths=('CHANGES', 'AUTHORS'))) + assert len(ltd_commits) < len(less_ltd_commits) + + def test_iter_items(self): + # pretty not allowed + self.failUnlessRaises(ValueError, Commit.iter_items, self.rorepo, 'master', pretty="raw") + + def test_rev_list_bisect_all(self): + """ + 'git rev-list --bisect-all' returns additional information + in the commit header. This test ensures that we properly parse it. + """ + revs = self.rorepo.git.rev_list('933d23bf95a5bd1624fbcdf328d904e1fa173474', + first_parent=True, + bisect_all=True) - git.return_value = fixture('rev_list_bisect_all') + commits = Commit._iter_from_process_or_stream(self.rorepo, StringProcessAdapter(revs)) + expected_ids = ( + '7156cece3c49544abb6bf7a0c218eb36646fad6d', + '1f66cfbbce58b4b552b041707a12d437cc5f400a', + '33ebe7acec14b25c5f84f35a664803fcab2f7781', + '933d23bf95a5bd1624fbcdf328d904e1fa173474' + ) + for sha1, commit in zip(expected_ids, commits): + assert_equal(sha1, commit.sha) - revs = self.rorepo.git.rev_list('HEAD', - pretty='raw', - first_parent=True, - bisect_all=True) - assert_true(git.called) + def test_count(self): + assert self.rorepo.tag('refs/tags/0.1.5').commit.count( ) == 143 + + def test_list(self): + assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)['5117c9c8a4d3af19a9958677e45cda9269de1541'], Commit) - commits = Commit._iter_from_process_or_stream(self.rorepo, StringProcessAdapter(revs), True) - expected_ids = ( - 'cf37099ea8d1d8c7fbf9b6d12d7ec0249d3acb8b', - '33ebe7acec14b25c5f84f35a664803fcab2f7781', - 'a6604a00a652e754cb8b6b0b9f194f839fc38d7c', - '8df638c22c75ddc9a43ecdde90c0c9939f5009e7', - 'c231551328faa864848bde6ff8127f59c9566e90', - ) - for sha1, commit in zip(expected_ids, commits): - assert_equal(sha1, commit.sha) + def test_str(self): + commit = Commit(self.rorepo, 'abc') + assert_equal ("abc", str(commit)) - def test_count(self): - assert self.rorepo.tag('refs/tags/0.1.5').commit.count( ) == 143 - - def test_list(self): - assert isinstance(Commit.list_items(self.rorepo, '0.1.5', max_count=5)['5117c9c8a4d3af19a9958677e45cda9269de1541'], Commit) + def test_repr(self): + commit = Commit(self.rorepo, 'abc') + assert_equal('', repr(commit)) - def test_str(self): - commit = Commit(self.rorepo, 'abc') - assert_equal ("abc", str(commit)) - - def test_repr(self): - commit = Commit(self.rorepo, 'abc') - assert_equal('', repr(commit)) - - def test_equality(self): - commit1 = Commit(self.rorepo, 'abc') - commit2 = Commit(self.rorepo, 'abc') - commit3 = Commit(self.rorepo, 'zyx') - assert_equal(commit1, commit2) - assert_not_equal(commit2, commit3) - - def test_iter_parents(self): - # should return all but ourselves, even if skip is defined - c = self.rorepo.commit('0.1.5') - for skip in (0, 1): - piter = c.iter_parents(skip=skip) - first_parent = piter.next() - assert first_parent != c - assert first_parent == c.parents[0] - # END for each - - def test_base(self): - name_rev = self.rorepo.head.commit.name_rev - assert isinstance(name_rev, basestring) - + def test_equality(self): + commit1 = Commit(self.rorepo, 'abc') + commit2 = Commit(self.rorepo, 'abc') + commit3 = Commit(self.rorepo, 'zyx') + assert_equal(commit1, commit2) + assert_not_equal(commit2, commit3) + + def test_iter_parents(self): + # should return all but ourselves, even if skip is defined + c = self.rorepo.commit('0.1.5') + for skip in (0, 1): + piter = c.iter_parents(skip=skip) + first_parent = piter.next() + assert first_parent != c + assert first_parent == c.parents[0] + # END for each + + def test_base(self): + name_rev = self.rorepo.head.commit.name_rev + assert isinstance(name_rev, basestring) + diff --git a/test/git/test_repo.py b/test/git/test_repo.py index 9316245b..ddf2b3e1 100644 --- a/test/git/test_repo.py +++ b/test/git/test_repo.py @@ -55,32 +55,27 @@ class TestRepo(TestBase): # try from invalid revision that does not exist self.failUnlessRaises(ValueError, self.rorepo.tree, 'hello world') - @patch_object(Git, '_call_process') - def test_commits(self, git): - git.return_value = StringProcessAdapter(fixture('rev_list')) - - commits = list(self.rorepo.iter_commits('master', max_count=10)) + def test_commits(self): + mc = 10 + commits = list(self.rorepo.iter_commits('0.1.6', max_count=mc)) + assert len(commits) == mc c = commits[0] - assert_equal('4c8124ffcf4039d292442eeccabdeca5af5c5017', c.sha) - assert_equal(["634396b2f541a9f2d58b00be1a07f0c358b999b3"], [p.sha for p in c.parents]) - assert_equal("672eca9b7f9e09c22dcb128c283e8c3c8d7697a4", c.tree.sha) - assert_equal("Tom Preston-Werner", c.author.name) - assert_equal("tom@mojombo.com", c.author.email) - assert_equal(1191999972, c.authored_date) - assert_equal("Tom Preston-Werner", c.committer.name) - assert_equal("tom@mojombo.com", c.committer.email) - assert_equal(1191999972, c.committed_date) - assert_equal("implement Grit#heads", c.message) + assert_equal('9a4b1d4d11eee3c5362a4152216376e634bd14cf', c.sha) + assert_equal(["c76852d0bff115720af3f27acdb084c59361e5f6"], [p.sha for p in c.parents]) + assert_equal("ce41fc29549042f1aa09cc03174896cf23f112e3", c.tree.sha) + assert_equal("Michael Trier", c.author.name) + assert_equal("mtrier@gmail.com", c.author.email) + assert_equal(1232829715, c.authored_date) + assert_equal(5*3600, c.author_tz_offset) + assert_equal("Michael Trier", c.committer.name) + assert_equal("mtrier@gmail.com", c.committer.email) + assert_equal(1232829715, c.committed_date) + assert_equal(5*3600, c.committer_tz_offset) + assert_equal("Bumped version 0.1.6", c.message) c = commits[1] - assert_equal(tuple(), c.parents) - - c = commits[2] - assert_equal(["6e64c55896aabb9a7d8e9f8f296f426d21a78c2c", "7f874954efb9ba35210445be456c74e037ba6af2"], map(lambda p: p.sha, c.parents)) - assert_equal("Merge branch 'site'", c.summary) - - assert_true(git.called) + assert isinstance(c.parents, tuple) def test_trees(self): mc = 30 -- cgit v1.2.1