summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Sowden <paul@idontsmoke.co.uk>2008-11-20 00:09:43 -0800
committerMichael Trier <mtrier@gmail.com>2008-12-15 14:12:23 -0500
commit753e908dcea03cf9962cf45d3965cf93b0d30d94 (patch)
tree409595f1b5368461b5afc7ee0db642c6a83fd538
parent9e14356d12226cb140b0e070bd079468b4ab599b (diff)
downloadgitpython-753e908dcea03cf9962cf45d3965cf93b0d30d94.tar.gz
implement a far far faster diff parser
The old diff parser in list_from_string took a large amount of time to parse long diffs; on one of my repositories it took over 3 minutes to parse the initial commit. The new parser uses a single regexp to match the header of a diff, and iterates over each individual diff by splitting the entire string on the diff separator, attempting to match the header for each individual diff. With the new parser, parsing the same repository is almost instant, woohoo! (cherry picked from commit 5b6b27f153bdc30380bea12a528ef483571dd57a)
-rw-r--r--lib/git/diff.py61
-rw-r--r--test/git/test_commit.py2
2 files changed, 20 insertions, 43 deletions
diff --git a/lib/git/diff.py b/lib/git/diff.py
index 51315fe3..28ebda01 100644
--- a/lib/git/diff.py
+++ b/lib/git/diff.py
@@ -34,52 +34,29 @@ class Diff(object):
@classmethod
def list_from_string(cls, repo, text):
- lines = text.splitlines()
- a_mode = None
- b_mode = None
diffs = []
- while lines:
- m = re.search(r'^diff --git a/(\S+) b/(\S+)$', lines.pop(0))
- if m:
- a_path, b_path = m.groups()
- if re.search(r'^old mode', lines[0]):
- m = re.search(r'^old mode (\d+)', lines.pop(0))
- if m:
- a_mode, = m.groups()
- m = re.search(r'^new mode (\d+)', lines.pop(0))
- if m:
- b_mode, = m.groups()
- if re.search(r'^diff --git', lines[0]):
- diffs.append(Diff(repo, a_path, b_path, None, None, a_mode, b_mode, False, False, None))
- continue
- new_file = False
- deleted_file = False
+ diff_header = re.compile(r"""
+ #^diff[ ]--git
+ [ ]a/(?P<a_path>\S+)[ ]b/(?P<b_path>\S+)\n
+ (?:^old[ ]mode[ ](?P<old_mode>\d+)(?:\n|$))?
+ (?:^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
+ (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
+ (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
+ (?:^index[ ](?P<a_commit>[0-9A-Fa-f]+)
+ \.\.(?P<b_commit>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
+ """, re.VERBOSE | re.MULTILINE).match
- if re.search(r'^new file', lines[0]):
- m = re.search(r'^new file mode (.+)', lines.pop(0))
- if m:
- b_mode, = m.groups()
- a_mode = None
- new_file = True
- elif re.search(r'^deleted file', lines[0]):
- m = re.search(r'^deleted file mode (.+)$', lines.pop(0))
- if m:
- a_mode, = m.groups()
- b_mode = None
- deleted_file = True
+ for diff in ('\n' + text).split('\ndiff --git')[1:]:
+ header = diff_header(diff)
- m = re.search(r'^index ([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+) ?(.+)?$', lines.pop(0))
- if m:
- a_commit, b_commit, b_mode = m.groups()
- if b_mode:
- b_mode = b_mode.strip()
+ a_path, b_path, old_mode, new_mode, new_file_mode, deleted_file_mode, \
+ a_commit, b_commit, b_mode = header.groups()
+ new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode)
- diff_lines = []
- while lines and not re.search(r'^diff', lines[0]):
- diff_lines.append(lines.pop(0))
-
- diff = "\n".join(diff_lines)
- diffs.append(Diff(repo, a_path, b_path, a_commit, b_commit, a_mode, b_mode, new_file, deleted_file, diff))
+ diffs.append(Diff(repo, a_path, b_path, a_commit, b_commit,
+ old_mode or deleted_file_mode, new_mode or new_file_mode or b_mode,
+ new_file, deleted_file, diff[header.end():]))
return diffs
+
diff --git a/test/git/test_commit.py b/test/git/test_commit.py
index f6e34dac..93c7d2c2 100644
--- a/test/git/test_commit.py
+++ b/test/git/test_commit.py
@@ -130,7 +130,7 @@ class TestCommit(object):
assert_equal('History.txt', diffs[0].a_path)
assert_equal('History.txt', diffs[0].b_path)
assert_equal(None, diffs[0].a_commit)
- assert_equal(None, diffs[0].b_mode)
+ assert_equal('100644', diffs[0].b_mode)
assert_equal('81d2c27608b352814cbe979a6acd678d30219678', diffs[0].b_commit.id)
assert_equal(True, diffs[0].new_file)
assert_equal(False, diffs[0].deleted_file)