From 3ee291c469fc7ea6065ed22f344ed3f2792aa2ca Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Tue, 14 Jun 2016 22:44:11 +0200 Subject: Store raw path bytes in Diff instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the following fields on Diff instances were assumed to be passed in as unicode strings: - `a_path` - `b_path` - `rename_from` - `rename_to` However, since Git natively records paths as bytes, these may potentially not have a valid unicode representation. This patch changes the Diff instance to instead take the following equivalent fields that should be raw bytes instead: - `a_rawpath` - `b_rawpath` - `raw_rename_from` - `raw_rename_to` NOTE ON BACKWARD COMPATIBILITY: The original `a_path`, `b_path`, etc. fields are still available as properties (rather than slots). These properties now dynamically decode the raw bytes into a unicode string (performing the potentially destructive operation of replacing invalid unicode chars by "�"'s). This means that all code using Diffs should remain backward compatible. The only exception is when people would manually construct Diff instances by calling the constructor directly, in which case they should now pass in bytes rather than unicode strings. See also the discussion on https://github.com/gitpython-developers/GitPython/pull/467 --- doc/source/changes.rst | 3 +++ git/compat.py | 2 ++ git/diff.py | 58 +++++++++++++++++++++++++++++++++++--------------- git/test/test_diff.py | 6 +++++- 4 files changed, 51 insertions(+), 18 deletions(-) diff --git a/doc/source/changes.rst b/doc/source/changes.rst index 3f5b8c50..492d1055 100644 --- a/doc/source/changes.rst +++ b/doc/source/changes.rst @@ -5,6 +5,9 @@ Changelog 2.0.6 - Fixes and Features ========================== +* API: Diffs now have `a_rawpath`, `b_rawpath`, `raw_rename_from`, + `raw_rename_to` properties, which are the raw-bytes equivalents of their + unicode path counterparts. * Fix: TypeError about passing keyword argument to string decode() on Python 2.6. * Feature: `setUrl API on Remotes `_ diff --git a/git/compat.py b/git/compat.py index 5b46255c..b3572474 100644 --- a/git/compat.py +++ b/git/compat.py @@ -35,6 +35,7 @@ if PY3: return d.values() range = xrange unicode = str + binary_type = bytes else: FileType = file # usually, this is just ascii, which might not enough for our encoding needs @@ -44,6 +45,7 @@ else: byte_ord = ord bchr = chr unicode = unicode + binary_type = str range = xrange def mviter(d): return d.itervalues() diff --git a/git/diff.py b/git/diff.py index aeaa67d5..06193920 100644 --- a/git/diff.py +++ b/git/diff.py @@ -7,6 +7,7 @@ import re from gitdb.util import hex_to_bin +from .compat import binary_type from .objects.blob import Blob from .objects.util import mode_str_to_int @@ -245,18 +246,20 @@ class Diff(object): NULL_HEX_SHA = "0" * 40 NULL_BIN_SHA = b"\0" * 20 - __slots__ = ("a_blob", "b_blob", "a_mode", "b_mode", "a_path", "b_path", - "new_file", "deleted_file", "rename_from", "rename_to", "diff") + __slots__ = ("a_blob", "b_blob", "a_mode", "b_mode", "a_rawpath", "b_rawpath", + "new_file", "deleted_file", "raw_rename_from", "raw_rename_to", "diff") - def __init__(self, repo, a_path, b_path, a_blob_id, b_blob_id, a_mode, - b_mode, new_file, deleted_file, rename_from, - rename_to, diff): + def __init__(self, repo, a_rawpath, b_rawpath, a_blob_id, b_blob_id, a_mode, + b_mode, new_file, deleted_file, raw_rename_from, + raw_rename_to, diff): self.a_mode = a_mode self.b_mode = b_mode - self.a_path = a_path - self.b_path = b_path + assert a_rawpath is None or isinstance(a_rawpath, binary_type) + assert b_rawpath is None or isinstance(b_rawpath, binary_type) + self.a_rawpath = a_rawpath + self.b_rawpath = b_rawpath if self.a_mode: self.a_mode = mode_str_to_int(self.a_mode) @@ -266,19 +269,21 @@ class Diff(object): if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA: self.a_blob = None else: - self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=a_path) + self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path) if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA: self.b_blob = None else: - self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=b_path) + self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path) self.new_file = new_file self.deleted_file = deleted_file # be clear and use None instead of empty strings - self.rename_from = rename_from or None - self.rename_to = rename_to or None + assert raw_rename_from is None or isinstance(raw_rename_from, binary_type) + assert raw_rename_to is None or isinstance(raw_rename_to, binary_type) + self.raw_rename_from = raw_rename_from or None + self.raw_rename_to = raw_rename_to or None self.diff = diff @@ -344,6 +349,22 @@ class Diff(object): # end return res + @property + def a_path(self): + return self.a_rawpath.decode(defenc, 'replace') if self.a_rawpath else None + + @property + def b_path(self): + return self.b_rawpath.decode(defenc, 'replace') if self.b_rawpath else None + + @property + def rename_from(self): + return self.raw_rename_from.decode(defenc, 'replace') if self.raw_rename_from else None + + @property + def rename_to(self): + return self.raw_rename_to.decode(defenc, 'replace') if self.raw_rename_to else None + @property def renamed(self): """:returns: True if the blob of our diff has been renamed @@ -388,6 +409,7 @@ class Diff(object): new_file_mode, deleted_file_mode, \ a_blob_id, b_blob_id, b_mode, \ a_path, b_path = header.groups() + new_file, deleted_file = bool(new_file_mode), bool(deleted_file_mode) a_path = cls._pick_best_path(a_path, rename_from, a_path_fallback) @@ -404,15 +426,15 @@ class Diff(object): a_mode = old_mode or deleted_file_mode or (a_path and (b_mode or new_mode or new_file_mode)) b_mode = b_mode or new_mode or new_file_mode or (b_path and a_mode) index.append(Diff(repo, - a_path and a_path.decode(defenc, 'replace'), - b_path and b_path.decode(defenc, 'replace'), + a_path, + b_path, a_blob_id and a_blob_id.decode(defenc), b_blob_id and b_blob_id.decode(defenc), a_mode and a_mode.decode(defenc), b_mode and b_mode.decode(defenc), new_file, deleted_file, - rename_from and rename_from.decode(defenc, 'replace'), - rename_to and rename_to.decode(defenc, 'replace'), + rename_from, + rename_to, None)) previous_header = header @@ -438,8 +460,8 @@ class Diff(object): meta, _, path = line[1:].partition('\t') old_mode, new_mode, a_blob_id, b_blob_id, change_type = meta.split(None, 4) path = path.strip() - a_path = path - b_path = path + a_path = path.encode(defenc) + b_path = path.encode(defenc) deleted_file = False new_file = False rename_from = None @@ -455,6 +477,8 @@ class Diff(object): new_file = True elif change_type[0] == 'R': # parses RXXX, where XXX is a confidence value a_path, b_path = path.split('\t', 1) + a_path = a_path.encode(defenc) + b_path = b_path.encode(defenc) rename_from, rename_to = a_path, b_path # END add/remove handling diff --git a/git/test/test_diff.py b/git/test/test_diff.py index 8d189b12..ba0d2d13 100644 --- a/git/test/test_diff.py +++ b/git/test/test_diff.py @@ -90,6 +90,8 @@ class TestDiff(TestBase): assert_true(diff.renamed) assert_equal(diff.rename_from, u'Jérôme') assert_equal(diff.rename_to, u'müller') + assert_equal(diff.raw_rename_from, b'J\xc3\xa9r\xc3\xb4me') + assert_equal(diff.raw_rename_to, b'm\xc3\xbcller') assert isinstance(str(diff), str) output = StringProcessAdapter(fixture('diff_rename_raw')) @@ -129,7 +131,7 @@ class TestDiff(TestBase): output = StringProcessAdapter(fixture('diff_index_raw')) res = Diff._index_from_raw_format(None, output.stdout) assert res[0].deleted_file - assert res[0].b_path == '' + assert res[0].b_path is None def test_diff_initial_commit(self): initial_commit = self.rorepo.commit('33ebe7acec14b25c5f84f35a664803fcab2f7781') @@ -162,7 +164,9 @@ class TestDiff(TestBase): self.assertEqual(res[7].b_path, u'path/with-question-mark?') self.assertEqual(res[8].b_path, u'path/¯\\_(ツ)_|¯') self.assertEqual(res[9].b_path, u'path/💩.txt') + self.assertEqual(res[9].b_rawpath, b'path/\xf0\x9f\x92\xa9.txt') self.assertEqual(res[10].b_path, u'path/�-invalid-unicode-path.txt') + self.assertEqual(res[10].b_rawpath, b'path/\x80-invalid-unicode-path.txt') # The "Moves" # NOTE: The path prefixes a/ and b/ here are legit! We're actually -- cgit v1.2.1