From 0eafe201905d85be767c24106eb1ab12efd3ee22 Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Mon, 30 May 2016 16:20:22 +0200 Subject: Add test case as example of Git commit with invalid data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a real commit from the microjs.com open source project, see https://github.com/madrobby/microjs.com/commit/7e8457c17850d0991763941213dcb403d80f39f8, which is declared to be encoded in UTF-8, but contains invalid bytes. This makes GitPython choke on it while decoding. Rather than choking, this should instead accept the error and replace the invalid bytes by the � (\x80) char. --- git/test/fixtures/commit_invalid_data | 6 ++++++ git/test/test_commit.py | 7 +++++++ 2 files changed, 13 insertions(+) create mode 100644 git/test/fixtures/commit_invalid_data diff --git a/git/test/fixtures/commit_invalid_data b/git/test/fixtures/commit_invalid_data new file mode 100644 index 00000000..d112bf2d --- /dev/null +++ b/git/test/fixtures/commit_invalid_data @@ -0,0 +1,6 @@ +tree 9f1a495d7d9692d24f5caedaa89f5c2c32d59368 +parent 492ace2ffce0e426ebeb55e364e987bcf024dd3b +author E.Azer KoÃoÃoÃoculu 1306710073 +0300 +committer E.Azer KoÃoÃoÃoculu 1306710073 +0300 + +add environjs diff --git a/git/test/test_commit.py b/git/test/test_commit.py index 23b7154a..ea8cd9af 100644 --- a/git/test/test_commit.py +++ b/git/test/test_commit.py @@ -306,6 +306,13 @@ class TestCommit(TestBase): # it appears cmt.author.__repr__() + def test_invalid_commit(self): + cmt = self.rorepo.commit() + cmt._deserialize(open(fixture_path('commit_invalid_data'), 'rb')) + + assert cmt.author.name == u'E.Azer Ko�o�o�oculu', cmt.author.name + assert cmt.author.email == 'azer@kodfabrik.com', cmt.author.email + def test_gpgsig(self): cmt = self.rorepo.commit() cmt._deserialize(open(fixture_path('commit_with_gpgsig'), 'rb')) -- cgit v1.2.1 From 79c99c0f66c8f3c8d13258376c82125a23b1b5c8 Mon Sep 17 00:00:00 2001 From: Vincent Driessen Date: Mon, 30 May 2016 16:26:43 +0200 Subject: Ignore invalid data when decoding commit objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, GitPython chokes on this while decoding. Rather than choking, instead accept the error and replace the invalid bytes by the � (\x80) char. --- git/objects/commit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/git/objects/commit.py b/git/objects/commit.py index dc722f97..58a8912f 100644 --- a/git/objects/commit.py +++ b/git/objects/commit.py @@ -501,14 +501,14 @@ class Commit(base.Object, Iterable, Diffable, Traversable, Serializable): try: self.author, self.authored_date, self.author_tz_offset = \ - parse_actor_and_date(author_line.decode(self.encoding)) + parse_actor_and_date(author_line.decode(self.encoding, errors='replace')) except UnicodeDecodeError: log.error("Failed to decode author line '%s' using encoding %s", author_line, self.encoding, exc_info=True) try: self.committer, self.committed_date, self.committer_tz_offset = \ - parse_actor_and_date(committer_line.decode(self.encoding)) + parse_actor_and_date(committer_line.decode(self.encoding, errors='replace')) except UnicodeDecodeError: log.error("Failed to decode committer line '%s' using encoding %s", committer_line, self.encoding, exc_info=True) @@ -518,7 +518,7 @@ class Commit(base.Object, Iterable, Diffable, Traversable, Serializable): # The end of our message stream is marked with a newline that we strip self.message = stream.read() try: - self.message = self.message.decode(self.encoding) + self.message = self.message.decode(self.encoding, errors='replace') except UnicodeDecodeError: log.error("Failed to decode message '%s' using encoding %s", self.message, self.encoding, exc_info=True) # END exception handling -- cgit v1.2.1