Fixes test_blob and improves commit writing/reading

author: Sebastian Thiel <byronimo@gmail.com> 2015-01-05 16:44:54 +0100
committer: Sebastian Thiel <byronimo@gmail.com> 2015-01-05 16:44:54 +0100
commit: 8a308613467a1510f8dac514624abae4e10c0779 (patch)
tree: 6c703cb00875ca77972b6250db8b63800a051502
parent: 3d0556a31916a709e9da3eafb92fc6b8bf69896c (diff)
download: gitpython-8a308613467a1510f8dac514624abae4e10c0779.tar.gz
3 files changed, 40 insertions, 34 deletions
diff --git a/git/compat.py b/git/compat.py
index 4a892ad2..f11d1423 100644
--- a/git/compat.py
+++ b/git/compat.py
@@ -27,12 +27,15 @@ defenc = sys.getdefaultencoding()
 if PY3:
     import io
     FileType = io.IOBase
+    def byte_ord(b):
+        return b
 else:
     FileType = file
     # usually, this is just ascii, which might not enough for our encoding needs
     # Unless it's set specifically, we override it to be utf-8
     if defenc == 'ascii':
         defenc = 'utf-8'
+    byte_ord = ord
 
 
 def with_metaclass(meta, *bases):
@@ -54,4 +57,3 @@ def with_metaclass(meta, *bases):
     # end metaclass
     return metaclass(meta.__name__ + 'Helper', None, {})
     # end handle py2
-
diff --git a/git/objects/commit.py b/git/objects/commit.py
index 5ad7902b..f8b5c969 100644
--- a/git/objects/commit.py
+++ b/git/objects/commit.py
@@ -419,23 +419,25 @@ class Commit(base.Object, Iterable, Diffable, Traversable, Serializable):
         next_line = None
         while True:
             parent_line = readline()
-            if not parent_line.startswith('parent'):
+            if not parent_line.startswith(b'parent'):
                 next_line = parent_line
                 break
             # END abort reading parents
-            self.parents.append(type(self)(self.repo, hex_to_bin(parent_line.split()[-1])))
+            self.parents.append(type(self)(self.repo, hex_to_bin(parent_line.split()[-1].decode('ascii'))))
         # END for each parent line
         self.parents = tuple(self.parents)
 
-        self.author, self.authored_date, self.author_tz_offset = parse_actor_and_date(next_line)
-        self.committer, self.committed_date, self.committer_tz_offset = parse_actor_and_date(readline())
+        # we don't know actual author encoding before we have parsed it, so keep the lines around
+        author_line = next_line
+        committer_line = readline()
 
         # we might run into one or more mergetag blocks, skip those for now
         next_line = readline()
-        while next_line.startswith('mergetag '):
+        while next_line.startswith(b'mergetag '):
             next_line = readline()
             while next_line.startswith(' '):
                 next_line = readline()
+        # end skip mergetags
 
         # now we can have the encoding line, or an empty line followed by the optional
         # message.
@@ -444,39 +446,40 @@ class Commit(base.Object, Iterable, Diffable, Traversable, Serializable):
         # read headers
         enc = next_line
         buf = enc.strip()
-        while buf != "":
-            if buf[0:10] == "encoding ":
-                self.encoding = buf[buf.find(' ') + 1:]
-            elif buf[0:7] == "gpgsig ":
-                sig = buf[buf.find(' ') + 1:] + "\n"
+        while buf:
+            if buf[0:10] == b"encoding ":
+                self.encoding = buf[buf.find(' ') + 1:].decode('ascii')
+            elif buf[0:7] == b"gpgsig ":
+                sig = buf[buf.find(b' ') + 1:] + b"\n"
                 is_next_header = False
                 while True:
                     sigbuf = readline()
-                    if sigbuf == "":
+                    if not sigbuf:
                         break
-                    if sigbuf[0:1] != " ":
+                    if sigbuf[0:1] != b" ":
                         buf = sigbuf.strip()
                         is_next_header = True
                         break
                     sig += sigbuf[1:]
-                self.gpgsig = sig.rstrip("\n")
+                # end read all signature
+                self.gpgsig = sig.rstrip(b"\n").decode('ascii')
                 if is_next_header:
                     continue
             buf = readline().strip()
-
         # decode the authors name
+
         try:
-            self.author.name = self.author.name.decode(self.encoding)
+            self.author, self.authored_date, self.author_tz_offset = \
+                parse_actor_and_date(author_line.decode(self.encoding))
         except UnicodeDecodeError:
-            log.error("Failed to decode author name '%s' using encoding %s", self.author.name, self.encoding,
+            log.error("Failed to decode author line '%s' using encoding %s", author_line, self.encoding,
                       exc_info=True)
-        # END handle author's encoding
 
-        # decode committer name
         try:
-            self.committer.name = self.committer.name.decode(self.encoding)
+            self.committer, self.committed_date, self.committer_tz_offset = \
+                parse_actor_and_date(committer_line.decode(self.encoding))
         except UnicodeDecodeError:
-            log.error("Failed to decode committer name '%s' using encoding %s", self.committer.name, self.encoding,
+            log.error("Failed to decode committer line '%s' using encoding %s", committer_line, self.encoding,
                       exc_info=True)
         # END handle author's encoding
 
@@ -488,6 +491,7 @@ class Commit(base.Object, Iterable, Diffable, Traversable, Serializable):
         except UnicodeDecodeError:
             log.error("Failed to decode message '%s' using encoding %s", self.message, self.encoding, exc_info=True)
         # END exception handling
+
         return self
 
     #} END serializable implementation
diff --git a/git/objects/fun.py b/git/objects/fun.py
index db2ec7c2..f92a4c06 100644
--- a/git/objects/fun.py
+++ b/git/objects/fun.py
@@ -1,6 +1,9 @@
 """Module with functions which are supposed to be as fast as possible"""
 from stat import S_ISDIR
 from git.compat import (
+    byte_ord,
+    force_bytes,
+    defenc,
     xrange,
     text_type
 )
@@ -17,13 +20,13 @@ def tree_to_stream(entries, write):
     bit_mask = 7            # 3 bits set
 
     for binsha, mode, name in entries:
-        mode_str = ''
+        mode_str = b''
         for i in xrange(6):
             mode_str = chr(((mode >> (i * 3)) & bit_mask) + ord_zero) + mode_str
         # END for each 8 octal value
 
         # git slices away the first octal if its zero
-        if mode_str[0] == '0':
+        if byte_ord(mode_str[0]) == ord_zero:
             mode_str = mode_str[1:]
         # END save a byte
 
@@ -33,16 +36,16 @@ def tree_to_stream(entries, write):
         # According to my tests, this is exactly what git does, that is it just
         # takes the input literally, which appears to be utf8 on linux.
         if isinstance(name, text_type):
-            name = name.encode("utf8")
-        write("%s %s\0%s" % (mode_str, name, binsha))
+            name = name.encode(defenc)
+        write(b''.join(mode_str, b' ', name, b'\0', binsha))
     # END for each item
 
-
 def tree_entries_from_data(data):
     """Reads the binary representation of a tree and returns tuples of Tree items
-    :param data: data block with tree data
+    :param data: data block with tree data (as bytes)
     :return: list(tuple(binsha, mode, tree_relative_path), ...)"""
     ord_zero = ord('0')
+    space_ord = ord(' ')
     len_data = len(data)
     i = 0
     out = list()
@@ -52,10 +55,10 @@ def tree_entries_from_data(data):
         # read mode
         # Some git versions truncate the leading 0, some don't
         # The type will be extracted from the mode later
-        while data[i] != ' ':
+        while byte_ord(data[i]) != space_ord:
             # move existing mode integer up one level being 3 bits
             # and add the actual ordinal value of the character
-            mode = (mode << 3) + (ord(data[i]) - ord_zero)
+            mode = (mode << 3) + (byte_ord(data[i]) - ord_zero)
             i += 1
         # END while reading mode
 
@@ -65,7 +68,7 @@ def tree_entries_from_data(data):
         # parse name, it is NULL separated
 
         ns = i
-        while data[i] != '\0':
+        while byte_ord(data[i]) != 0:
             i += 1
         # END while not reached NULL
 
@@ -73,12 +76,9 @@ def tree_entries_from_data(data):
         # Only use the respective unicode object if the byte stream was encoded
         name = data[ns:i]
         try:
-            name_enc = name.decode("utf-8")
+            name = name.decode(defenc)
         except UnicodeDecodeError:
             pass
-        else:
-            if len(name) > len(name_enc):
-                name = name_enc
         # END handle encoding
 
         # byte is NULL, get next 20
author	Sebastian Thiel <byronimo@gmail.com>	2015-01-05 16:44:54 +0100
committer	Sebastian Thiel <byronimo@gmail.com>	2015-01-05 16:44:54 +0100
commit	8a308613467a1510f8dac514624abae4e10c0779 (patch)
tree	6c703cb00875ca77972b6250db8b63800a051502
parent	3d0556a31916a709e9da3eafb92fc6b8bf69896c (diff)
download	gitpython-8a308613467a1510f8dac514624abae4e10c0779.tar.gz