diff options
author | James Saryerwinnie <js@jamesls.com> | 2012-09-06 10:55:18 -0700 |
---|---|---|
committer | James Saryerwinnie <js@jamesls.com> | 2012-09-06 15:20:41 -0700 |
commit | f117db58ae25a788a0ab522e89986cea6bded31a (patch) | |
tree | cdf0e3aa2940eb92dab9449b4ad6f05977b8382b | |
parent | f1b007e6c8f29096e5e20266c66df873b777dc4e (diff) | |
download | boto-f117db58ae25a788a0ab522e89986cea6bded31a.tar.gz |
Reduce memory usage for chunk_hashes
On a 180MB file this reduced total memory
usage by approximately 40%. This was also marginally
faster (but not by much).
I've also added the start of unit tests for the writer module,
and I've written some very basic unit tests for the chunk_hashes
function.
-rw-r--r-- | boto/glacier/writer.py | 20 | ||||
-rw-r--r-- | tests/unit/glacier/test_writer.py | 26 |
2 files changed, 37 insertions, 9 deletions
diff --git a/boto/glacier/writer.py b/boto/glacier/writer.py index aca94e76..b57723c7 100644 --- a/boto/glacier/writer.py +++ b/boto/glacier/writer.py @@ -28,15 +28,17 @@ import math import json -def chunk_hashes(str): - """ - Break up the byte-string into 1MB chunks and return sha256 hashes - for each. - """ - chunk = 1024 * 1024 - chunk_count = int(math.ceil(len(str) / float(chunk))) - chunks = [str[i * chunk:(i + 1) * chunk] for i in range(chunk_count)] - return [hashlib.sha256(x).digest() for x in chunks] +_ONE_MEGABYTE = 1024 * 1024 + + +def chunk_hashes(bytestring, chunk_size=_ONE_MEGABYTE): + chunk_count = int(math.ceil(len(bytestring) / float(chunk_size))) + hashes = [] + for i in xrange(chunk_count): + start = i * chunk_size + end = (i + 1) * chunk_size + hashes.append(hashlib.sha256(bytestring[start:end]).digest()) + return hashes def tree_hash(fo): diff --git a/tests/unit/glacier/test_writer.py b/tests/unit/glacier/test_writer.py new file mode 100644 index 00000000..216429fd --- /dev/null +++ b/tests/unit/glacier/test_writer.py @@ -0,0 +1,26 @@ +from hashlib import sha256 + +from tests.unit import unittest +import mock + +from boto.glacier.writer import Writer, chunk_hashes + + +class TestChunking(unittest.TestCase): + def test_chunk_hashes_exact(self): + chunks = chunk_hashes('a' * (2 * 1024 * 1024)) + self.assertEqual(len(chunks), 2) + self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest()) + + def test_chunks_with_leftovers(self): + bytestring = 'a' * (2 * 1024 * 1024 + 20) + chunks = chunk_hashes(bytestring) + self.assertEqual(len(chunks), 3) + self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest()) + self.assertEqual(chunks[1], sha256('a' * 1024 * 1024).digest()) + self.assertEqual(chunks[2], sha256('a' * 20).digest()) + + def test_less_than_one_chunk(self): + chunks = chunk_hashes('aaaa') + self.assertEqual(len(chunks), 1) + self.assertEqual(chunks[0], sha256('aaaa').digest()) |