diff options
author | James Saryerwinnie <js@jamesls.com> | 2012-09-06 10:55:18 -0700 |
---|---|---|
committer | James Saryerwinnie <js@jamesls.com> | 2012-09-06 15:20:41 -0700 |
commit | f117db58ae25a788a0ab522e89986cea6bded31a (patch) | |
tree | cdf0e3aa2940eb92dab9449b4ad6f05977b8382b | |
parent | f1b007e6c8f29096e5e20266c66df873b777dc4e (diff) | |
download | boto-f117db58ae25a788a0ab522e89986cea6bded31a.tar.gz |
Reduce memory usage for chunk_hashes
On a 180MB file this reduced total memory
usage by approximately 40%. This was also marginally
faster (but not by much).
I've also added the start of unit tests for the writer module,
and I've written some very basic unit tests for the chunk_hashes
function.
-rw-r--r-- | boto/glacier/writer.py | 20 | ||||
-rw-r--r-- | tests/unit/glacier/test_writer.py | 26 |
2 files changed, 37 insertions, 9 deletions
diff --git a/boto/glacier/writer.py b/boto/glacier/writer.py index aca94e76..b57723c7 100644 --- a/boto/glacier/writer.py +++ b/boto/glacier/writer.py @@ -28,15 +28,17 @@ import math import json -def chunk_hashes(str): - """ - Break up the byte-string into 1MB chunks and return sha256 hashes - for each. - """ - chunk = 1024 * 1024 - chunk_count = int(math.ceil(len(str) / float(chunk))) - chunks = [str[i * chunk:(i + 1) * chunk] for i in range(chunk_count)] - return [hashlib.sha256(x).digest() for x in chunks] +_ONE_MEGABYTE = 1024 * 1024 + + +def chunk_hashes(bytestring, chunk_size=_ONE_MEGABYTE): + chunk_count = int(math.ceil(len(bytestring) / float(chunk_size))) + hashes = [] + for i in xrange(chunk_count): + start = i * chunk_size + end = (i + 1) * chunk_size + hashes.append(hashlib.sha256(bytestring[start:end]).digest()) + return hashes def tree_hash(fo): diff --git a/tests/unit/glacier/test_writer.py b/tests/unit/glacier/test_writer.py new file mode 100644 index 00000000..216429fd --- /dev/null +++ b/tests/unit/glacier/test_writer.py @@ -0,0 +1,26 @@ +from hashlib import sha256 + +from tests.unit import unittest +import mock + +from boto.glacier.writer import Writer, chunk_hashes + + +class TestChunking(unittest.TestCase): + def test_chunk_hashes_exact(self): + chunks = chunk_hashes('a' * (2 * 1024 * 1024)) + self.assertEqual(len(chunks), 2) + self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest()) + + def test_chunks_with_leftovers(self): + bytestring = 'a' * (2 * 1024 * 1024 + 20) + chunks = chunk_hashes(bytestring) + self.assertEqual(len(chunks), 3) + self.assertEqual(chunks[0], sha256('a' * 1024 * 1024).digest()) + self.assertEqual(chunks[1], sha256('a' * 1024 * 1024).digest()) + self.assertEqual(chunks[2], sha256('a' * 20).digest()) + + def test_less_than_one_chunk(self): + chunks = chunk_hashes('aaaa') + self.assertEqual(len(chunks), 1) + self.assertEqual(chunks[0], sha256('aaaa').digest()) |