# Copyright (c) 2012 Amazon.com, Inc. or its affiliates.  All Rights Reserved
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
import hashlib
import math
import binascii

from boto.compat import six


_MEGABYTE = 1024 * 1024
DEFAULT_PART_SIZE = 4 * _MEGABYTE
MAXIMUM_NUMBER_OF_PARTS = 10000


def minimum_part_size(size_in_bytes, default_part_size=DEFAULT_PART_SIZE):
    """Calculate the minimum part size needed for a multipart upload.

    Glacier allows a maximum of 10,000 parts per upload.  It also
    states that the maximum archive size is 10,000 * 4 GB, which means
    the part size can range from 1MB to 4GB (provided it is 1MB
    multiplied by a power of 2).

    This function will compute what the minimum part size must be in
    order to upload a file of size ``size_in_bytes``.

    It will first check if ``default_part_size`` is sufficient for
    a part size given the ``size_in_bytes``.  If this is not the case,
    then the smallest part size that can accommodate a file of size
    ``size_in_bytes`` will be returned.

    If the file size is greater than the maximum allowed archive
    size of 10,000 * 4GB, a ``ValueError`` will be raised.

    """
    # The default part size (4 MB) will be too small for a very large
    # archive, as there is a limit of 10,000 parts in a multipart upload.
    # This puts the maximum allowed archive size with the default part size
    # at 40,000 MB.  We need to do a sanity check on the part size, and find
    # one that works if the default is too small.
    part_size = _MEGABYTE
    if (default_part_size * MAXIMUM_NUMBER_OF_PARTS) < size_in_bytes:
        if size_in_bytes > (4096 * _MEGABYTE * 10000):
            raise ValueError("File size too large: %s" % size_in_bytes)
        min_part_size = size_in_bytes / 10000
        # The default 4 MB is already known to be too small here, so start
        # the search at 8 MB (2 ** 3 MB) and double until the parts fit.
        power = 3
        while part_size < min_part_size:
            part_size = math.ldexp(_MEGABYTE, power)
            power += 1
        part_size = int(part_size)
    else:
        part_size = default_part_size
    return part_size
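

# Illustrative example, not part of the original module: with the default
# 4 MB part size a 100 GB archive would need 25,600 parts, which exceeds
# the 10,000-part limit, so minimum_part_size() steps up to the next
# power-of-two multiple of 1 MB that fits:
#
#     minimum_part_size(100 * 1024 * _MEGABYTE)  # 16777216 (16 MB -> 6,400 parts)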


def chunk_hashes(bytestring, chunk_size=_MEGABYTE):
    """Return the SHA-256 digest of each ``chunk_size`` slice of ``bytestring``."""
    chunk_count = int(math.ceil(len(bytestring) / float(chunk_size)))
    hashes = []
    for i in range(chunk_count):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        hashes.append(hashlib.sha256(bytestring[start:end]).digest())
    if not hashes:
        return [hashlib.sha256(b'').digest()]
    return hashes


def tree_hash(fo):
    """
    Given the hash of each 1MB chunk (from chunk_hashes), this hashes
    adjacent pairs together, level by level, until a single root hash
    remains: the tree hash.
    """
    hashes = []
    hashes.extend(fo)
    while len(hashes) > 1:
        new_hashes = []
        while True:
            if len(hashes) > 1:
                first = hashes.pop(0)
                second = hashes.pop(0)
                new_hashes.append(hashlib.sha256(first + second).digest())
            elif len(hashes) == 1:
                # An odd hash at the end of a level is carried up unchanged.
                only = hashes.pop(0)
                new_hashes.append(only)
            else:
                break
        hashes.extend(new_hashes)
    return hashes[0]


def compute_hashes_from_fileobj(fileobj, chunk_size=1024 * 1024):
    """Compute the linear and tree hash from a fileobj.

    This function will compute the linear/tree hash of a fileobj
    in a single pass through the fileobj.

    :param fileobj: A file like object.

    :param chunk_size: The size of the chunks to use for the tree
        hash.  This is also the buffer size used to read from
        `fileobj`.

    :rtype: tuple
    :return: A tuple of (linear_hash, tree_hash).  Both hashes
        are returned in hex.

    """
    # Python 3+, not opened in binary mode.
    if six.PY3 and hasattr(fileobj, 'mode') and 'b' not in fileobj.mode:
        raise ValueError('File-like object must be opened in binary mode!')

    linear_hash = hashlib.sha256()
    chunks = []
    chunk = fileobj.read(chunk_size)
    while chunk:
        # It's possible to get a file-like object that has no mode (checked
        # above) and returns something other than bytes (e.g. str).  So here
        # we try to catch that and encode to bytes.
        if not isinstance(chunk, bytes):
            chunk = chunk.encode(getattr(fileobj, 'encoding', '') or 'utf-8')
        linear_hash.update(chunk)
        chunks.append(hashlib.sha256(chunk).digest())
        chunk = fileobj.read(chunk_size)
    if not chunks:
        chunks = [hashlib.sha256(b'').digest()]
    return linear_hash.hexdigest(), bytes_to_hex(tree_hash(chunks))


def bytes_to_hex(str_as_bytes):
    return binascii.hexlify(str_as_bytes)


def tree_hash_from_str(str_as_bytes):
    """

    :type str_as_bytes: bytes
    :param str_as_bytes: The string for which to compute the tree hash.

    :rtype: bytes
    :return: The computed tree hash, returned as hex.

    """
    return bytes_to_hex(tree_hash(chunk_hashes(str_as_bytes)))
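

# Illustrative example, not part of the original module: a 5 MB byte string
# yields five 1 MB leaf digests from chunk_hashes(); tree_hash() then pairs
# adjacent digests level by level (5 -> 3 -> 2 -> 1), carrying an odd
# leftover digest up unchanged, and tree_hash_from_str() hex-encodes the
# single remaining root digest:
#
#     tree_hash_from_str(b'x' * 5 * _MEGABYTE)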


class ResettingFileSender(object):
    """Send an archive file object and rewind it afterwards.

    Seeking back to the starting offset in a ``finally`` block means the
    same file object can be re-sent if the request is retried.
    """
    def __init__(self, archive):
        self._archive = archive
        self._starting_offset = archive.tell()

    def __call__(self, connection, method, path, body, headers):
        try:
            connection.request(method, path, self._archive, headers)
            return connection.getresponse()
        finally:
            self._archive.seek(self._starting_offset)
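

if __name__ == '__main__':
    # Illustrative usage sketch, not part of the original boto module: it
    # exercises the helpers above with an arbitrary in-memory payload.
    import io

    payload = b'x' * (3 * _MEGABYTE + 123)  # a little over three 1 MB chunks
    linear, tree = compute_hashes_from_fileobj(io.BytesIO(payload))
    # The single-pass tree hash must agree with the one computed directly
    # from the full byte string.
    assert tree == tree_hash_from_str(payload)
    print('linear sha256:', linear)
    print('tree hash:', tree)
    # With the default 4 MB parts a 100 GB archive would need 25,600 parts,
    # so a larger power-of-two part size is chosen.
    print('100 GB part size:', minimum_part_size(100 * 1024 * _MEGABYTE))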