1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
|
#!/usr/bin/python
# This file is part of Ansible
#
# Ansible is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Ansible is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Ansible. If not, see <http://www.gnu.org/licenses/>.
ANSIBLE_METADATA = {'metadata_version': '1.0',
'status': ['preview'],
'supported_by': 'community'}
DOCUMENTATION = '''
---
module: s3_sync
short_description: Efficiently upload multiple files to S3
description:
- The S3 module is great, but it is very slow for a large volume of files- even a dozen will be noticeable. In addition to speed, it handles globbing,
inclusions/exclusions, mime types, expiration mapping, recursion, and smart directory mapping.
version_added: "2.3"
options:
mode:
description:
- sync direction.
required: true
default: 'push'
choices: [ push ]
file_change_strategy:
description:
- Difference determination method to allow changes-only syncing. Unlike rsync, files are not patched- they are fully skipped or fully uploaded.
- date_size will upload if file sizes don't match or if local file modified date is newer than s3's version
- checksum will compare etag values based on s3's implementation of chunked md5s.
- force will always upload all files.
required: false
default: 'date_size'
choices: [ force, checksum, date_size ]
bucket:
description:
- Bucket name.
required: true
key_prefix:
description:
- In addition to file path, prepend s3 path with this prefix. Module will add slash at end of prefix if necessary.
required: false
file_root:
description:
- File/directory path for synchronization. This is a local path.
- This root path is scrubbed from the key name, so subdirectories will remain as keys.
required: true
permission:
description:
- Canned ACL to apply to synced files.
- Changing this ACL only changes newly synced files, it does not trigger a full reupload.
required: false
choices: [ '', private, public-read, public-read-write, authenticated-read, aws-exec-read, bucket-owner-read, bucket-owner-full-control ]
mime_map:
description:
- >
Dict entry from extension to MIME type. This will override any default/sniffed MIME type.
For example C({".txt": "application/text", ".yml": "appication/text"})
required: false
include:
description:
- Shell pattern-style file matching.
- Used before exclude to determine eligible files (for instance, only "*.gif")
- For multiple patterns, comma-separate them.
required: false
default: "*"
exclude:
description:
- Shell pattern-style file matching.
- Used after include to remove files (for instance, skip "*.txt")
- For multiple patterns, comma-separate them.
required: false
default: ".*"
author: tedder
extends_documentation_fragment:
- aws
- ec2
'''
EXAMPLES = '''
- name: basic upload
s3_sync:
bucket: tedder
file_root: roles/s3/files/
- name: all the options
s3_sync:
bucket: tedder
file_root: roles/s3/files
mime_map:
.yml: application/text
.json: application/text
key_prefix: config_files/web
file_change_strategy: force
permission: public-read
include: "*"
exclude: "*.txt,.*"
'''
RETURN = '''
filelist_initial:
description: file listing (dicts) from inital globbing
returned: always
type: list
sample: [{
"bytes": 151,
"chopped_path": "policy.json",
"fullpath": "roles/cf/files/policy.json",
"modified_epoch": 1477416706
}]
filelist_local_etag:
description: file listing (dicts) including calculated local etag
returned: always
type: list
sample: [{
"bytes": 151,
"chopped_path": "policy.json",
"fullpath": "roles/cf/files/policy.json",
"mime_type": "application/json",
"modified_epoch": 1477416706,
"s3_path": "s3sync/policy.json"
}]
filelist_s3:
description: file listing (dicts) including information about previously-uploaded versions
returned: always
type: list
sample: [{
"bytes": 151,
"chopped_path": "policy.json",
"fullpath": "roles/cf/files/policy.json",
"mime_type": "application/json",
"modified_epoch": 1477416706,
"s3_path": "s3sync/policy.json"
}]
filelist_typed:
description: file listing (dicts) with calculated or overridden mime types
returned: always
type: list
sample: [{
"bytes": 151,
"chopped_path": "policy.json",
"fullpath": "roles/cf/files/policy.json",
"mime_type": "application/json",
"modified_epoch": 1477416706
}]
filelist_actionable:
description: file listing (dicts) of files that will be uploaded after the strategy decision
returned: always
type: list
sample: [{
"bytes": 151,
"chopped_path": "policy.json",
"fullpath": "roles/cf/files/policy.json",
"mime_type": "application/json",
"modified_epoch": 1477931256,
"s3_path": "s3sync/policy.json",
"whysize": "151 / 151",
"whytime": "1477931256 / 1477929260"
}]
uploaded:
description: file listing (dicts) of files that were actually uploaded
returned: always
type: list
sample: [{
"bytes": 151,
"chopped_path": "policy.json",
"fullpath": "roles/cf/files/policy.json",
"s3_path": "s3sync/policy.json",
"whysize": "151 / 151",
"whytime": "1477931637 / 1477931489"
}]
'''
import os
import stat as osstat # os.stat constants
import mimetypes
import datetime
from dateutil import tz
import hashlib
import fnmatch
import traceback
# import module snippets
from ansible.module_utils.basic import AnsibleModule
from ansible.module_utils.ec2 import ec2_argument_spec
# import a class, otherwise we'll use a fully qualified path
# from ansible.module_utils.ec2 import AWSRetry
import ansible.module_utils.ec2
try:
import botocore
HAS_BOTO3 = True
except ImportError:
HAS_BOTO3 = False
def boto_exception(err):
'''generic error message handler'''
if hasattr(err, 'error_message'):
error = err.error_message
elif hasattr(err, 'message'):
error = str(err.message) + ' ' + str(err) + ' - ' + str(type(err))
else:
error = '%s: %s' % (Exception, err)
return error
# the following function, calculate_multipart_etag, is from tlastowka
# on github and is used under its (compatible) GPL license. So this
# license applies to the following function.
# source: https://github.com/tlastowka/calculate_multipart_etag/blob/master/calculate_multipart_etag.py
#
# calculate_multipart_etag Copyright (C) 2015
# Tony Lastowka <tlastowka at gmail dot com>
# https://github.com/tlastowka
#
#
# calculate_multipart_etag is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# calculate_multipart_etag is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with calculate_multipart_etag. If not, see <http://www.gnu.org/licenses/>.
DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
def calculate_multipart_etag(source_path, chunk_size=DEFAULT_CHUNK_SIZE):
"""
calculates a multipart upload etag for amazon s3
Arguments:
source_path -- The file to calculate the etag for
chunk_size -- The chunk size to calculate for.
"""
md5s = []
with open(source_path, 'rb') as fp:
while True:
data = fp.read(chunk_size)
if not data:
break
md5s.append(hashlib.md5(data))
if len(md5s) == 1:
new_etag = '"{}"'.format(md5s[0].hexdigest())
else: # > 1
digests = b"".join(m.digest() for m in md5s)
new_md5 = hashlib.md5(digests)
new_etag = '"{}-{}"'.format(new_md5.hexdigest(), len(md5s))
return new_etag
def gather_files(fileroot, include=None, exclude=None):
ret = []
for (dirpath, dirnames, filenames) in os.walk(fileroot):
for fn in filenames:
fullpath = os.path.join(dirpath, fn)
# include/exclude
if include:
found = False
for x in include.split(','):
if fnmatch.fnmatch(fn, x):
found = True
if not found:
# not on the include list, so we don't want it.
continue
if exclude:
found = False
for x in exclude.split(','):
if fnmatch.fnmatch(fn, x):
found = True
if found:
# skip it, even if previously included.
continue
chopped_path = os.path.relpath(fullpath, start=fileroot)
fstat = os.stat(fullpath)
f_size = fstat[osstat.ST_SIZE]
f_modified_epoch = fstat[osstat.ST_MTIME]
ret.append({
'fullpath': fullpath,
'chopped_path': chopped_path,
'modified_epoch': f_modified_epoch,
'bytes': f_size,
})
# dirpath = path *to* the directory
# dirnames = subdirs *in* our directory
# filenames
return ret
def calculate_s3_path(filelist, key_prefix=''):
ret = []
for fileentry in filelist:
# don't modify the input dict
retentry = fileentry.copy()
retentry['s3_path'] = os.path.join(key_prefix, fileentry['chopped_path'])
ret.append(retentry)
return ret
def calculate_local_etag(filelist, key_prefix=''):
'''Really, "calculate md5", but since AWS uses their own format, we'll just call
it a "local etag". TODO optimization: only calculate if remote key exists.'''
ret = []
for fileentry in filelist:
# don't modify the input dict
retentry = fileentry.copy()
retentry['local_etag'] = calculate_multipart_etag(fileentry['fullpath'])
ret.append(retentry)
return ret
def determine_mimetypes(filelist, override_map):
ret = []
for fileentry in filelist:
retentry = fileentry.copy()
localfile = fileentry['fullpath']
# reminder: file extension is '.txt', not 'txt'.
_, file_extension = os.path.splitext(localfile)
if override_map and override_map.get(file_extension):
# override? use it.
retentry['mime_type'] = override_map[file_extension]
else:
# else sniff it
retentry['mime_type'], retentry['encoding'] = mimetypes.guess_type(localfile, strict=False)
# might be None or '' from one of the above. Not a great type but better than nothing.
if not retentry['mime_type']:
retentry['mime_type'] = 'application/octet-stream'
ret.append(retentry)
return ret
def head_s3(s3, bucket, s3keys):
retkeys = []
for entry in s3keys:
retentry = entry.copy()
# don't modify the input dict
try:
retentry['s3_head'] = s3.head_object(Bucket=bucket, Key=entry['s3_path'])
except botocore.exceptions.ClientError as err:
if (hasattr(err, 'response') and
'ResponseMetadata' in err.response and
'HTTPStatusCode' in err.response['ResponseMetadata'] and
str(err.response['ResponseMetadata']['HTTPStatusCode']) == '404'):
pass
else:
raise Exception(err)
# error_msg = boto_exception(err)
# return {'error': error_msg}
retkeys.append(retentry)
return retkeys
def filter_list(s3, bucket, s3filelist, strategy):
keeplist = list(s3filelist)
for e in keeplist:
e['_strategy'] = strategy
# init/fetch info from S3 if we're going to use it for comparisons
if not strategy == 'force':
keeplist = head_s3(s3, bucket, s3filelist)
# now actually run the strategies
if strategy == 'checksum':
for entry in keeplist:
if entry.get('s3_head'):
# since we have a remote s3 object, compare the values.
if entry['s3_head']['ETag'] == entry['local_etag']:
# files match, so remove the entry
entry['skip_flag'] = True
else:
# file etags don't match, keep the entry.
pass
else: # we don't have an etag, so we'll keep it.
pass
elif strategy == 'date_size':
for entry in keeplist:
if entry.get('s3_head'):
# fstat = entry['stat']
local_modified_epoch = entry['modified_epoch']
local_size = entry['bytes']
# py2's datetime doesn't have a timestamp() field, so we have to revert to something more awkward.
# remote_modified_epoch = entry['s3_head']['LastModified'].timestamp()
remote_modified_datetime = entry['s3_head']['LastModified']
delta = (remote_modified_datetime - datetime.datetime(1970, 1, 1, tzinfo=tz.tzutc()))
remote_modified_epoch = delta.seconds + (delta.days * 86400)
remote_size = entry['s3_head']['ContentLength']
entry['whytime'] = '{} / {}'.format(local_modified_epoch, remote_modified_epoch)
entry['whysize'] = '{} / {}'.format(local_size, remote_size)
if local_modified_epoch <= remote_modified_epoch or local_size == remote_size:
entry['skip_flag'] = True
else:
entry['why'] = "no s3_head"
# else: probably 'force'. Basically we don't skip with any with other strategies.
else:
pass
# prune 'please skip' entries, if any.
return [x for x in keeplist if not x.get('skip_flag')]
def upload_files(s3, bucket, filelist, params):
ret = []
for entry in filelist:
args = {
'ContentType': entry['mime_type']
}
if params.get('permission'):
args['ACL'] = params['permission']
# if this fails exception is caught in main()
s3.upload_file(entry['fullpath'], bucket, entry['s3_path'], ExtraArgs=args, Callback=None, Config=None)
ret.append(entry)
return ret
def main():
argument_spec = ec2_argument_spec()
argument_spec.update(dict(
mode=dict(choices=['push'], default='push'),
file_change_strategy=dict(choices=['force', 'date_size', 'checksum'], default='date_size'),
bucket=dict(required=True),
key_prefix=dict(required=False, default=''),
file_root=dict(required=True, type='path'),
permission=dict(required=False, choices=['private', 'public-read', 'public-read-write', 'authenticated-read',
'aws-exec-read', 'bucket-owner-read', 'bucket-owner-full-control']),
retries=dict(required=False),
mime_map=dict(required=False, type='dict'),
exclude=dict(required=False, default=".*"),
include=dict(required=False, default="*"),
# future options: cache_control (string or map, perhaps), encoding, metadata, storage_class, retries
)
)
module = AnsibleModule(
argument_spec=argument_spec,
)
if not HAS_BOTO3:
module.fail_json(msg='boto3 required for this module')
result = {}
mode = module.params['mode']
region, ec2_url, aws_connect_kwargs = ansible.module_utils.ec2.get_aws_connection_info(module, boto3=True)
if not region:
module.fail_json(msg="Region must be specified")
s3 = ansible.module_utils.ec2.boto3_conn(module, conn_type='client', resource='s3', region=region, endpoint=ec2_url, **aws_connect_kwargs)
if mode == 'push':
try:
result['filelist_initial'] = gather_files(module.params['file_root'], exclude=module.params['exclude'], include=module.params['include'])
result['filelist_typed'] = determine_mimetypes(result['filelist_initial'], module.params.get('mime_map'))
result['filelist_s3'] = calculate_s3_path(result['filelist_typed'], module.params['key_prefix'])
result['filelist_local_etag'] = calculate_local_etag(result['filelist_s3'])
result['filelist_actionable'] = filter_list(s3, module.params['bucket'], result['filelist_local_etag'], module.params['file_change_strategy'])
result['uploads'] = upload_files(s3, module.params['bucket'], result['filelist_actionable'], module.params)
# mark changed if we actually upload something.
if result.get('uploads') and len(result.get('uploads')):
result['changed'] = True
# result.update(filelist=actionable_filelist)
except botocore.exceptions.ClientError as err:
error_msg = boto_exception(err)
module.fail_json(msg=error_msg, exception=traceback.format_exc(), **camel_dict_to_snake_dict(err.response))
module.exit_json(**result)
if __name__ == '__main__':
main()
|