# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS.  All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
# This simple script pulls test files from the webm homepage
# It is intelligent enough to only pull files if
# 1) File / test_data folder does not exist
# 2) SHA mismatch
#
# Usage: get_files.py -u <url> -i <input_csv> -o <output_dir>
# The long forms --url, --input_csv and --output_dir are accepted as well.
#
# Every print below uses the single-argument parenthesised form, which is
# valid both as a Python 2 print statement and as a Python 3 function call.

import csv
import getopt
import hashlib
import itertools
import os.path
import re
import sys
import time

import pycurl

# constants
ftp_retries = 3      # download attempts per file before giving up

SHA_COL = 0          # column of the list file holding the expected sha1
NAME_COL = 1         # column of the list file holding the file name
EXPECTED_COL = 2     # number of columns a valid list-file row must have
HASH_CHUNK = 65536   # bytes read per iteration while hashing

USAGE = 'get_files.py -u <url> -i <input_csv> -o <output_dir>'

# globals (filled in from the command line below)
url = ''
file_list_path = ''
local_resource_path = ''


# Helper functions:
def get_file_sha(filename):
    """Return the SHA-1 digest of `filename` in hex, or None if unreadable.

    The file is read in HASH_CHUNK-sized pieces so it is never held in
    memory as a whole.  An IOError is reported on stdout and results in
    None, which never compares equal to an expected sha string.
    """
    sha_hash = hashlib.sha1()
    try:
        with open(filename, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: stream.read(HASH_CHUNK), b''):
                sha_hash.update(chunk)
    except IOError:
        print("Error reading " + filename)
        return None
    return sha_hash.hexdigest()


def download_and_check_sha(url, filename, sha):
    """Download url/filename into local_resource_path and verify its sha.

    Args:
        url: base url; "/" and `filename` are appended to it.
        filename: name of the remote file, also used as the local name.
        sha: expected SHA-1 hex digest of the downloaded file.

    Returns:
        True if the sha of the file written to disk equals `sha`.

    Errors raised by pycurl are not caught here; the output file and the
    curl handle are released before they propagate.
    """
    path = os.path.join(local_resource_path, filename)
    with open(path, "wb") as fp:
        curl = pycurl.Curl()
        try:
            curl.setopt(pycurl.URL, url + "/" + filename)
            curl.setopt(pycurl.WRITEDATA, fp)
            curl.perform()
        finally:
            curl.close()
    # The file is closed (and therefore flushed) before it is hashed.
    return get_file_sha(path) == sha


# Main script
try:
    opts, args = getopt.getopt(sys.argv[1:], "u:i:o:",
                               ["url=", "input_csv=", "output_dir="])
except getopt.GetoptError:
    print(USAGE)
    sys.exit(2)

for opt, arg in opts:
    if opt in ("-u", "--url"):
        url = arg
    elif opt in ("-i", "--input_csv"):
        file_list_path = os.path.join(arg)
    elif opt in ("-o", "--output_dir"):
        local_resource_path = os.path.join(arg)

# All three values are required.  Checking the parsed values rather than
# the raw argument count also accepts the "--option=value" spelling.
if not (url and file_list_path and local_resource_path):
    print("Expects two paths and a url!")
    sys.exit(1)

if not os.path.isdir(local_resource_path):
    os.makedirs(local_resource_path)

file_shas = []
file_names = []

# Our 'csv' file uses multiple spaces as a delimiter, python's
# csv class only uses single character delimiters, so we convert them below
with open(file_list_path, "r") as file_list_csv:
    file_list_reader = csv.reader(
        (re.sub(' +', ' ', line) for line in file_list_csv),
        delimiter=' ')
    for row in file_list_reader:
        # Rows that are not exactly "<sha> <name>" are ignored.
        if len(row) != EXPECTED_COL:
            continue
        file_shas.append(row[SHA_COL])
        file_names.append(row[NAME_COL])

# Download files, only if they don't already exist and have correct shas
for filename, sha in zip(file_names, file_shas):
    path = os.path.join(local_resource_path, filename)
    if os.path.isfile(path) and get_file_sha(path) == sha:
        print(path + ' exists, skipping')
        continue
    for retry in range(ftp_retries):
        print("Downloading " + path)
        if download_and_check_sha(url, filename, sha):
            break
        print("Sha does not match, retrying...")
    else:
        # Every attempt produced a mismatching sha; say so instead of
        # silently moving on to the next file.
        print("Giving up on " + path + " after " + str(ftp_retries) +
              " attempts")