1# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
2#
3# Use of this source code is governed by a BSD-style license
4# that can be found in the LICENSE file in the root of the source
5# tree. An additional intellectual property rights grant can be found
6# in the file PATENTS.  All contributing project authors may
7# be found in the AUTHORS file in the root of the source tree.
8#
9# This simple script pulls test files from the webm homepage
10# It is intelligent enough to only pull files if
11#   1) File / test_data folder does not exist
12#   2) SHA mismatch
13
14import pycurl
15import csv
16import hashlib
17import re
18import os.path
19import time
20import itertools
21import sys
22import getopt
23
#globals
# All three are populated from the command-line options parsed below.
url = ''                  # base url to download from (-u/--url)
file_list_path = ''       # csv listing "<sha> <filename>" rows (-i/--input_csv)
local_resource_path = ''  # directory downloaded files are written to (-o/--output_dir)
28
29# Helper functions:
30# A simple function which returns the sha hash of a file in hex
def get_file_sha(filename, chunk_size=None):
  """Return the SHA-1 hex digest of a file, or None if it cannot be read.

  chunk_size overrides the number of bytes read per iteration; when None
  (the default, which keeps the original call signature working) the
  module-level HASH_CHUNK is used.  The None return on IOError compares
  unequal to any sha string, which makes callers treat the file as stale.
  """
  if chunk_size is None:
    chunk_size = HASH_CHUNK
  try:
    sha_hash = hashlib.sha1()
    # 'fp' rather than 'file' -- avoid shadowing the builtin.
    with open(filename, 'rb') as fp:
      buf = fp.read(chunk_size)
      while len(buf) > 0:
        sha_hash.update(buf)
        buf = fp.read(chunk_size)
      return sha_hash.hexdigest()
  except IOError:
    # Best-effort: report the problem and fall through, returning None.
    # Parenthesized single-argument print works in both Python 2 and 3.
    print("Error reading " + filename)
42
43# Downloads a file from a url, and then checks the sha against the passed
44# in sha
def download_and_check_sha(url, filename, sha):
  """Download url/filename into local_resource_path and verify its SHA-1.

  Returns True when the downloaded file's hash matches `sha`, else False.
  """
  path = os.path.join(local_resource_path, filename)
  # Close both the output file and the curl handle even when the transfer
  # raises; the original leaked them on any pycurl error.
  fp = open(path, "wb")
  try:
    curl = pycurl.Curl()
    try:
      curl.setopt(pycurl.URL, url + "/" + filename)
      curl.setopt(pycurl.WRITEDATA, fp)
      curl.perform()
    finally:
      curl.close()
  finally:
    fp.close()
  return get_file_sha(path) == sha
55
#constants
# Number of download attempts per file before giving up.
ftp_retries = 3

# Column layout of a row in the input csv: "<sha> <filename>".
SHA_COL = 0
NAME_COL = 1
EXPECTED_COL = 2    # expected number of columns in a well-formed row
HASH_CHUNK = 65536  # bytes read per iteration when hashing a file
63
# Main script
try:
  opts, args = \
      getopt.getopt(sys.argv[1:], \
                    "u:i:o:", ["url=", "input_csv=", "output_dir="])
except getopt.GetoptError:
  # Catch only getopt's own error; a bare except also swallowed
  # KeyboardInterrupt/SystemExit and genuine bugs.
  print('get_files.py -u <url> -i <input_csv> -o <output_dir>')
  sys.exit(2)

for opt, arg in opts:
  if opt in ("-u", "--url"):  # bug fix: the --url long form was ignored
    url = arg
  elif opt in ("-i", "--input_csv"):
    file_list_path = os.path.join(arg)
  elif opt in ("-o", "--output_dir"):
    local_resource_path = os.path.join(arg)

# Validate the parsed option values rather than len(sys.argv): the long
# option forms ("--url=x") produce a different argv length and were
# wrongly rejected by the old "!= 7" check.
if not (url and file_list_path and local_resource_path):
  print("Expects two paths and a url!")
  sys.exit(1)

if not os.path.isdir(local_resource_path):
  os.makedirs(local_resource_path)

file_shas = []
file_names = []

# Our 'csv' file uses multiple spaces as a delimiter, python's
# csv class only uses single character delimiters, so we convert them below
with open(file_list_path, "rb") as file_list_csv:
  file_list_reader = csv.reader((re.sub(' +', ' ', line)
                                 for line in file_list_csv), delimiter=' ')
  for row in file_list_reader:
    # Silently skip rows that do not have exactly "<sha> <name>".
    if len(row) != EXPECTED_COL:
      continue
    file_shas.append(row[SHA_COL])
    file_names.append(row[NAME_COL])

# Download files, only if they don't already exist and have correct shas.
# zip behaves identically to itertools.izip for these small lists and is
# portable across Python versions.
for filename, sha in zip(file_names, file_shas):
  path = os.path.join(local_resource_path, filename)
  if os.path.isfile(path) and get_file_sha(path) == sha:
    print(path + ' exists, skipping')
    continue
  for retry in range(0, ftp_retries):
    print("Downloading " + path)
    if not download_and_check_sha(url, filename, sha):
      print("Sha does not match, retrying...")
    else:
      break