1# -*- coding: utf-8 -*-
2# Copyright 2010 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Static data and helper functions."""
16
17from __future__ import absolute_import
18
19import collections
20import errno
21import logging
22import math
23import multiprocessing
24import os
25import pkgutil
26import re
27import struct
28import sys
29import tempfile
30import textwrap
31import threading
32import traceback
33import xml.etree.ElementTree as ElementTree
34
35import boto
36from boto import config
37import boto.auth
38from boto.exception import NoAuthHandlerFound
39from boto.gs.connection import GSConnection
40from boto.provider import Provider
41from boto.pyami.config import BotoConfigLocations
42import httplib2
43from oauth2client.client import HAS_CRYPTO
44from retry_decorator import retry_decorator
45
46import gslib
47from gslib.exception import CommandException
48from gslib.storage_url import StorageUrlFromString
49from gslib.translation_helper import AclTranslation
50from gslib.translation_helper import GenerationFromUrlAndString
51from gslib.translation_helper import S3_ACL_MARKER_GUID
52from gslib.translation_helper import S3_DELETE_MARKER_GUID
53from gslib.translation_helper import S3_MARKER_GUIDS
54
# Detect platform types.
PLATFORM = str(sys.platform).lower()
# Convenience flags derived from sys.platform, used throughout gsutil to
# special-case OS behavior (e.g. ctypes on Windows, statvfs elsewhere).
IS_WINDOWS = 'win32' in PLATFORM
IS_CYGWIN = 'cygwin' in PLATFORM
IS_LINUX = 'linux' in PLATFORM
IS_OSX = 'darwin' in PLATFORM
61
62# pylint: disable=g-import-not-at-top
63if IS_WINDOWS:
64  from ctypes import c_int
65  from ctypes import c_uint64
66  from ctypes import c_char_p
67  from ctypes import c_wchar_p
68  from ctypes import windll
69  from ctypes import POINTER
70  from ctypes import WINFUNCTYPE
71  from ctypes import WinError
72
73# pylint: disable=g-import-not-at-top
# pylint: disable=g-import-not-at-top
try:
  # This module doesn't necessarily exist on Windows (it is POSIX-only), so
  # record its availability rather than failing at import time.
  import resource
  HAS_RESOURCE_MODULE = True
except ImportError:
  # Use the Py2/Py3-compatible except form; the exception object was unused.
  HAS_RESOURCE_MODULE = False
80
# Commonly used binary byte-size constants.
ONE_KIB = 1024
ONE_MIB = 1024 * 1024
TWO_MIB = 2 * ONE_MIB
EIGHT_MIB = 8 * ONE_MIB
TEN_MIB = 10 * ONE_MIB
# Buffer size used when reading local files.
DEFAULT_FILE_BUFFER_SIZE = 8 * ONE_KIB
# Default number of output lines for paged/truncated display.
_DEFAULT_LINES = 25

# By default, the timeout for SSL read errors is infinite. This could
# cause gsutil to hang on network disconnect, so pick a more reasonable
# timeout.
SSL_TIMEOUT = 60

# Start with a progress callback every 64 KiB during uploads/downloads (JSON
# API). Callback implementation should back off until it hits the maximum size
# so that callbacks do not create huge amounts of log output.
START_CALLBACK_PER_BYTES = 1024*64
MAX_CALLBACK_PER_BYTES = 1024*1024*100

# Upload/download files in 8 KiB chunks over the HTTP connection.
TRANSFER_BUFFER_SIZE = 1024*8

# Default number of progress callbacks during transfer (XML API).
XML_PROGRESS_CALLBACKS = 10

# For files >= this size, output a message indicating that we're running an
# operation on the file (like hashing or gzipping) so it does not appear to the
# user that the command is hanging.
MIN_SIZE_COMPUTE_LOGGING = 100*1024*1024  # 100 MiB

# Sentinel meaning "no limit" (largest plain int on Python 2).
NO_MAX = sys.maxint

# Canonical encoding name used when encoding text for output.
UTF8 = 'utf-8'

# Parses version strings like '4.6' or '3.32pre' into major/minor/suffix.
VERSION_MATCHER = re.compile(r'^(?P<maj>\d+)(\.(?P<min>\d+)(?P<suffix>.*))?')

# Where gsutil's public release notes are published.
RELEASE_NOTES_URL = 'https://pub.storage.googleapis.com/gsutil_ReleaseNotes.txt'
118
# Binary exponentiation strings.
_EXP_STRINGS = [
    (0, 'B', 'bit'),
    (10, 'KiB', 'Kibit', 'K'),
    (20, 'MiB', 'Mibit', 'M'),
    (30, 'GiB', 'Gibit', 'G'),
    (40, 'TiB', 'Tibit', 'T'),
    (50, 'PiB', 'Pibit', 'P'),
    (60, 'EiB', 'Eibit', 'E'),
]


global manager  # pylint: disable=global-at-module-level
certs_file_lock = threading.Lock()
configured_certs_files = []


def _GenerateSuffixRegex():
  """Builds a lookup table and regex for human-readable byte counts.

  Returns:
    Tuple of (dict mapping lowercase suffix to _EXP_STRINGS index,
    compiled regex matching strings like '1.5 GiB' or '3k').
  """
  suffix_to_si = {}
  ordered_suffixes = []
  for exp_index, exp_entry in enumerate(_EXP_STRINGS):
    # The first tuple element is the exponent; the rest are suffix spellings.
    for alias in exp_entry[1:]:
      alias = alias.lower()
      suffix_to_si[alias] = exp_index
      ordered_suffixes.append(alias)
  # Keep declaration order in the alternation so match behavior is stable.
  pattern = (r'(?P<num>\d*\.\d+|\d+)\s*(?P<suffix>%s)?'
             % '|'.join(ordered_suffixes))
  return suffix_to_si, re.compile(pattern)

SUFFIX_TO_SI, MATCH_HUMAN_BYTES = _GenerateSuffixRegex()
151
SECONDS_PER_DAY = 3600 * 24

# On Unix-like systems, we will set the maximum number of open files to avoid
# hitting the limit imposed by the OS. This number was obtained experimentally.
MIN_ACCEPTABLE_OPEN_FILES_LIMIT = 1000

# Public location of the gsutil distribution tarball.
GSUTIL_PUB_TARBALL = 'gs://pub/gsutil.tar.gz'

# Decorator alias used by callers to retry transient failures.
Retry = retry_decorator.retry  # pylint: disable=invalid-name

# Cache the values from this check such that they're available to all callers
# without needing to run all the checks again (some of these, such as calling
# multiprocessing.Manager(), are expensive operations).
cached_multiprocessing_is_available = None
cached_multiprocessing_is_available_stack_trace = None
cached_multiprocessing_is_available_message = None
168
169
# Enum class for specifying listing style.
class ListingStyle(object):
  """Enum-like constants selecting how much detail listings include."""
  SHORT = 'SHORT'  # Names only.
  LONG = 'LONG'  # Names plus per-object details.
  LONG_LONG = 'LONG_LONG'  # Full detail (like gsutil ls -L).
175
176
def UsingCrcmodExtension(crcmod):
  """Returns a truthy value if the compiled crcmod extension is in use.

  Also returns truthy if the test_assume_fast_crcmod config override is set.
  """
  assumed_fast = boto.config.get('GSUtil', 'test_assume_fast_crcmod', None)
  if assumed_fast:
    return assumed_fast
  inner_module = getattr(crcmod, 'crcmod', None)
  return inner_module and getattr(inner_module, '_usingExtension', None)
181
182
def CheckFreeSpace(path):
  """Return path/drive free space (in bytes).

  On Windows the free space of the system drive is reported (the path
  argument is not consulted); elsewhere statvfs() is used on the given path.

  Args:
    path: Filesystem path to measure free space for (non-Windows only).

  Returns:
    Number of free bytes available.
  """
  if IS_WINDOWS:
    try:
      # pylint: disable=invalid-name
      # Prefer the wide-character (Unicode) Win32 API.
      get_disk_free_space_ex = WINFUNCTYPE(c_int, c_wchar_p,
                                           POINTER(c_uint64),
                                           POINTER(c_uint64),
                                           POINTER(c_uint64))
      get_disk_free_space_ex = get_disk_free_space_ex(
          ('GetDiskFreeSpaceExW', windll.kernel32), (
              (1, 'lpszPathName'),
              (2, 'lpFreeUserSpace'),
              (2, 'lpTotalSpace'),
              (2, 'lpFreeSpace'),))
    except AttributeError:
      # Fall back to the ANSI variant if the wide API is unavailable.
      get_disk_free_space_ex = WINFUNCTYPE(c_int, c_char_p,
                                           POINTER(c_uint64),
                                           POINTER(c_uint64),
                                           POINTER(c_uint64))
      get_disk_free_space_ex = get_disk_free_space_ex(
          ('GetDiskFreeSpaceExA', windll.kernel32), (
              (1, 'lpszPathName'),
              (2, 'lpFreeUserSpace'),
              (2, 'lpTotalSpace'),
              (2, 'lpFreeSpace'),))

    def GetDiskFreeSpaceExErrCheck(result, unused_func, args):
      # A zero result means the Win32 call failed; surface it as WinError.
      if not result:
        raise WinError()
      # Return the lpFreeUserSpace output parameter's value.
      return args[1].value
    get_disk_free_space_ex.errcheck = GetDiskFreeSpaceExErrCheck

    return get_disk_free_space_ex(os.getenv('SystemDrive'))
  else:
    # Free bytes = fragment size * blocks available to the current user.
    (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path)
    return f_frsize * f_bavail
220
221
def CreateDirIfNeeded(dir_path, mode=0777):
  """Creates a directory, suppressing already-exists errors.

  Args:
    dir_path: Directory path to create (intermediate dirs created as needed).
    mode: Permission bits to apply to newly created directories.
  """
  if not os.path.exists(dir_path):
    try:
      # Unfortunately, even though we catch and ignore EEXIST, this call will
      # output a (needless) error message (no way to avoid that in Python).
      os.makedirs(dir_path, mode)
    # Ignore 'already exists' in case user tried to start up several
    # resumable uploads concurrently from a machine where no tracker dir had
    # yet been created.
    except OSError as e:
      if e.errno != errno.EEXIST:
        raise
235
236
def DivideAndCeil(dividend, divisor):
  """Returns ceil(dividend / divisor) using only integer arithmetic.

  Avoids floating point entirely, so the result stays exact even for
  numbers too large to represent precisely as floats.

  Args:
    dividend: Dividend for the operation.
    divisor: Divisor for the operation.

  Returns:
    The ceiling of the quotient.
  """
  # Negating twice converts Python's floor division into ceiling division.
  return -(-dividend // divisor)
254
255
def GetGsutilStateDir():
  """Returns the location of the directory for gsutil state files.

  Certain operations, such as cross-process credential sharing and
  resumable transfer tracking, need a known location for state files which
  are created by gsutil as-needed.

  This location should only be used for storing data that is required to be in
  a static location.

  Returns:
    Path to directory for gsutil static state files.
  """
  default_dir = os.path.expanduser(os.path.join('~', '.gsutil'))
  state_dir = config.get('GSUtil', 'state_dir', default_dir)
  # Ensure the directory exists before handing the path back to callers.
  CreateDirIfNeeded(state_dir)
  return state_dir
274
275
def GetCredentialStoreFilename():
  """Returns the path of the cross-process credential store file."""
  state_dir = GetGsutilStateDir()
  return os.path.join(state_dir, 'credstore')
278
279
def GetGceCredentialCacheFilename():
  """Returns the path of the GCE credential cache file."""
  state_dir = GetGsutilStateDir()
  return os.path.join(state_dir, 'gcecredcache')
282
283
def GetTabCompletionLogFilename():
  """Returns the path of the tab-completion log file."""
  state_dir = GetGsutilStateDir()
  return os.path.join(state_dir, 'tab-completion-logs')
286
287
288def GetTabCompletionCacheFilename():
289  tab_completion_dir = os.path.join(GetGsutilStateDir(), 'tab-completion')
290  # Limit read permissions on the directory to owner for privacy.
291  CreateDirIfNeeded(tab_completion_dir, mode=0700)
292  return os.path.join(tab_completion_dir, 'cache')
293
294
def PrintTrackerDirDeprecationWarningIfNeeded():
  """Warns on stderr if the deprecated resumable_tracker_dir option is set."""
  # TODO: Remove this along with the tracker_dir config value 1 year after
  # 4.6 release date. Use state_dir instead.
  if not config.has_option('GSUtil', 'resumable_tracker_dir'):
    return
  warning = ('Warning: you have set resumable_tracker_dir in your '
             '.boto configuration file. This configuration option is '
             'deprecated; please use the state_dir configuration '
             'option instead.\n')
  sys.stderr.write(warning)
303
304
# Emit the deprecation warning (if applicable) and make sure the state
# directory exists before computing paths inside it.
PrintTrackerDirDeprecationWarningIfNeeded()
CreateDirIfNeeded(GetGsutilStateDir())
# Name of file where we keep the timestamp for the last time we checked whether
# a new version of gsutil is available.
LAST_CHECKED_FOR_GSUTIL_UPDATE_TIMESTAMP_FILE = (
    os.path.join(GetGsutilStateDir(), '.last_software_update_check'))
311
312
def HasConfiguredCredentials():
  """Determines if boto credential/config file exists.

  Returns:
    True if explicit credentials are present in the boto config; otherwise
    a usable auth handler (truthy) or None.
  """
  def _HasCredOption(option):
    return config.has_option('Credentials', option)

  has_goog_creds = (_HasCredOption('gs_access_key_id') and
                    _HasCredOption('gs_secret_access_key'))
  has_amzn_creds = (_HasCredOption('aws_access_key_id') and
                    _HasCredOption('aws_secret_access_key'))
  has_oauth_creds = _HasCredOption('gs_oauth2_refresh_token')
  has_service_account_creds = (
      HAS_CRYPTO and
      _HasCredOption('gs_service_client_id') and
      _HasCredOption('gs_service_key_file'))

  if (has_goog_creds or has_amzn_creds or has_oauth_creds or
      has_service_account_creds):
    return True

  try:
    handler = boto.auth.get_auth_handler(
        GSConnection.DefaultHost, config, Provider('google'),
        requested_capability=['s3'])
  except NoAuthHandlerFound:
    return None
  # Exclude the no-op auth handler as indicating credentials are configured.
  # Note we can't use isinstance() here because the no-op module may not be
  # imported so we can't get a reference to the class type.
  handler_class = getattr(handler, '__class__', None)
  if getattr(handler_class, '__name__', None) == 'NoOpAuth':
    return None
  return handler
345
346
def ConfigureNoOpAuthIfNeeded():
  """Sets up no-op auth handler if no boto credentials are configured."""
  if HasConfiguredCredentials():
    return
  if (config.has_option('Credentials', 'gs_service_client_id')
      and not HAS_CRYPTO):
    # A service account is configured but its required crypto libraries are
    # missing; the error message is a shared prefix plus remediation advice
    # that depends on whether we're running under the Cloud SDK wrapper.
    message_prefix = (
        'Your gsutil is configured with an OAuth2 service account, '
        'but you do not have PyOpenSSL or PyCrypto 2.6 or later '
        'installed. Service account authentication requires one of '
        'these libraries; please ')
    if os.environ.get('CLOUDSDK_WRAPPER') == '1':
      remediation = (
          'reactivate your service account via the gcloud auth '
          'command and ensure any gcloud packages necessary for '
          'service accounts are present.')
    else:
      remediation = (
          'install either of them to proceed, or configure a '
          'different type of credentials with "gsutil config".')
    raise CommandException(
        '\n'.join(textwrap.wrap(message_prefix + remediation)))
  # With no boto config file the user can still access publicly readable
  # buckets and objects.
  from gslib import no_op_auth_plugin  # pylint: disable=unused-variable
371
372
def GetConfigFilePath():
  """Returns the first readable boto config path, or 'no config found'."""
  for candidate in BotoConfigLocations:
    try:
      fp = open(candidate, 'r')
    except IOError:
      continue
    fp.close()
    return candidate
  return 'no config found'
383
384
def GetBotoConfigFileList():
  """Returns list of boto config files that exist.

  Also honors the AWS_CREDENTIAL_FILE environment variable, if set.

  Returns:
    List of unique existing config file paths.
  """
  # Copy before appending: BotoConfigLocations is a module-level list owned
  # by boto, and mutating it in place would add a duplicate
  # AWS_CREDENTIAL_FILE entry on every call.
  config_paths = list(BotoConfigLocations)
  if 'AWS_CREDENTIAL_FILE' in os.environ:
    config_paths.append(os.environ['AWS_CREDENTIAL_FILE'])
  seen = set()
  cf_list = []
  for config_path in config_paths:
    # Deduplicate while preserving first-seen order.
    if os.path.exists(config_path) and config_path not in seen:
      seen.add(config_path)
      cf_list.append(config_path)
  return cf_list
398
399
def GetCertsFile():
  """Configures and returns the CA Certificates file.

  If one is already configured, use it. Otherwise, amend the configuration
  (in boto.config) to use the cert roots distributed with gsutil.

  Returns:
    string filename of the certs file to use.
  """
  certs_file = boto.config.get('Boto', 'ca_certificates_file', None)
  if not certs_file:
    with certs_file_lock:
      # Another thread may already have extracted/located a certs file.
      if configured_certs_files:
        disk_certs_file = configured_certs_files[0]
      else:
        disk_certs_file = os.path.abspath(
            os.path.join(gslib.GSLIB_DIR, 'data', 'cacerts.txt'))
        if not os.path.exists(disk_certs_file):
          # If the file is not present on disk, this means the gslib module
          # doesn't actually exist on disk anywhere. This can happen if it's
          # being imported from a zip file. Unfortunately, we have to copy the
          # certs file to a local temp file on disk because the underlying SSL
          # socket requires it to be a filesystem path.
          certs_data = pkgutil.get_data('gslib', 'data/cacerts.txt')
          if not certs_data:
            raise CommandException('Certificates file not found. Please '
                                   'reinstall gsutil from scratch')
          fd, fname = tempfile.mkstemp(suffix='.txt', prefix='gsutil-cacerts')
          f = os.fdopen(fd, 'w')
          f.write(certs_data)
          f.close()
          # Remember the temp file so it is reused on later calls and can be
          # deleted at exit (see GetCleanupFiles).
          configured_certs_files.append(fname)
          disk_certs_file = fname
      certs_file = disk_certs_file
  return certs_file
435
436
def GetCleanupFiles():
  """Returns a list of temp files to delete (if possible) when program exits."""
  # Return a copy so callers cannot mutate the module-level list.
  return list(configured_certs_files)
443
444
def ProxyInfoFromEnvironmentVar(proxy_env_var):
  """Reads proxy info from the environment and converts to httplib2.ProxyInfo.

  Args:
    proxy_env_var: Environment variable string to read, such as http_proxy or
       https_proxy.

  Returns:
    httplib2.ProxyInfo constructed from the environment string.
  """
  proxy_url = os.environ.get(proxy_env_var)
  env_name = proxy_env_var.lower()
  if not (proxy_url and env_name.startswith('http')):
    # No usable proxy configured; return an empty HTTP ProxyInfo.
    return httplib2.ProxyInfo(httplib2.socks.PROXY_TYPE_HTTP, None, 0)
  protocol = env_name.split('_')[0]
  if not proxy_url.lower().startswith('http'):
    # proxy_info_from_url requires a protocol, which is always http or https.
    proxy_url = '%s://%s' % (protocol, proxy_url)
  return httplib2.proxy_info_from_url(proxy_url, method=protocol)
463
464
def GetNewHttp(http_class=httplib2.Http, **kwargs):
  """Creates and returns a new httplib2.Http instance.

  Proxy settings come from the boto config file when present, otherwise from
  the standard proxy environment variables.

  Args:
    http_class: Optional custom Http class to use.
    **kwargs: Arguments to pass to http_class constructor.

  Returns:
    An initialized httplib2.Http instance.
  """
  # proxy_type=3 matches httplib2.socks.PROXY_TYPE_HTTP (see
  # ProxyInfoFromEnvironmentVar above).
  proxy_info = httplib2.ProxyInfo(
      proxy_type=3,
      proxy_host=boto.config.get('Boto', 'proxy', None),
      proxy_port=boto.config.getint('Boto', 'proxy_port', 0),
      proxy_user=boto.config.get('Boto', 'proxy_user', None),
      proxy_pass=boto.config.get('Boto', 'proxy_pass', None),
      proxy_rdns=boto.config.get('Boto', 'proxy_rdns', False))

  if not (proxy_info.proxy_host and proxy_info.proxy_port):
    # Fall back to using the environment variable.
    for proxy_env_var in ['http_proxy', 'https_proxy', 'HTTPS_PROXY']:
      if proxy_env_var in os.environ and os.environ[proxy_env_var]:
        proxy_info = ProxyInfoFromEnvironmentVar(proxy_env_var)
        # Assume proxy_rdns is True if a proxy environment variable exists.
        proxy_info.proxy_rdns = boto.config.get('Boto', 'proxy_rdns', True)
        break

  # Some installers don't package a certs file with httplib2, so use the
  # one included with gsutil.
  kwargs['ca_certs'] = GetCertsFile()
  # Use a non-infinite SSL timeout to avoid hangs during network flakiness.
  kwargs['timeout'] = SSL_TIMEOUT
  http = http_class(proxy_info=proxy_info, **kwargs)
  http.disable_ssl_certificate_validation = (not config.getbool(
      'Boto', 'https_validate_certificates'))
  return http
501
502
# Retry for 10 minutes with exponential backoff, which corresponds to
# the maximum Downtime Period specified in the GCS SLA
# (https://cloud.google.com/storage/sla)
def GetNumRetries():
  """Returns the configured number of retries for retryable failures."""
  default_num_retries = 23
  return config.getint('Boto', 'num_retries', default_num_retries)
508
509
def GetMaxRetryDelay():
  """Returns the configured maximum delay (seconds) between retries."""
  default_max_delay = 32
  return config.getint('Boto', 'max_retry_delay', default_max_delay)
512
513
514# Resumable downloads and uploads make one HTTP call per chunk (and must be
515# in multiples of 256KiB). Overridable for testing.
516def GetJsonResumableChunkSize():
517  chunk_size = config.getint('GSUtil', 'json_resumable_chunk_size',
518                             1024*1024*100L)
519  if chunk_size == 0:
520    chunk_size = 1024*256L
521  elif chunk_size % 1024*256L != 0:
522    chunk_size += (1024*256L - (chunk_size % (1024*256L)))
523  return chunk_size
524
525
def _RoundToNearestExponent(num):
  """Finds the largest _EXP_STRINGS entry not exceeding num.

  Returns:
    Tuple of (index into _EXP_STRINGS, num scaled by that entry's
    binary exponent and rounded to two decimal places).
  """
  # Scan from the largest exponent down to the first one that fits.
  index = len(_EXP_STRINGS) - 1
  while index > 0 and num < (2 ** _EXP_STRINGS[index][0]):
    index -= 1
  return index, round(float(num) / 2 ** _EXP_STRINGS[index][0], 2)
531
532
def MakeHumanReadable(num):
  """Generates human readable string for a number of bytes.

  Args:
    num: The number, in bytes.

  Returns:
    A string form of the number using size abbreviations (KiB, MiB, etc.).
  """
  exp_index, rounded_val = _RoundToNearestExponent(num)
  byte_suffix = _EXP_STRINGS[exp_index][1]
  return '%g %s' % (rounded_val, byte_suffix)
544
545
def MakeBitsHumanReadable(num):
  """Generates human readable string for a number of bits.

  Args:
    num: The number, in bits.

  Returns:
    A string form of the number using bit size abbreviations (kbit, Mbit, etc.)
  """
  exp_index, rounded_val = _RoundToNearestExponent(num)
  bit_suffix = _EXP_STRINGS[exp_index][2]
  return '%g %s' % (rounded_val, bit_suffix)
557
558
def HumanReadableToBytes(human_string):
  """Tries to convert a human-readable string to a number of bytes.

  Args:
    human_string: A string supplied by user, e.g. '1M', '3 GiB'.
  Returns:
    An integer containing the number of bytes.
  Raises:
    ValueError: on an invalid string.
  """
  text = human_string.lower()
  match = MATCH_HUMAN_BYTES.match(text)
  if not match:
    raise ValueError('Invalid byte string specified: %s' % text)
  value = float(match.group('num'))
  suffix = match.group('suffix')
  if suffix:
    # Scale by the binary exponent associated with the suffix.
    value *= 2.0 ** _EXP_STRINGS[SUFFIX_TO_SI[suffix]][0]
  return int(round(value))
579
580
def Percentile(values, percent, key=lambda x: x):
  """Find the percentile of a list of values.

  Taken from: http://code.activestate.com/recipes/511478/

  Args:
    values: a list of numeric values. Note that the values MUST BE already
            sorted.
    percent: a float value from 0.0 to 1.0.
    key: optional key function to compute value from each element of the list
         of values.

  Returns:
    The percentile of the values.
  """
  if not values:
    return None
  rank = (len(values) - 1) * percent
  lower = math.floor(rank)
  upper = math.ceil(rank)
  if lower == upper:
    # The rank lands exactly on an element.
    return key(values[int(rank)])
  # Otherwise linearly interpolate between the two surrounding elements.
  weighted_lower = key(values[int(lower)]) * (upper - rank)
  weighted_upper = key(values[int(upper)]) * (rank - lower)
  return weighted_lower + weighted_upper
606
607
def RemoveCRLFFromString(input_str):
  """Returns the input string with all \\n and \\r removed."""
  return input_str.replace('\r', '').replace('\n', '')
611
612
def UnaryDictToXml(message):
  """Generates XML representation of a nested dict.

  This dict contains exactly one top-level entry and an arbitrary number of
  2nd-level entries, e.g. capturing a WebsiteConfiguration message.

  Args:
    message: The dict encoding the message.

  Returns:
    XML string representation of the input dict.

  Raises:
    Exception: if dict contains more than one top-level entry.
  """
  if len(message) != 1:
    raise Exception('Expected dict of size 1, got size %d' % len(message))

  # next(iter(...)) works on both Python 2 and 3; dict.items()[0] is Py2-only.
  name, content = next(iter(message.items()))
  root_element = ElementTree.Element(name)
  # Sort 2nd-level entries so output ordering is deterministic.
  for element_property, value in sorted(content.items()):
    child = ElementTree.SubElement(root_element, element_property)
    child.text = value
  return ElementTree.tostring(root_element)
637
638
def LookUpGsutilVersion(gsutil_api, url_str):
  """Looks up the gsutil version of the specified gsutil tarball URL.

  Version is specified in the metadata field set on that object.

  Args:
    gsutil_api: gsutil Cloud API to use when retrieving gsutil tarball.
    url_str: tarball URL to retrieve (such as 'gs://pub/gsutil.tar.gz').

  Returns:
    Version string if URL is a cloud URL containing x-goog-meta-gsutil-version
    metadata, else None.
  """
  url = StorageUrlFromString(url_str)
  if not url.IsCloudUrl():
    return None
  obj = gsutil_api.GetObjectMetadata(url.bucket_name, url.object_name,
                                     provider=url.scheme,
                                     fields=['metadata'])
  metadata = obj.metadata
  if not (metadata and metadata.additionalProperties):
    return None
  for prop in metadata.additionalProperties:
    if prop.key == 'gsutil_version':
      return prop.value
  return None
661
662
def GetGsutilVersionModifiedTime():
  """Returns unix timestamp of when the VERSION file was last modified."""
  version_file = gslib.VERSION_FILE
  if version_file:
    return int(os.path.getmtime(version_file))
  return 0
668
669
def IsRunningInteractively():
  """Returns True if currently running interactively on a TTY."""
  return all(stream.isatty() for stream in
             (sys.stdout, sys.stderr, sys.stdin))
673
674
def _HttpsValidateCertifcatesEnabled():
  """Returns the Boto https_validate_certificates setting (defaults to True)."""
  # NOTE(review): the function name misspells "Certificates"; kept as-is to
  # avoid breaking any external references.
  return config.get('Boto', 'https_validate_certificates', True)

# Evaluated once at import time; reflects config at process startup.
CERTIFICATE_VALIDATION_ENABLED = _HttpsValidateCertifcatesEnabled()
679
680
def _BotoIsSecure():
  """Returns the Boto is_secure config setting (defaults to True)."""
  return config.get('Boto', 'is_secure', True)

# Evaluated once at import time; reflects config at process startup.
BOTO_IS_SECURE = _BotoIsSecure()
685
686
def ResumableThreshold():
  """Returns the minimum object size (bytes) for using resumable transfers."""
  return config.getint('GSUtil', 'resumable_threshold', EIGHT_MIB)
689
690
def AddAcceptEncoding(headers):
  """Adds accept-encoding:gzip to the dictionary of headers."""
  # setdefault leaves any caller-provided accept-encoding value untouched.
  headers.setdefault('accept-encoding', 'gzip')
696
697
# pylint: disable=too-many-statements
def PrintFullInfoAboutObject(bucket_listing_ref, incl_acl=True):
  """Print full info for given object (like what displays for gsutil ls -L).

  Args:
    bucket_listing_ref: BucketListingRef being listed.
                        Must have ref_type OBJECT and a populated root_object
                        with the desired fields.
    incl_acl: True if ACL info should be output.

  Returns:
    Tuple (number of objects, object_length)

  Raises:
    Exception: if calling bug encountered.
  """
  url_str = bucket_listing_ref.url_string
  storage_url = StorageUrlFromString(url_str)
  obj = bucket_listing_ref.root_object

  # S3 delete markers carry no content; annotate the URL and count nothing.
  if (obj.metadata and S3_DELETE_MARKER_GUID in
      obj.metadata.additionalProperties):
    num_bytes = 0
    num_objs = 0
    url_str += '<DeleteMarker>'
  else:
    num_bytes = obj.size
    num_objs = 1

  print '%s:' % url_str.encode(UTF8)
  # Each attribute is printed only when present on the object.
  if obj.updated:
    print '\tCreation time:\t\t%s' % obj.updated.strftime(
        '%a, %d %b %Y %H:%M:%S GMT')
  if obj.cacheControl:
    print '\tCache-Control:\t\t%s' % obj.cacheControl
  if obj.contentDisposition:
    print '\tContent-Disposition:\t\t%s' % obj.contentDisposition
  if obj.contentEncoding:
    print '\tContent-Encoding:\t\t%s' % obj.contentEncoding
  if obj.contentLanguage:
    print '\tContent-Language:\t%s' % obj.contentLanguage
  print '\tContent-Length:\t\t%s' % obj.size
  print '\tContent-Type:\t\t%s' % obj.contentType
  if obj.componentCount:
    print '\tComponent-Count:\t%d' % obj.componentCount
  marker_props = {}
  if obj.metadata and obj.metadata.additionalProperties:
    # Separate gsutil-internal S3 marker metadata from user-visible metadata.
    non_marker_props = []
    for add_prop in obj.metadata.additionalProperties:
      if add_prop.key not in S3_MARKER_GUIDS:
        non_marker_props.append(add_prop)
      else:
        marker_props[add_prop.key] = add_prop.value
    if non_marker_props:
      print '\tMetadata:'
      for ap in non_marker_props:
        meta_string = '\t\t%s:\t\t%s' % (ap.key, ap.value)
        print meta_string.encode(UTF8)
  if obj.crc32c: print '\tHash (crc32c):\t\t%s' % obj.crc32c
  if obj.md5Hash: print '\tHash (md5):\t\t%s' % obj.md5Hash
  print '\tETag:\t\t\t%s' % obj.etag.strip('"\'')
  if obj.generation:
    generation_str = GenerationFromUrlAndString(storage_url, obj.generation)
    print '\tGeneration:\t\t%s' % generation_str
  if obj.metageneration:
    print '\tMetageneration:\t\t%s' % obj.metageneration
  if incl_acl:
    # JSON API won't return acls as part of the response unless we have
    # full control scope
    if obj.acl:
      print '\tACL:\t\t%s' % AclTranslation.JsonFromMessage(obj.acl)
    elif S3_ACL_MARKER_GUID in marker_props:
      # ACL preserved from S3 in marker metadata.
      print '\tACL:\t\t%s' % marker_props[S3_ACL_MARKER_GUID]
    else:
      print ('\tACL:\t\t\tACCESS DENIED. Note: you need OWNER '
             'permission\n\t\t\t\ton the object to read its ACL.')

  return (num_objs, num_bytes)
776
777
def CompareVersions(first, second):
  """Compares the first and second gsutil version strings.

  For example, 3.33 > 3.7, and 4.1 is a greater major version than 3.33.
  Does not handle multiple periods (e.g. 3.3.4) or complicated suffixes
  (e.g., 3.3RC4 vs. 3.3RC5). A version string with a suffix is treated as
  less than its non-suffix counterpart (e.g. 3.32 > 3.32pre).

  Args:
    first: First gsutil version string.
    second: Second gsutil version string.

  Returns:
    (g, m):
       g is True if first known to be greater than second, else False.
       m is True if first known to be greater by at least 1 major version,
         else False.
  """
  parsed_first = VERSION_MATCHER.match(str(first))
  parsed_second = VERSION_MATCHER.match(str(second))
  # If passed strings we don't know how to handle, be conservative.
  if not parsed_first or not parsed_second:
    return (False, False)

  major_first = int(parsed_first.group('maj'))
  major_second = int(parsed_second.group('maj'))
  if major_first != major_second:
    is_greater = major_first > major_second
    return (is_greater, is_greater)

  minor_first = int(parsed_first.group('min') or 0)
  minor_second = int(parsed_second.group('min') or 0)
  if minor_first != minor_second:
    return (minor_first > minor_second, False)

  # Same major.minor: a suffix marks a pre-release, so the suffix-free
  # version is considered greater.
  suffix_first = parsed_first.group('suffix')
  suffix_second = parsed_second.group('suffix')
  return (bool(suffix_second) and not suffix_first, False)
818
819
def _IncreaseSoftLimitForResource(resource_name, fallback_value):
  """Attempts to raise the soft limit for the given resource.

  The soft limit is used for this process (and its children), but the
  hard limit is set by the system and cannot be exceeded. We first try to
  set the soft limit to the hard limit's value; failing that, we try the
  fallback_value, but only when that would be an increase.

  Args:
    resource_name: Name of the resource to increase the soft limit for.
    fallback_value: Fallback value to be used if we couldn't set the
                    soft value to the hard value (e.g., if the hard value
                    is "unlimited").

  Returns:
    Current soft limit for the resource (after any changes we were able to
    make), or -1 if the resource doesn't exist.
  """
  try:
    soft_limit, hard_limit = resource.getrlimit(resource_name)
  except (resource.error, ValueError):
    # Unknown resource on this platform; nothing we can do.
    return -1

  # Best case: raise the soft limit all the way to the hard limit.
  # (Some OS's report 0 for "unlimited", in which case this is skipped.)
  if hard_limit > soft_limit:
    try:
      resource.setrlimit(resource_name, (hard_limit, hard_limit))
      return hard_limit
    except (resource.error, ValueError):
      # Fall through and try the fallback value instead.
      pass

  if soft_limit >= fallback_value:
    # The fallback wouldn't be an increase; keep what we have.
    return soft_limit
  try:
    resource.setrlimit(resource_name, (fallback_value, hard_limit))
    return fallback_value
  except (resource.error, ValueError):
    # Couldn't change the soft limit; report its current value.
    return soft_limit
868
869
def GetCloudApiInstance(cls, thread_state=None):
  """Returns a gsutil Cloud API instance for the calling thread.

  Cloud API implementations are not guaranteed to be thread-safe, so each
  thread must use its own instance; command's thread-pool logic hands one
  to each thread via thread_state.

  Args:
    cls: Command class to be used for single-threaded case.
    thread_state: Per thread state from this thread containing a gsutil
                  Cloud API instance.

  Returns:
    gsutil Cloud API instance.
  """
  if thread_state:
    return thread_state
  return cls.gsutil_api
886
887
def GetFileSize(fp, position_to_eof=False):
  """Returns the size of the file underlying fp.

  Args:
    fp: File-like object supporting tell() and seek().
    position_to_eof: If True, leave fp positioned at end-of-file; otherwise
                     restore fp's original position before returning.

  Returns:
    Size of the file, in bytes.
  """
  # Remember where the caller was, unless they asked to stay at EOF.
  saved_position = None if position_to_eof else fp.tell()
  fp.seek(0, os.SEEK_END)
  size = fp.tell()
  if saved_position is not None:
    fp.seek(saved_position)
  return size
897
898
def GetStreamFromFileUrl(storage_url):
  """Returns a readable stream for the given file URL.

  Stream URLs map to stdin; regular files are opened for binary reading.
  """
  if not storage_url.IsStream():
    return open(storage_url.object_name, 'rb')
  return sys.stdin
904
905
def UrlsAreForSingleProvider(url_args):
  """Tests whether the URLs are all for a single provider.

  Args:
    url_args: Strings to check.

  Returns:
    True if URLs are for single provider, False otherwise.
  """
  seen_provider = None
  for url_str in url_args:
    scheme = StorageUrlFromString(url_str).scheme
    if not seen_provider:
      # First URL establishes the provider the rest must match.
      seen_provider = scheme
    elif scheme != seen_provider:
      return False
  # An empty argument list has no provider at all.
  return seen_provider is not None
924
925
def HaveFileUrls(args_to_check):
  """Checks whether args_to_check contain any file URLs.

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any file URLs.
  """
  # any() short-circuits on the first file URL, just like an explicit loop.
  return any(
      StorageUrlFromString(url_str).IsFileUrl() for url_str in args_to_check)
940
941
def HaveProviderUrls(args_to_check):
  """Checks whether args_to_check contains any provider URLs (like 'gs://').

  Args:
    args_to_check: Command-line argument subset to check.

  Returns:
    True if args_to_check contains any provider URLs.
  """
  # Short-circuits on the first provider URL found.
  return any(
      url.IsCloudUrl() and url.IsProvider()
      for url in (StorageUrlFromString(url_str) for url_str in args_to_check))
956
# This must be defined at the module level for pickling across processes.
# Fields: is_available (bool) — whether the multiprocessing module can be
# used in this environment; stack_trace (str or None) — traceback of the
# failed probe when is_available is False.
MultiprocessingIsAvailableResult = collections.namedtuple(
    'MultiprocessingIsAvailableResult', ['is_available', 'stack_trace'])
960
961
def CheckMultiprocessingAvailableAndInit(logger=None):
  """Checks if multiprocessing is available.

  There are some environments in which there is no way to use multiprocessing
  logic that's built into Python (e.g., if /dev/shm is not available, then
  we can't create semaphores). This simply tries out a few things that will be
  needed to make sure the environment can support the pieces of the
  multiprocessing module that we need.

  If multiprocessing is available, this performs necessary initialization for
  multiprocessing.  See gslib.command.InitializeMultiprocessingVariables for
  an explanation of why this is necessary.

  Args:
    logger: logging.logger to use for debug output.

  Returns:
    (multiprocessing_is_available, stack_trace):
      multiprocessing_is_available: True iff the multiprocessing module is
                                    available for use.
      stack_trace: The stack trace generated by the call we tried that failed.
  """
  # pylint: disable=global-variable-undefined
  global cached_multiprocessing_is_available
  global cached_multiprocessing_check_stack_trace
  global cached_multiprocessing_is_available_message
  # The probe below is expensive (it spawns a Manager process), so its result
  # is cached in module-level globals and replayed on subsequent calls.
  if cached_multiprocessing_is_available is not None:
    if logger:
      logger.debug(cached_multiprocessing_check_stack_trace)
      logger.warn(cached_multiprocessing_is_available_message)
    return MultiprocessingIsAvailableResult(
        is_available=cached_multiprocessing_is_available,
        stack_trace=cached_multiprocessing_check_stack_trace)

  if IS_WINDOWS:
    message = """
Multiple processes are not supported on Windows. Operations requesting
parallelism will be executed with multiple threads in a single process only.
"""
    if logger:
      logger.warn(message)
    # NOTE(review): this early return bypasses the cache assignments at the
    # bottom of the function, so on Windows the warning is re-emitted on every
    # call — confirm that is intended.
    return MultiprocessingIsAvailableResult(is_available=False,
                                            stack_trace=None)

  stack_trace = None
  multiprocessing_is_available = True
  message = """
You have requested multiple processes for an operation, but the
required functionality of Python\'s multiprocessing module is not available.
Operations requesting parallelism will be executed with multiple threads in a
single process only.
"""
  try:
    # Fails if /dev/shm (or some equivalent thereof) is not available for use
    # (e.g., there's no implementation, or we can't write to it, etc.).
    try:
      multiprocessing.Value('i', 0)
    except:
      message += """
Please ensure that you have write access to both /dev/shm and /run/shm.
"""
      raise  # We'll handle this in one place below.

    # Manager objects and Windows are generally a pain to work with, so try it
    # out as a sanity check. This definitely works on some versions of Windows,
    # but it's certainly possible that there is some unknown configuration for
    # which it won't.
    global manager  # pylint: disable=global-variable-undefined

    manager = multiprocessing.Manager()

    # Check that the max number of open files is reasonable. Always check this
    # after we're sure that the basic multiprocessing functionality is
    # available, since this won't matter unless that's true.
    limit = -1
    if HAS_RESOURCE_MODULE:
      # Try to set this with both resource names - RLIMIT_NOFILE for most Unix
      # platforms, and RLIMIT_OFILE for BSD. Ignore AttributeError because the
      # "resource" module is not guaranteed to know about these names.
      try:
        limit = max(limit,
                    _IncreaseSoftLimitForResource(
                        resource.RLIMIT_NOFILE,
                        MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
      except AttributeError:
        pass
      try:
        limit = max(limit,
                    _IncreaseSoftLimitForResource(
                        resource.RLIMIT_OFILE, MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
      except AttributeError:
        pass

    # NOTE(review): this check also runs when HAS_RESOURCE_MODULE is False, in
    # which case limit is still -1 and multiprocessing ends up disabled —
    # confirm that is the intended behavior on platforms without "resource".
    if limit < MIN_ACCEPTABLE_OPEN_FILES_LIMIT:
      message += ("""
Your max number of open files, %s, is too low to allow safe multiprocessing.
On Linux you can fix this by adding something like "ulimit -n 10000" to your
~/.bashrc or equivalent file and opening a new terminal.

On MacOS, you may also need to run a command like this once (in addition to the
above instructions), which might require a restart of your system to take
effect:
  launchctl limit maxfiles 10000

Alternatively, edit /etc/launchd.conf with something like:
  limit maxfiles 10000 10000

""" % limit)
      raise Exception('Max number of open files, %s, is too low.' % limit)
  except:  # pylint: disable=bare-except
    # Any failure above disables multiprocessing; record the traceback so it
    # can be surfaced via the logger and the returned result.
    stack_trace = traceback.format_exc()
    multiprocessing_is_available = False
    if logger is not None:
      logger.debug(stack_trace)
      logger.warn(message)

  # Set the cached values so that we never need to do this check again.
  cached_multiprocessing_is_available = multiprocessing_is_available
  cached_multiprocessing_check_stack_trace = stack_trace
  cached_multiprocessing_is_available_message = message
  return MultiprocessingIsAvailableResult(
      is_available=cached_multiprocessing_is_available,
      stack_trace=cached_multiprocessing_check_stack_trace)
1085
1086
def CreateLock():
  """Returns either a multiprocessing lock or a threading lock.

  A multiprocessing (Manager) lock is handed out iff the environment
  supports the pieces of the multiprocessing module needed for parallel
  operations; otherwise a plain threading lock suffices.

  Returns:
    Multiprocessing or threading lock.
  """
  if not CheckMultiprocessingAvailableAndInit().is_available:
    return threading.Lock()
  return manager.Lock()
1100
1101
def IsCloudSubdirPlaceholder(url, blr=None):
  """Determines if URL is a cloud subdir placeholder.

  GUI tools (like the GCS cloud console) let users create empty "folders" by
  writing a placeholder object, and parts of gsutil must treat such objects
  specially (e.g., gsutil rsync skips downloading them because they can cause
  conflicts; see comments in the rsync command for details).

  Two placeholder forms are recognized:
    - Cloud objects whose name ends with '_$folder$'
    - Cloud objects whose name ends with '/'

  Args:
    url: The URL to be checked.
    blr: BucketListingRef to check, or None if not available.
         If None, size won't be checked.

  Returns:
    True/False.
  """
  if not url.IsCloudUrl():
    return False
  name = url.url_string
  if name.endswith('_$folder$'):
    return True
  # Without a listing ref we can't know the size, so treat it as zero.
  size = blr.root_object.size if blr and blr.IsObject() else 0
  return name.endswith('/') and size == 0
1133
1134
def GetTermLines():
  """Returns the number of lines in the controlling terminal."""
  try:
    # fcntl/termios don't exist on Windows; fall back to the default there.
    import fcntl    # pylint: disable=g-import-not-at-top
    import termios  # pylint: disable=g-import-not-at-top
  except ImportError:
    return _DEFAULT_LINES

  def _RowsForFd(fd):  # pylint: disable=invalid-name
    """Returns the row count reported by TIOCGWINSZ for fd, or 0 on failure."""
    try:
      winsize = fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234')
      return struct.unpack('hh', winsize)[0]
    except:  # pylint: disable=bare-except
      return 0  # Caller will try the next file descriptor.

  # Probe stdin, stdout, and stderr (in that order) for a usable row count.
  lines = _RowsForFd(0) or _RowsForFd(1) or _RowsForFd(2)
  if not lines:
    # None of the standard descriptors worked; try the controlling tty.
    try:
      tty_fd = os.open(os.ctermid(), os.O_RDONLY)
      lines = _RowsForFd(tty_fd)
      os.close(tty_fd)
    except:  # pylint: disable=bare-except
      pass
  # Last resort: the LINES environment variable, then the module default.
  return int(lines or os.environ.get('LINES', _DEFAULT_LINES))
1162
1163
class GsutilStreamHandler(logging.StreamHandler):
  """A StreamHandler variant that tolerates flushing a closed stream."""

  def flush(self):
    """Flushes the stream, ignoring errors from an already-closed stream.

    This override works around a Python 2.6 bug: the logging module flushes
    all stream handlers at exit, and if a handler's stream is already closed
    that flush raises an exception. Our unit tests temporarily redirect
    stderr, which makes the default StreamHandler point at a temporary file
    that is closed by process shutdown. Python 2.7 fixed this upstream, but
    the workaround is kept to avoid the flake on 2.6.
    """
    try:
      super(GsutilStreamHandler, self).flush()
    except ValueError:
      pass
1180
1181
def StdinIterator():
  """Generator yielding stdin lines with trailing whitespace removed."""
  for raw_line in sys.stdin:
    # rstrip() drops the newline regardless of LF vs. CRLF endings.
    yield raw_line.rstrip()
1187