1# -*- coding: utf-8 -*-
2# Copyright 2010 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Wildcard iterator class and supporting functions."""
16
17from __future__ import absolute_import
18
19import fnmatch
20import glob
21import os
22import re
23import sys
24import textwrap
25
26from gslib.bucket_listing_ref import BucketListingBucket
27from gslib.bucket_listing_ref import BucketListingObject
28from gslib.bucket_listing_ref import BucketListingPrefix
29from gslib.cloud_api import AccessDeniedException
30from gslib.cloud_api import CloudApi
31from gslib.cloud_api import NotFoundException
32from gslib.exception import CommandException
33from gslib.storage_url import ContainsWildcard
34from gslib.storage_url import StorageUrlFromString
35from gslib.storage_url import StripOneSlash
36from gslib.storage_url import WILDCARD_REGEX
37from gslib.translation_helper import GenerationFromUrlAndString
38from gslib.util import UTF8
39
40
# Matches the first '**' in a wildcard string, capturing the text before it
# (including any trailing separator) and everything after it.  Used to detect
# and split recursive file wildcards in FileWildcardIterator.
FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
42
43
class WildcardIterator(object):
  """Abstract base for iterating over strings containing wildcards.

  This class should not be instantiated directly; use the
  CreateWildcardIterator factory function, which picks the concrete
  implementation (cloud vs. file) appropriate for the URL string.
  """

  # TODO: Standardize on __str__ and __repr__ here and elsewhere.  Define both
  # and make one return the other.
  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    return 'WildcardIterator(' + self.wildcard_url.url_string + ')'
57
58
class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets, bucket subdirs and objects.

  Iterates over BucketListingRef matching the Url string wildcard. It's
  much more efficient to first get metadata that's available in the Bucket
  (for example to get the name and size of each object), because that
  information is available in the object list results.
  """

  def __init__(self, wildcard_url, gsutil_api, all_versions=False,
               debug=0, project_id=None):
    """Instantiates an iterator that matches the wildcard URL.

    Args:
      wildcard_url: CloudUrl that contains the wildcard to iterate.
      gsutil_api: Cloud storage interface.  Passed in for thread safety, also
                  settable for testing/mocking.
      all_versions: If true, the iterator yields all versions of objects
                    matching the wildcard.  If false, yields just the live
                    object version.
      debug: Debug level to control debug output for iterator.
      project_id: Project ID to use for bucket listings.
    """
    self.wildcard_url = wildcard_url
    self.all_versions = all_versions
    self.debug = debug
    self.gsutil_api = gsutil_api
    self.project_id = project_id

  def __iter__(self, bucket_listing_fields=None,
               expand_top_level_buckets=False):
    """Iterator that gets called when iterating over the cloud wildcard.

    In the case where no wildcard is present, returns a single matching object,
    single matching prefix, or one of each if both exist.

    Args:
      bucket_listing_fields: Iterable fields to include in bucket listings.
                             Ex. ['name', 'acl'].  Iterator is
                             responsible for converting these to list-style
                             format ['items/name', 'items/acl'] as well as
                             adding any fields necessary for listing such as
                             prefixes.  API implementation is responsible for
                             adding pagination fields.  If this is None,
                             all fields are returned.
      expand_top_level_buckets: If true, yield no BUCKET references.  Instead,
                                expand buckets into top-level objects and
                                prefixes.

    Yields:
      BucketListingRef of type BUCKET, OBJECT or PREFIX.
    """
    single_version_request = self.wildcard_url.HasGeneration()

    # For wildcard expansion purposes, we need at a minimum the name of
    # each object and prefix.  If we're not using the default of requesting
    # all fields, make sure at least these are requested.  The Cloud API
    # tolerates specifying the same field twice.
    get_fields = None
    if bucket_listing_fields:
      get_fields = set()
      for field in bucket_listing_fields:
        get_fields.add(field)
      bucket_listing_fields = self._GetToListFields(
          get_fields=bucket_listing_fields)
      bucket_listing_fields.update(['items/name', 'prefixes'])
      get_fields.update(['name'])
      # If we're making versioned requests, ensure generation and
      # metageneration are also included.
      if single_version_request or self.all_versions:
        bucket_listing_fields.update(['items/generation',
                                      'items/metageneration'])
        get_fields.update(['generation', 'metageneration'])

    # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
    # iterate over the expanded bucket strings and handle any object
    # wildcarding.
    for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
      bucket_url_string = bucket_listing_ref.url_string
      if self.wildcard_url.IsBucket():
        # IsBucket() guarantees there are no prefix or object wildcards, and
        # thus this is a top-level listing of buckets.
        if expand_top_level_buckets:
          url = StorageUrlFromString(bucket_url_string)
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name, delimiter='/', all_versions=self.all_versions,
              provider=self.wildcard_url.scheme,
              fields=bucket_listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
                                       with_version=self.all_versions)
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
              yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
        else:
          yield bucket_listing_ref
      else:
        # By default, assume a non-wildcarded URL is an object, not a prefix.
        # This prevents unnecessary listings (which are slower, more expensive,
        # and also subject to eventual consistency).
        if (not ContainsWildcard(self.wildcard_url.url_string) and
            self.wildcard_url.IsObject() and not self.all_versions):
          try:
            get_object = self.gsutil_api.GetObjectMetadata(
                self.wildcard_url.bucket_name,
                self.wildcard_url.object_name,
                generation=self.wildcard_url.generation,
                provider=self.wildcard_url.scheme,
                fields=get_fields)
            yield self._GetObjectRef(
                self.wildcard_url.bucket_url_string, get_object,
                with_version=(self.all_versions or single_version_request))
            return
          except (NotFoundException, AccessDeniedException):
            # It's possible this is a prefix - try to list instead.
            pass

        # Expand iteratively by building prefix/delimiter bucket listing
        # request, filtering the results per the current level's wildcard
        # (if present), and continuing with the next component of the
        # wildcard. See _BuildBucketFilterStrings() documentation for details.
        if single_version_request:
          url_string = '%s%s#%s' % (bucket_url_string,
                                    self.wildcard_url.object_name,
                                    self.wildcard_url.generation)
        else:
          # Rstrip any prefixes to correspond with rstripped prefix wildcard
          # from _BuildBucketFilterStrings().
          url_string = '%s%s' % (bucket_url_string,
                                 StripOneSlash(self.wildcard_url.object_name)
                                 or '/')  # Cover root object named '/' case.
        urls_needing_expansion = [url_string]
        while urls_needing_expansion:
          url = StorageUrlFromString(urls_needing_expansion.pop(0))
          (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
              self._BuildBucketFilterStrings(url.object_name))
          prog = re.compile(fnmatch.translate(prefix_wildcard))

          # List bucket for objects matching prefix up to delimiter.
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name, prefix=prefix, delimiter=delimiter,
              all_versions=self.all_versions or single_version_request,
              provider=self.wildcard_url.scheme,
              fields=bucket_listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              gcs_object = obj_or_prefix.data
              if prog.match(gcs_object.name):
                if not suffix_wildcard or (
                    StripOneSlash(gcs_object.name) == suffix_wildcard):
                  if not single_version_request or (
                      self._SingleVersionMatches(gcs_object.generation)):
                    yield self._GetObjectRef(
                        bucket_url_string, gcs_object, with_version=(
                            self.all_versions or single_version_request))
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX
              # Use a name distinct from the listing-request 'prefix' computed
              # by _BuildBucketFilterStrings above so we don't clobber it.
              listed_prefix = obj_or_prefix.data
              # If the prefix ends with a slash, remove it.  Note that we only
              # remove one slash so that we can successfully enumerate dirs
              # containing multiple slashes.
              rstripped_prefix = StripOneSlash(listed_prefix)
              if prog.match(rstripped_prefix):
                if suffix_wildcard and rstripped_prefix != suffix_wildcard:
                  # There's more wildcard left to expand.
                  url_append_string = '%s%s' % (
                      bucket_url_string, rstripped_prefix + '/' +
                      suffix_wildcard)
                  urls_needing_expansion.append(url_append_string)
                else:
                  # No wildcard to expand, just yield the prefix
                  yield self._GetPrefixRef(bucket_url_string, listed_prefix)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings needed for querying a bucket and filtering results.

    This implements wildcard object name matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request.
        delimiter is the delimiter to be sent in bucket GET request.
        prefix_wildcard is the wildcard to be used to filter bucket GET results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
          results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

    Raises:
      AssertionError if wildcard doesn't contain any wildcard chars.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard
      # contains '**' don't send a delimiter, and combine
      # suffix_wildcard at end of prefix_wildcard.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard += suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def _SingleVersionMatches(self, listed_generation):
    """Returns True if listed_generation matches the iterated URL's generation.

    Args:
      listed_generation: Object generation returned in a bucket listing.

    Returns:
      True if the (decoded) listed generation equals the generation in
      self.wildcard_url, comparing as strings.
    """
    decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
                                                    listed_generation)
    return str(self.wildcard_url.generation) == str(decoded_generation)

  def _ExpandBucketWildcards(self, bucket_fields=None):
    """Expands bucket and provider wildcards.

    Builds a list of bucket url strings that can be iterated on.

    Args:
      bucket_fields: If present, populate only these metadata fields for
                     buckets.  Example value: ['acl', 'defaultObjectAcl']

    Yields:
      BucketListingReferences of type BUCKET.
    """
    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
    if (bucket_fields and set(bucket_fields) == set(['id']) and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we just want the name of a non-wildcarded bucket URL,
      # don't make an RPC.
      yield BucketListingBucket(bucket_url)
    elif (self.wildcard_url.IsBucket() and
          not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we have a non-wildcarded bucket URL, get just that bucket.
      yield BucketListingBucket(
          bucket_url, root_object=self.gsutil_api.GetBucket(
              self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme,
              fields=bucket_fields))
    else:
      regex = fnmatch.translate(self.wildcard_url.bucket_name)
      prog = re.compile(regex)

      fields = self._GetToListFields(bucket_fields)
      if fields:
        fields.add('items/id')
      for bucket in self.gsutil_api.ListBuckets(
          fields=fields, project_id=self.project_id,
          provider=self.wildcard_url.scheme):
        if prog.match(bucket.id):
          url = StorageUrlFromString(
              '%s://%s/' % (self.wildcard_url.scheme, bucket.id))
          yield BucketListingBucket(url, root_object=bucket)

  def _GetToListFields(self, get_fields=None):
    """Prepends 'items/' to the input fields and converts it to a set.

    This way field sets requested for GetBucket can be used in ListBucket calls.
    Note that the input set must contain only bucket or object fields; listing
    fields such as prefixes or nextPageToken should be added after calling
    this function.

    Args:
      get_fields: Iterable fields usable in GetBucket/GetObject calls.

    Returns:
      Set of fields usable in ListBuckets/ListObjects calls, or None if
      get_fields is empty/None (meaning "request all fields").
    """
    if get_fields:
      list_fields = set()
      for field in get_fields:
        list_fields.add('items/' + field)
      return list_fields
    # Explicitly preserve the "no fields specified" contract.
    return None

  def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
    """Creates a BucketListingRef of type OBJECT from the arguments.

    Args:
      bucket_url_string: Wildcardless string describing the containing bucket.
      gcs_object: gsutil_api root Object for populating the BucketListingRef.
      with_version: If true, return a reference with a versioned string.

    Returns:
      BucketListingRef of type OBJECT.
    """
    # Generation can be None in test mocks, so just return the
    # live object for simplicity.
    if with_version and gcs_object.generation is not None:
      generation_str = GenerationFromUrlAndString(self.wildcard_url,
                                                  gcs_object.generation)
      object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
                                   generation_str)
    else:
      object_string = '%s%s' % (bucket_url_string, gcs_object.name)
    object_url = StorageUrlFromString(object_string)
    return BucketListingObject(object_url, root_object=gcs_object)

  def _GetPrefixRef(self, bucket_url_string, prefix):
    """Creates a BucketListingRef of type PREFIX from the arguments.

    Args:
      bucket_url_string: Wildcardless string describing the containing bucket.
      prefix: gsutil_api Prefix for populating the BucketListingRef

    Returns:
      BucketListingRef of type PREFIX.
    """
    prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
    return BucketListingPrefix(prefix_url, root_object=prefix)

  def IterBuckets(self, bucket_fields=None):
    """Iterates over the wildcard, returning refs for each expanded bucket.

    This ignores the object part of the URL entirely and expands only the
    bucket portion.  It will yield BucketListingRefs of type BUCKET only.

    Args:
      bucket_fields: Iterable fields to include in bucket listings.
                     Ex. ['defaultObjectAcl', 'logging'].  This function is
                     responsible for converting these to listing-style
                     format ['items/defaultObjectAcl', 'items/logging'], as
                     well as adding any fields necessary for listing such as
                     'items/id'.  API implementation is responsible for
                     adding pagination fields.  If this is None, all fields are
                     returned.

    Yields:
      BucketListingRef of type BUCKET, or empty iterator if no matches.
    """
    for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
      yield blr

  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
    """Iterates over the wildcard, yielding bucket, prefix or object refs.

    Args:
      bucket_listing_fields: If present, populate only these metadata
                             fields for listed objects.
      expand_top_level_buckets: If true and the wildcard expands only to
                                Bucket(s), yields the expansion of each bucket
                                into a top-level listing of prefixes and objects
                                in that bucket instead of a BucketListingRef
                                to that bucket.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    for blr in self.__iter__(
        bucket_listing_fields=bucket_listing_fields,
        expand_top_level_buckets=expand_top_level_buckets):
      yield blr

  def IterObjects(self, bucket_listing_fields=None):
    """Iterates over the wildcard, yielding only object BucketListingRefs.

    Args:
      bucket_listing_fields: If present, populate only these metadata
                             fields for listed objects.

    Yields:
      BucketListingRefs of type OBJECT or empty iterator if no matches.
    """
    for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
                             expand_top_level_buckets=True):
      if blr.IsObject():
        yield blr
467
class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_url, debug=0):
    """Instantiates an iterator over BucketListingRefs matching wildcard URL.

    Args:
      wildcard_url: FileUrl that contains the wildcard to iterate.
      debug: Debug level (range 0..3).
    """
    self.wildcard_url = wildcard_url
    self.debug = debug

  def __iter__(self):
    """Iterator that gets called when iterating over the file wildcard.

    In the case where no wildcard is present, returns a single matching file
    or directory.

    Raises:
      WildcardException: if invalid wildcard found.

    Yields:
      BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
    """
    wildcard = self.wildcard_url.object_name
    match = FLAT_LIST_REGEX.match(wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      # The [:-1] strips the trailing separator from the pre-'**' portion.
      base_dir = match.group('before')[:-1]
      remaining_wildcard = match.group('after')
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = self._IterDir(base_dir, remaining_wildcard)
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.iglob(wildcard)
    for filepath in filepaths:
      expanded_url = StorageUrlFromString(filepath)
      if os.path.isdir(filepath):
        yield BucketListingPrefix(expanded_url)
      else:
        yield BucketListingObject(expanded_url)

  def _IterDir(self, directory, wildcard):
    """An iterator over the specified dir and wildcard."""
    # UTF8-encode directory before passing it to os.walk() so if there are
    # non-valid UTF8 chars in the file name (e.g., that can happen if the file
    # originated on Windows) os.walk() will not attempt to decode and then die
    # with a "codec can't decode byte" error, and instead we can catch the error
    # at yield time and print a more informative error message.
    #
    # NOTE: only file names are matched against the wildcard here, so this
    # recursive ('**') path yields files only, never directories.
    for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)):
      for f in fnmatch.filter(filenames, wildcard):
        try:
          yield os.path.join(dirpath, f).decode(UTF8)
        except UnicodeDecodeError:
          # Note: We considered several ways to deal with this, but each had
          # problems:
          # 1. Raise an exception and try to catch in a higher layer (the
          #    gsutil cp command), so we can properly support the gsutil cp -c
          #    option. That doesn't work because raising an exception during
          #    iteration terminates the generator.
          # 2. Accumulate a list of bad filenames and skip processing each
          #    during iteration, then raise at the end, with exception text
          #    printing the bad paths. That doesn't work because iteration is
          #    wrapped in PluralityCheckableIterator, so it's possible there
          #    are not-yet-performed copy operations at the time we reach the
          #    end of the iteration and raise the exception - which would cause
          #    us to skip copying validly named files. Moreover, the gsutil
          #    cp command loops over argv, so if you run the command gsutil cp
          #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
          #    would cause dir2 never to be visited.
          # 3. Print the invalid pathname and skip it during iteration. That
          #    would work but would mean gsutil cp could exit with status 0
          #    even though some files weren't copied.
          # 4. Change the WildcardIterator to include an error status along with
          #    the result. That would solve the problem but would be a
          #    substantial change (WildcardIterator is used in many parts of
          #    gsutil), and we didn't feel that magnitude of change was
          #    warranted by this relatively uncommon corner case.
          # Instead we chose to abort when one such file is encountered, and
          # require the user to remove or rename the files and try again.
          raise CommandException('\n'.join(textwrap.wrap(
              'Invalid Unicode path encountered (%s). gsutil cannot proceed '
              'with such files present. Please remove or rename this file and '
              'try again. NOTE: the path printed above replaces the '
              'problematic characters with a hex-encoded printable '
              'representation. For more details (including how to convert to a '
              'gsutil-compatible encoding) see `gsutil help encoding`.' %
              repr(os.path.join(dirpath, f)))))

  # pylint: disable=unused-argument
  def IterObjects(self, bucket_listing_fields=None):
    """Iterates over the wildcard, yielding only object (file) refs.

    Args:
      bucket_listing_fields: Ignored as filesystems don't have buckets.

    Yields:
      BucketListingRefs of type OBJECT or empty iterator if no matches.
    """
    for bucket_listing_ref in self.IterAll():
      if bucket_listing_ref.IsObject():
        yield bucket_listing_ref

  # pylint: disable=unused-argument
  def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
    """Iterates over the wildcard, yielding BucketListingRefs.

    Args:
      bucket_listing_fields: Ignored; filesystems don't have buckets.
      expand_top_level_buckets: Ignored; filesystems don't have buckets.

    Yields:
      BucketListingRefs of type OBJECT (file) or PREFIX (directory),
      or empty iterator if no matches.
    """
    for bucket_listing_ref in self.__iter__():
      yield bucket_listing_ref

  def IterBuckets(self, unused_bucket_fields=None):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Args:
      unused_bucket_fields: Ignored; filesystems don't have buckets.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Buckets not possible for file wildcards')
617
618
class WildcardException(StandardError):
  """Exception raised for invalid wildcard URLs."""

  def __init__(self, reason):
    # Keep the base-class message empty; the failure detail lives in
    # self.reason, which __str__/__repr__ render.
    StandardError.__init__(self)
    self.reason = reason

  def __str__(self):
    return 'WildcardException: %s' % self.reason

  # repr and str intentionally produce the same text.
  __repr__ = __str__
631
632
def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0,
                           project_id=None):
  """Factory that builds the WildcardIterator appropriate for url_str.

  Args:
    url_str: URL string naming wildcard object(s) to iterate.
    gsutil_api: Cloud storage interface.  Passed in for thread safety, also
                settable for testing/mocking.
    all_versions: If true, the iterator yields all versions of objects
                  matching the wildcard.  If false, yields just the live
                  object version.
    debug: Debug level to control debug output for iterator.
    project_id: Project id to use for bucket listings.

  Returns:
    A WildcardIterator that handles the requested iteration.
  """
  url = StorageUrlFromString(url_str)
  if url.IsFileUrl():
    return FileWildcardIterator(url, debug=debug)
  # Anything that isn't a file URL is a cloud URL.
  return CloudWildcardIterator(url, gsutil_api, all_versions=all_versions,
                               debug=debug, project_id=project_id)
658