1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""File IO methods that wrap the C++ FileSystem API.
16
17The C++ FileSystem API is SWIG wrapped in file_io.i. These functions call those
18to accomplish basic File IO operations.
19"""
20from __future__ import absolute_import
21from __future__ import division
22from __future__ import print_function
23
24import binascii
25import os
26import uuid
27
28import six
29
30from tensorflow.python import pywrap_tensorflow
31from tensorflow.python.framework import c_api_util
32from tensorflow.python.framework import errors
33from tensorflow.python.util import compat
34from tensorflow.python.util import deprecation
35from tensorflow.python.util.tf_export import tf_export
36
37# A good default block size depends on the system in question.
38# A somewhat conservative default chosen here.
39_DEFAULT_BLOCK_SIZE = 16 * 1024 * 1024
40
41
class FileIO(object):
  """File object that reads and writes through the C++ FileSystem API.

  The constructor takes the following arguments:
  name: name of the file
  mode: one of 'r', 'w', 'a', 'r+', 'w+', 'a+'. Append 'b' for bytes mode.

  Instances iterate over the lines of the file and can be used in a "with"
  statement, which closes the file on exit.

  Reading goes through a BufferedInputStream that is created with a buffer
  size of 1024 * 512 bytes.
  """

  def __init__(self, name, mode):
    self.__name = name
    self.__mode = mode
    # The read stream and the writable file are created lazily, on first use,
    # by _preread_check() and _prewrite_check().
    self._read_buf = None
    self._writable_file = None
    self._binary_mode = "b" in mode
    # 'b' only selects the type of the values handed back to the caller, so
    # it is stripped before the mode is validated.
    base_mode = mode.replace("b", "")
    readable_modes = ("r", "r+", "a+", "w+")
    writable_modes = ("a", "w", "r+", "a+", "w+")
    if base_mode not in readable_modes and base_mode not in writable_modes:
      raise errors.InvalidArgumentError(
          None, None, "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")
    self._read_check_passed = base_mode in readable_modes
    self._write_check_passed = base_mode in writable_modes

  @property
  def name(self):
    """Returns the file name."""
    return self.__name

  @property
  def mode(self):
    """Returns the mode in which the file was opened."""
    return self.__mode

  def _preread_check(self):
    """Creates the buffered input stream unless one already exists.

    Raises:
      errors.PermissionDeniedError: If the mode does not allow reading.
    """
    if self._read_buf:
      return
    if not self._read_check_passed:
      raise errors.PermissionDeniedError(None, None,
                                         "File isn't open for reading")
    with errors.raise_exception_on_not_ok_status() as status:
      self._read_buf = pywrap_tensorflow.CreateBufferedInputStream(
          compat.as_bytes(self.__name), 1024 * 512, status)

  def _prewrite_check(self):
    """Creates the writable file unless one already exists.

    Raises:
      errors.PermissionDeniedError: If the mode does not allow writing.
    """
    if self._writable_file:
      return
    if not self._write_check_passed:
      raise errors.PermissionDeniedError(None, None,
                                         "File isn't open for writing")
    with errors.raise_exception_on_not_ok_status() as status:
      self._writable_file = pywrap_tensorflow.CreateWritableFile(
          compat.as_bytes(self.__name), compat.as_bytes(self.__mode), status)

  def _prepare_value(self, val):
    """Converts `val` to bytes in binary mode and to a string otherwise."""
    convert = compat.as_bytes if self._binary_mode else compat.as_str_any
    return convert(val)

  def size(self):
    """Returns the size of the file, as reported by stat()."""
    file_statistics = stat(self.__name)
    return file_statistics.length

  def write(self, file_content):
    """Writes file_content to the file. Appends to the end of the file."""
    self._prewrite_check()
    with errors.raise_exception_on_not_ok_status() as status:
      payload = compat.as_bytes(file_content)
      pywrap_tensorflow.AppendToFile(payload, self._writable_file, status)

  def read(self, n=-1):
    """Returns the contents of a file as a string.

    Reading starts at the current position in the file.

    Args:
      n: Number of bytes to read. With the default of -1, everything between
        the current position and the end of the file is read.

    Returns:
      'n' bytes of the file (or the rest of the file) as bytes in bytes mode,
      or as a string in string (regular) mode.
    """
    self._preread_check()
    with errors.raise_exception_on_not_ok_status() as status:
      # -1 means "to the end": what is left is the size minus the position.
      length = n if n != -1 else self.size() - self.tell()
      raw = pywrap_tensorflow.ReadFromStream(self._read_buf, length, status)
      return self._prepare_value(raw)

  @deprecation.deprecated_args(
      None,
      "position is deprecated in favor of the offset argument.",
      "position")
  def seek(self, offset=None, whence=0, position=None):
    # TODO(jhseu): Delete later. Used to omit `position` from docs.
    # pylint: disable=g-doc-args
    """Moves the read position to the given offset.

    Args:
      offset: The byte count relative to the whence argument.
      whence: Reference point for offset. Valid values are:
        0: start of the file (default)
        1: the current position of the file
        2: the end of the file. offset is usually negative.

    Raises:
      TypeError: If neither or both of offset and position are given.
      errors.InvalidArgumentError: If whence is not 0, 1 or 2.
    """
    # pylint: enable=g-doc-args
    self._preread_check()
    # `offset` is only a keyword argument for backwards-compatibility with
    # `position`, so a missing value has to be detected by hand.
    # TODO(jhseu): Make `offset` a positional argument after `position` is
    # deleted.
    if position is None:
      if offset is None:
        raise TypeError("seek(): offset argument required")
    elif offset is not None:
      raise TypeError("seek(): offset and position may not be set "
                      "simultaneously.")
    else:
      offset = position

    with errors.raise_exception_on_not_ok_status() as status:
      # Turn the offset into an absolute one; whence == 0 already is.
      if whence == 1:
        offset += self.tell()
      elif whence == 2:
        offset += self.size()
      elif whence != 0:
        raise errors.InvalidArgumentError(
            None, None,
            "Invalid whence argument: {}. Valid values are 0, 1, or 2."
            .format(whence))
      seek_status = self._read_buf.Seek(offset)
      pywrap_tensorflow.Set_TF_Status_from_Status(status, seek_status)

  def readline(self):
    r"""Reads the next line from the file. Leaves the '\n' at the end."""
    self._preread_check()
    raw_line = self._read_buf.ReadLineAsString()
    return self._prepare_value(raw_line)

  def readlines(self):
    """Returns all lines from the file in a list."""
    self._preread_check()
    collected = []
    line = self.readline()
    # An empty value from readline() ends the loop.
    while line:
      collected.append(line)
      line = self.readline()
    return collected

  def tell(self):
    """Returns the current position in the file."""
    if not self._read_check_passed:
      # Write-only modes have no read stream, so ask the writable file.
      self._prewrite_check()

      with errors.raise_exception_on_not_ok_status() as status:
        return pywrap_tensorflow.TellFile(self._writable_file, status)
    self._preread_check()
    return self._read_buf.Tell()

  def __enter__(self):
    """Make usable with "with" statement."""
    return self

  def __exit__(self, unused_type, unused_value, unused_traceback):
    """Make usable with "with" statement."""
    self.close()

  def __iter__(self):
    return self

  def next(self):
    line = self.readline()
    if line:
      return line
    raise StopIteration()

  def __next__(self):
    return self.next()

  def flush(self):
    """Flushes the Writable file.

    This only ensures that the data has made its way out of the process without
    any guarantees on whether it's written to disk. This means that the
    data would survive an application crash but not necessarily an OS crash.
    """
    if not self._writable_file:
      # Nothing has been opened for writing, so there is nothing to flush.
      return
    with errors.raise_exception_on_not_ok_status() as status:
      flush_status = self._writable_file.Flush()
      pywrap_tensorflow.Set_TF_Status_from_Status(status, flush_status)

  def close(self):
    """Closes FileIO. Should be called for the WritableFile to be flushed."""
    self._read_buf = None
    if self._writable_file:
      with errors.raise_exception_on_not_ok_status() as status:
        close_status = self._writable_file.Close()
        pywrap_tensorflow.Set_TF_Status_from_Status(status, close_status)
    self._writable_file = None
248
249
@tf_export(v1=["gfile.Exists"])
def file_exists(filename):
  """Checks whether `filename` refers to an existing path.

  Forwards to `file_exists_v2`.

  Args:
    filename: string, a path

  Returns:
    True if the path exists, whether it is a file or a directory.
    False if the path does not exist and there are no filesystem errors.

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.
  """
  return file_exists_v2(path=filename)
265
266
@tf_export("io.gfile.exists")
def file_exists_v2(path):
  """Checks whether `path` refers to an existing path.

  Args:
    path: string, a path

  Returns:
    True if the path exists, whether it is a file or a directory.
    False if the path does not exist and there are no filesystem errors.

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.
  """
  try:
    with errors.raise_exception_on_not_ok_status() as tf_status:
      encoded_path = compat.as_bytes(path)
      pywrap_tensorflow.FileExists(encoded_path, tf_status)
  except errors.NotFoundError:
    # NotFoundError is the "does not exist" answer; other errors propagate.
    return False
  else:
    return True
287
288
@tf_export(v1=["gfile.Remove"])
def delete_file(filename):
  """Deletes the file located at 'filename'.

  Forwards to `delete_file_v2`.

  Args:
    filename: string, a filename

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.  E.g.,
    NotFoundError if the file does not exist.
  """
  delete_file_v2(path=filename)
301
302
@tf_export("io.gfile.remove")
def delete_file_v2(path):
  """Deletes the path located at 'path'.

  Args:
    path: string, a path

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.  E.g.,
    NotFoundError if the path does not exist.
  """
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_path = compat.as_bytes(path)
    pywrap_tensorflow.DeleteFile(encoded_path, tf_status)
316
317
def read_file_to_string(filename, binary_mode=False):
  """Reads the entire contents of a file to a string.

  The file is closed again before returning, also when reading fails.

  Args:
    filename: string, path to a file
    binary_mode: whether to open the file in binary mode or not. This changes
        the type of the object returned.

  Returns:
    contents of the file as a string or bytes.

  Raises:
    errors.OpError: Raises variety of errors that are subtypes e.g.
    NotFoundError etc.
  """
  mode = "rb" if binary_mode else "r"
  # Use the context manager, like write_string_to_file does, so the read
  # stream is released deterministically rather than when the FileIO object
  # happens to be garbage collected.
  with FileIO(filename, mode=mode) as f:
    return f.read()
338
339
def write_string_to_file(filename, file_content):
  """Writes `file_content` to the file at `filename`.

  The file is opened in "w" mode and closed once the content is written.

  Args:
    filename: string, path to a file
    file_content: string, contents that need to be written to the file

  Raises:
    errors.OpError: If there are errors during the operation.
  """
  writer = FileIO(filename, mode="w")
  with writer:
    writer.write(file_content)
352
353
@tf_export(v1=["gfile.Glob"])
def get_matching_files(filename):
  """Lists the files that match the given glob pattern(s).

  Forwards to `get_matching_files_v2`.

  Args:
    filename: string or iterable of strings. The glob pattern(s).

  Returns:
    A list of strings containing filenames that match the given pattern(s).

  Raises:
    errors.OpError: If there are filesystem / directory listing errors.
  """
  return get_matching_files_v2(pattern=filename)
368
369
@tf_export("io.gfile.glob")
def get_matching_files_v2(pattern):
  """Returns a list of files that match the given pattern(s).

  Args:
    pattern: string (or bytes) or iterable of strings. The glob pattern(s).

  Returns:
    A list of strings containing filenames that match the given pattern(s).
    With several patterns, the matches of each pattern are concatenated in
    the order in which the patterns were given.

  Raises:
    errors.OpError: If there are filesystem / directory listing errors.
  """
  # A lone pattern is handled as a one-element list. `bytes` is named
  # explicitly because on Python 3 it is not part of six.string_types, and
  # iterating over it would yield integers rather than patterns.
  if isinstance(pattern, (six.string_types, bytes)):
    patterns = [pattern]
  else:
    patterns = pattern
  with errors.raise_exception_on_not_ok_status() as status:
    return [
        # Convert the filenames to string from bytes.
        compat.as_str_any(matching_filename)
        for single_pattern in patterns
        for matching_filename in pywrap_tensorflow.GetMatchingFiles(
            compat.as_bytes(single_pattern), status)
    ]
399
400
@tf_export(v1=["gfile.MkDir"])
def create_dir(dirname):
  """Creates a directory with the name 'dirname'.

  Forwards to `create_dir_v2`.

  Args:
    dirname: string, name of the directory to be created

  Notes:
    The parent directories need to exist. Use recursive_create_dir instead if
    there is the possibility that the parent dirs don't exist.

  Raises:
    errors.OpError: If the operation fails.
  """
  create_dir_v2(path=dirname)
416
417
@tf_export("io.gfile.mkdir")
def create_dir_v2(path):
  """Creates a single directory named by 'path'.

  Args:
    path: string, name of the directory to be created

  Notes:
    The parent directories need to exist. Use recursive_create_dir instead if
    there is the possibility that the parent dirs don't exist.

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_path = compat.as_bytes(path)
    pywrap_tensorflow.CreateDir(encoded_path, tf_status)
434
435
@tf_export(v1=["gfile.MakeDirs"])
def recursive_create_dir(dirname):
  """Creates a directory together with any missing parent directories.

  Forwards to `recursive_create_dir_v2`. It succeeds if dirname already exists
  and is writable.

  Args:
    dirname: string, name of the directory to be created

  Raises:
    errors.OpError: If the operation fails.
  """
  recursive_create_dir_v2(path=dirname)
449
450
@tf_export("io.gfile.makedirs")
def recursive_create_dir_v2(path):
  """Creates a directory together with any missing parent directories.

  It succeeds if path already exists and is writable.

  Args:
    path: string, name of the directory to be created

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_path = compat.as_bytes(path)
    pywrap_tensorflow.RecursivelyCreateDir(encoded_path, tf_status)
465
466
@tf_export(v1=["gfile.Copy"])
def copy(oldpath, newpath, overwrite=False):
  """Copies data from oldpath to newpath.

  Forwards to `copy_v2`.

  Args:
    oldpath: string, name of the file whose contents need to be copied
    newpath: string, name of the file to which to copy to
    overwrite: boolean, if false it is an error for newpath to be occupied by
        an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  copy_v2(src=oldpath, dst=newpath, overwrite=overwrite)
481
482
@tf_export("io.gfile.copy")
def copy_v2(src, dst, overwrite=False):
  """Copies data from src to dst.

  Args:
    src: string, name of the file whose contents need to be copied
    dst: string, name of the file to which to copy to
    overwrite: boolean, if false it is an error for `dst` to be occupied by an
        existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_src = compat.as_bytes(src)
    encoded_dst = compat.as_bytes(dst)
    pywrap_tensorflow.CopyFile(encoded_src, encoded_dst, overwrite, tf_status)
499
500
@tf_export(v1=["gfile.Rename"])
def rename(oldname, newname, overwrite=False):
  """Renames or moves a file / directory.

  Forwards to `rename_v2`.

  Args:
    oldname: string, pathname for a file
    newname: string, pathname to which the file needs to be moved
    overwrite: boolean, if false it's an error for `newname` to be occupied by
        an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  rename_v2(src=oldname, dst=newname, overwrite=overwrite)
515
516
@tf_export("io.gfile.rename")
def rename_v2(src, dst, overwrite=False):
  """Renames or moves a file / directory.

  Args:
    src: string, pathname for a file
    dst: string, pathname to which the file needs to be moved
    overwrite: boolean, if false it's an error for `dst` to be occupied by
        an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_src = compat.as_bytes(src)
    encoded_dst = compat.as_bytes(dst)
    pywrap_tensorflow.RenameFile(
        encoded_src, encoded_dst, overwrite, tf_status)
533
534
def atomic_write_string_to_file(filename, contents, overwrite=True):
  """Writes to `filename` atomically.

  When `filename` appears in the filesystem it contains all of `contents`,
  whereas write_string_to_file can leave a file visible with `contents` only
  partially written.

  This is done by writing the contents to a uniquely named temp file next to
  `filename` and then renaming that file to `filename`. If the rename fails,
  the temp file is deleted and the error is re-raised.

  Args:
    filename: string, pathname for a file
    contents: string, contents that need to be written to the file
    overwrite: boolean, if false it's an error for `filename` to be occupied by
        an existing file.
  """
  # A random hex suffix keeps concurrent writers from sharing a temp file.
  unique_suffix = ".tmp" + uuid.uuid4().hex
  staging_path = filename + unique_suffix
  write_string_to_file(staging_path, contents)
  try:
    rename(staging_path, filename, overwrite)
  except errors.OpError:
    # Do not leave the temp file behind when the rename did not go through.
    delete_file(staging_path)
    raise
557
558
@tf_export(v1=["gfile.DeleteRecursively"])
def delete_recursively(dirname):
  """Deletes everything under dirname recursively.

  Forwards to `delete_recursively_v2`.

  Args:
    dirname: string, a path to a directory

  Raises:
    errors.OpError: If the operation fails.
  """
  delete_recursively_v2(path=dirname)
570
571
@tf_export("io.gfile.rmtree")
def delete_recursively_v2(path):
  """Deletes everything under path recursively.

  Args:
    path: string, a path

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_path = compat.as_bytes(path)
    pywrap_tensorflow.DeleteRecursively(encoded_path, tf_status)
584
585
@tf_export(v1=["gfile.IsDirectory"])
def is_directory(dirname):
  """Tells whether the given path is a directory.

  Forwards to `is_directory_v2`.

  Args:
    dirname: string, path to a potential directory

  Returns:
    True, if the path is a directory; False otherwise
  """
  return is_directory_v2(path=dirname)
597
598
@tf_export("io.gfile.isdir")
def is_directory_v2(path):
  """Tells whether the given path is a directory.

  Args:
    path: string, path to a potential directory

  Returns:
    True, if the path is a directory; False otherwise
  """
  # The status object is handed to the wrapped call but is not inspected
  # afterwards, so this function does not raise based on its content.
  tf_status = c_api_util.ScopedTFStatus()
  encoded_path = compat.as_bytes(path)
  return pywrap_tensorflow.IsDirectory(encoded_path, tf_status)
611
612
@tf_export(v1=["gfile.ListDirectory"])
def list_directory(dirname):
  """Lists the entries contained within a directory.

  Forwards to `list_directory_v2`. The list is in arbitrary order. It does not
  contain the special entries "." and "..".

  Args:
    dirname: string, path to a directory

  Returns:
    [filename1, filename2, ... filenameN] as strings

  Raises:
    errors.NotFoundError if directory doesn't exist
  """
  return list_directory_v2(path=dirname)
630
631
@tf_export("io.gfile.listdir")
def list_directory_v2(path):
  """Lists the entries contained within a directory.

  The list is in arbitrary order. It does not contain the special entries "."
  and "..".

  Args:
    path: string, path to a directory

  Returns:
    [filename1, filename2, ... filenameN] as strings

  Raises:
    errors.NotFoundError if directory doesn't exist
  """
  if not is_directory(path):
    not_found_message = "Could not find directory {}".format(path)
    raise errors.NotFoundError(
        node_def=None, op=None, message=not_found_message)
  with errors.raise_exception_on_not_ok_status() as tf_status:
    children = pywrap_tensorflow.GetChildren(compat.as_bytes(path), tf_status)
    entries = []
    for child in children:
      # The wrapped call hands back a vector of string whose elements should
      # be interpreted as strings, not bytes.
      entries.append(compat.as_str_any(child))
    return entries
661
662
@tf_export(v1=["gfile.Walk"])
def walk(top, in_order=True):
  """Recursive directory tree generator for directories.

  Forwards to `walk_v2`, passing `in_order` as its `topdown` argument and
  leaving `onerror` unset.

  Args:
    top: string, a Directory name
    in_order: bool, Traverse in order if True, post order if False.

  Errors that happen while listing directories are ignored.

  Yields:
    Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
    all its subdirectories and leaf files.
    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
    as strings
  """
  return walk_v2(top, topdown=in_order)
680
681
@tf_export("io.gfile.walk")
def walk_v2(top, topdown=True, onerror=None):
  """Recursive directory tree generator for directories.

  Args:
    top: string, a Directory name
    topdown: bool, Traverse pre order if True, post order if False.
    onerror: optional handler for errors. Should be a function, it will be
      called with the error as argument. Rethrowing the error aborts the walk.

  A directory that cannot be listed (NotFoundError) is skipped. Without
  `onerror` the error is ignored silently; with `onerror` the handler is
  called with the error first, and the walk continues with the remaining
  directories unless the handler raises.

  Yields:
    Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
    all its subdirectories and leaf files.
    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
    as strings
  """
  top = compat.as_str_any(top)
  try:
    listing = list_directory(top)
  except errors.NotFoundError as err:
    if onerror:
      onerror(err)
    # There is no listing to work with, so stop here whether or not a handler
    # was called. Falling through would use `listing` before it is assigned.
    return

  # Split the entries into subdirectories and plain files.
  files = []
  subdirs = []
  for item in listing:
    full_path = os.path.join(top, item)
    if is_directory(full_path):
      subdirs.append(item)
    else:
      files.append(item)

  here = (top, subdirs, files)

  # Pre order: the directory itself comes before its subdirectories.
  if topdown:
    yield here

  for subdir in subdirs:
    for subitem in walk_v2(os.path.join(top, subdir), topdown, onerror=onerror):
      yield subitem

  # Post order: the directory itself comes after its subdirectories.
  if not topdown:
    yield here
729
730
@tf_export(v1=["gfile.Stat"])
def stat(filename):
  """Returns file statistics for a given path.

  Forwards to `stat_v2`.

  Args:
    filename: string, path to a file

  Returns:
    FileStatistics struct that contains information about the path

  Raises:
    errors.OpError: If the operation fails.
  """
  return stat_v2(path=filename)
745
746
@tf_export("io.gfile.stat")
def stat_v2(path):
  """Returns file statistics for a given path.

  Args:
    path: string, path to a file

  Returns:
    FileStatistics struct that contains information about the path

  Raises:
    errors.OpError: If the operation fails.
  """
  # The wrapped call fills in this struct rather than returning a new one.
  stats = pywrap_tensorflow.FileStatistics()
  with errors.raise_exception_on_not_ok_status() as tf_status:
    encoded_path = compat.as_bytes(path)
    pywrap_tensorflow.Stat(encoded_path, stats, tf_status)
    return stats
764
765
def filecmp(filename_a, filename_b):
  """Compare two files, returning True if they are the same, False otherwise.

  The sizes are compared first, so files of different sizes are reported as
  different without reading them. Files of equal size are then compared by
  the crc32 computed over each whole file.

  You might wonder: why not use Python's filecmp.cmp() instead? The answer is
  that the builtin library is not robust to the many different filesystems
  TensorFlow runs on, and so we here perform a similar comparison with
  the more robust FileIO.

  Args:
    filename_a: string path to the first file.
    filename_b: string path to the second file.

  Returns:
    True if the files are the same, False otherwise.
  """
  # Cheap check first: different sizes mean different files.
  if FileIO(filename_a, "rb").size() != FileIO(filename_b, "rb").size():
    return False

  # Size is the same. Do a full check.
  return file_crc32(filename_a) == file_crc32(filename_b)
793
794
def file_crc32(filename, block_size=_DEFAULT_BLOCK_SIZE):
  """Get the crc32 of the passed file.

  The crc32 of a file can be used for error checking; two files with the same
  crc32 are considered equivalent. Note that the entire file must be read
  to produce the crc32.

  Args:
    filename: string, path to a file
    block_size: Integer, process the files by reading blocks of `block_size`
      bytes. Use -1 to read the file at once.

  Returns:
    hexadecimal as string, the crc32 of the passed file.
  """
  checksum = 0
  with FileIO(filename, mode="rb") as stream:
    while True:
      block = stream.read(n=block_size)
      if not block:
        # An empty read means there is nothing left to add to the checksum.
        break
      # Feed the running value back in so that the blocks are chained.
      checksum = binascii.crc32(block, checksum)
  # Mask to 32 bits so that the value is formatted as an unsigned number.
  return hex(checksum & 0xFFFFFFFF)
817