1# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5import difflib
6import hashlib
7import itertools
8import json
9import os
10import sys
11import zipfile
12
13
14# When set and a difference is detected, a diff of what changed is printed.
15PRINT_EXPLANATIONS = int(os.environ.get('PRINT_BUILD_EXPLANATIONS', 0))
16
17# An escape hatch that causes all targets to be rebuilt.
18_FORCE_REBUILD = int(os.environ.get('FORCE_REBUILD', 0))
19
20
def CallAndRecordIfStale(
    function, record_path=None, input_paths=None, input_strings=None,
    output_paths=None, force=False, pass_changes=False):
  """Calls function if outputs are stale.

  Outputs are considered stale if:
  - any output_paths are missing, or
  - the contents of any file within input_paths has changed, or
  - the contents of input_strings has changed.

  After |function| returns, the freshly computed metadata is written to
  record_path. If |function| raises, the record is left untouched.

  To debug which files are out-of-date, set the environment variable:
      PRINT_BUILD_EXPLANATIONS=1

  Setting the environment variable FORCE_REBUILD=1 has the same effect as
  passing force=True.

  Args:
    function: The function to call.
    record_path: Path to record metadata. Must end in '.stamp'.
      Defaults to output_paths[0] + '.md5.stamp'
    input_paths: List of paths to calculate an md5 sum on.
    input_strings: List of strings to record verbatim.
    output_paths: List of output paths.
    force: Whether to treat outputs as missing regardless of whether they
      actually are.
    pass_changes: Whether to pass a Changes instance to |function|.
  """
  assert record_path or output_paths
  input_paths = input_paths or []
  input_strings = input_strings or []
  output_paths = output_paths or []
  record_path = record_path or output_paths[0] + '.md5.stamp'

  assert record_path.endswith('.stamp'), (
      'record paths must end in \'.stamp\' so that they are easy to find '
      'and delete')

  new_metadata = _Metadata()
  new_metadata.AddStrings(input_strings)

  for path in input_paths:
    if _IsZipFile(path):
      # Zip-like inputs are tracked per entry so that changes can be reported
      # at subpath granularity.
      entries = _ExtractZipEntries(path)
      new_metadata.AddZipFile(path, entries)
    else:
      new_metadata.AddFile(path, _Md5ForPath(path))

  old_metadata = None
  force = force or _FORCE_REBUILD
  # When forcing, every output is reported as missing.
  missing_outputs = [x for x in output_paths if force or not os.path.exists(x)]
  # When outputs are missing, don't bother gathering change information.
  if not missing_outputs and os.path.exists(record_path):
    with open(record_path, 'r') as jsonfile:
      try:
        old_metadata = _Metadata.FromFile(jsonfile)
      except Exception:  # pylint: disable=broad-except
        # Not yet using new file format (or the stamp is unreadable). Treat it
        # as if there were no previous record. Unlike a bare except, this lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

  changes = Changes(old_metadata, new_metadata, force, missing_outputs)
  if not changes.HasChanges():
    return

  if PRINT_EXPLANATIONS:
    # Single-argument print(...) behaves the same as a statement and as a
    # function call.
    print('=' * 80)
    print('Target is stale: %s' % record_path)
    print(changes.DescribeDifference())
    print('=' * 80)

  args = (changes,) if pass_changes else ()
  function(*args)

  # Only record the new state once |function| has completed.
  with open(record_path, 'w') as f:
    new_metadata.ToFile(f)
91
92
class Changes(object):
  """Provides an API for querying what changed between runs."""

  def __init__(self, old_metadata, new_metadata, force, missing_outputs):
    """Stores the state to compare.

    Args:
      old_metadata: Metadata from the previous run, or None when there is no
        usable previous record.
      new_metadata: Metadata computed for the current run.
      force: Whether a rebuild is being forced.
      missing_outputs: List of output paths that are considered missing.
    """
    self.old_metadata = old_metadata
    self.new_metadata = new_metadata
    self.force = force
    self.missing_outputs = missing_outputs

  def _GetOldTag(self, path, subpath=None):
    """Returns the previous tag for path / subpath (falsy if no old data)."""
    return self.old_metadata and self.old_metadata.GetTag(path, subpath)

  def HasChanges(self):
    """Returns whether any changes exist.

    Missing outputs count as a change even when the recorded inputs are
    identical, which keeps this in agreement with DescribeDifference().
    """
    return bool(
        self.force or
        self.missing_outputs or
        not self.old_metadata or
        self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5() or
        self.old_metadata.FilesMd5() != self.new_metadata.FilesMd5())

  def AddedOrModifiedOnly(self):
    """Returns whether the only changes were from added or modified (sub)files.

    No missing outputs, no removed paths/subpaths.
    """
    if (self.force or
        self.missing_outputs or
        not self.old_metadata or
        self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5()):
      return False
    if any(self.IterRemovedPaths()):
      return False
    for path in self.IterModifiedPaths():
      if any(self.IterRemovedSubpaths(path)):
        return False
    return True

  def IterAllPaths(self):
    """Generator for paths."""
    return self.new_metadata.IterPaths()

  def IterAllSubpaths(self, path):
    """Generator for subpaths."""
    return self.new_metadata.IterSubpaths(path)

  def IterAddedPaths(self):
    """Generator for paths that were added."""
    for path in self.new_metadata.IterPaths():
      if self._GetOldTag(path) is None:
        yield path

  def IterAddedSubpaths(self, path):
    """Generator for paths that were added within the given zip file."""
    for subpath in self.new_metadata.IterSubpaths(path):
      if self._GetOldTag(path, subpath) is None:
        yield subpath

  def IterRemovedPaths(self):
    """Generator for paths that were removed."""
    if self.old_metadata:
      for path in self.old_metadata.IterPaths():
        if self.new_metadata.GetTag(path) is None:
          yield path

  def IterRemovedSubpaths(self, path):
    """Generator for paths that were removed within the given zip file."""
    if self.old_metadata:
      for subpath in self.old_metadata.IterSubpaths(path):
        if self.new_metadata.GetTag(path, subpath) is None:
          yield subpath

  def IterModifiedPaths(self):
    """Generator for paths whose contents have changed."""
    for path in self.new_metadata.IterPaths():
      old_tag = self._GetOldTag(path)
      new_tag = self.new_metadata.GetTag(path)
      # A missing old tag means "added", which is reported separately.
      if old_tag is not None and old_tag != new_tag:
        yield path

  def IterModifiedSubpaths(self, path):
    """Generator for paths within a zip file whose contents have changed."""
    for subpath in self.new_metadata.IterSubpaths(path):
      old_tag = self._GetOldTag(path, subpath)
      new_tag = self.new_metadata.GetTag(path, subpath)
      # A missing old tag means "added", which is reported separately.
      if old_tag is not None and old_tag != new_tag:
        yield subpath

  def IterChangedPaths(self):
    """Generator for all changed paths (added/removed/modified)."""
    return itertools.chain(self.IterRemovedPaths(),
                           self.IterModifiedPaths(),
                           self.IterAddedPaths())

  def IterChangedSubpaths(self, path):
    """Generator for paths within a zip that were added/removed/modified."""
    return itertools.chain(self.IterRemovedSubpaths(path),
                           self.IterModifiedSubpaths(path),
                           self.IterAddedSubpaths(path))

  def DescribeDifference(self):
    """Returns a human-readable description of what changed.

    Only the first applicable reason is reported, checked in this order:
    forced rebuild, missing outputs, no previous metadata, changed input
    strings, changed input files.
    """
    if self.force:
      return 'force=True'
    elif self.missing_outputs:
      return 'Outputs do not exist:\n  ' + '\n  '.join(self.missing_outputs)
    elif self.old_metadata is None:
      return 'Previous stamp file not found.'

    if self.old_metadata.StringsMd5() != self.new_metadata.StringsMd5():
      ndiff = difflib.ndiff(self.old_metadata.GetStrings(),
                            self.new_metadata.GetStrings())
      # ndiff prefixes unchanged lines with a space; keep only the rest.
      changed = [s for s in ndiff if not s.startswith(' ')]
      return 'Input strings changed:\n  ' + '\n  '.join(changed)

    if self.old_metadata.FilesMd5() == self.new_metadata.FilesMd5():
      return "There's no difference."

    lines = []
    lines.extend('Added: ' + p for p in self.IterAddedPaths())
    lines.extend('Removed: ' + p for p in self.IterRemovedPaths())
    for path in self.IterModifiedPaths():
      lines.append('Modified: ' + path)
      lines.extend('  -> Subpath added: ' + p
                   for p in self.IterAddedSubpaths(path))
      lines.extend('  -> Subpath removed: ' + p
                   for p in self.IterRemovedSubpaths(path))
      lines.extend('  -> Subpath modified: ' + p
                   for p in self.IterModifiedSubpaths(path))
    if lines:
      return 'Input files changed:\n  ' + '\n  '.join(lines)
    return 'I have no idea what changed (there is a bug).'
222
223
224class _Metadata(object):
225  """Data model for tracking change metadata."""
226  # Schema:
227  # {
228  #   "files-md5": "VALUE",
229  #   "strings-md5": "VALUE",
230  #   "input-files": [
231  #     {
232  #       "path": "path.jar",
233  #       "tag": "{MD5 of entries}",
234  #       "entries": [
235  #         { "path": "org/chromium/base/Foo.class", "tag": "{CRC32}" }, ...
236  #       ]
237  #     }, {
238  #       "path": "path.txt",
239  #       "tag": "{MD5}",
240  #     }
241  #   ],
242  #   "input-strings": ["a", "b", ...],
243  # }
244  def __init__(self):
245    self._files_md5 = None
246    self._strings_md5 = None
247    self._files = []
248    self._strings = []
249    # Map of (path, subpath) -> entry. Created upon first call to _GetEntry().
250    self._file_map = None
251
252  @classmethod
253  def FromFile(cls, fileobj):
254    """Returns a _Metadata initialized from a file object."""
255    ret = cls()
256    obj = json.load(fileobj)
257    ret._files_md5 = obj['files-md5']
258    ret._strings_md5 = obj['strings-md5']
259    ret._files = obj['input-files']
260    ret._strings = obj['input-strings']
261    return ret
262
263  def ToFile(self, fileobj):
264    """Serializes metadata to the given file object."""
265    obj = {
266        "files-md5": self.FilesMd5(),
267        "strings-md5": self.StringsMd5(),
268        "input-files": self._files,
269        "input-strings": self._strings,
270    }
271    json.dump(obj, fileobj, indent=2)
272
273  def _AssertNotQueried(self):
274    assert self._files_md5 is None
275    assert self._strings_md5 is None
276    assert self._file_map is None
277
278  def AddStrings(self, values):
279    self._AssertNotQueried()
280    self._strings.extend(str(v) for v in values)
281
282  def AddFile(self, path, tag):
283    """Adds metadata for a non-zip file.
284
285    Args:
286      path: Path to the file.
287      tag: A short string representative of the file contents.
288    """
289    self._AssertNotQueried()
290    self._files.append({
291        'path': path,
292        'tag': tag,
293    })
294
295  def AddZipFile(self, path, entries):
296    """Adds metadata for a zip file.
297
298    Args:
299      path: Path to the file.
300      entries: List of (subpath, tag) tuples for entries within the zip.
301    """
302    self._AssertNotQueried()
303    tag = _ComputeInlineMd5(itertools.chain((e[0] for e in entries),
304                                            (e[1] for e in entries)))
305    self._files.append({
306        'path': path,
307        'tag': tag,
308        'entries': [{"path": e[0], "tag": e[1]} for e in entries],
309    })
310
311  def GetStrings(self):
312    """Returns the list of input strings."""
313    return self._strings
314
315  def FilesMd5(self):
316    """Lazily computes and returns the aggregate md5 of input files."""
317    if self._files_md5 is None:
318      # Omit paths from md5 since temporary files have random names.
319      self._files_md5 = _ComputeInlineMd5(
320          self.GetTag(p) for p in sorted(self.IterPaths()))
321    return self._files_md5
322
323  def StringsMd5(self):
324    """Lazily computes and returns the aggregate md5 of input strings."""
325    if self._strings_md5 is None:
326      self._strings_md5 = _ComputeInlineMd5(self._strings)
327    return self._strings_md5
328
329  def _GetEntry(self, path, subpath=None):
330    """Returns the JSON entry for the given path / subpath."""
331    if self._file_map is None:
332      self._file_map = {}
333      for entry in self._files:
334        self._file_map[(entry['path'], None)] = entry
335        for subentry in entry.get('entries', ()):
336          self._file_map[(entry['path'], subentry['path'])] = subentry
337    return self._file_map.get((path, subpath))
338
339  def GetTag(self, path, subpath=None):
340    """Returns the tag for the given path / subpath."""
341    ret = self._GetEntry(path, subpath)
342    return ret and ret['tag']
343
344  def IterPaths(self):
345    """Returns a generator for all top-level paths."""
346    return (e['path'] for e in self._files)
347
348  def IterSubpaths(self, path):
349    """Returns a generator for all subpaths in the given zip.
350
351    If the given path is not a zip file or doesn't exist, returns an empty
352    iterable.
353    """
354    outer_entry = self._GetEntry(path)
355    if not outer_entry:
356      return ()
357    subentries = outer_entry.get('entries', [])
358    return (entry['path'] for entry in subentries)
359
360
361def _UpdateMd5ForFile(md5, path, block_size=2**16):
362  with open(path, 'rb') as infile:
363    while True:
364      data = infile.read(block_size)
365      if not data:
366        break
367      md5.update(data)
368
369
def _UpdateMd5ForDirectory(md5, dir_path):
  """Feeds the contents of every file under |dir_path| into |md5|.

  Only file contents are hashed; file and directory names are not.

  Args:
    md5: A hash object to update.
    dir_path: Root of the directory tree to walk.
  """
  for root, dirs, files in os.walk(dir_path):
    # os.walk() reports entries in the arbitrary order of the underlying
    # directory listing. Sort both lists so the same tree always yields the
    # same digest; sorting |dirs| in place also fixes the descent order.
    dirs.sort()
    for f in sorted(files):
      _UpdateMd5ForFile(md5, os.path.join(root, f))
374
375
def _Md5ForPath(path):
  """Returns the hex md5 digest of a file, or of all files in a directory."""
  digest = hashlib.md5()
  # Directories are hashed by walking their files; anything else is read as
  # a single file.
  updater = (_UpdateMd5ForDirectory if os.path.isdir(path)
             else _UpdateMd5ForFile)
  updater(digest, path)
  return digest.hexdigest()
383
384
385def _ComputeInlineMd5(iterable):
386  """Computes the md5 of the concatenated parameters."""
387  md5 = hashlib.md5()
388  for item in iterable:
389    md5.update(str(item))
390  return md5.hexdigest()
391
392
393def _IsZipFile(path):
394  """Returns whether to treat the given file as a zip file."""
395  # ijar doesn't set the CRC32 field.
396  if path.endswith('.interface.jar'):
397    return False
398  return path[-4:] in ('.zip', '.apk', '.jar') or path.endswith('.srcjar')
399
400
401def _ExtractZipEntries(path):
402  """Returns a list of (path, CRC32) of all files within |path|."""
403  entries = []
404  with zipfile.ZipFile(path) as zip_file:
405    for zip_info in zip_file.infolist():
406      # Skip directories and empty files.
407      if zip_info.CRC:
408        entries.append(
409            (zip_info.filename, zip_info.CRC + zip_info.compress_type))
410  return entries
411