#!/usr/bin/env python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
16r"""Used for Google-internal artifact size tracking.
17
18See go/tf-devinfra/sizetrack.
19
20INVOCATION: The following flags are required:
21
22  sizetrack_helper.py \
23      --artifact=ARTIFACT, or --manual_bytes=MANUAL_BYTES
24      --artifact_id=ARTIFACT_ID \
25      --team=TEAM \
26      ... other optional args ...
27
28On Windows you might need something like:
29
30    C:\Python38\python.exe C:\path\to\sizetrack_helper.py ...
31
32PREREQUISITES:
33
34  1. Your current activated GCP user must have access scopes and IAM permissions
35     to do the following:
36
37      1. Query and load data into BigQuery
38      2. Upload files to GCS
39
40  2. Your environment must match the following criteria:
41
42      1. Current directory is a git repository
43      2. CL-based commits have a PiperOrigin-RevId trailer. This is the case
44         for any use of Copybara Single-source-of-truth, e.g. TensorFlow.
45         Only these commits are considered when running commands.
46"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import csv
import datetime
import os
import os.path
import pathlib
import platform
import subprocess


parser = argparse.ArgumentParser(
    usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "--project",
    type=str,
    default="tensorflow-testing",
    help="GCP project you can access.")
parser.add_argument(
    "--dataset",
    type=str,
    default="sizetracker",
    help="BigQuery dataset containing --table")
parser.add_argument(
    "--table", type=str, default="tensorflow_devinfra", help="BigQuery table.")
parser.add_argument(
    "--upload",
    action="store_true",
    help="Upload the artifact to --bucket for analysis.")
parser.add_argument(
    "--bucket",
    type=str,
    default="gs://tf-sizetracker-artifacts",
    help="GCS bucket for artifacts.")
parser.add_argument(
    "--team",
    type=str,
    help="For grouping in the dashboard and buckets; e.g. tf-lite-team.")
parser.add_argument(
    "--artifact_id",
    type=str,
    help="Unique ID for your artifact, used for sorting dashboards.")
parser.add_argument(
    "-n",
    "--dry_run",
    action="store_true",
    help="Dry run: do not load to BigQuery or upload to GCS.")
parser.add_argument(
    "--job",
    type=str,
    help="Name of job calling this script. Default: $KOKORO_JOB_NAME.")
parser.add_argument(
    "--build_id",
    type=str,
    help="UUID of build calling this script. Default: $KOKORO_BUILD_ID.")
parser.add_argument(
    "--print_schema",
    action="store_true",
    help="Print the table schema and don't do anything else.")
size = parser.add_mutually_exclusive_group()
size.add_argument(
    "--artifact",
    type=argparse.FileType("r"),
    help="Local file you are measuring.")
size.add_argument(
    "--manual_bytes",
    type=int,
    help="Manually set the recorded size instead of providing an artifact.")
FLAGS = parser.parse_args()


NOW = datetime.datetime.now(
    datetime.timezone.utc).replace(microsecond=0).isoformat()
TABLE_NAME = "{}.{}".format(FLAGS.dataset, FLAGS.table)
PROJECT_LEVEL_TABLE_NAME = "{}:{}".format(FLAGS.project, TABLE_NAME)
CL_TRAILER = "PiperOrigin-RevId"
PRETTY_COMMIT_DATE = "%cI"
PRETTY_CL = "%(trailers:key={},valueonly)".format(CL_TRAILER)
PRETTY_HEAD_INFO = "%h\t{cl}\t%s\t%ae\t%aI\t%ce\t%cI".format(cl=PRETTY_CL)
PRETTY_EARLY = "%aI\t{cl}\t%cI".format(cl=PRETTY_CL)
PRETTY_COMMIT = "%h"
# This is a BigQuery table schema defined as CSV
# See https://cloud.google.com/bigquery/docs/schemas
SCHEMA = ",".join([
    "id:string",
    "filename:string",
    # These 7 fields come from git's --pretty format
    # (%h PRETTY_CL %s %ae %aI %ce %cI); see PRETTY_HEAD_INFO above.
    "commit:string",
    "cl:int64",
    "description:string",
    "author:string",
    "author_date:timestamp",
    "committer:string",
    "commit_date:timestamp",
    # Done with format=pretty
    "earliest_commit:string",
    "earliest_cl:int64",
    "earliest_author_date:timestamp",
    "earliest_commit_date:timestamp",
    "all_commits:string",
    "all_cls:string",
    "bytes:int64",
    "team:string",
    "logged_date:timestamp",
    "uploaded_to:string",
    "job:string",
    "build_id:string",
])
# Select the earliest recorded commit in the same table for the same artifact
# and team. Used to determine the full range of tested commits for each
# invocation. Returns empty string if there are no earlier records.
BQ_GET_EARLIEST_INCLUDED_COMMIT = """
  SELECT
    commit
  FROM {table} WHERE
    commit_date < '{earlier_than_this_date}'
    AND id = '{artifact_id}'
    AND team = '{team}'
  ORDER BY commit_date DESC LIMIT 1
"""


def git_pretty(commit_range, pretty_format, n=None):
  r"""Run git log and return the cleaned results.

  Git is assumed to be available in the PATH.

  The PiperOrigin-RevId trailer always picks up an extra newline, so this
  splits entries on a null byte (\0, or %x00 for git log) and removes
  newlines.

  Args:
    commit_range: Standard range given to git log, e.g. HEAD~1..HEAD
    pretty_format: See https://git-scm.com/docs/pretty-formats
    n: Number of commits to get. By default, get all within commit_range.

  Returns:
    List of strings, one entry per commit, rendered with pretty_format.
  """
  n = [] if n is None else ["-n", str(n)]
  try:
    ret = subprocess.run([
        "git", "log", *n, "--date", "iso", "--grep", CL_TRAILER, commit_range,
        "--pretty=format:" + pretty_format + "%x00"
    ],
                         check=True,
                         universal_newlines=True,
                         stderr=subprocess.PIPE,
                         stdout=subprocess.PIPE)
  except subprocess.CalledProcessError as e:
    print(e.stderr)
    print(e.stdout)
    raise e
  out = ret.stdout.replace("\n", "")
  # Split on \0 and return the entries as a list, with surrounding whitespace
  # stripped and empty entries removed
  return list(filter(None, map(str.strip, out.split("\0"))))
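

# Illustrative usage of git_pretty (a sketch; not called anywhere in this
# script). The commit range is an assumption chosen for demonstration.
def _git_pretty_example():
  """Return short hashes for the CL-based commits in HEAD~3..HEAD.

  git log emits them newest first; get_all_tested_commits() below reverses
  such lists to get chronological order.
  """
  return git_pretty("HEAD~3..HEAD", PRETTY_COMMIT)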


def gcloud(tool, args, stdin=None):
  r"""Run a Google cloud utility.

  On Linux and macOS, utilities are assumed to be in the PATH.
  On Windows, utilities are assumed to be available as
    C:\Program Files (x86)\Google\Cloud SDK\google-cloud-sdk\bin\{tool}.cmd

  Args:
    tool: CLI tool, e.g. bq, gcloud, gsutil
    args: List of arguments, same format as subprocess.run
    stdin: String to send to stdin

  Returns:
    String, the stdout of the tool
  """

  if platform.system() == "Windows":
    tool = (r"C:\Program Files (x86)\Google\Cloud "
            r"SDK\google-cloud-sdk\bin\{}.cmd").format(tool)

  try:
    ret = subprocess.run([tool, *args],
                         check=True,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         input=stdin)
  except subprocess.CalledProcessError as e:
    print(e.stderr)
    print(e.stdout)
    raise e
  return ret.stdout.strip()


def bq(args, stdin=None):
  """Helper for running bq, the BigQuery tool."""
  # bq prints extra messages to stdout if ~/.bigqueryrc doesn't exist
  pathlib.Path(pathlib.Path.home() / ".bigqueryrc").touch()
  return gcloud(
      "bq", ["--project_id", FLAGS.project, "--headless", *args],
      stdin=stdin)
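

# Illustrative usage of bq() (a sketch; not called anywhere in this script):
# run an ad-hoc standard-SQL query and capture its CSV output, the same
# pattern get_all_tested_commits() uses below. The query itself is arbitrary.
def _bq_example_query():
  return bq(["query", "--format", "csv", "--nouse_legacy_sql"],
            stdin="SELECT 1 AS one")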


def get_all_tested_commits():
  """Get details about the full commit range tested by this invocation."""
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  _, _, _, _, _, _, current_commit_date = head_info[0].split("\t")

  query_earliest_included_commit = BQ_GET_EARLIEST_INCLUDED_COMMIT.format(
      table=TABLE_NAME,
      earlier_than_this_date=current_commit_date,
      artifact_id=FLAGS.artifact_id,
      team=FLAGS.team)

  # --format=csv returns an empty string if no results, or else two lines:
  # commit
  # COMMIT_HASH
  earliest_commit = bq(["query", "--format", "csv", "--nouse_legacy_sql"],
                       stdin=query_earliest_included_commit)

  # Compute the commit/CL range since the last test
  if earliest_commit:
    earliest_commit = earliest_commit.splitlines()[-1]  # Ignore CSV header
    early_author_date, early_cl, early_commit_date = git_pretty(
        earliest_commit, PRETTY_EARLY, n=1)[0].split("\t")

    all_range = "{commit}..HEAD".format(commit=earliest_commit)
    # Reversed: convert to chronological
    all_commits = ",".join(reversed(git_pretty(all_range, PRETTY_COMMIT)))
    all_changelists = ",".join(reversed(git_pretty(all_range, PRETTY_CL)))

    return [
        earliest_commit, early_cl, early_author_date, early_commit_date,
        all_commits, all_changelists
    ]
  else:
    # The artifact has never been tracked before this commit.
    # Empty cells in CSV loads are loaded as NULL values.
    return [""] * 6


def get_upload_path():
  """Generate URL for 'gsutil cp'."""
  if FLAGS.upload and FLAGS.artifact:
    artifact_filename = os.path.basename(FLAGS.artifact.name)
    # note: not os.path.join here, because gsutil is always linux-style
    # Using a timestamp prevents duplicate entries
    path = "{bucket}/{team}/{artifact_id}/{now}.{artifact_filename}".format(
        bucket=FLAGS.bucket,
        team=FLAGS.team,
        artifact_id=FLAGS.artifact_id,
        now=NOW,
        artifact_filename=artifact_filename)
    return path
  else:
    return ""
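
# For reference (illustrative; the team, artifact ID, and filename are
# hypothetical), a generated upload path with the default --bucket looks like:
#   gs://tf-sizetracker-artifacts/example-team/example_artifact/2020-01-01T00:00:00+00:00.libexample.so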


def build_row():
  """Assemble one row of data about this artifact."""
  (earliest_commit, early_cl, early_author_date, early_commit_date, all_commits,
   all_changelists) = get_all_tested_commits()

  # Use UTC to make sure machines in different timezones load consistent data
  current_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
  artifact_filename = ("NO_FILE" if not FLAGS.artifact else os.path.basename(
      FLAGS.artifact.name))
  size_bytes = FLAGS.manual_bytes or os.path.getsize(FLAGS.artifact.name)
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  all_head_info_items = head_info[0].split("\t")
  return [
      FLAGS.artifact_id,
      artifact_filename,
      *all_head_info_items,
      earliest_commit,
      early_cl,
      early_author_date,
      early_commit_date,
      all_commits,
      all_changelists,
      size_bytes,
      FLAGS.team,
      current_time,
      get_upload_path(),
      FLAGS.job,
      FLAGS.build_id,
  ]
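

# Illustrative sanity check (a sketch; not called anywhere in this script):
# verifies that a row built by build_row() has exactly one value per column
# declared in SCHEMA before it is loaded into BigQuery.
def _row_matches_schema(row):
  return len(row) == len(SCHEMA.split(","))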


def main():

  # Validate flags
  if FLAGS.print_schema:
    print(SCHEMA)
    exit(0)
  elif not FLAGS.team or not FLAGS.artifact_id or not (FLAGS.artifact or
                                                       FLAGS.manual_bytes):
    print(
        "--team and --artifact_id are required if --print_schema is not "
        "specified.\nYou must also specify one of --artifact or --manual_bytes."
        "\nPass -h or --help for usage.")
    exit(1)

  if not FLAGS.job:
    FLAGS.job = os.environ.get("KOKORO_JOB_NAME", "NO_JOB")
  if not FLAGS.build_id:
    FLAGS.build_id = os.environ.get("KOKORO_BUILD_ID", "NO_BUILD")

  # Generate the row of data about this artifact (written out as TSV below)
  next_tsv_row = build_row()

  # Upload the artifact to GCS if requested and a file was provided
  if FLAGS.upload and FLAGS.artifact:
    upload_path = get_upload_path()
    if FLAGS.dry_run:
      print("DRY RUN: Would gsutil cp to:\n{}".format(upload_path))
    else:
      gcloud("gsutil", ["cp", FLAGS.artifact.name, upload_path])

  # Load into BigQuery
  if FLAGS.dry_run:
    print("DRY RUN: Generated this TSV row:")
    print("\t".join(map(str, next_tsv_row)))
  else:
    with open("data.tsv", "w", newline="") as tsvfile:
      writer = csv.writer(tsvfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL,
                          lineterminator=os.linesep)
      writer.writerow(next_tsv_row)
    bq([
        "load", "--source_format", "CSV", "--field_delimiter", "tab",
        PROJECT_LEVEL_TABLE_NAME, "data.tsv", SCHEMA
    ])


if __name__ == "__main__":
  main()