#!/usr/bin/env python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Used for Google-internal artifact size tracking.

See go/tf-devinfra/sizetrack.

INVOCATION: The following flags are required:

  sizetrack_helper.py \
      --artifact=ARTIFACT, or --manual_bytes=MANUAL_BYTES
      --artifact_id=ARTIFACT_ID \
      --team=TEAM \
      ... other optional args ...

On Windows you might need something like:

  C:\Python38\python.exe C:\path\to\sizetrack_helper.py ...

PREREQUISITES:

  1. Your current activated GCP user must have access scopes and IAM
     permissions to do the following:

     1. Query and load data into BigQuery
     2. Upload files to GCS

  2. Your environment must match the following criteria:

     1. Current directory is a git repository
     2. CL-based commits have a PiperOrigin-RevId trailer. This is the case
        for any use of Copybara Single-source-of-truth, e.g. TensorFlow.
        Only these commits are considered when running commands.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import csv
import datetime
import os
import os.path
import pathlib
import platform
import subprocess


parser = argparse.ArgumentParser(
    usage=__doc__, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
    "--project",
    type=str,
    default="tensorflow-testing",
    help="GCP project you can access.")
parser.add_argument(
    "--dataset",
    type=str,
    default="sizetracker",
    help="BigQuery dataset containing --table")
parser.add_argument(
    "--table", type=str, default="tensorflow_devinfra", help="BigQuery table.")
parser.add_argument(
    "--upload",
    action="store_true",
    help="Upload the artifact to --bucket for analysis.")
parser.add_argument(
    "--bucket",
    type=str,
    default="gs://tf-sizetracker-artifacts",
    help="GCS bucket for artifacts.")
parser.add_argument(
    "--team",
    type=str,
    help="For grouping in the dashboard and buckets; e.g. tf-lite-team.")
parser.add_argument(
    "--artifact_id",
    type=str,
    help="Unique ID for your artifact, used for sorting dashboards.")
parser.add_argument(
    "-n",
    "--dry_run",
    action="store_true",
    help="Dry run: do not load to BigQuery or upload to GCS.")
parser.add_argument(
    "--job",
    type=str,
    help="Name of job calling this script. Default: $KOKORO_JOB_NAME.")
parser.add_argument(
    "--build_id",
    type=str,
    help="UUID of build calling this script. Default: $KOKORO_BUILD_ID.")
parser.add_argument(
    "--print_schema",
    action="store_true",
    help="Print the table schema and don't do anything else.")
size = parser.add_mutually_exclusive_group()
size.add_argument(
    "--artifact",
    type=argparse.FileType("r"),
    help="Local file you are measuring.")
size.add_argument(
    "--manual_bytes",
    type=int,
    help="Manually set the recorded size instead of providing an artifact.")
FLAGS = parser.parse_args()


NOW = datetime.datetime.now(
    datetime.timezone.utc).replace(microsecond=0).isoformat()
TABLE_NAME = "{}.{}".format(FLAGS.dataset, FLAGS.table)
PROJECT_LEVEL_TABLE_NAME = "{}:{}".format(FLAGS.project, TABLE_NAME)
CL_TRAILER = "PiperOrigin-RevId"
PRETTY_COMMIT_DATE = "%cI"
PRETTY_CL = "%(trailers:key={},valueonly)".format(CL_TRAILER)
PRETTY_HEAD_INFO = "%h\t{cl}\t%s\t%ae\t%aI\t%ce\t%cI".format(cl=PRETTY_CL)
PRETTY_EARLY = "%aI\t{cl}\t%cI".format(cl=PRETTY_CL)
PRETTY_COMMIT = "%h"
# This is a BigQuery table schema defined as CSV.
# See https://cloud.google.com/bigquery/docs/schemas
SCHEMA = ",".join([
    "id:string",
    "filename:string",
    # These lines come straight from git's format=pretty output:
    # %h $CL_PRETTY %s %ae %aI %ce %cI
    "commit:string",
    "cl:int64",
    "description:string",
    "author:string",
    "author_date:timestamp",
    "committer:string",
    "commit_date:timestamp",
    # Done with format=pretty
    "earliest_commit:string",
    "earliest_cl:int64",
    "earliest_author_date:timestamp",
    "earliest_commit_date:timestamp",
    "all_commits:string",
    "all_cls:string",
    "bytes:int64",
    "team:string",
    "logged_date:timestamp",
    "uploaded_to:string",
    "job:string",
    "build_id:string",
])
# Select the earliest recorded commit in the same table for the same artifact
# and team. Used to determine the full range of tested commits for each
# invocation. Returns an empty string if there are no earlier records.
BQ_GET_EARLIEST_INCLUDED_COMMIT = """
  SELECT
    commit
  FROM {table} WHERE
    commit_date < '{earlier_than_this_date}'
    AND id = '{artifact_id}'
    AND team = '{team}'
  ORDER BY commit_date DESC LIMIT 1
"""


# pylint: disable=unused-argument
def git_pretty(commit_range, pretty_format, n=None):
  r"""Run git log and return the cleaned results.

  Git is assumed to be available in the PATH.

  The PiperOrigin-RevId trailer always picks up an extra newline, so this
  splits entries on a null byte (\0, or %x00 for git log) and removes
  newlines.

  Args:
    commit_range: Standard range given to git log, e.g. HEAD~1..HEAD
    pretty_format: See https://git-scm.com/docs/pretty-formats
    n: Number of commits to get. By default, get all within commit_range.

  Returns:
    List of strings of whatever the format string was.
190 """ 191 n = [] if n is None else ["-n", "1"] 192 try: 193 ret = subprocess.run([ 194 "git", "log", *n, "--date", "iso", "--grep", CL_TRAILER, commit_range, 195 "--pretty=format:" + pretty_format + "%x00" 196 ], 197 check=True, 198 universal_newlines=True, 199 stderr=subprocess.PIPE, 200 stdout=subprocess.PIPE) 201 except subprocess.CalledProcessError as e: 202 print(e.stderr) 203 print(e.stdout) 204 raise e 205 out = ret.stdout.replace("\n", "") 206 # Split by \0 and make list of text, extra whitespace and empty lines removed 207 return list(filter(None, map(str.strip, out.split("\0")))) 208 209 210def gcloud(tool, args, stdin=None): 211 r"""Run a Google cloud utility. 212 213 On Linux and MacOS, utilities are assumed to be in the PATH. 214 On Windows, utilities are assumed to be available as 215 C:\Program Files (x86)\Google\Cloud SDK\google-cloud-sdk\bin\{tool}.cmd 216 217 Args: 218 tool: CLI tool, e.g. bq, gcloud, gsutil 219 args: List of arguments, same format as subprocess.run 220 stdin: String to send to stdin 221 222 Returns: 223 String, the stdout of the tool 224 """ 225 226 if platform.system() == "Windows": 227 tool = (r"C:\Program Files (x86)\Google\Cloud " 228 r"SDK\google-cloud-sdk\bin\{}.cmd").format(tool) 229 230 try: 231 ret = subprocess.run([tool, *args], 232 check=True, 233 universal_newlines=True, 234 stdout=subprocess.PIPE, 235 stderr=subprocess.PIPE, 236 input=stdin) 237 except subprocess.CalledProcessError as e: 238 print(e.stderr) 239 print(e.stdout) 240 raise e 241 return ret.stdout.strip() 242 243 244def bq(args, stdin=None): 245 """Helper for running bq, the BigQuery tool.""" 246 # bq prints extra messages to stdout if ~/.bigqueryrc doesn't exist 247 pathlib.Path(pathlib.Path.home() / ".bigqueryrc").touch() 248 return gcloud( 249 "bq", ["--project_id", FLAGS.project, "--headless", *args], 250 stdin=stdin) 251 252 253def get_all_tested_commits(): 254 """Get details about the full commit range tested by this invocation.""" 255 head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1) 256 _, _, _, _, _, _, current_commit_date = head_info[0].split("\t") 257 258 query_earliest_included_commit = BQ_GET_EARLIEST_INCLUDED_COMMIT.format( 259 table=TABLE_NAME, 260 earlier_than_this_date=current_commit_date, 261 artifact_id=FLAGS.artifact_id, 262 team=FLAGS.team) 263 264 # --format=csv returns an empty string if no results, or else two lines: 265 # commit 266 # COMMIT_HASH 267 earliest_commit = bq(["query", "--format", "csv", "--nouse_legacy_sql"], 268 stdin=query_earliest_included_commit) 269 270 # Compute the commit/CL range since the last test 271 if earliest_commit: 272 273 earliest_commit = earliest_commit.splitlines()[-1] # Ignore CSV header 274 early_author_date, early_cl, early_commit_date = git_pretty( 275 earliest_commit, PRETTY_EARLY, n=1)[0].split("\t") 276 277 all_range = "{commit}..HEAD".format(commit=earliest_commit) 278 # Reversed: convert to chronological 279 all_commits = ",".join(reversed(git_pretty(all_range, PRETTY_COMMIT))) 280 all_changelists = ",".join(reversed(git_pretty(all_range, PRETTY_CL))) 281 282 return [ 283 earliest_commit, early_cl, early_author_date, early_commit_date, 284 all_commits, all_changelists 285 ] 286 287 # If the artifact has never been tracked before this commit 288 # Empty cells in CSV loads are loaded as NULL values 289 else: 290 return [""] * 6 291 292 293def get_upload_path(): 294 """Generate URL for 'gsutil cp'.""" 295 if FLAGS.upload and FLAGS.artifact: 296 artifact_filename = os.path.basename(FLAGS.artifact.name) 297 # 
    # Note: not os.path.join here, because gsutil is always linux-style.
    # Using a timestamp prevents duplicate entries.
    path = "{bucket}/{team}/{artifact_id}/{now}.{artifact_filename}".format(
        bucket=FLAGS.bucket,
        team=FLAGS.team,
        artifact_id=FLAGS.artifact_id,
        now=NOW,
        artifact_filename=artifact_filename)
    return path
  else:
    return ""


def build_row():
  """Assemble one row of data about this artifact."""
  (earliest_commit, early_cl, early_author_date, early_commit_date,
   all_commits, all_changelists) = get_all_tested_commits()

  # Use UTC to make sure machines in different timezones load consistent data
  current_time = datetime.datetime.now(datetime.timezone.utc).isoformat()
  artifact_filename = ("NO_FILE" if not FLAGS.artifact else os.path.basename(
      FLAGS.artifact.name))
  size_bytes = FLAGS.manual_bytes or os.path.getsize(FLAGS.artifact.name)
  head_info = git_pretty("HEAD", PRETTY_HEAD_INFO, n=1)
  all_head_info_items = head_info[0].split("\t")
  return [
      FLAGS.artifact_id,
      artifact_filename,
      *all_head_info_items,
      earliest_commit,
      early_cl,
      early_author_date,
      early_commit_date,
      all_commits,
      all_changelists,
      size_bytes,
      FLAGS.team,
      current_time,
      get_upload_path(),
      FLAGS.job,
      FLAGS.build_id,
  ]


def main():

  # Validate flags
  if FLAGS.print_schema:
    print(SCHEMA)
    exit(0)
  elif not FLAGS.team or not FLAGS.artifact_id or not (FLAGS.artifact or
                                                       FLAGS.manual_bytes):
    print(
        "--team and --artifact_id are required if --print_schema is not "
        "specified.\nYou must also specify one of --artifact or --manual_bytes."
        "\nPass -h or --help for usage.")
    exit(1)

  if not FLAGS.job:
    FLAGS.job = os.environ.get("KOKORO_JOB_NAME", "NO_JOB")
  if not FLAGS.build_id:
    FLAGS.build_id = os.environ.get("KOKORO_BUILD_ID", "NO_BUILD")

  # Generate data about this artifact into a Tab Separated Value file
  next_tsv_row = build_row()

  # Upload artifact into GCS if it exists
  if FLAGS.upload and FLAGS.artifact:
    upload_path = get_upload_path()
    if FLAGS.dry_run:
      print("DRY RUN: Would gsutil cp to:\n{}".format(upload_path))
    else:
      gcloud("gsutil", ["cp", FLAGS.artifact.name, upload_path])

  # Load into BigQuery
  if FLAGS.dry_run:
    print("DRY RUN: Generated this TSV row:")
    print("\t".join(map(str, next_tsv_row)))
  else:
    with open("data.tsv", "w", newline="") as tsvfile:
      writer = csv.writer(
          tsvfile,
          delimiter="\t",
          quoting=csv.QUOTE_MINIMAL,
          lineterminator=os.linesep)
      writer.writerow(next_tsv_row)
    bq([
        "load", "--source_format", "CSV", "--field_delimiter", "tab",
        PROJECT_LEVEL_TABLE_NAME, "data.tsv", SCHEMA
    ])


if __name__ == "__main__":
  main()
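
# Example invocation (illustrative only; the artifact path and artifact ID
# below are made-up placeholders, and tf-lite-team is the example team name
# from the --team help text). A dry run prints the generated TSV row without
# loading BigQuery or uploading to GCS:
#
#   python3 sizetrack_helper.py \
#     --team=tf-lite-team \
#     --artifact_id=example_wheel \
#     --artifact=/tmp/example_wheel.whl \
#     --dry_run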