1#!/usr/bin/python2 2# Copyright (c) 2012 The Chromium OS Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6import argparse 7import datetime 8import logging 9import os 10import re 11import sys 12import time 13 14os.environ['DJANGO_SETTINGS_MODULE'] = 'frontend.settings' 15 16import common 17from autotest_lib.server import utils 18from django.db import connections, transaction 19 20 21# Format Appears as: [Date] [Time] - [Msg Level] - [Message] 22LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s' 23# This regex makes sure the input is in the format of YYYY-MM-DD (2012-02-01) 24DATE_FORMAT_REGEX = ('^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]' 25 '|3[01])$') 26SELECT_CMD_FORMAT = """ 27SELECT %(table)s.%(primary_key)s FROM %(table)s 28WHERE %(table)s.%(time_column)s <= "%(date)s" 29""" 30SELECT_JOIN_CMD_FORMAT = """ 31SELECT %(table)s.%(primary_key)s FROM %(table)s 32INNER JOIN %(related_table)s 33 ON %(table)s.%(foreign_key)s=%(related_table)s.%(related_primary_key)s 34WHERE %(related_table)s.%(time_column)s <= "%(date)s" 35""" 36SELECT_WITH_INDIRECTION_FORMAT = """ 37SELECT %(table)s.%(primary_key)s FROM %(table)s 38INNER JOIN %(indirection_table)s 39 ON %(table)s.%(foreign_key)s = 40 %(indirection_table)s.%(indirection_primary_key)s 41INNER JOIN %(related_table)s 42 ON %(indirection_table)s.%(indirection_foreign_key)s = 43 %(related_table)s.%(related_primary_key)s 44WHERE %(related_table)s.%(time_column)s <= "%(date)s" 45""" 46DELETE_ROWS_FORMAT = """ 47DELETE FROM %(table)s 48WHERE %(table)s.%(primary_key)s IN (%(rows)s) 49""" 50 51 52AFE_JOB_ID = 'afe_job_id' 53JOB_ID = 'job_id' 54JOB_IDX = 'job_idx' 55TEST_IDX = 'test_idx' 56 57# CAUTION: Make sure only the 'default' connection is used. Otherwise 58# db_cleanup may delete stuff from the global database, which is generally not 59# intended. 60cursor = connections['default'].cursor() 61 62# Globals for command line flag constants, for convenience. 63DRY_RUN = False 64STEP_SIZE = None 65LOAD_RATIO = 1.0 66 67class ProgressBar(object): 68 TEXT = "{:<40s} [{:<20s}] ({:>9d}/{:>9d})" 69 70 def __init__(self, name, amount): 71 self._name = name 72 self._amount = amount 73 self._cur = 0 74 75 def __enter__(self): 76 return self 77 78 def __exit__(self, a, b, c): 79 sys.stdout.write('\n') 80 sys.stdout.flush() 81 82 def update(self, x): 83 """ 84 Advance the counter by `x`. 85 86 @param x: An integer of how many more elements were processed. 87 """ 88 self._cur += x 89 90 def show(self): 91 """ 92 Display the progress bar on the current line. Repeated invocations 93 "update" the display. 94 """ 95 if self._amount == 0: 96 barlen = 20 97 else: 98 barlen = int(20 * self._cur / float(self._amount)) 99 if barlen: 100 bartext = '=' * (barlen-1) + '>' 101 else: 102 bartext = '' 103 text = self.TEXT.format(self._name, bartext, self._cur, self._amount) 104 sys.stdout.write('\r') 105 sys.stdout.write(text) 106 sys.stdout.flush() 107 108 109def grouper(iterable, n): 110 """ 111 Group the elements of `iterable` into groups of maximum size `n`. 112 113 @param iterable: An iterable. 114 @param n: Max size of returned groups. 115 @returns: Yields iterables of size <= n. 116 117 >>> grouper('ABCDEFG', 3) 118 [['A', 'B', C'], ['D', 'E', 'F'], ['G']] 119 """ 120 args = [iter(iterable)] * n 121 while True: 122 lst = [] 123 try: 124 for itr in args: 125 lst.append(next(itr)) 126 yield lst 127 except StopIteration: 128 if lst: 129 yield lst 130 break 131 132 133def _delete_table_data_before_date(table_to_delete_from, primary_key, 134 related_table, related_primary_key, 135 date, foreign_key=None, 136 time_column="started_time", 137 indirection_table=None, 138 indirection_primary_key=None, 139 indirection_foreign_key=None): 140 """ 141 We want a delete statement that will only delete from one table while 142 using a related table to find the rows to delete. 143 144 An example mysql command: 145 DELETE FROM tko_iteration_result USING tko_iteration_result INNER JOIN 146 tko_tests WHERE tko_iteration_result.test_idx=tko_tests.test_idx AND 147 tko_tests.started_time <= '2012-02-01'; 148 149 There are also tables that require 2 joins to determine which rows we want 150 to delete and we determine these rows by joining the table we want to 151 delete from with an indirection table to the actual jobs table. 152 153 @param table_to_delete_from: Table whose rows we want to delete. 154 @param related_table: Table with the date information we are selecting by. 155 @param foreign_key: Foreign key used in table_to_delete_from to reference 156 the related table. If None, the primary_key is used. 157 @param primary_key: Primary key in the related table. 158 @param date: End date of the information we are trying to delete. 159 @param time_column: Column that we want to use to compare the date to. 160 @param indirection_table: Table we use to link the data we are trying to 161 delete with the table with the date information. 162 @param indirection_primary_key: Key we use to connect the indirection table 163 to the table we are trying to delete rows 164 from. 165 @param indirection_foreign_key: Key we use to connect the indirection table 166 to the table with the date information. 167 """ 168 if not foreign_key: 169 foreign_key = primary_key 170 171 if not related_table: 172 # Deleting from a table directly. 173 variables = dict(table=table_to_delete_from, primary_key=primary_key, 174 time_column=time_column, date=date) 175 sql = SELECT_CMD_FORMAT % variables 176 elif not indirection_table: 177 # Deleting using a single JOIN to get the date information. 178 variables = dict(primary_key=primary_key, table=table_to_delete_from, 179 foreign_key=foreign_key, related_table=related_table, 180 related_primary_key=related_primary_key, 181 time_column=time_column, date=date) 182 sql = SELECT_JOIN_CMD_FORMAT % variables 183 else: 184 # There are cases where we need to JOIN 3 TABLES to determine the rows 185 # we want to delete. 186 variables = dict(primary_key=primary_key, table=table_to_delete_from, 187 indirection_table=indirection_table, 188 foreign_key=foreign_key, 189 indirection_primary_key=indirection_primary_key, 190 related_table=related_table, 191 related_primary_key=related_primary_key, 192 indirection_foreign_key=indirection_foreign_key, 193 time_column=time_column, date=date) 194 sql = SELECT_WITH_INDIRECTION_FORMAT % variables 195 196 logging.debug('SQL: %s', sql) 197 cursor.execute(sql, []) 198 rows = [x[0] for x in cursor.fetchall()] 199 logging.debug(rows) 200 201 if not rows or rows == [None]: 202 with ProgressBar(table_to_delete_from, 0) as pb: 203 pb.show() 204 logging.debug('Nothing to delete for %s', table_to_delete_from) 205 return 206 207 with ProgressBar(table_to_delete_from, len(rows)) as pb: 208 for row_keys in grouper(rows, STEP_SIZE): 209 variables['rows'] = ','.join([str(x) for x in row_keys]) 210 sql = DELETE_ROWS_FORMAT % variables 211 212 start = time.time() 213 logging.debug('SQL: %s', sql) 214 if not DRY_RUN: 215 cursor.execute(sql, []) 216 transaction.commit_unless_managed(using='default') 217 end = time.time() 218 219 pb.update(len(row_keys)) 220 pb.show() 221 222 if LOAD_RATIO != 1.0: 223 assert 0 < LOAD_RATIO <= 1, ( 224 'Load ratio must be a fraction between 0 and 1.') 225 time.sleep((end - start) / LOAD_RATIO) 226 227 228def _subtract_days(date, days_to_subtract): 229 """ 230 Return a date (string) that is 'days' before 'date' 231 232 @param date: date (string) we are subtracting from. 233 @param days_to_subtract: days (int) we are subtracting. 234 """ 235 date_obj = datetime.datetime.strptime(date, '%Y-%m-%d') 236 difference = date_obj - datetime.timedelta(days=days_to_subtract) 237 return difference.strftime('%Y-%m-%d') 238 239 240def _delete_all_data_before_date(date): 241 """ 242 Delete all the database data before a given date. 243 244 This function focuses predominately on the data for jobs in tko_jobs. 245 However not all jobs in afe_jobs are also in tko_jobs. 246 247 Therefore we delete all the afe_job and foreign key relations prior to two 248 days before date. Then we do the queries using tko_jobs and these 249 tables to ensure all the related information is gone. Even though we are 250 repeating deletes on these tables, the second delete will be quick and 251 completely thorough in ensuring we clean up all the foreign key 252 dependencies correctly. 253 254 @param date: End date of the information we are trying to delete. 255 @param step: Rows to delete per SQL query. 256 """ 257 # First cleanup all afe_job related data (prior to 2 days before date). 258 # The reason for this is not all afe_jobs may be in tko_jobs. 259 afe_date = _subtract_days(date, 2) 260 logging.info('Cleaning up all afe_job data prior to %s.', afe_date) 261 _delete_table_data_before_date('afe_aborted_host_queue_entries', 262 'queue_entry_id', 263 'afe_jobs', 'id', afe_date, 264 time_column= 'created_on', 265 foreign_key='queue_entry_id', 266 indirection_table='afe_host_queue_entries', 267 indirection_primary_key='id', 268 indirection_foreign_key='job_id') 269 _delete_table_data_before_date('afe_special_tasks', 'id', 270 'afe_jobs', 'id', 271 afe_date, time_column='created_on', 272 foreign_key='queue_entry_id', 273 indirection_table='afe_host_queue_entries', 274 indirection_primary_key='id', 275 indirection_foreign_key='job_id') 276 _delete_table_data_before_date('afe_host_queue_entries', 'id', 277 'afe_jobs', 'id', 278 afe_date, time_column='created_on', 279 foreign_key=JOB_ID) 280 _delete_table_data_before_date('afe_job_keyvals', 'id', 281 'afe_jobs', 'id', 282 afe_date, time_column='created_on', 283 foreign_key=JOB_ID) 284 _delete_table_data_before_date('afe_jobs_dependency_labels', 'id', 285 'afe_jobs', 'id', 286 afe_date, time_column='created_on', 287 foreign_key=JOB_ID) 288 _delete_table_data_before_date('afe_jobs', 'id', 289 None, None, 290 afe_date, time_column='created_on') 291 # Special tasks that aren't associated with an HQE 292 # Since we don't do the queue_entry_id=NULL check, we might wipe out a bit 293 # more than we should, but I doubt anyone will notice or care. 294 _delete_table_data_before_date('afe_special_tasks', 'id', 295 None, None, 296 afe_date, time_column='time_requested') 297 298 # Now go through and clean up all the rows related to tko_jobs prior to 299 # date. 300 logging.info('Cleaning up all data related to tko_jobs prior to %s.', 301 date) 302 _delete_table_data_before_date('tko_test_attributes', 'id', 303 'tko_tests', TEST_IDX, 304 date, foreign_key=TEST_IDX) 305 _delete_table_data_before_date('tko_test_labels_tests', 'id', 306 'tko_tests', TEST_IDX, 307 date, foreign_key= 'test_id') 308 _delete_table_data_before_date('tko_iteration_result', TEST_IDX, 309 'tko_tests', TEST_IDX, 310 date) 311 _delete_table_data_before_date('tko_iteration_perf_value', TEST_IDX, 312 'tko_tests', TEST_IDX, 313 date) 314 _delete_table_data_before_date('tko_iteration_attributes', TEST_IDX, 315 'tko_tests', TEST_IDX, 316 date) 317 _delete_table_data_before_date('tko_job_keyvals', 'id', 318 'tko_jobs', JOB_IDX, 319 date, foreign_key='job_id') 320 _delete_table_data_before_date('afe_aborted_host_queue_entries', 321 'queue_entry_id', 322 'tko_jobs', AFE_JOB_ID, date, 323 foreign_key='queue_entry_id', 324 indirection_table='afe_host_queue_entries', 325 indirection_primary_key='id', 326 indirection_foreign_key='job_id') 327 _delete_table_data_before_date('afe_special_tasks', 'id', 328 'tko_jobs', AFE_JOB_ID, 329 date, foreign_key='queue_entry_id', 330 indirection_table='afe_host_queue_entries', 331 indirection_primary_key='id', 332 indirection_foreign_key='job_id') 333 _delete_table_data_before_date('afe_host_queue_entries', 'id', 334 'tko_jobs', AFE_JOB_ID, 335 date, foreign_key='job_id') 336 _delete_table_data_before_date('afe_job_keyvals', 'id', 337 'tko_jobs', AFE_JOB_ID, 338 date, foreign_key='job_id') 339 _delete_table_data_before_date('afe_jobs_dependency_labels', 'id', 340 'tko_jobs', AFE_JOB_ID, 341 date, foreign_key='job_id') 342 _delete_table_data_before_date('afe_jobs', 'id', 343 'tko_jobs', AFE_JOB_ID, 344 date, foreign_key='id') 345 _delete_table_data_before_date('tko_tests', TEST_IDX, 346 'tko_jobs', JOB_IDX, 347 date, foreign_key=JOB_IDX) 348 _delete_table_data_before_date('tko_jobs', JOB_IDX, 349 None, None, date) 350 351 352def parse_args(): 353 """Parse command line arguments""" 354 parser = argparse.ArgumentParser() 355 parser.add_argument('-v', '--verbose', action='store_true', 356 help='Print SQL commands and results') 357 parser.add_argument('--step', type=int, action='store', 358 default=1000, 359 help='Number of rows to delete at once') 360 parser.add_argument('--dry_run', action='store_true', 361 help='Print SQL queries instead of executing them.') 362 parser.add_argument('--load_ratio', type=float, action='store', default=0.2, 363 help=('The fraction of time the script should be ' 364 'performing deletes. For example --load_ratio=.2 ' 365 'will cause the script to sleep 80% of the time, ' 366 'and perform work for the other 20%.')) 367 parser.add_argument('date', help='Keep results newer than') 368 return parser.parse_args() 369 370 371def main(): 372 args = parse_args() 373 374 verbose = args.verbose or args.dry_run 375 level = logging.DEBUG if verbose else logging.INFO 376 logging.basicConfig(level=level, format=LOGGING_FORMAT) 377 logging.info('Calling: %s', sys.argv) 378 379 if not re.match(DATE_FORMAT_REGEX, args.date): 380 print 'DATE must be in yyyy-mm-dd format!' 381 return 382 383 global STEP_SIZE, DRY_RUN, LOAD_RATIO 384 STEP_SIZE = args.step 385 DRY_RUN = args.dry_run 386 LOAD_RATIO = args.load_ratio 387 388 _delete_all_data_before_date(args.date) 389 390 391if __name__ == '__main__': 392 main() 393