#!/usr/bin/python2 # Copyright (c) 2012 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. import argparse import datetime import logging import os import re import sys import time os.environ['DJANGO_SETTINGS_MODULE'] = 'frontend.settings' import common from autotest_lib.server import utils from django.db import connections, transaction # Format Appears as: [Date] [Time] - [Msg Level] - [Message] LOGGING_FORMAT = '%(asctime)s - %(levelname)s - %(message)s' # This regex makes sure the input is in the format of YYYY-MM-DD (2012-02-01) DATE_FORMAT_REGEX = ('^(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]' '|3[01])$') SELECT_CMD_FORMAT = """ SELECT %(table)s.%(primary_key)s FROM %(table)s WHERE %(table)s.%(time_column)s <= "%(date)s" """ SELECT_JOIN_CMD_FORMAT = """ SELECT %(table)s.%(primary_key)s FROM %(table)s INNER JOIN %(related_table)s ON %(table)s.%(foreign_key)s=%(related_table)s.%(related_primary_key)s WHERE %(related_table)s.%(time_column)s <= "%(date)s" """ SELECT_WITH_INDIRECTION_FORMAT = """ SELECT %(table)s.%(primary_key)s FROM %(table)s INNER JOIN %(indirection_table)s ON %(table)s.%(foreign_key)s = %(indirection_table)s.%(indirection_primary_key)s INNER JOIN %(related_table)s ON %(indirection_table)s.%(indirection_foreign_key)s = %(related_table)s.%(related_primary_key)s WHERE %(related_table)s.%(time_column)s <= "%(date)s" """ DELETE_ROWS_FORMAT = """ DELETE FROM %(table)s WHERE %(table)s.%(primary_key)s IN (%(rows)s) """ AFE_JOB_ID = 'afe_job_id' JOB_ID = 'job_id' JOB_IDX = 'job_idx' TEST_IDX = 'test_idx' # CAUTION: Make sure only the 'default' connection is used. Otherwise # db_cleanup may delete stuff from the global database, which is generally not # intended. cursor = connections['default'].cursor() # Globals for command line flag constants, for convenience. DRY_RUN = False STEP_SIZE = None LOAD_RATIO = 1.0 class ProgressBar(object): TEXT = "{:<40s} [{:<20s}] ({:>9d}/{:>9d})" def __init__(self, name, amount): self._name = name self._amount = amount self._cur = 0 def __enter__(self): return self def __exit__(self, a, b, c): sys.stdout.write('\n') sys.stdout.flush() def update(self, x): """ Advance the counter by `x`. @param x: An integer of how many more elements were processed. """ self._cur += x def show(self): """ Display the progress bar on the current line. Repeated invocations "update" the display. """ if self._amount == 0: barlen = 20 else: barlen = int(20 * self._cur / float(self._amount)) if barlen: bartext = '=' * (barlen-1) + '>' else: bartext = '' text = self.TEXT.format(self._name, bartext, self._cur, self._amount) sys.stdout.write('\r') sys.stdout.write(text) sys.stdout.flush() def grouper(iterable, n): """ Group the elements of `iterable` into groups of maximum size `n`. @param iterable: An iterable. @param n: Max size of returned groups. @returns: Yields iterables of size <= n. >>> grouper('ABCDEFG', 3) [['A', 'B', C'], ['D', 'E', 'F'], ['G']] """ args = [iter(iterable)] * n while True: lst = [] try: for itr in args: lst.append(next(itr)) yield lst except StopIteration: if lst: yield lst break def _delete_table_data_before_date(table_to_delete_from, primary_key, related_table, related_primary_key, date, foreign_key=None, time_column="started_time", indirection_table=None, indirection_primary_key=None, indirection_foreign_key=None): """ We want a delete statement that will only delete from one table while using a related table to find the rows to delete. An example mysql command: DELETE FROM tko_iteration_result USING tko_iteration_result INNER JOIN tko_tests WHERE tko_iteration_result.test_idx=tko_tests.test_idx AND tko_tests.started_time <= '2012-02-01'; There are also tables that require 2 joins to determine which rows we want to delete and we determine these rows by joining the table we want to delete from with an indirection table to the actual jobs table. @param table_to_delete_from: Table whose rows we want to delete. @param related_table: Table with the date information we are selecting by. @param foreign_key: Foreign key used in table_to_delete_from to reference the related table. If None, the primary_key is used. @param primary_key: Primary key in the related table. @param date: End date of the information we are trying to delete. @param time_column: Column that we want to use to compare the date to. @param indirection_table: Table we use to link the data we are trying to delete with the table with the date information. @param indirection_primary_key: Key we use to connect the indirection table to the table we are trying to delete rows from. @param indirection_foreign_key: Key we use to connect the indirection table to the table with the date information. """ if not foreign_key: foreign_key = primary_key if not related_table: # Deleting from a table directly. variables = dict(table=table_to_delete_from, primary_key=primary_key, time_column=time_column, date=date) sql = SELECT_CMD_FORMAT % variables elif not indirection_table: # Deleting using a single JOIN to get the date information. variables = dict(primary_key=primary_key, table=table_to_delete_from, foreign_key=foreign_key, related_table=related_table, related_primary_key=related_primary_key, time_column=time_column, date=date) sql = SELECT_JOIN_CMD_FORMAT % variables else: # There are cases where we need to JOIN 3 TABLES to determine the rows # we want to delete. variables = dict(primary_key=primary_key, table=table_to_delete_from, indirection_table=indirection_table, foreign_key=foreign_key, indirection_primary_key=indirection_primary_key, related_table=related_table, related_primary_key=related_primary_key, indirection_foreign_key=indirection_foreign_key, time_column=time_column, date=date) sql = SELECT_WITH_INDIRECTION_FORMAT % variables logging.debug('SQL: %s', sql) cursor.execute(sql, []) rows = [x[0] for x in cursor.fetchall()] logging.debug(rows) if not rows or rows == [None]: with ProgressBar(table_to_delete_from, 0) as pb: pb.show() logging.debug('Nothing to delete for %s', table_to_delete_from) return with ProgressBar(table_to_delete_from, len(rows)) as pb: for row_keys in grouper(rows, STEP_SIZE): variables['rows'] = ','.join([str(x) for x in row_keys]) sql = DELETE_ROWS_FORMAT % variables start = time.time() logging.debug('SQL: %s', sql) if not DRY_RUN: cursor.execute(sql, []) transaction.commit_unless_managed(using='default') end = time.time() pb.update(len(row_keys)) pb.show() if LOAD_RATIO != 1.0: assert 0 < LOAD_RATIO <= 1, ( 'Load ratio must be a fraction between 0 and 1.') time.sleep((end - start) / LOAD_RATIO) def _subtract_days(date, days_to_subtract): """ Return a date (string) that is 'days' before 'date' @param date: date (string) we are subtracting from. @param days_to_subtract: days (int) we are subtracting. """ date_obj = datetime.datetime.strptime(date, '%Y-%m-%d') difference = date_obj - datetime.timedelta(days=days_to_subtract) return difference.strftime('%Y-%m-%d') def _delete_all_data_before_date(date): """ Delete all the database data before a given date. This function focuses predominately on the data for jobs in tko_jobs. However not all jobs in afe_jobs are also in tko_jobs. Therefore we delete all the afe_job and foreign key relations prior to two days before date. Then we do the queries using tko_jobs and these tables to ensure all the related information is gone. Even though we are repeating deletes on these tables, the second delete will be quick and completely thorough in ensuring we clean up all the foreign key dependencies correctly. @param date: End date of the information we are trying to delete. @param step: Rows to delete per SQL query. """ # First cleanup all afe_job related data (prior to 2 days before date). # The reason for this is not all afe_jobs may be in tko_jobs. afe_date = _subtract_days(date, 2) logging.info('Cleaning up all afe_job data prior to %s.', afe_date) _delete_table_data_before_date('afe_aborted_host_queue_entries', 'queue_entry_id', 'afe_jobs', 'id', afe_date, time_column= 'created_on', foreign_key='queue_entry_id', indirection_table='afe_host_queue_entries', indirection_primary_key='id', indirection_foreign_key='job_id') _delete_table_data_before_date('afe_special_tasks', 'id', 'afe_jobs', 'id', afe_date, time_column='created_on', foreign_key='queue_entry_id', indirection_table='afe_host_queue_entries', indirection_primary_key='id', indirection_foreign_key='job_id') _delete_table_data_before_date('afe_host_queue_entries', 'id', 'afe_jobs', 'id', afe_date, time_column='created_on', foreign_key=JOB_ID) _delete_table_data_before_date('afe_job_keyvals', 'id', 'afe_jobs', 'id', afe_date, time_column='created_on', foreign_key=JOB_ID) _delete_table_data_before_date('afe_jobs_dependency_labels', 'id', 'afe_jobs', 'id', afe_date, time_column='created_on', foreign_key=JOB_ID) _delete_table_data_before_date('afe_jobs', 'id', None, None, afe_date, time_column='created_on') # Special tasks that aren't associated with an HQE # Since we don't do the queue_entry_id=NULL check, we might wipe out a bit # more than we should, but I doubt anyone will notice or care. _delete_table_data_before_date('afe_special_tasks', 'id', None, None, afe_date, time_column='time_requested') # Now go through and clean up all the rows related to tko_jobs prior to # date. logging.info('Cleaning up all data related to tko_jobs prior to %s.', date) _delete_table_data_before_date('tko_test_attributes', 'id', 'tko_tests', TEST_IDX, date, foreign_key=TEST_IDX) _delete_table_data_before_date('tko_test_labels_tests', 'id', 'tko_tests', TEST_IDX, date, foreign_key= 'test_id') _delete_table_data_before_date('tko_iteration_result', TEST_IDX, 'tko_tests', TEST_IDX, date) _delete_table_data_before_date('tko_iteration_perf_value', TEST_IDX, 'tko_tests', TEST_IDX, date) _delete_table_data_before_date('tko_iteration_attributes', TEST_IDX, 'tko_tests', TEST_IDX, date) _delete_table_data_before_date('tko_job_keyvals', 'id', 'tko_jobs', JOB_IDX, date, foreign_key='job_id') _delete_table_data_before_date('afe_aborted_host_queue_entries', 'queue_entry_id', 'tko_jobs', AFE_JOB_ID, date, foreign_key='queue_entry_id', indirection_table='afe_host_queue_entries', indirection_primary_key='id', indirection_foreign_key='job_id') _delete_table_data_before_date('afe_special_tasks', 'id', 'tko_jobs', AFE_JOB_ID, date, foreign_key='queue_entry_id', indirection_table='afe_host_queue_entries', indirection_primary_key='id', indirection_foreign_key='job_id') _delete_table_data_before_date('afe_host_queue_entries', 'id', 'tko_jobs', AFE_JOB_ID, date, foreign_key='job_id') _delete_table_data_before_date('afe_job_keyvals', 'id', 'tko_jobs', AFE_JOB_ID, date, foreign_key='job_id') _delete_table_data_before_date('afe_jobs_dependency_labels', 'id', 'tko_jobs', AFE_JOB_ID, date, foreign_key='job_id') _delete_table_data_before_date('afe_jobs', 'id', 'tko_jobs', AFE_JOB_ID, date, foreign_key='id') _delete_table_data_before_date('tko_tests', TEST_IDX, 'tko_jobs', JOB_IDX, date, foreign_key=JOB_IDX) _delete_table_data_before_date('tko_jobs', JOB_IDX, None, None, date) def parse_args(): """Parse command line arguments""" parser = argparse.ArgumentParser() parser.add_argument('-v', '--verbose', action='store_true', help='Print SQL commands and results') parser.add_argument('--step', type=int, action='store', default=1000, help='Number of rows to delete at once') parser.add_argument('--dry_run', action='store_true', help='Print SQL queries instead of executing them.') parser.add_argument('--load_ratio', type=float, action='store', default=0.2, help=('The fraction of time the script should be ' 'performing deletes. For example --load_ratio=.2 ' 'will cause the script to sleep 80% of the time, ' 'and perform work for the other 20%.')) parser.add_argument('date', help='Keep results newer than') return parser.parse_args() def main(): args = parse_args() verbose = args.verbose or args.dry_run level = logging.DEBUG if verbose else logging.INFO logging.basicConfig(level=level, format=LOGGING_FORMAT) logging.info('Calling: %s', sys.argv) if not re.match(DATE_FORMAT_REGEX, args.date): print 'DATE must be in yyyy-mm-dd format!' return global STEP_SIZE, DRY_RUN, LOAD_RATIO STEP_SIZE = args.step DRY_RUN = args.dry_run LOAD_RATIO = args.load_ratio _delete_all_data_before_date(args.date) if __name__ == '__main__': main()