autotest/site_utils/collect_suite_time_stats.py

#!/usr/bin/python
#
# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.


"""Script to calculate timing stats for suites.

This script measures nine stats for a suite run.
1. Net suite runtime.
2. Suite scheduling overhead.
3. Average scheduling overhead.
4. Average Queuing time.
5. Average Resetting time.
6. Average provisioning time.
7. Average Running time.
8. Average Parsing time.
9. Average Gathering time.

When the cron_mode is enabled, this script throws all stats but the first one
(Net suite runtime) to Graphite because the first one is already
being sent to Graphite by Autotest online.

Net suite runtime is end-to-end time for a suite from the beginning
to the end.
It is stored in a field, "duration", of a type, "suite_runtime" in
elasticsearch (ES).

Suite scheduling overhead is defined by the average of DUT overheads.
Suite is composed of one or more jobs, and those jobs are run on
one or more DUTs that are available.
A DUT overhead is defined by:
    DUT_i overhead = sum(net time for job_k - runtime for job_k
                         - runtime for special tasks of job_k)
    Job_k are the jobs run on DUT_i.

Net time for a job is the time from job_queued_time to hqe_finished_time.
job_queued_time is stored in the "queued_time" column of "tko_jobs" table.
hqe_finished_time is stored in the "finished_on" of "afe_host_queue_entries"
table.
We do not use "job_finished_time" of "tko_jobs" as job_finished_time is
recorded before gathering/parsing/archiving.
We do not use hqe started time ("started_on" of "afe_host_queue_entries"),
as it does not account for the lag from a host is assigned to the job till
the scheduler sees the assignment.

Runtime for job_k is the sum of durations for the records of
"job_time_breakdown" type in ES that have "Queued" or "Running" status.
It is possible that a job has multiple "Queued" records when the job's test
failed and tried again.
We take into account only the last "Queued" record.

Runtime for special tasks of job_k is the sum of durations for the records
of "job_time_breakdown" type in ES that have "Resetting", "Provisioning",
"Gathering", or "Parsing" status.
We take into account only the records whose timestamp is larger than
the timestamp of the last "Queued" record.
"""

import argparse
from datetime import datetime
from datetime import timedelta

import common
from autotest_lib.client.common_lib import host_queue_entry_states
from autotest_lib.client.common_lib import time_utils
from autotest_lib.client.common_lib.cros.graphite import autotest_es
from autotest_lib.client.common_lib.cros.graphite import autotest_stats
from autotest_lib.frontend import setup_django_environment
from autotest_lib.frontend.afe import models
from autotest_lib.frontend.tko import models as tko_models
from autotest_lib.server import utils
from autotest_lib.site_utils import job_overhead


_options = None

_hqes = host_queue_entry_states.Status
_states = [
        _hqes.QUEUED, _hqes.RESETTING, _hqes.PROVISIONING,
        _hqes.RUNNING, _hqes.GATHERING, _hqes.PARSING]


def mean(l):
    """
    Calculates an Arithmetic Mean for the numbers in a list.

    @param l: A list of numbers.
    @return: Arithmetic mean if the list is not empty.
             Otherwise, returns zero.
    """
    return float(sum(l)) / len(l) if l else 0


def print_verbose(string, *args):
    if _options.verbose:
        print(string % args)


def get_nontask_runtime(job_id, dut, job_info_dict):
    """
    Get sum of durations for "Queued", "Running", "Parsing", and "Gathering"
    status records.
    job_info_dict will be modified in this function to store the duration
    for each status.

    @param job_id: The job id of interest.
    @param dut: Hostname of a DUT that the job ran on.
    @param job_info_dict: Dictionary that has information for jobs.
    @return: Tuple of sum of durations and the timestamp for the last
             Queued record.
    """
    results = autotest_es.query(
            fields_returned=['status', 'duration', 'time_recorded'],
            equality_constraints=[('_type', 'job_time_breakdown'),
                                  ('job_id', job_id),
                                  ('hostname', dut)],
            sort_specs=[{'time_recorded': 'desc'}])

    sum = 0
    last_queued_timestamp = 0
    # There could be multiple "Queued" records.
    # Get sum of durations for the records after the last "Queued" records
    # (including the last "Queued" record).
    # Exploits the fact that "results" are ordered in the descending order
    # of time_recorded.
    for hit in results.hits:
        job_info_dict[job_id][hit['status']] = float(hit['duration'])
        if hit['status'] == 'Queued':
            # The first Queued record is the last one because of the descending
            # order of "results".
            last_queued_timestamp = float(hit['time_recorded'])
            sum += float(hit['duration'])
            break
        else:
            sum += float(hit['duration'])
    return (sum, last_queued_timestamp)


def get_tasks_runtime(task_list, dut, t_start, job_id, job_info_dict):
    """
    Get sum of durations for special tasks.
    job_info_dict will be modified in this function to store the duration
    for each special task.

    @param task_list: List of task id.
    @param dut: Hostname of a DUT that the tasks ran on.
    @param t_start: Beginning timestamp.
    @param job_id: The job id that is related to the tasks.
                   This is used only for debugging purpose.
    @param job_info_dict: Dictionary that has information for jobs.
    @return: Sum of durations of the tasks.
    """
    t_start_epoch = time_utils.to_epoch_time(t_start)
    results = autotest_es.query(
            fields_returned=['status', 'task_id', 'duration'],
            equality_constraints=[('_type', 'job_time_breakdown'),
                                  ('hostname', dut)],
            range_constraints=[('time_recorded', t_start_epoch, None)],
            batch_constraints=[('task_id', task_list)])
    sum = 0
    for hit in results.hits:
        sum += float(hit['duration'])
        job_info_dict[job_id][hit['status']] = float(hit['duration'])
        print_verbose('Task %s for Job %s took %s',
                      hit['task_id'], job_id, hit['duration'])
    return sum


def get_job_runtime(job_id, dut, job_info_dict):
    """
    Get sum of durations for the entries that are related to a job.
    job_info_dict will be modified in this function.

    @param job_id: The job id of interest.
    @param dut: Hostname of a DUT that the job ran on.
    @param job_info_dict: Dictionary that has information for jobs.
    @return: Total duration taken by a job.
    """
    sum, t_last_queued = get_nontask_runtime(job_id, dut, job_info_dict)
    print_verbose('Job %s took %f, last Queued: %s',
                  job_id, sum, t_last_queued)
    sum += get_tasks_runtime(
            list(job_info_dict[job_id]['tasks']), dut, t_last_queued,
            job_id, job_info_dict)
    return sum


def get_dut_overhead(dut, jobs, job_info_dict):
    """
    Calculates the scheduling overhead of a DUT.

    The scheduling overhead of a DUT is defined by the sum of scheduling
    overheads for the jobs that ran on the DUT.
    The scheduling overhead for a job is defined by the difference
    of net job runtime and real job runtime.
    job_info_dict will be modified in this function.

    @param dut: Hostname of a DUT.
    @param jobs: The list of jobs that ran on the DUT.
    @param job_info_dict: Dictionary that has information for jobs.
    @return: Scheduling overhead of a DUT in a floating point value.
             The unit is a second.
    """
    overheads = []
    for job_id in jobs:
        (t_start, t_end) = job_info_dict[job_id]['timestamps']
        runtime = get_job_runtime(job_id, dut, job_info_dict)
        overheads.append(t_end - t_start - runtime)
        print_verbose('Job: %s, Net runtime: %f, Real runtime: %f, '
                      'Overhead: %f', job_id, t_end - t_start, runtime,
                      t_end - t_start - runtime)
    return sum(overheads)


def get_child_jobs_info(suite_job_id, num_child_jobs, sanity_check):
    """
    Gets information about child jobs of a suite.

    @param suite_job_id: Job id of a suite.
    @param num_child_jobs: Number of child jobs of the suite.
    @param sanity_check: Do sanity check if True.
    @return: A tuple of (dictionary, list). For dictionary, the key is
             a DUT's hostname and the value is a list of jobs that ran on
             the DUT. List is the list of all jobs of the suite.
    """
    results = autotest_es.query(
            fields_returned=['job_id', 'hostname'],
            equality_constraints=[('_type', 'host_history'),
                                  ('parent_job_id', suite_job_id),
                                  ('status', 'Running'),])

    dut_jobs_dict = {}
    job_filter = set()
    for hit in results.hits:
        job_id = hit['job_id']
        dut = hit['hostname']
        if job_id in job_filter:
            continue
        job_list = dut_jobs_dict.setdefault(dut, [])
        job_list.append(job_id)
        job_filter.add(job_id)

    if sanity_check and len(job_filter) != num_child_jobs:
        print('WARNING: Mismatch number of child jobs of a suite (%d): '
              '%d != %d' % (suite_job_id, len(job_filter), num_child_jobs))
    return dut_jobs_dict, list(job_filter)


def get_job_timestamps(job_list, job_info_dict):
    """
    Get beginning time and ending time for each job.

    The beginning time of a job is "queued_time" of "tko_jobs" table.
    The ending time of a job is "finished_on" of "afe_host_queue_entries" table.
    job_info_dict will be modified in this function to store the timestamps.

    @param job_list: List of job ids
    @param job_info_dict: Dictionary that timestamps for each job will be stored
    """
    tko = tko_models.Job.objects.filter(afe_job_id__in=job_list)
    hqe = models.HostQueueEntry.objects.filter(job_id__in=job_list)
    job_start = {}
    for t in tko:
        job_start[t.afe_job_id] = time_utils.to_epoch_time(t.queued_time)
    job_end = {}
    for h in hqe:
        job_end[h.job_id] = time_utils.to_epoch_time(h.finished_on)

    for job_id in job_list:
        info_dict = job_info_dict.setdefault(job_id, {})
        info_dict.setdefault('timestamps', (job_start[job_id], job_end[job_id]))


def get_job_tasks(job_list, job_info_dict):
    """
    Get task ids for each job.
    job_info_dict will be modified in this function to store the task ids.

    @param job_list: List of job ids
    @param job_info_dict: Dictionary that task ids for each job will be stored.
    """
    results = autotest_es.query(
            fields_returned=['job_id', 'task_id'],
            equality_constraints=[('_type', 'host_history')],
            batch_constraints=[('job_id', job_list)])
    for hit in results.hits:
        if 'task_id' in hit:
            info_dict = job_info_dict.setdefault(hit['job_id'], {})
            task_set = info_dict.setdefault('tasks', set())
            task_set.add(hit['task_id'])


def get_scheduling_overhead(suite_job_id, num_child_jobs, sanity_check=True):
    """
    Calculates a scheduling overhead.

    A scheduling overhead is defined by the average of DUT overheads
    for the DUTs that the child jobs of a suite ran on.

    @param suite_job_id: Job id of a suite.
    @param num_child_jobs: Number of child jobs of the suite.
    @param sanity_check: Do sanity check if True.
    @return: Dictionary storing stats.
    """
    dut_jobs_dict, job_list = get_child_jobs_info(
            suite_job_id, num_child_jobs, sanity_check)
    job_info_dict = {}
    get_job_timestamps(job_list, job_info_dict)
    get_job_tasks(job_list, job_info_dict)

    dut_overheads = []
    avg_overhead = 0
    for dut, jobs in dut_jobs_dict.iteritems():
        print_verbose('Dut: %s, Jobs: %s', dut, jobs)
        overhead = get_dut_overhead(dut, jobs, job_info_dict)
        avg_overhead += overhead
        print_verbose('Dut overhead: %f', overhead)
        dut_overheads.append(overhead)

    if job_list:
        avg_overhead = avg_overhead / len(job_list)

    state_samples_dict = {}
    for info in job_info_dict.itervalues():
        for state in _states:
            if state in info:
                samples = state_samples_dict.setdefault(state, [])
                samples.append(info[state])

    if state_samples_dict:
        result = {state: mean(state_samples_dict[state])
                  if state in state_samples_dict else 0
                  for state in _states}
    result['suite_overhead'] = mean(dut_overheads)
    result['overhead'] = avg_overhead
    result['num_duts'] = len(dut_jobs_dict)
    return result


def print_suite_stats(suite_stats):
    """Prints out statistics for a suite to standard output."""
    print('suite_overhead: %(suite_overhead)f, overhead: %(overhead)f,' %
          suite_stats),
    for state in _states:
        if state in suite_stats:
            print('%s: %f,' % (state, suite_stats[state])),
    print('num_duts: %(num_duts)d' % suite_stats)


def analyze_suites(start_time, end_time):
    """
    Calculates timing stats (i.e., suite runtime, scheduling overhead)
    for the suites that finished within the timestamps given by parameters.

    @param start_time: Beginning timestamp.
    @param end_time: Ending timestamp.
    """
    print('Analyzing suites from %s to %s...' % (
          time_utils.epoch_time_to_date_string(start_time),
          time_utils.epoch_time_to_date_string(end_time)))

    if _options.bvtonly:
        batch_constraints = [
                ('suite_name', ['bvt-inline', 'bvt-cq', 'bvt-perbuild'])]
    else:
        batch_constraints = []

    start_time_epoch = time_utils.to_epoch_time(start_time)
    end_time_epoch = time_utils.to_epoch_time(end_time)
    results = autotest_es.query(
            fields_returned=['suite_name', 'suite_job_id', 'board', 'build',
                             'num_child_jobs', 'duration'],
            equality_constraints=[('_type', job_overhead.SUITE_RUNTIME_KEY),],
            range_constraints=[('time_recorded', start_time_epoch,
                                end_time_epoch)],
            sort_specs=[{'time_recorded': 'asc'}],
            batch_constraints=batch_constraints)
    print('Found %d suites' % (results.total))

    for hit in results.hits:
        suite_job_id = hit['suite_job_id']

        try:
            suite_name = hit['suite_name']
            num_child_jobs = int(hit['num_child_jobs'])
            suite_runtime = float(hit['duration'])

            print('Suite: %s (%s), Board: %s, Build: %s, Num child jobs: %d' % (
                    suite_name, suite_job_id, hit['board'], hit['build'],
                    num_child_jobs))

            suite_stats = get_scheduling_overhead(suite_job_id, num_child_jobs)
            print('Suite: %s (%s) runtime: %f,' % (
                    suite_name, suite_job_id, suite_runtime)),
            print_suite_stats(suite_stats)

            if _options.cron_mode:
                key = utils.get_data_key(
                        'suite_time_stats', suite_name, hit['build'],
                        hit['board'])
                autotest_stats.Timer(key).send('suite_runtime', suite_runtime)
                for stat, val in suite_stats.iteritems():
                    autotest_stats.Timer(key).send(stat, val)
        except Exception as e:
            print('ERROR: Exception is raised while processing suite %s' % (
                    suite_job_id))
            print e


def analyze_suite(suite_job_id):
    suite_stats = get_scheduling_overhead(suite_job_id, 0, False)
    print('Suite (%s)' % suite_job_id),
    print_suite_stats(suite_stats)


def main():
    """main script."""
    parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-c', dest='cron_mode', action='store_true',
                        help=('Run in a cron mode. Cron mode '
                              'sends calculated stat data to Graphite.'),
                        default=False)
    parser.add_argument('-s', type=int, dest='span',
                        help=('Number of hours that stats should be '
                              'collected.'),
                        default=1)
    parser.add_argument('--bvtonly', dest='bvtonly', action='store_true',
                        help=('Gets bvt suites only (i.e., bvt-inline,'
                              'bvt-cq, bvt-perbuild).'),
                        default=False)
    parser.add_argument('--suite', type=int, dest='suite_job_id',
                        help=('Job id of a suite.'))
    parser.add_argument('--verbose', dest='verbose', action='store_true',
                        help=('Prints out more info if True.'),
                        default=False)
    global _options
    _options = parser.parse_args()

    if _options.suite_job_id:
        analyze_suite(_options.suite_job_id)
    else:
        end_time = time_utils.to_epoch_time(datetime.now())
        start_time = end_time - timedelta(hours=_options.span).total_seconds()
        analyze_suites(start_time, end_time)


if __name__ == '__main__':
    main()