#pylint: disable-msg=C0111

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Scheduler library classes.
"""

import collections
import logging

import common

from autotest_lib.frontend import setup_django_environment

from autotest_lib.client.common_lib import utils
from autotest_lib.frontend.afe import models
from autotest_lib.server.cros.dynamic_suite import constants
from autotest_lib.scheduler import scheduler_models
from autotest_lib.scheduler import scheduler_lib

try:
    from chromite.lib import metrics
except ImportError:
    metrics = utils.metrics_mock


_job_timer_name = 'chromeos/autotest/scheduler/job_query_durations/%s'


class AFEJobQueryManager(object):
    """Query manager for AFE Jobs."""

    # A subquery matching hostless entries, i.e. entries with neither a host
    # nor a metahost. It is AND-ed onto the pending-entries query below.
    hostless_query = 'host_id IS NULL AND meta_host IS NULL'


    @metrics.SecondsTimerDecorator(
            _job_timer_name % 'get_pending_queue_entries')
    def get_pending_queue_entries(self, only_hostless=False):
        """
        Fetch a list of new host queue entries.

        The ordering of this list is important, as every new agent
        we schedule can potentially contribute to the process count
        on the drone, which has a static limit. The sort order
        prioritizes jobs as follows:
        1. High priority jobs: Based on the afe_job's priority
        2. With hosts and metahosts: This will only happen if we don't
            activate the hqe after assigning a host to it in
            schedule_new_jobs.
        3. With hosts but without metahosts: When tests are scheduled
            through the frontend the owner of the job would have chosen
            a host for it.
        4. Without hosts but with metahosts: This is the common case of
            a new test that needs a DUT. We assign a host and set it to
            active so it shouldn't show up in case 2 on the next tick.
        5. Without hosts and without metahosts: Hostless suite jobs, that
            will result in new jobs that fall under category 4.

        A note about the ordering of cases 3 and 4:
        Prioritizing one case above the other leads to earlier acquisition
        of the following resources: 1. process slots on the drone 2. machines.
        - When a user schedules a job through the afe they choose a specific
          host for it. Jobs with metahost can utilize any host that satisfies
          the metahost criterion. This means that if we had scheduled 4 before
          3 there is a good chance that a job which could've used another host,
          will now use the host assigned to a metahost-less job. Given the
          availability of machines in pool:suites, this almost guarantees
          starvation for jobs scheduled through the frontend.
        - Scheduling 4 before 3 also has its pros however, since a suite
          has the concept of a time out, whereas users can wait. If we hit the
          process count on the drone a suite can timeout waiting on the test,
          but a user job generally has a much longer timeout, and relatively
          harmless consequences.
        The current ordering was chosen because it is more likely that we will
        run out of machines in pool:suites than processes on the drone.

        @param only_hostless: If True, restrict the result to entries that
                              match hostless_query (no host, no metahost).

        @returns A list of HQEs ordered according to sort_order.
        """
        sort_order = ('afe_jobs.priority DESC, '
                      'ISNULL(host_id), '
                      'ISNULL(meta_host), '
                      'parent_job_id, '
                      'job_id')
        # Don't execute jobs that should be executed by a shard in the global
        # scheduler.
        # This won't prevent the shard scheduler from running this, as the
        # shard db doesn't have an entry in afe_shards_labels.
        # Note the trailing space: the two literals are concatenated, and
        # without it the clause would read `status="Queued"AND NOT aborted`.
        query = ('NOT complete AND NOT active AND status="Queued" '
                 'AND NOT aborted AND afe_shards_labels.id IS NULL')

        # TODO(jakobjuelich, beeps): Optimize this query. Details:
        # Compressed output of EXPLAIN <query>:
        # +------------------------+--------+-------------------------+-------+
        # | table                  | type   | key                     | rows  |
        # +------------------------+--------+-------------------------+-------+
        # | afe_host_queue_entries | ref    | host_queue_entry_status | 30536 |
        # | afe_shards_labels      | ref    | shard_label_id_fk       | 1     |
        # | afe_jobs               | eq_ref | PRIMARY                 | 1     |
        # +------------------------+--------+-------------------------+-------+
        # This shows the first part of the query fetches a lot of objects, that
        # are then filtered. The joins are comparably fast: There's usually just
        # one or none shard mapping that can be answered fully using an index
        # (shard_label_id_fk), similar thing applies to the job.
        #
        # This works for now, but once O(#Jobs in shard) << O(#Jobs in Queued),
        # it might be more efficient to filter on the meta_host first, instead
        # of the status.
        if only_hostless:
            query = '%s AND (%s)' % (query, self.hostless_query)
        return list(scheduler_models.HostQueueEntry.fetch(
            joins=('INNER JOIN afe_jobs ON (job_id=afe_jobs.id) '
                   'LEFT JOIN afe_shards_labels ON ('
                   'meta_host=afe_shards_labels.label_id)'),
            where=query, order_by=sort_order))


    @metrics.SecondsTimerDecorator(
            _job_timer_name % 'get_prioritized_special_tasks')
    def get_prioritized_special_tasks(self, only_tasks_with_leased_hosts=False):
        """
        Returns all queued SpecialTasks prioritized for repair first, then
        cleanup, verify, reset and finally provision.

        Only tasks that are neither active nor complete, and whose host is
        not locked, are considered. Tasks whose host has an active queue
        entry are excluded unless the task belongs to that queue entry.

        @param only_tasks_with_leased_hosts: If true, this method only returns
            tasks with leased hosts.

        @return: list of afe.models.SpecialTasks sorted according to priority.
        """
        queued_tasks = models.SpecialTask.objects.filter(is_active=False,
                                                         is_complete=False,
                                                         host__locked=False)
        # exclude hosts with active queue entries unless the SpecialTask is for
        # that queue entry
        queued_tasks = models.SpecialTask.objects.add_join(
                queued_tasks, 'afe_host_queue_entries', 'host_id',
                join_condition='afe_host_queue_entries.active',
                join_from_key='host_id', force_left_join=True)
        queued_tasks = queued_tasks.extra(
                where=['(afe_host_queue_entries.id IS NULL OR '
                       'afe_host_queue_entries.id = '
                       'afe_special_tasks.queue_entry_id)'])
        if only_tasks_with_leased_hosts:
            queued_tasks = queued_tasks.filter(host__leased=True)

        # reorder tasks by priority; the position in this list is the sort
        # key, so a task type missing from it raises ValueError.
        task_priority_order = [models.SpecialTask.Task.REPAIR,
                               models.SpecialTask.Task.CLEANUP,
                               models.SpecialTask.Task.VERIFY,
                               models.SpecialTask.Task.RESET,
                               models.SpecialTask.Task.PROVISION]
        def task_priority_key(task):
            return task_priority_order.index(task.task)
        return sorted(queued_tasks, key=task_priority_key)


    @classmethod
    def get_overlapping_jobs(cls):
        """A helper method to get all active jobs using the same host.

        @return: A list of dictionaries with the hqe id, job_id and host_id
            of the currently overlapping jobs.
        """
        # Filter all active hqes and stand alone special tasks to make sure
        # a host isn't being used by two jobs at the same time. An incomplete
        # stand alone special task can share a host with an active hqe, an
        # example of this is the cleanup scheduled in gathering.
        hqe_hosts = list(models.HostQueueEntry.objects.filter(
                active=1, complete=0, host_id__isnull=False).values_list(
                'host_id', flat=True))
        special_task_hosts = list(models.SpecialTask.objects.filter(
                is_active=1, is_complete=0, host_id__isnull=False,
                queue_entry_id__isnull=True).values_list('host_id', flat=True))
        # A host id that shows up more than once across both lists is being
        # used by more than one active hqe / stand alone special task.
        host_counts = collections.Counter(hqe_hosts + special_task_hosts)
        multiple_hosts = [host_id for host_id, count in host_counts.items()
                          if count > 1]
        return list(models.HostQueueEntry.objects.filter(
                host_id__in=multiple_hosts, active=True).values(
                        'id', 'job_id', 'host_id'))


    @metrics.SecondsTimerDecorator(
            _job_timer_name % 'get_suite_host_assignment')
    def get_suite_host_assignment(self):
        """A helper method to get how many hosts each suite is holding.

        @return: Two dictionaries (suite_host_num, hosts_to_suites)
                 suite_host_num maps suite job id to number of hosts
                 holding by its child jobs.
                 hosts_to_suites contains current hosts held by
                 any suites, and maps the host id to its parent_job_id.
        """
        # Active, incomplete entries that hold a host and whose job has a
        # parent job (the suite).
        query = models.HostQueueEntry.objects.filter(
                host_id__isnull=False, complete=0, active=1,
                job__parent_job_id__isnull=False)
        suite_host_num = {}
        hosts_to_suites = {}
        for hqe in query:
            parent_job_id = hqe.job.parent_job_id
            suite_host_num[parent_job_id] = (
                    suite_host_num.get(parent_job_id, 0) + 1)
            hosts_to_suites[hqe.host_id] = parent_job_id
        return suite_host_num, hosts_to_suites


    @metrics.SecondsTimerDecorator(
            _job_timer_name % 'get_min_duts_of_suites')
    def get_min_duts_of_suites(self, suite_job_ids):
        """Load suite_min_duts job keyval for a set of suites.

        @param suite_job_ids: A set of suite job ids.

        @return: A dictionary where the key is a suite job id,
                 the value is the value of 'suite_min_duts', converted to
                 an int.
        """
        query = models.JobKeyval.objects.filter(
                job_id__in=suite_job_ids,
                key=constants.SUITE_MIN_DUTS_KEY, value__isnull=False)
        return {keyval.job_id: int(keyval.value) for keyval in query}


_host_timer_name = 'chromeos/autotest/scheduler/host_query_durations/%s'


class AFEHostQueryManager(object):
    """Query manager for AFE Hosts."""

    def __init__(self):
        """Create an AFEHostQueryManager.

        The database connection is obtained from
        scheduler_lib.ConnectionManager and kept in self._db.
        """
        self._db = scheduler_lib.ConnectionManager().get_connection()


    def _process_many2many_dict(self, rows, flip=False):
        """Group (left, right) id pairs into a dict of sets.

        @param rows: An iterable of rows whose first two columns are ids.
        @param flip: If True, swap the two columns before grouping.

        @return: A dict mapping each (int) left id to the set of (int)
                 right ids it was paired with.
        """
        result = {}
        for row in rows:
            left_id, right_id = int(row[0]), int(row[1])
            if flip:
                left_id, right_id = right_id, left_id
            result.setdefault(left_id, set()).add(right_id)
        return result


    def _get_sql_id_list(self, id_list):
        """Render ids as a comma separated string for an SQL IN clause.

        NOTE(review): the values are interpolated into SQL without escaping;
        presumably they are always integer ids read from the database --
        confirm against callers.
        """
        return ','.join(str(item_id) for item_id in id_list)


    def _get_many2many_dict(self, query, id_list, flip=False):
        """Run a two column query restricted to id_list and group the result.

        @param query: An SQL string with a single %s placeholder for the
                      comma separated id list.
        @param id_list: The ids to substitute into the query.
        @param flip: Passed through to _process_many2many_dict.

        @return: The grouped dict, or {} when id_list is empty (no query is
                 issued in that case).
        """
        if not id_list:
            return {}
        query %= self._get_sql_id_list(id_list)
        rows = self._db.execute(query)
        return self._process_many2many_dict(rows, flip)


    def _get_ready_hosts(self):
        """Fetch hosts that are not leased, not locked and Ready.

        @return: A dict mapping host id to host object.
        """
        # We don't lose anything by re-doing these checks
        # even though we release hosts on the same conditions.
        # In the future we might have multiple clients that
        # release_hosts and/or lock them independent of the
        # scheduler tick.
        hosts = scheduler_models.Host.fetch(
            where="NOT afe_hosts.leased "
                  "AND NOT afe_hosts.locked "
                  "AND (afe_hosts.status IS NULL "
                          "OR afe_hosts.status = 'Ready')")
        return {host.id: host for host in hosts}


    @metrics.SecondsTimerDecorator(_host_timer_name % 'get_job_acl_groups')
    def _get_job_acl_groups(self, job_ids):
        """Map each job id to the acl group ids of the job's owner."""
        query = """
        SELECT afe_jobs.id, afe_acl_groups_users.aclgroup_id
        FROM afe_jobs
        INNER JOIN afe_users ON afe_users.login = afe_jobs.owner
        INNER JOIN afe_acl_groups_users ON
                afe_acl_groups_users.user_id = afe_users.id
        WHERE afe_jobs.id IN (%s)
        """
        return self._get_many2many_dict(query, job_ids)


    def _get_job_ineligible_hosts(self, job_ids):
        """Map each job id to the host ids listed as ineligible for it."""
        query = """
        SELECT job_id, host_id
        FROM afe_ineligible_host_queues
        WHERE job_id IN (%s)
        """
        return self._get_many2many_dict(query, job_ids)


    @metrics.SecondsTimerDecorator(_host_timer_name % 'get_job_dependencies')
    def _get_job_dependencies(self, job_ids):
        """Map each job id to the label ids it depends on."""
        query = """
        SELECT job_id, label_id
        FROM afe_jobs_dependency_labels
        WHERE job_id IN (%s)
        """
        return self._get_many2many_dict(query, job_ids)


    def _get_host_acls(self, host_ids):
        """Map each host id to the acl group ids it belongs to."""
        query = """
        SELECT host_id, aclgroup_id
        FROM afe_acl_groups_hosts
        WHERE host_id IN (%s)
        """
        return self._get_many2many_dict(query, host_ids)


    def _get_label_hosts(self, host_ids):
        """Fetch the label <-> host mappings for the given hosts.

        @param host_ids: The host ids to look up.

        @return: A tuple (labels_to_hosts, hosts_to_labels) of dicts of
                 sets; both are empty when host_ids is empty.
        """
        if not host_ids:
            return {}, {}
        query = """
        SELECT label_id, host_id
        FROM afe_hosts_labels
        WHERE host_id IN (%s)
        """ % self._get_sql_id_list(host_ids)
        rows = self._db.execute(query)
        # The same rows are grouped twice, once per direction.
        labels_to_hosts = self._process_many2many_dict(rows)
        hosts_to_labels = self._process_many2many_dict(rows, flip=True)
        return labels_to_hosts, hosts_to_labels


    @classmethod
    def find_unused_healty_hosts(cls):
        """Get leased hosts that are currently unused and in the READY state.

        A host qualifies when it is leased, not locked, has a status of
        'Ready' (or NULL), and has neither an active queue entry nor an
        incomplete special task against it.

        @return: The host objects produced by scheduler_models.Host.fetch,
                 one for each unused healthy host.
        """
        # Avoid any host with a currently active queue entry against it.
        hqe_join = ('LEFT JOIN afe_host_queue_entries AS active_hqe '
                    'ON (afe_hosts.id = active_hqe.host_id AND '
                    'active_hqe.active)')

        # Avoid any host with a new special task against it. There are 2 cases
        # when an inactive but incomplete special task will not use the host
        # this tick: 1. When the host is locked 2. When an active hqe already
        # has special tasks for the same host. In both these cases this host
        # will not be in the ready hosts list anyway. In all other cases,
        # an incomplete special task will grab the host before a new job does
        # by assigning an agent to it.
        special_task_join = ('LEFT JOIN afe_special_tasks as new_tasks '
                             'ON (afe_hosts.id = new_tasks.host_id AND '
                             'new_tasks.is_complete=0)')

        return scheduler_models.Host.fetch(
            joins='%s %s' % (hqe_join, special_task_join),
            where="active_hqe.host_id IS NULL AND new_tasks.host_id IS NULL "
                  "AND afe_hosts.leased "
                  "AND NOT afe_hosts.locked "
                  "AND (afe_hosts.status IS NULL "
                          "OR afe_hosts.status = 'Ready')")


    @metrics.SecondsTimerDecorator(_host_timer_name % 'set_leased')
    def set_leased(self, leased_value, **kwargs):
        """Modify the leased bit on the hosts matching kwargs.

        @param leased_value: The True/False value of the leased column for
                             the matching hosts.
        @param kwargs: The filter arguments used to find matching hosts;
                       they are passed to models.Host.objects.filter.
        """
        logging.info('Setting leased = %s for the hosts that match %s',
                     leased_value, kwargs)
        models.Host.objects.filter(**kwargs).update(leased=leased_value)


    @metrics.SecondsTimerDecorator(_host_timer_name % 'get_labels')
    def _get_labels(self, job_dependencies):
        """
        Calculate a dict mapping label id to label object so that we don't
        frequently round trip to the database every time we need a label.

        @param job_dependencies: A dict mapping an integer job id to a list of
            integer label id's.  ie. {job_id: [label_id]}
        @return: A dict mapping an integer label id to a scheduler model label
            object.  ie. {label_id: label_object}

        """
        id_to_label = dict()
        # Pull all the labels on hosts we might look at
        host_labels = scheduler_models.Label.fetch(
                where="id IN (SELECT label_id FROM afe_hosts_labels)")
        id_to_label.update((label.id, label) for label in host_labels)
        # and pull all the labels on jobs we might look at.
        job_label_set = set()
        for job_deps in job_dependencies.values():
            job_label_set.update(job_deps)
        # On the rare/impossible chance that no jobs have any labels, we
        # can skip this.
        if job_label_set:
            job_string_label_list = ','.join(str(x) for x in job_label_set)
            job_labels = scheduler_models.Label.fetch(
                    where="id IN (%s)" % job_string_label_list)
            id_to_label.update((label.id, label) for label in job_labels)
        return id_to_label


    def refresh(self, pending_queue_entries):
        """Update the query manager.

        Cache information about a list of queue entries and eligible hosts
        from the database so clients can avoid expensive round trips during
        host acquisition.

        @param pending_queue_entries: A list of queue entries about which we
                need information.
        """
        self._hosts_available = self._get_ready_hosts()
        relevant_jobs = [queue_entry.job_id
                         for queue_entry in pending_queue_entries]
        self._job_acls = self._get_job_acl_groups(relevant_jobs)
        self._ineligible_hosts = self._get_job_ineligible_hosts(relevant_jobs)
        self._job_dependencies = self._get_job_dependencies(relevant_jobs)
        # Materialize the ids once; the same list feeds both host queries.
        host_ids = list(self._hosts_available)
        self._host_acls = self._get_host_acls(host_ids)
        self._label_hosts, self._host_labels = (
                self._get_label_hosts(host_ids))
        self._labels = self._get_labels(self._job_dependencies)