1#!/usr/bin/python 2 3# Copyright (c) 2014 The Chromium OS Authors. All rights reserved. 4# Use of this source code is governed by a BSD-style license that can be 5# found in the LICENSE file. 6 7""" 8This script crawls crbug. Sort-of. 9Invocation: 10 Get all bugs with labels, strings (in summary and/or comments): 11 crbug_crawler.py --labels 'one two three' 12 --queries '"first query" "second query"' 13 14 Get baddest open bugs of all time: 15 crbug_crawler.py --reap 16 17Tips: 18 - Label based queries will return faster than text queries. 19 - contrib/crbug_shell.py is a wrapper that allows you to incrementally 20 filter search results using this script. 21""" 22 23import argparse 24import cmd 25import logging 26import sys 27import shlex 28 29import common 30from autotest_lib.client.common_lib import global_config 31from autotest_lib.server.cros.dynamic_suite import reporting 32 33 34def _parse_args(args): 35 if not args: 36 import crbug_crawler 37 logging.error('Improper usage of crbug_crawler: %s', 38 crbug_crawler.__doc__) 39 sys.exit(1) 40 41 description = ('Usage: crbug_crawler.py --reap') 42 parser = argparse.ArgumentParser(description=description) 43 parser.add_argument('--quiet', help=('Turn off logging noise.'), 44 action='store_true', default=False) 45 parser.add_argument('--num', help='Number of issues to output.', default=10, 46 type=int) 47 parser.add_argument('--queries', 48 help=('Search query. Eg: --queries "%s %s"' % 49 ('build_Root', 'login')), 50 default='') 51 parser.add_argument('--labels', 52 help=('Search labels. Eg: --labels "%s %s"' % 53 ('autofiled', 'Pri-1')), default=None) 54 parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'), 55 action='store_true', default=False) 56 return parser.parse_args(args) 57 58 59class Update(object): 60 """Class encapsulating fields of an update to a bug. 61 """ 62 open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned', 63 'Started', 'ExternalDependency'] 64 closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived'] 65 66 def __init__(self, comment='', labels='', status=''): 67 self.comment = comment 68 self.labels = labels if labels else [] 69 self.status = status 70 71 72 def __str__(self): 73 msg = 'status: %s' % self.status 74 if self.labels: 75 msg = '%s labels: %s' % (msg, self.labels) 76 if self.comment: 77 msg = '%s comment: %s' % (msg, self.comment) 78 return msg 79 80 81class UpdateManager(object): 82 """Update manager that allows you to revert status updates. 83 84 This class keeps track of the last update applied and is capable 85 of reverting it. 86 """ 87 88 def __init__(self, autocommit=False): 89 """Initialize update manager. 90 91 @param autocommit: If False just print out the update instead 92 of committing it. 93 """ 94 self.history = {} 95 self.present = {} 96 self.reporter = reporting.Reporter() 97 self.phapi_lib = self.reporter.get_bug_tracker_client() 98 self.autocommit = autocommit 99 100 101 def revert(self): 102 """Only manages status reverts as of now. 103 """ 104 for issue_id, update in self.history.iteritems(): 105 logging.warning('You will have to manually update %s and %s on %s', 106 self.present[issue_id].labels, 107 self.present[issue_id].comment, issue_id) 108 # Create a new update with just the status. 109 self.update(issue_id, Update(status=update.status)) 110 111 112 def update(self, old_issue, update): 113 """Record the state of an issue before updating it. 114 115 @param old_issue: The issue to update. If an id is specified an 116 issue is constructed. If an issue object (as defined in phapi_lib 117 Issue)is passed in, it is used directly. 118 @param update: The Update object to apply to the issue. 119 """ 120 if type(old_issue) == int: 121 old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue) 122 old_update = Update( 123 labels=old_issue.labels, status=old_issue.status) 124 125 if not update.status: 126 update.status = old_update.status 127 elif (update.status not in Update.open_statuses and 128 update.status not in Update.closed_statuses): 129 raise ValueError('Unknown status %s' % update.status) 130 131 if not self.autocommit: 132 logging.warning('Would have applied the following update: ' 133 '%s -> %s', old_update, update) 134 return 135 136 self.history[old_issue.id] = old_update 137 self.reporter.modify_bug_report( 138 issue_id=old_issue.id, comment=update.comment, 139 label_update=update.labels, 140 status=update.status) 141 self.present[old_issue.id] = update 142 143 144class Crawler(object): 145 """Class capable of crawling crbug. 146 147 This class applies filters to issues it crawls and caches them locally. 148 """ 149 150 # The limit at which we ask for confirmation to proceed with the crawl. 151 PROMPT_LIMIT = 2000 152 153 def __init__(self): 154 self.reporter = reporting.Reporter() 155 self.phapi_client = self.reporter.get_bug_tracker_client() 156 self.issues = None 157 self.all_autofiled_query = 'ANCHOR TestFailure' 158 self.all_autofiled_label = 'autofiled' 159 self.prompted = False 160 161 162 def fuzzy_search(self, query='', label='', fast=True): 163 """Returns all issues using one query and/or one label. 164 165 @param query: A string representing the query. 166 @param label: A string representing the label. 167 @param fast: If true, don't bother fetching comments. 168 169 @return: A list of issues matching the query. If fast is 170 specified the issues won't have comments. 171 """ 172 if not query and not label: 173 raise ValueError('Require query or labels to make a tracker query, ' 174 'try query = "%s" or one of the predefined labels %s' % 175 (self.fuzzy_search_anchor(), 176 self.reporter._PREDEFINED_LABELS)) 177 if type(label) != str: 178 raise ValueError('The crawler only supports one label per query, ' 179 'and it must be a string. you supplied %s' % label) 180 return self.phapi_client.get_tracker_issues_by_text( 181 query, label=label, full_text=not fast) 182 183 184 @staticmethod 185 def _get_autofiled_count(issue): 186 """Return the autofiled count. 187 188 @param issue: An issue object that has labels. 189 190 @return: An integer representing the autofiled count. 191 """ 192 for label in issue.labels: 193 if 'autofiled-count-' in label: 194 return int(label.replace('autofiled-count-', '')) 195 196 # Force bugs without autofiled-count to sink 197 return 0 198 199 200 def _prompt_crawl(self, new_issues, start_index): 201 """Warn the user that a crawl is getting large. 202 203 This method prompts for a y/n answer in case the user wants to abort the 204 crawl and specify another set of labels/queries. 205 206 @param new_issues: A list of issues used with the start_index to 207 determine the number of issues already processed. 208 @param start_index: The start index of the next crawl iteration. 209 """ 210 logging.warning('Found %s issues, Crawling issues starting from %s', 211 len(new_issues), start_index) 212 if start_index > self.PROMPT_LIMIT and not self.prompted: 213 logging.warning('Already crawled %s issues, it is possible that' 214 'you\'ve specified a very general label. If this is the ' 215 'case consider re-rodering the labels so they start with ' 216 'the rarest. Continue crawling [y/n]?', 217 start_index + len(new_issues)) 218 self.prompted = raw_input() == 'y' 219 if not self.prompted: 220 sys.exit(0) 221 222 223 def exhaustive_crawl(self, query='', label='', fast=True): 224 """Perform an exhaustive crawl using one label and query string. 225 226 @param query: A string representing one query. 227 @param lable: A string representing one label. 228 229 @return A list of issues sorted by descending autofiled count. 230 """ 231 start_index = 0 232 self.phapi_client.set_max_results(200) 233 logging.warning('Performing an exhaustive crawl with label %s query %s', 234 label, query) 235 vague_issues = [] 236 new_issues = self.fuzzy_search(query=query, label=label, fast=fast) 237 while new_issues: 238 vague_issues += new_issues 239 start_index += len(new_issues) + 1 240 self.phapi_client.set_start_index(start_index) 241 new_issues = self.fuzzy_search(query=query, label=label, 242 fast=fast) 243 self._prompt_crawl(new_issues, start_index) 244 245 # Subsequent calls will clear the issues cache with new results. 246 self.phapi_client.set_start_index(1) 247 return sorted(vague_issues, reverse=True, 248 key=lambda issue: self._get_autofiled_count(issue)) 249 250 251 @staticmethod 252 def filter_labels(issues, labels): 253 """Takes a list of labels and returns matching issues. 254 255 @param issues: A list of issues to parse for labels. 256 @param labels: A list of labels to match. 257 258 @return: A list of matching issues. The issues must contain 259 all the labels specified. 260 """ 261 if not labels: 262 return issues 263 matching_issues = set([]) 264 labels = set(labels) 265 for issue in issues: 266 issue_labels = set(issue.labels) 267 if issue_labels.issuperset(labels): 268 matching_issues.add(issue) 269 return matching_issues 270 271 272 @classmethod 273 def does_query_match(cls, issue, query): 274 """Check if a query matches the given issue. 275 276 @param issue: The issue to check. 277 @param query: The query to check against. 278 279 @return: True if the query matches, false otherwise. 280 """ 281 if query in issue.title or query in issue.summary: 282 return True 283 # We can only search comments if the issue is a complete issue 284 # i.e as defined in phapi_lib.Issue. 285 try: 286 if any(query in comment for comment in issue.comments): 287 return True 288 except (AttributeError, TypeError): 289 pass 290 return False 291 292 293 @classmethod 294 def filter_queries(cls, issues, queries): 295 """Take a list of queries and returns matching issues. 296 297 @param issues: A list of issues to parse. If the issues contain 298 comments and a query is not in the issues title or summmary, 299 the comments are parsed for a substring match. 300 @param queries: A list of queries to parse the issues for. 301 This method looks for an exact substring match within each issue. 302 303 @return: A list of matching issues. 304 """ 305 if not queries: 306 return issues 307 matching_issues = set([]) 308 for issue in issues: 309 # For each query, check if it's in the title, description or 310 # comments. If a query isn't in any of these, discard the issue. 311 for query in queries: 312 if cls.does_query_match(issue, query): 313 matching_issues.add(issue) 314 else: 315 if issue in matching_issues: 316 logging.warning('%s: %s\n \tPassed a subset of the ' 317 'queries but failed query %s', 318 issue.id, issue.title, query) 319 matching_issues.remove(issue) 320 break 321 return matching_issues 322 323 324 def filter_issues(self, queries='', labels=None, fast=True): 325 """Run the queries, labels filters by crawling crbug. 326 327 @param queries: A space seperated string of queries, usually passed 328 through the command line. 329 @param labels: A space seperated string of labels, usually passed 330 through the command line. 331 @param fast: If specified, skip creating comments for issues since this 332 can be a slow process. This value is only a suggestion, since it is 333 ignored if multiple queries are specified. 334 """ 335 queries = shlex.split(queries) 336 labels = shlex.split(labels) if labels else None 337 338 # We'll need comments to filter multiple queries. 339 if len(queries) > 1: 340 fast = False 341 matching_issues = self.exhaustive_crawl( 342 query=queries.pop(0) if queries else '', 343 label=labels.pop(0) if labels else '', fast=fast) 344 matching_issues = self.filter_labels(matching_issues, labels) 345 matching_issues = self.filter_queries(matching_issues, queries) 346 self.issues = list(matching_issues) 347 348 349 def dump_issues(self, limit=None): 350 """Print issues. 351 """ 352 if limit and limit < len(self.issues): 353 issues = self.issues[:limit] 354 else: 355 issues = self.issues 356 #TODO: Modify formatting, include some paging etc. 357 for issue in issues: 358 try: 359 print ('[%s] %s crbug.com/%s %s' % 360 (self._get_autofiled_count(issue), 361 issue.status, issue.id, issue.title)) 362 except UnicodeEncodeError as e: 363 print "Unicdoe error decoding issue id %s" % issue.id 364 continue 365 366 367def _update_test(args): 368 """A simple update test, to record usage. 369 """ 370 updater = UpdateManager(autocommit=True) 371 for issue in issues: 372 updater.update(issue, 373 Update(comment='this is bogus', labels=['bogus'], 374 status='Assigned')) 375 updater.revert() 376 377 378def configure_logging(quiet=False): 379 """Configure logging. 380 381 @param quiet: True to turn off warning messages. 382 """ 383 logging.basicConfig() 384 logger = logging.getLogger() 385 level = logging.WARNING 386 if quiet: 387 level = logging.ERROR 388 logger.setLevel(level) 389 390 391def main(args): 392 crawler = Crawler() 393 if args.reap: 394 if args.queries or args.labels: 395 logging.error('Query based ranking of bugs not supported yet.') 396 return 397 queries = '' 398 labels = crawler.all_autofiled_label 399 else: 400 queries = args.queries 401 labels = args.labels 402 crawler.filter_issues(queries=queries, labels=labels, 403 fast=False if queries else True) 404 crawler.dump_issues(int(args.num)) 405 logging.warning('\nThis is a truncated list of %s results, use --num %s ' 406 'to get them all. If you want more informative results/better ' 407 'querying capabilities try crbug_shell.py.', 408 args.num, len(crawler.issues)) 409 410 411if __name__ == '__main__': 412 args = _parse_args(sys.argv[1:]) 413 configure_logging(args.quiet) 414 main(args) 415 416