#!/usr/bin/python

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script crawls crbug. Sort-of.
Invocation:
    Get all bugs with labels, strings (in summary and/or comments):
        crbug_crawler.py --labels 'one two three'
                         --queries '"first query" "second query"'

    Get the baddest (most frequently autofiled) open bugs of all time:
        crbug_crawler.py --reap

Tips:
    - Label-based queries will return faster than text queries.
    - contrib/crbug_shell.py is a wrapper that allows you to incrementally
        filter search results using this script.
"""

import argparse
import cmd
import logging
import sys
import shlex

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting


def _parse_args(args):
    """Parse command line arguments.

    @param args: A list of command line arguments, eg sys.argv[1:].

    @return: An argparse.Namespace with the parsed options.
    """
    if not args:
        import crbug_crawler
        logging.error('Improper usage of crbug_crawler: %s',
                crbug_crawler.__doc__)
        sys.exit(1)

    description = 'Usage: crbug_crawler.py --reap'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--quiet', help=('Turn off logging noise.'),
            action='store_true', default=False)
    parser.add_argument('--num', help='Number of issues to output.', default=10,
            type=int)
    parser.add_argument('--queries',
                        help=('Search query. Eg: --queries "%s %s"' %
                              ('build_Root', 'login')),
                        default='')
    parser.add_argument('--labels',
                        help=('Search labels. Eg: --labels "%s %s"' %
                              ('autofiled', 'Pri-1')), default=None)
    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
            action='store_true', default=False)
    return parser.parse_args(args)


class Update(object):
    """Class encapsulating fields of an update to a bug.
    """
    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels=None, status=''):
        self.comment = comment
        self.labels = labels if labels else []
        self.status = status


    def __str__(self):
        msg = 'status: %s' % self.status
        if self.labels:
            msg = '%s labels: %s' % (msg, self.labels)
        if self.comment:
            msg = '%s comment: %s' % (msg, self.comment)
        return msg

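# Example (sketch, not executed): building an Update that reassigns a bug and
# records a triage note, then inspecting it via __str__. The comment, label and
# status below are made up; any valid status from open_statuses/closed_statuses
# works. Apply it with the UpdateManager defined below.
#
#     note = Update(comment='Deduping to the tracking bug.',
#                   labels=['Triaged'], status='Assigned')
#     print note    # -> status: Assigned labels: ['Triaged'] comment: ...
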

class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied and is capable
    of reverting it.
    """

    def __init__(self, autocommit=False):
        """Initialize the update manager.

        @param autocommit: If False, just print out the update instead
            of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit


    def revert(self):
        """Revert recorded updates; only status reverts are managed for now.
        """
        for issue_id, update in self.history.iteritems():
            logging.warning('You will have to manually update %s and %s on %s',
                    self.present[issue_id].labels,
                    self.present[issue_id].comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))


    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an id is specified, an
            issue is constructed. If an issue object (as defined in
            phapi_lib.Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue.
        """
        if isinstance(old_issue, int):
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                    '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update

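# Example (sketch, not executed): a dry run with autocommit=False only logs the
# update that would have been applied; pass autocommit=True to actually modify
# the bug on the tracker. The issue id below is made up.
#
#     manager = UpdateManager(autocommit=False)
#     manager.update(123456, Update(status='Archived'))
#     manager.revert()    # nothing to revert here, nothing was committed
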

class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR  TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False


    def fuzzy_search(self, query='', label='', fast=True):
        """Return all issues matching one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If True, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.
        """
        if not query and not label:
            raise ValueError('Need a query or a label to make a tracker query, '
                    'try query = "%s" or one of the predefined labels %s' %
                    (self.all_autofiled_query,
                     self.reporter._PREDEFINED_LABELS))
        if not isinstance(label, str):
            raise ValueError('The crawler only supports one label per query, '
                    'and it must be a string. You supplied %s' % label)
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)

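    # Example (sketch, not executed): a single-label search. With fast=True the
    # comments are not fetched, so only titles/summaries are available to any
    # later text filtering.
    #
    #     crawler = Crawler()
    #     issues = crawler.fuzzy_search(label='autofiled', fast=True)
    #     print len(issues)
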

    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                return int(label.replace('autofiled-count-', ''))

        # Force bugs without an autofiled-count label to sink.
        return 0


    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort
        the crawl and specify another set of labels/queries.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index of the next crawl iteration.
        """
        logging.warning('Found %s issues, crawling issues starting from %s',
                len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning('Already crawled %s issues, it is possible that '
                    'you\'ve specified a very general label. If this is the '
                    'case consider re-ordering the labels so they start with '
                    'the rarest. Continue crawling [y/n]?',
                    start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)


    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        @param query: A string representing one query.
        @param label: A string representing one label.
        @param fast: If True, don't bother fetching comments.

        @return: A list of issues sorted by descending autofiled count.
        """
        start_index = 0
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues += new_issues
            start_index += len(new_issues) + 1
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                    fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=lambda issue: self._get_autofiled_count(issue))

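    # Example (sketch, not executed): crawl every 'autofiled' issue, paging
    # through the tracker 200 results at a time, and look at the most
    # frequently filed one. Results come back sorted by descending
    # autofiled-count.
    #
    #     issues = Crawler().exhaustive_crawl(label='autofiled')
    #     if issues:
    #         print issues[0].title
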

    @staticmethod
    def filter_labels(issues, labels):
        """Take a list of labels and return matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A set of matching issues. Each issue returned contains
            all the labels specified.
        """
        if not labels:
            return issues
        matching_issues = set([])
        labels = set(labels)
        for issue in issues:
            issue_labels = set(issue.labels)
            if issue_labels.issuperset(labels):
                matching_issues.add(issue)
        return matching_issues


    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, False otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue,
        # i.e. as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False


    @classmethod
    def filter_queries(cls, issues, queries):
        """Take a list of queries and return matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in an issue's title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A set of matching issues.
        """
        if not queries:
            return issues
        matching_issues = set([])
        for issue in issues:
            # For each query, check if it's in the title, description or
            # comments. If a query isn't in any of these, discard the issue.
            for query in queries:
                if cls.does_query_match(issue, query):
                    matching_issues.add(issue)
                else:
                    if issue in matching_issues:
                        logging.warning('%s: %s\n \tPassed a subset of the '
                                'queries but failed query %s',
                                issue.id, issue.title, query)
                        matching_issues.remove(issue)
                    break
        return matching_issues
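
    # Example (sketch, not executed): narrowing an already-crawled list locally,
    # without another round trip to the tracker. Both filters are plain
    # substring/set-membership matches; the label and query below are made up.
    #
    #     issues = crawler.exhaustive_crawl(label='autofiled', fast=False)
    #     issues = Crawler.filter_labels(issues, ['Pri-1'])
    #     issues = Crawler.filter_queries(issues, ['login'])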


    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the query and label filters by crawling crbug.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If specified, skip fetching comments for issues since this
            can be a slow process. This value is only a suggestion, since it is
            ignored if multiple queries are specified.
        """
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)

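    # Example (sketch, not executed): the typical end-to-end flow used by
    # main() below. The label and query strings are made up; the first label
    # and query go to the tracker, the rest are filtered locally.
    #
    #     crawler = Crawler()
    #     crawler.filter_issues(queries='"login failed"',
    #                           labels='autofiled Pri-1')
    #     crawler.dump_issues(limit=20)
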

    def dump_issues(self, limit=None):
        """Print the crawled issues.

        @param limit: If specified, print at most this many issues.
        """
        if limit and limit < len(self.issues):
            issues = self.issues[:limit]
        else:
            issues = self.issues
        #TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print ('[%s] %s crbug.com/%s %s' %
                       (self._get_autofiled_count(issue),
                        issue.status, issue.id, issue.title))
            except UnicodeEncodeError:
                print 'Unicode error decoding issue id %s' % issue.id
                continue


def _update_test(issues):
    """A simple update test, to record usage.

    @param issues: A list of issues (or issue ids) to apply a bogus update to.
    """
    updater = UpdateManager(autocommit=True)
    for issue in issues:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()

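# Example (sketch, not executed): exercising _update_test against a couple of
# crawled issues. Only run this against bugs you are prepared to modify, since
# the updater is created with autocommit=True. The label below is made up.
#
#     crawler = Crawler()
#     crawler.filter_issues(labels='bogus-label')
#     _update_test(crawler.issues[:2])
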

def configure_logging(quiet=False):
    """Configure logging.

    @param quiet: True to turn off warning messages.
    """
    logging.basicConfig()
    logger = logging.getLogger()
    level = logging.WARNING
    if quiet:
        level = logging.ERROR
    logger.setLevel(level)


def main(args):
    """Crawl crbug according to the parsed command line arguments.

    @param args: An argparse.Namespace as returned by _parse_args.
    """
    crawler = Crawler()
    if args.reap:
        if args.queries or args.labels:
            logging.error('Query based ranking of bugs not supported yet.')
            return
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    crawler.filter_issues(queries=queries, labels=labels,
            fast=not queries)
    crawler.dump_issues(int(args.num))
    logging.warning('\nThis is a truncated list of %s results, use --num %s '
            'to get them all. If you want more informative results/better '
            'querying capabilities try crbug_shell.py.',
            args.num, len(crawler.issues))


if __name__ == '__main__':
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)