[autotest] Utility scripts to crawl, rank, filter crbug.

Usage:
    ./contrib/crbug_shell.py
    ./contrib/crbug_crawler.py --reap
    ./contrib/crbug_crawler.py --labels "autofiled" --num 100 | grep \
        Untriaged

crbug_shell is set up to regenerate your credentials file using the
client secret in shadow_config. If neither the credentials file nor the
client secret is found, it will fail.

TEST=Ran the scripts.
BUG=chromium:348040
CQ-DEPEND=CL:188527

Change-Id: Ie871c505eaaa45dbd86da84e81da7d3244712161
Reviewed-on: https://chromium-review.googlesource.com/188572
Reviewed-by: Prashanth B <[email protected]>
Tested-by: Prashanth B <[email protected]>
Commit-Queue: Prashanth B <[email protected]>
diff --git a/contrib/crbug_crawler.py b/contrib/crbug_crawler.py
new file mode 100755
index 0000000..12b697f
--- /dev/null
+++ b/contrib/crbug_crawler.py
@@ -0,0 +1,415 @@
+#!/usr/bin/python
+
+# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+This script crawls crbug. Sort-of.
+Invocation:
+    Get all bugs with labels, strings (in summary and/or comments):
+        crbug_crawler.py --labels 'one two three' \
+                         --queries '"first query" "second query"'
+
+    Get baddest open bugs of all time:
+        crbug_crawler.py --reap
+
+Tips:
+    - Label-based queries will return faster than text queries.
+    - contrib/crbug_shell.py is a wrapper that allows you to incrementally
+        filter search results using this script.
+"""
+
+import argparse
+import logging
+import shlex
+import sys
+
+import common
+from autotest_lib.server.cros.dynamic_suite import reporting
+
+
+def _parse_args(args):
+    if not args:
+        logging.error('Improper usage of crbug_crawler: %s', __doc__)
+        sys.exit(1)
+
+    description = ('Usage: crbug_crawler.py --reap')
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument('--quiet', help=('Turn off logging noise.'),
+            action='store_true', default=False)
+    parser.add_argument('--num', help='Number of issues to output.', default=10,
+            type=int)
+    parser.add_argument('--queries',
+                        help=('Search query. Eg: --queries "%s %s"' %
+                              ('build_Root', 'login')),
+                        default='')
+    parser.add_argument('--labels',
+                        help=('Search labels. Eg: --labels "%s %s"' %
+                              ('autofiled', 'Pri-1')), default=None)
+    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
+            action='store_true', default=False)
+    return parser.parse_args(args)
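+
+# A hedged sketch of what _parse_args accepts (the flag values below are
+# illustrative, not canonical):
+#
+#     opts = _parse_args(['--labels', 'autofiled Pri-1', '--num', '20'])
+#     opts.labels  # -> 'autofiled Pri-1'
+#     opts.num     # -> 20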
+
+
+class Update(object):
+    """Class encapsulating fields of an update to a bug.
+    """
+    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
+                     'Started', 'ExternalDependency']
+    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']
+
+    def __init__(self, comment='', labels='', status=''):
+        self.comment = comment
+        self.labels = labels if labels else []
+        self.status = status
+
+
+    def __str__(self):
+        msg = 'status: %s' % self.status
+        if self.labels:
+            msg = '%s labels: %s' % (msg, self.labels)
+        if self.comment:
+            msg = '%s comment: %s' % (msg, self.comment)
+        return msg
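+
+    # A minimal sketch of building and printing an Update (the field
+    # values here are illustrative, not from a real bug):
+    #
+    #     update = Update(comment='Fixed on tot.', labels=['MyLabel'],
+    #                     status='Fixed')
+    #     print update  # -> status: Fixed labels: ['MyLabel'] comment: ...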
+
+
+class UpdateManager(object):
+    """Update manager that allows you to revert status updates.
+
+    This class keeps track of the last update applied and is capable
+    of reverting it.
+    """
+
+    def __init__(self, autocommit=False):
+        """Initialize update manager.
+
+        @param autocommit: If False just print out the update instead
+            of committing it.
+        """
+        self.history = {}
+        self.present = {}
+        self.reporter = reporting.Reporter()
+        self.phapi_lib = self.reporter.get_bug_tracker_client()
+        self.autocommit = autocommit
+
+
+    def revert(self):
+        """Only manages status reverts as of now.
+        """
+        for issue_id, update in self.history.iteritems():
+            logging.warning('You will have to manually update %s and %s on %s',
+                    self.present[issue_id].labels,
+                    self.present[issue_id].comment, issue_id)
+            # Create a new update with just the status.
+            self.update(issue_id, Update(status=update.status))
+
+
+    def update(self, old_issue, update):
+        """Record the state of an issue before updating it.
+
+        @param old_issue: The issue to update. If an id is specified, an
+            issue is constructed. If an issue object (as defined in
+            phapi_lib.Issue) is passed in, it is used directly.
+        @param update: The Update object to apply to the issue.
+        """
+        if isinstance(old_issue, int):
+            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
+        old_update = Update(
+                labels=old_issue.labels, status=old_issue.status)
+
+        if not update.status:
+            update.status = old_update.status
+        elif (update.status not in Update.open_statuses and
+              update.status not in Update.closed_statuses):
+            raise ValueError('Unknown status %s' % update.status)
+
+        if not self.autocommit:
+            logging.warning('Would have applied the following update: '
+                    '%s -> %s', old_update, update)
+            return
+
+        self.history[old_issue.id] = old_update
+        self.reporter.modify_bug_report(
+                issue_id=old_issue.id, comment=update.comment,
+                label_update=update.labels,
+                status=update.status)
+        self.present[old_issue.id] = update
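+
+    # A dry-run sketch (the issue id 123456 and field values here are
+    # hypothetical):
+    #
+    #     manager = UpdateManager(autocommit=False)
+    #     manager.update(123456, Update(status='Assigned'))
+    #     # With autocommit=False the update is only logged, so there is
+    #     # nothing for revert() to undo.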
+
+
+class Crawler(object):
+    """Class capable of crawling crbug.
+
+    This class applies filters to issues it crawls and caches them locally.
+    """
+
+    # The limit at which we ask for confirmation to proceed with the crawl.
+    PROMPT_LIMIT = 2000
+
+    def __init__(self):
+        self.reporter = reporting.Reporter()
+        self.phapi_client = self.reporter.get_bug_tracker_client()
+        self.issues = None
+        self.all_autofiled_query = 'ANCHOR  TestFailure'
+        self.all_autofiled_label = 'autofiled'
+        self.prompted = False
+
+
+    def fuzzy_search(self, query='', label='', fast=True):
+        """Returns all issues using one query and/or one label.
+
+        @param query: A string representing the query.
+        @param label: A string representing the label.
+        @param fast: If true, don't bother fetching comments.
+
+        @return: A list of issues matching the query. If fast is True,
+            the issues won't have comments.
+        """
+        if not query and not label:
+            raise ValueError('Require a query or a label to make a tracker '
+                    'query, try query = "%s" or one of the predefined labels '
+                    '%s' % (self.all_autofiled_query,
+                            self.reporter._PREDEFINED_LABELS))
+        if not isinstance(label, str):
+            raise ValueError('The crawler only supports one label per query, '
+                    'and it must be a string. You supplied %s' % label)
+        return self.phapi_client.get_tracker_issues_by_text(
+                query, label=label, full_text=not fast)
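+
+    # Usage sketch (the label is illustrative):
+    #
+    #     crawler = Crawler()
+    #     issues = crawler.fuzzy_search(label='autofiled', fast=True)
+    #     # fast=True skips fetching comments, so issue.comments won't be
+    #     # populated.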
+
+
+    @staticmethod
+    def _get_autofiled_count(issue):
+        """Return the autofiled count.
+
+        @param issue: An issue object that has labels.
+
+        @return: An integer representing the autofiled count.
+        """
+        for label in issue.labels:
+            if 'autofiled-count-' in label:
+                return int(label.replace('autofiled-count-', ''))
+
+        # Force bugs without an autofiled-count label to sink in the sort.
+        return 0
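+
+    # E.g. an issue labeled ['autofiled', 'autofiled-count-27'] yields 27;
+    # an issue with no autofiled-count label yields 0.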
+
+
+    def _prompt_crawl(self, new_issues, start_index):
+        """Warn the user that a crawl is getting large.
+
+        This method prompts for a y/n answer in case the user wants to abort the
+        crawl and specify another set of labels/queries.
+
+        @param new_issues: A list of issues used with the start_index to
+            determine the number of issues already processed.
+        @param start_index: The start index of the next crawl iteration.
+        """
+        logging.warning('Found %s issues; crawling issues starting from %s',
+                len(new_issues), start_index)
+        if start_index > self.PROMPT_LIMIT and not self.prompted:
+            logging.warning('Already crawled %s issues, it is possible that '
+                    'you\'ve specified a very general label. If this is the '
+                    'case consider re-ordering the labels so they start with '
+                    'the rarest. Continue crawling [y/n]?',
+                    start_index + len(new_issues))
+            self.prompted = raw_input() == 'y'
+            if not self.prompted:
+                sys.exit(0)
+
+
+    def exhaustive_crawl(self, query='', label='', fast=True):
+        """Perform an exhaustive crawl using one label and query string.
+
+        @param query: A string representing one query.
+        @param label: A string representing one label.
+        @param fast: If true, don't bother fetching comments.
+
+        @return: A list of issues sorted by descending autofiled count.
+        """
+        start_index = 0
+        self.phapi_client.set_max_results(200)
+        logging.warning('Performing an exhaustive crawl with label %s query %s',
+                label, query)
+        vague_issues = []
+        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
+        while new_issues:
+            vague_issues += new_issues
+            start_index += len(new_issues) + 1
+            self.phapi_client.set_start_index(start_index)
+            new_issues = self.fuzzy_search(query=query, label=label,
+                    fast=fast)
+            self._prompt_crawl(new_issues, start_index)
+
+        # Subsequent calls will clear the issues cache with new results.
+        self.phapi_client.set_start_index(1)
+        return sorted(vague_issues, reverse=True,
+                      key=lambda issue: self._get_autofiled_count(issue))
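+
+    # Usage sketch (label illustrative): crawl everything under one label
+    # and keep the most-duplicated bugs first:
+    #
+    #     issues = crawler.exhaustive_crawl(label='autofiled')
+    #     worst_ten = issues[:10]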
+
+
+    @staticmethod
+    def filter_labels(issues, labels):
+        """Takes a list of labels and returns matching issues.
+
+        @param issues: A list of issues to parse for labels.
+        @param labels: A list of labels to match.
+
+        @return: A list of matching issues. The issues must contain
+            all the labels specified.
+        """
+        if not labels:
+            return issues
+        matching_issues = set()
+        labels = set(labels)
+        for issue in issues:
+            issue_labels = set(issue.labels)
+            if issue_labels.issuperset(labels):
+                matching_issues.add(issue)
+        return matching_issues
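+
+    # E.g. keep only issues carrying both labels (labels illustrative):
+    #
+    #     matches = Crawler.filter_labels(issues, ['autofiled', 'Pri-1'])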
+
+
+    @classmethod
+    def does_query_match(cls, issue, query):
+        """Check if a query matches the given issue.
+
+        @param issue: The issue to check.
+        @param query: The query to check against.
+
+        @return: True if the query matches, false otherwise.
+        """
+        if query in issue.title or query in issue.summary:
+            return True
+        # We can only search comments if the issue is a complete issue,
+        # i.e. as defined in phapi_lib.Issue.
+        try:
+            if any(query in comment for comment in issue.comments):
+                return True
+        except (AttributeError, TypeError):
+            pass
+        return False
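+
+    # E.g. does_query_match(issue, 'login') is True when 'login' appears
+    # in the issue's title, summary, or (for full issues) any comment.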
+
+
+    @classmethod
+    def filter_queries(cls, issues, queries):
+        """Take a list of queries and returns matching issues.
+
+        @param issues: A list of issues to parse. If the issues contain
+            comments and a query is not in the issue's title or summary,
+            the comments are parsed for a substring match.
+        @param queries: A list of queries to parse the issues for.
+            This method looks for an exact substring match within each issue.
+
+        @return: A list of matching issues.
+        """
+        if not queries:
+            return issues
+        matching_issues = set()
+        for issue in issues:
+            # For each query, check if it's in the title, description or
+            # comments. If a query isn't in any of these, discard the issue.
+            for query in queries:
+                if cls.does_query_match(issue, query):
+                    matching_issues.add(issue)
+                else:
+                    if issue in matching_issues:
+                        logging.warning('%s: %s\n \tPassed a subset of the '
+                                'queries but failed query %s',
+                                issue.id, issue.title, query)
+                        matching_issues.remove(issue)
+                    break
+        return matching_issues
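+
+    # Sketch: retain only issues matching every query (the queries are
+    # illustrative):
+    #
+    #     matches = Crawler.filter_queries(issues, ['dut', 'ssh timeout'])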
+
+
+    def filter_issues(self, queries='', labels=None, fast=True):
+        """Run the queries, labels filters by crawling crbug.
+
+        @param queries: A space separated string of queries, usually passed
+            through the command line.
+        @param labels: A space separated string of labels, usually passed
+            through the command line.
+        @param fast: If specified, skip creating comments for issues since this
+            can be a slow process. This value is only a suggestion, since it is
+            ignored if multiple queries are specified.
+        """
+        queries = shlex.split(queries)
+        labels = shlex.split(labels) if labels else None
+
+        # We'll need comments to filter multiple queries.
+        if len(queries) > 1:
+            fast = False
+        matching_issues = self.exhaustive_crawl(
+                query=queries.pop(0) if queries else '',
+                label=labels.pop(0) if labels else '', fast=fast)
+        matching_issues = self.filter_labels(matching_issues, labels)
+        matching_issues = self.filter_queries(matching_issues, queries)
+        self.issues = list(matching_issues)
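+
+    # End-to-end sketch mirroring the command line (values illustrative):
+    #
+    #     crawler = Crawler()
+    #     crawler.filter_issues(queries='"ssh timeout"', labels='autofiled',
+    #                           fast=False)
+    #     crawler.dump_issues(limit=10)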
+
+
+    def dump_issues(self, limit=None):
+        """Print issues.
+        """
+        if limit and limit < len(self.issues):
+            issues = self.issues[:limit]
+        else:
+            issues = self.issues
+        # TODO: Modify formatting, include some paging etc.
+        for issue in issues:
+            try:
+                print ('[%s] %s crbug.com/%s %s' %
+                       (self._get_autofiled_count(issue),
+                        issue.status, issue.id, issue.title))
+            except UnicodeEncodeError:
+                print 'Unicode error encoding issue id %s' % issue.id
+                continue
+
+
+def _update_test(issues):
+    """A simple update test, to record usage.
+
+    @param issues: A list of issues (or issue ids) to apply a bogus
+        update to and then revert.
+    """
+    updater = UpdateManager(autocommit=True)
+    for issue in issues:
+        updater.update(issue,
+                       Update(comment='this is bogus', labels=['bogus'],
+                              status='Assigned'))
+    updater.revert()
+
+
+def configure_logging(quiet=False):
+    """Configure logging.
+
+    @param quiet: True to turn off warning messages.
+    """
+    logging.basicConfig()
+    logger = logging.getLogger()
+    level = logging.WARNING
+    if quiet:
+        level = logging.ERROR
+    logger.setLevel(level)
+
+
+def main(args):
+    crawler = Crawler()
+    if args.reap:
+        if args.queries or args.labels:
+            logging.error('Query based ranking of bugs not supported yet.')
+            return
+        queries = ''
+        labels = crawler.all_autofiled_label
+    else:
+        queries = args.queries
+        labels = args.labels
+    crawler.filter_issues(queries=queries, labels=labels,
+            fast=not queries)
+    crawler.dump_issues(int(args.num))
+    logging.warning('\nThis is a truncated list of %s results, use --num %s '
+            'to get them all. If you want more informative results/better '
+            'querying capabilities try crbug_shell.py.',
+            args.num, len(crawler.issues))
+
+
+if __name__ == '__main__':
+    args = _parse_args(sys.argv[1:])
+    configure_logging(args.quiet)
+    main(args)
+