[autotest] Utility scripts to crawl, rank, filter crbug.
Usage:
./contrib/crbug_shell.py
./contrib/crbug_crawler.py --reap
./contrib/crbug_crawler.py --labels "autofiled" --num 100 | grep Untriaged
crbug_shell is set up to regenerate your credentials file using the
client secret in shadow_config. If neither is found, it will fail.
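crbug_crawler can also be imported and driven from another script. A minimal
sketch (assuming the autotest import path and tracker credentials are already
set up; the query and label values are only examples):
    from crbug_crawler import Crawler
    crawler = Crawler()
    crawler.filter_issues(queries='login', labels='autofiled')
    crawler.dump_issues(limit=10)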
TEST=Ran the scripts.
BUG=chromium:348040
CQ-DEPEND=CL:188527
Change-Id: Ie871c505eaaa45dbd86da84e81da7d3244712161
Reviewed-on: https://chromium-review.googlesource.com/188572
Reviewed-by: Prashanth B <[email protected]>
Tested-by: Prashanth B <[email protected]>
Commit-Queue: Prashanth B <[email protected]>
diff --git a/contrib/crbug_crawler.py b/contrib/crbug_crawler.py
new file mode 100755
index 0000000..12b697f
--- /dev/null
+++ b/contrib/crbug_crawler.py
@@ -0,0 +1,415 @@
+#!/usr/bin/python
+
+# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+This script crawls crbug. Sort-of.
+Invocation:
+ Get all bugs with labels, strings (in summary and/or comments):
+ crbug_crawler.py --labels 'one two three'
+ --queries '"first query" "second query"'
+
+ Get baddest open bugs of all time:
+ crbug_crawler.py --reap
+
+Tips:
+ - Label based queries will return faster than text queries.
+ - contrib/crbug_shell.py is a wrapper that allows you to incrementally
+ filter search results using this script.
+"""
+
+import argparse
+import cmd
+import logging
+import sys
+import shlex
+
+import common
+from autotest_lib.client.common_lib import global_config
+from autotest_lib.server.cros.dynamic_suite import reporting
+
+
+def _parse_args(args):
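+    """Parse command line arguments.
+
+    @param args: A list of command line arguments, e.g. sys.argv[1:].
+
+    @return: An argparse.Namespace with the parsed options.
+    """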
+ if not args:
+ import crbug_crawler
+ logging.error('Improper usage of crbug_crawler: %s',
+ crbug_crawler.__doc__)
+ sys.exit(1)
+
+ description = ('Usage: crbug_crawler.py --reap')
+ parser = argparse.ArgumentParser(description=description)
+ parser.add_argument('--quiet', help=('Turn off logging noise.'),
+ action='store_true', default=False)
+ parser.add_argument('--num', help='Number of issues to output.', default=10,
+ type=int)
+ parser.add_argument('--queries',
+ help=('Search query. Eg: --queries "%s %s"' %
+ ('build_Root', 'login')),
+ default='')
+ parser.add_argument('--labels',
+ help=('Search labels. Eg: --labels "%s %s"' %
+ ('autofiled', 'Pri-1')), default=None)
+ parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
+ action='store_true', default=False)
+ return parser.parse_args(args)
+
+
+class Update(object):
+ """Class encapsulating fields of an update to a bug.
+ """
+ open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
+ 'Started', 'ExternalDependency']
+ closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']
+
+ def __init__(self, comment='', labels='', status=''):
+ self.comment = comment
+ self.labels = labels if labels else []
+ self.status = status
+
+
+ def __str__(self):
+ msg = 'status: %s' % self.status
+ if self.labels:
+ msg = '%s labels: %s' % (msg, self.labels)
+ if self.comment:
+ msg = '%s comment: %s' % (msg, self.comment)
+ return msg
+
+
+class UpdateManager(object):
+ """Update manager that allows you to revert status updates.
+
+ This class keeps track of the last update applied and is capable
+ of reverting it.
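+
+    Example (an illustrative sketch; the issue id below is made up, and
+    with autocommit=True the update really is applied to the bug):
+        manager = UpdateManager(autocommit=True)
+        manager.update(123456, Update(comment='triaging', status='Assigned'))
+        manager.revert()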
+ """
+
+ def __init__(self, autocommit=False):
+ """Initialize update manager.
+
+ @param autocommit: If False just print out the update instead
+ of committing it.
+ """
+ self.history = {}
+ self.present = {}
+ self.reporter = reporting.Reporter()
+ self.phapi_lib = self.reporter.get_bug_tracker_client()
+ self.autocommit = autocommit
+
+
+ def revert(self):
+ """Only manages status reverts as of now.
+ """
+ for issue_id, update in self.history.iteritems():
+ logging.warning('You will have to manually update %s and %s on %s',
+ self.present[issue_id].labels,
+ self.present[issue_id].comment, issue_id)
+ # Create a new update with just the status.
+ self.update(issue_id, Update(status=update.status))
+
+
+ def update(self, old_issue, update):
+ """Record the state of an issue before updating it.
+
+        @param old_issue: The issue to update. If an id is specified, an
+        issue is constructed. If an issue object (as defined in phapi_lib
+        Issue) is passed in, it is used directly.
+ @param update: The Update object to apply to the issue.
+ """
+ if type(old_issue) == int:
+ old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
+ old_update = Update(
+ labels=old_issue.labels, status=old_issue.status)
+
+ if not update.status:
+ update.status = old_update.status
+ elif (update.status not in Update.open_statuses and
+ update.status not in Update.closed_statuses):
+ raise ValueError('Unknown status %s' % update.status)
+
+ if not self.autocommit:
+ logging.warning('Would have applied the following update: '
+ '%s -> %s', old_update, update)
+ return
+
+ self.history[old_issue.id] = old_update
+ self.reporter.modify_bug_report(
+ issue_id=old_issue.id, comment=update.comment,
+ label_update=update.labels,
+ status=update.status)
+ self.present[old_issue.id] = update
+
+
+class Crawler(object):
+ """Class capable of crawling crbug.
+
+ This class applies filters to issues it crawls and caches them locally.
+ """
+
+ # The limit at which we ask for confirmation to proceed with the crawl.
+ PROMPT_LIMIT = 2000
+
+ def __init__(self):
+ self.reporter = reporting.Reporter()
+ self.phapi_client = self.reporter.get_bug_tracker_client()
+ self.issues = None
+ self.all_autofiled_query = 'ANCHOR TestFailure'
+ self.all_autofiled_label = 'autofiled'
+ self.prompted = False
+
+
+ def fuzzy_search(self, query='', label='', fast=True):
+ """Returns all issues using one query and/or one label.
+
+ @param query: A string representing the query.
+ @param label: A string representing the label.
+ @param fast: If true, don't bother fetching comments.
+
+ @return: A list of issues matching the query. If fast is
+ specified the issues won't have comments.
+ """
+ if not query and not label:
+            raise ValueError('Need a query or a label to make a tracker query, '
+                    'try query = "%s" or one of the predefined labels %s' %
+                    (self.all_autofiled_query,
+                     self.reporter._PREDEFINED_LABELS))
+ if type(label) != str:
+            raise ValueError('The crawler only supports one label per query, '
+                    'and it must be a string. You supplied %s' % label)
+ return self.phapi_client.get_tracker_issues_by_text(
+ query, label=label, full_text=not fast)
+
+
+ @staticmethod
+ def _get_autofiled_count(issue):
+ """Return the autofiled count.
+
+ @param issue: An issue object that has labels.
+
+ @return: An integer representing the autofiled count.
+ """
+ for label in issue.labels:
+ if 'autofiled-count-' in label:
+ return int(label.replace('autofiled-count-', ''))
+
+ # Force bugs without autofiled-count to sink
+ return 0
+
+
+ def _prompt_crawl(self, new_issues, start_index):
+ """Warn the user that a crawl is getting large.
+
+ This method prompts for a y/n answer in case the user wants to abort the
+ crawl and specify another set of labels/queries.
+
+ @param new_issues: A list of issues used with the start_index to
+ determine the number of issues already processed.
+ @param start_index: The start index of the next crawl iteration.
+ """
+        logging.warning('Found %s issues, crawling issues starting from %s',
+                        len(new_issues), start_index)
+        if start_index > self.PROMPT_LIMIT and not self.prompted:
+            logging.warning('Already crawled %s issues, it is possible that '
+                'you\'ve specified a very general label. If this is the '
+                'case consider re-ordering the labels so they start with '
+                'the rarest. Continue crawling [y/n]?',
+                start_index + len(new_issues))
+ self.prompted = raw_input() == 'y'
+ if not self.prompted:
+ sys.exit(0)
+
+
+ def exhaustive_crawl(self, query='', label='', fast=True):
+ """Perform an exhaustive crawl using one label and query string.
+
+ @param query: A string representing one query.
+        @param label: A string representing one label.
+
+ @return A list of issues sorted by descending autofiled count.
+ """
+ start_index = 0
+ self.phapi_client.set_max_results(200)
+ logging.warning('Performing an exhaustive crawl with label %s query %s',
+ label, query)
+ vague_issues = []
+ new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
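+        # Page through tracker results until an empty batch comes back.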
+ while new_issues:
+ vague_issues += new_issues
+ start_index += len(new_issues) + 1
+ self.phapi_client.set_start_index(start_index)
+ new_issues = self.fuzzy_search(query=query, label=label,
+ fast=fast)
+ self._prompt_crawl(new_issues, start_index)
+
+ # Subsequent calls will clear the issues cache with new results.
+ self.phapi_client.set_start_index(1)
+ return sorted(vague_issues, reverse=True,
+ key=lambda issue: self._get_autofiled_count(issue))
+
+
+ @staticmethod
+ def filter_labels(issues, labels):
+ """Takes a list of labels and returns matching issues.
+
+ @param issues: A list of issues to parse for labels.
+ @param labels: A list of labels to match.
+
+        @return: The matching issues. Each returned issue contains
+            all the labels specified.
+ """
+ if not labels:
+ return issues
+ matching_issues = set([])
+ labels = set(labels)
+ for issue in issues:
+ issue_labels = set(issue.labels)
+ if issue_labels.issuperset(labels):
+ matching_issues.add(issue)
+ return matching_issues
+
+
+ @classmethod
+ def does_query_match(cls, issue, query):
+ """Check if a query matches the given issue.
+
+ @param issue: The issue to check.
+ @param query: The query to check against.
+
+        @return: True if the query matches, False otherwise.
+ """
+ if query in issue.title or query in issue.summary:
+ return True
+        # We can only search comments if the issue is a complete issue,
+        # i.e. as defined in phapi_lib.Issue.
+ try:
+ if any(query in comment for comment in issue.comments):
+ return True
+ except (AttributeError, TypeError):
+ pass
+ return False
+
+
+ @classmethod
+ def filter_queries(cls, issues, queries):
+ """Take a list of queries and returns matching issues.
+
+ @param issues: A list of issues to parse. If the issues contain
+            comments and a query is not in the issue's title or summary,
+ the comments are parsed for a substring match.
+ @param queries: A list of queries to parse the issues for.
+ This method looks for an exact substring match within each issue.
+
+        @return: The issues that match all the queries.
+ """
+ if not queries:
+ return issues
+ matching_issues = set([])
+ for issue in issues:
+ # For each query, check if it's in the title, description or
+ # comments. If a query isn't in any of these, discard the issue.
+ for query in queries:
+ if cls.does_query_match(issue, query):
+ matching_issues.add(issue)
+ else:
+ if issue in matching_issues:
+ logging.warning('%s: %s\n \tPassed a subset of the '
+ 'queries but failed query %s',
+ issue.id, issue.title, query)
+ matching_issues.remove(issue)
+ break
+ return matching_issues
+
+
+ def filter_issues(self, queries='', labels=None, fast=True):
+ """Run the queries, labels filters by crawling crbug.
+
+        @param queries: A space separated string of queries, usually passed
+            through the command line.
+        @param labels: A space separated string of labels, usually passed
+            through the command line.
+ @param fast: If specified, skip creating comments for issues since this
+ can be a slow process. This value is only a suggestion, since it is
+ ignored if multiple queries are specified.
+ """
+ queries = shlex.split(queries)
+ labels = shlex.split(labels) if labels else None
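+        # shlex keeps quoted phrases together, e.g. '"first query" second'
+        # becomes ['first query', 'second'].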
+
+ # We'll need comments to filter multiple queries.
+ if len(queries) > 1:
+ fast = False
+ matching_issues = self.exhaustive_crawl(
+ query=queries.pop(0) if queries else '',
+ label=labels.pop(0) if labels else '', fast=fast)
+ matching_issues = self.filter_labels(matching_issues, labels)
+ matching_issues = self.filter_queries(matching_issues, queries)
+ self.issues = list(matching_issues)
+
+
+    def dump_issues(self, limit=None):
+        """Print the crawled issues to stdout.
+
+        @param limit: Maximum number of issues to print. None prints them all.
+        """
+ if limit and limit < len(self.issues):
+ issues = self.issues[:limit]
+ else:
+ issues = self.issues
+ #TODO: Modify formatting, include some paging etc.
+ for issue in issues:
+ try:
+ print ('[%s] %s crbug.com/%s %s' %
+ (self._get_autofiled_count(issue),
+ issue.status, issue.id, issue.title))
+        except UnicodeEncodeError:
+            print "Unicode error printing issue id %s" % issue.id
+ continue
+
+
+def _update_test(issues):
+    """A simple update test, to record usage.
+
+    @param issues: A list of issues or issue ids to update and then revert.
+    """
+ updater = UpdateManager(autocommit=True)
+ for issue in issues:
+ updater.update(issue,
+ Update(comment='this is bogus', labels=['bogus'],
+ status='Assigned'))
+ updater.revert()
+
+
+def configure_logging(quiet=False):
+ """Configure logging.
+
+ @param quiet: True to turn off warning messages.
+ """
+ logging.basicConfig()
+ logger = logging.getLogger()
+ level = logging.WARNING
+ if quiet:
+ level = logging.ERROR
+ logger.setLevel(level)
+
+
+def main(args):
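+    """Crawl crbug and print matching issues.
+
+    @param args: Parsed command line arguments.
+    """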
+ crawler = Crawler()
+ if args.reap:
+ if args.queries or args.labels:
+ logging.error('Query based ranking of bugs not supported yet.')
+ return
+ queries = ''
+ labels = crawler.all_autofiled_label
+ else:
+ queries = args.queries
+ labels = args.labels
+ crawler.filter_issues(queries=queries, labels=labels,
+ fast=False if queries else True)
+ crawler.dump_issues(int(args.num))
+ logging.warning('\nThis is a truncated list of %s results, use --num %s '
+ 'to get them all. If you want more informative results/better '
+ 'querying capabilities try crbug_shell.py.',
+ args.num, len(crawler.issues))
+
+
+if __name__ == '__main__':
+ args = _parse_args(sys.argv[1:])
+ configure_logging(args.quiet)
+ main(args)
+