#!/usr/bin/python

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script crawls crbug. Sort-of.
Invocation:
    Get all bugs with labels, strings (in summary and/or comments):
        crbug_crawler.py --labels 'one two three'
                --queries '"first query" "second query"'

    Get baddest open bugs of all time:
        crbug_crawler.py --reap

Tips:
    - Label based queries will return faster than text queries.
    - contrib/crbug_shell.py is a wrapper that allows you to incrementally
      filter search results using this script.
"""

import argparse
import cmd
import logging
import sys
import shlex

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting


def _parse_args(args):
    if not args:
        import crbug_crawler
        logging.error('Improper usage of crbug_crawler: %s',
                      crbug_crawler.__doc__)
        sys.exit(1)

    description = ('Usage: crbug_crawler.py --reap')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--quiet', help=('Turn off logging noise.'),
                        action='store_true', default=False)
    parser.add_argument('--num', help='Number of issues to output.', default=10,
                        type=int)
    parser.add_argument('--queries',
                        help=('Search query. Eg: --queries "%s %s"' %
                              ('build_Root', 'login')),
                        default='')
    parser.add_argument('--labels',
                        help=('Search labels. Eg: --labels "%s %s"' %
                              ('autofiled', 'Pri-1')), default=None)
    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
                        action='store_true', default=False)
    return parser.parse_args(args)


class Update(object):
    """Class encapsulating fields of an update to a bug.
    """
    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels='', status=''):
        self.comment = comment
        self.labels = labels if labels else []
        self.status = status


    def __str__(self):
        msg = 'status: %s' % self.status
        if self.labels:
            msg = '%s labels: %s' % (msg, self.labels)
        if self.comment:
            msg = '%s comment: %s' % (msg, self.comment)
        return msg


class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied and is capable
    of reverting it.
    """

    def __init__(self, autocommit=False):
        """Initialize update manager.

        @param autocommit: If False just print out the update instead
            of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit


    def revert(self):
        """Only manages status reverts as of now.
        """
        for issue_id, update in self.history.iteritems():
            logging.warning('You will have to manually update %s and %s on %s',
                            self.present[issue_id].labels,
                            self.present[issue_id].comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))

    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an id is specified an
            issue is constructed. If an issue object (as defined in phapi_lib
            Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue.
        """
        if type(old_issue) == int:
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                            '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update


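# Illustrative sketch, not invoked anywhere in this script: how UpdateManager
# stages and reverts a status change. The issue id and label below are
# hypothetical, and a real run needs valid tracker credentials for
# reporting.Reporter().
def _example_update_and_revert():
    # With autocommit=False the manager only logs what it would have done.
    updater = UpdateManager(autocommit=False)
    updater.update(123456, Update(comment='Bogus example comment',
                                  labels=['Example-Label'],
                                  status='Assigned'))
    # With autocommit=True the previous status is recorded in history and
    # revert() restores it; labels and comments still have to be undone by
    # hand, as revert() warns.
    updater.revert()

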
class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False


    def fuzzy_search(self, query='', label='', fast=True):
        """Returns all issues using one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.
        """
        if not query and not label:
            raise ValueError('Require query or labels to make a tracker query, '
                             'try query = "%s" or one of the predefined labels '
                             '%s' % (self.all_autofiled_query,
                                     self.reporter._PREDEFINED_LABELS))
        if type(label) != str:
            raise ValueError('The crawler only supports one label per query, '
                             'and it must be a string. You supplied %s' % label)
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)


    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                return int(label.replace('autofiled-count-', ''))

        # Force bugs without autofiled-count to sink
        return 0

    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort
        the crawl and specify another set of labels/queries.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index of the next crawl iteration.
        """
        logging.warning('Found %s issues, crawling issues starting from %s',
                        len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning('Already crawled %s issues, it is possible that '
                            'you\'ve specified a very general label. If this '
                            'is the case consider re-ordering the labels so '
                            'they start with the rarest. Continue crawling '
                            '[y/n]?', start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)


    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        @param query: A string representing one query.
        @param label: A string representing one label.

        @return A list of issues sorted by descending autofiled count.
        """
        start_index = 0
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                        label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues += new_issues
            start_index += len(new_issues) + 1
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                                           fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=lambda issue: self._get_autofiled_count(issue))


    @staticmethod
    def filter_labels(issues, labels):
        """Takes a list of labels and returns matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A list of matching issues. The issues must contain
            all the labels specified.
        """
        if not labels:
            return issues
        matching_issues = set([])
        labels = set(labels)
        for issue in issues:
            issue_labels = set(issue.labels)
            if issue_labels.issuperset(labels):
                matching_issues.add(issue)
        return matching_issues


    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, False otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue
        # i.e as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False


    @classmethod
    def filter_queries(cls, issues, queries):
        """Take a list of queries and return matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in the issue's title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A list of matching issues.
        """
        if not queries:
            return issues
        matching_issues = set([])
        for issue in issues:
            # For each query, check if it's in the title, description or
            # comments. If a query isn't in any of these, discard the issue.
            for query in queries:
                if cls.does_query_match(issue, query):
                    matching_issues.add(issue)
                else:
                    if issue in matching_issues:
                        logging.warning('%s: %s\n \tPassed a subset of the '
                                        'queries but failed query %s',
                                        issue.id, issue.title, query)
                        matching_issues.remove(issue)
                    break
        return matching_issues


    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the query and label filters by crawling crbug.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If specified, skip creating comments for issues since this
            can be a slow process. This value is only a suggestion, since it is
            ignored if multiple queries are specified.
        """
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)


    def dump_issues(self, limit=None):
        """Print issues.
        """
        if limit and limit < len(self.issues):
            issues = self.issues[:limit]
        else:
            issues = self.issues
        #TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print ('[%s] %s crbug.com/%s %s' %
                       (self._get_autofiled_count(issue),
                        issue.status, issue.id, issue.title))
            except UnicodeEncodeError:
                print "Unicode error decoding issue id %s" % issue.id
                continue


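# Illustrative sketch, not invoked anywhere in this script: filter_labels and
# filter_queries operate on already-crawled issue objects and never touch the
# tracker, so they can be exercised offline. The _FakeIssue stand-in is
# hypothetical; real issue objects come from the phapi_lib tracker client.
class _FakeIssue(object):
    """Hypothetical stand-in exposing the attributes the filters expect."""

    def __init__(self, issue_id, title, summary, labels, comments):
        self.id = issue_id
        self.title = title
        self.summary = summary
        self.labels = labels
        self.comments = comments


def _example_offline_filtering():
    issues = [_FakeIssue(1, 'network_WiFi login timed out', 'dut unreachable',
                         ['autofiled', 'Pri-1'], ['also seen on stumpy'])]
    # Keep issues carrying all the requested labels, then all the queries.
    issues = Crawler.filter_labels(issues, ['autofiled'])
    return Crawler.filter_queries(issues, ['login'])

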
def _update_test(issues):
    """A simple update test, to record usage.

    @param issues: An iterable of issues (or issue ids) to update and revert.
    """
    updater = UpdateManager(autocommit=True)
    for issue in issues:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()


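# Illustrative sketch, not invoked anywhere in this script: fetch the most
# frequently autofiled bugs directly, similar in spirit to the --reap path in
# main(). Assumes valid tracker credentials are configured for
# reporting.Reporter(); the default count of 10 is arbitrary.
def _example_reap_top_autofiled(count=10):
    crawler = Crawler()
    # exhaustive_crawl returns issues sorted by descending autofiled-count,
    # so the head of the list holds the most frequently hit bugs.
    worst = crawler.exhaustive_crawl(label=crawler.all_autofiled_label)
    return worst[:count]

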
def configure_logging(quiet=False):
    """Configure logging.

    @param quiet: True to turn off warning messages.
    """
    logging.basicConfig()
    logger = logging.getLogger()
    level = logging.WARNING
    if quiet:
        level = logging.ERROR
    logger.setLevel(level)


def main(args):
    crawler = Crawler()
    if args.reap:
        if args.queries or args.labels:
            logging.error('Query based ranking of bugs not supported yet.')
            return
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    crawler.filter_issues(queries=queries, labels=labels,
                          fast=False if queries else True)
    crawler.dump_issues(int(args.num))
    logging.warning('\nThis is a truncated list of %s results, use --num %s '
                    'to get them all. If you want more informative results/'
                    'better querying capabilities try crbug_shell.py.',
                    args.num, len(crawler.issues))


if __name__ == '__main__':
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)