#!/usr/bin/python

# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""
This script crawls crbug. Sort-of.
Invocation:
    Get all bugs with labels, strings (in summary and/or comments):
        crbug_crawler.py --labels 'one two three'
            --queries '"first query" "second query"'

    Get baddest open bugs of all time:
        crbug_crawler.py --reap

Tips:
    - Label based queries will return faster than text queries.
    - contrib/crbug_shell.py is a wrapper that allows you to incrementally
        filter search results using this script.
"""

import argparse
import cmd
import logging
import sys
import shlex

import common
from autotest_lib.client.common_lib import global_config
from autotest_lib.server.cros.dynamic_suite import reporting


def _parse_args(args):
    if not args:
        import crbug_crawler
        logging.error('Improper usage of crbug_crawler: %s',
                      crbug_crawler.__doc__)
        sys.exit(1)

    description = ('Usage: crbug_crawler.py --reap')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--quiet', help=('Turn off logging noise.'),
                        action='store_true', default=False)
    parser.add_argument('--num', help='Number of issues to output.', default=10,
                        type=int)
    parser.add_argument('--queries',
                        help=('Search query. Eg: --queries "%s %s"' %
                              ('build_Root', 'login')),
                        default='')
    parser.add_argument('--labels',
                        help=('Search labels. Eg: --labels "%s %s"' %
                              ('autofiled', 'Pri-1')), default=None)
    parser.add_argument('--reap', help=('Top autofiled bugs ordered by count.'),
                        action='store_true', default=False)
    return parser.parse_args(args)


class Update(object):
    """Class encapsulating fields of an update to a bug.
    """
    open_statuses = ['Unconfirmed', 'Untriaged', 'Available', 'Assigned',
                     'Started', 'ExternalDependency']
    closed_statuses = ['Fixed', 'Verified', 'Duplicate', 'WontFix', 'Archived']

    def __init__(self, comment='', labels='', status=''):
        self.comment = comment
        self.labels = labels if labels else []
        self.status = status


    def __str__(self):
        msg = 'status: %s' % self.status
        if self.labels:
            msg = '%s labels: %s' % (msg, self.labels)
        if self.comment:
            msg = '%s comment: %s' % (msg, self.comment)
        return msg


class UpdateManager(object):
    """Update manager that allows you to revert status updates.

    This class keeps track of the last update applied and is capable
    of reverting it.
    """
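
    # A minimal usage sketch (the issue id 123 below is hypothetical). With
    # autocommit=False the update is only logged; with autocommit=True it is
    # committed through reporting.Reporter and can later be undone with
    # revert():
    #
    #   manager = UpdateManager(autocommit=True)
    #   manager.update(123, Update(comment='triaging', status='Assigned'))
    #   manager.revert()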

    def __init__(self, autocommit=False):
        """Initialize update manager.

        @param autocommit: If False just print out the update instead
            of committing it.
        """
        self.history = {}
        self.present = {}
        self.reporter = reporting.Reporter()
        self.phapi_lib = self.reporter.get_bug_tracker_client()
        self.autocommit = autocommit


    def revert(self):
        """Only manages status reverts as of now.
        """
        for issue_id, update in self.history.iteritems():
            logging.warning('You will have to manually update %s and %s on %s',
                            self.present[issue_id].labels,
                            self.present[issue_id].comment, issue_id)
            # Create a new update with just the status.
            self.update(issue_id, Update(status=update.status))


    def update(self, old_issue, update):
        """Record the state of an issue before updating it.

        @param old_issue: The issue to update. If an id is specified an
            issue is constructed. If an issue object (as defined in phapi_lib
            Issue) is passed in, it is used directly.
        @param update: The Update object to apply to the issue.
        """
        if type(old_issue) == int:
            old_issue = self.phapi_lib.get_tracker_issue_by_id(old_issue)
        old_update = Update(
                labels=old_issue.labels, status=old_issue.status)

        if not update.status:
            update.status = old_update.status
        elif (update.status not in Update.open_statuses and
              update.status not in Update.closed_statuses):
            raise ValueError('Unknown status %s' % update.status)

        if not self.autocommit:
            logging.warning('Would have applied the following update: '
                            '%s -> %s', old_update, update)
            return

        self.history[old_issue.id] = old_update
        self.reporter.modify_bug_report(
                issue_id=old_issue.id, comment=update.comment,
                label_update=update.labels,
                status=update.status)
        self.present[old_issue.id] = update


class Crawler(object):
    """Class capable of crawling crbug.

    This class applies filters to issues it crawls and caches them locally.
    """

    # The limit at which we ask for confirmation to proceed with the crawl.
    PROMPT_LIMIT = 2000
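
    # A minimal usage sketch (the query and label values below are examples
    # only):
    #
    #   crawler = Crawler()
    #   crawler.filter_issues(queries='"login failed"', labels='autofiled')
    #   crawler.dump_issues(limit=10)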

    def __init__(self):
        self.reporter = reporting.Reporter()
        self.phapi_client = self.reporter.get_bug_tracker_client()
        self.issues = None
        self.all_autofiled_query = 'ANCHOR TestFailure'
        self.all_autofiled_label = 'autofiled'
        self.prompted = False


    def fuzzy_search(self, query='', label='', fast=True):
        """Returns all issues using one query and/or one label.

        @param query: A string representing the query.
        @param label: A string representing the label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues matching the query. If fast is
            specified the issues won't have comments.
        """
        if not query and not label:
            raise ValueError('Require query or labels to make a tracker query, '
                             'try query = "%s" or one of the predefined labels '
                             '%s' % (self.all_autofiled_query,
                                     self.reporter._PREDEFINED_LABELS))
        if type(label) != str:
            raise ValueError('The crawler only supports one label per query, '
                             'and it must be a string. You supplied %s' % label)
        return self.phapi_client.get_tracker_issues_by_text(
                query, label=label, full_text=not fast)


    @staticmethod
    def _get_autofiled_count(issue):
        """Return the autofiled count.

        @param issue: An issue object that has labels.

        @return: An integer representing the autofiled count.
        """
        for label in issue.labels:
            if 'autofiled-count-' in label:
                return int(label.replace('autofiled-count-', ''))

        # Force bugs without autofiled-count to sink
        return 0


    def _prompt_crawl(self, new_issues, start_index):
        """Warn the user that a crawl is getting large.

        This method prompts for a y/n answer in case the user wants to abort the
        crawl and specify another set of labels/queries.

        @param new_issues: A list of issues used with the start_index to
            determine the number of issues already processed.
        @param start_index: The start index of the next crawl iteration.
        """
        logging.warning('Found %s issues. Crawling issues starting from %s',
                        len(new_issues), start_index)
        if start_index > self.PROMPT_LIMIT and not self.prompted:
            logging.warning('Already crawled %s issues, it is possible that '
                            'you\'ve specified a very general label. If this '
                            'is the case consider re-ordering the labels so '
                            'they start with the rarest. Continue crawling '
                            '[y/n]?', start_index + len(new_issues))
            self.prompted = raw_input() == 'y'
            if not self.prompted:
                sys.exit(0)


    def exhaustive_crawl(self, query='', label='', fast=True):
        """Perform an exhaustive crawl using one label and query string.

        @param query: A string representing one query.
        @param label: A string representing one label.
        @param fast: If true, don't bother fetching comments.

        @return: A list of issues sorted by descending autofiled count.
        """
        start_index = 0
        self.phapi_client.set_max_results(200)
        logging.warning('Performing an exhaustive crawl with label %s query %s',
                        label, query)
        vague_issues = []
        new_issues = self.fuzzy_search(query=query, label=label, fast=fast)
        while new_issues:
            vague_issues += new_issues
            start_index += len(new_issues) + 1
            self.phapi_client.set_start_index(start_index)
            new_issues = self.fuzzy_search(query=query, label=label,
                                           fast=fast)
            self._prompt_crawl(new_issues, start_index)

        # Subsequent calls will clear the issues cache with new results.
        self.phapi_client.set_start_index(1)
        return sorted(vague_issues, reverse=True,
                      key=lambda issue: self._get_autofiled_count(issue))


    @staticmethod
    def filter_labels(issues, labels):
        """Takes a list of labels and returns matching issues.

        @param issues: A list of issues to parse for labels.
        @param labels: A list of labels to match.

        @return: A list of matching issues. The issues must contain
            all the labels specified.
        """
        if not labels:
            return issues
        matching_issues = set([])
        labels = set(labels)
        for issue in issues:
            issue_labels = set(issue.labels)
            if issue_labels.issuperset(labels):
                matching_issues.add(issue)
        return matching_issues


    @classmethod
    def does_query_match(cls, issue, query):
        """Check if a query matches the given issue.

        @param issue: The issue to check.
        @param query: The query to check against.

        @return: True if the query matches, False otherwise.
        """
        if query in issue.title or query in issue.summary:
            return True
        # We can only search comments if the issue is a complete issue,
        # i.e. as defined in phapi_lib.Issue.
        try:
            if any(query in comment for comment in issue.comments):
                return True
        except (AttributeError, TypeError):
            pass
        return False


    @classmethod
    def filter_queries(cls, issues, queries):
        """Takes a list of queries and returns matching issues.

        @param issues: A list of issues to parse. If the issues contain
            comments and a query is not in the issue's title or summary,
            the comments are parsed for a substring match.
        @param queries: A list of queries to parse the issues for.
            This method looks for an exact substring match within each issue.

        @return: A list of matching issues.
        """
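        # Illustration (hypothetical values): a query of 'login' matches an
        # issue titled 'login failed on device', but 'Login' would not, since
        # the check is an exact, case-sensitive substring match.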
        if not queries:
            return issues
        matching_issues = set([])
        for issue in issues:
            # For each query, check if it's in the title, description or
            # comments. If a query isn't in any of these, discard the issue.
            for query in queries:
                if cls.does_query_match(issue, query):
                    matching_issues.add(issue)
                else:
                    if issue in matching_issues:
                        logging.warning('%s: %s\n \tPassed a subset of the '
                                        'queries but failed query %s',
                                        issue.id, issue.title, query)
                        matching_issues.remove(issue)
                    break
        return matching_issues


    def filter_issues(self, queries='', labels=None, fast=True):
        """Run the query and label filters by crawling crbug.

        @param queries: A space separated string of queries, usually passed
            through the command line.
        @param labels: A space separated string of labels, usually passed
            through the command line.
        @param fast: If specified, skip creating comments for issues since this
            can be a slow process. This value is only a suggestion, since it is
            ignored if multiple queries are specified.
        """
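        # Illustration of the splitting behavior (the input string below is a
        # hypothetical example): shlex.split('"first query" login') yields
        # ['first query', 'login'], so quoted phrases survive as single
        # queries.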
        queries = shlex.split(queries)
        labels = shlex.split(labels) if labels else None

        # We'll need comments to filter multiple queries.
        if len(queries) > 1:
            fast = False
        matching_issues = self.exhaustive_crawl(
                query=queries.pop(0) if queries else '',
                label=labels.pop(0) if labels else '', fast=fast)
        matching_issues = self.filter_labels(matching_issues, labels)
        matching_issues = self.filter_queries(matching_issues, queries)
        self.issues = list(matching_issues)


    def dump_issues(self, limit=None):
        """Print issues.
        """
        if limit and limit < len(self.issues):
            issues = self.issues[:limit]
        else:
            issues = self.issues
        # TODO: Modify formatting, include some paging etc.
        for issue in issues:
            try:
                print ('[%s] %s crbug.com/%s %s' %
                       (self._get_autofiled_count(issue),
                        issue.status, issue.id, issue.title))
            except UnicodeEncodeError:
                print 'Unicode error encoding issue id %s' % issue.id
                continue


def _update_test(issues):
    """A simple update test, to record usage.

    @param issues: A list of issues to apply a bogus update to and revert.
    """
    updater = UpdateManager(autocommit=True)
    for issue in issues:
        updater.update(issue,
                       Update(comment='this is bogus', labels=['bogus'],
                              status='Assigned'))
    updater.revert()


def configure_logging(quiet=False):
    """Configure logging.

    @param quiet: True to turn off warning messages.
    """
    logging.basicConfig()
    logger = logging.getLogger()
    level = logging.WARNING
    if quiet:
        level = logging.ERROR
    logger.setLevel(level)


def main(args):
    crawler = Crawler()
    if args.reap:
        if args.queries or args.labels:
            logging.error('Query based ranking of bugs not supported yet.')
            return
        queries = ''
        labels = crawler.all_autofiled_label
    else:
        queries = args.queries
        labels = args.labels
    crawler.filter_issues(queries=queries, labels=labels,
                          fast=False if queries else True)
    crawler.dump_issues(int(args.num))
    logging.warning('\nThis is a truncated list of %s results, use --num %s '
                    'to get them all. If you want more informative results/better '
                    'querying capabilities try crbug_shell.py.',
                    args.num, len(crawler.issues))


if __name__ == '__main__':
    args = _parse_args(sys.argv[1:])
    configure_logging(args.quiet)
    main(args)