#!/usr/bin/python
#
# Copyright (c) 2014 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import datetime as datetime_base
import itertools
import logging
from datetime import datetime

import common

from autotest_lib.client.common_lib import global_config
from autotest_lib.client.common_lib import host_states
from autotest_lib.client.common_lib import time_utils
from autotest_lib.server import utils
from autotest_lib.server.cros.dynamic_suite import reporting_utils
from autotest_lib.server.lib import status_history

# Module-level alias for the global_config.global_config object.
CONFIG = global_config.global_config


class BoardNotAvailableError(utils.TestLabException):
    """Raised when a board is not available in the lab."""


class NotEnoughDutsError(utils.TestLabException):
    """Raised when the lab doesn't have the minimum number of duts."""

    def __init__(self, board, pool, num_available, num_required, hosts):
        """Initialize instance.

        Please pass arguments by keyword.

        @param board: Name of board.
        @param pool: Name of pool.
        @param num_available: Number of available hosts.
        @param num_required: Number of hosts required.
        @param hosts: Sequence of Host instances for given board and pool.
        """
        self.board = board
        self.pool = pool
        self.num_available = num_available
        self.num_required = num_required
        self.hosts = hosts
        # Optional context, filled in afterwards through add_bug_id(),
        # add_suite_name() and add_build().
        self.bug_id = None
        self.suite_name = None
        self.build = None


    def __repr__(self):
        """Return a debugging representation of this exception.

        The representation lists every attribute except `hosts`.
        """
        return (
            '<{cls} at 0x{id:x} with'
            ' board={this.board!r},'
            ' pool={this.pool!r},'
            ' num_available={this.num_available!r},'
            ' num_required={this.num_required!r},'
            ' bug_id={this.bug_id!r},'
            ' suite_name={this.suite_name!r},'
            ' build={this.build!r}>'
            .format(cls=type(self).__name__, id=id(self), this=self)
        )


    def __str__(self):
        """Return a one-line, human readable description of the shortage.

        The bug link, suite name and build are only appended when the
        corresponding attribute has been set.
        """
        msg_parts = [
            'Not enough DUTs for board: {this.board}, pool: {this.pool};'
            ' required: {this.num_required}, found: {this.num_available}'
        ]
        format_dict = {'this': self}
        if self.bug_id is not None:
            msg_parts.append('bug: {bug_url}')
            format_dict['bug_url'] = reporting_utils.link_crbug(self.bug_id)
        if self.suite_name is not None:
            msg_parts.append('suite: {this.suite_name}')
        if self.build is not None:
            msg_parts.append('build: {this.build}')
        # The parts are templates; they are joined first and formatted once.
        return ', '.join(msg_parts).format(**format_dict)


    def add_bug_id(self, bug_id):
        """Add crbug id associated with this exception.

        @param bug_id: crbug id whose str() value is used in a crbug URL.
        """
        self.bug_id = bug_id


    def add_suite_name(self, suite_name):
        """Add name of test suite that needed the DUTs.

        @param suite_name: Name of test suite.
        """
        self.suite_name = suite_name


    def add_build(self, build):
        """Add name of build of job that needed the DUTs.

        @param build: Name of build.
        """
        self.build = build


class SimpleTimer(object):
    """A simple timer used to periodically check if a deadline has passed."""

    def __init__(self, interval_hours=0.5):
        """Initialize a simple periodic deadline timer.

        @param interval_hours: Interval between deadlines, in hours. A falsy
                (0, None) or negative value leaves the timer without a
                deadline, in which case poll() always returns False.
        """
        self.interval_hours = interval_hours
        self._reset()


    def _reset(self):
        """Reset the deadline to `interval_hours` from now.

        If the interval is falsy or negative an error is logged and the
        deadline is cleared instead.
        """
        if not self.interval_hours or self.interval_hours < 0:
            logging.error('Bad interval %s', self.interval_hours)
            self.deadline = None
            return
        self.deadline = datetime.now() + datetime_base.timedelta(
                hours=self.interval_hours)


    def poll(self):
        """Poll the timer to see if we've hit the deadline.

        This method resets the deadline if it has passed. If the deadline
        hasn't been set, or the current time is less than the deadline, the
        method returns False.

        @return: True if the deadline has passed, False otherwise.
        """
        if not self.deadline or datetime.now() < self.deadline:
            return False
        self._reset()
        return True


class JobTimer(object):
    """Utility class capable of measuring job timeouts."""

    # Format used in datetime - string conversion.
    time_format = '%m-%d-%Y [%H:%M:%S]'

    def __init__(self, job_created_time, timeout_mins):
        """JobTimer constructor.

        @param job_created_time: float representing the time a job was
                                 created. Eg: time.time()
        @param timeout_mins: float representing the timeout in minutes.
        """
        self.job_created_time = datetime.fromtimestamp(job_created_time)
        # Despite the name, this attribute holds a timedelta, not a number
        # of hours.
        self.timeout_hours = datetime_base.timedelta(hours=timeout_mins/60.0)
        self.debug_output_timer = SimpleTimer(interval_hours=0.5)
        self.past_halftime = False


    @classmethod
    def format_time(cls, datetime_obj):
        """Get the string formatted version of the datetime object.

        @param datetime_obj: A datetime.datetime object.
                             Eg: datetime.datetime.now()

        @return: A formatted string containing the date/time of the
                 input datetime.
        """
        return datetime_obj.strftime(cls.time_format)


    def elapsed_time(self):
        """Get the time elapsed since this job was created.

        @return: A timedelta object representing the elapsed time.
        """
        return datetime.now() - self.job_created_time


    def is_suite_timeout(self):
        """Check if the suite timed out.

        Logs the start and timeout times when the timeout has been reached.

        @return: True if at least timeout_hours has elapsed since the suite
                 job was created.
        """
        if self.elapsed_time() >= self.timeout_hours:
            logging.info('Suite timed out. Started on %s, timed out on %s',
                         self.format_time(self.job_created_time),
                         self.format_time(datetime.now()))
            return True
        return False


    def first_past_halftime(self):
        """Check if we just crossed half time.

        This method will only return True once, the first time it is called
        after a job's elapsed time is past half its timeout.

        @return: True if this is the first call of the method after halftime.
        """
        if (not self.past_halftime and
                self.elapsed_time() > self.timeout_hours/2):
            self.past_halftime = True
            return True
        return False


class RPCHelper(object):
    """A class to help diagnose a suite run through the rpc interface."""

    def __init__(self, rpc_interface):
        """Constructor for rpc helper class.

        @param rpc_interface: An rpc object, eg: A RetryingAFE instance.
        """
        self.rpc_interface = rpc_interface


    def diagnose_pool(self, board, pool, time_delta_hours, limit=10):
        """Log diagnostic information about a timeout for a board/pool.

        One error-level log entry is written per host, listing its status,
        lock state, diagnosis, labels and up to `limit` of its jobs.

        @param board: The board for which the current suite was run.
        @param pool: The pool against which the current suite was run.
        @param time_delta_hours: The time from which we should log information.
            This is a datetime.timedelta object, as stored by the JobTimer.
        @param limit: The maximum number of jobs per host, to log.

        @raises proxy.JSONRPCException: For exceptions thrown across the wire.
        """
        end_time = datetime.now()
        start_time = end_time - time_delta_hours
        get_histories = status_history.HostJobHistory.get_multiple_histories
        host_histories = get_histories(
                self.rpc_interface,
                time_utils.to_epoch_time(start_time),
                time_utils.to_epoch_time(end_time),
                board=board, pool=pool)
        if not host_histories:
            logging.error('No hosts found for board:%s in pool:%s',
                          board, pool)
            return
        status_map = {
            status_history.UNUSED: 'Unused',
            status_history.UNKNOWN: 'No job history',
            status_history.WORKING: 'Working',
            status_history.BROKEN: 'Failed repair'
        }
        # islice() rejects negative counts, so clamp the limit at zero.
        max_jobs = max(limit, 0)
        for history in host_histories:
            job_info = ''.join(
                    '%s %s started on: %s status %s\n' % (
                            job.id, job.name,
                            time_utils.epoch_time_to_date_string(
                                    job.start_time),
                            job.job_status)
                    for job in itertools.islice(history, max_jobs))
            host = history.host
            logging.error('host: %s, status: %s, locked: %s '
                          'diagnosis: %s\n'
                          'labels: %s\nLast %s jobs within %s:\n'
                          '%s',
                          history.hostname, host.status, host.locked,
                          status_map[history.last_diagnosis()[0]],
                          host.labels, limit, time_delta_hours,
                          job_info)


    def _is_host_available(self, host):
        """Check whether DUT host is available.

        A host counts as available when it is not locked and its status is
        not one of host_states.UNAVAILABLE_STATES.

        @param host: The Host instance for the DUT.
        @return: bool
        """
        return not (host.locked or
                    host.status in host_states.UNAVAILABLE_STATES)


    def check_dut_availability(self, board, pool, minimum_duts=0,
                               skip_duts_check=False):
        """Check if DUT availability for a given board and pool is less than
        minimum.

        @param board: The board to check DUT availability.
        @param pool: The pool to check DUT availability.
        @param minimum_duts: Minimum Number of available machines required to
                             run the suite. Default is set to 0, which means do
                             not force the check of available machines before
                             running the suite. With 0, the rpc interface is
                             not queried at all.
        @param skip_duts_check: If True, skip minimum available DUTs check.
                                The check for the existence of hosts for the
                                board/pool is still performed.
        @raise: NotEnoughDutsError if DUT availability is lower than minimum.
        @raise: BoardNotAvailableError if no host found for requested
                board/pool.
        """
        if minimum_duts == 0:
            return

        # TODO(ayatane): Replace label prefixes with constants in
        # site_utils.suite_scheduler.constants
        hosts = self.rpc_interface.get_hosts(
                invalid=False,
                multiple_labels=('pool:%s' % pool, 'board:%s' % board))
        if not hosts:
            raise BoardNotAvailableError(
                    'No hosts found for board:%s in pool:%s. The test lab '
                    'currently does not cover test for this board and pool.'%
                    (board, pool))

        if skip_duts_check:
            # Bypass minimum available DUTs check
            logging.debug(
                    'skip_duts_check is on, do not enforce minimum DUTs check.')
            return

        if len(hosts) < minimum_duts:
            # Only logged here; the raise below will follow, since the
            # available count can never exceed the total.
            logging.debug('The total number of DUTs for %s in pool:%s is %d, '
                          'which is less than %d, the required minimum number of'
                          ' available DUTS', board, pool, len(hosts),
                          minimum_duts)

        available_hosts = sum(
                1 for host in hosts if self._is_host_available(host))
        logging.debug('%d of %d DUTs are available for board %s pool %s.',
                      available_hosts, len(hosts), board, pool)
        if available_hosts < minimum_duts:
            raise NotEnoughDutsError(
                    board=board,
                    pool=pool,
                    num_available=available_hosts,
                    num_required=minimum_duts,
                    hosts=hosts)


    def diagnose_job(self, job_id, instance_server):
        """Diagnose a suite job.

        Logs information about the jobs that are still to run in the suite.

        @param job_id: The id of the suite job to get information about.
            No meaningful information gets logged if the id is for a sub-job.
        @param instance_server: The instance server.
            Eg: cautotest, cautotest-cq, localhost.
        """
        incomplete_jobs = self.rpc_interface.get_jobs(
                parent_job_id=job_id, summary=True,
                hostqueueentry__complete=False)
        if incomplete_jobs:
            logging.info('\n%s printing summary of incomplete jobs (%s):\n',
                         JobTimer.format_time(datetime.now()),
                         len(incomplete_jobs))
            for job in incomplete_jobs:
                # Log only the last path component of the test name.
                logging.info('%s: %s', job.testname.rsplit('/', 1)[-1],
                             reporting_utils.link_job(job.id, instance_server))
        else:
            logging.info('All jobs in suite have already completed.')