// blob: c1c2259ecf96f49bd2de3282a9938760598d1041 [file] [log] [blame]
/*
* Copyright (C) 2022 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <chrono>
#include <cstdint>
#include <future>
#include <memory>
#include <optional>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <variant>

#include "base/ConditionVariable.h"
#include "base/Lock.h"
#include "base/Metrics.h"
#include "base/Thread.h"
#include "host-common/logging.h"
using android::base::EventHangMetadata;
// Builds a std::unique_ptr<android::base::EventHangMetadata> describing the call site.
// The expansion passes __FILE__, __func__ and __LINE__ of the place where the macro is
// used, followed by the caller-supplied `msg`, `hangType` and `data`, to the
// EventHangMetadata constructor. Because it expands `__func__`, it must be used inside a
// function body.
// The metadata type is spelled fully qualified so that the expansion does not depend on
// which using-declarations happen to be visible at the expansion site.
#define WATCHDOG_DATA(msg, hangType, data)                                                   \
    std::make_unique<::android::base::EventHangMetadata>(__FILE__, __func__, msg, __LINE__, \
                                                         hangType, data)
namespace emugl {
using android::base::ConditionVariable;
using android::base::Lock;
using android::base::MetricsLogger;
using std::chrono::duration;
using std::chrono::steady_clock;
using std::chrono::time_point;
// Default heartbeat interval and default task timeout, both in milliseconds.
// Declared `inline constexpr` so that they are true compile-time constants with a single
// definition across all translation units that include this header (a plain `static`
// variable would give every translation unit its own writable copy).
inline constexpr uint64_t kDefaultIntervalMs = 1'000;
inline constexpr uint64_t kDefaultTimeoutMs = 5'000;
// HealthMonitor provides the ability to register arbitrary start/touch/stop events associated
// with client defined tasks. At some pre-defined interval, it will periodically consume
// all logged events to assess whether the system is hanging on any task. Via the
// MetricsLogger, it will log hang and unhang events when it detects tasks hanging/resuming.
// TODO: willho@ Integrate with crashpad to upload host dumps when a hang is detected.
// Design doc: http://go/gfxstream-health-monitor
template <class Clock = steady_clock>
class HealthMonitor : public android::base::Thread {
   public:
    // Alias for task id.
    using Id = uint64_t;

    // Constructor.
    // `metricsLogger` is the logger used to report hang/unhang events. The class holds a
    // `MetricsLogger&` member (`mLogger`), so the logger presumably has to outlive the
    // monitor - confirm against the constructor definition.
    // `heartbeatInterval` is the interval, in milliseconds, that the thread will sleep for
    // in between health checks.
    // The constructor is explicit so that a MetricsLogger is never implicitly converted
    // into a HealthMonitor.
    explicit HealthMonitor(MetricsLogger& metricsLogger,
                           uint64_t heartbeatInterval = kDefaultIntervalMs);

    // Destructor.
    // Enqueues an event to end monitoring and waits on thread to process remaining queued
    // events.
    ~HealthMonitor();

    // Start monitoring a task. Returns an id that is used for touch and stop operations.
    // `metadata` is a struct containing info on the task watchdog to be passed through to
    // the metrics logger.
    // `timeout` is the duration in milliseconds a task is allowed to run before it's
    // considered "hung". `timeout` must be larger than the monitor's heartbeat interval, as
    // shorter timeout periods would not be detected.
    // NOTE(review): an earlier revision of this comment said the effective timeout is the
    // "lesser" of `timeout` and twice the heartbeat interval, which contradicts the
    // requirement above; presumably too-small values are raised to twice the heartbeat
    // interval - confirm against the implementation.
    Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,
                           uint64_t timeout = kDefaultTimeoutMs);

    // Touch a monitored task. Resets the timeout countdown for that task.
    void touchMonitoredTask(Id id);

    // Stop monitoring a task.
    void stopMonitoringTask(Id id);

   private:
    // Time types are derived from the Clock template parameter so that a different clock
    // can be substituted via the template argument.
    using Duration = typename Clock::duration;
    using Timestamp = time_point<Clock, Duration>;

    // Allow test class access to private functions
    friend class HealthMonitorTest;

    // Payload types for the events that are placed on `mEventQueue`.
    struct MonitoredEventType {
        // A task started being monitored.
        struct Start {
            Id id;
            std::unique_ptr<EventHangMetadata> metadata;
            Timestamp timeOccurred;
            Duration timeoutThreshold;
        };
        // A monitored task was touched.
        struct Touch {
            Id id;
            Timestamp timeOccurred;
        };
        // A monitored task stopped being monitored.
        struct Stop {
            Id id;
            Timestamp timeOccurred;
        };
        // Request to end monitoring (see the destructor).
        struct EndMonitoring {};
        // Explicit wake-up request (see poll()); `complete` backs the future that poll()
        // hands out.
        struct Poll {
            std::promise<void> complete;
        };
    };

    // A queued event; std::monostate is the empty alternative.
    using MonitoredEvent =
        std::variant<std::monostate, typename MonitoredEventType::Start,
                     typename MonitoredEventType::Touch, typename MonitoredEventType::Stop,
                     typename MonitoredEventType::EndMonitoring,
                     typename MonitoredEventType::Poll>;

    // Bookkeeping kept for each task that is currently being monitored.
    struct MonitoredTask {
        Id id;
        Timestamp timeoutTimestamp;
        Duration timeoutThreshold;
        // Empty unless a hang has been recorded for the task - presumably the time at which
        // the hang was detected; confirm against main().
        std::optional<Timestamp> hungTimestamp;
        std::unique_ptr<EventHangMetadata> metadata;
    };

    // Thread's main loop
    intptr_t main() override;

    // Explicitly wake the monitor thread. Returns a future that can be used to wait until
    // the poll event has been processed.
    std::future<void> poll();

    // Immutable. Multi-thread access is safe.
    const Duration mInterval;

    // Members accessed only on the worker thread. Not protected by mutex.
    int mHungTasks = 0;
    MetricsLogger& mLogger;
    std::unordered_map<Id, MonitoredTask> mMonitoredTasks;

    // Lock and cv control access to queue and id counter
    android::base::ConditionVariable mCv;
    Lock mLock;
    Id mNextId = 0;
    std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue;
};
// Scoped guard for a monitored task: the constructor registers the task with the given
// HealthMonitor via startMonitoringTask() and the destructor unregisters it via
// stopMonitoringTask(), so the task is monitored for exactly the lifetime of this object.
// The watchdog only holds a reference to the monitor, so the monitor must outlive it.
template <class Clock = steady_clock>
class HealthWatchdog {
   public:
    // `healthMonitor` is the monitor to register with.
    // `metadata` and `timeout` are forwarded unchanged to
    // HealthMonitor::startMonitoringTask().
    HealthWatchdog(HealthMonitor<Clock>& healthMonitor, std::unique_ptr<EventHangMetadata> metadata,
                   uint64_t timeout = kDefaultTimeoutMs)
        // mId is declared before mHealthMonitor, so it is initialized first; use the
        // constructor parameter (not the member) to start monitoring.
        : mId(healthMonitor.startMonitoringTask(std::move(metadata), timeout)),
          mHealthMonitor(healthMonitor) {}

    // Not copyable: a copy would share the task id and its destructor would call
    // stopMonitoringTask() a second time for the same task.
    HealthWatchdog(const HealthWatchdog&) = delete;
    HealthWatchdog& operator=(const HealthWatchdog&) = delete;

    // Stops monitoring the task that was registered in the constructor.
    ~HealthWatchdog() { mHealthMonitor.stopMonitoringTask(mId); }

    // Touches the monitored task (see HealthMonitor::touchMonitoredTask()).
    void touch() { mHealthMonitor.touchMonitoredTask(mId); }

   private:
    typename HealthMonitor<Clock>::Id mId;
    HealthMonitor<Clock>& mHealthMonitor;
};
} // namespace emugl