// blob: 447b7182583b1abe8196b4e414782276ea3e4032 [file] [log] [blame]
/*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "host/libs/process_monitor/process_monitor.h"

#ifdef __linux__
#include <sys/prctl.h>
#endif
#include <sys/types.h>
#include <sys/wait.h>

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <future>
#include <memory>
#include <string>
#include <thread>

#include <android-base/file.h>
#include <android-base/logging.h>

#include "common/libs/fs/shared_buf.h"
#include "common/libs/fs/shared_select.h"
#include "common/libs/utils/result.h"
#include "common/libs/utils/subprocess.h"
#include "host/libs/config/cuttlefish_config.h"
#include "host/libs/config/known_paths.h"
#include "host/libs/process_monitor/process_monitor_channel.h"
namespace cuttlefish {
namespace {
// Logs how a monitored subprocess terminated, decoding a wait()-style status
// word:
//  - normal exit: INFO, with the exit code;
//  - killed by a signal: ERROR, with the signal description and number;
//  - anything else: INFO, reported as an unknown reason.
void LogSubprocessExit(const std::string& name, pid_t pid, int wstatus) {
  LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;

  if (WIFEXITED(wstatus)) {
    LOG(INFO) << "Subprocess " << name << " (" << pid
              << ") has exited with exit code " << WEXITSTATUS(wstatus);
    return;
  }

  if (WIFSIGNALED(wstatus)) {
    const int signal_number = WTERMSIG(wstatus);
    LOG(ERROR) << "Subprocess " << name << " (" << pid
               << ") was interrupted by a signal '"
               << strsignal(signal_number) << "' (" << signal_number << ")";
    return;
  }

  LOG(INFO) << "subprocess " << name << " (" << pid
            << ") has exited for unknown reasons";
}
// Logs how a monitored subprocess terminated, based on the siginfo_t filled in
// by a waitid()-style call. si_code selects the message:
//  - CLD_EXITED: INFO, si_status is reported as the exit code;
//  - CLD_KILLED: ERROR, si_status is reported as the terminating signal;
//  - anything else: INFO, with the raw code and status for diagnosis.
void LogSubprocessExit(const std::string& name, const siginfo_t& infop) {
  LOG(INFO) << "Detected unexpected exit of monitored subprocess " << name;

  switch (infop.si_code) {
    case CLD_EXITED:
      LOG(INFO) << "Subprocess " << name << " (" << infop.si_pid
                << ") has exited with exit code " << infop.si_status;
      break;
    case CLD_KILLED:
      LOG(ERROR) << "Subprocess " << name << " (" << infop.si_pid
                 << ") was interrupted by a signal '"
                 << strsignal(infop.si_status) << "' (" << infop.si_status
                 << ")";
      break;
    default:
      LOG(INFO) << "subprocess " << name << " (" << infop.si_pid
                << ") has exited for unknown reasons (code = "
                << infop.si_code << ", status = " << infop.si_status << ")";
      break;
  }
}
// Starts every entry's command, in vector order, each in its own process
// group, and stores the resulting Subprocess in the entry's `proc`.
//
// Returns an error naming the first subprocess that failed to start. Entries
// after the failing one are left untouched (their `proc` is not assigned), so
// callers must not go on to monitor `entries` after a failure.
Result<void> StartSubprocesses(std::vector<MonitorEntry>& entries) {
  LOG(DEBUG) << "Starting monitored subprocesses";
  for (auto& monitored : entries) {
    const std::string name = monitored.cmd->GetShortName();
    LOG(INFO) << name;
    auto options = SubprocessOptions().InGroup(true);
    monitored.proc =
        std::make_unique<Subprocess>(monitored.cmd->Start(options));
    CF_EXPECT(monitored.proc->Started(),
              "Failed to start subprocess \"" << name << "\"");
  }
  return {};
}
// Reaps exited children with wait() until `running` becomes false and reacts
// to the exit of monitored subprocesses.
//
//  running               cleared by another thread to request termination;
//                        that thread is expected to make wait() return (the
//                        socket loop forks a short-lived child to do so).
//  properties_mutex      guards `monitored`; held from the lookup of the
//                        exited pid until the end of that iteration.
//  restart_subprocesses  when true an exited subprocess is started again;
//                        otherwise its entry is removed and, if it was
//                        critical, the stop binary is launched to tear
//                        everything down.
//  monitored             the entries being supervised.
//
// Returns an error if wait() fails for any reason other than being
// interrupted by a signal.
Result<void> MonitorLoop(const std::atomic_bool& running,
                         std::mutex& properties_mutex,
                         const bool restart_subprocesses,
                         std::vector<MonitorEntry>& monitored) {
  while (running.load()) {
    int wstatus;
    pid_t pid = wait(&wstatus);
    int error_num = errno;
    if (pid == -1 && error_num == EINTR) {
      // A signal handler ran while we were blocked; that is not a failure of
      // the monitor. Re-check `running` and wait again.
      continue;
    }
    CF_EXPECT(pid != -1, "Wait failed: " << strerror(error_num));
    if (!WIFSIGNALED(wstatus) && !WIFEXITED(wstatus)) {
      LOG(DEBUG) << "Unexpected status from wait: " << wstatus << " for pid "
                 << pid;
      continue;
    }
    if (!running.load()) {  // Avoid extra restarts near the end
      break;
    }
    // Entries without a Subprocess (never started) can't match any pid and
    // must not be dereferenced.
    auto matches = [pid](const auto& entry) {
      return entry.proc && entry.proc->pid() == pid;
    };
    std::unique_lock lock(properties_mutex);
    auto it = std::find_if(monitored.begin(), monitored.end(), matches);
    if (it == monitored.end()) {
      // As a subreaper this process also reaps descendants it never started.
      LogSubprocessExit("(unknown)", pid, wstatus);
      continue;
    }
    LogSubprocessExit(it->cmd->GetShortName(), it->proc->pid(), wstatus);
    if (restart_subprocesses) {
      auto options = SubprocessOptions().InGroup(true);
      // in the future, cmd->Start might not run exec()
      it->proc = std::make_unique<Subprocess>(it->cmd->Start(options));
      if (!it->proc->Started()) {
        LOG(ERROR) << "Failed to restart subprocess "
                   << it->cmd->GetShortName();
      }
    } else {
      bool is_critical = it->is_critical;
      monitored.erase(it);
      if (running.load() && is_critical) {
        LOG(ERROR) << "Stopping all monitored processes due to unexpected "
                      "exit of critical process";
        Command stop_cmd(StopCvdBinary());
        stop_cmd.Start();
      }
    }
  }
  return {};
}
// Stops every monitored subprocess and waits for it to exit.
//
// Subprocesses are stopped in the reverse of the order in which they appear
// in `monitored` (the order they were started in). A failure to stop or to
// wait for one subprocess is logged and does not prevent the remaining ones
// from being stopped; an error is returned at the end if any of them failed.
Result<void> StopSubprocesses(std::vector<MonitorEntry>& monitored) {
  LOG(DEBUG) << "Stopping monitored subprocesses";
  auto stop = [](const auto& entry) {
    auto stop_result = entry.proc->Stop();
    if (stop_result == StopperResult::kStopFailure) {
      LOG(WARNING) << "Error in stopping \"" << entry.cmd->GetShortName()
                   << "\"";
      return false;
    }
    siginfo_t infop = {};
    auto success = entry.proc->Wait(&infop, WEXITED);
    if (success < 0) {
      LOG(WARNING) << "Failed to wait for process "
                   << entry.cmd->GetShortName();
      return false;
    }
    if (stop_result == StopperResult::kStopCrash) {
      LogSubprocessExit(entry.cmd->GetShortName(), infop);
    }
    return true;
  };
  // Stopping has side effects and its order matters, so walk the vector
  // explicitly instead of passing `stop` as a predicate to a counting
  // algorithm.
  size_t stopped = 0;
  for (auto it = monitored.rbegin(); it != monitored.rend(); ++it) {
    if (stop(*it)) {
      ++stopped;
    }
  }
  CF_EXPECT(stopped == monitored.size(), "Didn't stop all subprocesses");
  return {};
}
} // namespace
// Serves requests arriving from the parent on the child monitor socket until
// a stop request is received.
//
//  - stop:         clears `running` and wakes up the thread blocked in wait()
//                  by forking a child that exits immediately.
//  - host suspend: forwards to SuspendHostProcessesImpl().
//  - host resume:  forwards to ResumeHostProcessesImpl().
// Any other message is ignored.
//
// Returns an error if reading a message fails, if a suspend/resume request
// fails, or if the wake-up child could not be forked.
Result<void> ProcessMonitor::ReadMonitorSocketLoop(std::atomic_bool& running) {
  LOG(DEBUG) << "Waiting for a `stop` message from the parent";
  while (running.load()) {
    using process_monitor_impl::ParentToChildMessage;
    auto message = CF_EXPECT(ParentToChildMessage::Read(child_monitor_socket_));
    if (message.Stop()) {
      running.store(false);
      // Wake up the wait() loop by giving it an exited child process
      const pid_t waker = fork();
      const int fork_errno = errno;
      if (waker == 0) {
        // This process has other threads, so the forked child may only call
        // async-signal-safe functions. _exit() also skips atexit handlers and
        // stdio flushing, which belong to the monitor and must not run twice.
        _exit(0);
      }
      CF_EXPECT(waker != -1, "Failed to fork a process to wake up the wait() "
                             "loop: "
                                 << strerror(fork_errno));
      // will break the while-loop as running is now false
      continue;
    }
    using process_monitor_impl::ParentToChildMessageType;
    if (message.Type() == ParentToChildMessageType::kHostSuspend) {
      CF_EXPECT(SuspendHostProcessesImpl());
      continue;
    }
    if (message.Type() == ParentToChildMessageType::kHostResume) {
      CF_EXPECT(ResumeHostProcessesImpl());
      continue;
    }
  }
  return {};
}
// Sends SIGTSTP to every monitored subprocess and then reports success to the
// parent over the child monitor socket.
//
// Entries whose executable is the process restarter binary are signalled
// directly; every other entry has the signal delivered to its whole process
// group. NOTE(review): presumably the restarter is expected to deal with its
// own child itself - confirm. Entries with a null `cmd` or `proc` are logged
// and skipped.
//
// Returns an error as soon as a signal cannot be sent (in which case no
// response is written) or if writing the response fails. The properties mutex
// is held for the whole call.
Result<void> ProcessMonitor::SuspendHostProcessesImpl() {
  std::lock_guard lock(properties_mutex_);
  // Identical for every entry, so compute it once rather than per iteration.
  const auto process_restart_bin =
      android::base::Basename(ProcessRestarterBinary());
  for (const auto& entry : properties_.entries_) {
    if (!entry.cmd) {
      LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
      continue;
    }
    if (!entry.proc) {
      LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
      continue;
    }
    const auto prog_name = android::base::Basename(entry.cmd->Executable());
    if (process_restart_bin == prog_name) {
      CF_EXPECT(entry.proc->SendSignal(SIGTSTP));
    } else {
      CF_EXPECT(entry.proc->SendSignalToGroup(SIGTSTP));
    }
  }
  using process_monitor_impl::ChildToParentResponse;
  using process_monitor_impl::ChildToParentResponseType;
  ChildToParentResponse response(ChildToParentResponseType::kSuccess);
  CF_EXPECT(response.Write(child_monitor_socket_));
  return {};
}
// Sends SIGCONT to every monitored subprocess and then reports success to the
// parent over the child monitor socket.
//
// Entries whose executable is the process restarter binary are signalled
// directly; every other entry has the signal delivered to its whole process
// group. NOTE(review): presumably the restarter is expected to deal with its
// own child itself - confirm. Entries with a null `cmd` or `proc` are logged
// and skipped.
//
// Returns an error as soon as a signal cannot be sent (in which case no
// response is written) or if writing the response fails. The properties mutex
// is held for the whole call.
Result<void> ProcessMonitor::ResumeHostProcessesImpl() {
  std::lock_guard lock(properties_mutex_);
  // Identical for every entry, so compute it once rather than per iteration.
  const auto process_restart_bin =
      android::base::Basename(ProcessRestarterBinary());
  for (const auto& entry : properties_.entries_) {
    if (!entry.cmd) {
      LOG(ERROR) << "Monitor Entry has a nullptr for cmd.";
      continue;
    }
    if (!entry.proc) {
      LOG(ERROR) << "Monitor Entry has a nullptr for proc.";
      continue;
    }
    const auto prog_name = android::base::Basename(entry.cmd->Executable());
    if (process_restart_bin == prog_name) {
      CF_EXPECT(entry.proc->SendSignal(SIGCONT));
    } else {
      CF_EXPECT(entry.proc->SendSignalToGroup(SIGCONT));
    }
  }
  using process_monitor_impl::ChildToParentResponse;
  using process_monitor_impl::ChildToParentResponseType;
  ChildToParentResponse response(ChildToParentResponseType::kSuccess);
  CF_EXPECT(response.Write(child_monitor_socket_));
  return {};
}
// Builder-style setter (lvalue overload): records whether subprocesses that
// exit should be started again, and returns this object for chaining.
ProcessMonitor::Properties& ProcessMonitor::Properties::RestartSubprocesses(
    bool r) & {
  this->restart_subprocesses_ = r;
  return *this;
}

// Builder-style setter (rvalue overload): applies the same setting to a
// temporary and hands the updated object back by value.
ProcessMonitor::Properties ProcessMonitor::Properties::RestartSubprocesses(
    bool r) && {
  // `*this` is an lvalue here, so this resolves to the `&` overload above.
  Properties& self = *this;
  self.RestartSubprocesses(r);
  return std::move(self);
}
// Builder-style adder (lvalue overload): appends a new monitor entry built
// from the command and its criticality flag, and returns this object for
// chaining. The command is moved out of `cmd`.
ProcessMonitor::Properties& ProcessMonitor::Properties::AddCommand(
    MonitorCommand cmd) & {
  const auto critical = cmd.is_critical;
  entries_.emplace_back(std::move(cmd.command), critical);
  return *this;
}

// Builder-style adder (rvalue overload): appends the entry to a temporary and
// hands the updated object back by value.
ProcessMonitor::Properties ProcessMonitor::Properties::AddCommand(
    MonitorCommand cmd) && {
  // `*this` is an lvalue here, so this resolves to the `&` overload above.
  Properties& self = *this;
  self.AddCommand(std::move(cmd));
  return std::move(self);
}
ProcessMonitor::ProcessMonitor(ProcessMonitor::Properties&& properties)
: properties_(std::move(properties)), monitor_(-1) {}
// Asks the monitor process to stop and waits for it to exit.
//
// Sends a stop message over the parent monitor socket, then marks the monitor
// as gone (pid -1) and closes the socket before blocking in waitpid().
//
// Returns an error if the monitor is not running, the socket is closed, the
// message can't be written, waiting fails, or the monitor did not exit
// normally with status 0. If writing the message fails the monitor pid and
// socket are left untouched.
Result<void> ProcessMonitor::StopMonitoredProcesses() {
  CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
  CF_EXPECT(parent_monitor_socket_->IsOpen(),
            "The monitor socket is already closed");
  using process_monitor_impl::ParentToChildMessage;
  using process_monitor_impl::ParentToChildMessageType;
  ParentToChildMessage message(ParentToChildMessageType::kStop);
  CF_EXPECT(message.Write(parent_monitor_socket_));
  pid_t last_monitor = monitor_;
  monitor_ = -1;
  parent_monitor_socket_->Close();
  int wstatus;
  pid_t waited;
  do {
    // A signal handler running in this process must not be mistaken for a
    // failure to reap the monitor.
    waited = waitpid(last_monitor, &wstatus, 0);
  } while (waited == -1 && errno == EINTR);
  const int wait_errno = errno;
  CF_EXPECT(waited == last_monitor,
            "Failed to wait for monitor process: " << strerror(wait_errno));
  CF_EXPECT(!WIFSIGNALED(wstatus), "Monitor process exited due to a signal");
  CF_EXPECT(WIFEXITED(wstatus), "Monitor process exited for unknown reasons");
  CF_EXPECT(WEXITSTATUS(wstatus) == 0,
            "Monitor process exited with code " << WEXITSTATUS(wstatus));
  return {};
}
// Asks the monitor process to suspend the host processes it supervises and
// blocks until it answers.
//
// Returns an error if the monitor is not running, the socket is closed, the
// request can't be written, the response can't be read, or the response does
// not indicate success.
Result<void> ProcessMonitor::SuspendMonitoredProcesses() {
  using process_monitor_impl::ChildToParentResponse;
  using process_monitor_impl::ParentToChildMessage;
  using process_monitor_impl::ParentToChildMessageType;

  // Preconditions: a live monitor and an open channel to it.
  CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
  CF_EXPECT(parent_monitor_socket_->IsOpen(),
            "The monitor socket is already closed");

  // Request ...
  ParentToChildMessage message(ParentToChildMessageType::kHostSuspend);
  CF_EXPECT(message.Write(parent_monitor_socket_));

  // ... and response.
  auto response =
      CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
  CF_EXPECT(response.Success(),
            "On kHostSuspend, the child run_cvd returned kFailure.");
  return {};
}
// Asks the monitor process to resume the host processes it supervises and
// blocks until it answers.
//
// Returns an error if the monitor is not running, the socket is closed, the
// request can't be written, the response can't be read, or the response does
// not indicate success.
Result<void> ProcessMonitor::ResumeMonitoredProcesses() {
  using process_monitor_impl::ChildToParentResponse;
  using process_monitor_impl::ParentToChildMessage;
  using process_monitor_impl::ParentToChildMessageType;

  // Preconditions: a live monitor and an open channel to it.
  CF_EXPECT(monitor_ != -1, "The monitor process has already exited.");
  CF_EXPECT(parent_monitor_socket_->IsOpen(),
            "The monitor socket is already closed");

  // Request ...
  ParentToChildMessage message(ParentToChildMessageType::kHostResume);
  CF_EXPECT(message.Write(parent_monitor_socket_));

  // ... and response.
  auto response =
      CF_EXPECT(ChildToParentResponse::Read(parent_monitor_socket_));
  CF_EXPECT(response.Success(),
            "On kHostResume, the child run_cvd returned kFailure.");
  return {};
}
// Forks the monitor process and sets up the socket pair used to talk to it.
//
// In the child, runs MonitorRoutine() and exits with status 0 on success or 1
// on failure; this function never returns there. In the parent, stores the
// monitor pid and the parent end of the socket pair and returns immediately.
//
// Returns an error, leaving the object unchanged, if a monitor was already
// started, the parent socket is already open, the socket pair could not be
// created, or fork() fails.
Result<void> ProcessMonitor::StartAndMonitorProcesses() {
  CF_EXPECT(monitor_ == -1, "The monitor process was already started");
  CF_EXPECT(!parent_monitor_socket_->IsOpen(),
            "Parent monitor socket was already opened");
  SharedFD parent_sock;
  SharedFD child_sock;
  SharedFD::SocketPair(AF_UNIX, SOCK_STREAM, 0, &parent_sock, &child_sock);
  // Without both ends the parent could never send a stop request.
  CF_EXPECT(parent_sock->IsOpen() && child_sock->IsOpen(),
            "Failed to create the monitor socket pair");
  const pid_t pid = fork();
  const int fork_errno = errno;
  // Only record the pid once the fork is known to have worked: -1 must keep
  // meaning "no monitor process".
  CF_EXPECT(pid != -1,
            "Failed to fork the monitor process: " << strerror(fork_errno));
  monitor_ = pid;
  if (monitor_ == 0) {
    child_monitor_socket_ = std::move(child_sock);
    parent_sock->Close();
    auto monitor_result = MonitorRoutine();
    if (!monitor_result.ok()) {
      LOG(ERROR) << "Monitoring processes failed:\n"
                 << monitor_result.error().FormatForEnv();
    }
    std::exit(monitor_result.ok() ? 0 : 1);
  } else {
    parent_monitor_socket_ = std::move(parent_sock);
    child_sock->Close();
    return {};
  }
}
// Body of the monitor process: starts the subprocesses, supervises them until
// the parent asks to stop, then stops them.
//
// Two loops run concurrently: MonitorLoop() on this thread reaps and reacts to
// exited children, while ReadMonitorSocketLoop() on a separate thread serves
// requests from the parent and clears `running` on a stop request.
//
// Returns an error if any subprocess fails to start, if the monitoring loop or
// the socket loop fails, or if some subprocess could not be stopped.
Result<void> ProcessMonitor::MonitorRoutine() {
#ifdef __linux__
  // Make this process a subreaper to reliably catch subprocess exits.
  // See https://man7.org/linux/man-pages/man2/prctl.2.html
  prctl(PR_SET_CHILD_SUBREAPER, 1);
  prctl(PR_SET_PDEATHSIG, SIGHUP);  // Die when parent dies
#endif
  LOG(DEBUG) << "Monitoring subprocesses";
  // Monitoring a partially started set would dereference entries that never
  // got a Subprocess, so give up right away if anything failed to start.
  CF_EXPECT(StartSubprocesses(properties_.entries_));
  std::atomic_bool running(true);
  auto read_monitor_socket_loop =
      [this](std::atomic_bool& running) -> Result<void> {
    CF_EXPECT(this->ReadMonitorSocketLoop(running));
    return {};
  };
  auto parent_comms = std::async(std::launch::async, read_monitor_socket_loop,
                                 std::ref(running));
  // Don't return on a monitoring failure yet: the subprocesses still have to
  // be stopped below. The error is propagated once cleanup is done.
  auto monitor_result =
      MonitorLoop(running, properties_mutex_,
                  properties_.restart_subprocesses_, properties_.entries_);
  CF_EXPECT(parent_comms.get(), "Should have exited if monitoring stopped");
  auto stop_result = StopSubprocesses(properties_.entries_);
  LOG(DEBUG) << "Done monitoring subprocesses";
  CF_EXPECT(std::move(monitor_result), "Monitoring loop failed");
  CF_EXPECT(std::move(stop_result),
            "Failed to stop the monitored subprocesses");
  return {};
}
} // namespace cuttlefish