blob: e9b1127238548e179f043f000a11da80568fa699 [file] [log] [blame] [edit]
#include <c10/util/Backtrace.h>
#include <c10/util/Logging.h>
#include <c10/util/signal_handler.h>
#if defined(C10_SUPPORTS_SIGNAL_HANDLER)
// Normal signal handler implementation.
#include <dirent.h>
#include <fmt/core.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#ifdef C10_ANDROID
#ifndef SYS_gettid
#define SYS_gettid __NR_gettid
#endif
#ifndef SYS_tgkill
#define SYS_tgkill __NR_tgkill
#endif
#endif
namespace {
struct sigaction previousSighup;
struct sigaction previousSigint;
std::atomic<int> sigintCount(0);
std::atomic<int> sighupCount(0);
std::atomic<int> hookedUpCount(0);
void handleSignal(int signal) {
switch (signal) {
// TODO: what if the previous handler uses sa_sigaction?
case SIGHUP:
sighupCount += 1;
if (previousSighup.sa_handler) {
previousSighup.sa_handler(signal);
}
break;
case SIGINT:
sigintCount += 1;
if (previousSigint.sa_handler) {
previousSigint.sa_handler(signal);
}
break;
}
}
void hookupHandler() {
if (hookedUpCount++) {
return;
}
struct sigaction sa {};
// Setup the handler
sa.sa_handler = &handleSignal;
// Restart the system call, if at all possible
sa.sa_flags = SA_RESTART;
// Block every signal during the handler
sigfillset(&sa.sa_mask);
// Intercept SIGHUP and SIGINT
if (sigaction(SIGHUP, &sa, &previousSighup) == -1) {
LOG(FATAL) << "Cannot install SIGHUP handler.";
}
if (sigaction(SIGINT, &sa, &previousSigint) == -1) {
LOG(FATAL) << "Cannot install SIGINT handler.";
}
}
// Set the signal handlers to the default.
void unhookHandler() {
if (--hookedUpCount > 0) {
return;
}
struct sigaction sa {};
// Setup the sighub handler
sa.sa_handler = SIG_DFL;
// Restart the system call, if at all possible
sa.sa_flags = SA_RESTART;
// Block every signal during the handler
sigfillset(&sa.sa_mask);
// Intercept SIGHUP and SIGINT
if (sigaction(SIGHUP, &previousSighup, nullptr) == -1) {
LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
}
if (sigaction(SIGINT, &previousSigint, nullptr) == -1) {
LOG(FATAL) << "Cannot uninstall SIGINT handler.";
}
}
} // namespace
namespace c10 {
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
FatalSignalHandler& FatalSignalHandler::getInstance() {
// Leaky singleton to avoid module destructor race.
static FatalSignalHandler* handler = new FatalSignalHandler();
return *handler;
}
FatalSignalHandler::~FatalSignalHandler() = default;
FatalSignalHandler::FatalSignalHandler()
: fatalSignalHandlersInstalled(false),
fatalSignalReceived(false),
fatalSignalName("<UNKNOWN>"),
writingCond(),
writingMutex(),
signalReceived(false) {}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
FatalSignalHandler::signal_handler FatalSignalHandler::kSignalHandlers[] = {
{"SIGABRT", SIGABRT, {}},
{"SIGINT", SIGINT, {}},
{"SIGILL", SIGILL, {}},
{"SIGFPE", SIGFPE, {}},
{"SIGBUS", SIGBUS, {}},
{"SIGSEGV", SIGSEGV, {}},
{nullptr, 0, {}}};
struct sigaction* FatalSignalHandler::getPreviousSigaction(int signum) {
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (handler->signum == signum) {
return &handler->previous;
}
}
return nullptr;
}
const char* FatalSignalHandler::getSignalName(int signum) {
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (handler->signum == signum) {
return handler->name;
}
}
return nullptr;
}
void FatalSignalHandler::callPreviousSignalHandler(
struct sigaction* action,
int signum,
siginfo_t* info,
void* ctx) {
if (!action->sa_handler) {
return;
}
if ((action->sa_flags & SA_SIGINFO) == SA_SIGINFO) {
action->sa_sigaction(signum, info, ctx);
} else {
action->sa_handler(signum);
}
}
// needsLock signals whether we need to lock our writing mutex.
void FatalSignalHandler::stacktraceSignalHandler(bool needsLock) {
std::unique_lock<std::mutex> ul(writingMutex, std::defer_lock);
if (needsLock) {
ul.lock();
signalReceived = true;
}
pid_t tid = static_cast<pid_t>(syscall(SYS_gettid));
std::string backtrace = fmt::format(
"{}({}), PID: {}, Thread {}: \n {}",
fatalSignalName,
fatalSignum,
::getpid(),
tid,
c10::get_backtrace());
std::cerr << backtrace << std::endl;
if (needsLock) {
ul.unlock();
writingCond.notify_all();
}
}
void FatalSignalHandler::fatalSignalHandlerPostProcess() {}
void FatalSignalHandler::fatalSignalHandlerStatic(int signum) {
getInstance().fatalSignalHandler(signum);
}
// Our fatal signal entry point
void FatalSignalHandler::fatalSignalHandler(int signum) {
// Check if this is a proper signal that we declared above.
const char* name = getSignalName(signum);
if (!name) {
return;
}
if (fatalSignalReceived) {
return;
}
// Set the flag so that our SIGUSR2 handler knows that we're aborting and
// that it should intercept any SIGUSR2 signal.
fatalSignalReceived = true;
// Set state for other threads.
fatalSignum = signum;
fatalSignalName = name;
// Linux doesn't have a nice userland API for enumerating threads so we
// need to use the proc pseudo-filesystem.
DIR* procDir = opendir("/proc/self/task");
if (procDir) {
pid_t pid = getpid();
pid_t currentTid = static_cast<pid_t>(syscall(SYS_gettid));
struct dirent* entry = nullptr;
std::unique_lock<std::mutex> ul(writingMutex);
while ((entry = readdir(procDir)) != nullptr) {
if (entry->d_name[0] == '.') {
continue;
}
pid_t tid = atoi(entry->d_name);
// If we've found the current thread then we'll jump into the SIGUSR2
// handler instead of signaling to avoid deadlocking.
if (tid != currentTid) {
signalReceived = false;
syscall(SYS_tgkill, pid, tid, SIGUSR2);
auto now = std::chrono::system_clock::now();
using namespace std::chrono_literals;
// we use wait_until instead of wait because on ROCm there was
// a single thread that wouldn't receive the SIGUSR2
if (std::cv_status::timeout == writingCond.wait_until(ul, now + 2s)) {
if (!signalReceived) {
std::cerr << "signal lost waiting for stacktrace " << pid << ":"
<< tid << std::endl;
break;
}
}
} else {
stacktraceSignalHandler(false);
}
}
} else {
perror("Failed to open /proc/self/task");
}
fatalSignalHandlerPostProcess();
sigaction(signum, getPreviousSigaction(signum), nullptr);
raise(signum);
}
// Our SIGUSR2 entry point
void FatalSignalHandler::stacktraceSignalHandlerStatic(
int signum,
siginfo_t* info,
void* ctx) {
getInstance().stacktraceSignalHandler(signum, info, ctx);
}
void FatalSignalHandler::stacktraceSignalHandler(
int signum,
siginfo_t* info,
void* ctx) {
if (fatalSignalReceived) {
stacktraceSignalHandler(true);
} else {
// We don't want to actually change the signal handler as we want to
// remain the signal handler so that we may get the usr2 signal later.
callPreviousSignalHandler(&previousSigusr2, signum, info, ctx);
}
}
// Installs SIGABRT signal handler so that we get stack traces
// from every thread on SIGABRT caused exit. Also installs SIGUSR2 handler
// so that threads can communicate with each other (be sure if you use SIGUSR2)
// to install your handler before initing caffe2 (we properly fall back to
// the previous handler if we didn't initiate the SIGUSR2).
void FatalSignalHandler::installFatalSignalHandlers() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
if (fatalSignalHandlersInstalled) {
return;
}
fatalSignalHandlersInstalled = true;
struct sigaction sa {};
sigemptyset(&sa.sa_mask);
// Since we'll be in an exiting situation it's possible there's memory
// corruption, so make our own stack just in case.
sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
sa.sa_handler = FatalSignalHandler::fatalSignalHandlerStatic;
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (sigaction(handler->signum, &sa, &handler->previous)) {
std::string str("Failed to add ");
str += handler->name;
str += " handler!";
perror(str.c_str());
}
}
sa.sa_sigaction = FatalSignalHandler::stacktraceSignalHandlerStatic;
if (sigaction(SIGUSR2, &sa, &previousSigusr2)) {
perror("Failed to add SIGUSR2 handler!");
}
}
void FatalSignalHandler::uninstallFatalSignalHandlers() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
if (!fatalSignalHandlersInstalled) {
return;
}
fatalSignalHandlersInstalled = false;
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (sigaction(handler->signum, &handler->previous, nullptr)) {
std::string str("Failed to remove ");
str += handler->name;
str += " handler!";
perror(str.c_str());
} else {
handler->previous = {};
}
}
if (sigaction(SIGUSR2, &previousSigusr2, nullptr)) {
perror("Failed to add SIGUSR2 handler!");
} else {
previousSigusr2 = {};
}
}
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
SignalHandler::SignalHandler(
SignalHandler::Action SIGINT_action,
SignalHandler::Action SIGHUP_action)
: SIGINT_action_(SIGINT_action),
SIGHUP_action_(SIGHUP_action),
my_sigint_count_(sigintCount),
my_sighup_count_(sighupCount) {
hookupHandler();
}
SignalHandler::~SignalHandler() {
unhookHandler();
}
// Return true iff a SIGINT has been received since the last time this
// function was called.
bool SignalHandler::GotSIGINT() {
uint64_t count = sigintCount;
uint64_t localCount = my_sigint_count_.exchange(count);
return (localCount != count);
}
// Return true iff a SIGHUP has been received since the last time this
// function was called.
bool SignalHandler::GotSIGHUP() {
uint64_t count = sighupCount;
uint64_t localCount = my_sighup_count_.exchange(count);
return (localCount != count);
}
SignalHandler::Action SignalHandler::CheckForSignals() {
if (GotSIGHUP()) {
return SIGHUP_action_;
}
if (GotSIGINT()) {
return SIGINT_action_;
}
return SignalHandler::Action::NONE;
}
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
void FatalSignalHandler::setPrintStackTracesOnFatalSignal(bool print) {
if (print) {
installFatalSignalHandlers();
} else {
uninstallFatalSignalHandlers();
}
}
bool FatalSignalHandler::printStackTracesOnFatalSignal() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
return fatalSignalHandlersInstalled;
}
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
} // namespace c10
#else // defined(C10_SUPPORTS_SIGNAL_HANDLER)
// TODO: Currently we do not support signal handling in non-Linux yet - below is
// a minimal implementation that makes things compile.
namespace c10 {
SignalHandler::SignalHandler(
SignalHandler::Action SIGINT_action,
SignalHandler::Action SIGHUP_action) {
SIGINT_action_ = SIGINT_action;
SIGHUP_action_ = SIGHUP_action;
my_sigint_count_ = 0;
my_sighup_count_ = 0;
}
SignalHandler::~SignalHandler() {}
bool SignalHandler::GotSIGINT() {
return false;
}
bool SignalHandler::GotSIGHUP() {
return false;
}
SignalHandler::Action SignalHandler::CheckForSignals() {
return SignalHandler::Action::NONE;
}
} // namespace c10
#endif // defined(C10_SUPPORTS_SIGNAL_HANDLER)