blob: c569360137b603819acee714c3e8686a11cb08cd [file] [log] [blame]
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#include "RecordCommand.h"
#include <linux/capability.h>
#include <spawn.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sysexits.h>
#include <time.h>
#include "preload/preload_interface.h"
#include "Flags.h"
#include "RecordSession.h"
#include "StringVectorToCharArray.h"
#include "WaitManager.h"
#include "WaitStatus.h"
#include "core.h"
#include "git_revision.h"
#include "kernel_metadata.h"
#include "log.h"
#include "main.h"
#include "util.h"
using namespace std;
namespace rr {
/**
 * The single registered instance of the `record` command: its name plus the
 * usage text shown to the user.
 *
 * NOTE(review): parse_record_arg() also accepts -i/--ignore-signal,
 * -s/--always-switch, --syscall-buffer-size, --scarce-fds and --unmap-vdso,
 * none of which are listed here; presumably that is deliberate (internal or
 * testing options) -- confirm before documenting them.
 */
RecordCommand RecordCommand::singleton(
    "record",
    " rr record [OPTION]... <exe> [exe-args]...\n"
    " -c, --num-cpu-ticks=<NUM> maximum number of 'CPU ticks' (currently \n"
    " retired conditional branches) to allow a \n"
    " task to run before interrupting it\n"
    " --disable-avx-512 Masks out the CPUID bits for AVX512\n"
    " This can improve trace portability\n"
    " --disable-cpuid-features <CCC>[,<DDD>]\n"
    " Mask out CPUID EAX=1 feature bits\n"
    " <CCC>: Bitmask of bits to clear from ECX\n"
    " <DDD>: Bitmask of bits to clear from EDX\n"
    " --disable-cpuid-features-ext <BBB>[,<CCC>[,<DDD>]]\n"
    " Mask out CPUID EAX=7,ECX=0 feature bits\n"
    " <BBB>: Bitmask of bits to clear from EBX\n"
    " <CCC>: Bitmask of bits to clear from ECX\n"
    " <DDD>: Bitmask of bits to clear from EDX\n"
    " --disable-cpuid-features-xsave <AAA>\n"
    " Mask out CPUID EAX=0xD,ECX=1 feature bits\n"
    " <AAA>: Bitmask of bits to clear from EAX\n"
    " -h, --chaos randomize scheduling decisions to try to \n"
    " reproduce bugs\n"
    " -n, --no-syscall-buffer disable the syscall buffer preload \n"
    " library even if it would otherwise be used\n"
    " --no-file-cloning disable file cloning for mmapped files\n"
    " --no-read-cloning disable file-block cloning for syscallbuf\n"
    " reads\n"
    " --num-cores=N pretend to have N cores (rr will still\n"
    " only run on a single core). Overrides\n"
    " random setting from --chaos.\n"
    " -o, --output-trace-dir=<DIR> set the output trace directory.\n"
    " _RR_TRACE_DIR gets ignored.\n"
    " Directory name is given name, not the\n"
    " application name.\n"
    " -p, --print-trace-dir=<NUM> print trace directory followed by a newline\n"
    " to given file descriptor\n"
    " --syscall-buffer-sig=<NUM> the signal used for communication with the\n"
    " syscall buffer. SIGPWR by default, unused\n"
    " if --no-syscall-buffer is passed\n"
    " -t, --continue-through-signal=<SIG>\n"
    " Unhandled <SIG> signals will be ignored\n"
    " instead of terminating the program. The\n"
    " signal will still be delivered for user\n"
    " handlers and debugging.\n"
    " -u, --cpu-unbound allow tracees to run on any virtual CPU.\n"
    " Default is to bind to a random CPU. This "
    "option\n"
    " can cause replay divergence: use with\n"
    " caution.\n"
    " --bind-to-cpu=<NUM> Bind to a particular CPU\n"
    " instead of a randomly chosen one.\n"
    " -v, --env=NAME=VALUE value to add to the environment of the\n"
    " tracee. There can be any number of these.\n"
    " -w, --wait Wait for all child processes to exit, not\n"
    " just the initial process.\n"
    " --nested=<value> Control behavior when run inside an outer\n"
    " rr recording. Default: exit with error\n"
    " --nested=ignore Directly start child process so it's part\n"
    " of the outer recording\n"
    " --nested=detach Start a separate recording session.\n"
    " Must not share memory with the outer.\n"
    " --nested=release Run the child without recording it.\n"
    " Must not share memory with the outer.\n"
    " --setuid-sudo If running under sudo, pretend to be the\n"
    " user that ran sudo rather than root. This\n"
    " allows recording setuid/setcap binaries.\n"
    " --trace-id=<ID> Sets the trace id to the specified id.\n"
    " --copy-preload-src Copy preload sources to trace dir\n"
    " --stap-sdt Enables the use of SystemTap statically-\n"
    " defined tracepoints\n"
    " --asan Override heuristics and always enable ASAN\n"
    " compatibility.\n"
    " --tsan Override heuristics and always enable TSAN\n"
    " compatibility.\n");
/**
 * All settings collected from the `rr record` command line. Every field
 * carries its default right at its declaration; parse_record_arg() overrides
 * individual fields as options are seen.
 */
struct RecordFlags {
  /* Additional NAME=VALUE entries for the tracee environment. */
  vector<string> extra_env;

  /* Max counter value before the scheduler interrupts a tracee. */
  Ticks max_ticks = Scheduler::DEFAULT_MAX_TICKS;

  /* Whenever |ignore_sig| is pending for a tracee, decline to
   * deliver it. */
  int ignore_sig = 0;

  /* Whenever |continue_through_sig| is delivered to a tracee, if there is no
   * user handler and the signal would terminate the program, just ignore it. */
  int continue_through_sig = 0;

  /* Whether to use syscall buffering optimization during recording. */
  RecordSession::SyscallBuffering use_syscall_buffer =
      RecordSession::ENABLE_SYSCALL_BUF;

  /* If nonzero, the desired syscall buffer size. Must be a multiple of the page
   * size.
   */
  size_t syscall_buffer_size = 0;

  /* CPUID features to disable */
  DisableCPUIDFeatures disable_cpuid_features;

  /* File descriptor to print the trace directory to; negative means "don't". */
  int print_trace_dir = -1;

  /* Explicit output trace directory; empty when none was requested. */
  string output_trace_dir;

  /* Whether to use file-cloning optimization during recording. */
  bool use_file_cloning = true;

  /* Whether to use read-cloning optimization during recording. */
  bool use_read_cloning = true;

  /* Whether tracee processes in record and replay are allowed
   * to run on any logical CPU. */
  BindCPU bind_cpu = BIND_CPU;

  /* True if we should context switch after every rr event */
  bool always_switch = false;

  /* Whether to enable chaos mode in the scheduler */
  bool chaos = false;

  /* Controls number of cores reported to recorded process. */
  int num_cores = 0;

  /* True if we should wait for all processes to exit before finishing
   * recording. */
  bool wait_for_all = false;

  /* Start child process directly if run under nested rr recording */
  NestedBehavior nested = NESTED_ERROR;

  /* True if a large number of file descriptors should be opened up front
   * (see setup_session_from_flags()). */
  bool scarce_fds = false;

  /* True if --setuid-sudo was given. */
  bool setuid_sudo = false;

  /* Trace id given with --trace-id; null when none was given. */
  unique_ptr<TraceUuid> trace_id;

  /* Copy preload sources to trace dir */
  bool copy_preload_src = false;

  /* The signal to use for syscallbuf desched events */
  int syscallbuf_desched_sig = SYSCALLBUF_DEFAULT_DESCHED_SIGNAL;

  /* True if we should load the audit library for SystemTap SDT support. */
  bool stap_sdt = false;

  /* True if we should unmap the vdso */
  bool unmap_vdso = false;

  /* True if we should always enable ASAN compatibility. */
  bool asan = false;

  /* True if we should always enable TSAN compatibility. */
  bool tsan = false;

  RecordFlags() {}
};
/**
 * If |opt| does not already carry an integer value, try to interpret its text
 * as a signal name -- either the full name reported by signal_name() or that
 * name with its first three characters ("SIG") dropped -- and store the
 * matching signal number in |opt.int_value|. |opt| is left unchanged when
 * nothing matches.
 */
static void parse_signal_name(ParsedOption& opt) {
  // INT64_MIN marks "no integer parsed yet"; anything else is kept as-is.
  if (opt.int_value != INT64_MIN) {
    return;
  }
  for (int signo = 1; signo < _NSIG; ++signo) {
    const std::string full_name = signal_name(signo);
    bool matched = (full_name == opt.value);
    if (!matched) {
      DEBUG_ASSERT(full_name[0] == 'S' && full_name[1] == 'I' &&
                   full_name[2] == 'G');
      // Compare against the name without its "SIG" prefix.
      matched = full_name.compare(3, std::string::npos, opt.value) == 0;
    }
    if (matched) {
      opt.int_value = signo;
      return;
    }
  }
}
/**
 * Split the text of |opt| at commas and convert each piece with strtoull()
 * (base auto-detected). Returns the values in order, or an empty vector if
 * any piece exceeds 32 bits or is followed by something other than a comma
 * or the end of the string.
 */
static vector<uint32_t> parse_feature_bits(ParsedOption& opt) {
  vector<uint32_t> masks;
  for (const char* cursor = opt.value.c_str(); *cursor != '\0';) {
    char* stop;
    const unsigned long long parsed = strtoull(cursor, &stop, 0);
    const bool ends_cleanly = (*stop == '\0' || *stop == ',');
    if (parsed > UINT32_MAX || !ends_cleanly) {
      return vector<uint32_t>();
    }
    masks.push_back(static_cast<uint32_t>(parsed));
    // Step over the separator, if there is one.
    cursor = (*stop == ',') ? stop + 1 : stop;
  }
  return masks;
}
/**
 * Try to consume one option from the front of |args| and record it in
 * |flags|.
 *
 * Global options are tried first via parse_global_option(). Otherwise the
 * record-specific option table below is used on a working copy of |args|;
 * only when the option and its parameter validate is that copy committed
 * back to |args|.
 *
 * Returns true if an option was handled, false if the front of |args| is not
 * a recognized option or its parameter was rejected.
 */
static bool parse_record_arg(vector<string>& args, RecordFlags& flags) {
  if (parse_global_option(args)) {
    return true;
  }
  // Options without a short form are identified by small integers instead of
  // a character.
  static const OptionSpec options[] = {
    { 0, "no-read-cloning", NO_PARAMETER },
    { 1, "no-file-cloning", NO_PARAMETER },
    { 2, "syscall-buffer-size", HAS_PARAMETER },
    { 3, "nested", HAS_PARAMETER },
    { 4, "scarce-fds", NO_PARAMETER },
    { 5, "setuid-sudo", NO_PARAMETER },
    { 6, "bind-to-cpu", HAS_PARAMETER },
    { 7, "disable-cpuid-features", HAS_PARAMETER },
    { 8, "disable-cpuid-features-ext", HAS_PARAMETER },
    { 9, "disable-cpuid-features-xsave", HAS_PARAMETER },
    { 10, "num-cores", HAS_PARAMETER },
    { 11, "trace-id", HAS_PARAMETER },
    { 12, "copy-preload-src", NO_PARAMETER },
    { 13, "syscall-buffer-sig", HAS_PARAMETER },
    { 14, "stap-sdt", NO_PARAMETER },
    { 15, "unmap-vdso", NO_PARAMETER },
    { 16, "disable-avx-512", NO_PARAMETER },
    { 17, "asan", NO_PARAMETER },
    { 18, "tsan", NO_PARAMETER },
    { 'c', "num-cpu-ticks", HAS_PARAMETER },
    { 'h', "chaos", NO_PARAMETER },
    { 'i', "ignore-signal", HAS_PARAMETER },
    { 'n', "no-syscall-buffer", NO_PARAMETER },
    { 'p', "print-trace-dir", HAS_PARAMETER },
    { 'o', "output-trace-dir", HAS_PARAMETER },
    { 's', "always-switch", NO_PARAMETER },
    { 't', "continue-through-signal", HAS_PARAMETER },
    { 'u', "cpu-unbound", NO_PARAMETER },
    { 'v', "env", HAS_PARAMETER },
    { 'w', "wait", NO_PARAMETER }};
  ParsedOption opt;
  auto args_copy = args;
  if (!Command::parse_option(args_copy, options, &opt)) {
    return false;
  }
  switch (opt.short_name) {
    case 'c':
      if (!opt.verify_valid_int(1, Scheduler::MAX_MAX_TICKS)) {
        return false;
      }
      flags.max_ticks = opt.int_value;
      break;
    case 'h':
      LOG(info) << "Enabled chaos mode";
      flags.chaos = true;
      break;
    case 'i':
      // Accept either a signal number or a signal name.
      parse_signal_name(opt);
      if (!opt.verify_valid_int(1, _NSIG - 1)) {
        return false;
      }
      flags.ignore_sig = opt.int_value;
      break;
    case 'n':
      flags.use_syscall_buffer = RecordSession::DISABLE_SYSCALL_BUF;
      break;
    case 'p':
      if (!opt.verify_valid_int(0, INT32_MAX)) {
        return false;
      }
      flags.print_trace_dir = opt.int_value;
      break;
    case 'o':
      flags.output_trace_dir = opt.value;
      break;
    case 0:
      flags.use_read_cloning = false;
      break;
    case 1:
      flags.use_file_cloning = false;
      break;
    case 2:
      // The parameter is scaled by 1024 and rounded up to whole pages.
      if (!opt.verify_valid_int(4, 1024 * 1024)) {
        return false;
      }
      flags.syscall_buffer_size = ceil_page_size(opt.int_value * 1024);
      break;
    case 3:
      if (opt.value == "default" || opt.value == "error") {
        flags.nested = NESTED_ERROR;
      } else if (opt.value == "ignore") {
        flags.nested = NESTED_IGNORE;
      } else if (opt.value == "detach") {
        flags.nested = NESTED_DETACH;
      } else if (opt.value == "release") {
        flags.nested = NESTED_RELEASE;
      } else {
        // Unknown values are not fatal; fall back to the default behavior.
        LOG(warn) << "Unknown nesting behavior `" << opt.value << "`";
        flags.nested = NESTED_ERROR;
      }
      break;
    case 4:
      flags.scarce_fds = true;
      break;
    case 5:
      flags.setuid_sudo = true;
      break;
    case 6:
      if (!opt.verify_valid_int(0, INT32_MAX)) {
        return false;
      }
      flags.bind_cpu = BindCPU(opt.int_value);
      break;
    case 7: {
      // <ECX>[,<EDX>]
      vector<uint32_t> bits = parse_feature_bits(opt);
      if (bits.empty() || bits.size() > 2) {
        return false;
      }
      flags.disable_cpuid_features.features_ecx = bits[0];
      if (bits.size() > 1) {
        flags.disable_cpuid_features.features_edx = bits[1];
      }
      break;
    }
    case 8: {
      // <EBX>[,<ECX>[,<EDX>]]
      vector<uint32_t> bits = parse_feature_bits(opt);
      if (bits.empty() || bits.size() > 3) {
        return false;
      }
      flags.disable_cpuid_features.extended_features_ebx = bits[0];
      if (bits.size() > 1) {
        flags.disable_cpuid_features.extended_features_ecx = bits[1];
        if (bits.size() > 2) {
          flags.disable_cpuid_features.extended_features_edx = bits[2];
        }
      }
      break;
    }
    case 9: {
      // <EAX>
      vector<uint32_t> bits = parse_feature_bits(opt);
      if (bits.size() != 1) {
        return false;
      }
      flags.disable_cpuid_features.xsave_features_eax = bits[0];
      break;
    }
    case 10: {
      if (!opt.verify_valid_int(1, 128)) {
        return false;
      }
      flags.num_cores = opt.int_value;
      break;
    }
    case 11: {
      // Cumulative hex-digit counts at which a hyphen may appear.
      const uint8_t SUM_GROUP_LENS[5] = { 8, 12, 16, 20, 32 };
      const size_t NUM_GROUPS =
          sizeof(SUM_GROUP_LENS) / sizeof(SUM_GROUP_LENS[0]);
      /* Parse UUIDs from string form optionally with hyphens */
      uint8_t digit = 0; // This counts only hex digits (i.e. not hyphens)
      uint8_t group = 0;
      uint8_t acc = 0;
      unique_ptr<TraceUuid> buf(new TraceUuid);
      auto it = opt.value.begin();
      while (it < opt.value.end()) {
        auto c = *it;
        // Bail out before a 33rd digit could complete a byte past the UUID.
        if (digit > SUM_GROUP_LENS[4]) {
          return false;
        }
        if (digit % 2 == 0) {
          // First digit of the byte.
          if ('0' <= c && c <= '9') {
            acc = c - '0';
          } else if ('a' <= c && c <= 'f') {
            acc = c - 'a' + 10;
          } else if ('A' <= c && c <= 'F') {
            acc = c - 'A' + 10;
          } else if (c == '-') {
            // Group delimiter. Reject it once every group boundary has been
            // used up; otherwise |group| would index past SUM_GROUP_LENS.
            if (group >= NUM_GROUPS || SUM_GROUP_LENS[group] != digit) {
              return false;
            }
            ++group;
            ++it;
            continue;
          } else {
            return false;
          }
        } else {
          // Second digit of the byte.
          acc <<= 4;
          if ('0' <= c && c <= '9') {
            acc += c - '0';
          } else if ('a' <= c && c <= 'f') {
            acc += c - 'a' + 10;
          } else if ('A' <= c && c <= 'F') {
            acc += c - 'A' + 10;
          } else {
            return false;
          }
          buf->bytes[digit / 2] = acc;
        }
        ++digit;
        ++it;
      }
      // Exactly 32 hex digits are required.
      if (SUM_GROUP_LENS[4] != digit) {
        return false;
      }
      flags.trace_id.swap(buf);
      break;
    }
    case 12:
      flags.copy_preload_src = true;
      break;
    case 13:
      parse_signal_name(opt);
      if (!opt.verify_valid_int(1, _NSIG - 1)) {
        return false;
      }
      flags.syscallbuf_desched_sig = opt.int_value;
      break;
    case 14:
      flags.stap_sdt = true;
      break;
    case 15:
      flags.unmap_vdso = true;
      break;
    case 16:
      // --disable-avx-512: OR fixed masks into the extended-feature words.
      flags.disable_cpuid_features.extended_features_ebx |= 0xdc230000;
      flags.disable_cpuid_features.extended_features_ecx |= 0x00002c42;
      flags.disable_cpuid_features.extended_features_edx |= 0x0000000c;
      break;
    case 17:
      flags.asan = true;
      break;
    case 18:
      flags.tsan = true;
      break;
    case 's':
      flags.always_switch = true;
      break;
    case 't':
      parse_signal_name(opt);
      if (!opt.verify_valid_int(1, _NSIG - 1)) {
        return false;
      }
      flags.continue_through_sig = opt.int_value;
      break;
    case 'u':
      flags.bind_cpu = UNBOUND_CPU;
      break;
    case 'v':
      flags.extra_env.push_back(opt.value);
      break;
    case 'w':
      flags.wait_for_all = true;
      break;
    default:
      DEBUG_ASSERT(0 && "Unknown option");
  }
  // Commit whatever parse_option() left in its working copy.
  args = args_copy;
  return true;
}
// monotonic_now_sec() timestamp of the first SIGTERM seen by handle_SIGTERM();
// remains zero until then. Polled by the recording loop in record().
static volatile double term_requested;
// Set once handle_SIGTERM() has written its "will forcibly terminate" notice,
// so that the notice is printed at most once.
static bool did_print_reassurance = false;
// Seconds after |term_requested| that tracees get before record() calls
// terminate_tracees() on them.
static const double TRACEE_SIGTERM_RESPONSE_MAX_TIME = 5;
// Additional seconds beyond TRACEE_SIGTERM_RESPONSE_MAX_TIME after which
// handle_SIGTERM() may abort rr itself.
static const double RR_SIGKILL_GRACE_TIME = 5;
/**
 * A terminating signal was received.
 *
 * The first delivery only records the time in |term_requested|; the
 * recording loop notices that and deals with the tracees. Later deliveries
 * act as a watchdog: once more than 1 + TRACEE_SIGTERM_RESPONSE_MAX_TIME
 * seconds have passed a notice is written to stderr (once), and a delivery
 * after that which arrives more than
 * RR_SIGKILL_GRACE_TIME + TRACEE_SIGTERM_RESPONSE_MAX_TIME seconds after the
 * first one assumes rr is wedged and aborts.
 *
 * Note that this is called in a signal handler and could also
 * be called off the main thread.
 */
static void handle_SIGTERM(__attribute__((unused)) int sig) {
  // Don't use LOG() here because we're in a signal handler. If we do anything
  // that could allocate, we could deadlock.
  if (!(term_requested > 0)) {
    // First request: just remember when it arrived.
    term_requested = monotonic_now_sec();
    return;
  }

  double now = monotonic_now_sec();
  if (!(now - term_requested > 1 + TRACEE_SIGTERM_RESPONSE_MAX_TIME)) {
    // Still within the window in which the tracee is expected to go away.
    return;
  }

  if (did_print_reassurance) {
    if (now - term_requested >
        RR_SIGKILL_GRACE_TIME + TRACEE_SIGTERM_RESPONSE_MAX_TIME) {
      notifying_abort();
    }
    return;
  }

  static const char msg[] =
      "[rr] Tracee failed to exit within 1s after SIGKILL. Recording will forcibly terminate in 4s.\n";
  did_print_reassurance = true;
  write_all(STDERR_FILENO, msg, sizeof(msg) - 1);
}
/**
 * SIGSEGV handler for rr itself: a segfault here is most likely an rr bug, so
 * say so on stderr and then go through notifying_abort().
 */
static void handle_SIGSEGV(__attribute__((unused)) int sig) {
  static const char crash_notice[] =
      "rr itself crashed (SIGSEGV). This shouldn't happen!\n";
  // Exclude the terminating NUL from what gets written.
  const size_t notice_len = sizeof(crash_notice) - 1;
  write_all(STDERR_FILENO, crash_notice, notice_len);
  notifying_abort();
}
/**
 * Install rr's own signal dispositions: SIGTERM and SIGSEGV get dedicated
 * handlers, while SIGHUP, SIGINT, SIGABRT and SIGQUIT are ignored.
 */
static void install_signal_handlers(void) {
  struct sigaction action;
  memset(&action, 0, sizeof(action));

  action.sa_handler = handle_SIGTERM;
  sigaction(SIGTERM, &action, nullptr);

  action.sa_handler = handle_SIGSEGV;
  sigaction(SIGSEGV, &action, nullptr);

  action.sa_handler = SIG_IGN;
  static const int ignored_signals[] = { SIGHUP, SIGINT, SIGABRT, SIGQUIT };
  for (int signo : ignored_signals) {
    sigaction(signo, &action, nullptr);
  }
}
/**
 * Push the settings in |flags| that are configured after session creation
 * into |session| and its scheduler.
 */
static void setup_session_from_flags(RecordSession& session,
                                     const RecordFlags& flags) {
  session.scheduler().set_max_ticks(flags.max_ticks);
  session.scheduler().set_always_switch(flags.always_switch);

  session.set_enable_chaos(flags.chaos);
  if (flags.num_cores != 0) {
    // Set the number of cores reported, possibly overriding the chaos mode
    // setting.
    session.set_num_cores(flags.num_cores);
  }

  session.set_use_read_cloning(flags.use_read_cloning);
  session.set_use_file_cloning(flags.use_file_cloning);
  session.set_ignore_sig(flags.ignore_sig);
  session.set_continue_through_sig(flags.continue_through_sig);
  session.set_wait_for_all(flags.wait_for_all);

  const bool has_custom_buffer_size = flags.syscall_buffer_size > 0;
  if (has_custom_buffer_size) {
    session.set_syscall_buffer_size(flags.syscall_buffer_size);
  }

  if (!flags.scarce_fds) {
    return;
  }
  // --scarce-fds: open /dev/null many times and keep the descriptors; they
  // are intentionally never closed.
  int remaining = 950;
  while (remaining-- > 0) {
    open("/dev/null", O_RDONLY);
  }
}
static RecordSession* static_session;
// This can be called during debugging to close the trace so it can be used
// later.
void force_close_record_session() {
if (static_session) {
static_session->close_trace_writer(TraceWriter::CLOSE_ERROR);
}
}
static void copy_preload_sources_to_trace(const string& trace_dir) {
string files_dir = trace_dir + "/files.rr";
mkdir(files_dir.c_str(), 0700);
pid_t pid;
string dest_path = files_dir + "/librrpreload.zip";
string src_path = resource_path() + "share/rr/src";
char zip[] = "zip";
char r[] = "-r";
char j[] = "-j";
char* argv[] = {
zip, r, j,
const_cast<char*>(dest_path.c_str()),
const_cast<char*>(src_path.c_str()),
NULL
};
posix_spawn_file_actions_t actions;
posix_spawn_file_actions_init(&actions);
posix_spawn_file_actions_addopen(&actions, STDOUT_FILENO, "/dev/null", O_RDONLY, 0);
posix_spawn_file_actions_addopen(&actions, STDERR_FILENO, "/dev/null", O_RDONLY, 0);
int ret = posix_spawnp(&pid, argv[0], &actions, NULL, argv, environ);
if (ret) {
FATAL() << "Can't spawn 'zip'";
}
posix_spawn_file_actions_destroy(&actions);
WaitResult result = WaitManager::wait_exit(WaitOptions(pid));
if (result.code != WAIT_OK) {
FATAL() << "Wait failed";
}
LOG(info) << "Got zip status " << result.status;
}
/**
 * Write GIT_REVISION (without its terminating NUL) to
 * <trace_dir>/files.rr/rr_git_revision. A short or failed write is fatal.
 */
static void save_rr_git_revision(const string& trace_dir) {
  const string files_dir = trace_dir + "/files.rr";
  // The return value is ignored (e.g. the directory may already exist).
  mkdir(files_dir.c_str(), 0700);

  const string revision_path = files_dir + "/rr_git_revision";
  ScopedFd fd(revision_path.c_str(), O_CREAT | O_WRONLY, 0600);

  const size_t expected_len = sizeof(GIT_REVISION) - 1;
  const ssize_t written = write(fd, GIT_REVISION, expected_len);
  // A negative result never equals |expected_len| after conversion, so this
  // covers errors as well as short writes.
  if (static_cast<size_t>(written) != expected_len) {
    FATAL() << "Can't write GIT_REVISION";
  }
}
/**
 * Thread body started by record() after SIGTERM has been forwarded to the
 * tracee. It keeps re-sending SIGTERM to this process: once after
 * TRACEE_SIGTERM_RESPONSE_MAX_TIME seconds, then, after a further
 * RR_SIGKILL_GRACE_TIME seconds, continuously every 10ms. Never returns.
 */
static void* repeat_SIGTERM(__attribute__((unused)) void* p) {
  const pid_t self = getpid();

  sleep_time(TRACEE_SIGTERM_RESPONSE_MAX_TIME);
  /* send another SIGTERM so we wake up and SIGKILL our tracees */
  kill(self, SIGTERM);

  sleep_time(RR_SIGKILL_GRACE_TIME);
  /* Ok, now we're really wedged, just repeatedly SIGTERM until we're out */
  for (;;) {
    kill(self, SIGTERM);
    sleep_time(0.01);
  }
}
/**
 * Run one recording of the command line |args| configured by |flags|.
 *
 * Creates the RecordSession, applies the remaining flags to it, optionally
 * prints the trace directory and copies the preload sources, then calls
 * record_step() until it stops reporting STEP_CONTINUE. While stepping, a
 * pending SIGTERM request (|term_requested|) is forwarded to the tracees and
 * escalated to terminate_tracees() after TRACEE_SIGTERM_RESPONSE_MAX_TIME
 * seconds.
 *
 * Returns the tracee's exit status for STEP_EXITED and an EX_UNAVAILABLE exit
 * status for STEP_SPAWN_FAILED.
 */
static WaitStatus record(const vector<string>& args, const RecordFlags& flags) {
LOG(info) << "Start recording...";
auto session = RecordSession::create(
args, flags.extra_env, flags.disable_cpuid_features,
flags.use_syscall_buffer, flags.syscallbuf_desched_sig,
flags.bind_cpu, flags.output_trace_dir,
flags.trace_id.get(),
flags.stap_sdt, flags.unmap_vdso, flags.asan, flags.tsan);
setup_session_from_flags(*session, flags);
// Expose the session to force_close_record_session() for as long as it is
// being recorded.
static_session = session.get();
if (flags.print_trace_dir >= 0) {
// --print-trace-dir: write the directory and a newline to the requested fd.
const string& dir = session->trace_writer().dir();
write_all(flags.print_trace_dir, dir.c_str(), dir.size());
write_all(flags.print_trace_dir, "\n", 1);
}
if (flags.copy_preload_src) {
const string& dir = session->trace_writer().dir();
copy_preload_sources_to_trace(dir);
save_rr_git_revision(dir);
}
// Install signal handlers after creating the session, to ensure they're not
// inherited by the tracee.
install_signal_handlers();
RecordSession::RecordResult step_result;
bool did_forward_SIGTERM = false;
bool did_term_detached_tasks = false;
pthread_t term_repeater_thread;
do {
// Sample this before stepping so the step that completes the initial exec
// can be detected below.
bool done_initial_exec = session->done_initial_exec();
step_result = session->record_step();
// Only create latest-trace symlink if --output-trace-dir is not being used
if (!done_initial_exec && session->done_initial_exec() && flags.output_trace_dir.empty()) {
session->trace_writer().make_latest_trace();
}
// Nonzero once handle_SIGTERM() has recorded a termination request.
if (term_requested) {
if (monotonic_now_sec() - term_requested > TRACEE_SIGTERM_RESPONSE_MAX_TIME) {
/* time ran out for the tracee to respond to SIGTERM; kill everything */
session->terminate_tracees();
} else if (!did_forward_SIGTERM) {
session->forward_SIGTERM();
// Start a thread to send a SIGTERM to ourselves (again)
// in case the tracee doesn't respond to SIGTERM.
// The thread is neither joined nor detached here.
pthread_create(&term_repeater_thread, NULL, repeat_SIGTERM, NULL);
did_forward_SIGTERM = true;
}
/* Forward SIGTERM to detached tasks immediately */
if (!did_term_detached_tasks) {
session->term_detached_tasks();
did_term_detached_tasks = true;
}
}
} while (step_result.status == RecordSession::STEP_CONTINUE);
session->close_trace_writer(TraceWriter::CLOSE_OK);
// The trace is closed; force_close_record_session() must no longer touch it.
static_session = nullptr;
switch (step_result.status) {
case RecordSession::STEP_CONTINUE:
// SIGTERM interrupted us.
// NOTE(review): the loop above only exits once the status is no longer
// STEP_CONTINUE, so this case looks unreachable -- confirm.
return WaitStatus::for_fatal_sig(SIGTERM);
case RecordSession::STEP_EXITED:
return step_result.exit_status;
case RecordSession::STEP_SPAWN_FAILED:
cerr << "\n" << step_result.failure_message << "\n";
return WaitStatus::for_exit_code(EX_UNAVAILABLE);
default:
DEBUG_ASSERT(0 && "Unknown exit status");
return WaitStatus();
}
}
/**
 * Replace the current process with the program named by |args|.
 *
 * execvp() is tried first and, if that fails, execv() on args[0] directly.
 * If neither succeeds (or |args| is empty) a message is printed to stderr and
 * the process exits with status 1.
 *
 * Never returns.
 */
static void exec_child(vector<string>& args) {
  if (args.empty()) {
    // Callers can get here before having validated the command line;
    // indexing args[0] on an empty vector would be undefined behavior.
    fprintf(stderr, "rr: no executable specified\n");
    _exit(1);
  }
  execvp(args[0].c_str(), StringVectorToCharArray(args).get());
  // That failed. Try executing the file directly.
  execv(args[0].c_str(), StringVectorToCharArray(args).get());
  // Save errno right away so that nothing called below can clobber it.
  const int exec_errno = errno;
  switch (exec_errno) {
    case ENOENT:
      fprintf(stderr, "execv failed: '%s' (or interpreter) not found (%s)\n",
              args[0].c_str(), errno_name(exec_errno).c_str());
      break;
    default:
      fprintf(stderr, "execv of '%s' failed (%s)\n", args[0].c_str(),
              errno_name(exec_errno).c_str());
      break;
  }
  _exit(1);
  // Never returns!
}
/**
 * Switch to the user and group named by SUDO_UID / SUDO_GID while keeping the
 * current capability sets.
 *
 * The capabilities are read first, PR_SET_KEEPCAPS is enabled, the gid and
 * uid are changed, and the saved capabilities are then written back. Finally
 * the ambient capability set is cleared. Any failure, including missing or
 * malformed SUDO_UID / SUDO_GID, is fatal.
 */
static void reset_uid_sudo() {
  // Let's change our uids now. We do keep capabilities though, since that's
  // the point of the exercise. The first exec will reset both the keepcaps,
  // and the capabilities in the child
  const char* sudo_uid = getenv("SUDO_UID");
  const char* sudo_gid = getenv("SUDO_GID");
  // getenv() returns null for unset variables; check explicitly rather than
  // relying on a debug-only assertion.
  if (!sudo_uid || !*sudo_uid || !sudo_gid || !*sudo_gid) {
    FATAL() << "SUDO_UID and SUDO_GID must both be set";
  }
  // Strictly parse a decimal id that fits in 32 bits.
  auto parse_id = [](const char* name, const char* text) {
    char* end = nullptr;
    errno = 0;
    unsigned long long value = strtoull(text, &end, 10);
    if (errno || end == text || *end || value > UINT32_MAX) {
      FATAL() << "Invalid " << name << " value `" << text << "`";
    }
    return value;
  };
  uid_t tracee_uid = parse_id("SUDO_UID", sudo_uid);
  gid_t tracee_gid = parse_id("SUDO_GID", sudo_gid);
  // Setuid will drop effective capabilities. Save them now and set them
  // back after
  struct NativeArch::cap_header header = {.version =
                                              _LINUX_CAPABILITY_VERSION_3,
                                          .pid = 0 };
  struct NativeArch::cap_data data[2];
  if (syscall(NativeArch::capget, &header, data) != 0) {
    FATAL() << "FAILED to read capabilities";
  }
  if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0)) {
    FATAL() << "FAILED to set keepcaps";
  }
  // Change the group first, then the user.
  if (setgid(tracee_gid) != 0) {
    FATAL() << "FAILED to setgid to sudo group";
  }
  if (setuid(tracee_uid) != 0) {
    FATAL() << "FAILED to setuid to sudo user";
  }
  if (syscall(NativeArch::capset, &header, data) != 0) {
    FATAL() << "FAILED to set capabilities";
  }
  // Just make sure the ambient set is cleared, to avoid polluting the tracee
  prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);
}
/**
 * Entry point of `rr record`.
 *
 * Parses all leading options from |args|, handles the case of being run
 * under an outer rr according to --nested, validates that a program to record
 * was given, applies --setuid-sudo and --chaos preparations and then performs
 * the recording.
 *
 * Returns the tracee's exit code when it exited normally. If the recording
 * ended with a fatal signal, that signal is re-raised against this process
 * with the default disposition. Returns 1 on usage errors.
 */
int RecordCommand::run(vector<string>& args) {
  RecordFlags flags;
  // Consume options one at a time until something that isn't one is found.
  while (parse_record_arg(args, flags)) {
  }
  if (running_under_rr()) {
    switch (flags.nested) {
      case NESTED_IGNORE:
        // exec_child() needs a program to run.
        if (args.empty()) {
          print_help(stderr);
          return 1;
        }
        exec_child(args);
        return 1;
      case NESTED_DETACH:
      case NESTED_RELEASE: {
        // For `release` the child is exec'd below, so reject a missing
        // program before detaching from the outer rr.
        if (flags.nested == NESTED_RELEASE && args.empty()) {
          print_help(stderr);
          return 1;
        }
        int ret = syscall(SYS_rrcall_detach_teleport, (uintptr_t)0, (uintptr_t)0,
                          (uintptr_t)0, (uintptr_t)0, (uintptr_t)0, (uintptr_t)0);
        if (ret < 0) {
          FATAL() << "Failed to detach from parent rr";
        }
        if (running_under_rr(false)) {
          FATAL() << "Detaching from parent rr did not work";
        }
        if (flags.nested == NESTED_RELEASE) {
          exec_child(args);
          return 1;
        }
        // running_under_rr() changed - respect the log specification from RR_LOG
        // just as if we hadn't been running under rr.
        apply_log_spec_from_env();
        break;
      }
      default:
        fprintf(stderr, "rr: cannot run rr recording under rr. Exiting.\n"
                        "Use `rr record --nested=ignore` to start the child "
                        "process directly.\n");
        return 1;
    }
  }
  if (!verify_not_option(args) || args.size() == 0) {
    print_help(stderr);
    return 1;
  }
  assert_prerequisites(flags.use_syscall_buffer);
  if (flags.setuid_sudo) {
    if (geteuid() != 0 || getenv("SUDO_UID") == NULL) {
      fprintf(stderr, "rr: --setuid-sudo option may only be used under sudo.\n"
                      "Re-run as `sudo -EP --preserve-env=HOME rr record --setuid-sudo` to "
                      "record privileged executables.\n");
      return 1;
    }
    reset_uid_sudo();
  }
  if (flags.chaos) {
    // Add up to one page worth of random padding to the environment to induce
    // a variety of possible stack pointer offsets
    string padding = "RR_CHAOS_PADDING=" + string(random() % page_size(), '0');
    flags.extra_env.push_back(padding);
  }
  WaitStatus status = record(args, flags);
  // Everything should have been cleaned up by now.
  check_for_leaks();
  switch (status.type()) {
    case WaitStatus::EXIT:
      return status.exit_code();
    case WaitStatus::FATAL_SIGNAL:
      // Die from the same signal as the tracee: restore the default
      // disposition, mark the process non-dumpable and signal ourselves.
      signal(status.fatal_sig(), SIG_DFL);
      prctl(PR_SET_DUMPABLE, 0);
      kill(getpid(), status.fatal_sig());
      break;
    default:
      FATAL() << "Don't know why we exited: " << status;
      break;
  }
  return 1;
}
} // namespace rr