/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#include <errno.h>
#include <limits.h>
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/ipc.h>
#include <linux/net.h>
#include <linux/perf_event.h>
#include <linux/prctl.h>
#include <linux/unistd.h>
#include <math.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <sys/personality.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <syscall.h>
#include <limits>
#include <set>
#include <sstream>
#include <rr/rr.h>
#include "Task.h"
#include "preload/preload_interface.h"
#include "AutoRemoteSyscalls.h"
#include "CPUIDBugDetector.h"
#include "Flags.h"
#include "MagicSaveDataMonitor.h"
#include "PidFdMonitor.h"
#include "PreserveFileMonitor.h"
#include "ProcMemMonitor.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "ReplaySession.h"
#include "ReplayTask.h"
#include "ScopedFd.h"
#include "StdioMonitor.h"
#include "StringVectorToCharArray.h"
#include "TraceeAttentionSet.h"
#include "WaitManager.h"
#include "cpp_supplement.h"
#include "fast_forward.h"
#include "kernel_abi.h"
#include "kernel_metadata.h"
#include "kernel_supplement.h"
#include "log.h"
#include "record_signal.h"
#include "seccomp-bpf.h"
#include "util.h"
using namespace std;
namespace rr {
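// x86 exposes 8 debug registers (DR0-DR7) through the user area; only
// DR0-DR3 can hold watchpoint addresses.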
static const unsigned int NUM_X86_DEBUG_REGS = 8;
static const unsigned int NUM_X86_WATCHPOINTS = 4;
Task::Task(Session& session, pid_t _tid, pid_t _rec_tid, uint32_t serial,
SupportedArch a)
: scratch_ptr(),
scratch_size(),
// This will be initialized when the syscall buffer is.
desched_fd_child(-1),
// This will be initialized when the syscall buffer is.
cloned_file_data_fd_child(-1),
hpc(_tid, session.cpu_binding(), session.ticks_semantics(),
session.need_performance_counters()),
tid(_tid),
rec_tid(_rec_tid > 0 ? _rec_tid : _tid),
own_namespace_rec_tid(_rec_tid > 0 ? _rec_tid: _tid),
syscallbuf_size(0),
ticks_at_last_syscall_entry(0),
ip_at_last_syscall_entry(nullptr),
last_syscall_entry_recorded(false),
serial(serial),
ticks(0),
registers(a),
how_last_execution_resumed(RESUME_CONT),
last_resume_orig_cx(0),
did_set_breakpoint_after_cpuid(false),
is_stopped_(false),
in_unexpected_exit(false),
seccomp_bpf_enabled(false),
registers_dirty(false),
orig_syscallno_dirty(false),
extra_registers(a),
extra_registers_known(false),
session_(&session),
top_of_stack(),
seen_ptrace_exit_event_(false),
handled_ptrace_exit_event_(false),
expecting_ptrace_interrupt_stop(0),
was_reaped_(false),
forgotten(false) {
memset(&thread_locals, 0, sizeof(thread_locals));
}
void Task::detach() {
LOG(debug) << "detaching from Task " << tid << " (rec:" << rec_tid << ")";
fallible_ptrace(PTRACE_DETACH, nullptr, nullptr);
// Not really reaped, but there's also no reason to actually try to reap it,
// since we detached.
was_reaped_ = true;
}
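// Re-enable native CPUID and RDTSC execution in the tracee. rr may have
// configured both to trap so it can control their results; undo that here.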
void Task::reenable_cpuid_tsc() {
if (is_x86ish(arch())) {
AutoRemoteSyscalls remote(this);
if (session().has_cpuid_faulting()) {
remote.infallible_syscall(syscall_number_for_arch_prctl(arch()),
ARCH_SET_CPUID, 1);
}
remote.infallible_syscall(syscall_number_for_prctl(arch()),
PR_SET_TSC, PR_TSC_ENABLE);
}
}
void Task::wait_exit() {
LOG(debug) << "Waiting for exit of " << tid;
/* We want to wait for the child to exit, but we don't actually
* want to reap the task when it's dead. We could use WEXITED | WNOWAIT,
* but that would hang if `t` is a thread-group-leader of a thread group
* that has other still-running threads. Instead, we wait for WSTOPPED, but
* we know that there is no possibility for the task to stop between now and
* its exit, at which point the system call will return with -ECHILD.
* There is one exception: If there was a simultaneous exec from another
* thread, and this is the group leader, then this task may lose its pid
* as soon as it enters the zombie state, so that `tid` refers to the
* newly-execed thread and we get a PTRACE_EVENT_EXEC instead. To account
* for this we add `| WNOWAIT` to prevent dequeuing the event and simply take
* it as an indication that the task has execed.
*/
WaitOptions options(tid);
options.consume = false;
do {
WaitResult result = WaitManager::wait_stop(options);
if (result.code == WAIT_OK) {
if (result.status.ptrace_event() == PTRACE_EVENT_EXIT) {
// It's possible that the earlier exit event was synthetic, in which
// case we're only now catching up to the real process exit. In that
// case, just ask the process to actually exit. (TODO: We may want to
// catch this earlier).
return proceed_to_exit();
}
ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXEC)
<< "Expected PTRACE_EVENT_EXEC, got " << result.status;
// The kernel will do the reaping for us in this case
was_reaped_ = true;
} else if (result.code == WAIT_NO_STATUS) {
// Wait was EINTR'd most likely - retry.
continue;
} else {
ASSERT(this, result.code == WAIT_NO_CHILD);
}
} while (false);
}
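// Resume the task so it can finish exiting, optionally waiting for the exit
// to complete.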
void Task::proceed_to_exit(bool wait) {
LOG(debug) << "Advancing tid " << tid << " to exit; wait=" << wait;
int ret = fallible_ptrace(PTRACE_CONT, nullptr, nullptr);
ASSERT(this, ret == 0 || (ret == -1 && errno == ESRCH))
<< "Got ret=" << ret << " errno=" << errno;
if (wait) {
wait_exit();
}
}
WaitStatus Task::kill() {
if (was_reaped()) {
return this->status();
}
/* This call is racy. There are basically three situations:
* 1. By the time the kernel gets around to delivering this signal,
* we were already in a PTRACE_EVENT_EXIT stop (e.g. due to an earlier
* fatal signal or group exit from a sibling task that the kernel
* didn't report to us yet), that we didn't observe yet (if we had, we
* would have removed the task from the task map already). In this case,
* this signal will advance from the PTRACE_EVENT_EXIT and put the child
* into hidden-zombie state, which the waitpid below will reap.
* 2. The task was in a coredump wait. This situation essentially works the
* same as 1, but the final exit status will be some other fatal signal.
* 3. Anything else basically. The signal will take priority and put us
* into the PTRACE_EVENT_EXIT stop, which the subsequent waitpid will
* then observe.
*/
LOG(debug) << "Sending SIGKILL to " << tid;
int ret = syscall(SYS_tgkill, real_tgid(), tid, SIGKILL);
ASSERT(this, ret == 0);
WaitResult result;
bool is_exit_event;
do {
result = WaitManager::wait_stop_or_exit(WaitOptions(tid));
ASSERT(this, result.code == WAIT_OK);
LOG(debug) << " -> " << result.status;
is_exit_event = result.status.ptrace_event() == PTRACE_EVENT_EXIT;
// Loop until we get a suitable event; there could be a cached stop
// notification.
} while (!(is_exit_event || result.status.type() == WaitStatus::FATAL_SIGNAL ||
result.status.type() == WaitStatus::EXIT));
did_kill();
WaitStatus status = result.status;
if (is_exit_event) {
/* If this is the exit event, we can detach here and the task will
* continue to zombie state for its parent to reap. If we're not in
* the exit event, we already reaped it from the ptrace perspective,
* which implicitly detached.
*/
unsigned long long_status;
if (ptrace_if_stopped(PTRACE_GETEVENTMSG, nullptr, &long_status)) {
status = WaitStatus(long_status);
} else {
// The task has been killed due to SIGKILL or equivalent.
status = WaitStatus::for_fatal_sig(SIGKILL);
}
int ret = fallible_ptrace(PTRACE_DETACH, nullptr, nullptr);
DEBUG_ASSERT(ret == 0 || (ret == -1 && errno == ESRCH));
if (ret == -1) {
/* It's possible for the above ptrace to fail with ESRCH. How?
* It's the other side of the race described above. If an external
* process issues an additional SIGKILL, we will advance from the
* ptrace exit event and we might still be processing the exit, just
* as the detach request comes in. To address this, we waitpid again,
* which will reap/detach us from ptrace and free the real parent to
* do its reaping. */
result = WaitManager::wait_exit(WaitOptions(tid));
ASSERT(this, result.code == WAIT_OK);
LOG(debug) << " --> " << result.status;
ASSERT(this, result.status.fatal_sig() == SIGKILL);
status = result.status;
}
} else {
was_reaped_ = true;
}
return status;
}
Task::~Task() {
if (!forgotten) {
ASSERT(this, handled_ptrace_exit_event_);
ASSERT(this, syscallbuf_child.is_null());
if (!session().is_recording() && !was_reaped()) {
// Reap the zombie.
WaitResult result = WaitManager::wait_exit(WaitOptions(tid));
ASSERT(this, result.code == WAIT_OK || result.code == WAIT_NO_CHILD);
}
LOG(debug) << " dead";
}
session().on_destroy(this);
tg->erase_task(this);
as->erase_task(this);
fds->erase_task(this);
}
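// Mark this task as forgotten so ~Task skips its usual exit-event assertions
// and reaping.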
void Task::forget() {
forgotten = true;
}
void Task::finish_emulated_syscall() {
// XXX verify that this can't be interrupted by a breakpoint trap
Registers r = regs();
// Passing RESUME_NO_TICKS here is not only a small performance optimization,
// but also avoids counting an event if the instruction immediately following
// a syscall instruction is a conditional branch.
bool ok = resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
ASSERT(this, ok) << "Tracee exited unexpectedly";
set_regs(r);
wait_status = WaitStatus();
}
string Task::name() const {
char buf[1024];
sprintf(buf, "/proc/%d/comm", tid);
ScopedFd comm(buf, O_RDONLY);
if (!comm.is_open()) {
return "???";
}
ssize_t bytes = read(comm, buf, sizeof(buf) - 1);
ASSERT(this, bytes >= 0);
if (bytes > 0 && buf[bytes - 1] == '\n') {
--bytes;
}
return string(buf, bytes);
}
void Task::set_name(AutoRemoteSyscalls& remote, const std::string& name) {
ASSERT(this, this == remote.task());
char prname[17];
strncpy(prname, name.c_str(), sizeof(prname));
prname[16] = 0;
AutoRestoreMem remote_prname(remote, (const uint8_t*)prname, 16);
LOG(debug) << " setting name to " << prname;
remote.infallible_syscall(syscall_number_for_prctl(remote.arch()), PR_SET_NAME,
remote_prname.get().as_int());
}
void Task::dump(FILE* out) const {
out = out ? out : stderr;
stringstream ss;
ss << wait_status;
fprintf(out, " %s(tid:%d rec_tid:%d status:0x%s)<%p>\n", name().c_str(),
tid, rec_tid, ss.str().c_str(), this);
if (session().is_recording()) {
// TODO pending events are currently only meaningful
// during recording. We should change that
// eventually, to have more informative output.
log_pending_events();
}
}
std::string Task::proc_fd_path(int fd) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
return path;
}
std::string Task::proc_pagemap_path() {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/pagemap", tid);
return path;
}
std::string Task::proc_stat_path() {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/stat", tid);
return path;
}
std::string Task::proc_exe_path() {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/exe", tid);
return path;
}
std::string Task::exe_path() {
char proc_exe[PATH_MAX];
snprintf(proc_exe, sizeof(proc_exe), "/proc/%d/exe", tid);
char exe[PATH_MAX];
ssize_t ret = readlink(proc_exe, exe, sizeof(exe) - 1);
ASSERT(this, ret >= 0);
exe[ret] = 0;
return exe;
}
struct stat Task::stat_fd(int fd) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
struct stat result;
auto ret = ::stat(path, &result);
ASSERT(this, ret == 0);
return result;
}
struct stat Task::lstat_fd(int fd) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
struct stat result;
auto ret = ::lstat(path, &result);
ASSERT(this, ret == 0);
return result;
}
ScopedFd Task::open_fd(int fd, int flags) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
return ScopedFd(path, flags);
}
string Task::file_name_of_fd(int fd) {
char path[PATH_MAX];
char procfd[40];
snprintf(procfd, sizeof(procfd) - 1, "/proc/%d/fd/%d", tid, fd);
ssize_t nbytes = readlink(procfd, path, sizeof(path) - 1);
if (nbytes < 0) {
path[0] = 0;
} else {
path[nbytes] = 0;
}
return path;
}
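// Fetch the PTRACE_GETEVENTMSG value as a pid (e.g. the child pid reported
// by fork/vfork/clone ptrace events), or -1 if the tracee can no longer be
// queried because it is not stopped.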
pid_t Task::get_ptrace_eventmsg_pid() {
unsigned long msg = 0;
if (!ptrace_if_stopped(PTRACE_GETEVENTMSG, nullptr, &msg)) {
return -1;
}
return msg;
}
const siginfo_t& Task::get_siginfo() {
DEBUG_ASSERT(stop_sig());
return pending_siginfo;
}
/**
* Must be idempotent.
*/
void Task::destroy_buffers(Task *as_task, Task *fd_task) {
auto saved_syscallbuf_child = syscallbuf_child;
// Clear syscallbuf_child now so nothing tries to use it while tearing
// down buffers.
syscallbuf_child = nullptr;
if (as_task != nullptr) {
AutoRemoteSyscalls remote(as_task);
as_task->unmap_buffers_for(remote, this, saved_syscallbuf_child);
if (as_task == fd_task) {
as_task->close_buffers_for(remote, this, true);
// Buffers and fds are both handled; skip the separate fd_task pass below.
goto done;
}
}
if (fd_task != nullptr) {
AutoRemoteSyscalls remote(fd_task);
fd_task->close_buffers_for(remote, this, true);
}
done:
scratch_ptr = nullptr;
desched_fd_child = -1;
cloned_file_data_fd_child = -1;
}
void Task::unmap_buffers_for(
AutoRemoteSyscalls& remote, Task* other,
remote_ptr<struct syscallbuf_hdr> saved_syscallbuf_child) {
if (other->scratch_ptr) {
if (remote.infallible_munmap_syscall_if_alive(
other->scratch_ptr, other->scratch_size)) {
vm()->unmap(this, other->scratch_ptr, other->scratch_size);
}
}
if (!saved_syscallbuf_child.is_null()) {
if (remote.infallible_munmap_syscall_if_alive(
saved_syscallbuf_child, other->syscallbuf_size)) {
vm()->unmap(this, saved_syscallbuf_child, other->syscallbuf_size);
}
}
}
void Task::did_kill()
{
/* We may or may not have seen this event (see the note on race conditions
* in Session.cc), but let's pretend that we did to make this task look like
* other tasks that we didn't kill ourselves.
*/
seen_ptrace_exit_event_ = true;
handled_ptrace_exit_event_ = true;
syscallbuf_child = nullptr;
/* No need to unmap/close things in the child here - the kernel did that for
* us when the child died. */
scratch_ptr = nullptr;
desched_fd_child = -1;
cloned_file_data_fd_child = -1;
}
/**
* Must be idempotent.
*/
void Task::close_buffers_for(AutoRemoteSyscalls& remote, Task* other, bool really_close) {
if (other->desched_fd_child >= 0) {
if (session().is_recording() && really_close) {
remote.infallible_close_syscall_if_alive(other->desched_fd_child);
}
fds->did_close(other->desched_fd_child);
}
if (other->cloned_file_data_fd_child >= 0) {
if (really_close) {
remote.infallible_close_syscall_if_alive(other->cloned_file_data_fd_child);
}
fds->did_close(other->cloned_file_data_fd_child);
}
}
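// Emulate a jump by pointing the ip at the target and accounting for the
// ticks an unconditional indirect branch would contribute.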
void Task::emulate_jump(remote_code_ptr ip) {
Registers r = regs();
r.set_ip(ip);
set_regs(r);
ticks += PerfCounters::ticks_for_unconditional_indirect_branch(this);
}
bool Task::is_desched_event_syscall() {
return is_ioctl_syscall(regs().original_syscallno(), arch()) &&
desched_fd_child != -1 &&
desched_fd_child == (int)regs().arg1_signed();
}
bool Task::is_ptrace_seccomp_event() const {
int event = ptrace_event();
return (PTRACE_EVENT_SECCOMP_OBSOLETE == event ||
PTRACE_EVENT_SECCOMP == event);
}
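// For emulated PTRACE_SETREGSET: read the iovec the tracee passed in arg4
// and fetch the register payload it points to from tracee memory.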
template <typename Arch>
static vector<uint8_t> ptrace_get_regs_set(Task* t, const Registers& regs,
size_t min_size) {
auto iov = t->read_mem(remote_ptr<typename Arch::iovec>(regs.arg4()));
ASSERT(t, iov.iov_len >= min_size)
<< "Should have been caught during prepare_ptrace";
return t->read_mem(iov.iov_base.rptr().template cast<uint8_t>(), iov.iov_len);
}
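// shmdt() detaches the segment mapped at `addr`; drop the size we recorded
// for it and remove the mapping from our address-space model.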
static void process_shmdt(Task* t, remote_ptr<void> addr) {
size_t size = t->vm()->get_shm_size(addr);
t->vm()->remove_shm_size(addr);
t->vm()->unmap(t, addr, size);
}
template <typename Arch>
static void ptrace_syscall_exit_legacy_arch(Task* t, Task* tracee, const Registers& regs)
{
switch ((int)regs.orig_arg1_signed()) {
case Arch::PTRACE_SETREGS: {
auto data = t->read_mem(
remote_ptr<typename Arch::user_regs_struct>(regs.arg4()));
Registers r = tracee->regs();
r.set_from_ptrace_for_arch(Arch::arch(), &data, sizeof(data));
tracee->set_regs(r);
break;
}
case Arch::PTRACE_SETFPREGS: {
auto data = t->read_mem(
remote_ptr<typename Arch::user_fpregs_struct>(regs.arg4()));
auto r = tracee->extra_regs();
r.set_user_fpregs_struct(t, Arch::arch(), &data, sizeof(data));
tracee->set_extra_regs(r);
break;
}
case Arch::PTRACE_SETFPXREGS: {
auto data =
t->read_mem(remote_ptr<X86Arch::user_fpxregs_struct>(regs.arg4()));
auto r = tracee->extra_regs();
r.set_user_fpxregs_struct(t, data);
tracee->set_extra_regs(r);
break;
}
case Arch::PTRACE_POKEUSR: {
size_t addr = regs.arg3();
typename Arch::unsigned_word data = regs.arg4();
if (addr < sizeof(typename Arch::user_regs_struct)) {
Registers r = tracee->regs();
r.write_register_by_user_offset(addr, data);
tracee->set_regs(r);
} else if (addr >= offsetof(typename Arch::user, u_debugreg[0]) &&
addr < offsetof(typename Arch::user, u_debugreg[8])) {
size_t regno =
(addr - offsetof(typename Arch::user, u_debugreg[0])) /
sizeof(data);
tracee->set_x86_debug_reg(regno, data);
}
break;
}
default:
break;
}
}
template <>
void ptrace_syscall_exit_legacy_arch<ARM64Arch>(Task*, Task*, const Registers&)
{
// Nothing to do - unimplemented on this architecture
return;
}
template <typename Arch>
void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) {
session().accumulate_syscall_performed();
if (regs.original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO) {
return;
}
if (syscallno == session_->syscall_number_for_rrcall_mprotect_record()) {
// When we record an rr replay of a tracee which does a syscallbuf'ed
// `mprotect`, neither the replay nor its recording see the mprotect
// syscall, since it's untraced during both recording and replay. rr
// replay is notified of the syscall via the `mprotect_records`
// mechanism; if it's being recorded, it forwards that notification to
// the recorder by calling this syscall.
pid_t tid = regs.orig_arg1();
remote_ptr<void> addr = regs.arg2();
size_t num_bytes = regs.arg3();
int prot = regs.arg4_signed();
Task* t = session().find_task(tid);
ASSERT(this, t);
return t->vm()->protect(t, addr, num_bytes, prot);
}
// mprotect can change the protection status of some mapped regions before
// failing.
// SYS_rrcall_mprotect_record always fails with ENOSYS, though we want to
// note its usage here.
if (regs.syscall_failed() && !is_mprotect_syscall(syscallno, regs.arch())
&& !is_pkey_mprotect_syscall(syscallno, regs.arch())) {
return;
}
switch (syscallno) {
case Arch::brk:
case Arch::mmap:
case Arch::mmap2:
case Arch::mremap: {
LOG(debug) << "(brk/mmap/mmap2/mremap will receive / has received direct "
"processing)";
return;
}
case Arch::pkey_mprotect:
case Arch::mprotect: {
remote_ptr<void> addr = regs.orig_arg1();
size_t num_bytes = regs.arg2();
int prot = regs.arg3_signed();
return vm()->protect(this, addr, num_bytes, prot);
}
case Arch::munmap: {
remote_ptr<void> addr = regs.orig_arg1();
size_t num_bytes = regs.arg2();
return vm()->unmap(this, addr, num_bytes);
}
case Arch::shmdt:
return process_shmdt(this, regs.orig_arg1());
case Arch::madvise: {
remote_ptr<void> addr = regs.orig_arg1();
size_t num_bytes = regs.arg2();
int advice = regs.arg3();
return vm()->advise(this, addr, num_bytes, advice);
}
case Arch::ipc: {
switch ((int)regs.orig_arg1_signed()) {
case SHMDT:
return process_shmdt(this, regs.arg5());
default:
break;
}
break;
}
case Arch::set_thread_area:
set_thread_area(regs.orig_arg1());
return;
case Arch::prctl:
switch ((int)regs.orig_arg1_signed()) {
case PR_SET_SECCOMP:
if (regs.arg2() == SECCOMP_MODE_FILTER && session().is_recording()) {
seccomp_bpf_enabled = true;
}
break;
case PR_SET_NAME:
did_prctl_set_prname(regs.arg2());
break;
}
return;
case Arch::dup:
case Arch::dup2:
case Arch::dup3:
fd_table()->did_dup(regs.orig_arg1(), regs.syscall_result());
return;
case Arch::fcntl64:
case Arch::fcntl:
if (regs.arg2() == Arch::DUPFD || regs.arg2() == Arch::DUPFD_CLOEXEC) {
fd_table()->did_dup(regs.orig_arg1(), regs.syscall_result());
}
return;
case Arch::close:
fd_table()->did_close(regs.orig_arg1());
return;
case Arch::unshare:
if (regs.orig_arg1() & CLONE_FILES) {
fds->erase_task(this);
fds = fds->clone();
fds->insert_task(this);
vm()->fd_tables_changed();
}
return;
case Arch::pwrite64:
case Arch::write: {
int fd = (int)regs.orig_arg1_signed();
vector<FileMonitor::Range> ranges;
ssize_t amount = regs.syscall_result_signed();
if (amount > 0) {
ranges.push_back(FileMonitor::Range(regs.arg2(), amount));
}
FileMonitor::LazyOffset offset(this, regs, syscallno);
fd_table()->did_write(this, fd, ranges, offset);
return;
}
case Arch::pwritev:
case Arch::writev: {
int fd = (int)regs.orig_arg1_signed();
vector<FileMonitor::Range> ranges;
auto iovecs =
read_mem(remote_ptr<typename Arch::iovec>(regs.arg2()), regs.arg3());
ssize_t written = regs.syscall_result_signed();
ASSERT(this, written >= 0);
for (auto& v : iovecs) {
ssize_t amount = min<ssize_t>(written, v.iov_len);
if (amount > 0) {
ranges.push_back(FileMonitor::Range(v.iov_base, amount));
written -= amount;
}
}
FileMonitor::LazyOffset offset(this, regs, syscallno);
fd_table()->did_write(this, fd, ranges, offset);
return;
}
case Arch::ptrace: {
pid_t pid = (pid_t)regs.arg2_signed();
Task* tracee = session().find_task(pid);
switch ((int)regs.orig_arg1_signed()) {
case PTRACE_SETREGSET: {
switch ((int)regs.arg3()) {
case NT_PRSTATUS: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, user_regs_struct_size(tracee->arch()));
Registers r = tracee->regs();
r.set_from_ptrace_for_arch(tracee->arch(), set.data(), set.size());
tracee->set_regs(r);
break;
}
case NT_PRFPREG: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, user_fpregs_struct_size(tracee->arch()));
ExtraRegisters r = tracee->extra_regs();
r.set_user_fpregs_struct(this, tracee->arch(), set.data(),
set.size());
tracee->set_extra_regs(r);
break;
}
case NT_ARM_SYSTEM_CALL: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, sizeof(int));
ASSERT(this, set.size() >= sizeof(int));
int new_syscallno = *(int*)set.data();
Registers r = tracee->regs();
r.set_original_syscallno(new_syscallno);
tracee->set_regs(r);
break;
}
case NT_ARM_HW_WATCH:
case NT_ARM_HW_BREAK: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, offsetof(ARM64Arch::user_hwdebug_state, dbg_regs[0]));
ASSERT(this, set.size() >= sizeof(int));
tracee->set_aarch64_debug_regs((int)regs.arg3(),
(ARM64Arch::user_hwdebug_state*)set.data(),
(set.size() - offsetof(ARM64Arch::user_hwdebug_state, dbg_regs[0]))/
2*sizeof(ARM64Arch::hw_bp));
break;
}
case NT_X86_XSTATE: {
switch (tracee->extra_regs().format()) {
case ExtraRegisters::XSAVE: {
XSaveLayout layout;
ReplaySession* replay = session().as_replay();
if (replay) {
layout = xsave_layout_from_trace(
replay->trace_reader().cpuid_records());
} else {
layout = xsave_native_layout();
}
auto set = ptrace_get_regs_set<Arch>(this, regs, layout.full_size);
ExtraRegisters r;
bool ok =
r.set_to_raw_data(tracee->arch(), ExtraRegisters::XSAVE,
set.data(), set.size(), layout);
ASSERT(this, ok) << "Invalid XSAVE data";
tracee->set_extra_regs(r);
break;
}
default:
ASSERT(this, false) << "Unknown ExtraRegisters format; "
"Should have been caught during "
"prepare_ptrace";
}
break;
}
default:
ASSERT(this, false) << "Unknown regset type; Should have been "
"caught during prepare_ptrace";
break;
}
break;
}
case Arch::PTRACE_ARCH_PRCTL: {
if (tracee->arch() != x86_64) {
break;
}
int code = (int)regs.arg4();
switch (code) {
case ARCH_GET_FS:
case ARCH_GET_GS:
break;
case ARCH_SET_FS:
case ARCH_SET_GS: {
Registers r = tracee->regs();
if (regs.arg3() == 0) {
// Work around a kernel bug in pre-4.7 kernels, where setting
// the gs/fs base to 0 via PTRACE_REGSET did not work correctly.
// If this fails the tracee is on the exit path and it
// doesn't matter what its fs/gs base is.
tracee->ptrace_if_stopped(Arch::PTRACE_ARCH_PRCTL, regs.arg3(),
(void*)(uintptr_t)regs.arg4());
}
if (code == ARCH_SET_FS) {
r.set_fs_base(regs.arg3());
} else {
r.set_gs_base(regs.arg3());
}
tracee->set_regs(r);
break;
}
default:
ASSERT(tracee, 0) << "Should have detected this earlier";
}
break;
}
case Arch::PTRACE_SETREGS:
case Arch::PTRACE_SETFPREGS:
case Arch::PTRACE_SETFPXREGS:
case Arch::PTRACE_POKEUSR: {
ptrace_syscall_exit_legacy_arch<Arch>(this, tracee, regs);
}
}
return;
}
case Arch::pidfd_open: {
int fd = regs.syscall_result();
pid_t pid = (pid_t)regs.orig_arg1();
TaskUid tuid;
if (Task* t = session().find_task(pid)) {
tuid = t->tuid();
}
fd_table()->add_monitor(this, fd, new PidFdMonitor(tuid));
return;
}
case Arch::pidfd_getfd: {
int pidfd = regs.orig_arg1();
int fd = regs.arg2();
if (PidFdMonitor* monitor = PidFdMonitor::get(fd_table().get(), pidfd)) {
// NB: This can return NULL if the pidfd is for a process outside of
// the rr trace.
if (auto source = monitor->fd_table(session())) {
fd_table()->did_dup(source.get(), fd, regs.syscall_result());
}
} else {
LOG(warn) << "pidfd_getfd succeeded but we lost track of the pidfd " << pidfd;
}
return;
}
}
}
void Task::on_syscall_exit(int syscallno, SupportedArch arch,
const Registers& regs) {
with_converted_registers<void>(regs, arch, [&](const Registers& regs) {
RR_ARCH_FUNCTION(on_syscall_exit_arch, arch, syscallno, regs);
});
}
void Task::move_ip_before_breakpoint() {
// TODO: assert that this is at a breakpoint trap.
Registers r = regs();
r.set_ip(r.ip().undo_executed_bkpt(arch()));
set_regs(r);
}
bool Task::enter_syscall(bool allow_exit) {
bool need_ptrace_syscall_event = !seccomp_bpf_enabled ||
session().syscall_seccomp_ordering() ==
Session::SECCOMP_BEFORE_PTRACE_SYSCALL;
bool need_seccomp_event = seccomp_bpf_enabled;
while (need_ptrace_syscall_event || need_seccomp_event) {
if (!resume_execution(need_ptrace_syscall_event ? RESUME_SYSCALL : RESUME_CONT,
RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
return false;
}
if (is_ptrace_seccomp_event()) {
ASSERT(this, need_seccomp_event);
need_seccomp_event = false;
continue;
}
if (allow_exit && ptrace_event() == PTRACE_EVENT_EXIT) {
return false;
}
ASSERT(this, !ptrace_event());
if (session().is_recording() && wait_status.group_stop()) {
static_cast<RecordTask*>(this)->stash_group_stop();
continue;
}
if (!stop_sig()) {
ASSERT(this, need_ptrace_syscall_event);
need_ptrace_syscall_event = false;
continue;
}
if (ReplaySession::is_ignored_signal(stop_sig()) &&
session().is_replaying()) {
continue;
}
ASSERT(this, session().is_recording() && !is_deterministic_signal(this))
<< " got unexpected signal " << signal_name(stop_sig());
if (stop_sig() == session().as_record()->syscallbuf_desched_sig()) {
continue;
}
static_cast<RecordTask*>(this)->stash_sig();
}
apply_syscall_entry_regs();
canonicalize_regs(arch());
return true;
}
bool Task::exit_syscall() {
// If PTRACE_SYSCALL_BEFORE_SECCOMP, we are inconsistent about
// whether we process the syscall on the syscall entry trap or
// on the seccomp trap. Detect if we are on the former and
// just bring us forward to the seccomp trap.
bool will_see_seccomp = seccomp_bpf_enabled &&
(session().syscall_seccomp_ordering() ==
Session::PTRACE_SYSCALL_BEFORE_SECCOMP) &&
!is_ptrace_seccomp_event();
while (true) {
if (!resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
return false;
}
if (will_see_seccomp && is_ptrace_seccomp_event()) {
will_see_seccomp = false;
continue;
}
if (ptrace_event() == PTRACE_EVENT_EXIT) {
return false;
}
ASSERT(this, !ptrace_event());
if (!stop_sig()) {
canonicalize_regs(arch());
break;
}
if (ReplaySession::is_ignored_signal(stop_sig()) &&
session().is_replaying()) {
continue;
}
ASSERT(this, session().is_recording());
static_cast<RecordTask*>(this)->stash_sig();
}
return true;
}
bool Task::exit_syscall_and_prepare_restart() {
Registers r = regs();
int syscallno = r.original_syscallno();
LOG(debug) << "exit_syscall_and_prepare_restart from syscall "
<< rr::syscall_name(syscallno, r.arch());
r.set_original_syscallno(syscall_number_for_gettid(r.arch()));
set_regs(r);
// This exits the hijacked SYS_gettid. Now the tracee is
// ready to do our bidding.
if (!exit_syscall()) {
// The tracee unexpectedly exited. To get this to replay correctly, we need to
// make it look like we really entered the syscall. Then
// handle_ptrace_exit_event will record something appropriate.
r.emulate_syscall_entry();
set_regs(r);
return false;
}
LOG(debug) << "exit_syscall_and_prepare_restart done";
// Restore these regs to what they would have been just before
// the tracee trapped at the syscall.
r.set_original_syscallno(-1);
r.set_syscallno(syscallno);
r.set_ip(r.ip() - syscall_instruction_length(r.arch()));
set_regs(r);
return true;
}
#if defined(__i386__) || defined(__x86_64__)
#define AR_L (1 << 21)
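// Load the segment's access rights with LAR and test the L bit, which marks
// a 64-bit (long-mode) code segment.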
static bool is_long_mode_segment(uint32_t segment) {
uint32_t ar = 0;
asm("lar %[segment], %[ar]" : [ar] "=r"(ar) : [segment] "r"(segment));
return ar & AR_L;
}
#endif
void Task::post_exec(const string& exe_file) {
Task* stopped_task_in_address_space = nullptr;
bool other_task_in_address_space = false;
for (Task* t : as->task_set()) {
if (t != this) {
other_task_in_address_space = true;
if (t->is_stopped_) {
stopped_task_in_address_space = t;
break;
}
}
}
if (stopped_task_in_address_space) {
LOG(warn) << "Unmapping buffers using tid " << stopped_task_in_address_space->tid;
AutoRemoteSyscalls remote(stopped_task_in_address_space);
unmap_buffers_for(remote, this, syscallbuf_child);
} else if (other_task_in_address_space) {
// We should clean up our syscallbuf/scratch but that's too hard since we
// have no stopped task to use for that :-(.
// (We can't clean up those buffers *before* the exec completes, because it
// might fail in which case we shouldn't have cleaned them up.)
// Just let the buffers leak. The AddressSpace will clean up our local
// shared buffer when it's destroyed.
LOG(warn) << "Intentionally leaking syscallbuf after exec for task " << tid;
}
session().post_exec();
as->erase_task(this);
fds->erase_task(this);
extra_registers = ExtraRegisters(registers.arch());
extra_registers_known = false;
ExtraRegisters e = extra_regs();
e.reset();
set_extra_regs(e);
syscallbuf_child = nullptr;
syscallbuf_size = 0;
scratch_ptr = nullptr;
cloned_file_data_fd_child = -1;
desched_fd_child = -1;
preload_globals = nullptr;
rseq_state = nullptr;
thread_group()->execed = true;
thread_areas_.clear();
memset(&thread_locals, 0, sizeof(thread_locals));
as = session().create_vm(this, exe_file, as->uid().exec_count() + 1);
// It's barely-documented, but Linux unshares the fd table on exec
fds = fds->clone();
fds->insert_task(this);
}
static string prname_from_exe_image(const string& e) {
size_t last_slash = e.rfind('/');
return e.substr(last_slash == e.npos ? 0 : last_slash + 1);
}
void Task::post_exec_syscall(const std::string& original_exe_file) {
canonicalize_regs(arch());
as->post_exec_syscall(this);
AutoRemoteSyscalls remote(this);
set_name(remote, prname_from_exe_image(original_exe_file));
if (session().has_cpuid_faulting()) {
remote.infallible_syscall(syscall_number_for_arch_prctl(arch()),
ARCH_SET_CPUID, 0);
}
}
bool Task::execed() const { return tg->execed; }
void Task::flush_inconsistent_state() { ticks = 0; }
string Task::read_c_str(remote_ptr<char> child_addr, bool *ok) {
remote_ptr<void> p = child_addr;
string str;
while (true) {
// We're only guaranteed that [child_addr,
// end_of_page) is mapped.
remote_ptr<void> end_of_page = ceil_page_size(p + 1);
ssize_t nbytes = end_of_page - p;
std::unique_ptr<char[]> buf(new char[nbytes]);
read_bytes_helper(p, nbytes, buf.get(), ok);
if (ok && !*ok) {
return "";
}
for (int i = 0; i < nbytes; ++i) {
if ('\0' == buf[i]) {
return str;
}
str += buf[i];
}
p = end_of_page;
}
}
const Registers& Task::regs() const {
// If we're in an unexpected exit then the tracee may
// not be stopped but we know its registers won't change again,
// so it's safe to ask for them here.
ASSERT(this, is_stopped_ || was_reaped_ || in_unexpected_exit);
return registers;
}
const ExtraRegisters* Task::extra_regs_fallible() {
if (!extra_registers_known) {
#if defined(__i386__) || defined(__x86_64__)
if (xsave_area_size() > 512) {
LOG(debug) << " (refreshing extra-register cache using XSAVE)";
extra_registers.format_ = ExtraRegisters::XSAVE;
extra_registers.data_.resize(xsave_area_size());
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (fallible_ptrace(PTRACE_GETREGSET, NT_X86_XSTATE, &vec)) {
return nullptr;
}
extra_registers.data_.resize(vec.iov_len);
// The kernel may return less than the full XSTATE
extra_registers.validate(this);
} else {
#if defined(__i386__)
LOG(debug) << " (refreshing extra-register cache using FPXREGS)";
extra_registers.format_ = ExtraRegisters::XSAVE;
extra_registers.data_.resize(sizeof(user_fpxregs_struct));
if (fallible_ptrace(X86Arch::PTRACE_GETFPXREGS, nullptr, extra_registers.data_.data())) {
return nullptr;
}
#elif defined(__x86_64__)
// x86-64 that doesn't support XSAVE; apparently Xeon E5620 (Westmere)
// is in this class.
LOG(debug) << " (refreshing extra-register cache using FPREGS)";
extra_registers.format_ = ExtraRegisters::XSAVE;
extra_registers.data_.resize(sizeof(user_fpregs_struct));
if (fallible_ptrace(PTRACE_GETFPREGS, nullptr, extra_registers.data_.data())) {
return nullptr;
}
#endif
}
#elif defined(__aarch64__)
LOG(debug) << " (refreshing extra-register cache using FPR)";
extra_registers.format_ = ExtraRegisters::NT_FPR;
extra_registers.data_.resize(sizeof(ARM64Arch::user_fpregs_struct));
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (fallible_ptrace(PTRACE_GETREGSET, NT_PRFPREG, &vec)) {
return nullptr;
}
extra_registers.data_.resize(vec.iov_len);
#else
#error need to define new extra_regs support
#endif
extra_registers_known = true;
}
return &extra_registers;
}
const ExtraRegisters& Task::extra_regs() {
if (!extra_regs_fallible()) {
ASSERT(this, false) << "Can't find task for infallible extra_regs";
}
return extra_registers;
}
#if defined(__i386__) || defined(__x86_64__)
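// Byte offset of debug register `i` within the user area, as expected by
// PTRACE_PEEKUSER/PTRACE_POKEUSER.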
static ssize_t dr_user_word_offset(size_t i) {
DEBUG_ASSERT(i < NUM_X86_DEBUG_REGS);
return offsetof(struct user, u_debugreg[0]) + sizeof(void*) * i;
}
uintptr_t Task::get_debug_reg(size_t regno) {
errno = 0;
long result =
fallible_ptrace(PTRACE_PEEKUSER, dr_user_word_offset(regno), nullptr);
if (errno == ESRCH) {
return 0;
}
return result;
}
bool Task::set_x86_debug_reg(size_t regno, uintptr_t value) {
errno = 0;
fallible_ptrace(PTRACE_POKEUSER, dr_user_word_offset(regno), (void*)value);
return errno == ESRCH || errno == 0;
}
uintptr_t Task::x86_debug_status() {
return fallible_ptrace(PTRACE_PEEKUSER, dr_user_word_offset(6), nullptr);
}
#else
#define FATAL_X86_ONLY() FATAL() << "Reached x86-only code path on non-x86 architecture";
uintptr_t Task::get_debug_reg(size_t) {
FATAL_X86_ONLY();
return 0;
}
bool Task::set_x86_debug_reg(size_t, uintptr_t) {
FATAL_X86_ONLY();
return false;
}
uintptr_t Task::x86_debug_status() {
FATAL_X86_ONLY();
return 0;
}
#endif
#if defined(__aarch64__)
bool Task::set_aarch64_debug_regs(int which, ARM64Arch::user_hwdebug_state *regs, size_t nregs) {
errno = 0;
struct iovec iov { .iov_base = regs, .iov_len = sizeof(*regs) - (16-nregs)*sizeof(ARM64Arch::hw_bp) };
ASSERT(this, which == NT_ARM_HW_BREAK || which == NT_ARM_HW_WATCH);
fallible_ptrace(PTRACE_SETREGSET, which, (void*)&iov);
return errno == 0;
}
bool Task::get_aarch64_debug_regs(int which, ARM64Arch::user_hwdebug_state *regs) {
errno = 0;
struct iovec iov { .iov_base = regs, .iov_len = sizeof(*regs) };
ASSERT(this, which == NT_ARM_HW_BREAK || which == NT_ARM_HW_WATCH);
fallible_ptrace(PTRACE_GETREGSET, which, (void*)&iov);
return errno == 0;
}
#else
bool Task::set_aarch64_debug_regs(int, ARM64Arch::user_hwdebug_state *, size_t) {
FATAL() << "Reached aarch64 code path on non-aarch64 system";
return false;
}
bool Task::get_aarch64_debug_regs(int, ARM64Arch::user_hwdebug_state *regs) {
// The following memset is just to silence a warning that dbg_info may be used uninitialized.
memset(regs, 0, sizeof(*regs));
FATAL() << "Reached aarch64 code path on non-aarch64 system";
return false;
}
#endif
void Task::set_x86_debug_status(uintptr_t status) {
if (arch() == x86 || arch() == x86_64) {
set_x86_debug_reg(6, status);
}
}
static bool is_singlestep_resume(ResumeRequest request) {
return request == RESUME_SINGLESTEP || request == RESUME_SYSEMU_SINGLESTEP;
}
TrapReasons Task::compute_trap_reasons() {
ASSERT(this, stop_sig() == SIGTRAP);
TrapReasons reasons;
const siginfo_t& si = get_siginfo();
if (arch() == x86 || arch() == x86_64) {
uintptr_t status = x86_debug_status();
reasons.singlestep = (status & DS_SINGLESTEP) != 0;
if (!reasons.singlestep && is_singlestep_resume(how_last_execution_resumed)) {
if (is_at_syscall_instruction(this, address_of_last_execution_resume) &&
ip() ==
address_of_last_execution_resume +
syscall_instruction_length(arch())) {
// During replay we execute syscall instructions in certain cases, e.g.
// mprotect with syscallbuf. The kernel does not set DS_SINGLESTEP when we
// step over those instructions so we need to detect that here.
reasons.singlestep = true;
} else {
TrappedInstruction ti =
trapped_instruction_at(this, address_of_last_execution_resume);
if (ti == TrappedInstruction::CPUID &&
ip() == address_of_last_execution_resume +
trapped_instruction_len(TrappedInstruction::CPUID)) {
// Likewise we emulate CPUID instructions and must forcibly detect that
// here.
reasons.singlestep = true;
// This also takes care of the did_set_breakpoint_after_cpuid workaround case
} else if (ti == TrappedInstruction::INT3 &&
ip() == address_of_last_execution_resume +
trapped_instruction_len(TrappedInstruction::INT3)) {
// INT3 instructions should also be turned into a singlestep here.
reasons.singlestep = true;
}
}
}
// In VMWare Player 6.0.4 build-2249910, 32-bit Ubuntu x86 guest,
// single-stepping does not trigger watchpoints :-(. So we have to
// check watchpoints here. fast_forward also hides watchpoint changes.
// Write-watchpoints will detect that their value has changed and trigger.
// XXX Read/exec watchpoints can't be detected this way so they're still
// broken in the above configuration :-(.
if ((DS_WATCHPOINT_ANY | DS_SINGLESTEP) & status) {
as->notify_watchpoint_fired(status, nullptr,
is_singlestep_resume(how_last_execution_resumed)
? address_of_last_execution_resume : nullptr);
}
reasons.watchpoint =
as->has_any_watchpoint_changes() || (DS_WATCHPOINT_ANY & status);
} else if (arch() == aarch64) {
reasons.watchpoint = false;
reasons.singlestep = si.si_code == TRAP_TRACE;
reasons.watchpoint = si.si_code == TRAP_HWBKPT;
if (reasons.watchpoint) {
as->notify_watchpoint_fired(0, remote_ptr<void>((uintptr_t)si.si_addr),
is_singlestep_resume(how_last_execution_resumed)
? address_of_last_execution_resume : nullptr);
}
}
// If we triggered a breakpoint, this would be the address of the breakpoint
remote_code_ptr ip_at_breakpoint = ip().undo_executed_bkpt(arch());
// Don't trust siginfo to report execution of a breakpoint if singlestep or
// watchpoint triggered.
if (reasons.singlestep) {
reasons.breakpoint =
as->is_breakpoint_instruction(this, address_of_last_execution_resume);
if (reasons.breakpoint) {
ASSERT(this, address_of_last_execution_resume == ip_at_breakpoint);
}
} else if (reasons.watchpoint) {
// We didn't singlestep, so watchpoint state is completely accurate.
// The only way the last instruction could have triggered a watchpoint
// and be a breakpoint instruction is if an EXEC watchpoint fired
// at the breakpoint address.
reasons.breakpoint = as->has_exec_watchpoint_fired(ip_at_breakpoint) &&
as->is_breakpoint_instruction(this, ip_at_breakpoint);
} else {
ASSERT(this, SIGTRAP == si.si_signo) << " expected SIGTRAP, got " << si;
reasons.breakpoint = is_kernel_trap(si.si_code);
if (reasons.breakpoint) {
ASSERT(this, as->is_breakpoint_instruction(this, ip_at_breakpoint))
<< " expected breakpoint at " << ip_at_breakpoint << ", got siginfo "
<< si;
}
}
return reasons;
}
static void* preload_thread_locals_local_addr(AddressSpace& as) {
if (!as.has_mapping(AddressSpace::preload_thread_locals_start())) {
return nullptr;
}
// There might have been a mapping there, but not the one we expect (i.e.
// the one shared with us for thread locals). In that case we behave as
// if the mapping didn't exist at all.
auto& mapping = as.mapping_of(AddressSpace::preload_thread_locals_start());
if (mapping.flags & AddressSpace::Mapping::IS_THREAD_LOCALS) {
DEBUG_ASSERT(mapping.local_addr);
return mapping.local_addr;
}
return nullptr;
}
template <typename Arch> static void setup_preload_thread_locals_arch(Task* t) {
void* local_addr = preload_thread_locals_local_addr(*t->vm());
if (local_addr) {
auto locals = reinterpret_cast<preload_thread_locals<Arch>*>(local_addr);
static_assert(sizeof(*locals) <= PRELOAD_THREAD_LOCALS_SIZE,
"bad PRELOAD_THREAD_LOCALS_SIZE");
locals->syscallbuf_stub_alt_stack = t->syscallbuf_alt_stack();
}
}
void Task::setup_preload_thread_locals() {
activate_preload_thread_locals();
RR_ARCH_FUNCTION(setup_preload_thread_locals_arch, arch(), this);
}
const Task::ThreadLocals& Task::fetch_preload_thread_locals() {
if (tuid() == as->thread_locals_tuid()) {
void* local_addr = preload_thread_locals_local_addr(*as);
if (local_addr) {
memcpy(thread_locals, local_addr, PRELOAD_THREAD_LOCALS_SIZE);
return thread_locals;
}
// The mapping might have been removed by crazy application code.
// That's OK, assuming the preload library was removed too.
memset(&thread_locals, 0, sizeof(thread_locals));
}
return thread_locals;
}
void Task::activate_preload_thread_locals() {
// Switch thread-locals to the new task.
if (tuid() != as->thread_locals_tuid()) {
void* local_addr = preload_thread_locals_local_addr(*as);
if (local_addr) {
Task* t = session().find_task(as->thread_locals_tuid());
if (t) {
t->fetch_preload_thread_locals();
}
memcpy(local_addr, thread_locals, PRELOAD_THREAD_LOCALS_SIZE);
as->set_thread_locals_tuid(tuid());
}
}
}
#if defined(__x86_64__) || defined(__i386__)
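// Knights Landing is family 6, model 0x57; the mask below extracts the
// extended-model, family and model fields from CPUID leaf 1 EAX.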
static bool cpu_has_KNL_string_singlestep_bug() {
static bool has_quirk =
((cpuid(CPUID_GETFEATURES, 0).eax & 0xF0FF0) == 0x50670);
return has_quirk;
}
#else
static bool cpu_has_KNL_string_singlestep_bug() {
return false;
}
#endif
/*
* The value of rcx above which the CPU doesn't properly handle singlestep for
string instructions. Right now, since only one CPU has this quirk, this
* value is hardcoded, but could depend on the CPU architecture in the future.
*/
static int single_step_coalesce_cutoff() { return 16; }
void Task::work_around_KNL_string_singlestep_bug() {
/* The extra cx >= cutoff check is just an optimization, to avoid the
moderately expensive load from ip() if we can */
if (!cpu_has_KNL_string_singlestep_bug()) {
return;
}
uintptr_t cx = regs().cx();
uintptr_t cutoff = single_step_coalesce_cutoff();
if (cx > cutoff && at_x86_string_instruction(this)) {
/* KNL has a quirk where single-stepping a string instruction can step up
to 64 iterations. Work around this by fudging registers to force the
processor to execute one iteration and one iteration only. */
LOG(debug) << "Working around KNL single-step hardware bug (cx=" << cx
<< ")";
if (cx > cutoff) {
last_resume_orig_cx = cx;
Registers r = regs();
/* An arbitrary value < cutoff would work fine here, except 1, since
the last iteration of the loop behaves differently */
r.set_cx(cutoff);
set_regs(r);
}
}
}
bool Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
TicksRequest tick_period, int sig) {
// Ensure our HW debug registers are up to date before we execute any code.
// If this fails because the task died, the code below will detect it.
set_debug_regs(vm()->get_hw_watchpoints());
bool setup_succeeded = will_resume_execution(how, wait_how, tick_period, sig);
// During record, the process could have died, but otherwise, we control
// process lifecycles and this should never fail.
ASSERT(this, session().is_recording() || setup_succeeded);
if (setup_succeeded) {
if (tick_period != RESUME_NO_TICKS) {
if (tick_period == RESUME_UNLIMITED_TICKS) {
hpc.reset(0);
} else {
ASSERT(this, tick_period >= 0 && tick_period <= MAX_TICKS_REQUEST);
hpc.reset(max<Ticks>(1, tick_period));
}
activate_preload_thread_locals();
}
LOG(debug) << "resuming execution of " << tid << " with "
<< ptrace_req_name<NativeArch>(how)
<< (sig ? string(", signal ") + signal_name(sig) : string())
<< " tick_period " << tick_period << " wait " << wait_how;
set_x86_debug_status(0);
if (is_singlestep_resume(how)) {
work_around_KNL_string_singlestep_bug();
if (is_x86ish(arch())) {
singlestepping_instruction = trapped_instruction_at(this, ip());
if (singlestepping_instruction == TrappedInstruction::CPUID) {
// In KVM virtual machines (and maybe others), singlestepping over CPUID
// executes the following instruction as well. Work around that.
did_set_breakpoint_after_cpuid =
vm()->add_breakpoint(ip() + trapped_instruction_len(singlestepping_instruction), BKPT_INTERNAL);
}
} else if (arch() == aarch64 && is_singlestep_resume(how_last_execution_resumed)) {
// On aarch64, if the last execution was any sort of single step, then
// resuming again with PTRACE_(SYSEMU_)SINGLESTEP will cause a debug fault
// immediately before executing the next instruction in userspace
// (essentially completing the singlestep that got "interrupted" by
// trapping into the kernel). To prevent this, we must re-arm the
// PSTATE.SS bit. (If the last resume was not a single step,
// the kernel will apply this modification).
if (!registers.aarch64_singlestep_flag()) {
registers.set_aarch64_singlestep_flag();
registers_dirty = true;
}
}
}
address_of_last_execution_resume = ip();
how_last_execution_resumed = how;
flush_regs();
}
if (session().is_recording() && !seen_ptrace_exit_event()) {
/* There's a nasty race where a stopped task gets woken up by a SIGKILL
* and advances to the PTRACE_EXIT_EVENT ptrace-stop just before we
* send a PTRACE_CONT. Our PTRACE_CONT will cause it to continue and exit,
* which means we don't get a chance to clean up robust futexes etc.
* Avoid that by doing a waitpid() here to see if it has exited.
* This doesn't fully close the race since in theory we could be preempted
* between the waitpid and the ptrace_if_stopped, giving another task
* a chance to SIGKILL our tracee and advance it to the PTRACE_EXIT_EVENT,
* or just letting the tracee be scheduled to process its pending SIGKILL.
*/
WaitOptions options(tid);
options.block_seconds = 0.0;
WaitResult result = WaitManager::wait_stop_or_exit(options);
ASSERT(this, result.code == WAIT_OK || result.code == WAIT_NO_STATUS);
if (result.code == WAIT_OK) {
// In some (but not all) cases where the child was killed with SIGKILL,
// we don't get PTRACE_EVENT_EXIT before it just exits, because a SIGKILL
// arrived when the child was already in the PTRACE_EVENT_EXIT stop.
// The status could be any exit or fatal-signal status, since this status
// can reflect what caused the thread to exit before the SIGKILL arrived
// and forced it out of the PTRACE_EVENT_EXIT stop.
ASSERT(this,
result.status.ptrace_event() == PTRACE_EVENT_EXIT ||
result.status.reaped())
<< "got " << result.status;
LOG(debug) << "Task " << tid << " exited unexpectedly with status "
<< result.status;
if (did_waitpid(result.status)) {
// We reached a new stop (or actually reaped the task).
// Consider this "resume execution" to be done.
return wait_how != RESUME_WAIT_NO_EXIT;
}
ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXIT)
<< "did_waitpid should always succeed for reaped() statuses";
// The tracee must have been kicked out of PTRACE_EVENT_EXIT
// by a SIGKILL (only possible on older kernels).
// If we were supposed to wait, we've failed.
// We can't wait now because on old kernels tasks can block
// indefinitely even after PTRACE_EVENT_EXIT (e.g. due to coredumping).
// We don't know what state it's in exactly, but registers haven't changed
// since nothing has really happened since the last stop.
set_stopped(false);
return RESUME_NONBLOCKING == wait_how;
}
}
ASSERT(this, setup_succeeded);
ptrace_if_stopped(how, nullptr, (void*)(uintptr_t)sig);
// If ptrace_if_stopped failed, it means we're running along the
// exit path due to a SIGKILL or equivalent, so just like if it
// succeeded, we will eventually receive a wait notification.
set_stopped(false);
extra_registers_known = false;
if (RESUME_NONBLOCKING != wait_how) {
if (!wait()) {
return false;
}
if (wait_how == RESUME_WAIT_NO_EXIT) {
return ptrace_event() != PTRACE_EVENT_EXIT && !was_reaped();
}
}
return true;
}
void Task::set_regs(const Registers& regs) {
// Only allow registers to be set while our copy is the source of truth.
ASSERT(this, is_stopped_ || in_unexpected_exit);
if (registers.original_syscallno() != regs.original_syscallno()) {
orig_syscallno_dirty = true;
}
bool changed = registers != regs;
if (changed) {
registers_dirty = true;
registers = regs;
}
}
void Task::flush_regs() {
if (registers_dirty) {
LOG(debug) << "Flushing registers for tid " << tid << " " << registers;
auto ptrace_regs = registers.get_ptrace_iovec();
#if defined(__i386__) || defined(__x86_64__)
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_PRSTATUS, &ptrace_regs)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
registers_dirty = false;
orig_syscallno_dirty = false;
}
#elif defined(__aarch64__)
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_PRSTATUS, &ptrace_regs)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
registers_dirty = false;
}
#else
#error "Unknown architecture"
#endif
}
#if defined(__i386__) || defined(__x86_64__)
else {
ASSERT(this, !orig_syscallno_dirty);
}
#elif defined(__aarch64__)
if (orig_syscallno_dirty) {
uintptr_t syscall = registers.original_syscallno();
struct iovec vec = { &syscall,
sizeof(syscall) };
LOG(debug) << "Changing syscall to " << syscall;
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_ARM_SYSTEM_CALL, &vec)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
orig_syscallno_dirty = false;
}
}
#endif
}
void Task::set_extra_regs(const ExtraRegisters& regs) {
ASSERT(this, !regs.empty()) << "Trying to set empty ExtraRegisters";
ASSERT(this, regs.arch() == arch())
<< "Trying to set wrong arch ExtraRegisters";
extra_registers = regs;
switch (extra_registers.format()) {
case ExtraRegisters::XSAVE: {
if (xsave_area_size() > 512) {
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_X86_XSTATE, &vec)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
} else {
#if defined(__i386__)
ASSERT(this,
extra_registers.data_.size() == sizeof(user_fpxregs_struct));
if (ptrace_if_stopped(X86Arch::PTRACE_SETFPXREGS, nullptr,
extra_registers.data_.data())) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
#elif defined(__x86_64__)
ASSERT(this,
extra_registers.data_.size() == sizeof(user_fpregs_struct));
if (ptrace_if_stopped(PTRACE_SETFPREGS, nullptr,
extra_registers.data_.data())) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
#endif
}
break;
}
case ExtraRegisters::NT_FPR: {
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_PRFPREG, &vec)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
break;
}
default:
ASSERT(this, false) << "Unexpected ExtraRegisters format";
}
}
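// DR7 LEN field encodings for watchpoint sizes. Note the encoding is not
// monotonic: 8 bytes is 0b10 while 4 bytes is 0b11.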
enum WatchBytesX86 {
BYTES_1 = 0x00,
BYTES_2 = 0x01,
BYTES_4 = 0x03,
BYTES_8 = 0x02
};
static WatchBytesX86 num_bytes_to_dr_len(size_t num_bytes) {
switch (num_bytes) {
case 1:
return BYTES_1;
case 2:
return BYTES_2;
case 4:
return BYTES_4;
case 8:
return BYTES_8;
default:
FATAL() << "Unsupported breakpoint size " << num_bytes;
return WatchBytesX86(-1); // not reached
}
}
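// Layout of the x86 DR7 debug-control register: a local/global enable bit
// pair for each of DR0-DR3, 8 bits we don't use, then a 2-bit type and
// 2-bit length field per register.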
struct DebugControl {
uintptr_t dr0_local : 1;
uintptr_t dr0_global : 1;
uintptr_t dr1_local : 1;
uintptr_t dr1_global : 1;
uintptr_t dr2_local : 1;
uintptr_t dr2_global : 1;
uintptr_t dr3_local : 1;
uintptr_t dr3_global : 1;
uintptr_t ignored : 8;
WatchType dr0_type : 2;
WatchBytesX86 dr0_len : 2;
WatchType dr1_type : 2;
WatchBytesX86 dr1_len : 2;
WatchType dr2_type : 2;
WatchBytesX86 dr2_len : 2;
WatchType dr3_type : 2;
WatchBytesX86 dr3_len : 2;
void enable(size_t index, WatchBytesX86 size, WatchType type) {
switch (index) {
#define CASE(_i) \
case _i: \
dr##_i##_local = 1; \
dr##_i##_global = 0; \
dr##_i##_type = type; \
dr##_i##_len = size; \
break
CASE(0);
CASE(1);
CASE(2);
CASE(3);
#undef CASE
default:
FATAL() << "Invalid index";
}
}
};
static_assert(sizeof(DebugControl) == sizeof(uintptr_t),
"Can't pack DebugControl");
union PackedDebugControl {
uintptr_t packed;
DebugControl ctl;
};
static bool set_x86_debug_regs(Task *t, const Task::HardwareWatchpoints& regs) {
// Reset the debug status since we're about to change the set
// of programmed watchpoints.
t->set_x86_debug_reg(6, 0);
if (regs.size() > NUM_X86_WATCHPOINTS) {
t->set_x86_debug_reg(7, 0);
return false;
}
// Work around kernel bug https://bugzilla.kernel.org/show_bug.cgi?id=200965.
// For every watchpoint we're going to use, enable it with size 1.
// This will let us set the address freely without potentially triggering
// the kernel bug which will reject an unaligned address if the watchpoint
// is disabled but was non-size-1.
PackedDebugControl dr7;
dr7.packed = 0;
for (size_t i = 0; i < regs.size(); ++i) {
dr7.ctl.enable(i, BYTES_1, WATCH_EXEC);
}
t->set_x86_debug_reg(7, dr7.packed);
if (regs.empty()) {
// Don't do another redundant poke to DR7.
return true;
}
size_t index = 0;
for (auto reg : regs) {
if (!t->set_x86_debug_reg(index, reg.addr.as_int())) {
t->set_x86_debug_reg(7, 0);
return false;
}
dr7.ctl.enable(index, num_bytes_to_dr_len(reg.num_bytes), reg.type);
++index;
}
return t->set_x86_debug_reg(7, dr7.packed);
}
template <typename Arch>
static bool set_debug_regs_arch(Task* t,
const Task::HardwareWatchpoints& regs);
template <> bool set_debug_regs_arch<X86Arch>(Task* t,
const Task::HardwareWatchpoints& regs) {
return set_x86_debug_regs(t, regs);
}
template <> bool set_debug_regs_arch<X64Arch>(Task* t,
const Task::HardwareWatchpoints& regs) {
return set_x86_debug_regs(t, regs);
}
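// Ask the kernel how many hardware breakpoint and watchpoint slots this CPU
// provides; the count is reported in the low byte of dbg_info.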
static void query_max_bp_wp(Task* t, ssize_t* max_bp, ssize_t* max_wp) {
ARM64Arch::user_hwdebug_state bps;
ARM64Arch::user_hwdebug_state wps;
bool ok = t->get_aarch64_debug_regs(NT_ARM_HW_BREAK, &bps) &&
t->get_aarch64_debug_regs(NT_ARM_HW_WATCH, &wps);
ASSERT(t, ok);
*max_bp = bps.dbg_info & 0xff;
*max_wp = wps.dbg_info & 0xff;
}
template <> bool set_debug_regs_arch<ARM64Arch>(Task* t,
const Task::HardwareWatchpoints& regs) {
ARM64Arch::user_hwdebug_state bps;
ARM64Arch::user_hwdebug_state wps;
memset(&bps, 0, sizeof(bps));
memset(&wps, 0, sizeof(wps));
static ssize_t max_bp = -1;
static ssize_t max_wp = -1;
if (max_bp == -1) {
query_max_bp_wp(t, &max_bp, &max_wp);
}
// Having at least one of each is architecturally guaranteed
ASSERT(t, max_bp >= 1 && max_wp >= 1);
ssize_t cur_bp = 0;
ssize_t cur_wp = 0;
for (auto reg : regs) {
// GDB always splits these into nicely aligned platform chunks for us,
// but let's be general and support unaligned registers also.
size_t len = reg.num_bytes;
remote_ptr<uint8_t> addr = reg.addr.cast<uint8_t>();
while (len > 0) {
ARM64Arch::hw_bp* bp = nullptr;
if (reg.type == WATCH_EXEC) {
if (cur_bp == max_bp) {
return false;
}
bp = &bps.dbg_regs[cur_bp++];
} else {
if (cur_wp == max_wp) {
return false;
}
bp = &wps.dbg_regs[cur_wp++];
}
ARM64Arch::hw_breakpoint_ctrl ctrl;
memset(&ctrl, 0, sizeof(ctrl));
switch (reg.type) {
case WATCH_EXEC:
ctrl.type = ARM_WATCH_EXEC;
break;
case WATCH_WRITE:
ctrl.type = ARM_WATCH_WRITE;
break;
case WATCH_READWRITE:
ctrl.type = ARM_WATCH_READWRITE;
break;
}
ctrl.enabled = 1;
ctrl.priv = ARM_PRIV_EL0;
uintptr_t off = (uintptr_t)addr.as_int() % 8;
size_t cur_bp_len = std::min(8-off, len);
// This is a byte mask of which particular bytes in the 8-byte word at `addr`
// to watch.
uintptr_t mask = ((((uintptr_t)1) << cur_bp_len) - 1) << off;
ASSERT(t, (mask & ~0xff) == 0);
ctrl.length = mask;
bp->addr = addr.as_int() - off;
bp->ctrl = ctrl;
len -= cur_bp_len;
addr += cur_bp_len;
}
}
// max_bp rather than cur_bp to make sure to clear out any unused slots
return t->set_aarch64_debug_regs(NT_ARM_HW_BREAK, &bps, max_bp) &&
t->set_aarch64_debug_regs(NT_ARM_HW_WATCH, &wps, max_wp);
}
static bool set_debug_regs_internal(Task* t, const Task::HardwareWatchpoints& regs) {
RR_ARCH_FUNCTION(set_debug_regs_arch, t->arch(), t, regs);
}
bool Task::set_debug_regs(const HardwareWatchpoints& regs) {
if (regs == current_hardware_watchpoints) {
return true;
}
bool ret = set_debug_regs_internal(this, regs);
if (ret) {
current_hardware_watchpoints = regs;
} else {
current_hardware_watchpoints.clear();
}
return ret;
}
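// Insert `desc` into our cached list of thread areas, replacing any existing
// entry with the same entry_number.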
static void set_thread_area(std::vector<X86Arch::user_desc>& thread_areas_,
X86Arch::user_desc desc) {
for (auto& t : thread_areas_) {
if (t.entry_number == desc.entry_number) {
t = desc;
return;
}
}
thread_areas_.push_back(desc);
}
void Task::set_thread_area(remote_ptr<X86Arch::user_desc> tls) {
// We rely on the fact that user_desc is word-size-independent.
DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
auto desc = read_mem(tls);
rr::set_thread_area(thread_areas_, desc);
}
int Task::emulate_set_thread_area(int idx, X86Arch::user_desc desc) {
DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
errno = 0;
fallible_ptrace(NativeArch::PTRACE_SET_THREAD_AREA, idx, &desc);
if (errno != 0) {
return errno;
}
desc.entry_number = idx;
rr::set_thread_area(thread_areas_, desc);
return 0;
}
int Task::emulate_get_thread_area(int idx, X86Arch::user_desc& desc) {
DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
LOG(debug) << "Emulating PTRACE_GET_THREAD_AREA";
errno = 0;
fallible_ptrace(NativeArch::PTRACE_GET_THREAD_AREA, idx, &desc);
return errno;
}
pid_t Task::tgid() const { return tg->tgid; }
pid_t Task::real_tgid() const {
// Unless we're recording, each task is in its own thread group
return session().is_recording() ? tgid() : tid;
}
const string& Task::trace_dir() const {
const TraceStream* trace = trace_stream();
ASSERT(this, trace) << "Trace directory not available";
return trace->dir();
}
uint32_t Task::trace_time() const {
const TraceStream* trace = trace_stream();
return trace ? trace->time() : 0;
}
static bool is_signal_triggered_by_ptrace_interrupt(int group_stop_sig) {
switch (group_stop_sig) {
case SIGTRAP:
// We sometimes see SIGSTOP at interrupts, though the
// docs don't mention that.
case SIGSTOP:
return true;
default:
return false;
}
}
// This function doesn't really need to do anything. The signal will cause
// waitpid to return EINTR and that's all we need.
static void handle_alarm_signal(__attribute__((unused)) int sig) {}
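// Ask the kernel to stop the tracee with PTRACE_INTERRUPT. Returns false if
// the tracee is already gone (ESRCH); otherwise records that either of the
// next two stops might be the resulting group stop.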
bool Task::do_ptrace_interrupt() {
errno = 0;
fallible_ptrace(PTRACE_INTERRUPT, nullptr, nullptr);
if (errno) {
ASSERT(this, errno == ESRCH) << "Unexpected PTRACE_INTERRUPT error " << errno;
return false;
}
expecting_ptrace_interrupt_stop = 2;
return true;
}
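// If we recently issued a PTRACE_INTERRUPT, decide whether `status` is the
// group stop it induced. See the comment in did_waitpid() for why either of
// the next two stops may be that group stop.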
bool Task::account_for_potential_ptrace_interrupt_stop(WaitStatus status) {
if (expecting_ptrace_interrupt_stop > 0) {
--expecting_ptrace_interrupt_stop;
if (is_signal_triggered_by_ptrace_interrupt(status.group_stop())) {
expecting_ptrace_interrupt_stop = 0;
return true;
}
}
return false;
}
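// Block until the tracee reports a stop or dies. During recording,
// `interrupt_after_elapsed` >= 0 means we send a PTRACE_INTERRUPT after
// waiting that many seconds without a stop. Returns false if the tracee
// vanished without a PTRACE_EVENT_EXIT notification.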
bool Task::wait(double interrupt_after_elapsed) {
LOG(debug) << "going into blocking wait for " << tid << " ...";
ASSERT(this, session().is_recording() || interrupt_after_elapsed == -1);
bool sent_wait_interrupt = false;
WaitResult result;
while (true) {
if (interrupt_after_elapsed == 0 && !sent_wait_interrupt) {
// If this fails, the tracee must be a zombie or altogether gone,
// in which case we should detect that status change later.
do_ptrace_interrupt();
if (session().is_recording()) {
// Force this timeslice to end
session().as_record()->scheduler().expire_timeslice();
}
sent_wait_interrupt = true;
}
WaitOptions options(tid);
if (interrupt_after_elapsed > 0) {
options.block_seconds = interrupt_after_elapsed;
interrupt_after_elapsed = 0;
}
result = WaitManager::wait_stop(options);
if (result.code == WAIT_OK) {
break;
}
if (result.code == WAIT_NO_CHILD) {
/* The process died without us getting a PTRACE_EXIT_EVENT notification.
* This is possible if the process receives a SIGKILL while in the exit
* event stop, but before we were able to read the event notification.
*/
return false;
}
ASSERT(this, result.code == WAIT_NO_STATUS);
}
if (sent_wait_interrupt) {
LOG(warn) << "Forced to PTRACE_INTERRUPT tracee";
if (!is_signal_triggered_by_ptrace_interrupt(result.status.group_stop())) {
LOG(warn) << " PTRACE_INTERRUPT raced with another event " << result.status;
}
}
return did_waitpid(result.status);
}
void Task::canonicalize_regs(SupportedArch syscall_arch) {
ASSERT(this, is_stopped_);
if (registers.arch() == x86_64) {
if (syscall_arch == x86) {
// The int $0x80 compatibility handling clears r8-r11
// (see arch/x86/entry/entry_64_compat.S). The sysenter compatibility
// handling also clears r12-r15. However, to actually make such a syscall,
// the user process would have to switch itself into compatibility mode,
// which, though possible, does not appear to actually be done by any
// real application (unlike int $0x80, which is accessible from 64-bit
// mode as well).
registers_dirty |= registers.set_r8(0x0);
registers_dirty |= registers.set_r9(0x0);
registers_dirty |= registers.set_r10(0x0);
registers_dirty |= registers.set_r11(0x0);
} else {
// x86-64 'syscall' instruction copies RFLAGS to R11 on syscall entry.
// If we single-stepped into the syscall instruction, the TF flag will be
// set in R11. We don't want the value in R11 to depend on whether we
// were single-stepping during record or replay, possibly causing
// divergence.
// This doesn't matter when exiting a sigreturn syscall, since it
// restores the original flags.
// For untraced syscalls, the untraced-syscall entry point code (see
// write_rr_page) does this itself.
// We tried just clearing %r11, but that caused hangs in
// Ubuntu/Debian kernels.
// Making this match the flags makes this operation idempotent, which is
// helpful.
registers_dirty |= registers.set_r11(0x246);
// x86-64 'syscall' instruction copies return address to RCX on syscall
// entry. rr-related kernel activity normally sets RCX to -1 at some point
// during syscall execution, but apparently in some (unknown) situations
// probably involving untraced syscalls, that doesn't happen. To avoid
// potential issues, forcibly replace RCX with -1 always.
// This doesn't matter (and we should not do this) when exiting a
// sigreturn syscall, since it will restore the original RCX and we don't
// want to clobber that.
// For untraced syscalls, the untraced-syscall entry point code (see
// write_rr_page) does this itself.
registers_dirty |= registers.set_cx((intptr_t)-1);
}
// On kernel 3.13.0-68-generic #111-Ubuntu SMP we have observed a failed
// execve() clearing all flags during recording. During replay we emulate
// the exec so this wouldn't happen. Just reset all flags so everything's
// consistent.
// 0x246 is ZF+PF+IF+reserved, the flags value that results from clearing a
// register with "xor reg, reg".
registers_dirty |= registers.set_flags(0x246);
} else if (registers.arch() == x86) {
// The x86 SYSENTER handling in Linux modifies EBP and EFLAGS on entry.
// EBP is the potential sixth syscall parameter, stored on the user stack.
// The EFLAGS changes are described here:
// http://linux-kernel.2935.n7.nabble.com/ia32-sysenter-target-does-not-preserve-EFLAGS-td1074164.html
// In a VMWare guest, the modifications to EFLAGS appear to be
// nondeterministic. Cover that up by setting EFLAGS to reasonable values
// now.
registers_dirty |= registers.set_flags(0x246);
}
}
bool Task::read_aarch64_tls_register(uintptr_t *result) {
struct iovec vec = { result, sizeof(*result) };
return ptrace_if_stopped(PTRACE_GETREGSET, NT_ARM_TLS, &vec);
}
void Task::set_aarch64_tls_register(uintptr_t val) {
struct iovec vec = { &val, sizeof(val) };
ptrace_if_stopped(PTRACE_SETREGSET, NT_ARM_TLS, &vec);
/* If that failed, the task was killed and it should not matter what
we tried to set. */
}
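// Process a wait status for this task: refresh the register cache and tick
// count, account for PTRACE_INTERRUPT-induced stops, and undo any
// single-step or breakpoint fixups. Returns false if the task turns out to
// have died unexpectedly, in which case the stop is treated as never having
// happened.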
bool Task::did_waitpid(WaitStatus status) {
if (is_detached_proxy() &&
(status.stop_sig() == SIGSTOP || status.stop_sig() == SIGCONT)) {
LOG(debug) << "Task " << tid << " is a detached proxy, ignoring status " << status;
return true;
}
LOG(debug) << " Task " << tid << " changed status to " << status;
intptr_t original_syscallno = registers.original_syscallno();
LOG(debug) << " (refreshing register cache)";
Ticks more_ticks = 0;
if (status.reaped()) {
was_reaped_ = true;
if (handled_ptrace_exit_event_) {
LOG(debug) << "Reaped task late " << tid;
// We did not reap this task when it exited, likely because it was a
// thread-group leader blocked on the exit of the other members of
// its thread group. This wait has now reaped the task, so all we need to do
// here is get out quickly; the higher-level function will go ahead
// and delete us.
wait_status = status;
return true;
}
LOG(debug) << "Unexpected process reap for " << tid;
/* Mark buffers as having been destroyed. We missed our chance
* to destroy them normally in handle_ptrace_exit_event.
* XXX: We could try to find some tasks here to unmap our buffers, but it
* seems hardly worth it.
*/
destroy_buffers(nullptr, nullptr);
} else {
bool was_stopped = is_stopped_;
// Mark as stopped now. If we fail one of the ticks assertions below,
// the test-monitor (or user) might want to attach the emergency debugger,
// which needs to know that the tracee is stopped.
set_stopped(true);
// After PTRACE_INTERRUPT, either of the next two stops may be a group stop
// caused by that PTRACE_INTERRUPT (or neither may be). This is because
// PTRACE_INTERRUPT generally lets other stops win (and thus doesn't inject
// its own stop), but if the other stop had already finished processing,
// even if we hadn't seen it yet, the stop will still be queued, so we could
// see the other stop and then the PTRACE_INTERRUPT group stop.
// When we issue PTRACE_INTERRUPT, we set this counter to 2, and here we
// decrement it on every stop, so that while the counter is positive any
// group stop could be one induced by PTRACE_INTERRUPT.
if (account_for_potential_ptrace_interrupt_stop(status)) {
// Assume this was PTRACE_INTERRUPT and thus treat this as
// TIME_SLICE_SIGNAL instead.
status = WaitStatus::for_stop_sig(PerfCounters::TIME_SLICE_SIGNAL);
memset(&pending_siginfo, 0, sizeof(pending_siginfo));
pending_siginfo.si_signo = PerfCounters::TIME_SLICE_SIGNAL;
pending_siginfo.si_fd = hpc.ticks_interrupt_fd();
pending_siginfo.si_code = POLL_IN;
} else if (status.stop_sig()) {
if (!ptrace_if_stopped(PTRACE_GETSIGINFO, nullptr, &pending_siginfo)) {
LOG(debug) << "Unexpected process death getting siginfo for " << tid;
// Let's pretend this stop never happened.
set_stopped(false);
in_unexpected_exit = true;
return false;
}
}
// A SIGKILL or equivalent can cause a task to exit without us having run it, in
// which case we might have pending register changes for it that are now
// irrelevant. In that case we just throw away our register changes and use
// whatever the kernel now has.
if (status.ptrace_event() != PTRACE_EVENT_EXIT) {
ASSERT(this, !registers_dirty) << "Registers shouldn't already be dirty (status is " << status << ")";
}
// If the task was already stopped, we don't need to read the registers.
// In fact if we didn't start the thread, we may not have flushed dirty
// registers but still received a PTRACE_EVENT_EXIT, in which case the
// task's register values are not what they should be.
if (!was_stopped && !registers_dirty) {
LOG(debug) << "Requesting registers from tracee " << tid;
NativeArch::user_regs_struct ptrace_regs;
#if defined(__i386__) || defined(__x86_64__)
if (ptrace_if_stopped(PTRACE_GETREGS, nullptr, &ptrace_regs)) {
registers.set_from_ptrace(ptrace_regs);
// Check the architecture of the task by looking at the
// cs segment register and checking if that segment is a long mode segment
// (Linux always uses GDT entries for this, which are globally the same).
SupportedArch a = is_long_mode_segment(registers.cs()) ? x86_64 : x86;
if (a == x86_64 && NativeArch::arch() == x86) {
FATAL() << "Sorry, tracee " << tid << " is executing in x86-64 mode"
<< " and that's not supported with a 32-bit rr.";
}
if (a != registers.arch()) {
registers.set_arch(a);
registers.set_from_ptrace(ptrace_regs);
}
// Only adjust tick count if we were able to read registers.
// For example if the task is already reaped we don't have new
// register values and we don't want to read a ticks value
// that mismatches our registers.
more_ticks = hpc.read_ticks(this);
}
#elif defined(__aarch64__)
struct iovec vec = { &ptrace_regs,
sizeof(ptrace_regs) };
if (ptrace_if_stopped(PTRACE_GETREGSET, NT_PRSTATUS, &vec)) {
registers.set_from_ptrace(ptrace_regs);
more_ticks = hpc.read_ticks(this);
}
#else
#error detect architecture here
#endif
else {
LOG(debug) << "Unexpected process death for " << tid;
// Let's pretend this stop never happened.
// Note that pending_siginfo may have been overwritten above,
// but in that case we're going to ignore this signal-stop
// so it doesn't matter.
set_stopped(false);
in_unexpected_exit = true;
return false;
}
}
}
wait_status = status;
// We stop counting here because there may be things we want to do to the
// tracee that would otherwise generate ticks.
hpc.stop_counting();
session().accumulate_ticks_processed(more_ticks);
ticks += more_ticks;
if (was_reaped_) {
ASSERT(this, !handled_ptrace_exit_event_);
} else if (status.ptrace_event() == PTRACE_EVENT_EXIT) {
ASSERT(this, !handled_ptrace_exit_event_);
seen_ptrace_exit_event_ = true;
} else {
if (arch() == x86 || arch() == x86_64) {
// Clear the single step flag in case we got here by taking a signal
// after asking for a single step. We want to avoid taking that single
// step after the signal resumes, so the singlestep flag needs to be
// cleared. On aarch64, the kernel does this for us.
if (registers.x86_singlestep_flag()) {
registers.clear_x86_singlestep_flag();
registers_dirty = true;
}
if (last_resume_orig_cx != 0) {
uintptr_t new_cx = registers.cx();
/* Un-fudge registers, if we fudged them to work around the KNL hardware
quirk */
unsigned cutoff = single_step_coalesce_cutoff();
ASSERT(this, new_cx == cutoff - 1 || new_cx == cutoff);
registers.set_cx(last_resume_orig_cx - cutoff + new_cx);
registers_dirty = true;
}
last_resume_orig_cx = 0;
}
if (did_set_breakpoint_after_cpuid) {
remote_code_ptr bkpt_addr =
address_of_last_execution_resume + trapped_instruction_len(singlestepping_instruction);
if (ip().undo_executed_bkpt(arch()) == bkpt_addr) {
Registers r = regs();
r.set_ip(bkpt_addr);
set_regs(r);
}
vm()->remove_breakpoint(bkpt_addr, BKPT_INTERNAL);
did_set_breakpoint_after_cpuid = false;
}
if ((singlestepping_instruction == TrappedInstruction::PUSHF ||
singlestepping_instruction == TrappedInstruction::PUSHF16) &&
ip() == address_of_last_execution_resume +
trapped_instruction_len(singlestepping_instruction)) {
// We singlestepped through a pushf. Clear TF bit on stack.
auto sp = regs().sp().cast<uint16_t>();
// If this address is invalid then we should have segfaulted instead of
// retiring the instruction!
uint16_t val = read_mem(sp);
write_mem(sp, (uint16_t)(val & ~X86_TF_FLAG));
}
singlestepping_instruction = TrappedInstruction::NONE;
// We might have singlestepped at the resumption address and just exited
// the kernel without executing the breakpoint at that address.
// The kernel usually (always?) singlesteps an extra instruction when
// we do this with PTRACE_SYSEMU_SINGLESTEP, but rr's ptrace emulation
// doesn't and it's kind of a kernel bug.
if (as->get_breakpoint_type_at_addr(address_of_last_execution_resume) !=
BKPT_NONE &&
stop_sig() == SIGTRAP && !ptrace_event() &&
ip().undo_executed_bkpt(arch()) == address_of_last_execution_resume) {
ASSERT(this, more_ticks == 0);
// When we resume execution and immediately hit a breakpoint, the original
// syscall number can be reset to -1. Undo that, so that the register
// state matches the state we'd be in if we hadn't resumed. ReplayTimeline
// depends on resume-at-a-breakpoint being a noop.
registers.set_original_syscallno(original_syscallno);
registers_dirty = true;
}
// If we're in the rr page, we may have just returned from an untraced
// syscall there and while in the rr page registers need to be consistent
// between record and replay. During replay most untraced syscalls are
// replaced with "xor eax,eax" (right after a "movq -1, %rcx") so
// rcx is always -1, but during recording it sometimes isn't after we've
// done a real syscall.
if (is_in_rr_page()) {
// N.B.: Cross architecture syscalls don't go through the rr page, so we
// know what the architecture is.
canonicalize_regs(arch());
}
}
did_wait();
return true;
}
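// On architectures where clone() passes TLS as a user_desc pointer (x86),
// record the descriptor the tracee passed in our cached thread areas.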
template <typename Arch>
static void set_tls_from_clone_arch(Task* t, remote_ptr<void> tls) {
if (Arch::clone_tls_type == Arch::UserDescPointer) {
t->set_thread_area(tls.cast<X86Arch::user_desc>());
}
}
static void set_tls_from_clone(Task* t, remote_ptr<void> tls) {
RR_ARCH_FUNCTION(set_tls_from_clone_arch, t->arch(), t, tls);
}
template <typename Arch>
static void setup_preload_thread_locals_from_clone_arch(Task* t, Task* origin) {
void* local_addr = preload_thread_locals_local_addr(*t->vm());
if (local_addr) {
t->activate_preload_thread_locals();
auto locals = reinterpret_cast<preload_thread_locals<Arch>*>(local_addr);
auto origin_locals = reinterpret_cast<const preload_thread_locals<Arch>*>(
origin->fetch_preload_thread_locals());
locals->alt_stack_nesting_level = origin_locals->alt_stack_nesting_level;
// vfork() will restore the flags on the way out since it's on the same
// stack.
locals->saved_flags = origin_locals->saved_flags;
// clone() syscalls set the child stack pointer, so the child is no
// longer in the syscallbuf code even if the parent was.
if (PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE >= 8 * 2) {
// On aarch64, we use this to save and restore some register values across clone
memcpy(locals->stub_scratch_2, origin_locals->stub_scratch_2, 8 * 2);
}
}
}
void Task::setup_preload_thread_locals_from_clone(Task* origin) {
RR_ARCH_FUNCTION(setup_preload_thread_locals_from_clone_arch, this->arch(), this, origin);
}
Task* Task::clone(CloneReason reason, int flags, remote_ptr<void> stack,
remote_ptr<void> tls, remote_ptr<int>, pid_t new_tid,
pid_t new_rec_tid, uint32_t new_serial,
Session* other_session,
FdTable::shr_ptr new_fds,
ThreadGroup::shr_ptr new_tg) {
Session* new_task_session = &session();
if (other_session) {
ASSERT(this, reason != TRACEE_CLONE);
new_task_session = other_session;
} else {
ASSERT(this, reason == TRACEE_CLONE);
}
string n;
if (!session().is_recording()) {
n = name();
}
Task* t =
new_task_session->new_task(new_tid, new_rec_tid, new_serial, arch(), n);
if (CLONE_SHARE_VM & flags) {
t->as = as;
if (!stack.is_null()) {
remote_ptr<void> last_stack_byte = stack - 1;
if (t->as->has_mapping(last_stack_byte)) {
auto mapping = t->as->mapping_of(last_stack_byte);
if (!mapping.recorded_map.is_heap()) {
const KernelMapping& m = mapping.map;
LOG(debug) << "mapping stack for " << new_tid << " at " << m;
t->as->map(t, m.start(), m.size(), m.prot(), m.flags(),
m.file_offset_bytes(), "[stack]", m.device(), m.inode());
}
}
}
// rseq state is not cloned into new threads
} else {
t->as = new_task_session->clone(t, as);
if (rseq_state) {
// rseq state is cloned into non-thread children
t->rseq_state = make_unique<RseqState>(*rseq_state);
}
}
t->syscallbuf_size = syscallbuf_size;
t->preload_globals = preload_globals;
t->seccomp_bpf_enabled = seccomp_bpf_enabled;
// FdTable is either shared or copied, so the contents of
// syscallbuf_fds_disabled_child are still valid.
if (CLONE_SHARE_FILES & flags) {
ASSERT(this, !new_fds);
t->fds = fds;
} else if (new_fds) {
t->fds = new_fds;
} else {
t->fds = fds->clone();
}
t->fds->insert_task(t);
t->top_of_stack = stack;
// wait() before trying to do anything that might need to
// use ptrace to access memory
bool ok = t->wait();
ASSERT(t, ok) << "Task " << t->tid << " killed unexpectedly; not sure how to handle this";
t->post_wait_clone(this, flags);
if (CLONE_SHARE_THREAD_GROUP & flags) {
ASSERT(this, !new_tg);
t->tg = tg;
} else {
if (new_tg) {
t->tg = new_tg;
} else {
t->tg = new_task_session->clone(t, tg);
}
}
t->tg->insert_task(t);
t->open_mem_fd_if_needed();
t->thread_areas_ = thread_areas_;
if (CLONE_SET_TLS & flags) {
set_tls_from_clone(t, tls);
}
t->as->insert_task(t);
if (reason == TRACEE_CLONE) {
if (!(CLONE_SHARE_VM & flags)) {
// Unmap syscallbuf and scratch for tasks running the original address
// space.
AutoRemoteSyscalls remote(t);
for (Task* tt : as->task_set()) {
// Leak the scratch buffer for the task we cloned from. We need to do
// this because we may be using part of it for the syscallbuf stack
// and unmapping it now would cause a crash in the new task.
if (tt != this) {
t->unmap_buffers_for(remote, tt, tt->syscallbuf_child);
}
}
as->did_fork_into(t);
}
// `t` doesn't have a syscallbuf and `t->desched_fd_child`/
// `t->cloned_file_data_fd_child` are both -1.
if (session().is_replaying()) {
// `t` is not really sharing our fd table, in fact our real fd table
// is only used by this task, so it only contains our syscallbuf fds (if any),
// not the fds for any other task. So, only really-close the fds for 'this'.
// We still need to update t's `fds` table to indicate that those fds were
// closed during recording, though, otherwise we may get FileMonitor
// collisions.
AutoRemoteSyscalls remote(t);
for (Task* tt : fds->task_set()) {
t->close_buffers_for(remote, tt, tt == this);
}
} else if (CLONE_SHARE_FILES & flags) {
// `t` is sharing our fd table, so it should not close anything.
} else {
// Close syscallbuf fds for all tasks using the original fd table.
AutoRemoteSyscalls remote(t);
for (Task* tt : fds->task_set()) {
t->close_buffers_for(remote, tt, true);
}
}
}
t->post_vm_clone(reason, flags, this);
// Copy debug register values. We assume the kernel will either copy debug
// registers into the new task, or the debug registers will be unset
// in the new task. If we have no HW watchpoints then debug registers
// will definitely be unset in the new task so there is nothing to do.
if (!current_hardware_watchpoints.empty()) {
// Copy debug register settings into the new task so we're in a known state.
bool ret = set_debug_regs_internal(t, current_hardware_watchpoints);
if (!ret) {
LOG(warn) << "Failed to initialize new task's debug registers; "
<< "this should always work since we were able to set them in the old task, "
<< "but the new task might have been killed";
}
t->current_hardware_watchpoints = current_hardware_watchpoints;
}
return t;
}
bool Task::post_vm_clone(CloneReason reason, int flags, Task* origin) {
bool created_preload_thread_locals_mapping = false;
if (!(CLONE_SHARE_VM & flags)) {
created_preload_thread_locals_mapping = this->as->post_vm_clone(this);
}
this->as->fd_tables_changed();
if (reason == TRACEE_CLONE) {
setup_preload_thread_locals_from_clone(origin);
}
return created_preload_thread_locals_mapping;
}
Task* Task::os_fork_into(Session* session, FdTable::shr_ptr new_fds) {
AutoRemoteSyscalls remote(this, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS);
Task* child =
os_clone(Task::SESSION_CLONE_LEADER, session, remote, rec_tid, serial,
// Most likely, we'll be setting up a
// CLEARTID futex. That's not done
// here, but rather later in
// |copy_state()|.
//
// We also don't use any of the SETTID
// flags because that earlier work will
// be copied by fork()ing the address
// space.
SIGCHLD,
std::move(new_fds));
// When we forked ourselves, the child inherited the setup we
// did to make the clone() call. So we have to "finish" the
// remote calls (i.e. undo fudged state) in the child too,
// even though we never made any syscalls there.
remote.restore_state_to(child);
return child;
}
Task* Task::os_clone_into(const CapturedState& state,
AutoRemoteSyscalls& remote,
const ClonedFdTables& cloned_fd_tables,
ThreadGroup::shr_ptr new_tg) {
auto fdtable_entry = cloned_fd_tables.find(state.fdtable_identity);
DEBUG_ASSERT(fdtable_entry != cloned_fd_tables.end() &&
"All captured fd tables should be in cloned_fd_tables");
return os_clone(Task::SESSION_CLONE_NONLEADER, &remote.task()->session(),
remote, state.rec_tid, state.serial,
// We don't actually /need/ to specify the
// SIGHAND/SYSVMEM flags because those things
// are emulated in the tracee. But we use the
// same flags as glibc to be on the safe side
// wrt kernel bugs.
//
// We don't pass CLONE_SETTLS here *only*
// because we'll do it later in
// |copy_state()|.
//
// See |os_fork_into()| above for discussion
// of the CTID flags.
(CLONE_VM | CLONE_FS | CLONE_SIGHAND |
CLONE_SYSVSEM),
fdtable_entry->second,
std::move(new_tg),
state.top_of_stack);
}
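// Replicate the captured task's TLS state in the remote task: via
// set_thread_area calls on x86, or the TLS register (NT_ARM_TLS) on aarch64.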
template <typename Arch>
static void copy_tls_arch(const Task::CapturedState& state,
AutoRemoteSyscalls& remote) {
if (Arch::clone_tls_type == Arch::UserDescPointer) {
for (const auto& t : state.thread_areas) {
AutoRestoreMem remote_tls(remote, (const uint8_t*)&t, sizeof(t));
LOG(debug) << " setting tls " << remote_tls.get();
remote.infallible_syscall(
syscall_number_for_set_thread_area(remote.arch()),
remote_tls.get().as_int());
}
} else if (Arch::arch() == aarch64) {
remote.task()->set_aarch64_tls_register(state.tls_register);
}
}
static void copy_tls(const Task::CapturedState& state,
AutoRemoteSyscalls& remote) {
RR_ARCH_FUNCTION(copy_tls_arch, remote.arch(), state, remote);
}
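// Read a numeric field such as "pos:" or "Pid:" from /proc/<tid>/fdinfo/<fd>.
// Returns -1 if the field is absent, or if the file can't be opened and
// `must_exist` is false.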
static int64_t fdinfo_field(Task* t, int fd, const char* field, bool must_exist) {
char buf[1024];
sprintf(buf, "/proc/%d/fdinfo/%d", t->tid, fd);
ScopedFd info(buf, O_RDONLY);
if (must_exist) {
ASSERT(t, info.is_open()) << "Can't open " << buf;
} else if (!info.is_open()) {
return -1;
}
ssize_t bytes = read(info, buf, sizeof(buf) - 1);
ASSERT(t, bytes > 0);
buf[bytes] = 0;
char* p = buf;
size_t field_len = strlen(field);
while (*p) {
if (strncmp(p, field, field_len) == 0) {
char* end;
long long int r = strtoll(p + field_len, &end, 10);
ASSERT(t, *end == 0 || *end == '\n');
return r;
}
while (*p) {
if (*p == '\n') {
++p;
break;
}
++p;
}
}
return -1;
}
int64_t Task::fd_offset(int fd) {
return fdinfo_field(this, fd, "pos:", true);
}
pid_t Task::pid_of_pidfd(int fd) {
return fdinfo_field(this, fd, "Pid:", false);
}
Task::CapturedState Task::capture_state() {
CapturedState state;
state.rec_tid = rec_tid;
state.own_namespace_rec_tid = own_namespace_rec_tid;
state.fdtable_identity = uintptr_t(fds.get());
state.serial = serial;
state.tguid = thread_group()->tguid();
state.regs = regs();
state.extra_regs = extra_regs();
state.prname = name();
if (arch() == aarch64) {
bool ok = read_aarch64_tls_register(&state.tls_register);
ASSERT(this, ok) << "Tracee died; this shouldn't happen in replay";
}
if (rseq_state) {
state.rseq_state = make_unique<RseqState>(*rseq_state);
}
state.thread_areas = thread_areas_;
state.desched_fd_child = desched_fd_child;
state.cloned_file_data_fd_child = cloned_file_data_fd_child;
state.cloned_file_data_fname = cloned_file_data_fname;
state.cloned_file_data_offset =
cloned_file_data_fd_child >= 0
? fd_offset(cloned_file_data_fd_child)
: 0;
memcpy(&state.thread_locals, fetch_preload_thread_locals(),
PRELOAD_THREAD_LOCALS_SIZE);
state.syscallbuf_child = syscallbuf_child;
state.syscallbuf_size = syscallbuf_size;
state.preload_globals = preload_globals;
state.scratch_ptr = scratch_ptr;
state.scratch_size = scratch_size;
state.wait_status = wait_status;
state.ticks = ticks;
state.top_of_stack = top_of_stack;
return state;
}
void Task::copy_state(const CapturedState& state) {
set_regs(state.regs);
set_extra_regs(state.extra_regs);
{
AutoRemoteSyscalls remote(this);
set_name(remote, state.prname);
copy_tls(state, remote);
thread_areas_ = state.thread_areas;
syscallbuf_size = state.syscallbuf_size;
ASSERT(this, !syscallbuf_child)
<< "Syscallbuf should not already be initialized in clone";
if (!state.syscallbuf_child.is_null()) {
// All these fields are preserved by the fork.
desched_fd_child = state.desched_fd_child;
cloned_file_data_fd_child = state.cloned_file_data_fd_child;
cloned_file_data_fname = state.cloned_file_data_fname;
if (cloned_file_data_fd_child >= 0) {
ScopedFd fd(cloned_file_data_fname.c_str(), session().as_record() ?
O_RDWR : O_RDONLY);
remote.infallible_send_fd_dup(fd, cloned_file_data_fd_child, O_CLOEXEC);
remote.infallible_lseek_syscall(
cloned_file_data_fd_child, state.cloned_file_data_offset, SEEK_SET);
}
syscallbuf_child = state.syscallbuf_child;
}
}
preload_globals = state.preload_globals;
ASSERT(this, as->thread_locals_tuid() != tuid());
memcpy(&thread_locals, &state.thread_locals, PRELOAD_THREAD_LOCALS_SIZE);
// The scratch buffer (for now) is merely a private mapping in
// the remote task. The CoW copy made by fork()'ing the
// address space has the semantics we want. It's not used in
// replay anyway.
scratch_ptr = state.scratch_ptr;
scratch_size = state.scratch_size;
// Whatever the captured task's last wait status was is what ours would
// have been.
wait_status = state.wait_status;
ticks = state.ticks;
own_namespace_rec_tid = state.own_namespace_rec_tid;
if (state.rseq_state) {
rseq_state = make_unique<RseqState>(*state.rseq_state);
}
}
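// Address of the next record slot in the syscallbuf: just past the header
// plus the bytes recorded so far.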
remote_ptr<const struct syscallbuf_record> Task::next_syscallbuf_record() {
return ((syscallbuf_child + 1).cast<uint8_t>() +
read_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes)))
.cast<const struct syscallbuf_record>();
}
long Task::stored_record_size(
remote_ptr<const struct syscallbuf_record> record) {
return ::stored_record_size(read_mem(REMOTE_PTR_FIELD(record, size)));
}
long Task::fallible_ptrace(int request, remote_ptr<void> addr, void* data) {
return ptrace(_ptrace_request(request), tid, addr, data);
}
bool Task::open_mem_fd() {
// Use ptrace to read/write during open_mem_fd
as->set_mem_fd(ScopedFd());
if (!is_stopped_) {
LOG(warn) << "Can't retrieve mem fd for " << tid <<
"; process not stopped, racing with exec?";
return false;
}
/**
* We're expecting that either we or the child can read the mem fd.
* It's possible for both to not be the case (us on certain kernel
* configurations, the child after it did a setuid).
*/
char pid_path[PATH_MAX];
sprintf(pid_path, "/proc/%d", tid);
ScopedFd dir_fd(pid_path, O_PATH);
if (dir_fd < 0) {
LOG(info) << "Can't retrieve mem fd for " << tid << "; process no longer exists??";
return false;
}
ScopedFd fd = ScopedFd::openat(dir_fd, "mem", O_RDWR | O_CLOEXEC);
if (!fd.is_open()) {
LOG(debug) << "Falling back to the remote fd dance";
AutoRemoteSyscalls remote(this);
int remote_mem_dir_fd = remote.send_fd(dir_fd);
if (remote_mem_dir_fd < 0) {
LOG(info) << "Can't retrieve mem fd for " << tid << "; process is exiting?";
return false;
}
char mem[] = "mem";
// If the remote dies, any of these can fail. That's ok, we'll just
// find that the fd wasn't successfully opened.
AutoRestoreMem remote_path(remote, mem, sizeof(mem));
int remote_mem_fd = remote.syscall(syscall_number_for_openat(arch()),
remote_mem_dir_fd, remote_path.get(), O_RDWR);
if (remote_mem_fd < 0) {
LOG(info) << "Can't retrieve mem fd for " << tid
<< "; couldn't open /proc/...mem; errno=" << errno_name(-remote_mem_fd);
return false;
}
fd = remote.retrieve_fd(remote_mem_fd);
remote.infallible_close_syscall_if_alive(remote_mem_fd);
remote.infallible_close_syscall_if_alive(remote_mem_dir_fd);
}
if (!fd.is_open()) {
LOG(info) << "Can't retrieve mem fd for " << tid << "; process no longer exists?";
return false;
}
as->set_mem_fd(std::move(fd));
return true;
}
void Task::open_mem_fd_if_needed() {
if (!as->mem_fd().is_open()) {
open_mem_fd();
}
}
ScopedFd& Task::pagemap_fd() {
if (!as->pagemap_fd().is_open()) {
ScopedFd fd(proc_pagemap_path().c_str(), O_RDONLY);
if (fd.is_open()) {
as->set_pagemap_fd(std::move(fd));
} else {
LOG(info) << "Can't retrieve pagemap fd for " << tid;
}
}
return as->pagemap_fd();
}
KernelMapping Task::init_syscall_buffer(AutoRemoteSyscalls& remote,
remote_ptr<void> map_hint) {
char name[50];
sprintf(name, "syscallbuf.%d", rec_tid);
KernelMapping km =
Session::create_shared_mmap(remote, syscallbuf_size, map_hint, name);
if (!km.size()) {
return km;
}
auto& m = remote.task()->vm()->mapping_of(km.start());
remote.task()->vm()->mapping_flags_of(km.start()) |=
AddressSpace::Mapping::IS_SYSCALLBUF;
ASSERT(this, !syscallbuf_child)
<< "Should not already have syscallbuf initialized!";
syscallbuf_child = km.start().cast<struct syscallbuf_hdr>();
// No entries to begin with.
memset(m.local_addr, 0, sizeof(struct syscallbuf_hdr));
return km;
}
void Task::set_syscallbuf_locked(bool locked) {
if (!syscallbuf_child) {
return;
}
remote_ptr<uint8_t> remote_addr = REMOTE_PTR_FIELD(syscallbuf_child, locked);
uint8_t locked_before = read_mem(remote_addr);
uint8_t new_locked = locked ? (locked_before | SYSCALLBUF_LOCKED_TRACER)
: (locked_before & ~SYSCALLBUF_LOCKED_TRACER);
if (new_locked != locked_before) {
write_mem(remote_addr, new_locked);
}
}
void Task::reset_syscallbuf() {
if (!syscallbuf_child) {
return;
}
ASSERT(this,
!is_in_untraced_syscall() ||
0 == (SYSCALLBUF_LOCKED_TRACEE &
read_mem(REMOTE_PTR_FIELD(syscallbuf_child, locked))));
// Memset is easiest to do by using the local mapping which should always
// exist for the syscallbuf
uint32_t num_rec =
read_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes));
uint8_t* ptr = as->local_mapping(syscallbuf_child + 1, num_rec);
DEBUG_ASSERT(ptr != nullptr);
memset(ptr, 0, num_rec);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes), (uint32_t)0);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, mprotect_record_count),
(uint32_t)0);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, mprotect_record_count_completed),
(uint32_t)0);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, blocked_sigs_generation),
(uint32_t)0);
}
ssize_t Task::read_bytes_ptrace(remote_ptr<void> addr, ssize_t buf_size,
void* buf) {
ssize_t nread = 0;
// ptrace operates on the word size of the host, so we really do want
// to use sizes of host types here.
uintptr_t word_size = sizeof(long);
errno = 0;
// Only read aligned words. This ensures we can always read the last
// byte before an unmapped region.
while (nread < buf_size) {
uintptr_t start = addr.as_int() + nread;
uintptr_t start_word = start & ~(word_size - 1);
uintptr_t end_word = start_word + word_size;
uintptr_t length = std::min(end_word - start, uintptr_t(buf_size - nread));
long v = fallible_ptrace(PTRACE_PEEKDATA, start_word, nullptr);
if (errno) {
break;
}
memcpy(static_cast<uint8_t*>(buf) + nread,
reinterpret_cast<uint8_t*>(&v) + (start - start_word), length);
nread += length;
}
return nread;
}
ssize_t Task::write_bytes_ptrace(remote_ptr<void> addr, ssize_t buf_size,
const void* buf) {
ssize_t nwritten = 0;
// ptrace operates on the word size of the host, so we really do want
// to use sizes of host types here.
uintptr_t word_size = sizeof(long);
errno = 0;
// Only write aligned words. This ensures we can always write the last
// byte before an unmapped region.
while (nwritten < buf_size) {
uintptr_t start = addr.as_int() + nwritten;
uintptr_t start_word = start & ~(word_size - 1);
uintptr_t end_word = start_word + word_size;
uintptr_t length =
std::min(end_word - start, uintptr_t(buf_size - nwritten));
long v;
if (length < word_size) {
v = fallible_ptrace(PTRACE_PEEKDATA, start_word, nullptr);
if (errno) {
break;
}
}
memcpy(reinterpret_cast<uint8_t*>(&v) + (start - start_word),
static_cast<const uint8_t*>(buf) + nwritten, length);
fallible_ptrace(PTRACE_POKEDATA, start_word, reinterpret_cast<void*>(v));
nwritten += length;
}
return nwritten;
}
ssize_t Task::read_bytes_fallible(remote_ptr<void> addr, ssize_t buf_size,
void* buf) {
ASSERT_ACTIONS(this, buf_size >= 0, << "Invalid buf_size " << buf_size);
if (0 == buf_size) {
return 0;
}
if (uint8_t* local_addr = as->local_mapping(addr, buf_size)) {
memcpy(buf, local_addr, buf_size);
return buf_size;
}
if (!as->mem_fd().is_open()) {
return read_bytes_ptrace(addr, buf_size, static_cast<uint8_t*>(buf));
}
ssize_t all_read = 0;
while (all_read < buf_size) {
errno = 0;
ssize_t nread = pread64(as->mem_fd(), static_cast<uint8_t*>(buf) + all_read,
buf_size - all_read, addr.as_int() + all_read);
// We open the mem_fd just after being notified of
// exec(), when the Task is created. Trying to read from that
// fd seems to return 0 with errno 0. Reopening the mem fd
// allows the pread to succeed. It seems that the first mem
// fd we open, very early in exec, refers to the address space
// before the exec and the second mem fd refers to the address
// space after exec.
if (0 == nread && 0 == all_read && 0 == errno) {
if (!open_mem_fd()) {
return 0;
}
continue;
}
if (nread <= 0) {
if (all_read > 0) {
// We did successfully read some data, so return success and ignore
// any error.
errno = 0;
return all_read;
}
return nread;
}
// We read some data. We should try again in case we get short reads.
all_read += nread;
}
return all_read;
}
void Task::read_bytes_helper(remote_ptr<void> addr, ssize_t buf_size, void* buf,
bool* ok) {
// pread64 etc can't handle addresses that appear to be negative ...
// like [vsyscall].
ssize_t nread = read_bytes_fallible(addr, buf_size, buf);
if (nread != buf_size) {
if (ok) {
*ok = false;
} else {
ASSERT(this, false) << "Should have read " << buf_size << " bytes from "
<< addr << ", but only read " << nread;
}
}
}
/**
* This function exists to work around
* https://bugzilla.kernel.org/show_bug.cgi?id=99101.
* On some kernels pwrite() to /proc/.../mem fails when writing to a region
* that's PROT_NONE.
* Also, writing through MAP_SHARED readonly mappings fails (even if the
* file was opened read-write originally), so we handle that here too.
*/
static ssize_t safe_pwrite64(Task* t, const void* buf, ssize_t buf_size,
remote_ptr<void> addr) {
vector<KernelMapping> mappings_to_fix;
for (const auto& m :
t->vm()->maps_containing_or_after(floor_page_size(addr))) {
if (m.map.start() >= ceil_page_size(addr + buf_size)) {
break;
}
if (m.map.prot() & PROT_WRITE) {
continue;
}
if (!(m.map.prot() & PROT_READ) || (m.map.flags() & MAP_SHARED)) {
mappings_to_fix.push_back(m.map);
}
};
if (mappings_to_fix.empty()) {
return pwrite_all_fallible(t->vm()->mem_fd(), buf, buf_size, addr.as_int());
}
AutoRemoteSyscalls remote(t);
int mprotect_syscallno = syscall_number_for_mprotect(t->arch());
bool failed_access = false;
for (auto& m : mappings_to_fix) {
long ret = remote.syscall(mprotect_syscallno, m.start(), m.size(), m.prot() | PROT_WRITE);
if ((int)ret == -EACCES) {
// We could be trying to write to a read-only shared file. In that case we should
// report the error without dying.
failed_access = true;
} else {
remote.check_syscall_result(ret, mprotect_syscallno, false);
}
}
ssize_t nwritten;
if (failed_access) {
nwritten = -1;
} else {
nwritten = pwrite_all_fallible(t->vm()->mem_fd(), buf, buf_size, addr.as_int());
}
for (auto& m : mappings_to_fix) {
remote.infallible_syscall(mprotect_syscallno, m.start(), m.size(),
m.prot());
}
if (failed_access) {
errno = EACCES;
}
return nwritten;
}
void Task::write_bytes_helper(remote_ptr<void> addr, ssize_t buf_size,
const void* buf, bool* ok, uint32_t flags) {
ASSERT(this, buf_size >= 0) << "Invalid buf_size " << buf_size;
if (0 == buf_size) {
return;
}
ssize_t nwritten = write_bytes_helper_no_notifications(addr, buf_size, buf, ok, flags);
if (nwritten > 0) {
vm()->notify_written(addr, nwritten, flags);
}
}
ssize_t Task::write_bytes_helper_no_notifications(remote_ptr<void> addr, ssize_t buf_size,
const void* buf, bool* ok, uint32_t flags) {
ASSERT(this, buf_size >= 0) << "Invalid buf_size " << buf_size;
if (0 == buf_size) {
return 0;
}
if (uint8_t* local_addr = as->local_mapping(addr, buf_size)) {
memcpy(local_addr, buf, buf_size);
return buf_size;
}
if (!as->mem_fd().is_open()) {
ssize_t nwritten =
write_bytes_ptrace(addr, buf_size, static_cast<const uint8_t*>(buf));
if (ok && nwritten < buf_size) {
*ok = false;
}
return nwritten;
}
errno = 0;
ssize_t nwritten = safe_pwrite64(this, buf, buf_size, addr.as_int());
// See comment in read_bytes_helper().
if (0 == nwritten && 0 == errno) {
open_mem_fd();
return write_bytes_helper_no_notifications(addr, buf_size, buf, ok, flags);
}
if (errno == EPERM) {
FATAL() << "Can't write to /proc/" << tid << "/mem\n"
<< "Maybe you need to disable grsecurity MPROTECT with:\n"
<< " setfattr -n user.pax.flags -v 'emr' <executable>";
}
if (ok) {
if (nwritten < buf_size) {
*ok = false;
}
} else {
ASSERT(this, nwritten == buf_size)
<< "Should have written " << buf_size << " bytes to " << addr
<< ", but only wrote " << nwritten;
}
return nwritten;
}
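// Scatter up to `size` bytes from `data` into the tracee across `ranges`,
// in order, and return the number of bytes actually written.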
uint64_t Task::write_ranges(const vector<FileMonitor::Range>& ranges,
void* data, size_t size) {
uint8_t* p = static_cast<uint8_t*>(data);
size_t s = size;
size_t result = 0;
for (auto& r : ranges) {
size_t bytes = min(s, r.length);
write_bytes_helper(r.data, bytes, p);
s -= bytes;
result += bytes;
if (s == 0) {
break;
}
}
return result;
}
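// Zero a range of tracee memory. For whole pages we try MADV_REMOVE first so
// the kernel can drop the backing pages instead of us writing zeroes;
// otherwise (or if that fails) we fall back to writing zero-filled buffers in
// chunks of at most 4MB.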
void Task::write_zeroes(unique_ptr<AutoRemoteSyscalls>* remote, remote_ptr<void> addr, size_t size) {
if (!size) {
return;
}
bool remove_ok = true;
remote_ptr<void> initial_addr = addr;
size_t initial_size = size;
vector<uint8_t> zeroes;
while (size > 0) {
size_t bytes;
remote_ptr<void> first_page = ceil_page_size(addr);
if (addr < first_page) {
bytes = min<size_t>(first_page - addr, size);
} else {
if (remove_ok) {
remote_ptr<void> last_page = floor_page_size(addr + size);
if (first_page < last_page) {
if (!*remote) {
*remote = make_unique<AutoRemoteSyscalls>(this);
}
int ret = (*remote)->syscall(syscall_number_for_madvise(arch()), first_page, last_page - first_page, MADV_REMOVE);
if (ret == 0) {
addr = last_page;
size -= last_page - first_page;
continue;
}
// Don't try MADV_REMOVE again
remove_ok = false;
}
}
bytes = min<size_t>(4*1024*1024, size);
}
zeroes.resize(bytes);
memset(zeroes.data(), 0, bytes);
ssize_t written = write_bytes_helper_no_notifications(addr, bytes, zeroes.data(), nullptr, 0);
ASSERT(this, written == (ssize_t)bytes);
addr += bytes;
size -= bytes;
}
vm()->notify_written(initial_addr, initial_size, 0);
}
const TraceStream* Task::trace_stream() const {
if (session().as_record()) {
return &session().as_record()->trace_writer();
}
if (session().as_replay()) {
return &session().as_replay()->trace_reader();
}
return nullptr;
}
bool Task::ptrace_if_stopped(int request, remote_ptr<void> addr, void* data) {
ASSERT(this, is_stopped_);
errno = 0;
fallible_ptrace(request, addr, data);
if (errno == ESRCH) {
LOG(debug) << "ptrace_if_stopped tid " << tid << " was not stopped";
return false;
}
ASSERT(this, !errno) << "ptrace(" << ptrace_req_name<NativeArch>(request) << ", " << tid
<< ", addr=" << addr << ", data=" << data
<< ") failed with errno " << errno;
return true;
}
SupportedArch Task::detect_syscall_arch() {
SupportedArch syscall_arch;
bool ok = get_syscall_instruction_arch(
this, regs().ip().decrement_by_syscall_insn_length(arch()),
&syscall_arch);
ASSERT(this, ok);
return syscall_arch;
}
bool Task::clone_syscall_is_complete(pid_t* new_pid,
SupportedArch syscall_arch) {
int event = ptrace_event();
if (PTRACE_EVENT_CLONE == event || PTRACE_EVENT_FORK == event ||
PTRACE_EVENT_VFORK == event) {
*new_pid = get_ptrace_eventmsg_pid();
ASSERT(this, *new_pid >= 0)
<< "Task was killed just after clone/fork/vfork and before we could get the new pid; giving up";
return true;
}
ASSERT(this, !event) << "Unexpected ptrace event "
<< ptrace_event_name(event);
// EAGAIN can happen here due to fork failing under load. The caller must
// handle this.
// XXX ENOSYS shouldn't happen here.
intptr_t result = regs().syscall_result_signed();
ASSERT(this,
regs().syscall_may_restart() || -ENOSYS == result ||
-EAGAIN == result || -ENOMEM == result)
<< "Unexpected task status " << status() << " ("
<< syscall_name(regs().original_syscallno(), syscall_arch)
<< " syscall errno: " << errno_name(-result) << ")";
return false;
}
template <typename Arch> static void do_preload_init_arch(Task* t) {
auto params = t->read_mem(
remote_ptr<rrcall_init_preload_params<Arch>>(t->regs().orig_arg1()));
for (Task* tt : t->vm()->task_set()) {
tt->preload_globals = params.globals.rptr();
}
ReplaySession *replay = t->session().as_replay();
if (replay && replay->has_trace_quirk(TraceReader::UsesGlobalsInReplay)) {
t->write_mem(REMOTE_PTR_FIELD(t->preload_globals, reserved_legacy_in_replay), (unsigned char)1);
}
}
static void do_preload_init(Task* t) {
RR_ARCH_FUNCTION(do_preload_init_arch, t->arch(), t);
}
void Task::at_preload_init() {
as->at_preload_init(this);
do_preload_init(this);
fd_table()->init_syscallbuf_fds_disabled(this);
}
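// Architectures disagree on the ordering of the parent-tid, tls and child-tid
// arguments to clone(); dispatch on the ordering the target arch uses.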
template <typename Arch>
static long perform_remote_clone_arch(
AutoRemoteSyscalls& remote, unsigned base_flags, remote_ptr<void> stack,
remote_ptr<int> ptid, remote_ptr<void> tls, remote_ptr<int> ctid) {
switch (Arch::clone_parameter_ordering) {
case Arch::FlagsStackParentTLSChild:
return remote.syscall(Arch::clone, base_flags, stack, ptid.as_int(),
tls.as_int(), ctid.as_int());
case Arch::FlagsStackParentChildTLS:
return remote.syscall(Arch::clone, base_flags, stack, ptid.as_int(),
ctid.as_int(), tls.as_int());
}
}
static long perform_remote_clone(AutoRemoteSyscalls& remote,
unsigned base_flags, remote_ptr<void> stack,
remote_ptr<int> ptid, remote_ptr<void> tls,
remote_ptr<int> ctid) {
RR_ARCH_FUNCTION(perform_remote_clone_arch, remote.arch(), remote, base_flags,
stack, ptid, tls, ctid);
}
/*static*/ Task* Task::os_clone(CloneReason reason, Session* session,
AutoRemoteSyscalls& remote, pid_t rec_child_tid,
uint32_t new_serial, unsigned base_flags,
FdTable::shr_ptr new_fds,
ThreadGroup::shr_ptr new_tg,
remote_ptr<void> stack, remote_ptr<int> ptid,
remote_ptr<void> tls, remote_ptr<int> ctid) {
long ret;
do {
ret = perform_remote_clone(remote, base_flags, stack, ptid, tls, ctid);
} while (ret == -EAGAIN);
ASSERT(remote.task(), ret >= 0)
<< "remote clone failed with errno " << errno_name(-ret);
Task* child = remote.task()->clone(
reason, clone_flags_to_task_flags(base_flags), stack, tls, ctid,
remote.new_tid(), rec_child_tid, new_serial, session, std::move(new_fds),
std::move(new_tg));
return child;
}
static void setup_fd_table(Task* t, FdTable& fds, int tracee_socket_fd_number) {
fds.add_monitor(t, STDOUT_FILENO, new StdioMonitor(t->session().tracee_output_fd(STDOUT_FILENO)));
fds.add_monitor(t, STDERR_FILENO, new StdioMonitor(t->session().tracee_output_fd(STDERR_FILENO)));
fds.add_monitor(t, RR_MAGIC_SAVE_DATA_FD, new MagicSaveDataMonitor());
fds.add_monitor(t, tracee_socket_fd_number, new PreserveFileMonitor());
}
static void spawned_child_fatal_error(const ScopedFd& err_fd,
const char* format, ...) {
va_list args;
va_start(args, format);
char* buf;
if (vasprintf(&buf, format, args) < 0) {
exit(1);
}
char* buf2;
if (asprintf(&buf2, "%s (%s)", buf, errno_name(errno).c_str()) < 0) {
exit(1);
}
write_all(err_fd, buf2, strlen(buf2));
_exit(1);
}
static void disable_tsc(const ScopedFd& err_fd) {
/* Trap to the rr process if a 'rdtsc' instruction is issued.
* That allows rr to record the tsc and replay it
* deterministically. */
if (0 > prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0)) {
spawned_child_fatal_error(err_fd, "error setting up prctl");
}
}
template <typename Arch> void set_up_process_arch(const ScopedFd&);
template <> void set_up_process_arch<X86Arch>(const ScopedFd& err_fd) { disable_tsc(err_fd); }
template <> void set_up_process_arch<X64Arch>(const ScopedFd& err_fd) { disable_tsc(err_fd); }
template <> void set_up_process_arch<ARM64Arch>(const ScopedFd&) {}
void set_up_process_arch(SupportedArch arch, const ScopedFd& err_fd) {
RR_ARCH_FUNCTION(set_up_process_arch, arch, err_fd);
}
/**
* Prepare this process and its ancestors for recording/replay by
* preventing direct access to sources of nondeterminism, and ensuring
* that rr bugs don't adversely affect the underlying system.
*/
static void set_up_process(Session& session, const ScopedFd& err_fd,
const ScopedFd& sock_fd, int sock_fd_number) {
/* TODO tracees can probably undo some of the setup below
* ... */
// Restore signal mask
sigset_t sigmask;
TraceeAttentionSet::get_original_sigmask(&sigmask);
sigprocmask(SIG_SETMASK, &sigmask, nullptr);
struct NativeArch::cap_header header = {.version =
_LINUX_CAPABILITY_VERSION_3,
.pid = 0 };
struct NativeArch::cap_data caps[2];
if (syscall(NativeArch::capget, &header, &caps) != 0) {
spawned_child_fatal_error(err_fd, "Failed to read capabilities");
}
uint32_t perfmon_mask = 1 << (CAP_PERFMON - 32);
if (caps[1].permitted & perfmon_mask) {
// Try to pass CAP_PERFMON into our tracees.
caps[1].inheritable |= perfmon_mask;
// Ignore any failures here. Capabilities are super complex and I'm not
// sure this can be trusted to succeed.
if (syscall(NativeArch::capset, &header, &caps) == 0) {
// Install CAP_PERFMON as an ambient capability.
// This prctl was only added in 4.3. Ignore failures.
prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_PERFMON, 0, 0);
}
}
/* CLOEXEC so that the original fd here will be closed by the exec that's
* about to happen.
*/
int fd = open("/dev/null", O_WRONLY | O_CLOEXEC);
if (0 > fd) {
spawned_child_fatal_error(err_fd, "error opening /dev/null");
}
if (RR_MAGIC_SAVE_DATA_FD != dup2(fd, RR_MAGIC_SAVE_DATA_FD)) {
spawned_child_fatal_error(err_fd, "error duping to RR_MAGIC_SAVE_DATA_FD");
}
if (sock_fd_number != dup2(sock_fd, sock_fd_number)) {
spawned_child_fatal_error(err_fd,
"error duping to RR_RESERVED_SOCKET_FD");
}
if (session.is_replaying()) {
// This task and all its descendants should silently reap any terminating
// children.
if (SIG_ERR == signal(SIGCHLD, SIG_IGN)) {
spawned_child_fatal_error(err_fd, "error doing signal()");
}
// If the rr process dies, prevent runaway tracee processes
// from dragging down the underlying system.
//
// TODO: this isn't inherited across fork().
if (0 > prctl(PR_SET_PDEATHSIG, SIGKILL)) {
spawned_child_fatal_error(err_fd, "Couldn't set parent-death signal");
}
// Put the replaying processes into their own session. This will stop
// signals being sent to these processes by the terminal --- in particular
// SIGTSTP/SIGINT/SIGWINCH.
setsid();
// Preserve increased resource limits, in case the tracee
// increased its limits and we need high limits to apply during replay.
} else {
restore_initial_resource_limits();
}
/* Do any architecture specific setup, such as disabling non-deterministic
instructions */
set_up_process_arch(NativeArch::arch(), err_fd);
/* If we're in setuid_sudo mode, we have CAP_SYS_ADMIN, so we don't need to
set NO_NEW_PRIVS here in order to install the seccomp filter later. In
replay we emulate any potentially privileged operations, so we might as
well set no_new_privs */
if (!session.is_recording() || !has_effective_caps(1 << CAP_SYS_ADMIN)) {
if (0 > prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
spawned_child_fatal_error(
err_fd,
"prctl(NO_NEW_PRIVS) failed, SECCOMP_FILTER is not available: your "
"kernel is too old. Use `record -n` to disable the filter.");
}
}
}
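// Build the seccomp filter installed in tracees during recording: syscalls
// issued from the untraced entry points in the rr page are allowed through,
// everything else traps to rr.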
static SeccompFilter<struct sock_filter> create_seccomp_filter() {
SeccompFilter<struct sock_filter> f;
for (auto& e : AddressSpace::rr_page_syscalls()) {
if (e.traced == AddressSpace::UNTRACED) {
auto ip = AddressSpace::rr_page_syscall_exit_point(e.traced, e.privileged,
e.enabled,
NativeArch::arch());
f.allow_syscalls_from_callsite(ip);
}
}
f.trace();
return f;
}
/**
* This is called (and must be called) in the tracee after rr has taken
* ptrace control. Otherwise, once we've installed the seccomp filter,
* things go wrong because we have no ptracer and the seccomp filter demands
* one.
*/
static void set_up_seccomp_filter(const struct sock_fprog& prog, const ScopedFd& err_fd) {
/* Note: the filter is installed only for record. This call
* will be emulated (not passed to the kernel) in the replay. */
if (0 > prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (uintptr_t)&prog, 0, 0)) {
spawned_child_fatal_error(
err_fd, "prctl(SECCOMP) failed, SECCOMP_FILTER is not available: your "
"kernel is too old.");
}
/* anything that happens from this point on gets filtered! */
}
static void run_initial_child(Session& session, const ScopedFd& error_fd,
const ScopedFd& sock_fd, int sock_fd_number,
const char* exe_path_cstr,
char* const argv_array[],
char* const envp_array[],
const struct sock_fprog& seccomp_prog) {
pid_t pid = getpid();
set_up_process(session, error_fd, sock_fd, sock_fd_number);
// The preceding code must run before sending SIGSTOP here,
// since after SIGSTOP replay emulates almost all syscalls, but
// we need the above syscalls to run "for real".
// Signal to tracer that we're configured.
::kill(pid, SIGSTOP);
// This code must run after rr has taken ptrace control.
set_up_seccomp_filter(seccomp_prog, error_fd);
// We do a small amount of dummy work here to retire
// some branches in order to ensure that the ticks value is
// non-zero. The tracer can then check the ticks value
// at the first ptrace-trap to see if it seems to be
// working.
int start = random() % 5;
int num_its = start + 5;
int sum = 0;
for (int i = start; i < num_its; ++i) {
sum += i;
}
syscall(SYS_write, -1, &sum, sizeof(sum));
CPUIDBugDetector::run_detection_code();
execve(exe_path_cstr, argv_array, envp_array);
switch (errno) {
case ENOENT:
spawned_child_fatal_error(
error_fd, "execve failed: '%s' (or interpreter) not found",
exe_path_cstr);
break;
default:
spawned_child_fatal_error(error_fd, "execve of '%s' failed",
exe_path_cstr);
break;
}
// Never returns!
}
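// Attach to `tid` with PTRACE_SEIZE, requesting the ptrace events rr needs
// (more of them when recording). PTRACE_O_EXITKILL is requested so the tracee
// dies if rr does, with a fallback for kernels too old to support it.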
long Task::ptrace_seize(pid_t tid, Session& session) {
intptr_t options = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK |
PTRACE_O_TRACECLONE;
if (!Flags::get().disable_ptrace_exit_events) {
options |= PTRACE_O_TRACEEXIT;
}
if (session.is_recording()) {
options |= PTRACE_O_TRACEVFORK | PTRACE_O_TRACESECCOMP | PTRACE_O_TRACEEXEC;
}
long ret =
ptrace((_ptrace_request)PTRACE_SEIZE, tid, nullptr, (void*)(options | PTRACE_O_EXITKILL));
if (ret < 0 && errno == EINVAL) {
// PTRACE_O_EXITKILL was added in kernel 3.8, and we only need
// it for more robust cleanup, so tolerate not having it.
ret = ptrace((_ptrace_request)PTRACE_SEIZE, tid, nullptr, (void*)options);
}
return ret;
}
/*static*/ Task* Task::spawn(Session& session, ScopedFd& error_fd,
ScopedFd* sock_fd_out,
ScopedFd* sock_fd_receiver_out,
int* tracee_socket_fd_number_out,
const std::string& exe_path,
const std::vector<std::string>& argv,
const std::vector<std::string>& envp,
pid_t rec_tid) {
DEBUG_ASSERT(session.tasks().size() == 0);
int sockets[2];
long ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sockets);
if (ret < 0) {
FATAL() << "socketpair failed";
}
*sock_fd_out = ScopedFd(sockets[0]);
*sock_fd_receiver_out = ScopedFd(sockets[1]);
// Find a usable FD number to dup to in the child. RR_RESERVED_SOCKET_FD
// might already be used by an outer rr.
int fd_number = RR_RESERVED_SOCKET_FD;
// We assume no other thread is mucking with this part of the fd address space.
while (true) {
ret = fcntl(fd_number, F_GETFD);
if (ret < 0) {
if (errno != EBADF) {
FATAL() << "Error checking fd";
}
break;
}
++fd_number;
}
*tracee_socket_fd_number_out = fd_number;
pid_t tid;
// After fork() in a multithreaded program, the child can safely call only
// async-signal-safe functions, and malloc is not one of them (breaks e.g.
// with tcmalloc).
// Doing the allocations before the fork duplicates the allocations, but
// prevents errors.
StringVectorToCharArray argv_array(argv);
StringVectorToCharArray envp_array(envp);
SeccompFilter<struct sock_filter> filter = create_seccomp_filter();
struct sock_fprog prog = {(unsigned short)filter.filters.size(),
filter.filters.data()};
do {
tid = fork();
// fork() can fail with EAGAIN due to temporary load issues. In such
// cases, retry the fork().
} while (0 > tid && errno == EAGAIN);
if (0 == tid) {
run_initial_child(session, error_fd, *sock_fd_receiver_out, fd_number, exe_path.c_str(),
argv_array.get(), envp_array.get(), prog);
// run_initial_child never returns
}
if (0 > tid) {
FATAL() << "Failed to fork";
}
// Make sure the child has the only reference to this side of the pipe.
error_fd.close();
// Sync with the child process.
// We minimize the code we run between fork()ing and PTRACE_SEIZE, because
// any abnormal exit of the rr process will leave the child paused and
// parented by the init process, i.e. effectively leaked. After PTRACE_SEIZE
// with PTRACE_O_EXITKILL, the tracee will die if rr dies.
if (getenv("RR_TEST_DELAY_SEIZE")) {
sleep(1);
}
ret = ptrace_seize(tid, session);
if (ret) {
// Note that although the tracee may have died due to some fatal error,
// we haven't reaped its exit code so there's no danger of killing
// (or PTRACE_SEIZEing) the wrong process.
int tmp_errno = errno;
::kill(tid, SIGKILL);
errno = tmp_errno;
string hint;
if (errno == EPERM) {
hint = "; child probably died before reaching SIGSTOP\n"
"Child's message: " +
session.read_spawned_task_error();
}
FATAL() << "PTRACE_SEIZE failed for tid " << tid << hint;
}
Task* t = session.new_task(tid, rec_tid, session.next_task_serial(),
NativeArch::arch(), "rr");
auto tg = session.create_initial_tg(t);
t->tg.swap(tg);
auto as = session.create_vm(t);
t->as.swap(as);
t->fds = FdTable::create(t);
setup_fd_table(t, *t->fds, fd_number);
// Install signal handler here, so that when creating the first RecordTask
// it sees the exact same signal state in the parent as will be in the child.
struct sigaction sa;
sa.sa_handler = handle_alarm_signal;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0; // No SA_RESTART, so waitpid() will be interrupted
sigaction(SIGALRM, &sa, nullptr);
if (!t->wait()) {
FATAL() << "Tracee died before reaching SIGSTOP";
}
if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
t->proceed_to_exit();
FATAL() << "Tracee died before reaching SIGSTOP\n"
"Child's message: "
<< session.read_spawned_task_error();
}
// SIGSTOP can be reported as a signal-stop or group-stop depending on
// whether PTRACE_SEIZE happened before or after it was delivered.
if (SIGSTOP != t->status().stop_sig() &&
SIGSTOP != t->status().group_stop()) {
WaitStatus failed_status = t->status();
t->kill();
FATAL() << "Unexpected stop " << failed_status
<< "\nChild's message: "
<< session.read_spawned_task_error();
}
t->clear_wait_status();
t->open_mem_fd();
return t;
}
void* Task::preload_thread_locals() {
return preload_thread_locals_local_addr(*as);
}
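// Returns true if the mapping's file name (as reported in /proc/<pid>/maps)
// ends with the " (deleted)" suffix the kernel appends to unlinked files.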
static bool file_was_deleted(const string& s) {
static const char deleted[] = " (deleted)";
ssize_t find_deleted = s.size() - (sizeof(deleted) - 1);
return s.find(deleted) == size_t(find_deleted);
}
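// Recreate |km| in task |t|: map the backing file directly when it is a real,
// non-deleted file, otherwise map fresh anonymous memory at the same address;
// then record the mapping in t's AddressSpace.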
static void create_mapping(Task *t, AutoRemoteSyscalls &remote, const KernelMapping &km) {
string real_file_name;
dev_t device = KernelMapping::NO_DEVICE;
ino_t inode = KernelMapping::NO_INODE;
if (km.is_real_device() && !file_was_deleted(km.fsname())) {
struct stat real_file;
remote.finish_direct_mmap(km.start(), km.size(), km.prot(), km.flags(),
km.fsname(), O_RDONLY, km.file_offset_bytes(),
real_file, real_file_name);
} else {
auto ret = remote.infallible_mmap_syscall_if_alive(km.start(), km.size(), km.prot(),
km.flags() | MAP_FIXED | MAP_ANONYMOUS, -1,
0);
ASSERT(t, ret || t->vm()->task_set().size() == t->thread_group()->task_set().size())
<< "Not handling shared address spaces where one threadgroup unexpectedly dies";
}
t->vm()->map(t, km.start(), km.size(), km.prot(), km.flags(), km.file_offset_bytes(),
real_file_name, device, inode, nullptr, &km);
}
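// Install |map| as the remote task's memory map via prctl(PR_SET_MM,
// PR_SET_MM_MAP). We first ask the kernel (PR_SET_MM_MAP_SIZE) what struct
// size it expects, since a 64-bit kernel expects the 64-bit layout even when
// the tracer is a 32-bit process.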
static void apply_mm_map(AutoRemoteSyscalls& remote, const NativeArch::prctl_mm_map& map)
{
unsigned int expected_size = 0;
int result = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, &expected_size, 0, 0);
if (result != 0) {
FATAL() << "Failed to get expected MM_MAP_SIZE. Error was " << errno_name(-result);
}
const void* pmap = NULL;
int pmap_size = 0;
/* Expected size matches native prctl_mm_map */
if (expected_size == sizeof(map)) {
pmap = &map;
pmap_size = sizeof(map);
}
#if defined(__i386__)
/* A 64-bit kernel expects a "64-bit sized" prctl_mm_map
even from a 32-bit process. */
X64Arch::prctl_mm_map map64;
if (expected_size == sizeof(map64)) {
LOG(warn) << "Kernel expects different sized MM_MAP. Using 64-bit prctl_mm_map.";
memcpy(&map64, &map, sizeof(map));
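/* The leading fields have the same layout in both variants; only auxv's
   pointer width (and hence the fields after it) differs, so fix those up
   explicitly. */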
map64.auxv.val = map.auxv.val;
map64.auxv_size = map.auxv_size;
map64.exe_fd = map.exe_fd;
pmap = &map64;
pmap_size = sizeof(map64);
}
#endif
/* Are we prepared for the requested structure size? */
if (pmap == NULL || pmap_size == 0) {
FATAL() << "Kernel expects MM_MAP of size " << expected_size;
}
AutoRestoreMem remote_mm_map(remote, (const uint8_t*)pmap, pmap_size);
result = remote.syscall(syscall_number_for_prctl(remote.task()->arch()), PR_SET_MM,
PR_SET_MM_MAP, remote_mm_map.get().as_int(),
pmap_size);
if (result == -EINVAL &&
(map.start_brk <= map.end_data || map.brk <= map.end_data)) {
CLEAN_FATAL() << "The linux kernel prohibits duplication of this task's memory map," <<
" because the brk segment is located below the data segment. Sorry.";
}
else if (result != 0) {
FATAL() << "Failed to set target task memory map. Error was " << errno_name(-result);
}
}
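// Copy the contents of mapping |km| from |from|'s address space into |to|'s,
// using read_bytes_fallible so partially-backed file mappings don't fail.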
static void copy_mem_mapping(Task* from, Task* to, const KernelMapping& km) {
vector<char> buf;
buf.resize(km.size());
ssize_t bytes = from->read_bytes_fallible(km.start(), km.size(), buf.data());
// There can be mappings of files where the mapping starts beyond the end-of-file
// so no bytes will be read.
if (bytes > 0) {
// We may have a short read here if there are beyond-end-of-mapped-file pages
// in the mapping.
bool ok = true;
to->write_bytes_helper(km.start(), bytes, buf.data(), &ok);
ASSERT(to, ok);
}
}
// https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/fs/proc/task_mmu.c?h=v6.3#n1352
#define PM_PRESENT (1ULL << 63)
#define PM_SWAP (1ULL << 62)
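// Like copy_mem_mapping(), but consult /proc/<pid>/pagemap and copy only the
// pages that are actually present in RAM or in swap in |from|, batching runs
// of consecutive used pages into single copies. Returns false if pagemap
// can't be read, in which case the caller falls back to copy_mem_mapping().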
static bool copy_mem_mapping_just_used(Task* from, Task* to, const KernelMapping& km)
{
ScopedFd& fd = from->pagemap_fd();
if (!fd.is_open()) {
LOG(debug) << "Failed to open " << from->proc_pagemap_path();
return false;
}
size_t pagesize = page_size();
uint64_t pages_present = 0; // Just for logging
const int max_buf_size = 65536;
vector<uint64_t> buf;
for (uintptr_t page_offset = 0; page_offset < km.size() / pagesize; page_offset += max_buf_size) {
auto page_read_offset = (km.start().as_int() / pagesize + page_offset);
size_t page_read_count = min<size_t>(max_buf_size, km.size() / pagesize - page_offset);
buf.resize(page_read_count);
size_t bytes_read = pread(fd, buf.data(), page_read_count * sizeof(uint64_t), page_read_offset * sizeof(uint64_t));
ASSERT(from, bytes_read == page_read_count * sizeof(uint64_t));
// A chunk was read from pagemap above, now iterate through it to detect
// if memory is physically present (bit 63, PM_PRESENT) or in swap (bit 62, PM_SWAP) in Task "from".
// If yes, just transfer those pages to the new Task "to".
// Also try to find consecutive pages to copy them in one operation.
// The file /proc/PID/pagemap consists of 64-bit values, each describing
// the state of one page. See https://www.kernel.org/doc/Documentation/vm/pagemap.txt
for (size_t page = 0; page < page_read_count; ++page) {
if (buf[page] & (PM_PRESENT | PM_SWAP)) {
auto start = km.start() + (page_offset + page) * pagesize;
if (start >= km.end()) {
break;
}
++pages_present;
// Check for consecutive used pages
while (page + 1 < page_read_count &&
buf[page + 1] & (PM_PRESENT | PM_SWAP))
{
++page;
++pages_present;
}
auto end = km.start() + (page_offset + page + 1) * pagesize;
LOG(debug) << km << " copying start: 0x" << hex << start << " end: 0x" << end
<< dec << " pages: " << (end - start) / pagesize;
auto pages = km.subrange(start, end);
copy_mem_mapping(from, to, pages);
}
}
}
LOG(debug) << km << " pages_present: " << pages_present << " pages_total: " << km.size() / pagesize;
return true;
}
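// Move the tracee mapping at |src| to |dest| with
// mremap(MREMAP_MAYMOVE | MREMAP_FIXED), and mirror the move in rr's
// AddressSpace model. |message| is included in the assertion on failure.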
static void mremap_move(AutoRemoteSyscalls& remote, remote_ptr<void> src,
remote_ptr<void> dest, size_t size, const char* message) {
if (!size) {
return;
}
long ret = remote.syscall(syscall_number_for_mremap(remote.arch()),
src, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, dest);
ASSERT(remote.task(), remote_ptr<void>(ret) == dest)
<< "Failed to move from " << src << " to " << dest << " "
<< HEX(size) << " bytes, ret=" << ret << ", " << message;
remote.task()->vm()->remap(remote.task(), src, size, dest, size,
MREMAP_MAYMOVE | MREMAP_FIXED);
}
/* Remap VDSO and VVAR to the addresses used in the target process,
before they get unmapped.
Otherwise the kernel seems to put the address of the original
VDSO __kernel_rt_sigreturn function as the return address on the stack.
This might not affect x86_64, because there the __restore_rt
in libpthread.so.0 is used instead.
*/
static void move_vdso_and_vvar_mappings(AutoRemoteSyscalls& remote,
const KernelMapping& vdso_new, const KernelMapping& vvar_new) {
KernelMapping vdso_current;
KernelMapping vvar_current;
Task* t = remote.task();
for (const auto& m : t->vm()->maps()) {
if (m.map.is_vdso()) {
vdso_current = m.map;
} else if (m.map.is_vvar()) {
vvar_current = m.map;
}
}
ASSERT(t, vdso_current.size() == vdso_new.size())
<< "VDSO size mismatch";
ASSERT(t, vvar_current.size() == vvar_new.size() || !vvar_new.size())
<< "VVAR size mismatch";
// Handle case where old and new addresses overlap by finding a free range early in the
// address space we can use as a temporary buffer. VDSOs are always at fairly high
// addresses so this shouldn't introduce any new overlap issues.
// We move VDSO and VVAR to their temp addresses first, then move both of them to their
// final address, to avoid situations where current's VDSO overlaps target's VVAR or
// vice versa.
size_t temp_size = vdso_new.size() + vvar_new.size();
remote_ptr<void> vdso_temp_address = t->vm()->find_free_memory(t,
temp_size,
remote_ptr<void>(65536), AddressSpace::FindFreeMemoryPolicy::STRICT_SEARCH);
remote_ptr<void> vvar_temp_address = vdso_temp_address + vdso_new.size();
MemoryRange temp_range(vdso_temp_address, temp_size);
ASSERT(t, !temp_range.intersects(vdso_new))
<< "Free memory found overlaps new VDSO address";
ASSERT(t, !temp_range.intersects(vvar_new))
<< "Free memory found overlaps new VVAR address";
mremap_move(remote, vdso_current.start(), vdso_temp_address, vdso_new.size(),
"vdso_current.start() -> vdso_temp_address");
if (vvar_new.size()) {
mremap_move(remote, vvar_current.start(), vvar_temp_address, vvar_current.size(),
"vvar_current.start() -> vvar_temp_address");
} else {
bool ok = remote.infallible_munmap_syscall_if_alive(vvar_current.start(),
vvar_current.size());
ASSERT(t, ok) << "Duped task got killed?";
t->vm()->unmap(t, vvar_current.start(), vvar_current.size());
}
mremap_move(remote, vdso_temp_address, vdso_new.start(), vdso_new.size(),
"vdso_temp_address -> vdso_new.start()");
mremap_move(remote, vvar_temp_address, vvar_new.start(), vvar_new.size(),
"vvar_temp_address -> vvar_new.start()");
}
const int all_rlimits[] = {
(int)RLIMIT_AS, (int)RLIMIT_CORE, (int)RLIMIT_CPU, (int)RLIMIT_DATA,
(int)RLIMIT_FSIZE, (int)RLIMIT_LOCKS, (int)RLIMIT_MEMLOCK,
(int)RLIMIT_MSGQUEUE, (int)RLIMIT_NICE, (int)RLIMIT_NOFILE, (int)RLIMIT_NPROC,
(int)RLIMIT_RSS, (int)RLIMIT_RTTIME, (int)RLIMIT_SIGPENDING, (int)RLIMIT_STACK
};
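/**
 * Clone |other|'s state into this task: recreate and copy its memory mappings
 * (stack, VDSO/VVAR placement, private mappings), duplicate its file
 * descriptors and working directory, copy its rlimits and prctl mm map, then
 * copy its captured register state and activate preload thread locals.
 */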
void Task::dup_from(Task *other) {
std::vector<KernelMapping> mappings;
KernelMapping stack_mapping;
bool found_stack = false;
KernelMapping vdso_mapping;
KernelMapping vvar_mapping;
for (auto map : other->vm()->maps()) {
auto km = map.map;
if (map.flags != AddressSpace::Mapping::FLAG_NONE) {
if (map.flags & (AddressSpace::Mapping::IS_THREAD_LOCALS |
AddressSpace::Mapping::IS_RR_PAGE)) {
// While under rr control this task already has an rr page and
// a thread locals shared segment, don't mess with them.
continue;
}
// For rr private mappings, just make an anonymous segment of the same size
km = KernelMapping(km.start(), km.end(), string(), KernelMapping::NO_DEVICE,
KernelMapping::NO_INODE, km.prot(),
(km.flags() & ~MAP_SHARED) | MAP_PRIVATE, 0);
}
if (km.is_stack() && !found_stack) {
stack_mapping = km;
found_stack = true;
} else {
if (km.is_vdso()) {
vdso_mapping = km;
} else if (km.is_vvar()) {
vvar_mapping = km;
} else if (!km.is_vsyscall()) {
mappings.push_back(km);
}
}
}
ASSERT(this, found_stack);
// Copy address space
LOG(debug) << "Mapping rr page for " << tid;
{
AutoRemoteSyscalls remote(this);
this->vm()->map_rr_page(remote);
}
{
AutoRemoteSyscalls remote(this, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS);
move_vdso_and_vvar_mappings(remote, vdso_mapping, vvar_mapping);
LOG(debug) << "Unmapping memory for " << tid;
// TODO: Only do this if the rr page isn't already mapped
AddressSpace::UnmapOptions options;
options.exclude_vdso_vvar = true;
this->vm()->unmap_all_but_rr_mappings(remote, options);
LOG(debug) << "Creating stack mapping " << stack_mapping << " for " << tid;
create_mapping(this, remote, stack_mapping);
LOG(debug) << "Copying stack into " << tid;
copy_mem_mapping(other, this, stack_mapping);
}
{
AutoRemoteSyscalls remote_this(this);
for (auto &km : mappings) {
LOG(debug) << "Creating mapping " << km << " for " << tid;
create_mapping(this, remote_this, km);
LOG(debug) << "Copying mapping into " << tid;
if (!(km.flags() & MAP_SHARED)) {
// Only make the effort for larger mappings; copy smaller ones as a whole.
if ((km.flags() & MAP_ANONYMOUS) &&
km.size() >= 0x400000/*4MB*/)
{
LOG(debug) << "Using copy_mem_mapping_just_used";
if (copy_mem_mapping_just_used(other, this, km)) {
continue;
}
LOG(debug) << "Fallback to copy_mem_mapping";
}
copy_mem_mapping(other, this, km);
}
}
AutoRemoteSyscalls remote_other(other);
std::vector<int> all_fds = read_all_proc_fds(other->tid);
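// Duplicate each of other's open fds into this task at the same fd number,
// preserving its close-on-exec flag; skip rr's own tracee socket fd.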
for (int fd : all_fds) {
if (fd == session().tracee_fd_number()) {
continue;
}
// If this is a /proc/self/mem fd, rewrite it for the new task
FileMonitor *fd_monitor = other->fd_table()->get_monitor(fd);
ScopedFd here;
if (fd_monitor && fd_monitor->type() == FileMonitor::ProcMem &&
((ProcMemMonitor *)fd_monitor)->target_is_vm(other->vm().get())) {
here = ScopedFd(::dup(this->vm()->mem_fd().get()));
} else {
here = remote_other.retrieve_fd(fd);
}
int remote_fd_flags = remote_other.infallible_syscall(
syscall_number_for_fcntl(this->arch()), fd, F_GETFD);
int remote_fd = remote_this.infallible_send_fd_if_alive(here);
if (remote_fd >= 0) {
if (remote_fd != fd) {
remote_this.infallible_syscall(syscall_number_for_dup3(this->arch()), remote_fd, fd, 0);
remote_this.infallible_close_syscall_if_alive(remote_fd);
}
remote_this.infallible_syscall(
syscall_number_for_fcntl(this->arch()),
fd, F_SETFD, remote_fd_flags);
}
}
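// Copy other's current working directory into this task: open "." in other,
// transfer that fd here, and fchdir() to it.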
string path = ".";
AutoRestoreMem child_path(remote_other, path.c_str());
{
long child_fd =
remote_other.syscall(syscall_number_for_openat(other->arch()), AT_FDCWD,
child_path.get(), O_RDONLY);
ASSERT(other, child_fd != -1);
ScopedFd fd = remote_other.retrieve_fd(child_fd);
remote_other.infallible_close_syscall_if_alive(child_fd);
child_fd = remote_this.infallible_send_fd_if_alive(fd);
if (child_fd >= 0) {
remote_this.syscall(syscall_number_for_fchdir(this->arch()), child_fd);
remote_this.infallible_close_syscall_if_alive(child_fd);
}
}
// Copy rlimits
struct rlimit64 limit;
for (size_t i = 0; i < (sizeof(all_rlimits)/sizeof(all_rlimits[0])); ++i) {
int err = syscall(SYS_prlimit64, (uintptr_t)other->tid,
(uintptr_t)all_rlimits[i], (uintptr_t)NULL, (uintptr_t)&limit);
ASSERT(other, err == 0);
err = syscall(SYS_prlimit64, (uintptr_t)this->tid,
(uintptr_t)all_rlimits[i], (uintptr_t)&limit, (uintptr_t)NULL);
ASSERT(this, err == 0);
}
NativeArch::prctl_mm_map map;
memset(&map, 0, sizeof(map));
other->vm()->read_mm_map(other, &map);
apply_mm_map(remote_this, map);
}
copy_state(other->capture_state());
activate_preload_thread_locals();
}
/**
* Proceeds until the task is stopped at the next system call (which is then
* mid-execution).
* Returns false if did_waitpid failed because the task got SIGKILL
* or equivalent.
*/
static bool __ptrace_cont(Task* t, ResumeRequest resume_how,
SupportedArch syscall_arch, int expect_syscallno,
int expect_syscallno2 = -1, pid_t new_tid = -1) {
t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS);
while (true) {
// Do our own waiting instead of calling Task::wait() so we can detect and
// handle tid changes due to off-main-thread execve.
WaitOptions options(t->tid);
if (new_tid >= 0) {
options.unblock_on_other_tasks = true;
}
WaitResult result = WaitManager::wait_stop(options);
if (new_tid >= 0 && result.code == WAIT_NO_CHILD) {
// The tid change happened before our wait call. Try another wait.
options.tid = new_tid;
options.unblock_on_other_tasks = false;
result = WaitManager::wait_stop(options);
}
ASSERT(t, result.code == WAIT_OK);
if (new_tid >= 0) {
t->hpc.set_tid(new_tid);
t->tid = new_tid;
}
if (!t->did_waitpid(result.status)) {
return false;
}
if (ReplaySession::is_ignored_signal(t->status().stop_sig())) {
t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS);
} else {
break;
}
}
ASSERT(t, !t->stop_sig())
<< "Expected no pending signal, but got " << t->stop_sig();
/* check if we are synchronized with the trace -- should never fail */
int current_syscall = t->regs().original_syscallno();
ASSERT(t,
current_syscall == expect_syscallno ||
current_syscall == expect_syscallno2)
<< "Should be at " << syscall_name(expect_syscallno, syscall_arch)
<< ", but instead at " << syscall_name(current_syscall, syscall_arch);
return true;
}
void Task::did_handle_ptrace_exit_event() {
ASSERT(this, !handled_ptrace_exit_event_);
handled_ptrace_exit_event_ = true;
}
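/**
 * Make this task perform an execve of |filename| by diverting it to the
 * traced-syscall instruction: write the filename and a NULL argv/envp word
 * into the page at the task's stack pointer, set up registers for the execve,
 * then drive the task through syscall entry and exit.
 */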
void Task::os_exec(SupportedArch exec_arch, std::string filename)
{
// Setup memory and registers for the execve call. We may not have to save
// the old values since they're going to be wiped out by execve. We can
// determine this by checking if this address space has any tasks with a
// different tgid.
Task* memory_task = this;
for (auto task : vm()->task_set()) {
if (task->tgid() != tgid()) {
memory_task = task;
break;
}
}
// Old data if required
std::vector<uint8_t> saved_data;
// Set up everything
Registers regs = this->regs();
regs.set_ip(vm()->traced_syscall_ip());
remote_ptr<void> remote_mem = floor_page_size(regs.sp());
// Determine how much memory we'll need
size_t filename_size = filename.size() + 1;
size_t total_size = filename_size + sizeof(size_t);
if (memory_task != this) {
saved_data = read_mem(remote_mem.cast<uint8_t>(), total_size);
}
// We write a zero word of the host's word size, not t's word size, but that's
// OK, since the host word size is at least as large as t's.
// We pass no argv or envp, so exec params 2 and 3 just point to the NULL
// word.
write_mem(remote_mem.cast<size_t>(), size_t(0));
regs.set_arg2(remote_mem);
regs.set_arg3(remote_mem);
remote_ptr<void> filename_addr = remote_mem + sizeof(size_t);
write_bytes_helper(filename_addr, filename_size, filename.c_str());
regs.set_arg1(filename_addr);
/* The original_syscallno is execve in the old architecture. The kernel does
* not update the original_syscallno when the architecture changes across
* an exec.
* We're using the dedicated traced-syscall IP so its arch is t's arch.
*/
int expect_syscallno = syscall_number_for_execve(arch());
regs.set_syscallno(expect_syscallno);
regs.set_original_syscallno(expect_syscallno);
set_regs(regs);
LOG(debug) << "Beginning execve" << this->regs();
enter_syscall();
ASSERT(this, !stop_sig()) << "exec failed on entry";
/* Complete the syscall. The tid of the task will be the thread-group-leader
* tid, no matter what tid it was before.
*/
pid_t tgid = real_tgid();
bool ok = __ptrace_cont(this, RESUME_SYSCALL, arch(), expect_syscallno,
syscall_number_for_execve(exec_arch),
tgid == tid ? -1 : tgid);
ASSERT(this, ok) << "Task " << tid << " got killed while trying to exec";
LOG(debug) << this->status() << " " << this->regs();
if (this->regs().syscall_result()) {
errno = -this->regs().syscall_result();
if (access(filename.c_str(), 0) == -1 && errno == ENOENT &&
exec_arch == x86) {
FATAL() << "Cannot find " << filename
<< " to replay this 32-bit process; you probably built rr with "
"disable32bit";
}
errno = -this->regs().syscall_result();
ASSERT(this, false) << "Exec of " << filename << " failed";
}
// Restore any memory if required. We need to do this through memory_task,
// since the new task is now on the new address space. Do it now because
// later we may try to unmap this task's syscallbuf.
if (memory_task != this) {
memory_task->write_mem(remote_mem.cast<uint8_t>(), saved_data.data(),
saved_data.size());
}
}
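/**
 * On aarch64 ptrace does not report a separate "original" syscall number or
 * original arg1 the way x86 does, so at syscall entry capture them from the
 * current registers, along with the tick count and ip, for later use when
 * handling the syscall.
 */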
void Task::apply_syscall_entry_regs()
{
if (arch() == aarch64) {
registers.set_original_syscallno(registers.syscallno());
registers.set_orig_arg1(registers.arg1());
// Don't update registers_dirty here, because these registers are not part
// of the ptrace state tracked by that flag.
ticks_at_last_syscall_entry = tick_count();
ip_at_last_syscall_entry = registers.ip();
last_syscall_entry_recorded = false;
}
}
void Task::tgkill(int sig) {
LOG(debug) << "Sending " << sig << " to tid " << tid;
ASSERT(this, 0 == syscall(SYS_tgkill, real_tgid(), tid, sig));
}
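/**
 * Get this task into a signal-stop: send it the syscallbuf desched signal
 * with tgkill() and single-step (without advancing the ip) until the task
 * stops for that signal, swallowing any TIME_SLICE_SIGNALs along the way.
 * Returns false if the task dies instead.
 */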
bool Task::move_to_signal_stop()
{
LOG(debug) << " maybe not in signal-stop (status " << status()
<< "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)";
// Always send SYSCALLBUF_DESCHED_SIGNAL because other signals (except
// TIME_SLICE_SIGNAL) will be blocked by
// RecordTask::will_resume_execution().
// During record make sure to use the syscallbuf desched sig.
// During replay, it doesn't really matter, since we don't apply
// the signal mask to the replay task.
int sig = SYSCALLBUF_DEFAULT_DESCHED_SIGNAL;
if (session().is_recording()) {
sig = session().as_record()->syscallbuf_desched_sig();
}
// Note that this signal cannot be blocked by tracees.
this->tgkill(sig);
/* Now singlestep the task until we're in a signal-stop for the signal
* we've just sent. We must absorb and forget that signal here since we
* don't want it delivered to the task for real.
*/
auto old_ip = ip();
if (arch() == aarch64 && session().is_recording() && status().is_syscall() &&
static_cast<RecordTask*>(this)->at_may_restart_syscall()) {
// On aarch64, single step of an aborted syscall
// will cause us to move to before the syscall instruction
old_ip = old_ip.decrement_by_syscall_insn_length(arch());
}
do {
if (!resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
return false;
}
ASSERT(this, old_ip == ip())
<< "Singlestep actually advanced when we "
<< "just expected a signal; was at " << old_ip << " now at "
<< ip() << " with status " << status();
// Ignore any pending TIME_SLICE_SIGNALs and continue until we get our
// SYSCALLBUF_DESCHED_SIGNAL.
} while (stop_sig() == PerfCounters::TIME_SLICE_SIGNAL);
return true;
}
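/**
 * Decide whether |event_type| should divert this task to its rseq abort
 * handler. Reads the tracee's struct rseq and the registered rseq_cs
 * descriptor, validates them (setting *invalid_rseq_cs if the descriptor is
 * malformed or the abort signature doesn't match), and returns true with
 * *new_ip set to the abort ip when the current ip lies inside the critical
 * section and the relevant no-restart flag is not set.
 */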
bool Task::should_apply_rseq_abort(EventType event_type, remote_code_ptr* new_ip,
bool* invalid_rseq_cs) {
/* Syscallbuf flushes don't trigger rseq aborts ---
whatever triggered the syscallbuf flush might */
if (!rseq_state || event_type == EV_SYSCALLBUF_FLUSH) {
return false;
}
// We're relying on the fact that rseq_t is the same across architectures.
// These reads might fail if the task is dead and gone.
bool ok = true;
auto rseq = read_mem(rseq_state->ptr.cast<typename NativeArch::rseq_t>(), &ok);
if (!ok || !rseq.rseq_cs) {
return false;
}
auto rseq_cs = read_mem(remote_ptr<typename NativeArch::rseq_cs>(rseq.rseq_cs), &ok);
if (!ok || rseq_cs.version ||
rseq_cs.start_ip + rseq_cs.post_commit_offset < rseq_cs.start_ip ||
rseq_cs.abort_ip - rseq_cs.start_ip < rseq_cs.post_commit_offset) {
*invalid_rseq_cs = true;
return false;
}
if (ip().register_value() - rseq_cs.start_ip >= rseq_cs.post_commit_offset) {
return false;
}
uint32_t flag;
switch (event_type) {
case EV_SCHED:
flag = 1 << RR_RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT;
break;
case EV_SIGNAL:
flag = 1 << RR_RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT;
break;
default:
/* A system call inside the rseq region should SIGSEGV but we don't emulate that yet */
ASSERT(this, false) << "Unsupported event type";
return false;
}
if ((rseq.flags | rseq_cs.flags) & flag) {
return false;
}
uint32_t sig = read_mem(remote_ptr<uint32_t>(rseq_cs.abort_ip - 4), &ok);
if (!ok || sig != rseq_state->abort_prefix_signature) {
*invalid_rseq_cs = true;
return false;
}
*new_ip = remote_code_ptr(rseq_cs.abort_ip);
return true;
}
}