blob: 2b60536e5438efd2e9001f43d8294c72aee10243 [file] [log] [blame]
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#include "AutoRemoteSyscalls.h"
#include <limits.h>
#include <linux/net.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sstream>
#include "rr/rr.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "ReplaySession.h"
#include "Session.h"
#include "Task.h"
#include "core.h"
#include "kernel_abi.h"
#include "kernel_metadata.h"
#include "log.h"
#include "util.h"
using namespace std;
namespace rr {
/**
* The ABI of the socketcall syscall is a nightmare; the first arg to
* the kernel is the sub-operation, and the second argument is a
* pointer to the args. The args depend on the sub-op.
*
* This struct is the argument block that the second argument points to:
* three words of the tracee architecture's signed_long type. It is declared
* packed so that the compiler inserts no padding.
*/
template <typename Arch> struct socketcall_args {
// Arguments of the sub-operation. The users in this file fill them as
// { socket fd, address of the msghdr, 0 }.
typename Arch::signed_long args[3];
} __attribute__((packed));
// Reserves `num_bytes` bytes directly below the remote stack pointer, saves
// the bytes currently stored there, and, when `mem` is non-null, copies
// `num_bytes` bytes from `mem` into the reserved area.
//
// The remote registers are updated so that sp points at the reserved area.
// If reading or writing tracee memory fails, `addr` is reset to null.
void AutoRestoreMem::init(const void* mem, ssize_t num_bytes) {
  auto* tracee = remote.task();
  ASSERT(tracee,
         remote.enable_mem_params() == AutoRemoteSyscalls::ENABLE_MEMORY_PARAMS)
      << "Memory parameters were disabled";

  len = num_bytes;

  // Carve the buffer out of the stack by moving sp down.
  auto& regs = remote.regs();
  saved_sp = regs.sp();
  regs.set_sp(regs.sp() - len);
  tracee->set_regs(regs);
  addr = regs.sp();

  // Keep a copy of the original contents so the destructor can put them back.
  data.resize(len);
  bool success = true;
  tracee->read_bytes_helper(addr, len, data.data(), &success);
  if (mem != nullptr) {
    tracee->write_bytes_helper(addr, len, mem, &success);
  }

  // A null address tells the destructor that there is nothing to write back.
  if (!success) {
    addr = nullptr;
  }
}
// Writes the saved bytes back into the tracee (if the area was set up
// successfully) and moves the remote stack pointer back up by `len`.
AutoRestoreMem::~AutoRestoreMem() {
  DEBUG_ASSERT(saved_sp == remote.regs().sp() + len);

  auto* tracee = remote.task();
  if (addr) {
    // XXX what should we do if this task was sigkilled but the address
    // space is used by other live tasks?
    tracee->write_bytes_helper(addr, len, data.data());
  }

  auto& regs = remote.regs();
  regs.set_sp(regs.sp() + len);
  tracee->set_regs(regs);
}
// Returns true when SIGTRAP has its default disposition and is not blocked
// in `t`. Tasks that are not part of a recording session always report true.
static bool is_SIGTRAP_default_and_unblocked(Task* t) {
  if (t->session().is_recording()) {
    auto* rec_task = static_cast<RecordTask*>(t);
    if (rec_task->sig_disposition(SIGTRAP) != SIGNAL_DEFAULT) {
      return false;
    }
    return !rec_task->is_sig_blocked(SIGTRAP);
  }
  return true;
}
// Prepares task `t` for executing injected ("remote") syscalls.
//
// Captures the task's registers, ip, sp and wait status so that they can be
// restored later, chooses the syscall-injection path via setup_path(), moves
// the stack pointer to usable memory when `enable_mem_params` is
// ENABLE_MEMORY_PARAMS, and records whether TIF_SIGPENDING has to be re-armed
// and whether the signal mask has to be restored during cleanup.
AutoRemoteSyscalls::AutoRemoteSyscalls(Task* t,
MemParamsEnabled enable_mem_params)
: t(t),
initial_regs(t->regs()),
initial_ip(t->ip()),
initial_sp(t->regs().sp()),
initial_at_seccomp(t->ptrace_event() == PTRACE_EVENT_SECCOMP),
restore_wait_status(t->status()),
new_tid_(-1),
scratch_mem_was_mapped(false),
use_singlestep_path(false),
enable_mem_params_(enable_mem_params),
restore_sigmask(false),
need_sigpending_renable(false) {
if (initial_at_seccomp) {
// This should only ever happen during recording - we don't use the
// seccomp traps during replay.
ASSERT(t, t->session().is_recording());
}
// We support two paths for syscalls:
// -- a fast path using a privileged untraced syscall and PTRACE_SINGLESTEP.
// This only requires a single task-wait.
// -- a slower path using a privileged traced syscall and PTRACE_SYSCALL/
// PTRACE_CONT via Task::enter_syscall(). This requires 2 or 3 task-waits
// depending on whether the seccomp event fires before the syscall-entry
// event.
// Use the slow path when running under rr, because the rr recording us
// needs to see and trace these tracee syscalls, and if they're untraced by
// us they're also untraced by the outer rr.
// Use the slow path if SIGTRAP is blocked or ignored because otherwise
// the PTRACE_SINGLESTEP will cause the kernel to unblock it.
setup_path(t->vm()->has_rr_page() && !running_under_rr() &&
is_SIGTRAP_default_and_unblocked(t));
if (enable_mem_params == ENABLE_MEMORY_PARAMS) {
maybe_fix_stack_pointer();
}
if (t->status().is_syscall() && t->regs().syscall_may_restart()) {
// VERY rare corner case alert: It is possible for the following sequence
// of events to occur:
//
// 1. Thread A is in a blocking may-restart syscall and gets interrupted by a tg-targeted signal
// 2. Thread B dequeues the signal
// 3. Thread A is in the syscall-exit-stop with TIF_SIGPENDING set (with registers indicating syscall restart)
// 4. We get here to perform an AutoRemoteSyscall
// 5. During AutoRemoteSyscall, TIF_SIGPENDING gets cleared on return to userspace
// 6. We finish the AutoRemoteSyscall and re-apply the registers.
// 7. ... As a result, the kernel does not check whether it needs to perform the
//    syscall-restart register adjustment because TIF_SIGPENDING is not set.
// 8. The -ERESTART error code leaks to userspace.
//
// Arguably this is a kernel bug, but it's not clear how the behavior should be changed.
//
// To work around this, we forcibly re-enable TIF_SIGPENDING when cleaning up
// AutoRemoteSyscall (see below).
need_sigpending_renable = true;
}
if (t->session().is_recording()) {
RecordTask *rt = static_cast<RecordTask*>(t);
if (rt->schedule_frozen) {
// If we're explicitly controlling the schedule, make sure not to accidentally run
// any signals that we were not meant to be able to see.
// The current mask is saved here and every signal is blocked.
restore_sigmask = true;
sigmask_to_restore = rt->get_sigmask();
sig_set_t all_blocked;
memset(&all_blocked, 0xff, sizeof(all_blocked));
// Ignore the process dying here - we'll notice later.
(void)rt->set_sigmask(all_blocked);
}
}
}
// Chooses the address of the syscall instruction that remote syscalls will
// execute and makes sure the bytes at that address really are a syscall
// instruction.
//
// `enable_singlestep_path` selects the untraced, privileged entry point in
// the rr page; otherwise the address space's traced syscall ip is used. On
// aarch64 builds the singlestep path is always disabled.
//
// If an earlier call replaced instruction bytes, those bytes are written back
// first. The chosen address is stored as the ip of `initial_regs`.
void AutoRemoteSyscalls::setup_path(bool enable_singlestep_path) {
#if defined(__aarch64__)
// XXXkhuey this fast path doesn't work on AArch64 yet, go slow instead
enable_singlestep_path = false;
#endif
if (!replaced_bytes.empty()) {
// XXX what to do here to clean up if the task died unexpectedly?
t->write_mem(remote_ptr<uint8_t>(initial_regs.ip().to_data_ptr<uint8_t>()),
replaced_bytes.data(), replaced_bytes.size());
}
remote_code_ptr syscall_ip;
use_singlestep_path = enable_singlestep_path;
if (use_singlestep_path) {
syscall_ip = AddressSpace::rr_page_syscall_entry_point(
AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY, t->arch());
} else {
syscall_ip = t->vm()->traced_syscall_ip();
}
initial_regs.set_ip(syscall_ip);
// We need to make sure to clear any breakpoints or other alterations of
// the syscall instruction we're using. Note that the tracee may have set its
// own breakpoints or otherwise modified the instruction, so suspending our
// own breakpoint is insufficient.
std::vector<uint8_t> syscall = rr::syscall_instruction(t->arch());
bool ok = true;
replaced_bytes =
t->read_mem(initial_regs.ip().to_data_ptr<uint8_t>(), syscall.size(), &ok);
if (!ok) {
// The task died
return;
}
if (replaced_bytes == syscall) {
// The instruction is intact, so there is nothing to restore later.
replaced_bytes.clear();
} else {
// Patch in a clean syscall instruction; replaced_bytes keeps the bytes that
// were there so they can be written back.
t->write_mem(initial_regs.ip().to_data_ptr<uint8_t>(), syscall.data(),
syscall.size(), &ok);
}
}
// A mapping can serve as temporary stack space when it is both readable and
// writable and is mapped MAP_PRIVATE.
static bool is_usable_area(const KernelMapping& km) {
  const int read_write = PROT_READ | PROT_WRITE;
  if ((km.prot() & read_write) != read_write) {
    return false;
  }
  return (km.flags() & MAP_PRIVATE) != 0;
}
void AutoRemoteSyscalls::maybe_fix_stack_pointer() {
if (!t->session().done_initial_exec()) {
return;
}
remote_ptr<void> last_stack_byte = t->regs().sp() - 1;
if (t->vm()->has_mapping(last_stack_byte)) {
auto m = t->vm()->mapping_of(last_stack_byte);
if (is_usable_area(m.map) && m.map.start() + 2048 <= t->regs().sp()) {
// 'sp' is in a stack region and there's plenty of space there. No need
// to fix anything.
return;
}
}
MemoryRange found_stack;
for (const auto& m : t->vm()->maps()) {
if (is_usable_area(m.map)) {
found_stack = m.map;
break;
}
};
if (found_stack.start().is_null()) {
AutoRemoteSyscalls remote(t, DISABLE_MEMORY_PARAMS);
found_stack =
MemoryRange(remote.infallible_mmap_syscall_if_alive(
remote_ptr<void>(), 4096, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
4096);
ASSERT(t, !found_stack.start().is_null())
<< "Tracee unexpectedly died here";
scratch_mem_was_mapped = true;
}
fixed_sp = found_stack.end();
DEBUG_ASSERT(!fixed_sp.is_null());
initial_regs.set_sp(fixed_sp);
}
// On destruction, restore the state of the task this object was created for.
AutoRemoteSyscalls::~AutoRemoteSyscalls() {
  restore_state_to(t);
}
// Undoes the changes made for remote syscalls on task `t`: unmaps the scratch
// region if one was mapped, writes back any replaced instruction bytes and
// restores the registers. Unless the task is exiting or dies along the way,
// it also restores the saved wait status, the signal mask (if it was changed)
// and TIF_SIGPENDING (if required).
//
// Note that the parameter `t` shadows the member of the same name; arch()
// below reads the member.
void AutoRemoteSyscalls::restore_state_to(Task* t) {
// Check if the task was unexpectedly killed via SIGKILL or equivalent.
bool is_exiting = !t->is_stopped() || t->ptrace_event() == PTRACE_EVENT_EXIT ||
t->was_reaped();
// Unmap our scratch region if required
if (scratch_mem_was_mapped && !is_exiting) {
AutoRemoteSyscalls remote(t, DISABLE_MEMORY_PARAMS);
remote.infallible_syscall(syscall_number_for_munmap(arch()),
fixed_sp - 4096, 4096);
}
if (!replaced_bytes.empty()) {
// XXX how to clean up if the task died and the address space is shared with live task?
t->write_mem(remote_ptr<uint8_t>(initial_regs.ip().to_data_ptr<uint8_t>()),
replaced_bytes.data(), replaced_bytes.size());
}
// Start from the registers we have been using and put back the original
// ip and sp.
auto regs = initial_regs;
regs.set_ip(initial_ip);
regs.set_sp(initial_sp);
if (is_exiting) {
// Don't restore status; callers need to see the task is exiting.
// And the other stuff we do below won't work.
// But do restore registers so it looks like the exit happened in a clean state.
t->set_regs(regs);
return;
}
if (t->arch() == aarch64 && regs.syscall_may_restart()) {
// On AArch64, the kernel restarts aborted syscalls using an internal `orig_x0`.
// This gets overwritten every time we make a syscall so we need to restore it
// if we are at a syscall that may restart.
// The kernel `orig_x0` isn't accessible from ptrace AFAICT but fortunately
// it does **NOT** get reset on syscall exit so we can actually set its value
// just by making a dummy syscall with the correct x0 value.
auto restart_res = regs.syscall_result();
regs.set_ip(t->vm()->traced_syscall_ip());
// This can be any side-effect-free syscall that doesn't care about arg1.
// The kernel sets its `orig_x0` no matter whether the syscall actually needs it.
regs.set_syscallno(rr::ARM64Arch::getpid);
regs.set_arg1(regs.orig_arg1());
t->set_regs(regs);
if (t->enter_syscall(true)) {
if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
// Tracee died unexpectedly, there is nothing more we can do.
// Do not restore the status, we want callers to see that the task died.
return;
}
}
// Undo the register changes made for the dummy syscall.
regs.set_ip(initial_ip);
regs.set_syscallno(regs.original_syscallno());
regs.set_syscall_result(restart_res);
}
// If we were sitting at a seccomp trap, try to get back there by resuming
// here. Since the original register contents caused a seccomp trap,
// re-running the syscall with the same registers should put us right back
// to this same seccomp trap.
if (initial_at_seccomp && t->ptrace_event() != PTRACE_EVENT_SECCOMP) {
regs.set_ip(initial_ip.decrement_by_syscall_insn_length(t->arch()));
regs.set_syscallno(regs.original_syscallno());
t->set_regs(regs);
RecordTask* rt = static_cast<RecordTask*>(t);
// Keep resuming until a ptrace event is reported; any signal stop seen in
// between is stashed.
while (true) {
if (!rt->resume_execution(RESUME_CONT, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
// Tracee died unexpectedly, there is nothing more we can do.
// Do not restore the status, we want callers to see that the task died.
return;
}
if (rt->ptrace_event())
break;
rt->stash_sig();
}
ASSERT(rt, rt->ptrace_event() == PTRACE_EVENT_SECCOMP);
} else {
// Restore stomped registers.
t->set_regs(regs);
}
t->set_status(restore_wait_status);
if (restore_sigmask) {
static_cast<RecordTask*>(t)->set_sigmask(sigmask_to_restore);
}
if (need_sigpending_renable) {
// The purpose of this PTRACE_INTERRUPT is to re-enable TIF_SIGPENDING on
// the tracee, without forcing any actual signals on it. Since PTRACE_INTERRUPT
// needs to be able to interrupt re-startable system calls, it is required
// to set TIF_SIGPENDING, but the fact that this works is of course a very
// deep implementation detail.
// If this fails then the tracee must be dead or no longer traced, in which
// case we no longer care about its TIF_SIGPENDING status.
t->do_ptrace_interrupt();
}
}
// Decides whether the signal stop that `t` is currently in may be ignored
// while a remote syscall is in progress.
//
// Returns false when `t` is not stopped with a signal. During replay, returns
// true for signals that ReplaySession reports as ignored. During recording,
// always returns true; the signal is stashed first unless it is the
// syscallbuf desched signal. Any other signal triggers an assertion failure.
static bool ignore_signal(Task* t) {
  const int sig = t->stop_sig();
  if (sig == 0) {
    return false;
  }

  auto& session = t->session();
  if (session.is_replaying()) {
    if (ReplaySession::is_ignored_signal(sig)) {
      return true;
    }
  } else if (session.is_recording()) {
    auto* rt = static_cast<RecordTask*>(t);
    const bool is_desched_sig = sig == rt->session().syscallbuf_desched_sig();
    if (!is_desched_sig) {
      rt->stash_sig();
    }
    return true;
  }

  // Unexpected signal: report it with as much detail as can be obtained.
  siginfo_t siginfo;
  errno = 0;
  t->fallible_ptrace(PTRACE_GETSIGINFO, nullptr, &siginfo);
  if (errno != 0) {
    ASSERT(t, false) << "Unexpected signal " << signal_name(sig);
  } else {
    ASSERT(t, false) << "Unexpected signal " << siginfo;
  }
  return false;
}
// Executes syscall number `syscallno` in the tracee using the register state
// in `callregs` and waits for it to finish.
//
// Returns the syscall result read from the tracee's registers, or -ESRCH
// when the task is already exiting or a resume/enter step reports failure.
// `callregs` is modified: the syscall number is stored in it, and its ip is
// updated if the singlestep path had to be disabled.
long AutoRemoteSyscalls::syscall_base(int syscallno, Registers& callregs) {
LOG(debug) << "syscall " << syscall_name(syscallno, t->arch()) << " " << callregs;
if (t->seen_ptrace_exit_event()) {
LOG(debug) << "Task is dying, don't try anything.";
return -ESRCH;
}
if ((int)callregs.arg1() == SIGTRAP && use_singlestep_path &&
(is_sigaction_syscall(syscallno, t->arch()) ||
is_rt_sigaction_syscall(syscallno, t->arch()) ||
is_signal_syscall(syscallno, t->arch()))) {
// Don't use the fast path if we're about to set up a signal handler
// for SIGTRAP!
LOG(debug) << "Disabling singlestep path due to SIGTRAP sigaction";
setup_path(false);
callregs.set_ip(initial_regs.ip());
}
callregs.set_original_syscallno(syscallno);
callregs.set_syscallno(syscallno);
t->set_regs(callregs);
// True when we started at a seccomp stop and the task is still there.
bool from_seccomp = initial_at_seccomp && t->ptrace_event() == PTRACE_EVENT_SECCOMP;
if (use_singlestep_path && !from_seccomp) {
// Fast path: singlestep over the syscall instruction.
while (true) {
if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
// Tracee was killed, there is nothing more we can do.
return -ESRCH;
}
LOG(debug) << "Used singlestep path; status=" << t->status();
// When a PTRACE_EVENT_EXIT is returned we don't update registers
if (t->ip() != callregs.ip()) {
// We entered the syscall, so stop now
break;
}
if (t->stop_sig() == SIGTRAP && t->get_siginfo().si_code == TRAP_TRACE) {
// On aarch64, if we were previously in a syscall-exit stop, continuing
// with PTRACE_SINGLESTEP will result in incurring a trap upon execution
// of the first instruction in userspace. Ignore such a trap.
continue;
}
if (ignore_signal(t)) {
// We were interrupted by a signal before we even entered the syscall
continue;
}
ASSERT(t, false) << "Unexpected status " << t->status();
}
} else {
// Slow path: enter the syscall (unless already at the seccomp stop), then
// resume with RESUME_SYSCALL.
if (from_seccomp) {
LOG(debug) << "Skipping enter_syscall - already at seccomp stop";
} else {
if (!t->enter_syscall(true)) {
// Tracee was killed, there is nothing more we can do.
// Ensure callers see the task death status.
return -ESRCH;
}
LOG(debug) << "Used enter_syscall; status=" << t->status();
}
if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
// Tracee was killed, there is nothing more we can do.
// Ensure callers see the task death status.
return -ESRCH;
}
LOG(debug) << "syscall exit status=" << t->status();
}
// Examine the stop we ended up in; keep going until the syscall is done.
while (true) {
if (t->status().is_syscall() ||
(t->stop_sig() == SIGTRAP &&
is_kernel_trap(t->get_siginfo().si_code))) {
// If we got a SIGTRAP then we assume that's our singlestep and we're
// done.
break;
}
if (is_clone_syscall(syscallno, t->arch()) &&
t->clone_syscall_is_complete(&new_tid_, t->arch())) {
// The new task's tid has been stored in new_tid_; continue to the next stop.
if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
// Tracee was killed, there is nothing more we can do.
return -ESRCH;
}
LOG(debug) << "got clone event; new status=" << t->status();
continue;
}
if (ignore_signal(t)) {
if (t->regs().syscall_may_restart()) {
// The syscall was interrupted and may restart: enter it again and
// continue to its next stop.
if (!t->enter_syscall(true)) {
// Tracee was killed, there is nothing more we can do.
return -ESRCH;
}
LOG(debug) << "signal ignored; restarting syscall, status="
<< t->status();
if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
// Tracee was killed, there is nothing more we can do.
return -ESRCH;
}
LOG(debug) << "syscall exit status=" << t->status();
continue;
}
LOG(debug) << "signal ignored";
// We have been notified of a signal after a non-interruptible syscall
// completed. Don't continue, we're done here.
break;
}
ASSERT(t, false) << "Unexpected status " << t->status();
break;
}
LOG(debug) << "done, result=" << t->regs().syscall_result();
return t->regs().syscall_result();
}
// Architecture of the task that remote syscalls are executed in.
SupportedArch AutoRemoteSyscalls::arch() const {
  return t->arch();
}
// Writes a socketcall argument block holding { arg1, arg2, arg3 } into the
// memory of task `t` at `remote_mem`. `ok` is passed through to write_mem.
template <typename Arch>
static void write_socketcall_args(Task* t, remote_ptr<void> remote_mem,
                                  typename Arch::signed_long arg1,
                                  typename Arch::signed_long arg2,
                                  typename Arch::signed_long arg3,
                                  bool* ok) {
  const socketcall_args<Arch> packed_args = { { arg1, arg2, arg3 } };
  const auto destination = remote_mem.cast<socketcall_args<Arch>>();
  t->write_mem(destination, packed_args, ok);
}
// Self-contained buffer describing a one-byte message with room for one
// control message that carries a single int (a file descriptor), laid out
// with Arch's iovec/msghdr definitions.
//
// All pointers stored inside (iov_base, msg_control, msg_iov) are computed
// relative to a base address, so the struct refers to itself correctly when
// a copy of it is located at that base address.
template <typename Arch>
struct fd_message {
// Unfortunately we need to send at least one byte of data in our
// message for it to work
char data;
// The single iovec of the message; points at `data`.
typename Arch::iovec msgdata;
// Control-message buffer with space for one int.
char cmsgbuf[Arch::cmsg_space(sizeof(int))];
typename Arch::msghdr msg;
// XXX: Could make this conditional on Arch
socketcall_args<Arch> socketcall;
// Fills in the message so that every internal pointer is an address inside
// the struct as if it were located at `base`. `socketcall` is not touched.
void init(remote_ptr<fd_message<Arch>> base) {
data = 0;
msgdata.iov_base = REMOTE_PTR_FIELD(base, data);
msgdata.iov_len = 1;
memset(&msg, 0, sizeof(msg));
msg.msg_control = REMOTE_PTR_FIELD(base, cmsgbuf);
msg.msg_controllen = sizeof(cmsgbuf);
msg.msg_iov = REMOTE_PTR_FIELD(base, msgdata);
msg.msg_iovlen = 1;
}
// Builds a message whose pointers are valid at address `base`.
fd_message(remote_ptr<fd_message<Arch>> base) {
init(base);
}
// Builds a message whose pointers refer to this very object.
fd_message() {
init((uintptr_t)this);
}
// Recovers the base address from iov_base, which points at `data`; `data`
// is the first member, so its address is the address of the whole struct.
remote_ptr<fd_message<Arch>> remote_this() {
return msgdata.iov_base.rptr().as_int();
}
// Address of the `msg` member relative to the base address.
remote_ptr<typename Arch::msghdr> remote_msg() {
return REMOTE_PTR_FIELD(remote_this(), msg);
}
// Address of the `socketcall` member relative to the base address.
remote_ptr<socketcall_args<Arch>> remote_sc_args() {
return REMOTE_PTR_FIELD(remote_this(), socketcall);
}
// Address of the control message's data: the address of `cmsgbuf` plus the
// offset that Arch::cmsg_data() yields for a null header pointer.
remote_ptr<int> remote_cmsgdata() {
return REMOTE_PTR_FIELD(remote_this(), cmsgbuf).as_int() +
(uintptr_t)Arch::cmsg_data(NULL);
}
};
// Makes the tracee call sendmsg() on its socket `child_sock`, sending a
// one-byte message with an SCM_RIGHTS control message that carries `fd`.
//
// The message is built locally with pointers valid in the tracee, copied into
// temporary tracee stack memory and then submitted either via socketcall() or
// via sendmsg(), depending on the architecture.
//
// Returns the result of the remote syscall, or -ESRCH when the message could
// not be written into the tracee.
template <typename Arch>
static long child_sendmsg(AutoRemoteSyscalls& remote, int child_sock, int fd) {
  const bool use_socketcall = has_socketcall_syscall(Arch::arch());

  AutoRestoreMem remote_buf(remote, nullptr, sizeof(fd_message<Arch>));
  fd_message<Arch> msg(remote_buf.get().cast<fd_message<Arch>>());

  // Describe the fd to pass in the control message.
  auto* control = reinterpret_cast<typename Arch::cmsghdr*>(msg.cmsgbuf);
  control->cmsg_len = Arch::cmsg_len(sizeof(fd));
  control->cmsg_level = SOL_SOCKET;
  control->cmsg_type = SCM_RIGHTS;
  *static_cast<int*>(Arch::cmsg_data(control)) = fd;

  if (use_socketcall) {
    const socketcall_args<Arch> packed_args = {
      { child_sock, (typename Arch::signed_long)msg.remote_msg().as_int(), 0 }
    };
    msg.socketcall = packed_args;
  }

  // Copy the finished message into the tracee.
  bool wrote = true;
  remote.task()->write_bytes_helper(remote_buf.get().cast<char>(),
                                    sizeof(msg), &msg, &wrote);
  if (!wrote) {
    return -ESRCH;
  }

  if (use_socketcall) {
    return remote.syscall(Arch::socketcall, SYS_SENDMSG, msg.remote_sc_args());
  }
  return remote.syscall(Arch::sendmsg, child_sock, msg.remote_msg(), 0);
}
// Makes the tracee call recvmsg() on its socket `child_sock` and returns the
// int found in the data of the received control message (the fd number as
// seen by the tracee).
//
// Returns -ESRCH when tracee memory cannot be written or read, and the
// negative syscall result when the remote recvmsg fails.
template <typename Arch>
static long child_recvmsg(AutoRemoteSyscalls& remote, int child_sock) {
  const bool use_socketcall = has_socketcall_syscall(Arch::arch());

  AutoRestoreMem remote_buf(remote, nullptr, sizeof(fd_message<Arch>));
  fd_message<Arch> msg(remote_buf.get().cast<fd_message<Arch>>());

  if (use_socketcall) {
    const socketcall_args<Arch> packed_args = {
      { child_sock, (typename Arch::signed_long)msg.remote_msg().as_int(), 0 }
    };
    msg.socketcall = packed_args;
  }

  // Copy the empty message into the tracee so it can receive into it.
  bool ok = true;
  remote.task()->write_bytes_helper(remote_buf.get().cast<char>(),
                                    sizeof(msg), &msg, &ok);
  if (!ok) {
    ASSERT(remote.task(), errno == ESRCH) << "Error writing " << remote_buf.get()
                                          << " in " << remote.task()->tid;
    LOG(debug) << "Failed to write memory";
    return -ESRCH;
  }

  const int ret =
      use_socketcall
          ? remote.syscall(Arch::socketcall, SYS_RECVMSG, msg.remote_sc_args())
          : remote.syscall(Arch::recvmsg, child_sock, msg.remote_msg(), 0);
  if (ret < 0) {
    LOG(debug) << "Failed to recvmsg " << ret;
    return ret;
  }

  // Read the received fd number out of the tracee's control buffer.
  int their_fd = remote.task()->read_mem(msg.remote_cmsgdata(), &ok);
  if (!ok) {
    ASSERT(remote.task(), errno == ESRCH);
    LOG(debug) << "Failed to read msg";
    return -ESRCH;
  }
  return their_fd;
}
#define MAX_FDS_READ 2
// Try to read a single-character message from `sock`. Will collect
// up to MAX_FDS_READ fds in an SCM_RIGHTS control message and return those
// fds. Returns an empty vector if reading the message fails.
//
// When `blocking` is false the read uses MSG_DONTWAIT. Received fds are
// created close-on-exec (MSG_CMSG_CLOEXEC). A message that was read but does
// not start with an SOL_SOCKET/SCM_RIGHTS control message is a fatal error.
static vector<ScopedFd> maybe_receive_fds(ScopedFd& sock, bool blocking = true) {
  vector<ScopedFd> ret;

  char ch;
  struct iovec iov = { &ch, 1 };
  // The control buffer is accessed through `struct cmsghdr*` by the CMSG_*
  // macros, so it must be suitably aligned for that type (see cmsg(3)); a
  // plain char array gives no such guarantee.
  alignas(struct cmsghdr) char cmsgbuf[CMSG_SPACE(MAX_FDS_READ * sizeof(int))];

  struct msghdr msg;
  memset(&msg, 0, sizeof(msg));
  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = cmsgbuf;
  msg.msg_controllen = sizeof(cmsgbuf);

  int flags = MSG_CMSG_CLOEXEC;
  if (!blocking) {
    flags |= MSG_DONTWAIT;
  }
  if (recvmsg(sock, &msg, flags) < 0) {
    return ret;
  }

  struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
  if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
      cmsg->cmsg_type != SCM_RIGHTS) {
    FATAL() << "Invalid cmsg";
  }

  // The payload is an array of ints; copy each one out with memcpy because
  // CMSG_DATA yields a byte pointer.
  int num_fds = (cmsg->cmsg_len - CMSG_LEN(0))/sizeof(int);
  for (int i = 0; i < num_fds; i++) {
    int fd;
    memcpy(&fd, CMSG_DATA(cmsg) + i*sizeof(int), sizeof(int));
    DEBUG_ASSERT(fd >= 0);
    ret.push_back(ScopedFd(fd));
  }
  return ret;
}
// Sends `fd_to_send` over `sock` from this process as an SCM_RIGHTS control
// message attached to a one-byte message. Failure to send is fatal.
static void sendmsg_socket(ScopedFd& sock, int fd_to_send) {
  // The default constructor makes all internal pointers refer to this object.
  fd_message<NativeArch> msg;
  struct msghdr* header = (struct msghdr*)&msg.msg;

  struct cmsghdr* control = CMSG_FIRSTHDR(header);
  control->cmsg_len = CMSG_LEN(sizeof(fd_to_send));
  control->cmsg_level = SOL_SOCKET;
  control->cmsg_type = SCM_RIGHTS;
  memcpy(CMSG_DATA(control), &fd_to_send, sizeof(fd_to_send));

  if (sendmsg(sock, header, 0) < 0) {
    FATAL() << "Failed to send fd";
  }
}
// Searches the tasks sharing `t`'s fd table for one whose tgid equals its
// rec_tid and which has not yet seen a ptrace exit event. Returns the first
// such task, or nullptr when there is none.
static Task* thread_group_leader_for_fds(Task* t) {
  for (Task* candidate : t->fd_table()->task_set()) {
    const bool is_leader = candidate->tgid() == candidate->rec_tid;
    if (is_leader && !candidate->seen_ptrace_exit_event()) {
      return candidate;
    }
  }
  return nullptr;
}
// Obtains a copy of the tracee's file descriptor `fd` in this process.
//
// First tries pidfd_open/pidfd_getfd, which needs no cooperation from the
// tracee. If that is unavailable (ENOSYS), falls back to having the tracee
// send the fd over the session's tracee socket.
//
// Returns a closed ScopedFd when the tracee-side sendmsg reports -ESRCH.
template <typename Arch> ScopedFd AutoRemoteSyscalls::retrieve_fd_arch(int fd) {
  ScopedFd ret;

  if (!pid_fd.is_open()) {
    // Try to use pidfd_getfd to get the fd without round-tripping to the tracee.
    // pidfd_getfd requires a threadgroup leader, so find one if we can.
    if (Task* tg_leader_for_fds = thread_group_leader_for_fds(t)) {
      pid_fd = ScopedFd(::syscall(NativeArch::pidfd_open, tg_leader_for_fds->tid, 0));
      ASSERT(t, pid_fd.is_open() || errno == ENOSYS)
          << "Error in pidfd_open errno=" << errno_name(errno);
    }
  }

  if (pid_fd.is_open()) {
    ret = ScopedFd(::syscall(NativeArch::pidfd_getfd, pid_fd.get(), fd, 0));
    if (ret.is_open()) {
      return ret;
    }
    ASSERT(t, errno == ENOSYS) << "Failed in pidfd_getfd errno=" << errno_name(errno);
  }

  auto& session = task()->session();

  // Clear out any pending message in the socket.
  maybe_receive_fds(session.tracee_socket_receiver_fd(), false);

  // Ask the tracee to send us the fd.
  long child_syscall_result =
      child_sendmsg<Arch>(*this, session.tracee_fd_number(), fd);
  if (child_syscall_result == -ESRCH) {
    return ret;
  }
  ASSERT(t, child_syscall_result > 0) << "Failed to sendmsg() in tracee; err="
                                      << errno_name(-child_syscall_result);

  // Pick up exactly one fd from our end of the socket.
  vector<ScopedFd> fds = maybe_receive_fds(session.tracee_socket_fd());
  ASSERT(t, !fds.empty()) << "Failed to receive fd";
  ASSERT(t, fds.size() == 1);
  return std::move(fds[0]);
}
// Obtains a copy of the tracee's file descriptor `fd` in this process by
// dispatching to retrieve_fd_arch for the tracee's architecture. The returned
// ScopedFd can be closed (not open) if the tracee died during the transfer.
ScopedFd AutoRemoteSyscalls::retrieve_fd(int fd) {
RR_ARCH_FUNCTION(retrieve_fd_arch, arch(), fd);
}
// Transfers `our_fd` into the tracee: this process sends it over the
// session's tracee socket and the tracee is made to receive it.
//
// Returns the fd number in the tracee, -EBADF when `our_fd` is not open, or
// -ESRCH when the tracee-side receive reports -ESRCH. Any other tracee-side
// failure trips an assertion.
template <typename Arch> int AutoRemoteSyscalls::send_fd_arch(const ScopedFd &our_fd) {
  if (!our_fd.is_open()) {
    return -EBADF;
  }

  auto& session = task()->session();

  // Clear out any pending message from the socket.
  maybe_receive_fds(session.tracee_socket_receiver_fd(), false);

  auto& sender = session.tracee_socket_fd();
  LOG(debug) << "Sending fd " << our_fd.get() << " via socket fd " << sender.get();
  sendmsg_socket(sender, our_fd.get());

  long child_syscall_result =
      child_recvmsg<Arch>(*this, session.tracee_fd_number());
  // If the child died before reading the message from the socket,
  // the message will still be in the socket buffer and will be received
  // the next time we try to send something to a tracee. That's why
  // before using tracee_socket_receiver_fd we need to drain up to one message
  // from it.
  ASSERT(t, child_syscall_result >= 0 || child_syscall_result == -ESRCH)
      << "Failed to recvmsg() in tracee; err=" << errno_name(-child_syscall_result);
  return child_syscall_result;
}
// Transfers `our_fd` into the tracee by dispatching to send_fd_arch for the
// tracee's architecture. Returns the fd number in the tracee, or a negative
// errno value (-EBADF if `our_fd` is not open, -ESRCH if the tracee-side
// receive reported -ESRCH).
int AutoRemoteSyscalls::send_fd(const ScopedFd &our_fd) {
RR_ARCH_FUNCTION(send_fd_arch, arch(), our_fd);
}
// Closes `child_fd` in the tracee via infallible_syscall_if_alive.
void AutoRemoteSyscalls::infallible_close_syscall_if_alive(int child_fd) {
  const int close_syscallno = syscall_number_for_close(arch());
  infallible_syscall_if_alive(close_syscallno, child_fd);
}
// Transfers `our_fd` into the tracee and returns the resulting fd number in
// the tracee. The only failure that is tolerated is -ESRCH outside of replay,
// which is returned to the caller; every other failure trips the assertion.
int AutoRemoteSyscalls::infallible_send_fd_if_alive(const ScopedFd &our_fd) {
int child_fd = send_fd(our_fd);
ASSERT(t, child_fd >= 0 || (child_fd == -ESRCH && !t->session().is_replaying()))
<< "Failed to send fd; err=" << errno_name(-child_fd);
return child_fd;
}
// Transfers `our_fd` into the tracee and makes it available there as fd
// number `dup_to`. When the transferred fd did not already land on `dup_to`,
// it is duplicated with dup3(`dup3_flags`) and the temporary fd is closed.
// Failure to transfer the fd trips an assertion.
void AutoRemoteSyscalls::infallible_send_fd_dup(const ScopedFd& our_fd, int dup_to, int dup3_flags) {
  int remote_fd = infallible_send_fd_if_alive(our_fd);
  ASSERT(t, remote_fd >= 0);
  if (remote_fd == dup_to) {
    // Already in the right slot; nothing to duplicate.
    return;
  }

  const int dup3_syscallno = syscall_number_for_dup3(arch());
  long ret = infallible_syscall(dup3_syscallno, remote_fd, dup_to, dup3_flags);
  ASSERT(task(), ret == dup_to);
  infallible_close_syscall_if_alive(remote_fd);
}
// Performs an mmap in the tracee, using mmap2 on architectures that have it
// (the offset is then passed in units of 4096 bytes).
//
// `offset_bytes` must be a multiple of the page size. Returns the address
// reported by the syscall. For MAP_FIXED requests that report a null result,
// the kernel's view of the mapping at `addr` is consulted (only when our own
// address-space model has no mapping there); if the kernel shows a mapping of
// the expected extent, `addr` is returned as if the call had succeeded.
remote_ptr<void> AutoRemoteSyscalls::infallible_mmap_syscall_if_alive(
    remote_ptr<void> addr, size_t length, int prot, int flags, int child_fd,
    uint64_t offset_bytes) {
  ASSERT(t, offset_bytes % page_size() == 0)
      << "mmap offset (" << offset_bytes << ") must be multiple of page size ("
      << page_size() << ")";

  remote_ptr<void> ret;
  if (has_mmap2_syscall(arch())) {
    ret = infallible_syscall_ptr_if_alive(syscall_number_for_mmap2(arch()), addr,
                                          length, prot, flags, child_fd,
                                          (off_t)offset_bytes / 4096);
  } else {
    ret = infallible_syscall_ptr_if_alive(syscall_number_for_mmap(arch()), addr,
                                          length, prot, flags, child_fd,
                                          offset_bytes);
  }

  if (!(flags & MAP_FIXED)) {
    return ret;
  }
  if (ret) {
    ASSERT(t, addr == ret) << "MAP_FIXED at " << addr << " but got " << ret;
    return ret;
  }
  if (t->vm()->has_mapping(addr)) {
    return ret;
  }

  KernelMapping km = t->vm()->read_kernel_mapping(t, addr);
  if (km.size()) {
    ASSERT(t, km.start() == addr && km.size() == ceil_page_size(length));
    // The mapping was created. Pretend this call succeeded.
    ret = addr;
  }
  return ret;
}
// Performs munmap(`addr`, `length`) in the tracee. Returns true when the
// syscall reports 0. When it reports anything else but our address-space
// model still has a mapping at `addr` while the kernel shows none, the unmap
// is treated as having succeeded and true is returned as well.
bool AutoRemoteSyscalls::infallible_munmap_syscall_if_alive(
    remote_ptr<void> addr, size_t length) {
  const int munmap_syscallno = syscall_number_for_munmap(arch());
  long ret = infallible_syscall_if_alive(munmap_syscallno, addr, length);
  if (ret == 0) {
    return true;
  }

  if (t->vm()->has_mapping(addr)) {
    KernelMapping km = t->vm()->read_kernel_mapping(t, addr);
    if (!km.size()) {
      // The unmap happened but the task must have died before
      // reporting the status.
      return true;
    }
  }
  return false;
}
int64_t AutoRemoteSyscalls::infallible_lseek_syscall(int fd, int64_t offset,
int whence) {
switch (arch()) {
case x86: {
AutoRestoreMem mem(*this, &offset, sizeof(int64_t));
infallible_syscall(syscall_number_for__llseek(arch()), fd, offset >> 32,
offset, mem.get(), whence);
return t->read_mem(mem.get().cast<int64_t>());
}
case x86_64:
case aarch64:
return infallible_syscall(syscall_number_for_lseek(arch()), fd, offset,
whence);
default:
ASSERT(t, false) << "Unknown arch";
return -1;
}
}
// Asserts that `ret`, the result of remote syscall `syscallno`, is not an
// error value (i.e. not in the open interval (-4096, 0)).
//
// -ESRCH is accepted silently when `allow_death` is set and the session is
// not replaying. On failure the assertion message names the syscall, the
// errno and the six argument registers; for open/openat it also includes the
// path, and for mmap/mremap the process maps are printed first.
void AutoRemoteSyscalls::check_syscall_result(long ret, int syscallno, bool allow_death) {
  if (word_size(t->arch()) == 4) {
    // Sign-extend ret because it can be a 32-bit negative errno
    ret = (int)ret;
  }
  if (ret == -ESRCH && allow_death && !t->session().is_replaying()) {
    return;
  }

  const bool is_error = ret < 0 && ret > -4096;
  if (!is_error) {
    return;
  }

  string extra_msg;
  if (is_open_syscall(syscallno, arch())) {
    extra_msg = " opening ";
    extra_msg += t->read_c_str(t->regs().arg1());
  } else if (is_openat_syscall(syscallno, arch())) {
    extra_msg = " opening ";
    extra_msg += t->read_c_str(t->regs().arg2());
  } else if (is_mremap_syscall(syscallno, arch()) ||
             is_mmap_syscall(syscallno, arch())) {
    AddressSpace::print_process_maps(t);
  }

  ASSERT(t, false) << "Syscall " << syscall_name(syscallno, arch())
                   << " failed with errno " << errno_name(-ret) << extra_msg
                   << " arg1=0x" << hex << t->regs().arg1() << " arg2=0x" << t->regs().arg2()
                   << " arg3=0x" << hex << t->regs().arg3() << " arg4=0x" << t->regs().arg4()
                   << " arg5=0x" << hex << t->regs().arg5() << " arg6=0x" << t->regs().arg6();
}
// Maps `length` bytes of the file `backing_file_name` into the tracee at
// `rec_addr` (MAP_FIXED), starting at `backing_offset_bytes`.
//
// The file is opened in the tracee with `backing_file_open_flags` (plus
// RR_LARGEFILE_32 for 4-byte-word architectures), mapped, and closed again.
// While it is open, `real_file` and `real_file_name` are filled in with the
// stat data and the name of the opened fd. MAP_GROWSDOWN mappings are not
// accepted, and MAP_SYNC is removed from `flags`.
void AutoRemoteSyscalls::finish_direct_mmap(
    remote_ptr<void> rec_addr, size_t length,
    int prot, int flags,
    const string& backing_file_name,
    int backing_file_open_flags,
    off64_t backing_offset_bytes,
    struct stat& real_file, string& real_file_name) {
  LOG(debug) << "directly mmap'ing " << length << " bytes of "
             << backing_file_name << " at offset "
             << HEX(backing_offset_bytes);

  ASSERT(task(), !(flags & MAP_GROWSDOWN));

  if (word_size(t->arch()) == 4) {
    backing_file_open_flags |= RR_LARGEFILE_32;
  }

  /* Open in the tracee the file that was mapped during
   * recording. The file name only has to live in tracee memory for the
   * duration of the openat call. */
  int fd;
  {
    AutoRestoreMem child_str(*this, backing_file_name.c_str());
    fd = infallible_syscall(syscall_number_for_openat(arch()), -1,
                            child_str.get().as_int(),
                            backing_file_open_flags);
  }

  /* MAP_SYNC is used to request direct mapping (DAX) from the filesystem for
   * persistent memory devices (requires MAP_SHARED_VALIDATE). Drop it for the
   * backing file. */
  const int mmap_flags = (flags & ~MAP_SYNC) | MAP_FIXED;

  /* And mmap that file.
   * (We let SHARED|WRITEABLE mappings go through while they're not handled
   * properly, but we shouldn't do that.) */
  infallible_mmap_syscall_if_alive(rec_addr, length, prot, mmap_flags, fd,
                                   backing_offset_bytes);

  // While it's open, grab the link reference.
  real_file = task()->stat_fd(fd);
  real_file_name = task()->file_name_of_fd(fd);

  /* Don't leak the tmp fd. The mmap doesn't need the fd to
   * stay open. */
  infallible_close_syscall_if_alive(fd);
}
} // namespace rr