blob: cd144d926b63a1229366d35c8f69f1096060da69 [file] [log] [blame]
/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#ifndef RR_TASK_H_
#define RR_TASK_H_
#include <memory>
#include <vector>
#include <unordered_map>
#include "preload/preload_interface.h"
#include "AddressSpace.h"
#include "Event.h"
#include "ExtraRegisters.h"
#include "FdTable.h"
#include "PerfCounters.h"
#include "Registers.h"
#include "TaskishUid.h"
#include "ThreadGroup.h"
#include "TraceStream.h"
#include "WaitStatus.h"
#include "core.h"
#include "kernel_abi.h"
#include "kernel_supplement.h"
#include "remote_code_ptr.h"
#include "util.h"
struct syscallbuf_hdr;
struct syscallbuf_record;
namespace rr {
class AutoRemoteSyscalls;
class RecordSession;
class ReplaySession;
class ScopedFd;
class Session;
class ThreadGroup;
/**
 * Which resources a cloned child shares with its parent; passed to the
 * clone machinery. These are rr-level semantics, not raw kernel CLONE_* bits.
 */
enum CloneFlags {
  /**
   * The child gets a semantic copy of all parent resources (and
   * becomes a new thread group). This is the semantics of the
   * fork() syscall.
   */
  CLONE_SHARE_NOTHING = 0,
  /**
   * Child will share the table of signal dispositions with its
   * parent.
   */
  CLONE_SHARE_SIGHANDLERS = 1 << 0,
  /** Child will join its parent's thread group. */
  CLONE_SHARE_THREAD_GROUP = 1 << 1,
  /** Child will share its parent's address space. */
  CLONE_SHARE_VM = 1 << 2,
  /** Child will share its parent's file descriptor table. */
  CLONE_SHARE_FILES = 1 << 3,
  /** Kernel will clear and notify tid futex on task exit. */
  CLONE_CLEARTID = 1 << 4,
  /** Set the thread area to what's specified by the |tls| arg. */
  CLONE_SET_TLS = 1 << 5,
};
/**
 * Enumeration of ways to resume execution. See the ptrace manual for
 * details of the semantics of these.
 *
 * We define a new datatype because the PTRACE_SYSEMU* requests aren't
 * part of the official ptrace API, and we want to use a strong type
 * for these resume requests to ensure callers don't confuse their
 * arguments.
 */
enum ResumeRequest {
  RESUME_CONT = PTRACE_CONT,
  RESUME_SINGLESTEP = PTRACE_SINGLESTEP,
  RESUME_SYSCALL = PTRACE_SYSCALL,
  // The SYSEMU variants come from NativeArch because, as noted above,
  // they are not part of the official ptrace API.
  RESUME_SYSEMU = NativeArch::PTRACE_SYSEMU,
  RESUME_SYSEMU_SINGLESTEP = NativeArch::PTRACE_SYSEMU_SINGLESTEP,
};
/** Whether (and how) to wait for the tracee after resuming it. */
enum WaitRequest {
  // Don't wait after resuming.
  RESUME_NONBLOCKING,
  // After resuming, blocking-waitpid() until tracee status
  // changes.
  RESUME_WAIT,
  // Like RESUME_WAIT, but we're not expecting a PTRACE_EVENT_EXIT
  // or reap, so return false also in that case.
  RESUME_WAIT_NO_EXIT
};
/**
 * Tick-period request passed to resume_execution(): either one of the
 * special negative values below, or a positive tick count.
 */
enum TicksRequest {
  // We don't expect to see any ticks (though we seem to on the odd buggy
  // system...). Using this is a small performance optimization because we don't
  // have to stop and restart the performance counters. This may also avoid
  // bugs on some systems that report performance counter advances while
  // in the kernel...
  RESUME_NO_TICKS = -2,
  // Run without programming any tick-count interrupt.
  RESUME_UNLIMITED_TICKS = -1,
  // Positive values are a request for an interrupt
  // after that number of ticks
  // Don't request more than this!
  MAX_TICKS_REQUEST = 2000000000,
};
/** Reasons why a SIGTRAP might have been delivered. Multiple reasons can
 * apply. Also, none can apply, e.g. if someone sent us a SIGTRAP via kill().
 * Produced by Task::compute_trap_reasons().
 */
struct TrapReasons {
  /* Singlestep completed (RESUME_SINGLESTEP, RESUME_SYSEMU_SINGLESTEP). */
  bool singlestep;
  /* Hardware watchpoint fired. This includes cases where the actual values
   * did not change (i.e. AddressSpace::has_any_watchpoint_changes may return
   * false even though this is set). */
  bool watchpoint;
  /* Breakpoint instruction was executed. */
  bool breakpoint;
};
/**
 * State of a tracee's rseq registration: the registered rseq area and the
 * abort-prefix signature. (Presumably the signature that must precede a
 * valid rseq abort handler, per rseq(2) — confirm against usage.)
 */
struct RseqState {
  // Address of the tracee's registered rseq area.
  remote_ptr<void> ptr;
  uint32_t abort_prefix_signature;
  RseqState(remote_ptr<void> ptr, uint32_t abort_prefix_signature)
      : ptr(ptr), abort_prefix_signature(abort_prefix_signature) {}
};
/**
* A "task" is a task in the linux usage: the unit of scheduling. (OS
* people sometimes call this a "thread control block".) Multiple
* tasks may share the same address space and file descriptors, in
* which case they're commonly called "threads". Or two tasks may
* have their own address spaces and file descriptors, in which case
* they're called "processes". Both look the same to rr (on linux),
* so no distinction is made here.
*/
class Task {
friend class Session;
friend class RecordSession;
friend class ReplaySession;
public:
typedef std::vector<WatchConfig> HardwareWatchpoints;
/**
* Ptrace-detach the task.
*/
void detach();
/*
* Re-enable the CPUID instruction in this task (if it was previously
* disabled to support CPUID emulation) as well as the use of rdtsc.
*/
void reenable_cpuid_tsc();
/**
* Wait for the task to exit, but do not reap/detach yet.
*/
void wait_exit();
/**
* Advance the task to its exit state if it's not already there.
* If `wait` is false, then during recording Scheduler::start() must be
* called.
*/
void proceed_to_exit(bool wait = true);
/**
* Kill this task and wait for it to exit.
* N.B.: If may_reap() is false, this may hang.
* Returns the WaitStatus of the task at exit (usually SIGKILL, but may not
* be if we raced with another exit reason).
*/
WaitStatus kill();
/**
* This must be in an emulated syscall, entered through
* |cont_sysemu()| or |cont_sysemu_singlestep()|, but that's
* not checked. If so, step over the system call instruction
* to "exit" the emulated syscall.
*/
void finish_emulated_syscall();
/**
 * Return the number of bytes currently live in the syscall buffer:
 * the recorded-bytes count read from the tracee's syscallbuf header,
 * plus the header itself.
 */
size_t syscallbuf_data_size() {
  const auto recorded_bytes =
      read_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes));
  return recorded_bytes + sizeof(struct syscallbuf_hdr);
}
/**
* Dump attributes of this process, including pending events,
* to |out|, which defaults to LOG_FILE.
*/
void dump(FILE* out = nullptr) const;
/**
* Called after the first exec in a session, when the session first
* enters a consistent state. Prior to that, the task state
* can vary based on how rr set up the child process. We have to flush
* out any state that might have been affected by that.
*/
void flush_inconsistent_state();
/**
 * Return the total number of ticks executed by this task, as currently
 * accumulated in `ticks`.
 * NOTE(review): this accessor itself does not read the performance
 * counters; presumably `ticks` is brought up to date elsewhere (e.g.
 * when the task stops) — confirm against did_waitpid()/PerfCounters.
 */
Ticks tick_count() { return ticks; }
/**
* Return the path of this fd as /proc/<pid>/fd/<fd>
*/
std::string proc_fd_path(int fd);
/**
* Return the path of /proc/<pid>/pagemap
*/
std::string proc_pagemap_path();
/**
* Return the path of /proc/<pid>/stat
*/
std::string proc_stat_path();
/**
* Return the path of /proc/<pid>/exe
*/
std::string proc_exe_path();
/**
* Return the path of the executable (i.e. what
* /proc/<pid>/exe points to).
*/
std::string exe_path();
/**
* Stat |fd| in the context of this task's fd table.
*/
struct stat stat_fd(int fd);
/**
* Lstat |fd| in the context of this task's fd table.
*/
struct stat lstat_fd(int fd);
/**
* Open |fd| in the context of this task's fd table.
*/
ScopedFd open_fd(int fd, int flags);
/**
* Get the name of the file referenced by |fd| in the context of this
* task's fd table.
*/
std::string file_name_of_fd(int fd);
/**
* Get current offset of |fd|
*/
int64_t fd_offset(int fd);
/**
* Get pid of pidfd |fd|
*/
pid_t pid_of_pidfd(int fd);
/**
* Records the wait status of this task as |status|, e.g. if
* |wait()/try_wait()| has returned it. Call this whenever a waitpid
* returned activity for this task.
* If this returns false, then the task was kicked out of a ptrace-stop
* by SIGKILL or equivalent before we could read registers etc.
* We will treat this stop as if it never happened; the caller must
* act as if there was no stop.
* If `status.reaped()` (i.e. fatal signal or normal exit), this always
* returns true.
*/
bool did_waitpid(WaitStatus status);
/**
* Syscalls have side effects on registers (e.g. setting the flags register).
* Perform those side effects on |registers| to make it look like a syscall
* happened.
*/
void canonicalize_regs(SupportedArch syscall_arch);
/**
* Return the ptrace message pid associated with the current ptrace
* event, f.e. the new child's pid at PTRACE_EVENT_CLONE.
* Returns -1 if the ptrace returns ESRCH, i.e. the task is not in a
* ptrace-stop.
*/
pid_t get_ptrace_eventmsg_pid();
/**
* Return the siginfo at the signal-stop of this.
* Not meaningful unless this is actually at a signal stop.
*/
const siginfo_t& get_siginfo();
/**
* Destroy in the tracee task the scratch buffer and syscallbuf (if
* syscallbuf_child is non-null).
* Both the as_task and the fd_task must be able to execute remote syscalls
* and share the address space, resp. the file descriptor table with the
* current task. If either of these is null, the corresponding resource is
* not destroyed remote (e.g. if there are no other tasks left in the same
* address space or file descriptor table).
*/
void destroy_buffers(Task *as_task, Task *fd_task);
/** Convenience overload: this task plays both the address-space and
 * fd-table roles for the two-argument destroy_buffers(). */
void destroy_buffers() { destroy_buffers(this, this); }
void did_kill();
void unmap_buffers_for(
AutoRemoteSyscalls& remote, Task* t,
remote_ptr<struct syscallbuf_hdr> saved_syscallbuf_child);
/* Close fds related to `t`'s syscallbuf, in this task's fd table.
If `really_close` is true, actually close the kernel fds through `remote`,
otherwise only update our FdTable. */
void close_buffers_for(AutoRemoteSyscalls& remote, Task* t, bool really_close);
remote_ptr<const struct syscallbuf_record> next_syscallbuf_record();
long stored_record_size(remote_ptr<const struct syscallbuf_record> record);
/** Return the current $ip of this. */
remote_code_ptr ip() { return regs().ip(); }
/**
* Emulate a jump to a new IP, updating the ticks counter as appropriate.
*/
void emulate_jump(remote_code_ptr);
/**
 * Account for one unconditional direct branch by adding its tick cost,
 * as reported by PerfCounters, to this task's tick count.
 */
void count_direct_jump()
{
  ticks += PerfCounters::ticks_for_unconditional_direct_branch(this);
}
/**
* Return true if this is at an arm-desched-event or
* disarm-desched-event syscall.
*/
bool is_desched_event_syscall();
/**
* Return true when this task is in a traced syscall made by the
* syscallbuf code. Callers may assume |is_in_syscallbuf()|
* is implied by this. Note that once we've entered the traced syscall,
* ip() is immediately after the syscall instruction.
*/
bool is_in_traced_syscall() {
return ip() ==
as->traced_syscall_ip().increment_by_syscall_insn_length(
arch()) ||
ip() ==
as->privileged_traced_syscall_ip()
.increment_by_syscall_insn_length(arch());
}
bool is_at_traced_syscall_entry() {
return ip() == as->traced_syscall_ip() ||
ip() == as->privileged_traced_syscall_ip();
}
/**
* Return true when this task is in an untraced syscall, i.e. one
* initiated by a function in the syscallbuf. Callers may
* assume |is_in_syscallbuf()| is implied by this. Note that once we've
* entered the traced syscall, ip() is immediately after the syscall
* instruction.
*/
bool is_in_untraced_syscall() {
const AddressSpace::SyscallType *t;
if (arch() == aarch64 && stop_sig() > 0) {
// On aarch64 we can't distinguish untraced syscall entry and exit
// when a signal happened
t = AddressSpace::rr_page_syscall_from_entry_point(arch(), ip());
} else {
t = AddressSpace::rr_page_syscall_from_exit_point(arch(), ip());
}
return t && t->traced == AddressSpace::UNTRACED;
}
bool is_in_rr_page() {
auto p = ip().to_data_ptr<void>();
return AddressSpace::rr_page_start() <= p &&
p < AddressSpace::rr_page_end();
}
/**
* Return true if |ptrace_event()| is the trace event
* generated by the syscallbuf seccomp-bpf when a traced
* syscall is entered.
*/
bool is_ptrace_seccomp_event() const;
/** Dump all pending events to the RecordTask INFO log. */
virtual void log_pending_events() const {}
/**
* Call this hook just before exiting a syscall. Often Task
* attributes need to be updated based on the finishing syscall.
* Use 'regs' instead of this->regs() because some registers may not be
* set properly in the task yet.
*/
virtual void on_syscall_exit(int syscallno, SupportedArch arch,
const Registers& regs);
/**
* Hook called by `resume_execution`.
* Returns `false` if the task is in the process of dying and setup could not
* be completed, `true` otherwise.
*/
virtual bool will_resume_execution(ResumeRequest, WaitRequest, TicksRequest,
int /*sig*/) { return true; }
/**
* Hook called by `did_waitpid`.
*/
virtual void did_wait() {}
/**
* Return the pid of the task in its own pid namespace.
* Only RecordTasks actually change pid namespaces, but
* this value is stored and present during replay too.
*/
pid_t own_namespace_tid() { return own_namespace_rec_tid; }
/**
* Assuming ip() is just past a breakpoint instruction, adjust
* ip() backwards to point at that breakpoint insn.
*/
void move_ip_before_breakpoint();
/**
* Assuming we've just entered a syscall, exit that syscall and reset
* state to reenter the syscall just as it was called the first time.
* Returns false if we see the process exit instead.
*/
bool exit_syscall_and_prepare_restart();
/**
* We're currently in user-space with registers set up to perform a system
* call. Continue into the kernel and stop where we can modify the syscall
* state.
* Return `true` if the syscall entry succeeded.
* Return `false` if the tracee exited unexpectedly.
*/
bool enter_syscall(bool allow_exit=false);
/**
* We have observed entry to a syscall (either by PTRACE_EVENT_SECCOMP or
* a syscall, depending on the value of Session::syscall_seccomp_ordering()).
* Continue into the kernel to perform the syscall and stop at the
* PTRACE_SYSCALL syscall-exit trap. Returns false if we see the process exit
* before that; we may or may not be stopped in that case.
*/
bool exit_syscall();
/**
* Return the "task name"; i.e. what |prctl(PR_GET_NAME)| or
* /proc/tid/comm say that the task's name is.
*
* During recording we don't monitor changes to this, we just let
* the kernel update it directly. This lets us syscall-buffer PR_SET_NAME.
* During replay we monitor changes to this and cache the name in ReplayTask,
* hence these methods are virtual. During replay the task's actual name
* is "rr:" followed by the original name.
*/
virtual std::string name() const;
virtual void set_name(AutoRemoteSyscalls& remote, const std::string& name);
/**
* Called for every PR_SET_NAME during replay but not always during recording
* (it is not called for syscall-buffered PR_SET_NAME).
*/
virtual void did_prctl_set_prname(remote_ptr<void>) {}
/**
* Call this method when this task has just performed an |execve()|
* (so we're in the new address space), but before the system call has
* returned.
* `exe_file` is the name of the executable file in the trace, if there is one,
* otherwise the original exe file name --- a best-effort filename we can
* pass to gdb for it to read the exe.
*/
void post_exec(const std::string& exe_file);
/**
* Call this method when this task has exited a successful execve() syscall.
* At this point it is safe to make remote syscalls.
* `original_exe_file` is the original file exe file name.
*/
void post_exec_syscall(const std::string& original_exe_file);
/**
* Return true if this task has execed.
*/
bool execed() const;
/**
* Return true if this task is dead and just waiting to be reaped.
*/
virtual bool already_exited() const { return false; }
virtual bool is_detached_proxy() const { return false; }
/**
 * Read |N| bytes from |child_addr| into |buf|, or don't return
 * (read_bytes_helper asserts on failure when no |ok| out-param is given).
 */
template <size_t N>
void read_bytes(remote_ptr<void> child_addr, uint8_t (&buf)[N]) {
  // read_bytes_helper returns void; don't forward a void "return value".
  read_bytes_helper(child_addr, N, buf);
}
/** Return the current regs of this. */
const Registers& regs() const;
/** Return the extra registers of this. Asserts if the task died. */
const ExtraRegisters& extra_regs();
/** Return the extra registers of this, or null if the task died. */
const ExtraRegisters* extra_regs_fallible();
/** Return the current arch of this. This can change due to exec(). */
SupportedArch arch() const {
// Use 'registers' directly instead of calling regs(), since this can
// be called while the task is not stopped.
return registers.arch();
}
/**
* Return the debug status (DR6 on x86). The debug status is always cleared
* in resume_execution() before we resume, so it always only reflects the
* events since the last resume. Must not be called on non-x86 architectures.
*/
uintptr_t x86_debug_status();
/**
* Set the debug status (DR6 on x86). Noop on non-x86 architectures.
*/
void set_x86_debug_status(uintptr_t status);
/**
* Determine why a SIGTRAP occurred. On x86, uses x86_debug_status() but doesn't
* consume it.
*/
TrapReasons compute_trap_reasons();
/**
* Called on syscall entry to save any registers that we need to keep, but
* cannot get from the kernel (r.g. orig_x0 on aarch64).
*/
void apply_syscall_entry_regs();
/**
 * Read |val| from |child_addr|.
 * If the data can't all be read, then if |ok| is non-null
 * sets *ok to false, otherwise asserts.
 */
template <typename T>
T read_mem(remote_ptr<T> child_addr, bool* ok = nullptr) {
  // Strip cv-qualifiers so we can declare a writable local of the
  // pointee's type.
  typename std::remove_cv<T>::type result;
  read_bytes_helper(child_addr, sizeof(result), &result, ok);
  return result;
}
/**
 * Read |count| values from |child_addr|.
 * If the data can't all be read, then if |ok| is non-null
 * sets *ok to false, otherwise asserts.
 */
template <typename T>
std::vector<T> read_mem(remote_ptr<T> child_addr, size_t count,
                        bool* ok = nullptr) {
  std::vector<T> result(count);
  read_bytes_helper(child_addr, sizeof(T) * count, result.data(), ok);
  return result;
}
/**
* Read and return the C string located at |child_addr| in
* this address space. If the data can't all be read (because the c string to
* be read is invalid), then if |ok| is non-null, sets *ok to
* false, otherwise asserts.
*/
std::string read_c_str(remote_ptr<char> child_addr, bool *ok = nullptr);
/**
* Resume execution |how|, delivering |sig| if nonzero.
* After resuming, |wait_how|. In replay, reset hpcs and
* request a tick period of tick_period. The default value
* of tick_period is 0, which means effectively infinite.
* If interrupt_after_elapsed is nonzero, we interrupt the task
* after that many seconds have elapsed.
*
* All tracee execution goes through here.
*
* If `wait_how` == RESUME_WAIT and we don't complete a
* did_waitpid() (e.g. because the tracee was SIGKILLed or
* equivalent), this returns false.
*/
bool resume_execution(ResumeRequest how, WaitRequest wait_how,
TicksRequest tick_period, int sig = 0);
/** Return the session this is part of. */
Session& session() const { return *session_; }
/** Set the tracee's registers to |regs|. Lazy. */
void set_regs(const Registers& regs);
/** Ensure registers are flushed back to the underlying task. */
void flush_regs();
/** Set the tracee's extra registers to |regs|. */
void set_extra_regs(const ExtraRegisters& regs);
/** Adjust IP for rseq abort if necessary and return true if an abort is required.
* Sets *rseq_cs_invalid if it was invalid */
bool should_apply_rseq_abort(EventType event_type, remote_code_ptr* new_ip,
bool* invalid_rseq_cs);
/**
* Read the aarch64 TLS register via ptrace. Returns true on success, false
* on failure. On success `result` is set to the tracee's TLS register.
* This can only fail when ptrace_if_stopped fails, i.e. the tracee
* is on the exit path due to a SIGKILL or equivalent.
*/
bool read_aarch64_tls_register(uintptr_t *result);
void set_aarch64_tls_register(uintptr_t val);
/**
* Program the debug registers to the vector of watchpoint
* configurations in |reg| (also updating the debug control
* register appropriately). Return true if all registers were
* successfully programmed, false otherwise. Any time false
* is returned, the caller is guaranteed that no watchpoint
* has been enabled; either all of |regs| is enabled and true
* is returned, or none are and false is returned.
*/
bool set_debug_regs(const HardwareWatchpoints& watchpoints);
bool set_aarch64_debug_regs(int which, ARM64Arch::user_hwdebug_state *regs, size_t nregs);
bool get_aarch64_debug_regs(int which, ARM64Arch::user_hwdebug_state *regs);
uintptr_t get_debug_reg(size_t regno);
bool set_x86_debug_reg(size_t regno, uintptr_t value);
/** Update the thread area to |addr|. */
void set_thread_area(remote_ptr<X86Arch::user_desc> tls);
/** Set the thread area at index `idx` to desc and reflect this
* into the OS task. Returns 0 on success, errno otherwise.
*/
int emulate_set_thread_area(int idx, X86Arch::user_desc desc);
/** Get the thread area from the remote process.
* Returns 0 on success, errno otherwise.
*/
int emulate_get_thread_area(int idx, X86Arch::user_desc& desc);
/** Return the x86 TLS thread areas; only valid on x86/x86_64
 * (asserted below). */
const std::vector<X86Arch::user_desc>& thread_areas() {
  DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
  return thread_areas_;
}
/** Record |status| as this task's current wait status. */
void set_status(WaitStatus status) { wait_status = status; }
/**
* Return true when the task stopped for a ptrace-stop and we
* haven't resumed it yet.
*/
bool is_stopped() const { return is_stopped_; }
/**
* Setter for `is_stopped_` to update `Scheduler::ntasks_stopped`.
*/
virtual void set_stopped(bool stopped) { is_stopped_ = stopped; }
/**
* Return the status of this as of the last successful wait()/try_wait() call.
*/
WaitStatus status() const { return wait_status; }
/**
* Return the ptrace event as of the last call to |wait()/try_wait()|.
*/
int ptrace_event() const { return wait_status.ptrace_event(); }
/**
* Return the signal that's pending for this as of the last
* call to |wait()/try_wait()|. The signal 0 means "no signal".
*/
int stop_sig() const { return wait_status.stop_sig(); }
/** Forget the stored wait status (reset it to a default WaitStatus). */
void clear_wait_status() { wait_status = WaitStatus(); }
/** Return the thread group this belongs to. */
std::shared_ptr<ThreadGroup> thread_group() const { return tg; }
/** Return the id of this task's recorded thread group. */
pid_t tgid() const;
/** Return id of real OS thread group. */
pid_t real_tgid() const;
/** Return the unique id of this task, built from rec_tid and serial. */
TaskUid tuid() const { return TaskUid(rec_tid, serial); }
/** Return the dir of the trace we're using. */
const std::string& trace_dir() const;
/**
* Get the current "time" measured as ticks on recording trace
* events. |task_time()| returns that "time" wrt this task
* only.
*/
uint32_t trace_time() const;
/**
* Call this to reset syscallbuf_hdr->num_rec_bytes and zero out the data
* recorded in the syscall buffer. This makes for more deterministic behavior
* especially during replay, where during checkpointing we only save and
* restore the recorded data area.
*/
void reset_syscallbuf();
/**
* Return the virtual memory mapping (address space) of this
* task.
*/
AddressSpace::shr_ptr vm() { return as; }
/** Return this task's file descriptor table. */
FdTable::shr_ptr fd_table() { return fds; }
/**
* Block until the status of this changes. wait() expects the wait to end
* with the process in a stopped() state. If interrupt_after_elapsed >= 0,
* interrupt the task after that many seconds have elapsed. If
* interrupt_after_elapsed == 0.0, the interrupt will happen immediately.
* Returns false if the wait failed because we reached a stop but we got
* SIGKILLed (or equivalent) out of it, in which case it is not safe to wait
* because that might block indefinitely waiting for us to acknowledge the
* PTRACE_EVENT_EXIT of other tasks.
*/
bool wait(double interrupt_after_elapsed = -1);
/**
* Currently we don't allow recording across uid changes, so we can
* just return rr's uid.
*/
uid_t getuid() { return ::getuid(); }
/**
* Write |N| bytes from |buf| to |child_addr|, or don't return.
*/
template <size_t N>
void write_bytes(remote_ptr<void> child_addr, const uint8_t (&buf)[N]) {
write_bytes_helper(child_addr, N, buf);
}
enum WriteFlags {
IS_BREAKPOINT_RELATED = 0x1,
};
/**
 * Write |val| to |child_addr|.
 * |flags| is bits from WriteFlags; failure reporting follows the |ok|
 * convention used by write_bytes_helper.
 */
template <typename T>
void write_mem(remote_ptr<T> child_addr, const T& val, bool* ok = nullptr,
               uint32_t flags = 0) {
  // Only safe for types with no padding holes (no uninitialized bytes).
  DEBUG_ASSERT(type_has_no_holes<T>());
  const void* source = static_cast<const void*>(&val);
  write_bytes_helper(child_addr, sizeof(val), source, ok, flags);
}
/**
* This is not the helper you're looking for. See above: you
* probably accidentally wrote |write_mem(addr, &foo)| when
* you meant |write_mem(addr, foo)|.
*/
template <typename T>
void write_mem(remote_ptr<T> child_addr, const T* val) = delete;
/** Write |count| values starting at |val| to |child_addr|. */
template <typename T>
void write_mem(remote_ptr<T> child_addr, const T* val, int count,
               bool* ok = nullptr) {
  // Only safe for types with no padding holes (no uninitialized bytes).
  DEBUG_ASSERT(type_has_no_holes<T>());
  const void* source = static_cast<const void*>(val);
  write_bytes_helper(child_addr, sizeof(*val) * count, source, ok);
}
uint64_t write_ranges(const std::vector<FileMonitor::Range>& ranges,
void* data, size_t size);
/**
* Writes zeroes to the given memory range.
* For efficiency tries using MADV_REMOVE via `remote`. Caches
* an AutoRemoteSyscalls in `*remote`.
*/
void write_zeroes(std::unique_ptr<AutoRemoteSyscalls>* remote, remote_ptr<void> addr, size_t size);
/**
* Don't use these helpers directly; use the safer and more
* convenient variants above.
*
* Read/write the number of bytes that the template wrapper
* inferred.
*/
ssize_t read_bytes_fallible(remote_ptr<void> addr, ssize_t buf_size,
void* buf);
/**
* If the data can't all be read, then if |ok| is non-null, sets *ok to
* false, otherwise asserts.
*/
void read_bytes_helper(remote_ptr<void> addr, ssize_t buf_size, void* buf,
bool* ok = nullptr);
/**
* |flags| is bits from WriteFlags.
*/
void write_bytes_helper(remote_ptr<void> addr, ssize_t buf_size,
const void* buf, bool* ok = nullptr,
uint32_t flags = 0);
/**
* |flags| is bits from WriteFlags.
* Returns number of bytes written.
*/
ssize_t write_bytes_helper_no_notifications(remote_ptr<void> addr, ssize_t buf_size,
const void* buf, bool* ok = nullptr,
uint32_t flags = 0);
SupportedArch detect_syscall_arch();
/**
* Call this when performing a clone syscall in this task. Returns
* true if the call completed, false if it was interrupted and
* needs to be resumed. When the call returns true, the task is
* stopped at a PTRACE_EVENT_CLONE or PTRACE_EVENT_FORK.
*/
bool clone_syscall_is_complete(pid_t* new_pid, SupportedArch syscall_arch);
/**
* Called when SYS_rrcall_init_preload has happened.
*/
virtual void at_preload_init();
/**
* Open /proc/[tid]/mem fd for our AddressSpace, closing the old one
* first. If necessary we force the tracee to open the file
* itself and smuggle the fd back to us.
* Returns false if the process no longer exists.
*/
bool open_mem_fd();
/**
* Calls open_mem_fd if this task's AddressSpace doesn't already have one.
*/
void open_mem_fd_if_needed();
/**
* Open /proc/[tid]/pagemap fd for our AddressSpace.
*/
ScopedFd& pagemap_fd();
/**
* Perform a PTRACE_INTERRUPT and set up the counter for potential spurious stops
* to be detected in `account_for_potential_ptrace_interrupt_stop`.
* Returns true if it succeeded, false if we got ESRCH (i.e. the tracee has
* disappeared or is not being ptraced; PTRACE_INTERRUPT doesn't require the
* tracee to be stopped).
*/
bool do_ptrace_interrupt();
/**
* Sometimes we use PTRACE_INTERRUPT to kick the tracee out of various
* undesirable states. Unfortunately, that can (but need not) result in later
* undesired GROUP-STOP-SIGTRAP stops which report the PTRACE_INTERRUPT.
* This function may be called when examining stops to account for any
* such spurious stops.
*
* Should be called at exactly once for every ptrace stop.
*
* Returns true if the stop is caused by a PTRACE_INTERRUPT we know about,
* false otherwise.
*/
bool account_for_potential_ptrace_interrupt_stop(WaitStatus status);
/* Imagine that task A passes buffer |b| to the read()
* syscall. Imagine that, after A is switched out for task B,
* task B then writes to |b|. Then B is switched out for A.
* Since rr doesn't schedule the kernel code, the result is
* nondeterministic. To avoid that class of replay
* divergence, we "redirect" (in)outparams passed to may-block
* syscalls, to "scratch memory". The kernel writes to
* scratch deterministically, and when A (in the example
* above) exits its read() syscall, rr copies the scratch data
* back to the original buffers, serializing A and B in the
* example above.
*
* Syscalls can "nest" due to signal handlers. If a syscall A
* is interrupted by a signal, and the sighandler calls B,
* then we can have scratch buffers set up for args of both A
* and B. In linux, B won't actually re-enter A; A is exited
* with a "will-restart" error code and its args are saved for
* when (or if) it's restarted after the signal. But that
* doesn't really matter wrt scratch space. (TODO: in the
* future, we may be able to use that fact to simplify
* things.)
*
* Because of nesting, at first blush it seems we should push
* scratch allocations onto a stack and pop them as syscalls
* (or restarts thereof) complete. But under a critical
* assumption, we can actually skip that. The critical
* assumption is that the kernel writes its (in)outparams
* atomically wrt signal interruptions, and only writes them
* on successful exit. Each syscall will complete in stack
* order, and it's invariant that the syscall processors must
* only write back to user buffers *only* the data that was
* written by the kernel. So as long as the atomicity
* assumption holds, the completion of syscalls higher in the
* event stack may overwrite scratch space, but the completion
* of each syscall will overwrite those overwrites again, and
* that over-overwritten data is exactly and only what we'll
* write back to the tracee.
*
* |scratch_ptr| points at the mapped address in the child,
* and |size| is the total available space. */
remote_ptr<void> scratch_ptr;
/* The full size of the scratch buffer.
* The last page of the scratch buffer is used as an alternate stack
* for the syscallbuf code. So the usable size is less than this.
*/
ssize_t scratch_size;
/* The child's desched counter event fd number */
int desched_fd_child;
/* The child's cloned_file_data_fd */
int cloned_file_data_fd_child;
/* The filename opened by the child's cloned_file_data_fd */
std::string cloned_file_data_fname;
// Current rseq state if registered
std::unique_ptr<RseqState> rseq_state;
PerfCounters hpc;
/* This is always the "real" tid of the tracee. For a detached proxy,
* it's the proxy tid. */
pid_t tid;
/* This is always the recorded tid of the tracee. During
* recording, it's synonymous with |tid|, and during replay
* it's the tid that was recorded. For a detached proxy,
* this is the tid of the detachd process. */
pid_t rec_tid;
/* This is the recorded tid of the tracee *in its own pid namespace*. */
pid_t own_namespace_rec_tid;
size_t syscallbuf_size;
/* Points at the tracee's mapping of the buffer. */
remote_ptr<struct syscallbuf_hdr> syscallbuf_child;
remote_ptr<struct preload_globals> preload_globals;
typedef uint8_t ThreadLocals[PRELOAD_THREAD_LOCALS_SIZE];
ThreadLocals thread_locals;
size_t usable_scratch_size() {
return std::max<ssize_t>(0, scratch_size - page_size());
}
/** Return the top of the syscallbuf alternate stack (the end of the
 * scratch buffer), or a null pointer if no scratch buffer exists. */
remote_ptr<void> syscallbuf_alt_stack() {
  if (scratch_ptr.is_null()) {
    return remote_ptr<void>();
  }
  return scratch_ptr + scratch_size;
}
void setup_preload_thread_locals();
void setup_preload_thread_locals_from_clone(Task* origin);
// Fetch this task's preload thread-locals from the tracee.
// (NOTE(review): a former `fetch_full` parameter controlling how much of
// stub_scratch_2 is fetched on aarch64 no longer exists in this
// declaration; see the implementation for current behavior.)
const ThreadLocals& fetch_preload_thread_locals();
void activate_preload_thread_locals();
/**
 * Value snapshot of the per-task state enumerated below (registers,
 * syscallbuf/scratch layout, ids, TLS, ...).
 * NOTE(review): presumably captured from one Task and restored into
 * another (e.g. across clone/fork) — confirm against callers in the
 * Session/Task implementation.
 */
struct CapturedState {
  Ticks ticks;
  Registers regs;
  ExtraRegisters extra_regs;
  std::string prname;
  uintptr_t fdtable_identity;
  remote_ptr<struct syscallbuf_hdr> syscallbuf_child;
  size_t syscallbuf_size;
  size_t num_syscallbuf_bytes;
  remote_ptr<struct preload_globals> preload_globals;
  remote_ptr<void> scratch_ptr;
  ssize_t scratch_size;
  remote_ptr<void> top_of_stack;
  std::unique_ptr<RseqState> rseq_state;
  uint64_t cloned_file_data_offset;
  ThreadLocals thread_locals;
  pid_t rec_tid;
  pid_t own_namespace_rec_tid;
  uint32_t serial;
  ThreadGroupUid tguid;
  int desched_fd_child;
  int cloned_file_data_fd_child;
  std::string cloned_file_data_fname;
  WaitStatus wait_status;
  // TLS state (architecture specific)
  // On x86_64 the tls register is part of the general register state (%fs)
  // On x86 thread_areas is used
  // on aarch64, tls_register is used
  uintptr_t tls_register;
  std::vector<X86Arch::user_desc> thread_areas;
};
/**
* Lock or unlock the syscallbuf to prevent the preload library from using it.
* Only has an effect if the syscallbuf has been initialized.
*/
void set_syscallbuf_locked(bool locked);
/**
 * Disable (or re-enable) syscall buffering during diversions.
 * Writes the flag into the tracee's preload globals when they have been
 * initialized, and locks/unlocks the syscallbuf to match.
 */
void set_in_diversion(bool in_diversion) {
  if (preload_globals) {
    // static_cast instead of a C-style cast; the tracee-side field is a
    // single byte.
    write_mem(REMOTE_PTR_FIELD(preload_globals, in_diversion),
              static_cast<unsigned char>(in_diversion));
  }
  set_syscallbuf_locked(in_diversion);
}
/**
 * Executes a ptrace() call that expects the task to be in a ptrace-stop.
 * Errors other than ESRCH are treated as fatal (those are rr bugs).
 * Only call this when `Task::is_stopped_`.
 * Even when `is_stopped_` is true, this can return false because the kernel
 * could have pushed the task out of the ptrace-stop due to SIGKILL or
 * equivalent (such as `zap_pid_ns_processes`).
 *
 * Returns true when the ptrace call was issued while the task was still in
 * its ptrace-stop.
 *
 * So when this returns false, one of the following is true:
 * * The tracee is executing towards its PTRACE_EVENT_EXIT stop. This
 * happens concurrently with rr so it may enter that stop at any time.
 * But it can also be indefinitely delayed before reaching the exit stop,
 * e.g. waiting in `zap_pid_ns_processes`.
 * * In older kernels (before 9a95f78eab70deeb5a4c879c19b841a6af5b66e7)
 * it is possible for a tracee stopped in PTRACE_EVENT_EXIT to be kicked
 * out of that stop by another SIGKILL. In that case it is executing towards
 * or has actually reached the zombie state. In old kernels it can be
 * blocked indefinitely from reaching the zombie state due to coredumping.
 *
 * In either of these cases, the tracee has been killed via SIGKILL or equivalent
 * and will not execute user code or system calls again. We can assume
 * its registers won't change again. It won't handle any more signals.
 */
bool ptrace_if_stopped(int request, remote_ptr<void> addr, void* data);
/**
 * Make the ptrace |request| with |addr| and |data|, return
 * the ptrace return value. Just a very thin wrapper around the syscall.
 */
long fallible_ptrace(int request, remote_ptr<void> addr, void* data);
/** True when a PTRACE_EVENT_EXIT has been observed in the wait_status for
 * this task (see seen_ptrace_exit_event_ below). */
bool seen_ptrace_exit_event() const {
return seen_ptrace_exit_event_;
}
/** Record that the PTRACE_EVENT_EXIT for this task has been handled
 * (see handled_ptrace_exit_event_ below). */
void did_handle_ptrace_exit_event();
/** Address at which we last resumed execution of this task. */
remote_code_ptr last_execution_resume() const {
return address_of_last_execution_resume;
}
/** True when this task has been reaped (see was_reaped_ below). */
bool was_reaped() const {
return was_reaped_;
}
/** True when the PTRACE_EVENT_EXIT has been handled for this task
 * (see handled_ptrace_exit_event_ below). */
bool handled_ptrace_exit_event() const {
return handled_ptrace_exit_event_;
}
// Presumably replaces this task's image by exec'ing `filename` for the
// given architecture — confirm against the definition in Task.cc.
void os_exec(SupportedArch arch, std::string filename);
/** Exec the stub executable for `arch` (located via find_exec_stub). */
void os_exec_stub(SupportedArch arch) {
os_exec(arch, find_exec_stub(arch));
}
/**
 * Try to make the current task look exactly like the `task` passed in,
 * by copying that task's address space and other relevant properties,
 * but without using the os's clone system call.
 */
void dup_from(Task *task);
// Virtual destructor: Task is used polymorphically (see the virtual
// methods below).
virtual ~Task();
/**
 * Fork and exec the initial task. If something goes wrong later
 * (i.e. an exec does not occur before an exit), an error may be
 * readable from the other end of the pipe whose write end is error_fd.
 *
 * `rec_tid`, if >= 0, presumably forces the recorded tid of the new task
 * (used during replay) — confirm against the definition.
 */
static Task* spawn(Session& session, ScopedFd& error_fd,
ScopedFd* sock_fd_out,
ScopedFd* sock_fd_receiver_out,
int* tracee_socket_fd_number_out,
const std::string& exe_path,
const std::vector<std::string>& argv,
const std::vector<std::string>& envp, pid_t rec_tid = -1);
/**
 * Do PTRACE_SEIZE on this tid with the correct ptrace options.
 */
static long ptrace_seize(pid_t tid, Session& session);
/**
 * Do a tgkill to send a specific signal to this task.
 */
void tgkill(int sig);
/**
 * Try to move this task to a signal stop by signaling it with the
 * syscallbuf desched signal (which is guaranteed not to be blocked).
 * Returns false if the task exited unexpectedly.
 */
bool move_to_signal_stop();
// A map from original table to (potentially detached) clone, to preserve
// FdTable sharing relationships during a session fork.
using ClonedFdTables = std::unordered_map<uintptr_t, FdTable::shr_ptr>;
/**
 * Just forget that this Task exists. Another rr process will manage it.
 */
void forget();
// Used on aarch64 to detect whether we've recorded x0 and x8 on syscall entry
// (this comment applies to the two fields below).
Ticks ticks_at_last_syscall_entry;
remote_code_ptr ip_at_last_syscall_entry;
// Whether the syscall entry corresponding to `{ticks,ip}_at_last_syscall_entry`
// has been recorded in the trace
// (used to avoid double recording on unexpected exit)
bool last_syscall_entry_recorded;
protected:
/**
 * Construct a Task bound to `session`. `tid` is the real OS tid and
 * `rec_tid` the recorded tid (presumably equal during recording —
 * confirm); `serial` is the task serial number and `a` the architecture.
 */
Task(Session& session, pid_t tid, pid_t rec_tid, uint32_t serial,
SupportedArch a);
enum CloneReason {
// Cloning a task in the same session due to tracee fork()/vfork()/clone()
TRACEE_CLONE,
// Cloning a task into a new session as the leader for a checkpoint
SESSION_CLONE_LEADER,
// Cloning a task into the same session to recreate threads while
// restoring a checkpoint
SESSION_CLONE_NONLEADER,
};
/**
 * Return a new Task cloned from this task. |flags| are a set of
 * CloneFlags (see above) that determine which resources are
 * shared or copied to the new child. |new_tid| is the tid
 * assigned to the new task by the kernel. |new_rec_tid| is
 * only relevant to replay, and is the pid that was assigned
 * to the task during recording.
 * |other_session|, if non-null, is presumably the session the clone is
 * created into (otherwise this task's session) — confirm; |new_fds| and
 * |new_tg| optionally supply a pre-built fd table / thread group.
 */
virtual Task* clone(CloneReason reason, int flags, remote_ptr<void> stack,
remote_ptr<void> tls, remote_ptr<int> cleartid_addr,
pid_t new_tid, pid_t new_rec_tid, uint32_t new_serial,
Session* other_session = nullptr,
FdTable::shr_ptr new_fds = nullptr,
ThreadGroup::shr_ptr new_tg = nullptr);
/**
 * Internal method called after the first wait() during a clone().
 * Default implementation is a no-op; subclasses may override.
 */
virtual void post_wait_clone(Task*, int) {}
/**
 * Internal method called after the clone to fix up the new address space.
 */
virtual bool post_vm_clone(CloneReason reason, int flags, Task* origin);
// Architecture-specialized handling performed when a syscall exits;
// see the definition for details.
template <typename Arch>
void on_syscall_exit_arch(int syscallno, const Registers& regs);
/** Helper function for init_buffers. */
template <typename Arch> void init_buffers_arch(remote_ptr<void> map_hint);
/**
 * Grab state from this task into a structure that we can use to
 * initialize a new task via os_clone_into/os_fork_into and copy_state.
 */
CapturedState capture_state();
/**
 * Make this task look like an identical copy of the task whose state
 * was captured by capture_state(), in
 * every way relevant to replay. This task should have been
 * created by calling os_clone_into() or os_fork_into(),
 * and if it wasn't results are undefined.
 *
 * Some task state must be copied into this by injecting and
 * running syscalls in this task. Other state is metadata
 * that can simply be copied over in local memory.
 */
void copy_state(const CapturedState& state);
/**
 * Read tracee memory using PTRACE_PEEKDATA calls. Slow, only use
 * as fallback. Returns number of bytes actually read.
 */
ssize_t read_bytes_ptrace(remote_ptr<void> addr, ssize_t buf_size, void* buf);
/**
 * Write tracee memory using PTRACE_POKEDATA calls. Slow, only use
 * as fallback. Returns number of bytes actually written.
 */
ssize_t write_bytes_ptrace(remote_ptr<void> addr, ssize_t buf_size,
const void* buf);
/**
 * Try writing 'buf' to 'addr' by replacing pages in the tracee
 * address-space using a temporary file. This may work around PaX issues.
 */
bool try_replace_pages(remote_ptr<void> addr, ssize_t buf_size,
const void* buf);
/**
 * Map the syscallbuffer for this, shared with this process.
 * |map_hint| is the address where the syscallbuf is expected
 * to be mapped --- and this is asserted --- or nullptr if
 * there are no expectations.
 * Initializes syscallbuf_child.
 */
KernelMapping init_syscall_buffer(AutoRemoteSyscalls& remote,
remote_ptr<void> map_hint);
/**
 * Make the OS-level calls to create a new fork or clone that
 * will eventually be a copy of this task and return that Task
 * metadata. These methods are used in concert with
 * |Task::copy_state()| to create task copies during
 * checkpointing.
 *
 * For |os_fork_into()|, |session| will be tracking the
 * returned fork child.
 *
 * For |os_clone_into()|, |task_leader| is the "main thread"
 * in the process into which the copy of this task will be
 * created. |task_leader| will perform the actual OS calls to
 * create the new child.
 */
Task* os_fork_into(Session* session, FdTable::shr_ptr new_fds);
static Task* os_clone_into(const CapturedState& state,
AutoRemoteSyscalls& remote,
const ClonedFdTables& cloned_fd_tables,
ThreadGroup::shr_ptr new_tg);
/**
 * Return the TraceStream that we're using, if in recording or replay.
 * Returns null if we're not in record or replay.
 */
const TraceStream* trace_stream() const;
/**
 * Make the OS-level calls to clone |parent| into |session|
 * and return the resulting Task metadata for that new
 * process. This is as opposed to |Task::clone()|, which only
 * attaches Task metadata to an /existing/ process.
 *
 * The new clone will be tracked in |session|. The other
 * arguments are as for |Task::clone()| above.
 */
static Task* os_clone(CloneReason reason, Session* session,
AutoRemoteSyscalls& remote, pid_t rec_child_tid,
uint32_t new_serial, unsigned base_flags,
FdTable::shr_ptr new_fds = nullptr,
ThreadGroup::shr_ptr new_tg = nullptr,
remote_ptr<void> stack = nullptr,
remote_ptr<int> ptid = nullptr,
remote_ptr<void> tls = nullptr,
remote_ptr<int> ctid = nullptr);
// Presumably works around the Intel Knights Landing (KNL) erratum with
// single-stepping over string instructions — see the definition; confirm.
void work_around_KNL_string_singlestep_bug();
// Local (rr-side) pointer to this task's preload thread-locals area —
// TODO confirm against the definition.
void* preload_thread_locals();
// Serial number of this task (presumably unique within the session; see
// TaskishUid.h — confirm).
uint32_t serial;
// The address space of this task.
AddressSpace::shr_ptr as;
// The file descriptor table of this task.
FdTable::shr_ptr fds;
// Count of all ticks seen by this task since tracees became
// consistent and the task last wait()ed.
Ticks ticks;
// Copy of the child registers.
// When is_stopped_ or in_unexpected_exit, these are the source of
// truth. Otherwise the child is running and the registers could be
// changed by the kernel or user-space execution, and the values here
// are meaningless.
// See also registers_dirty.
Registers registers;
// Where we last resumed execution
remote_code_ptr address_of_last_execution_resume;
// Current hardware watchpoint state as programmed into debug registers
HardwareWatchpoints current_hardware_watchpoints;
// The ResumeRequest used when we last resumed execution.
ResumeRequest how_last_execution_resumed;
// In certain circumstances, due to hardware bugs, we need to fudge the
// cx register. If so, we record the original value here. See comments in
// Task.cc
uint64_t last_resume_orig_cx;
// The instruction type we're singlestepping through.
TrappedInstruction singlestepping_instruction;
// True if we set a breakpoint after a singlestepped CPUID instruction.
// We need this in addition to `singlestepping_instruction` because that
// might be CPUID but we failed to set the breakpoint.
bool did_set_breakpoint_after_cpuid;
// True when we know via waitpid() that the task was stopped in
// a ptrace-stop and we haven't resumed it.
// It is possible that the task has been pushed out of the ptrace-stop
// without our knowledge, due to a SIGKILL or equivalent such as
// zap_pid_ns_processes.
bool is_stopped_;
// True when we've been kicked out of a ptrace-stop via SIGKILL or
// equivalent.
bool in_unexpected_exit;
/* True when the seccomp filter has been enabled via prctl(). This happens
 * in the first system call issued by the initial tracee (after it returns
 * from kill(SIGSTOP) to synchronize with the tracer). */
bool seccomp_bpf_enabled;
// True when 'registers' has changes that haven't been flushed back to the
// task yet.
bool registers_dirty;
// True when changes to the original syscallno in 'registers' have not been
// flushed back to the task yet. Some architectures (e.g. AArch64) require a
// separate ptrace call for this.
bool orig_syscallno_dirty;
// When |extra_registers_known|, we have saved our extra registers.
ExtraRegisters extra_registers;
bool extra_registers_known;
// The session we're part of.
Session* session_;
// The thread group this belongs to.
std::shared_ptr<ThreadGroup> tg;
// Entries set by |set_thread_area()| or the |tls| argument to |clone()|
// (when that's a user_desc). May be more than one due to different
// entry_numbers.
// x86(_64) only.
std::vector<X86Arch::user_desc> thread_areas_;
// The |stack| argument passed to |clone()|, which for
// "threads" is the top of the user-allocated stack.
remote_ptr<void> top_of_stack;
// The most recent status of this task as returned by
// waitpid().
WaitStatus wait_status;
// The most recent siginfo (captured when wait_status shows pending_sig())
siginfo_t pending_siginfo;
// True when a PTRACE_EXIT_EVENT has been observed in the wait_status
// for this task.
bool seen_ptrace_exit_event_;
// True when a PTRACE_EXIT_EVENT has been handled for this task.
// By handled we mean either RecordSession's handle_ptrace_exit_event was
// run (or the replay equivalent) or we recognized that the task is already
// dead and we cleaned up our books so we don't try to destroy our buffers
// or anything like that in an already deceased task.
// We might defer handling the exit (e.g. if there's an ongoing execve).
// If this is true, `seen_ptrace_exit_event_` must be true.
bool handled_ptrace_exit_event_;
// A counter for the number of stops for which the stop may have been caused
// by PTRACE_INTERRUPT. See description in do_waitpid
int expecting_ptrace_interrupt_stop;
bool was_reaped_;
// Let this Task object be destroyed with no consequences.
bool forgotten;
// Tasks are neither copyable nor copy-assignable.
// NOTE(review): the deleted copy-assignment is declared to return Task by
// value rather than the conventional `Task&` — harmless since it is
// deleted, but worth normalizing.
Task(Task&) = delete;
Task operator=(Task&) = delete;
};
} // namespace rr
#endif /* RR_TASK_H_ */