/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#include <errno.h>
#include <limits.h>
#include <linux/capability.h>
#include <linux/elf.h>
#include <linux/ipc.h>
#include <linux/net.h>
#include <linux/perf_event.h>
#include <linux/prctl.h>
#include <linux/unistd.h>
#include <math.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <sys/personality.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <syscall.h>
#include <limits>
#include <set>
#include <sstream>
#include <rr/rr.h>
#include "Task.h"
#include "preload/preload_interface.h"
#include "AutoRemoteSyscalls.h"
#include "CPUIDBugDetector.h"
#include "Flags.h"
#include "MagicSaveDataMonitor.h"
#include "PidFdMonitor.h"
#include "PreserveFileMonitor.h"
#include "ProcMemMonitor.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "ReplaySession.h"
#include "ReplayTask.h"
#include "ScopedFd.h"
#include "StdioMonitor.h"
#include "StringVectorToCharArray.h"
#include "TraceeAttentionSet.h"
#include "WaitManager.h"
#include "cpp_supplement.h"
#include "fast_forward.h"
#include "kernel_abi.h"
#include "kernel_metadata.h"
#include "kernel_supplement.h"
#include "log.h"
#include "record_signal.h"
#include "seccomp-bpf.h"
#include "util.h"
using namespace std;
namespace rr {
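// x86 exposes 8 debug registers (DR0-DR7) through the user area; only
// DR0-DR3 can hold watchpoint addresses.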
static const unsigned int NUM_X86_DEBUG_REGS = 8;
static const unsigned int NUM_X86_WATCHPOINTS = 4;
Task::Task(Session& session, pid_t _tid, pid_t _rec_tid, uint32_t serial,
SupportedArch a)
: scratch_ptr(),
scratch_size(),
// This will be initialized when the syscall buffer is.
desched_fd_child(-1),
// This will be initialized when the syscall buffer is.
cloned_file_data_fd_child(-1),
hpc(_tid, session.cpu_binding(), session.ticks_semantics(),
session.need_performance_counters()),
tid(_tid),
rec_tid(_rec_tid > 0 ? _rec_tid : _tid),
own_namespace_rec_tid(_rec_tid > 0 ? _rec_tid: _tid),
syscallbuf_size(0),
ticks_at_last_syscall_entry(0),
ip_at_last_syscall_entry(nullptr),
last_syscall_entry_recorded(false),
serial(serial),
ticks(0),
registers(a),
how_last_execution_resumed(RESUME_CONT),
last_resume_orig_cx(0),
did_set_breakpoint_after_cpuid(false),
is_stopped_(false),
in_unexpected_exit(false),
seccomp_bpf_enabled(false),
registers_dirty(false),
orig_syscallno_dirty(false),
extra_registers(a),
extra_registers_known(false),
session_(&session),
top_of_stack(),
seen_ptrace_exit_event_(false),
handled_ptrace_exit_event_(false),
expecting_ptrace_interrupt_stop(0),
was_reaped_(false),
forgotten(false) {
memset(&thread_locals, 0, sizeof(thread_locals));
}
void Task::detach() {
LOG(debug) << "detaching from Task " << tid << " (rec:" << rec_tid << ")";
fallible_ptrace(PTRACE_DETACH, nullptr, nullptr);
// Not really reaped, but there's also no reason to actually try to reap it,
// since we detached.
was_reaped_ = true;
}
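// Re-enable native CPUID and RDTSC execution in the tracee. rr may have
// configured both to trap so it can control their results; undo that here.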
void Task::reenable_cpuid_tsc() {
if (is_x86ish(arch())) {
AutoRemoteSyscalls remote(this);
if (session().has_cpuid_faulting()) {
remote.infallible_syscall(syscall_number_for_arch_prctl(arch()),
ARCH_SET_CPUID, 1);
}
remote.infallible_syscall(syscall_number_for_prctl(arch()),
PR_SET_TSC, PR_TSC_ENABLE);
}
}
void Task::wait_exit() {
LOG(debug) << "Waiting for exit of " << tid;
/* We want to wait for the child to exit, but we don't actually
* want to reap the task when it's dead. We could use WEXITED | WNOWAIT,
* but that would hang if `t` is a thread-group-leader of a thread group
* that has other still-running threads. Instead, we wait for WSTOPPED, but
* we know that there is no possibility for the task to stop between now and
* its exit, at which point the system call will return with -ECHILD.
* There is one exception: If there was a simultaneous exec from another
* thread, and this is the group leader, then this task may lose its pid
* as soon as it enters the zombie state, so that `tid` refers to the
* newly-execed thread and we get a PTRACE_EVENT_EXEC instead. To account
* for this we add `| WNOWAIT` to prevent dequeuing the event and simply take
* it as an indication that the task has execed.
*/
WaitOptions options(tid);
options.consume = false;
do {
WaitResult result = WaitManager::wait_stop(options);
if (result.code == WAIT_OK) {
if (result.status.ptrace_event() == PTRACE_EVENT_EXIT) {
// It's possible that the earlier exit event was synthetic, in which
// case we're only now catching up to the real process exit. In that
// case, just ask the process to actually exit. (TODO: We may want to
// catch this earlier).
return proceed_to_exit();
}
ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXEC)
<< "Expected PTRACE_EVENT_EXEC, got " << result.status;
// The kernel will do the reaping for us in this case
was_reaped_ = true;
} else if (result.code == WAIT_NO_STATUS) {
// Wait was EINTR'd most likely - retry.
continue;
} else {
ASSERT(this, result.code == WAIT_NO_CHILD);
}
} while (false);
}
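// Resume the task so it can finish exiting, optionally waiting for the exit
// to complete.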
void Task::proceed_to_exit(bool wait) {
LOG(debug) << "Advancing tid " << tid << " to exit; wait=" << wait;
int ret = fallible_ptrace(PTRACE_CONT, nullptr, nullptr);
ASSERT(this, ret == 0 || (ret == -1 && errno == ESRCH))
<< "Got ret=" << ret << " errno=" << errno;
if (wait) {
wait_exit();
}
}
WaitStatus Task::kill() {
if (was_reaped()) {
return this->status();
}
/* This call is racy. There are basically three situations:
* 1. By the time the kernel gets around to delivering this signal,
* we were already in a PTRACE_EVENT_EXIT stop (e.g. due to an earlier
* fatal signal or group exit from a sibling task that the kernel
* didn't report to us yet), that we didn't observe yet (if we had, we
* would have removed the task from the task map already). In this case,
* this signal will advance from the PTRACE_EVENT_EXIT and put the child
* into hidden-zombie state, which the waitpid below will reap.
* 2. The task was in a coredump wait. This situation essentially works the
* same as 1, but the final exit status will be some other fatal signal.
* 3. Anything else basically. The signal will take priority and put us
* into the PTRACE_EVENT_EXIT stop, which the subsequent waitpid will
* then observe.
*/
LOG(debug) << "Sending SIGKILL to " << tid;
int ret = syscall(SYS_tgkill, real_tgid(), tid, SIGKILL);
ASSERT(this, ret == 0);
WaitResult result;
bool is_exit_event;
do {
result = WaitManager::wait_stop_or_exit(WaitOptions(tid));
ASSERT(this, result.code == WAIT_OK);
LOG(debug) << " -> " << result.status;
is_exit_event = result.status.ptrace_event() == PTRACE_EVENT_EXIT;
// Loop until we get a suitable event; there could be a cached stop
// notification.
} while (!(is_exit_event || result.status.type() == WaitStatus::FATAL_SIGNAL ||
result.status.type() == WaitStatus::EXIT));
did_kill();
WaitStatus status = result.status;
if (is_exit_event) {
/* If this is the exit event, we can detach here and the task will
* continue to zombie state for its parent to reap. If we're not in
* the exit event, we already reaped it from the ptrace perspective,
* which implicitly detached.
*/
unsigned long long_status;
if (ptrace_if_stopped(PTRACE_GETEVENTMSG, nullptr, &long_status)) {
status = WaitStatus(long_status);
} else {
// The task has been killed due to SIGKILL or equivalent.
status = WaitStatus::for_fatal_sig(SIGKILL);
}
int ret = fallible_ptrace(PTRACE_DETACH, nullptr, nullptr);
DEBUG_ASSERT(ret == 0 || (ret == -1 && errno == ESRCH));
if (ret == -1) {
/* It's possible for the above ptrace to fail with ESRCH. How?
* It's the other side of the race described above. If an external
* process issues an additional SIGKILL, we will advance from the
* ptrace exit event and we might still be processing the exit, just
* as the detach request comes in. To address this, we waitpid again,
* which will reap/detach us from ptrace and free the real parent to
* do its reaping. */
result = WaitManager::wait_exit(WaitOptions(tid));
ASSERT(this, result.code == WAIT_OK);
LOG(debug) << " --> " << result.status;
ASSERT(this, result.status.fatal_sig() == SIGKILL);
status = result.status;
}
} else {
was_reaped_ = true;
}
return status;
}
Task::~Task() {
if (!forgotten) {
ASSERT(this, handled_ptrace_exit_event_);
ASSERT(this, syscallbuf_child.is_null());
if (!session().is_recording() && !was_reaped()) {
// Reap the zombie.
WaitResult result = WaitManager::wait_exit(WaitOptions(tid));
ASSERT(this, result.code == WAIT_OK || result.code == WAIT_NO_CHILD);
}
LOG(debug) << " dead";
}
session().on_destroy(this);
tg->erase_task(this);
as->erase_task(this);
fds->erase_task(this);
}
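// Mark this task as forgotten so ~Task skips its usual exit-event assertions
// and reaping.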
void Task::forget() {
forgotten = true;
}
void Task::finish_emulated_syscall() {
// XXX verify that this can't be interrupted by a breakpoint trap
Registers r = regs();
// Passing RESUME_NO_TICKS here is not only a small performance optimization,
// but also avoids counting an event if the instruction immediately following
// a syscall instruction is a conditional branch.
bool ok = resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS);
ASSERT(this, ok) << "Tracee exited unexpectedly";
set_regs(r);
wait_status = WaitStatus();
}
string Task::name() const {
char buf[1024];
sprintf(buf, "/proc/%d/comm", tid);
ScopedFd comm(buf, O_RDONLY);
if (!comm.is_open()) {
return "???";
}
ssize_t bytes = read(comm, buf, sizeof(buf) - 1);
ASSERT(this, bytes >= 0);
if (bytes > 0 && buf[bytes - 1] == '\n') {
--bytes;
}
return string(buf, bytes);
}
void Task::set_name(AutoRemoteSyscalls& remote, const std::string& name) {
ASSERT(this, this == remote.task());
char prname[17];
strncpy(prname, name.c_str(), sizeof(prname));
prname[16] = 0;
AutoRestoreMem remote_prname(remote, (const uint8_t*)prname, 16);
LOG(debug) << " setting name to " << prname;
remote.infallible_syscall(syscall_number_for_prctl(remote.arch()), PR_SET_NAME,
remote_prname.get().as_int());
}
void Task::dump(FILE* out) const {
out = out ? out : stderr;
stringstream ss;
ss << wait_status;
fprintf(out, " %s(tid:%d rec_tid:%d status:0x%s)<%p>\n", name().c_str(),
tid, rec_tid, ss.str().c_str(), this);
if (session().is_recording()) {
// TODO pending events are currently only meaningful
// during recording. We should change that
// eventually, to have more informative output.
log_pending_events();
}
}
std::string Task::proc_fd_path(int fd) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
return path;
}
std::string Task::proc_pagemap_path() {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/pagemap", tid);
return path;
}
std::string Task::proc_stat_path() {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/stat", tid);
return path;
}
std::string Task::proc_exe_path() {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/exe", tid);
return path;
}
std::string Task::exe_path() {
char proc_exe[PATH_MAX];
snprintf(proc_exe, sizeof(proc_exe), "/proc/%d/exe", tid);
char exe[PATH_MAX];
ssize_t ret = readlink(proc_exe, exe, sizeof(exe) - 1);
ASSERT(this, ret >= 0);
exe[ret] = 0;
return exe;
}
struct stat Task::stat_fd(int fd) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
struct stat result;
auto ret = ::stat(path, &result);
ASSERT(this, ret == 0);
return result;
}
struct stat Task::lstat_fd(int fd) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
struct stat result;
auto ret = ::lstat(path, &result);
ASSERT(this, ret == 0);
return result;
}
ScopedFd Task::open_fd(int fd, int flags) {
char path[PATH_MAX];
snprintf(path, sizeof(path) - 1, "/proc/%d/fd/%d", tid, fd);
return ScopedFd(path, flags);
}
string Task::file_name_of_fd(int fd) {
char path[PATH_MAX];
char procfd[40];
snprintf(procfd, sizeof(procfd) - 1, "/proc/%d/fd/%d", tid, fd);
ssize_t nbytes = readlink(procfd, path, sizeof(path) - 1);
if (nbytes < 0) {
path[0] = 0;
} else {
path[nbytes] = 0;
}
return path;
}
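// Fetch the PTRACE_GETEVENTMSG value as a pid (e.g. the child pid reported
// by fork/vfork/clone ptrace events), or -1 if the tracee can no longer be
// queried because it is not stopped.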
pid_t Task::get_ptrace_eventmsg_pid() {
unsigned long msg = 0;
if (!ptrace_if_stopped(PTRACE_GETEVENTMSG, nullptr, &msg)) {
return -1;
}
return msg;
}
const siginfo_t& Task::get_siginfo() {
DEBUG_ASSERT(stop_sig());
return pending_siginfo;
}
/**
* Must be idempotent.
*/
void Task::destroy_buffers(Task *as_task, Task *fd_task) {
auto saved_syscallbuf_child = syscallbuf_child;
// Clear syscallbuf_child now so nothing tries to use it while tearing
// down buffers.
syscallbuf_child = nullptr;
if (as_task != nullptr) {
AutoRemoteSyscalls remote(as_task);
as_task->unmap_buffers_for(remote, this, saved_syscallbuf_child);
if (as_task == fd_task) {
as_task->close_buffers_for(remote, this, true);
// Buffers and fds are both handled; skip the separate fd_task pass below.
goto done;
}
}
if (fd_task != nullptr) {
AutoRemoteSyscalls remote(fd_task);
fd_task->close_buffers_for(remote, this, true);
}
done:
scratch_ptr = nullptr;
desched_fd_child = -1;
cloned_file_data_fd_child = -1;
}
void Task::unmap_buffers_for(
AutoRemoteSyscalls& remote, Task* other,
remote_ptr<struct syscallbuf_hdr> saved_syscallbuf_child) {
if (other->scratch_ptr) {
if (remote.infallible_munmap_syscall_if_alive(
other->scratch_ptr, other->scratch_size)) {
vm()->unmap(this, other->scratch_ptr, other->scratch_size);
}
}
if (!saved_syscallbuf_child.is_null()) {
if (remote.infallible_munmap_syscall_if_alive(
saved_syscallbuf_child, other->syscallbuf_size)) {
vm()->unmap(this, saved_syscallbuf_child, other->syscallbuf_size);
}
}
}
void Task::did_kill()
{
/* We may or may not have seen this event (see the note on race conditions
* in Session.cc), but let's pretend that we did to make this task look like
* other tasks that we didn't kill ourselves.
*/
seen_ptrace_exit_event_ = true;
handled_ptrace_exit_event_ = true;
syscallbuf_child = nullptr;
/* No need to unmap/close things in the child here - the kernel did that for
* us when the child died. */
scratch_ptr = nullptr;
desched_fd_child = -1;
cloned_file_data_fd_child = -1;
}
/**
* Must be idempotent.
*/
void Task::close_buffers_for(AutoRemoteSyscalls& remote, Task* other, bool really_close) {
if (other->desched_fd_child >= 0) {
if (session().is_recording() && really_close) {
remote.infallible_close_syscall_if_alive(other->desched_fd_child);
}
fds->did_close(other->desched_fd_child);
}
if (other->cloned_file_data_fd_child >= 0) {
if (really_close) {
remote.infallible_close_syscall_if_alive(other->cloned_file_data_fd_child);
}
fds->did_close(other->cloned_file_data_fd_child);
}
}
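// Emulate a jump by pointing the ip at the target and accounting for the
// ticks an unconditional indirect branch would contribute.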
void Task::emulate_jump(remote_code_ptr ip) {
Registers r = regs();
r.set_ip(ip);
set_regs(r);
ticks += PerfCounters::ticks_for_unconditional_indirect_branch(this);
}
bool Task::is_desched_event_syscall() {
return is_ioctl_syscall(regs().original_syscallno(), arch()) &&
desched_fd_child != -1 &&
desched_fd_child == (int)regs().arg1_signed();
}
bool Task::is_ptrace_seccomp_event() const {
int event = ptrace_event();
return (PTRACE_EVENT_SECCOMP_OBSOLETE == event ||
PTRACE_EVENT_SECCOMP == event);
}
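// For emulated PTRACE_SETREGSET: read the iovec the tracee passed in arg4
// and fetch the register payload it points to from tracee memory.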
template <typename Arch>
static vector<uint8_t> ptrace_get_regs_set(Task* t, const Registers& regs,
size_t min_size) {
auto iov = t->read_mem(remote_ptr<typename Arch::iovec>(regs.arg4()));
ASSERT(t, iov.iov_len >= min_size)
<< "Should have been caught during prepare_ptrace";
return t->read_mem(iov.iov_base.rptr().template cast<uint8_t>(), iov.iov_len);
}
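// shmdt() detaches the segment mapped at `addr`; drop the size we recorded
// for it and remove the mapping from our address-space model.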
static void process_shmdt(Task* t, remote_ptr<void> addr) {
size_t size = t->vm()->get_shm_size(addr);
t->vm()->remove_shm_size(addr);
t->vm()->unmap(t, addr, size);
}
template <typename Arch>
static void ptrace_syscall_exit_legacy_arch(Task* t, Task* tracee, const Registers& regs)
{
switch ((int)regs.orig_arg1_signed()) {
case Arch::PTRACE_SETREGS: {
auto data = t->read_mem(
remote_ptr<typename Arch::user_regs_struct>(regs.arg4()));
Registers r = tracee->regs();
r.set_from_ptrace_for_arch(Arch::arch(), &data, sizeof(data));
tracee->set_regs(r);
break;
}
case Arch::PTRACE_SETFPREGS: {
auto data = t->read_mem(
remote_ptr<typename Arch::user_fpregs_struct>(regs.arg4()));
auto r = tracee->extra_regs();
r.set_user_fpregs_struct(t, Arch::arch(), &data, sizeof(data));
tracee->set_extra_regs(r);
break;
}
case Arch::PTRACE_SETFPXREGS: {
auto data =
t->read_mem(remote_ptr<X86Arch::user_fpxregs_struct>(regs.arg4()));
auto r = tracee->extra_regs();
r.set_user_fpxregs_struct(t, data);
tracee->set_extra_regs(r);
break;
}
case Arch::PTRACE_POKEUSR: {
size_t addr = regs.arg3();
typename Arch::unsigned_word data = regs.arg4();
if (addr < sizeof(typename Arch::user_regs_struct)) {
Registers r = tracee->regs();
r.write_register_by_user_offset(addr, data);
tracee->set_regs(r);
} else if (addr >= offsetof(typename Arch::user, u_debugreg[0]) &&
addr < offsetof(typename Arch::user, u_debugreg[8])) {
size_t regno =
(addr - offsetof(typename Arch::user, u_debugreg[0])) /
sizeof(data);
tracee->set_x86_debug_reg(regno, data);
}
break;
}
default:
break;
}
}
template <>
void ptrace_syscall_exit_legacy_arch<ARM64Arch>(Task*, Task*, const Registers&)
{
// Nothing to do - unimplemented on this architecture
return;
}
template <typename Arch>
void Task::on_syscall_exit_arch(int syscallno, const Registers& regs) {
session().accumulate_syscall_performed();
if (regs.original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO) {
return;
}
if (syscallno == session_->syscall_number_for_rrcall_mprotect_record()) {
// When we record an rr replay of a tracee which does a syscallbuf'ed
// `mprotect`, neither the replay nor its recording see the mprotect
// syscall, since it's untraced during both recording and replay. rr
// replay is notified of the syscall via the `mprotect_records`
// mechanism; if it's being recorded, it forwards that notification to
// the recorder by calling this syscall.
pid_t tid = regs.orig_arg1();
remote_ptr<void> addr = regs.arg2();
size_t num_bytes = regs.arg3();
int prot = regs.arg4_signed();
Task* t = session().find_task(tid);
ASSERT(this, t);
return t->vm()->protect(t, addr, num_bytes, prot);
}
// mprotect can change the protection status of some mapped regions before
// failing.
// SYS_rrcall_mprotect_record always fails with ENOSYS, though we want to
// note its usage here.
if (regs.syscall_failed() && !is_mprotect_syscall(syscallno, regs.arch())
&& !is_pkey_mprotect_syscall(syscallno, regs.arch())) {
return;
}
switch (syscallno) {
case Arch::brk:
case Arch::mmap:
case Arch::mmap2:
case Arch::mremap: {
LOG(debug) << "(brk/mmap/mmap2/mremap will receive / has received direct "
"processing)";
return;
}
case Arch::pkey_mprotect:
case Arch::mprotect: {
remote_ptr<void> addr = regs.orig_arg1();
size_t num_bytes = regs.arg2();
int prot = regs.arg3_signed();
return vm()->protect(this, addr, num_bytes, prot);
}
case Arch::munmap: {
remote_ptr<void> addr = regs.orig_arg1();
size_t num_bytes = regs.arg2();
return vm()->unmap(this, addr, num_bytes);
}
case Arch::shmdt:
return process_shmdt(this, regs.orig_arg1());
case Arch::madvise: {
remote_ptr<void> addr = regs.orig_arg1();
size_t num_bytes = regs.arg2();
int advice = regs.arg3();
return vm()->advise(this, addr, num_bytes, advice);
}
case Arch::ipc: {
switch ((int)regs.orig_arg1_signed()) {
case SHMDT:
return process_shmdt(this, regs.arg5());
default:
break;
}
break;
}
case Arch::set_thread_area:
set_thread_area(regs.orig_arg1());
return;
case Arch::prctl:
switch ((int)regs.orig_arg1_signed()) {
case PR_SET_SECCOMP:
if (regs.arg2() == SECCOMP_MODE_FILTER && session().is_recording()) {
seccomp_bpf_enabled = true;
}
break;
case PR_SET_NAME:
did_prctl_set_prname(regs.arg2());
break;
}
return;
case Arch::dup:
case Arch::dup2:
case Arch::dup3:
fd_table()->did_dup(regs.orig_arg1(), regs.syscall_result());
return;
case Arch::fcntl64:
case Arch::fcntl:
if (regs.arg2() == Arch::DUPFD || regs.arg2() == Arch::DUPFD_CLOEXEC) {
fd_table()->did_dup(regs.orig_arg1(), regs.syscall_result());
}
return;
case Arch::close:
fd_table()->did_close(regs.orig_arg1());
return;
case Arch::unshare:
if (regs.orig_arg1() & CLONE_FILES) {
fds->erase_task(this);
fds = fds->clone();
fds->insert_task(this);
vm()->fd_tables_changed();
}
return;
case Arch::pwrite64:
case Arch::write: {
int fd = (int)regs.orig_arg1_signed();
vector<FileMonitor::Range> ranges;
ssize_t amount = regs.syscall_result_signed();
if (amount > 0) {
ranges.push_back(FileMonitor::Range(regs.arg2(), amount));
}
FileMonitor::LazyOffset offset(this, regs, syscallno);
fd_table()->did_write(this, fd, ranges, offset);
return;
}
case Arch::pwritev:
case Arch::writev: {
int fd = (int)regs.orig_arg1_signed();
vector<FileMonitor::Range> ranges;
auto iovecs =
read_mem(remote_ptr<typename Arch::iovec>(regs.arg2()), regs.arg3());
ssize_t written = regs.syscall_result_signed();
ASSERT(this, written >= 0);
for (auto& v : iovecs) {
ssize_t amount = min<ssize_t>(written, v.iov_len);
if (amount > 0) {
ranges.push_back(FileMonitor::Range(v.iov_base, amount));
written -= amount;
}
}
FileMonitor::LazyOffset offset(this, regs, syscallno);
fd_table()->did_write(this, fd, ranges, offset);
return;
}
case Arch::ptrace: {
pid_t pid = (pid_t)regs.arg2_signed();
Task* tracee = session().find_task(pid);
switch ((int)regs.orig_arg1_signed()) {
case PTRACE_SETREGSET: {
switch ((int)regs.arg3()) {
case NT_PRSTATUS: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, user_regs_struct_size(tracee->arch()));
Registers r = tracee->regs();
r.set_from_ptrace_for_arch(tracee->arch(), set.data(), set.size());
tracee->set_regs(r);
break;
}
case NT_PRFPREG: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, user_fpregs_struct_size(tracee->arch()));
ExtraRegisters r = tracee->extra_regs();
r.set_user_fpregs_struct(this, tracee->arch(), set.data(),
set.size());
tracee->set_extra_regs(r);
break;
}
case NT_ARM_SYSTEM_CALL: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, sizeof(int));
ASSERT(this, set.size() >= sizeof(int));
int new_syscallno = *(int*)set.data();
Registers r = tracee->regs();
r.set_original_syscallno(new_syscallno);
tracee->set_regs(r);
break;
}
case NT_ARM_HW_WATCH:
case NT_ARM_HW_BREAK: {
auto set = ptrace_get_regs_set<Arch>(
this, regs, offsetof(ARM64Arch::user_hwdebug_state, dbg_regs[0]));
ASSERT(this, set.size() >= sizeof(int));
tracee->set_aarch64_debug_regs((int)regs.arg3(),
(ARM64Arch::user_hwdebug_state*)set.data(),
(set.size() - offsetof(ARM64Arch::user_hwdebug_state, dbg_regs[0]))/
2*sizeof(ARM64Arch::hw_bp));
break;
}
case NT_X86_XSTATE: {
switch (tracee->extra_regs().format()) {
case ExtraRegisters::XSAVE: {
XSaveLayout layout;
ReplaySession* replay = session().as_replay();
if (replay) {
layout = xsave_layout_from_trace(
replay->trace_reader().cpuid_records());
} else {
layout = xsave_native_layout();
}
auto set = ptrace_get_regs_set<Arch>(this, regs, layout.full_size);
ExtraRegisters r;
bool ok =
r.set_to_raw_data(tracee->arch(), ExtraRegisters::XSAVE,
set.data(), set.size(), layout);
ASSERT(this, ok) << "Invalid XSAVE data";
tracee->set_extra_regs(r);
break;
}
default:
ASSERT(this, false) << "Unknown ExtraRegisters format; "
"Should have been caught during "
"prepare_ptrace";
}
break;
}
default:
ASSERT(this, false) << "Unknown regset type; Should have been "
"caught during prepare_ptrace";
break;
}
break;
}
case Arch::PTRACE_ARCH_PRCTL: {
if (tracee->arch() != x86_64) {
break;
}
int code = (int)regs.arg4();
switch (code) {
case ARCH_GET_FS:
case ARCH_GET_GS:
break;
case ARCH_SET_FS:
case ARCH_SET_GS: {
Registers r = tracee->regs();
if (regs.arg3() == 0) {
// Work around a kernel bug in pre-4.7 kernels, where setting
// the gs/fs base to 0 via PTRACE_REGSET did not work correctly.
// If this fails the tracee is on the exit path and it
// doesn't matter what its fs/gs base is.
tracee->ptrace_if_stopped(Arch::PTRACE_ARCH_PRCTL, regs.arg3(),
(void*)(uintptr_t)regs.arg4());
}
if (code == ARCH_SET_FS) {
r.set_fs_base(regs.arg3());
} else {
r.set_gs_base(regs.arg3());
}
tracee->set_regs(r);
break;
}
default:
ASSERT(tracee, 0) << "Should have detected this earlier";
}
break;
}
case Arch::PTRACE_SETREGS:
case Arch::PTRACE_SETFPREGS:
case Arch::PTRACE_SETFPXREGS:
case Arch::PTRACE_POKEUSR: {
ptrace_syscall_exit_legacy_arch<Arch>(this, tracee, regs);
}
}
return;
}
case Arch::pidfd_open: {
int fd = regs.syscall_result();
pid_t pid = (pid_t)regs.orig_arg1();
TaskUid tuid;
if (Task* t = session().find_task(pid)) {
tuid = t->tuid();
}
fd_table()->add_monitor(this, fd, new PidFdMonitor(tuid));
return;
}
case Arch::pidfd_getfd: {
int pidfd = regs.orig_arg1();
int fd = regs.arg2();
if (PidFdMonitor* monitor = PidFdMonitor::get(fd_table().get(), pidfd)) {
// NB: This can return NULL if the pidfd is for a process outside of
// the rr trace.
if (auto source = monitor->fd_table(session())) {
fd_table()->did_dup(source.get(), fd, regs.syscall_result());
}
} else {
LOG(warn) << "pidfd_getfd succeeded but we lost track of the pidfd " << pidfd;
}
return;
}
}
}
void Task::on_syscall_exit(int syscallno, SupportedArch arch,
const Registers& regs) {
with_converted_registers<void>(regs, arch, [&](const Registers& regs) {
RR_ARCH_FUNCTION(on_syscall_exit_arch, arch, syscallno, regs);
});
}
void Task::move_ip_before_breakpoint() {
// TODO: assert that this is at a breakpoint trap.
Registers r = regs();
r.set_ip(r.ip().undo_executed_bkpt(arch()));
set_regs(r);
}
bool Task::enter_syscall(bool allow_exit) {
bool need_ptrace_syscall_event = !seccomp_bpf_enabled ||
session().syscall_seccomp_ordering() ==
Session::SECCOMP_BEFORE_PTRACE_SYSCALL;
bool need_seccomp_event = seccomp_bpf_enabled;
while (need_ptrace_syscall_event || need_seccomp_event) {
if (!resume_execution(need_ptrace_syscall_event ? RESUME_SYSCALL : RESUME_CONT,
RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
return false;
}
if (is_ptrace_seccomp_event()) {
ASSERT(this, need_seccomp_event);
need_seccomp_event = false;
continue;
}
if (allow_exit && ptrace_event() == PTRACE_EVENT_EXIT) {
return false;
}
ASSERT(this, !ptrace_event());
if (session().is_recording() && wait_status.group_stop()) {
static_cast<RecordTask*>(this)->stash_group_stop();
continue;
}
if (!stop_sig()) {
ASSERT(this, need_ptrace_syscall_event);
need_ptrace_syscall_event = false;
continue;
}
if (ReplaySession::is_ignored_signal(stop_sig()) &&
session().is_replaying()) {
continue;
}
ASSERT(this, session().is_recording() && !is_deterministic_signal(this))
<< " got unexpected signal " << signal_name(stop_sig());
if (stop_sig() == session().as_record()->syscallbuf_desched_sig()) {
continue;
}
static_cast<RecordTask*>(this)->stash_sig();
}
apply_syscall_entry_regs();
canonicalize_regs(arch());
return true;
}
bool Task::exit_syscall() {
// If PTRACE_SYSCALL_BEFORE_SECCOMP, we are inconsistent about
// whether we process the syscall on the syscall entry trap or
// on the seccomp trap. Detect if we are on the former and
// just bring us forward to the seccomp trap.
bool will_see_seccomp = seccomp_bpf_enabled &&
(session().syscall_seccomp_ordering() ==
Session::PTRACE_SYSCALL_BEFORE_SECCOMP) &&
!is_ptrace_seccomp_event();
while (true) {
if (!resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
return false;
}
if (will_see_seccomp && is_ptrace_seccomp_event()) {
will_see_seccomp = false;
continue;
}
if (ptrace_event() == PTRACE_EVENT_EXIT) {
return false;
}
ASSERT(this, !ptrace_event());
if (!stop_sig()) {
canonicalize_regs(arch());
break;
}
if (ReplaySession::is_ignored_signal(stop_sig()) &&
session().is_replaying()) {
continue;
}
ASSERT(this, session().is_recording());
static_cast<RecordTask*>(this)->stash_sig();
}
return true;
}
bool Task::exit_syscall_and_prepare_restart() {
Registers r = regs();
int syscallno = r.original_syscallno();
LOG(debug) << "exit_syscall_and_prepare_restart from syscall "
<< rr::syscall_name(syscallno, r.arch());
r.set_original_syscallno(syscall_number_for_gettid(r.arch()));
set_regs(r);
// This exits the hijacked SYS_gettid. Now the tracee is
// ready to do our bidding.
if (!exit_syscall()) {
// The tracee unexpectedly exited. To get this to replay correctly, we need to
// make it look like we really entered the syscall. Then
// handle_ptrace_exit_event will record something appropriate.
r.emulate_syscall_entry();
set_regs(r);
return false;
}
LOG(debug) << "exit_syscall_and_prepare_restart done";
// Restore these regs to what they would have been just before
// the tracee trapped at the syscall.
r.set_original_syscallno(-1);
r.set_syscallno(syscallno);
r.set_ip(r.ip() - syscall_instruction_length(r.arch()));
set_regs(r);
return true;
}
#if defined(__i386__) || defined(__x86_64__)
#define AR_L (1 << 21)
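// Load the segment's access rights with LAR and test the L bit, which marks
// a 64-bit (long-mode) code segment.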
static bool is_long_mode_segment(uint32_t segment) {
uint32_t ar = 0;
asm("lar %[segment], %[ar]" : [ar] "=r"(ar) : [segment] "r"(segment));
return ar & AR_L;
}
#endif
void Task::post_exec(const string& exe_file) {
Task* stopped_task_in_address_space = nullptr;
bool other_task_in_address_space = false;
for (Task* t : as->task_set()) {
if (t != this) {
other_task_in_address_space = true;
if (t->is_stopped_) {
stopped_task_in_address_space = t;
break;
}
}
}
if (stopped_task_in_address_space) {
LOG(warn) << "Unmapping buffers using tid " << stopped_task_in_address_space->tid;
AutoRemoteSyscalls remote(stopped_task_in_address_space);
unmap_buffers_for(remote, this, syscallbuf_child);
} else if (other_task_in_address_space) {
// We should clean up our syscallbuf/scratch but that's too hard since we
// have no stopped task to use for that :-(.
// (We can't clean up those buffers *before* the exec completes, because it
// might fail in which case we shouldn't have cleaned them up.)
// Just let the buffers leak. The AddressSpace will clean up our local
// shared buffer when it's destroyed.
LOG(warn) << "Intentionally leaking syscallbuf after exec for task " << tid;
}
session().post_exec();
as->erase_task(this);
fds->erase_task(this);
extra_registers = ExtraRegisters(registers.arch());
extra_registers_known = false;
ExtraRegisters e = extra_regs();
e.reset();
set_extra_regs(e);
syscallbuf_child = nullptr;
syscallbuf_size = 0;
scratch_ptr = nullptr;
cloned_file_data_fd_child = -1;
desched_fd_child = -1;
preload_globals = nullptr;
rseq_state = nullptr;
thread_group()->execed = true;
thread_areas_.clear();
memset(&thread_locals, 0, sizeof(thread_locals));
as = session().create_vm(this, exe_file, as->uid().exec_count() + 1);
// It's barely-documented, but Linux unshares the fd table on exec
fds = fds->clone();
fds->insert_task(this);
}
static string prname_from_exe_image(const string& e) {
size_t last_slash = e.rfind('/');
return e.substr(last_slash == e.npos ? 0 : last_slash + 1);
}
void Task::post_exec_syscall(const std::string& original_exe_file) {
canonicalize_regs(arch());
as->post_exec_syscall(this);
AutoRemoteSyscalls remote(this);
set_name(remote, prname_from_exe_image(original_exe_file));
if (session().has_cpuid_faulting()) {
remote.infallible_syscall(syscall_number_for_arch_prctl(arch()),
ARCH_SET_CPUID, 0);
}
}
bool Task::execed() const { return tg->execed; }
void Task::flush_inconsistent_state() { ticks = 0; }
string Task::read_c_str(remote_ptr<char> child_addr, bool *ok) {
remote_ptr<void> p = child_addr;
string str;
while (true) {
// We're only guaranteed that [child_addr,
// end_of_page) is mapped.
remote_ptr<void> end_of_page = ceil_page_size(p + 1);
ssize_t nbytes = end_of_page - p;
std::unique_ptr<char[]> buf(new char[nbytes]);
read_bytes_helper(p, nbytes, buf.get(), ok);
if (ok && !*ok) {
return "";
}
for (int i = 0; i < nbytes; ++i) {
if ('\0' == buf[i]) {
return str;
}
str += buf[i];
}
p = end_of_page;
}
}
const Registers& Task::regs() const {
// If we're in an unexpected exit then the tracee may
// not be stopped but we know its registers won't change again,
// so it's safe to ask for them here.
ASSERT(this, is_stopped_ || was_reaped_ || in_unexpected_exit);
return registers;
}
const ExtraRegisters* Task::extra_regs_fallible() {
if (!extra_registers_known) {
#if defined(__i386__) || defined(__x86_64__)
if (xsave_area_size() > 512) {
LOG(debug) << " (refreshing extra-register cache using XSAVE)";
extra_registers.format_ = ExtraRegisters::XSAVE;
extra_registers.data_.resize(xsave_area_size());
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (fallible_ptrace(PTRACE_GETREGSET, NT_X86_XSTATE, &vec)) {
return nullptr;
}
extra_registers.data_.resize(vec.iov_len);
// The kernel may return less than the full XSTATE
extra_registers.validate(this);
} else {
#if defined(__i386__)
LOG(debug) << " (refreshing extra-register cache using FPXREGS)";
extra_registers.format_ = ExtraRegisters::XSAVE;
extra_registers.data_.resize(sizeof(user_fpxregs_struct));
if (fallible_ptrace(X86Arch::PTRACE_GETFPXREGS, nullptr, extra_registers.data_.data())) {
return nullptr;
}
#elif defined(__x86_64__)
// x86-64 that doesn't support XSAVE; apparently Xeon E5620 (Westmere)
// is in this class.
LOG(debug) << " (refreshing extra-register cache using FPREGS)";
extra_registers.format_ = ExtraRegisters::XSAVE;
extra_registers.data_.resize(sizeof(user_fpregs_struct));
if (fallible_ptrace(PTRACE_GETFPREGS, nullptr, extra_registers.data_.data())) {
return nullptr;
}
#endif
}
#elif defined(__aarch64__)
LOG(debug) << " (refreshing extra-register cache using FPR)";
extra_registers.format_ = ExtraRegisters::NT_FPR;
extra_registers.data_.resize(sizeof(ARM64Arch::user_fpregs_struct));
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (fallible_ptrace(PTRACE_GETREGSET, NT_PRFPREG, &vec)) {
return nullptr;
}
extra_registers.data_.resize(vec.iov_len);
#else
#error need to define new extra_regs support
#endif
extra_registers_known = true;
}
return &extra_registers;
}
const ExtraRegisters& Task::extra_regs() {
if (!extra_regs_fallible()) {
ASSERT(this, false) << "Can't find task for infallible extra_regs";
}
return extra_registers;
}
#if defined(__i386__) || defined(__x86_64__)
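// Byte offset of debug register `i` within the user area, as expected by
// PTRACE_PEEKUSER/PTRACE_POKEUSER.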
static ssize_t dr_user_word_offset(size_t i) {
DEBUG_ASSERT(i < NUM_X86_DEBUG_REGS);
return offsetof(struct user, u_debugreg[0]) + sizeof(void*) * i;
}
uintptr_t Task::get_debug_reg(size_t regno) {
errno = 0;
long result =
fallible_ptrace(PTRACE_PEEKUSER, dr_user_word_offset(regno), nullptr);
if (errno == ESRCH) {
return 0;
}
return result;
}
bool Task::set_x86_debug_reg(size_t regno, uintptr_t value) {
errno = 0;
fallible_ptrace(PTRACE_POKEUSER, dr_user_word_offset(regno), (void*)value);
return errno == ESRCH || errno == 0;
}
uintptr_t Task::x86_debug_status() {
return fallible_ptrace(PTRACE_PEEKUSER, dr_user_word_offset(6), nullptr);
}
#else
#define FATAL_X86_ONLY() FATAL() << "Reached x86-only code path on non-x86 architecture";
uintptr_t Task::get_debug_reg(size_t) {
FATAL_X86_ONLY();
return 0;
}
bool Task::set_x86_debug_reg(size_t, uintptr_t) {
FATAL_X86_ONLY();
return false;
}
uintptr_t Task::x86_debug_status() {
FATAL_X86_ONLY();
return 0;
}
#endif
#if defined(__aarch64__)
bool Task::set_aarch64_debug_regs(int which, ARM64Arch::user_hwdebug_state *regs, size_t nregs) {
errno = 0;
struct iovec iov { .iov_base = regs, .iov_len = sizeof(*regs) - (16-nregs)*sizeof(ARM64Arch::hw_bp) };
ASSERT(this, which == NT_ARM_HW_BREAK || which == NT_ARM_HW_WATCH);
fallible_ptrace(PTRACE_SETREGSET, which, (void*)&iov);
return errno == 0;
}
bool Task::get_aarch64_debug_regs(int which, ARM64Arch::user_hwdebug_state *regs) {
errno = 0;
struct iovec iov { .iov_base = regs, .iov_len = sizeof(*regs) };
ASSERT(this, which == NT_ARM_HW_BREAK || which == NT_ARM_HW_WATCH);
fallible_ptrace(PTRACE_GETREGSET, which, (void*)&iov);
return errno == 0;
}
#else
bool Task::set_aarch64_debug_regs(int, ARM64Arch::user_hwdebug_state *, size_t) {
FATAL() << "Reached aarch64 code path on non-aarch64 system";
return false;
}
bool Task::get_aarch64_debug_regs(int, ARM64Arch::user_hwdebug_state *regs) {
// The following memset is just to silence a warning that dbg_info may be used uninitialized.
memset(regs, 0, sizeof(*regs));
FATAL() << "Reached aarch64 code path on non-aarch64 system";
return false;
}
#endif
void Task::set_x86_debug_status(uintptr_t status) {
if (arch() == x86 || arch() == x86_64) {
set_x86_debug_reg(6, status);
}
}
static bool is_singlestep_resume(ResumeRequest request) {
return request == RESUME_SINGLESTEP || request == RESUME_SYSEMU_SINGLESTEP;
}
TrapReasons Task::compute_trap_reasons() {
ASSERT(this, stop_sig() == SIGTRAP);
TrapReasons reasons;
const siginfo_t& si = get_siginfo();
if (arch() == x86 || arch() == x86_64) {
uintptr_t status = x86_debug_status();
reasons.singlestep = (status & DS_SINGLESTEP) != 0;
if (!reasons.singlestep && is_singlestep_resume(how_last_execution_resumed)) {
if (is_at_syscall_instruction(this, address_of_last_execution_resume) &&
ip() ==
address_of_last_execution_resume +
syscall_instruction_length(arch())) {
// During replay we execute syscall instructions in certain cases, e.g.
// mprotect with syscallbuf. The kernel does not set DS_SINGLESTEP when we
// step over those instructions so we need to detect that here.
reasons.singlestep = true;
} else {
TrappedInstruction ti =
trapped_instruction_at(this, address_of_last_execution_resume);
if (ti == TrappedInstruction::CPUID &&
ip() == address_of_last_execution_resume +
trapped_instruction_len(TrappedInstruction::CPUID)) {
// Likewise we emulate CPUID instructions and must forcibly detect that
// here.
reasons.singlestep = true;
// This also takes care of the did_set_breakpoint_after_cpuid workaround case
} else if (ti == TrappedInstruction::INT3 &&
ip() == address_of_last_execution_resume +
trapped_instruction_len(TrappedInstruction::INT3)) {
// INT3 instructions should also be turned into a singlestep here.
reasons.singlestep = true;
}
}
}
// In VMWare Player 6.0.4 build-2249910, 32-bit Ubuntu x86 guest,
// single-stepping does not trigger watchpoints :-(. So we have to
// check watchpoints here. fast_forward also hides watchpoint changes.
// Write-watchpoints will detect that their value has changed and trigger.
// XXX Read/exec watchpoints can't be detected this way so they're still
// broken in the above configuration :-(.
if ((DS_WATCHPOINT_ANY | DS_SINGLESTEP) & status) {
as->notify_watchpoint_fired(status, nullptr,
is_singlestep_resume(how_last_execution_resumed)
? address_of_last_execution_resume : nullptr);
}
reasons.watchpoint =
as->has_any_watchpoint_changes() || (DS_WATCHPOINT_ANY & status);
} else if (arch() == aarch64) {
reasons.watchpoint = false;
reasons.singlestep = si.si_code == TRAP_TRACE;
reasons.watchpoint = si.si_code == TRAP_HWBKPT;
if (reasons.watchpoint) {
as->notify_watchpoint_fired(0, remote_ptr<void>((uintptr_t)si.si_addr),
is_singlestep_resume(how_last_execution_resumed)
? address_of_last_execution_resume : nullptr);
}
}
// If we triggered a breakpoint, this would be the address of the breakpoint
remote_code_ptr ip_at_breakpoint = ip().undo_executed_bkpt(arch());
// Don't trust siginfo to report execution of a breakpoint if singlestep or
// watchpoint triggered.
if (reasons.singlestep) {
reasons.breakpoint =
as->is_breakpoint_instruction(this, address_of_last_execution_resume);
if (reasons.breakpoint) {
ASSERT(this, address_of_last_execution_resume == ip_at_breakpoint);
}
} else if (reasons.watchpoint) {
// We didn't singlestep, so watchpoint state is completely accurate.
// The only way the last instruction could have triggered a watchpoint
// and be a breakpoint instruction is if an EXEC watchpoint fired
// at the breakpoint address.
reasons.breakpoint = as->has_exec_watchpoint_fired(ip_at_breakpoint) &&
as->is_breakpoint_instruction(this, ip_at_breakpoint);
} else {
ASSERT(this, SIGTRAP == si.si_signo) << " expected SIGTRAP, got " << si;
reasons.breakpoint = is_kernel_trap(si.si_code);
if (reasons.breakpoint) {
ASSERT(this, as->is_breakpoint_instruction(this, ip_at_breakpoint))
<< " expected breakpoint at " << ip_at_breakpoint << ", got siginfo "
<< si;
}
}
return reasons;
}
static void* preload_thread_locals_local_addr(AddressSpace& as) {
if (!as.has_mapping(AddressSpace::preload_thread_locals_start())) {
return nullptr;
}
// There might have been a mapping there, but not the one we expect (i.e.
// the one shared with us for thread locals). In that case we behave as
// if the mapping didn't exist at all.
auto& mapping = as.mapping_of(AddressSpace::preload_thread_locals_start());
if (mapping.flags & AddressSpace::Mapping::IS_THREAD_LOCALS) {
DEBUG_ASSERT(mapping.local_addr);
return mapping.local_addr;
}
return nullptr;
}
template <typename Arch> static void setup_preload_thread_locals_arch(Task* t) {
void* local_addr = preload_thread_locals_local_addr(*t->vm());
if (local_addr) {
auto locals = reinterpret_cast<preload_thread_locals<Arch>*>(local_addr);
static_assert(sizeof(*locals) <= PRELOAD_THREAD_LOCALS_SIZE,
"bad PRELOAD_THREAD_LOCALS_SIZE");
locals->syscallbuf_stub_alt_stack = t->syscallbuf_alt_stack();
}
}
void Task::setup_preload_thread_locals() {
activate_preload_thread_locals();
RR_ARCH_FUNCTION(setup_preload_thread_locals_arch, arch(), this);
}
const Task::ThreadLocals& Task::fetch_preload_thread_locals() {
if (tuid() == as->thread_locals_tuid()) {
void* local_addr = preload_thread_locals_local_addr(*as);
if (local_addr) {
memcpy(thread_locals, local_addr, PRELOAD_THREAD_LOCALS_SIZE);
return thread_locals;
}
// The mapping might have been removed by crazy application code.
// That's OK, assuming the preload library was removed too.
memset(&thread_locals, 0, sizeof(thread_locals));
}
return thread_locals;
}
void Task::activate_preload_thread_locals() {
// Switch thread-locals to the new task.
if (tuid() != as->thread_locals_tuid()) {
void* local_addr = preload_thread_locals_local_addr(*as);
if (local_addr) {
Task* t = session().find_task(as->thread_locals_tuid());
if (t) {
t->fetch_preload_thread_locals();
}
memcpy(local_addr, thread_locals, PRELOAD_THREAD_LOCALS_SIZE);
as->set_thread_locals_tuid(tuid());
}
}
}
#if defined(__x86_64__) || defined(__i386__)
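// Knights Landing is family 6, model 0x57; the mask below extracts the
// extended-model, family and model fields from CPUID leaf 1 EAX.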
static bool cpu_has_KNL_string_singlestep_bug() {
static bool has_quirk =
((cpuid(CPUID_GETFEATURES, 0).eax & 0xF0FF0) == 0x50670);
return has_quirk;
}
#else
static bool cpu_has_KNL_string_singlestep_bug() {
return false;
}
#endif
/*
* The value of rcx above which the CPU doesn't properly handle singlestep for
string instructions. Right now, since only one CPU has this quirk, this
* value is hardcoded, but could depend on the CPU architecture in the future.
*/
static int single_step_coalesce_cutoff() { return 16; }
void Task::work_around_KNL_string_singlestep_bug() {
/* The extra cx >= cutoff check is just an optimization, to avoid the
moderately expensive load from ip() if we can */
if (!cpu_has_KNL_string_singlestep_bug()) {
return;
}
uintptr_t cx = regs().cx();
uintptr_t cutoff = single_step_coalesce_cutoff();
if (cx > cutoff && at_x86_string_instruction(this)) {
/* KNL has a quirk where single-stepping a string instruction can step up
to 64 iterations. Work around this by fudging registers to force the
processor to execute one iteration and one iteration only. */
LOG(debug) << "Working around KNL single-step hardware bug (cx=" << cx
<< ")";
if (cx > cutoff) {
last_resume_orig_cx = cx;
Registers r = regs();
/* An arbitrary value < cutoff would work fine here, except 1, since
the last iteration of the loop behaves differently */
r.set_cx(cutoff);
set_regs(r);
}
}
}
bool Task::resume_execution(ResumeRequest how, WaitRequest wait_how,
TicksRequest tick_period, int sig) {
// Ensure our HW debug registers are up to date before we execute any code.
// If this fails because the task died, the code below will detect it.
set_debug_regs(vm()->get_hw_watchpoints());
bool setup_succeeded = will_resume_execution(how, wait_how, tick_period, sig);
// During record, the process could have died, but otherwise, we control
// process lifecycles and this should never fail.
ASSERT(this, session().is_recording() || setup_succeeded);
if (setup_succeeded) {
if (tick_period != RESUME_NO_TICKS) {
if (tick_period == RESUME_UNLIMITED_TICKS) {
hpc.reset(0);
} else {
ASSERT(this, tick_period >= 0 && tick_period <= MAX_TICKS_REQUEST);
hpc.reset(max<Ticks>(1, tick_period));
}
activate_preload_thread_locals();
}
LOG(debug) << "resuming execution of " << tid << " with "
<< ptrace_req_name<NativeArch>(how)
<< (sig ? string(", signal ") + signal_name(sig) : string())
<< " tick_period " << tick_period << " wait " << wait_how;
set_x86_debug_status(0);
if (is_singlestep_resume(how)) {
work_around_KNL_string_singlestep_bug();
if (is_x86ish(arch())) {
singlestepping_instruction = trapped_instruction_at(this, ip());
if (singlestepping_instruction == TrappedInstruction::CPUID) {
// In KVM virtual machines (and maybe others), singlestepping over CPUID
// executes the following instruction as well. Work around that.
did_set_breakpoint_after_cpuid =
vm()->add_breakpoint(ip() + trapped_instruction_len(singlestepping_instruction), BKPT_INTERNAL);
}
} else if (arch() == aarch64 && is_singlestep_resume(how_last_execution_resumed)) {
// On aarch64, if the last execution was any sort of single step, then
// resuming again with PTRACE_(SYSEMU_)SINGLESTEP will cause a debug fault
// immediately before executing the next instruction in userspace
// (essentially completing the singlestep that got "interrupted" by
// trapping into the kernel). To prevent this, we must re-arm the
// PSTATE.SS bit. (If the last resume was not a single step,
// the kernel will apply this modification).
if (!registers.aarch64_singlestep_flag()) {
registers.set_aarch64_singlestep_flag();
registers_dirty = true;
}
}
}
address_of_last_execution_resume = ip();
how_last_execution_resumed = how;
flush_regs();
}
if (session().is_recording() && !seen_ptrace_exit_event()) {
/* There's a nasty race where a stopped task gets woken up by a SIGKILL
* and advances to the PTRACE_EXIT_EVENT ptrace-stop just before we
* send a PTRACE_CONT. Our PTRACE_CONT will cause it to continue and exit,
* which means we don't get a chance to clean up robust futexes etc.
* Avoid that by doing a waitpid() here to see if it has exited.
* This doesn't fully close the race since in theory we could be preempted
* between the waitpid and the ptrace_if_stopped, giving another task
* a chance to SIGKILL our tracee and advance it to the PTRACE_EXIT_EVENT,
* or just letting the tracee be scheduled to process its pending SIGKILL.
*/
WaitOptions options(tid);
options.block_seconds = 0.0;
WaitResult result = WaitManager::wait_stop_or_exit(options);
ASSERT(this, result.code == WAIT_OK || result.code == WAIT_NO_STATUS);
if (result.code == WAIT_OK) {
// In some (but not all) cases where the child was killed with SIGKILL,
// we don't get PTRACE_EVENT_EXIT before it just exits, because a SIGKILL
// arrived when the child was already in the PTRACE_EVENT_EXIT stop.
// The status could be any exit or fatal-signal status, since this status
// can reflect what caused the thread to exit before the SIGKILL arrived
// and forced it out of the PTRACE_EVENT_EXIT stop.
ASSERT(this,
result.status.ptrace_event() == PTRACE_EVENT_EXIT ||
result.status.reaped())
<< "got " << result.status;
LOG(debug) << "Task " << tid << " exited unexpectedly with status "
<< result.status;
if (did_waitpid(result.status)) {
// We reached a new stop (or actually reaped the task).
// Consider this "resume execution" to be done.
return wait_how != RESUME_WAIT_NO_EXIT;
}
ASSERT(this, result.status.ptrace_event() == PTRACE_EVENT_EXIT)
<< "did_waitpid should always succeed for reaped() statuses";
// The tracee must have been kicked out of PTRACE_EVENT_EXIT
// by a SIGKILL (only possible on older kernels).
// If we were supposed to wait, we've failed.
// We can't wait now because on old kernels tasks can block
// indefinitely even after PTRACE_EVENT_EXIT (e.g. due to coredumping).
// We don't know what state it's in exactly, but registers haven't changed
// since nothing has really happened since the last stop.
set_stopped(false);
return RESUME_NONBLOCKING == wait_how;
}
}
ASSERT(this, setup_succeeded);
ptrace_if_stopped(how, nullptr, (void*)(uintptr_t)sig);
// If ptrace_if_stopped failed, it means we're running along the
// exit path due to a SIGKILL or equivalent, so just like if it
// succeeded, we will eventually receive a wait notification.
set_stopped(false);
extra_registers_known = false;
if (RESUME_NONBLOCKING != wait_how) {
if (!wait()) {
return false;
}
if (wait_how == RESUME_WAIT_NO_EXIT) {
return ptrace_event() != PTRACE_EVENT_EXIT && !was_reaped();
}
}
return true;
}
void Task::set_regs(const Registers& regs) {
// Only allow registers to be set while our copy is the source of truth.
ASSERT(this, is_stopped_ || in_unexpected_exit);
if (registers.original_syscallno() != regs.original_syscallno()) {
orig_syscallno_dirty = true;
}
bool changed = registers != regs;
if (changed) {
registers_dirty = true;
registers = regs;
}
}
void Task::flush_regs() {
if (registers_dirty) {
LOG(debug) << "Flushing registers for tid " << tid << " " << registers;
auto ptrace_regs = registers.get_ptrace_iovec();
#if defined(__i386__) || defined(__x86_64__)
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_PRSTATUS, &ptrace_regs)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
registers_dirty = false;
orig_syscallno_dirty = false;
}
#elif defined(__aarch64__)
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_PRSTATUS, &ptrace_regs)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
registers_dirty = false;
}
#else
#error "Unknown architecture"
#endif
}
#if defined(__i386__) || defined(__x86_64__)
else {
ASSERT(this, !orig_syscallno_dirty);
}
#elif defined(__aarch64__)
if (orig_syscallno_dirty) {
uintptr_t syscall = registers.original_syscallno();
struct iovec vec = { &syscall,
sizeof(syscall) };
LOG(debug) << "Changing syscall to " << syscall;
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_ARM_SYSTEM_CALL, &vec)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
orig_syscallno_dirty = false;
}
}
#endif
}
void Task::set_extra_regs(const ExtraRegisters& regs) {
ASSERT(this, !regs.empty()) << "Trying to set empty ExtraRegisters";
ASSERT(this, regs.arch() == arch())
<< "Trying to set wrong arch ExtraRegisters";
extra_registers = regs;
switch (extra_registers.format()) {
case ExtraRegisters::XSAVE: {
if (xsave_area_size() > 512) {
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_X86_XSTATE, &vec)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
} else {
#if defined(__i386__)
ASSERT(this,
extra_registers.data_.size() == sizeof(user_fpxregs_struct));
if (ptrace_if_stopped(X86Arch::PTRACE_SETFPXREGS, nullptr,
extra_registers.data_.data())) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
#elif defined(__x86_64__)
ASSERT(this,
extra_registers.data_.size() == sizeof(user_fpregs_struct));
if (ptrace_if_stopped(PTRACE_SETFPREGS, nullptr,
extra_registers.data_.data())) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
#endif
}
break;
}
case ExtraRegisters::NT_FPR: {
struct iovec vec = { extra_registers.data_.data(),
extra_registers.data_.size() };
if (ptrace_if_stopped(PTRACE_SETREGSET, NT_PRFPREG, &vec)) {
/* If that failed, the task was killed and it should not matter what
we tried to set. But we will remember that our registers are dirty. */
extra_registers_known = true;
}
break;
}
default:
ASSERT(this, false) << "Unexpected ExtraRegisters format";
}
}
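// DR7 LEN field encodings for watchpoint sizes. Note the encoding is not
// monotonic: 8 bytes is 0b10 while 4 bytes is 0b11.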
enum WatchBytesX86 {
BYTES_1 = 0x00,
BYTES_2 = 0x01,
BYTES_4 = 0x03,
BYTES_8 = 0x02
};
static WatchBytesX86 num_bytes_to_dr_len(size_t num_bytes) {
switch (num_bytes) {
case 1:
return BYTES_1;
case 2:
return BYTES_2;
case 4:
return BYTES_4;
case 8:
return BYTES_8;
default:
FATAL() << "Unsupported breakpoint size " << num_bytes;
return WatchBytesX86(-1); // not reached
}
}
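// Layout of the x86 DR7 debug-control register: a local/global enable bit
// pair for each of DR0-DR3, 8 bits we don't use, then a 2-bit type and
// 2-bit length field per register.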
struct DebugControl {
uintptr_t dr0_local : 1;
uintptr_t dr0_global : 1;
uintptr_t dr1_local : 1;
uintptr_t dr1_global : 1;
uintptr_t dr2_local : 1;
uintptr_t dr2_global : 1;
uintptr_t dr3_local : 1;
uintptr_t dr3_global : 1;
uintptr_t ignored : 8;
WatchType dr0_type : 2;
WatchBytesX86 dr0_len : 2;
WatchType dr1_type : 2;
WatchBytesX86 dr1_len : 2;
WatchType dr2_type : 2;
WatchBytesX86 dr2_len : 2;
WatchType dr3_type : 2;
WatchBytesX86 dr3_len : 2;
void enable(size_t index, WatchBytesX86 size, WatchType type) {
switch (index) {
#define CASE(_i) \
case _i: \
dr##_i##_local = 1; \
dr##_i##_global = 0; \
dr##_i##_type = type; \
dr##_i##_len = size; \
break
CASE(0);
CASE(1);
CASE(2);
CASE(3);
#undef CASE
default:
FATAL() << "Invalid index";
}
}
};
static_assert(sizeof(DebugControl) == sizeof(uintptr_t),
"Can't pack DebugControl");
union PackedDebugControl {
uintptr_t packed;
DebugControl ctl;
};
static bool set_x86_debug_regs(Task *t, const Task::HardwareWatchpoints& regs) {
// Reset the debug status since we're about to change the set
// of programmed watchpoints.
t->set_x86_debug_reg(6, 0);
if (regs.size() > NUM_X86_WATCHPOINTS) {
t->set_x86_debug_reg(7, 0);
return false;
}
// Work around kernel bug https://bugzilla.kernel.org/show_bug.cgi?id=200965.
// For every watchpoint we're going to use, enable it with size 1.
// This will let us set the address freely without potentially triggering
// the kernel bug which will reject an unaligned address if the watchpoint
// is disabled but was non-size-1.
PackedDebugControl dr7;
dr7.packed = 0;
for (size_t i = 0; i < regs.size(); ++i) {
dr7.ctl.enable(i, BYTES_1, WATCH_EXEC);
}
t->set_x86_debug_reg(7, dr7.packed);
if (regs.empty()) {
// Don't do another redundant poke to DR7.
return true;
}
size_t index = 0;
for (auto reg : regs) {
if (!t->set_x86_debug_reg(index, reg.addr.as_int())) {
t->set_x86_debug_reg(7, 0);
return false;
}
dr7.ctl.enable(index, num_bytes_to_dr_len(reg.num_bytes), reg.type);
++index;
}
return t->set_x86_debug_reg(7, dr7.packed);
}
template <typename Arch>
static bool set_debug_regs_arch(Task* t,
const Task::HardwareWatchpoints& regs);
template <> bool set_debug_regs_arch<X86Arch>(Task* t,
const Task::HardwareWatchpoints& regs) {
return set_x86_debug_regs(t, regs);
}
template <> bool set_debug_regs_arch<X64Arch>(Task* t,
const Task::HardwareWatchpoints& regs) {
return set_x86_debug_regs(t, regs);
}
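// Ask the kernel how many hardware breakpoint and watchpoint slots this CPU
// provides; the count is reported in the low byte of dbg_info.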
static void query_max_bp_wp(Task* t, ssize_t* max_bp, ssize_t* max_wp) {
ARM64Arch::user_hwdebug_state bps;
ARM64Arch::user_hwdebug_state wps;
bool ok = t->get_aarch64_debug_regs(NT_ARM_HW_BREAK, &bps) &&
t->get_aarch64_debug_regs(NT_ARM_HW_WATCH, &wps);
ASSERT(t, ok);
*max_bp = bps.dbg_info & 0xff;
*max_wp = wps.dbg_info & 0xff;
}
template <> bool set_debug_regs_arch<ARM64Arch>(Task* t,
const Task::HardwareWatchpoints& regs) {
ARM64Arch::user_hwdebug_state bps;
ARM64Arch::user_hwdebug_state wps;
memset(&bps, 0, sizeof(bps));
memset(&wps, 0, sizeof(wps));
static ssize_t max_bp = -1;
static ssize_t max_wp = -1;
if (max_bp == -1) {
query_max_bp_wp(t, &max_bp, &max_wp);
}
// Having at least one of each is architecturally guaranteed
ASSERT(t, max_bp >= 1 && max_wp >= 1);
ssize_t cur_bp = 0;
ssize_t cur_wp = 0;
for (auto reg : regs) {
// GDB always splits these into nicely aligned platform chunks for us,
// but let's be general and support unaligned registers also.
size_t len = reg.num_bytes;
remote_ptr<uint8_t> addr = reg.addr.cast<uint8_t>();
while (len > 0) {
ARM64Arch::hw_bp* bp = nullptr;
if (reg.type == WATCH_EXEC) {
if (cur_bp == max_bp) {
return false;
}
bp = &bps.dbg_regs[cur_bp++];
} else {
if (cur_wp == max_wp) {
return false;
}
bp = &wps.dbg_regs[cur_wp++];
}
ARM64Arch::hw_breakpoint_ctrl ctrl;
memset(&ctrl, 0, sizeof(ctrl));
switch (reg.type) {
case WATCH_EXEC:
ctrl.type = ARM_WATCH_EXEC;
break;
case WATCH_WRITE:
ctrl.type = ARM_WATCH_WRITE;
break;
case WATCH_READWRITE:
ctrl.type = ARM_WATCH_READWRITE;
break;
}
ctrl.enabled = 1;
ctrl.priv = ARM_PRIV_EL0;
uintptr_t off = (uintptr_t)addr.as_int() % 8;
size_t cur_bp_len = std::min(8-off, len);
// This is a byte mask of which particular bytes in the 8-byte word at `addr`
// to watch.
uintptr_t mask = ((((uintptr_t)1) << cur_bp_len) - 1) << off;
ASSERT(t, (mask & ~0xff) == 0);
ctrl.length = mask;
bp->addr = addr.as_int() - off;
bp->ctrl = ctrl;
len -= cur_bp_len;
addr += cur_bp_len;
}
}
// max_bp rather than cur_bp to make sure to clear out any unused slots
return t->set_aarch64_debug_regs(NT_ARM_HW_BREAK, &bps, max_bp) &&
t->set_aarch64_debug_regs(NT_ARM_HW_WATCH, &wps, max_wp);
}
static bool set_debug_regs_internal(Task* t, const Task::HardwareWatchpoints& regs) {
RR_ARCH_FUNCTION(set_debug_regs_arch, t->arch(), t, regs);
}
bool Task::set_debug_regs(const HardwareWatchpoints& regs) {
if (regs == current_hardware_watchpoints) {
return true;
}
bool ret = set_debug_regs_internal(this, regs);
if (ret) {
current_hardware_watchpoints = regs;
} else {
current_hardware_watchpoints.clear();
}
return ret;
}
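// Insert `desc` into our cached list of thread areas, replacing any existing
// entry with the same entry_number.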
static void set_thread_area(std::vector<X86Arch::user_desc>& thread_areas_,
X86Arch::user_desc desc) {
for (auto& t : thread_areas_) {
if (t.entry_number == desc.entry_number) {
t = desc;
return;
}
}
thread_areas_.push_back(desc);
}
void Task::set_thread_area(remote_ptr<X86Arch::user_desc> tls) {
// We rely on the fact that user_desc is word-size-independent.
DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
auto desc = read_mem(tls);
rr::set_thread_area(thread_areas_, desc);
}
int Task::emulate_set_thread_area(int idx, X86Arch::user_desc desc) {
DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
errno = 0;
fallible_ptrace(NativeArch::PTRACE_SET_THREAD_AREA, idx, &desc);
if (errno != 0) {
return errno;
}
desc.entry_number = idx;
rr::set_thread_area(thread_areas_, desc);
return 0;
}
int Task::emulate_get_thread_area(int idx, X86Arch::user_desc& desc) {
DEBUG_ASSERT(arch() == x86 || arch() == x86_64);
LOG(debug) << "Emulating PTRACE_GET_THREAD_AREA";
errno = 0;
fallible_ptrace(NativeArch::PTRACE_GET_THREAD_AREA, idx, &desc);
return errno;
}
pid_t Task::tgid() const { return tg->tgid; }
pid_t Task::real_tgid() const {
// Unless we're recording, each task is in its own thread group
return session().is_recording() ? tgid() : tid;
}
const string& Task::trace_dir() const {
const TraceStream* trace = trace_stream();
ASSERT(this, trace) << "Trace directory not available";
return trace->dir();
}
uint32_t Task::trace_time() const {
const TraceStream* trace = trace_stream();
return trace ? trace->time() : 0;
}
static bool is_signal_triggered_by_ptrace_interrupt(int group_stop_sig) {
switch (group_stop_sig) {
case SIGTRAP:
// We sometimes see SIGSTOP at interrupts, though the
// docs don't mention that.
case SIGSTOP:
return true;
default:
return false;
}
}
// This function doesn't really need to do anything. The signal will cause
// waitpid to return EINTR and that's all we need.
static void handle_alarm_signal(__attribute__((unused)) int sig) {}
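// Ask the kernel to stop the tracee with PTRACE_INTERRUPT. Returns false if
// the tracee is already gone (ESRCH); otherwise records that either of the
// next two stops might be the resulting group stop.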
bool Task::do_ptrace_interrupt() {
errno = 0;
fallible_ptrace(PTRACE_INTERRUPT, nullptr, nullptr);
if (errno) {
ASSERT(this, errno == ESRCH) << "Unexpected PTRACE_INTERRUPT error " << errno;
return false;
}
expecting_ptrace_interrupt_stop = 2;
return true;
}
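// If we recently issued a PTRACE_INTERRUPT, decide whether `status` is the
// group stop it induced. See the comment in did_waitpid() for why either of
// the next two stops may be that group stop.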
bool Task::account_for_potential_ptrace_interrupt_stop(WaitStatus status) {
if (expecting_ptrace_interrupt_stop > 0) {
--expecting_ptrace_interrupt_stop;
if (is_signal_triggered_by_ptrace_interrupt(status.group_stop())) {
expecting_ptrace_interrupt_stop = 0;
return true;
}
}
return false;
}
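// Block until the tracee reports a stop or dies. During recording,
// `interrupt_after_elapsed` >= 0 means we send a PTRACE_INTERRUPT after
// waiting that many seconds without a stop. Returns false if the tracee
// vanished without a PTRACE_EVENT_EXIT notification.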
bool Task::wait(double interrupt_after_elapsed) {
LOG(debug) << "going into blocking wait for " << tid << " ...";
ASSERT(this, session().is_recording() || interrupt_after_elapsed == -1);
bool sent_wait_interrupt = false;
WaitResult result;
while (true) {
if (interrupt_after_elapsed == 0 && !sent_wait_interrupt) {
// If this fails, the tracee must be a zombie or altogether gone,
// in which case we should detect that status change later.
do_ptrace_interrupt();
if (session().is_recording()) {
// Force this timeslice to end
session().as_record()->scheduler().expire_timeslice();
}
sent_wait_interrupt = true;
}
WaitOptions options(tid);
if (interrupt_after_elapsed > 0) {
options.block_seconds = interrupt_after_elapsed;
interrupt_after_elapsed = 0;
}
result = WaitManager::wait_stop(options);
if (result.code == WAIT_OK) {
break;
}
if (result.code == WAIT_NO_CHILD) {
/* The process died without us getting a PTRACE_EXIT_EVENT notification.
* This is possible if the process receives a SIGKILL while in the exit
* event stop, but before we were able to read the event notification.
*/
return false;
}
ASSERT(this, result.code == WAIT_NO_STATUS);
}
if (sent_wait_interrupt) {
LOG(warn) << "Forced to PTRACE_INTERRUPT tracee";
if (!is_signal_triggered_by_ptrace_interrupt(result.status.group_stop())) {
LOG(warn) << " PTRACE_INTERRUPT raced with another event " << result.status;
}
}
return did_waitpid(result.status);
}
void Task::canonicalize_regs(SupportedArch syscall_arch) {
ASSERT(this, is_stopped_);
if (registers.arch() == x86_64) {
if (syscall_arch == x86) {
// The int $0x80 compatibility handling clears r8-r11
// (see arch/x86/entry/entry_64_compat.S). The sysenter compatibility
// handling also clears r12-r15. However, to actually make such a syscall,
// the user process would have to switch itself into compatibility mode,
// which, though possible, does not appear to actually be done by any
// real application (unlike int $0x80, which is accessible from 64-bit
// mode as well).
registers_dirty |= registers.set_r8(0x0);
registers_dirty |= registers.set_r9(0x0);
registers_dirty |= registers.set_r10(0x0);
registers_dirty |= registers.set_r11(0x0);
} else {
// x86-64 'syscall' instruction copies RFLAGS to R11 on syscall entry.
// If we single-stepped into the syscall instruction, the TF flag will be
// set in R11. We don't want the value in R11 to depend on whether we
// were single-stepping during record or replay, possibly causing
// divergence.
// This doesn't matter when exiting a sigreturn syscall, since it
// restores the original flags.
// For untraced syscalls, the untraced-syscall entry point code (see
// write_rr_page) does this itself.
// We tried just clearing %r11, but that caused hangs in
// Ubuntu/Debian kernels.
// Making this match the flags makes this operation idempotent, which is
// helpful.
registers_dirty |= registers.set_r11(0x246);
// x86-64 'syscall' instruction copies return address to RCX on syscall
// entry. rr-related kernel activity normally sets RCX to -1 at some point
// during syscall execution, but apparently in some (unknown) situations
// probably involving untraced syscalls, that doesn't happen. To avoid
// potential issues, forcibly replace RCX with -1 always.
// This doesn't matter (and we should not do this) when exiting a
// sigreturn syscall, since it will restore the original RCX and we don't
// want to clobber that.
// For untraced syscalls, the untraced-syscall entry point code (see
// write_rr_page) does this itself.
registers_dirty |= registers.set_cx((intptr_t)-1);
}
// On kernel 3.13.0-68-generic #111-Ubuntu SMP we have observed a failed
// execve() clearing all flags during recording. During replay we emulate
// the exec so this wouldn't happen. Just reset all flags so everything's
// consistent.
// 0x246 is ZF+PF+IF+reserved, the flags value that results from clearing a
// register with "xor reg, reg".
registers_dirty |= registers.set_flags(0x246);
} else if (registers.arch() == x86) {
// The x86 SYSENTER handling in Linux modifies EBP and EFLAGS on entry.
// EBP is the potential sixth syscall parameter, stored on the user stack.
// The EFLAGS changes are described here:
// http://linux-kernel.2935.n7.nabble.com/ia32-sysenter-target-does-not-preserve-EFLAGS-td1074164.html
// In a VMWare guest, the modifications to EFLAGS appear to be
// nondeterministic. Cover that up by setting EFLAGS to reasonable values
// now.
registers_dirty |= registers.set_flags(0x246);
}
}
bool Task::read_aarch64_tls_register(uintptr_t *result) {
struct iovec vec = { result, sizeof(*result) };
return ptrace_if_stopped(PTRACE_GETREGSET, NT_ARM_TLS, &vec);
}
void Task::set_aarch64_tls_register(uintptr_t val) {
struct iovec vec = { &val, sizeof(val) };
ptrace_if_stopped(PTRACE_SETREGSET, NT_ARM_TLS, &vec);
/* If that failed, the task was killed and it should not matter what
we tried to set. */
}
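// Process a wait status for this task: refresh the register cache and tick
// count, account for PTRACE_INTERRUPT-induced stops, and undo any
// single-step or breakpoint fixups. Returns false if the task turns out to
// have died unexpectedly, in which case the stop is treated as never having
// happened.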
bool Task::did_waitpid(WaitStatus status) {
if (is_detached_proxy() &&
(status.stop_sig() == SIGSTOP || status.stop_sig() == SIGCONT)) {
LOG(debug) << "Task " << tid << " is a detached proxy, ignoring status " << status;
return true;
}
LOG(debug) << " Task " << tid << " changed status to " << status;
intptr_t original_syscallno = registers.original_syscallno();
LOG(debug) << " (refreshing register cache)";
Ticks more_ticks = 0;
if (status.reaped()) {
was_reaped_ = true;
if (handled_ptrace_exit_event_) {
LOG(debug) << "Reaped task late " << tid;
// We did not reap this task when it exited, likely because it was a
// thread-group leader blocked on the exit of the other members of
// its thread group. This wait has now reaped the task, so all we need to do
// here is get out quickly; the higher-level function will go ahead
// and delete us.
wait_status = status;
return true;
}
LOG(debug) << "Unexpected process reap for " << tid;
/* Mark buffers as having been destroyed. We missed our chance
* to destroy them normally in handle_ptrace_exit_event.
* XXX: We could try to find some tasks here to unmap our buffers, but it
* seems hardly worth it.
*/
destroy_buffers(nullptr, nullptr);
} else {
bool was_stopped = is_stopped_;
// Mark as stopped now. If we fail one of the ticks assertions below,
// the test-monitor (or user) might want to attach the emergency debugger,
// which needs to know that the tracee is stopped.
set_stopped(true);
// After PTRACE_INTERRUPT, either of the next two stops may be a group stop
// caused by that PTRACE_INTERRUPT (or neither may be). This is because
// PTRACE_INTERRUPT generally lets other stops win (and thus doesn't inject
// its own stop), but if the other stop had already finished processing,
// even if we hadn't seen it yet, the stop will still be queued, so we could
// see the other stop and then the PTRACE_INTERRUPT group stop.
// When we issue PTRACE_INTERRUPT, we set this counter to 2, and here we
// decrement it on every stop, so that while the counter is positive any
// group stop could be one induced by PTRACE_INTERRUPT.
if (account_for_potential_ptrace_interrupt_stop(status)) {
// Assume this was PTRACE_INTERRUPT and thus treat this as
// TIME_SLICE_SIGNAL instead.
status = WaitStatus::for_stop_sig(PerfCounters::TIME_SLICE_SIGNAL);
memset(&pending_siginfo, 0, sizeof(pending_siginfo));
pending_siginfo.si_signo = PerfCounters::TIME_SLICE_SIGNAL;
pending_siginfo.si_fd = hpc.ticks_interrupt_fd();
pending_siginfo.si_code = POLL_IN;
} else if (status.stop_sig()) {
if (!ptrace_if_stopped(PTRACE_GETSIGINFO, nullptr, &pending_siginfo)) {
LOG(debug) << "Unexpected process death getting siginfo for " << tid;
// Let's pretend this stop never happened.
set_stopped(false);
in_unexpected_exit = true;
return false;
}
}
// A SIGKILL or equivalent can cause a task to exit without us having run it, in
// which case we might have pending register changes for it that are now
// irrelevant. In that case we just throw away our register changes and use
// whatever the kernel now has.
if (status.ptrace_event() != PTRACE_EVENT_EXIT) {
ASSERT(this, !registers_dirty) << "Registers shouldn't already be dirty (status is " << status << ")";
}
// If the task was already stopped, we don't need to read the registers.
// In fact if we didn't start the thread, we may not have flushed dirty
// registers but still received a PTRACE_EVENT_EXIT, in which case the
// task's register values are not what they should be.
if (!was_stopped && !registers_dirty) {
LOG(debug) << "Requesting registers from tracee " << tid;
NativeArch::user_regs_struct ptrace_regs;
#if defined(__i386__) || defined(__x86_64__)
if (ptrace_if_stopped(PTRACE_GETREGS, nullptr, &ptrace_regs)) {
registers.set_from_ptrace(ptrace_regs);
// Check the architecture of the task by looking at the
// cs segment register and checking if that segment is a long mode segment
// (Linux always uses GDT entries for this, which are globally the same).
SupportedArch a = is_long_mode_segment(registers.cs()) ? x86_64 : x86;
if (a == x86_64 && NativeArch::arch() == x86) {
FATAL() << "Sorry, tracee " << tid << " is executing in x86-64 mode"
<< " and that's not supported with a 32-bit rr.";
}
if (a != registers.arch()) {
registers.set_arch(a);
registers.set_from_ptrace(ptrace_regs);
}
// Only adjust tick count if we were able to read registers.
// For example if the task is already reaped we don't have new
// register values and we don't want to read a ticks value
// that mismatches our registers.
more_ticks = hpc.read_ticks(this);
}
#elif defined(__aarch64__)
struct iovec vec = { &ptrace_regs,
sizeof(ptrace_regs) };
if (ptrace_if_stopped(PTRACE_GETREGSET, NT_PRSTATUS, &vec)) {
registers.set_from_ptrace(ptrace_regs);
more_ticks = hpc.read_ticks(this);
}
#else
#error detect architecture here
#endif
else {
LOG(debug) << "Unexpected process death for " << tid;
// Let's pretend this stop never happened.
// Note that pending_siginfo may have been overwritten above,
// but in that case we're going to ignore this signal-stop
// so it doesn't matter.
set_stopped(false);
in_unexpected_exit = true;
return false;
}
}
}
wait_status = status;
// We stop counting here because there may be things we want to do to the
// tracee that would otherwise generate ticks.
hpc.stop_counting();
session().accumulate_ticks_processed(more_ticks);
ticks += more_ticks;
if (was_reaped_) {
ASSERT(this, !handled_ptrace_exit_event_);
} else if (status.ptrace_event() == PTRACE_EVENT_EXIT) {
ASSERT(this, !handled_ptrace_exit_event_);
seen_ptrace_exit_event_ = true;
} else {
if (arch() == x86 || arch() == x86_64) {
// Clear the single step flag in case we got here by taking a signal
// after asking for a single step. We want to avoid taking that single
// step after the signal resumes, so the singlestep flag needs to be
// cleared. On aarch64, the kernel does this for us.
if (registers.x86_singlestep_flag()) {
registers.clear_x86_singlestep_flag();
registers_dirty = true;
}
if (last_resume_orig_cx != 0) {
uintptr_t new_cx = registers.cx();
/* Un-fudge registers, if we fudged them to work around the KNL hardware
quirk */
unsigned cutoff = single_step_coalesce_cutoff();
ASSERT(this, new_cx == cutoff - 1 || new_cx == cutoff);
registers.set_cx(last_resume_orig_cx - cutoff + new_cx);
registers_dirty = true;
}
last_resume_orig_cx = 0;
}
if (did_set_breakpoint_after_cpuid) {
remote_code_ptr bkpt_addr =
address_of_last_execution_resume + trapped_instruction_len(singlestepping_instruction);
if (ip().undo_executed_bkpt(arch()) == bkpt_addr) {
Registers r = regs();
r.set_ip(bkpt_addr);
set_regs(r);
}
vm()->remove_breakpoint(bkpt_addr, BKPT_INTERNAL);
did_set_breakpoint_after_cpuid = false;
}
if ((singlestepping_instruction == TrappedInstruction::PUSHF ||
singlestepping_instruction == TrappedInstruction::PUSHF16) &&
ip() == address_of_last_execution_resume +
trapped_instruction_len(singlestepping_instruction)) {
// We singlestepped through a pushf. Clear TF bit on stack.
auto sp = regs().sp().cast<uint16_t>();
// If this address is invalid then we should have segfaulted instead of
// retiring the instruction!
uint16_t val = read_mem(sp);
write_mem(sp, (uint16_t)(val & ~X86_TF_FLAG));
}
singlestepping_instruction = TrappedInstruction::NONE;
// We might have singlestepped at the resumption address and just exited
// the kernel without executing the breakpoint at that address.
// The kernel usually (always?) singlesteps an extra instruction when
// we do this with PTRACE_SYSEMU_SINGLESTEP, but rr's ptrace emulation
// doesn't and it's kind of a kernel bug.
if (as->get_breakpoint_type_at_addr(address_of_last_execution_resume) !=
BKPT_NONE &&
stop_sig() == SIGTRAP && !ptrace_event() &&
ip().undo_executed_bkpt(arch()) == address_of_last_execution_resume) {
ASSERT(this, more_ticks == 0);
// When we resume execution and immediately hit a breakpoint, the original
// syscall number can be reset to -1. Undo that, so that the register
// state matches the state we'd be in if we hadn't resumed. ReplayTimeline
// depends on resume-at-a-breakpoint being a noop.
registers.set_original_syscallno(original_syscallno);
registers_dirty = true;
}
// If we're in the rr page, we may have just returned from an untraced
// syscall there and while in the rr page registers need to be consistent
// between record and replay. During replay most untraced syscalls are
// replaced with "xor eax,eax" (right after a "movq -1, %rcx") so
// rcx is always -1, but during recording it sometimes isn't after we've
// done a real syscall.
if (is_in_rr_page()) {
// N.B.: Cross architecture syscalls don't go through the rr page, so we
// know what the architecture is.
canonicalize_regs(arch());
}
}
did_wait();
return true;
}
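// On architectures where clone() passes TLS as a user_desc pointer (x86),
// record the descriptor the tracee passed in our cached thread areas.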
template <typename Arch>
static void set_tls_from_clone_arch(Task* t, remote_ptr<void> tls) {
if (Arch::clone_tls_type == Arch::UserDescPointer) {
t->set_thread_area(tls.cast<X86Arch::user_desc>());
}
}
static void set_tls_from_clone(Task* t, remote_ptr<void> tls) {
RR_ARCH_FUNCTION(set_tls_from_clone_arch, t->arch(), t, tls);
}
template <typename Arch>
static void setup_preload_thread_locals_from_clone_arch(Task* t, Task* origin) {
void* local_addr = preload_thread_locals_local_addr(*t->vm());
if (local_addr) {
t->activate_preload_thread_locals();
auto locals = reinterpret_cast<preload_thread_locals<Arch>*>(local_addr);
auto origin_locals = reinterpret_cast<const preload_thread_locals<Arch>*>(
origin->fetch_preload_thread_locals());
locals->alt_stack_nesting_level = origin_locals->alt_stack_nesting_level;
// vfork() will restore the flags on the way out since it's on the same
// stack.
locals->saved_flags = origin_locals->saved_flags;
// clone() syscalls set the child stack pointer, so the child is no
// longer in the syscallbuf code even if the parent was.
if (PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE >= 8 * 2) {
// On aarch64, we use this to save and restore some register values across clone
memcpy(locals->stub_scratch_2, origin_locals->stub_scratch_2, 8 * 2);
}
}
}
void Task::setup_preload_thread_locals_from_clone(Task* origin) {
RR_ARCH_FUNCTION(setup_preload_thread_locals_from_clone_arch, this->arch(), this, origin);
}
Task* Task::clone(CloneReason reason, int flags, remote_ptr<void> stack,
remote_ptr<void> tls, remote_ptr<int>, pid_t new_tid,
pid_t new_rec_tid, uint32_t new_serial,
Session* other_session,
FdTable::shr_ptr new_fds,
ThreadGroup::shr_ptr new_tg) {
Session* new_task_session = &session();
if (other_session) {
ASSERT(this, reason != TRACEE_CLONE);
new_task_session = other_session;
} else {
ASSERT(this, reason == TRACEE_CLONE);
}
string n;
if (!session().is_recording()) {
n = name();
}
Task* t =
new_task_session->new_task(new_tid, new_rec_tid, new_serial, arch(), n);
if (CLONE_SHARE_VM & flags) {
t->as = as;
if (!stack.is_null()) {
remote_ptr<void> last_stack_byte = stack - 1;
if (t->as->has_mapping(last_stack_byte)) {
auto mapping = t->as->mapping_of(last_stack_byte);
if (!mapping.recorded_map.is_heap()) {
const KernelMapping& m = mapping.map;
LOG(debug) << "mapping stack for " << new_tid << " at " << m;
t->as->map(t, m.start(), m.size(), m.prot(), m.flags(),
m.file_offset_bytes(), "[stack]", m.device(), m.inode());
}
}
}
// rseq state is not cloned into new threads
} else {
t->as = new_task_session->clone(t, as);
if (rseq_state) {
// rseq state is cloned into non-thread children
t->rseq_state = make_unique<RseqState>(*rseq_state);
}
}
t->syscallbuf_size = syscallbuf_size;
t->preload_globals = preload_globals;
t->seccomp_bpf_enabled = seccomp_bpf_enabled;
// FdTable is either shared or copied, so the contents of
// syscallbuf_fds_disabled_child are still valid.
if (CLONE_SHARE_FILES & flags) {
ASSERT(this, !new_fds);
t->fds = fds;
} else if (new_fds) {
t->fds = new_fds;
} else {
t->fds = fds->clone();
}
t->fds->insert_task(t);
t->top_of_stack = stack;
// wait() before trying to do anything that might need to
// use ptrace to access memory
bool ok = t->wait();
ASSERT(t, ok) << "Task " << t->tid << " killed unexpectedly; not sure how to handle this";
t->post_wait_clone(this, flags);
if (CLONE_SHARE_THREAD_GROUP & flags) {
ASSERT(this, !new_tg);
t->tg = tg;
} else {
if (new_tg) {
t->tg = new_tg;
} else {
t->tg = new_task_session->clone(t, tg);
}
}
t->tg->insert_task(t);
t->open_mem_fd_if_needed();
t->thread_areas_ = thread_areas_;
if (CLONE_SET_TLS & flags) {
set_tls_from_clone(t, tls);
}
t->as->insert_task(t);
if (reason == TRACEE_CLONE) {
if (!(CLONE_SHARE_VM & flags)) {
// Unmap syscallbuf and scratch for tasks running the original address
// space.
AutoRemoteSyscalls remote(t);
for (Task* tt : as->task_set()) {
// Leak the scratch buffer for the task we cloned from. We need to do
// this because we may be using part of it for the syscallbuf stack
// and unmapping it now would cause a crash in the new task.
if (tt != this) {
t->unmap_buffers_for(remote, tt, tt->syscallbuf_child);
}
}
as->did_fork_into(t);
}
// `t` doesn't have a syscallbuf and `t->desched_fd_child`/
// `t->cloned_file_data_fd_child` are both -1.
if (session().is_replaying()) {
// `t` is not really sharing our fd table, in fact our real fd table
// is only used by this task, so it only contains our syscallbuf fds (if any),
// not the fds for any other task. So, only really-close the fds for 'this'.
// We still need to update t's `fds` table to indicate that those fds were
// closed during recording, though, otherwise we may get FileMonitor
// collisions.
AutoRemoteSyscalls remote(t);
for (Task* tt : fds->task_set()) {
t->close_buffers_for(remote, tt, tt == this);
}
} else if (CLONE_SHARE_FILES & flags) {
// `t` is sharing our fd table, so it should not close anything.
} else {
// Close syscallbuf fds for all tasks using the original fd table.
AutoRemoteSyscalls remote(t);
for (Task* tt : fds->task_set()) {
t->close_buffers_for(remote, tt, true);
}
}
}
t->post_vm_clone(reason, flags, this);
// Copy debug register values. We assume the kernel will either copy debug
// registers into the new task, or the debug registers will be unset
// in the new task. If we have no HW watchpoints then debug registers
// will definitely be unset in the new task so there is nothing to do.
if (!current_hardware_watchpoints.empty()) {
// Copy debug register settings into the new task so we're in a known state.
bool ret = set_debug_regs_internal(t, current_hardware_watchpoints);
if (!ret) {
LOG(warn) << "Failed to initialize new task's debug registers; "
<< "this should always work since we were able to set them in the old task, "
<< "but the new task might have been killed";
}
t->current_hardware_watchpoints = current_hardware_watchpoints;
}
return t;
}
bool Task::post_vm_clone(CloneReason reason, int flags, Task* origin) {
bool created_preload_thread_locals_mapping = false;
if (!(CLONE_SHARE_VM & flags)) {
created_preload_thread_locals_mapping = this->as->post_vm_clone(this);
}
this->as->fd_tables_changed();
if (reason == TRACEE_CLONE) {
setup_preload_thread_locals_from_clone(origin);
}
return created_preload_thread_locals_mapping;
}
Task* Task::os_fork_into(Session* session, FdTable::shr_ptr new_fds) {
AutoRemoteSyscalls remote(this, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS);
Task* child =
os_clone(Task::SESSION_CLONE_LEADER, session, remote, rec_tid, serial,
// Most likely, we'll be setting up a
// CLEARTID futex. That's not done
// here, but rather later in
// |copy_state()|.
//
// We also don't use any of the SETTID
// flags because that earlier work will
// be copied by fork()ing the address
// space.
SIGCHLD,
std::move(new_fds));
// When we forked ourselves, the child inherited the setup we
// did to make the clone() call. So we have to "finish" the
// remote calls (i.e. undo fudged state) in the child too,
// even though we never made any syscalls there.
remote.restore_state_to(child);
return child;
}
Task* Task::os_clone_into(const CapturedState& state,
AutoRemoteSyscalls& remote,
const ClonedFdTables& cloned_fd_tables,
ThreadGroup::shr_ptr new_tg) {
auto fdtable_entry = cloned_fd_tables.find(state.fdtable_identity);
DEBUG_ASSERT(fdtable_entry != cloned_fd_tables.end() &&
"All captured fd tables should be in cloned_fd_tables");
return os_clone(Task::SESSION_CLONE_NONLEADER, &remote.task()->session(),
remote, state.rec_tid, state.serial,
// We don't actually /need/ to specify the
// SIGHAND/SYSVMEM flags because those things
// are emulated in the tracee. But we use the
// same flags as glibc to be on the safe side
// wrt kernel bugs.
//
// We don't pass CLONE_SETTLS here *only*
// because we'll do it later in
// |copy_state()|.
//
// See |os_fork_into()| above for discussion
// of the CTID flags.
(CLONE_VM | CLONE_FS | CLONE_SIGHAND |
CLONE_SYSVSEM),
fdtable_entry->second,
std::move(new_tg),
state.top_of_stack);
}
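// Replicate the captured task's TLS state in the remote task: via
// set_thread_area calls on x86, or the TLS register (NT_ARM_TLS) on aarch64.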
template <typename Arch>
static void copy_tls_arch(const Task::CapturedState& state,
AutoRemoteSyscalls& remote) {
if (Arch::clone_tls_type == Arch::UserDescPointer) {
for (const auto& t : state.thread_areas) {
AutoRestoreMem remote_tls(remote, (const uint8_t*)&t, sizeof(t));
LOG(debug) << " setting tls " << remote_tls.get();
remote.infallible_syscall(
syscall_number_for_set_thread_area(remote.arch()),
remote_tls.get().as_int());
}
} else if (Arch::arch() == aarch64) {
remote.task()->set_aarch64_tls_register(state.tls_register);
}
}
static void copy_tls(const Task::CapturedState& state,
AutoRemoteSyscalls& remote) {
RR_ARCH_FUNCTION(copy_tls_arch, remote.arch(), state, remote);
}
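// Read a numeric field such as "pos:" or "Pid:" from /proc/<tid>/fdinfo/<fd>.
// Returns -1 if the field is absent, or if the file can't be opened and
// `must_exist` is false.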
static int64_t fdinfo_field(Task* t, int fd, const char* field, bool must_exist) {
char buf[1024];
sprintf(buf, "/proc/%d/fdinfo/%d", t->tid, fd);
ScopedFd info(buf, O_RDONLY);
if (must_exist) {
ASSERT(t, info.is_open()) << "Can't open " << buf;
} else if (!info.is_open()) {
return -1;
}
ssize_t bytes = read(info, buf, sizeof(buf) - 1);
ASSERT(t, bytes > 0);
buf[bytes] = 0;
char* p = buf;
size_t field_len = strlen(field);
while (*p) {
if (strncmp(p, field, field_len) == 0) {
char* end;
long long int r = strtoll(p + field_len, &end, 10);
ASSERT(t, *end == 0 || *end == '\n');
return r;
}
while (*p) {
if (*p == '\n') {
++p;
break;
}
++p;
}
}
return -1;
}
int64_t Task::fd_offset(int fd) {
return fdinfo_field(this, fd, "pos:", true);
}
pid_t Task::pid_of_pidfd(int fd) {
return fdinfo_field(this, fd, "Pid:", false);
}
Task::CapturedState Task::capture_state() {
CapturedState state;
state.rec_tid = rec_tid;
state.own_namespace_rec_tid = own_namespace_rec_tid;
state.fdtable_identity = uintptr_t(fds.get());
state.serial = serial;
state.tguid = thread_group()->tguid();
state.regs = regs();
state.extra_regs = extra_regs();
state.prname = name();
if (arch() == aarch64) {
bool ok = read_aarch64_tls_register(&state.tls_register);
ASSERT(this, ok) << "Tracee died; this shouldn't happen in replay";
}
if (rseq_state) {
state.rseq_state = make_unique<RseqState>(*rseq_state);
}
state.thread_areas = thread_areas_;
state.desched_fd_child = desched_fd_child;
state.cloned_file_data_fd_child = cloned_file_data_fd_child;
state.cloned_file_data_fname = cloned_file_data_fname;
state.cloned_file_data_offset =
cloned_file_data_fd_child >= 0
? fd_offset(cloned_file_data_fd_child)
: 0;
memcpy(&state.thread_locals, fetch_preload_thread_locals(),
PRELOAD_THREAD_LOCALS_SIZE);
state.syscallbuf_child = syscallbuf_child;
state.syscallbuf_size = syscallbuf_size;
state.preload_globals = preload_globals;
state.scratch_ptr = scratch_ptr;
state.scratch_size = scratch_size;
state.wait_status = wait_status;
state.ticks = ticks;
state.top_of_stack = top_of_stack;
return state;
}
void Task::copy_state(const CapturedState& state) {
set_regs(state.regs);
set_extra_regs(state.extra_regs);
{
AutoRemoteSyscalls remote(this);
set_name(remote, state.prname);
copy_tls(state, remote);
thread_areas_ = state.thread_areas;
syscallbuf_size = state.syscallbuf_size;
ASSERT(this, !syscallbuf_child)
<< "Syscallbuf should not already be initialized in clone";
if (!state.syscallbuf_child.is_null()) {
// All these fields are preserved by the fork.
desched_fd_child = state.desched_fd_child;
cloned_file_data_fd_child = state.cloned_file_data_fd_child;
cloned_file_data_fname = state.cloned_file_data_fname;
if (cloned_file_data_fd_child >= 0) {
ScopedFd fd(cloned_file_data_fname.c_str(), session().as_record() ?
O_RDWR : O_RDONLY);
remote.infallible_send_fd_dup(fd, cloned_file_data_fd_child, O_CLOEXEC);
remote.infallible_lseek_syscall(
cloned_file_data_fd_child, state.cloned_file_data_offset, SEEK_SET);
}
syscallbuf_child = state.syscallbuf_child;
}
}
preload_globals = state.preload_globals;
ASSERT(this, as->thread_locals_tuid() != tuid());
memcpy(&thread_locals, &state.thread_locals, PRELOAD_THREAD_LOCALS_SIZE);
// The scratch buffer (for now) is merely a private mapping in
// the remote task. The CoW copy made by fork()'ing the
// address space has the semantics we want. It's not used in
// replay anyway.
scratch_ptr = state.scratch_ptr;
scratch_size = state.scratch_size;
// Whatever the captured task's last wait status was is what ours would
// have been.
wait_status = state.wait_status;
ticks = state.ticks;
own_namespace_rec_tid = state.own_namespace_rec_tid;
if (state.rseq_state) {
rseq_state = make_unique<RseqState>(*state.rseq_state);
}
}
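// Address of the next record slot in the syscallbuf: just past the header
// plus the bytes recorded so far.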
remote_ptr<const struct syscallbuf_record> Task::next_syscallbuf_record() {
return ((syscallbuf_child + 1).cast<uint8_t>() +
read_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes)))
.cast<const struct syscallbuf_record>();
}
long Task::stored_record_size(
remote_ptr<const struct syscallbuf_record> record) {
return ::stored_record_size(read_mem(REMOTE_PTR_FIELD(record, size)));
}
long Task::fallible_ptrace(int request, remote_ptr<void> addr, void* data) {
return ptrace(_ptrace_request(request), tid, addr, data);
}
bool Task::open_mem_fd() {
// Use ptrace to read/write during open_mem_fd
as->set_mem_fd(ScopedFd());
if (!is_stopped_) {
LOG(warn) << "Can't retrieve mem fd for " << tid <<
"; process not stopped, racing with exec?";
return false;
}
/**
* We're expecting that either we or the child can read the mem fd.
* It's possible for both to not be the case (us on certain kernel
* configurations, the child after it did a setuid).
*/
char pid_path[PATH_MAX];
sprintf(pid_path, "/proc/%d", tid);
ScopedFd dir_fd(pid_path, O_PATH);
if (dir_fd < 0) {
LOG(info) << "Can't retrieve mem fd for " << tid << "; process no longer exists??";
return false;
}
ScopedFd fd = ScopedFd::openat(dir_fd, "mem", O_RDWR | O_CLOEXEC);
if (!fd.is_open()) {
LOG(debug) << "Falling back to the remote fd dance";
AutoRemoteSyscalls remote(this);
int remote_mem_dir_fd = remote.send_fd(dir_fd);
if (remote_mem_dir_fd < 0) {
LOG(info) << "Can't retrieve mem fd for " << tid << "; process is exiting?";
return false;
}
char mem[] = "mem";
// If the remote dies, any of these can fail. That's ok, we'll just
// find that the fd wasn't successfully opened.
AutoRestoreMem remote_path(remote, mem, sizeof(mem));
int remote_mem_fd = remote.syscall(syscall_number_for_openat(arch()),
remote_mem_dir_fd, remote_path.get(), O_RDWR);
if (remote_mem_fd < 0) {
LOG(info) << "Can't retrieve mem fd for " << tid
<< "; couldn't open /proc/...mem; errno=" << errno_name(-remote_mem_fd);
return false;
}
fd = remote.retrieve_fd(remote_mem_fd);
remote.infallible_close_syscall_if_alive(remote_mem_fd);
remote.infallible_close_syscall_if_alive(remote_mem_dir_fd);
}
if (!fd.is_open()) {
LOG(info) << "Can't retrieve mem fd for " << tid << "; process no longer exists?";
return false;
}
as->set_mem_fd(std::move(fd));
return true;
}
void Task::open_mem_fd_if_needed() {
if (!as->mem_fd().is_open()) {
open_mem_fd();
}
}
ScopedFd& Task::pagemap_fd() {
if (!as->pagemap_fd().is_open()) {
ScopedFd fd(proc_pagemap_path().c_str(), O_RDONLY);
if (fd.is_open()) {
as->set_pagemap_fd(std::move(fd));
} else {
LOG(info) << "Can't retrieve pagemap fd for " << tid;
}
}
return as->pagemap_fd();
}
KernelMapping Task::init_syscall_buffer(AutoRemoteSyscalls& remote,
remote_ptr<void> map_hint) {
char name[50];
sprintf(name, "syscallbuf.%d", rec_tid);
KernelMapping km =
Session::create_shared_mmap(remote, syscallbuf_size, map_hint, name);
if (!km.size()) {
return km;
}
auto& m = remote.task()->vm()->mapping_of(km.start());
remote.task()->vm()->mapping_flags_of(km.start()) |=
AddressSpace::Mapping::IS_SYSCALLBUF;
ASSERT(this, !syscallbuf_child)
<< "Should not already have syscallbuf initialized!";
syscallbuf_child = km.start().cast<struct syscallbuf_hdr>();
// No entries to begin with.
memset(m.local_addr, 0, sizeof(struct syscallbuf_hdr));
return km;
}
void Task::set_syscallbuf_locked(bool locked) {
if (!syscallbuf_child) {
return;
}
remote_ptr<uint8_t> remote_addr = REMOTE_PTR_FIELD(syscallbuf_child, locked);
uint8_t locked_before = read_mem(remote_addr);
uint8_t new_locked = locked ? (locked_before | SYSCALLBUF_LOCKED_TRACER)
: (locked_before & ~SYSCALLBUF_LOCKED_TRACER);
if (new_locked != locked_before) {
write_mem(remote_addr, new_locked);
}
}
void Task::reset_syscallbuf() {
if (!syscallbuf_child) {
return;
}
ASSERT(this,
!is_in_untraced_syscall() ||
0 == (SYSCALLBUF_LOCKED_TRACEE &
read_mem(REMOTE_PTR_FIELD(syscallbuf_child, locked))));
// Memset is easiest to do by using the local mapping which should always
// exist for the syscallbuf
uint32_t num_rec =
read_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes));
uint8_t* ptr = as->local_mapping(syscallbuf_child + 1, num_rec);
DEBUG_ASSERT(ptr != nullptr);
memset(ptr, 0, num_rec);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, num_rec_bytes), (uint32_t)0);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, mprotect_record_count),
(uint32_t)0);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, mprotect_record_count_completed),
(uint32_t)0);
write_mem(REMOTE_PTR_FIELD(syscallbuf_child, blocked_sigs_generation),
(uint32_t)0);
}
ssize_t Task::read_bytes_ptrace(remote_ptr<void> addr, ssize_t buf_size,
void* buf) {
ssize_t nread = 0;
// ptrace operates on the word size of the host, so we really do want
// to use sizes of host types here.
uintptr_t word_size = sizeof(long);
errno = 0;
// Only read aligned words. This ensures we can always read the last
// byte before an unmapped region.
while (nread < buf_size) {
uintptr_t start = addr.as_int() + nread;
uintptr_t start_word = start & ~(word_size - 1);
uintptr_t end_word = start_word + word_size;
uintptr_t length = std::min(end_word - start, uintptr_t(buf_size - nread));
long v = fallible_ptrace(PTRACE_PEEKDATA, start_word, nullptr);
if (errno) {
break;
}
memcpy(static_cast<uint8_t*>(buf) + nread,
reinterpret_cast<uint8_t*>(&v) + (start - start_word), length);
nread += length;
}
return nread;
}
ssize_t Task::write_bytes_ptrace(remote_ptr<void> addr, ssize_t buf_size,
const void* buf) {
ssize_t nwritten = 0;
// ptrace operates on the word size of the host, so we really do want
// to use sizes of host types here.
uintptr_t word_size = sizeof(long);
errno = 0;
// Only write aligned words. This ensures we can always write the last
// byte before an unmapped region.
while (nwritten < buf_size) {
uintptr_t start = addr.as_int() + nwritten;
uintptr_t start_word = start & ~(word_size - 1);
uintptr_t end_word = start_word + word_size;
uintptr_t length =
std::min(end_word - start, uintptr_t(buf_size - nwritten));
long v;
if (length < word_size) {
v = fallible_ptrace(PTRACE_PEEKDATA, start_word, nullptr);
if (errno) {
break;
}
}
memcpy(reinterpret_cast<uint8_t*>(&v) + (start - start_word),
static_cast<const uint8_t*>(buf) + nwritten, length);
fallible_ptrace(PTRACE_POKEDATA, start_word, reinterpret_cast<void*>(v));
nwritten += length;
}
return nwritten;
}
ssize_t Task::read_bytes_fallible(remote_ptr<void> addr, ssize_t buf_size,
void* buf) {
ASSERT_ACTIONS(this, buf_size >= 0, << "Invalid buf_size " << buf_size);
if (0 == buf_size) {
return 0;
}
if (uint8_t* local_addr = as->local_mapping(addr, buf_size)) {
memcpy(buf, local_addr, buf_size);
return buf_size;
}
if (!as->mem_fd().is_open()) {
return read_bytes_ptrace(addr, buf_size, static_cast<uint8_t*>(buf));
}
ssize_t all_read = 0;
while (all_read < buf_size) {
errno = 0;
ssize_t nread = pread64(as->mem_fd(), static_cast<uint8_t*>(buf) + all_read,
buf_size - all_read, addr.as_int() + all_read);
// We open the mem_fd just after being notified of
// exec(), when the Task is created. Trying to read from that
// fd seems to return 0 with errno 0. Reopening the mem fd
// allows the pread to succeed. It seems that the first mem
// fd we open, very early in exec, refers to the address space
// before the exec and the second mem fd refers to the address
// space after exec.
if (0 == nread && 0 == all_read && 0 == errno) {
if (!open_mem_fd()) {
return 0;
}
continue;
}
if (nread <= 0) {
if (all_read > 0) {
// We did successfully read some data, so return success and ignore
// any error.
errno = 0;
return all_read;
}
return nread;
}
// We read some data. We should try again in case we get short reads.
all_read += nread;
}
return all_read;
}
void Task::read_bytes_helper(remote_ptr<void> addr, ssize_t buf_size, void* buf,
bool* ok) {
// pread64 etc can't handle addresses that appear to be negative ...
// like [vsyscall].
ssize_t nread = read_bytes_fallible(addr, buf_size, buf);
if (nread != buf_size) {
if (ok) {
*ok = false;
} else {
ASSERT(this, false) << "Should have read " << buf_size << " bytes from "
<< addr << ", but only read " << nread;
}
}
}
/**
* This function exists to work around
* https://bugzilla.kernel.org/show_bug.cgi?id=99101.
* On some kernels pwrite() to /proc/.../mem fails when writing to a region
* that's PROT_NONE.
* Also, writing through MAP_SHARED readonly mappings fails (even if the
* file was opened read-write originally), so we handle that here too.
*/
static ssize_t safe_pwrite64(Task* t, const void* buf, ssize_t buf_size,
remote_ptr<void> addr) {
vector<KernelMapping> mappings_to_fix;
for (const auto& m :
t->vm()->maps_containing_or_after(floor_page_size(addr))) {
if (m.map.start() >= ceil_page_size(addr + buf_size)) {
break;
}
if (m.map.prot() & PROT_WRITE) {
continue;
}
if (!(m.map.prot() & PROT_READ) || (m.map.flags() & MAP_SHARED)) {
mappings_to_fix.push_back(m.map);
}
};
if (mappings_to_fix.empty()) {
return pwrite_all_fallible(t->vm()->mem_fd(), buf, buf_size, addr.as_int());
}
AutoRemoteSyscalls remote(t);
int mprotect_syscallno = syscall_number_for_mprotect(t->arch());
bool failed_access = false;
for (auto& m : mappings_to_fix) {
long ret = remote.syscall(mprotect_syscallno, m.start(), m.size(), m.prot() | PROT_WRITE);
if ((int)ret == -EACCES) {
// We could be trying to write to a read-only shared file. In that case we should
// report the error without dying.
failed_access = true;
} else {
remote.check_syscall_result(ret, mprotect_syscallno, false);
}
}
ssize_t nwritten;
if (failed_access) {
nwritten = -1;
} else {
nwritten = pwrite_all_fallible(t->vm()->mem_fd(), buf, buf_size, addr.as_int());
}
for (auto& m : mappings_to_fix) {
remote.infallible_syscall(mprotect_syscallno, m.start(), m.size(),
m.prot());
}
if (failed_access) {
errno = EACCES;
}
return nwritten;
}
void Task::write_bytes_helper(remote_ptr<void> addr, ssize_t buf_size,
const void* buf, bool* ok, uint32_t flags) {
ASSERT(this, buf_size >= 0) << "Invalid buf_size " << buf_size;
if (0 == buf_size) {
return;
}
ssize_t nwritten = write_bytes_helper_no_notifications(addr, buf_size, buf, ok, flags);
if (nwritten > 0) {
vm()->notify_written(addr, nwritten, flags);
}
}
ssize_t Task::write_bytes_helper_no_notifications(remote_ptr<void> addr, ssize_t buf_size,
const void* buf, bool* ok, uint32_t flags) {
ASSERT(this, buf_size >= 0) << "Invalid buf_size " << buf_size;
if (0 == buf_size) {
return 0;
}
if (uint8_t* local_addr = as->local_mapping(addr, buf_size)) {
memcpy(local_addr, buf, buf_size);
return buf_size;
}
if (!as->mem_fd().is_open()) {
ssize_t nwritten =
write_bytes_ptrace(addr, buf_size, static_cast<const uint8_t*>(buf));
if (ok && nwritten < buf_size) {
*ok = false;
}
return nwritten;
}
errno = 0;
ssize_t nwritten = safe_pwrite64(this, buf, buf_size, addr.as_int());
// See comment in read_bytes_helper().
if (0 == nwritten && 0 == errno) {
open_mem_fd();
return write_bytes_helper_no_notifications(addr, buf_size, buf, ok, flags);
}
if (errno == EPERM) {
FATAL() << "Can't write to /proc/" << tid << "/mem\n"
<< "Maybe you need to disable grsecurity MPROTECT with:\n"
<< " setfattr -n user.pax.flags -v 'emr' <executable>";
}
if (ok) {
if (nwritten < buf_size) {
*ok = false;
}
} else {
ASSERT(this, nwritten == buf_size)
<< "Should have written " << buf_size << " bytes to " << addr
<< ", but only wrote " << nwritten;
}
return nwritten;
}
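// Scatter up to `size` bytes from `data` into the tracee across `ranges`,
// in order, and return the number of bytes actually written.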
uint64_t Task::write_ranges(const vector<FileMonitor::Range>& ranges,
void* data, size_t size) {
uint8_t* p = static_cast<uint8_t*>(data);
size_t s = size;
size_t result = 0;
for (auto& r : ranges) {
size_t bytes = min(s, r.length);
write_bytes_helper(r.data, bytes, p);
s -= bytes;
result += bytes;
if (s == 0) {
break;
}
}
return result;
}
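// Zero a range of tracee memory. For whole pages we try MADV_REMOVE first so
// the kernel can drop the backing pages instead of us writing zeroes;
// otherwise (or if that fails) we fall back to writing zero-filled buffers in
// chunks of at most 4MB.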
void Task::write_zeroes(unique_ptr<AutoRemoteSyscalls>* remote, remote_ptr<void> addr, size_t size) {
if (!size) {
return;
}
bool remove_ok = true;
remote_ptr<void> initial_addr = addr;
size_t initial_size = size;
vector<uint8_t> zeroes;
while (size > 0) {
size_t bytes;
remote_ptr<void> first_page = ceil_page_size(addr);
if (addr < first_page) {
bytes = min<size_t>(first_page - addr, size);
} else {
if (remove_ok) {
remote_ptr<void> last_page = floor_page_size(addr + size);
if (first_page < last_page) {
if (!*remote) {
*remote = make_unique<AutoRemoteSyscalls>(this);
}
int ret = (*remote)->syscall(syscall_number_for_madvise(arch()), first_page, last_page - first_page, MADV_REMOVE);
if (ret == 0) {
addr = last_page;
size -= last_page - first_page;
continue;
}
// Don't try MADV_REMOVE again
remove_ok = false;
}
}
bytes = min<size_t>(4*1024*1024, size);
}
zeroes.resize(bytes);
memset(zeroes.data(), 0, bytes);
ssize_t written = write_bytes_helper_no_notifications(addr, bytes, zeroes.data(), nullptr, 0);
ASSERT(this, written == (ssize_t)bytes);
addr += bytes;
size -= bytes;
}
vm()->notify_written(initial_addr, initial_size, 0);
}
const TraceStream* Task::trace_stream() const {
if (session().as_record()) {
return &session().as_record()->trace_writer();
}
if (session().as_replay()) {
return &session().as_replay()->trace_reader();
}
return nullptr;
}
bool Task::ptrace_if_stopped(int request, remote_ptr<void> addr, void* data) {
ASSERT(this, is_stopped_);
errno = 0;
fallible_ptrace(request, addr, data);
if (errno == ESRCH) {
LOG(debug) << "ptrace_if_stopped tid " << tid << " was not stopped";
return false;
}
ASSERT(this, !errno) << "ptrace(" << ptrace_req_name<NativeArch>(request) << ", " << tid
<< ", addr=" << addr << ", data=" << data
<< ") failed with errno " << errno;
return true;
}
SupportedArch Task::detect_syscall_arch() {
SupportedArch syscall_arch;
bool ok = get_syscall_instruction_arch(
this, regs().ip().decrement_by_syscall_insn_length(arch()),
&syscall_arch);
ASSERT(this, ok);
return syscall_arch;
}
bool Task::clone_syscall_is_complete(pid_t* new_pid,
SupportedArch syscall_arch) {
int event = ptrace_event();
if (PTRACE_EVENT_CLONE == event || PTRACE_EVENT_FORK == event ||
PTRACE_EVENT_VFORK == event) {
*new_pid = get_ptrace_eventmsg_pid();
ASSERT(this, *new_pid >= 0)
<< "Task was killed just after clone/fork/vfork and before we could get the new pid; giving up";
return true;
}
ASSERT(this, !event) << "Unexpected ptrace event "
<< ptrace_event_name(event);
// EAGAIN can happen here due to fork failing under load. The caller must
// handle this.
// XXX ENOSYS shouldn't happen here.
intptr_t result = regs().syscall_result_signed();
ASSERT(this,
regs().syscall_may_restart() || -ENOSYS == result ||
-EAGAIN == result || -ENOMEM == result)
<< "Unexpected task status " << status() << " ("
<< syscall_name(regs().original_syscallno(), syscall_arch)
<< " syscall errno: " << errno_name(-result) << ")";
return false;
}
template <typename Arch> static void do_preload_init_arch(Task* t) {
auto params = t->read_mem(
remote_ptr<rrcall_init_preload_params<Arch>>(t->regs().orig_arg1()));
for (Task* tt : t->vm()->task_set()) {
tt->preload_globals = params.globals.rptr();
}
ReplaySession *replay = t->session().as_replay();
if (replay && replay->has_trace_quirk(TraceReader::UsesGlobalsInReplay)) {
t->write_mem(REMOTE_PTR_FIELD(t->preload_globals, reserved_legacy_in_replay), (unsigned char)1);
}
}
static void do_preload_init(Task* t) {
RR_ARCH_FUNCTION(do_preload_init_arch, t->arch(), t);
}
void Task::at_preload_init() {
as->at_preload_init(this);
do_preload_init(this);
fd_table()->init_syscallbuf_fds_disabled(this);
}
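// Architectures disagree on the ordering of the parent-tid, tls and child-tid
// arguments to clone(); dispatch on the ordering the target arch uses.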
template <typename Arch>
static long perform_remote_clone_arch(
AutoRemoteSyscalls& remote, unsigned base_flags, remote_ptr<void> stack,
remote_ptr<int> ptid, remote_ptr<void> tls, remote_ptr<int> ctid) {
switch (Arch::clone_parameter_ordering) {
case Arch::FlagsStackParentTLSChild:
return remote.syscall(Arch::clone, base_flags, stack, ptid.as_int(),
tls.as_int(), ctid.as_int());
case Arch::FlagsStackParentChildTLS:
return remote.syscall(Arch::clone, base_flags, stack, ptid.as_int(),
ctid.as_int(), tls.as_int());
}
}
static long perform_remote_clone(AutoRemoteSyscalls& remote,
unsigned base_flags, remote_ptr<void> stack,
remote_ptr<int> ptid, remote_ptr<void> tls,
remote_ptr<int> ctid) {
RR_ARCH_FUNCTION(perform_remote_clone_arch, remote.arch(), remote, base_flags,
stack, ptid, tls, ctid);
}
/*static*/ Task* Task::os_clone(CloneReason reason, Session* session,
AutoRemoteSyscalls& remote, pid_t rec_child_tid,
uint32_t new_serial, unsigned base_flags,
FdTable::shr_ptr new_fds,
ThreadGroup::shr_ptr new_tg,
remote_ptr<void> stack, remote_ptr<int> ptid,
remote_ptr<void> tls, remote_ptr<int> ctid) {
long ret;
do {
ret = perform_remote_clone(remote, base_flags, stack, ptid, tls, ctid);
} while (ret == -EAGAIN);
ASSERT(remote.task(), ret >= 0)
<< "remote clone failed with errno " << errno_name(-ret);
Task* child = remote.task()->clone(
reason, clone_flags_to_task_flags(base_flags), stack, tls, ctid,
remote.new_tid(), rec_child_tid, new_serial, session, std::move(new_fds),
std::move(new_tg));
return child;
}
static void setup_fd_table(Task* t, FdTable& fds, int tracee_socket_fd_number) {
fds.add_monitor(t, STDOUT_FILENO, new StdioMonitor(t->session().tracee_output_fd(STDOUT_FILENO)));
fds.add_monitor(t, STDERR_FILENO, new StdioMonitor(t->session().tracee_output_fd(STDERR_FILENO)));
fds.add_monitor(t, RR_MAGIC_SAVE_DATA_FD, new MagicSaveDataMonitor());
fds.add_monitor(t, tracee_socket_fd_number, new PreserveFileMonitor());
}
static void spawned_child_fatal_error(const ScopedFd& err_fd,
const char* format, ...) {
va_list args;
va_start(args, format);
char* buf;
if (vasprintf(&buf, format, args) < 0) {
exit(1);
}
char* buf2;
if (asprintf(&buf2, "%s (%s)", buf, errno_name(errno).c_str()) < 0) {
exit(1);
}
write_all(err_fd, buf2, strlen(buf2));
_exit(1);
}
static void disable_tsc(const ScopedFd& err_fd) {
/* Trap to the rr process if a 'rdtsc' instruction is issued.
* That allows rr to record the tsc and replay it
* deterministically. */
if (0 > prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0)) {
spawned_child_fatal_error(err_fd, "error setting up prctl");
}
}
template <typename Arch> void set_up_process_arch(const ScopedFd&);
template <> void set_up_process_arch<X86Arch>(const ScopedFd& err_fd) { disable_tsc(err_fd); }
template <> void set_up_process_arch<X64Arch>(const ScopedFd& err_fd) { disable_tsc(err_fd); }
template <> void set_up_process_arch<ARM64Arch>(const ScopedFd&) {}
void set_up_process_arch(SupportedArch arch, const ScopedFd& err_fd) {
RR_ARCH_FUNCTION(set_up_process_arch, arch, err_fd);
}
/**
* Prepare this process and its ancestors for recording/replay by
* preventing direct access to sources of nondeterminism, and ensuring
* that rr bugs don't adversely affect the underlying system.
*/
static void set_up_process(Session& session, const ScopedFd& err_fd,
const ScopedFd& sock_fd, int sock_fd_number) {
/* TODO tracees can probably undo some of the setup below
* ... */
// Restore signal mask
sigset_t sigmask;
TraceeAttentionSet::get_original_sigmask(&sigmask);
sigprocmask(SIG_SETMASK, &sigmask, nullptr);
struct NativeArch::cap_header header = {.version =
_LINUX_CAPABILITY_VERSION_3,
.pid = 0 };
struct NativeArch::cap_data caps[2];
if (syscall(NativeArch::capget, &header, &caps) != 0) {
spawned_child_fatal_error(err_fd, "Failed to read capabilities");
}
uint32_t perfmon_mask = 1 << (CAP_PERFMON - 32);
if (caps[1].permitted & perfmon_mask) {
// Try to pass CAP_PERFMON into our tracees.
caps[1].inheritable |= perfmon_mask;
// Ignore any failures here. Capabilities are super complex and I'm not
// sure this can be trusted to succeed.
if (syscall(NativeArch::capset, &header, &caps) == 0) {
// Install CAP_PERFMON as an ambient capability.
// This prctl was only added in 4.3. Ignore failures.
prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_PERFMON, 0, 0);
}
}
/* CLOEXEC so that the original fd here will be closed by the exec that's
* about to happen.
*/
int fd = open("/dev/null", O_WRONLY | O_CLOEXEC);
if (0 > fd) {
spawned_child_fatal_error(err_fd, "error opening /dev/null");
}
if (RR_MAGIC_SAVE_DATA_FD != dup2(fd, RR_MAGIC_SAVE_DATA_FD)) {
spawned_child_fatal_error(err_fd, "error duping to RR_MAGIC_SAVE_DATA_FD");
}
if (sock_fd_number != dup2(sock_fd, sock_fd_number)) {
spawned_child_fatal_error(err_fd,
"error duping to RR_RESERVED_SOCKET_FD");
}
if (session.is_replaying()) {
// This task and all its descendants should silently reap any terminating
// children.
if (SIG_ERR == signal(SIGCHLD, SIG_IGN)) {
spawned_child_fatal_error(err_fd, "error doing signal()");
}
// If the rr process dies, prevent runaway tracee processes
// from dragging down the underlying system.
//
// TODO: this isn't inherited across fork().
if (0 > prctl(PR_SET_PDEATHSIG, SIGKILL)) {
spawned_child_fatal_error(err_fd, "Couldn't set parent-death signal");
}
// Put the replaying processes into their own session. This will stop
// signals being sent to these processes by the terminal --- in particular
// SIGTSTP/SIGINT/SIGWINCH.
setsid();
// Preserve increased resource limits, in case the tracee
// increased its limits and we need high limits to apply during replay.
} else {
restore_initial_resource_limits();
}
/* Do any architecture specific setup, such as disabling non-deterministic
instructions */
set_up_process_arch(NativeArch::arch(), err_fd);
/* If we're in setuid_sudo mode, we have CAP_SYS_ADMIN, so we don't need to
set NO_NEW_PRIVS here in order to install the seccomp filter later. In
replay we emulate any potentially privileged operations, so we might as
well set no_new_privs */
if (!session.is_recording() || !has_effective_caps(1 << CAP_SYS_ADMIN)) {
if (0 > prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
spawned_child_fatal_error(
err_fd,
"prctl(NO_NEW_PRIVS) failed, SECCOMP_FILTER is not available: your "
"kernel is too old. Use `record -n` to disable the filter.");
}
}
}
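// Build the seccomp filter installed in tracees during recording: syscalls
// issued from the untraced entry points in the rr page are allowed through,
// everything else traps to rr.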
static SeccompFilter<struct sock_filter> create_seccomp_filter() {
SeccompFilter<struct sock_filter> f;
for (auto& e : AddressSpace::rr_page_syscalls()) {
if (e.traced == AddressSpace::UNTRACED) {
auto ip = AddressSpace::rr_page_syscall_exit_point(e.traced, e.privileged,
e.enabled,
NativeArch::arch());
f.allow_syscalls_from_callsite(ip);
}
}
f.trace();
return f;
}
/**
* This is called (and must be called) in the tracee after rr has taken
* ptrace control. Otherwise, once we've installed the seccomp filter,
* things go wrong because we have no ptracer and the seccomp filter demands
* one.
*/
static void set_up_seccomp_filter(const struct sock_fprog& prog, const ScopedFd& err_fd) {
/* Note: the filter is installed only for record. This call
* will be emulated (not passed to the kernel) in the replay. */
if (0 > prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (uintptr_t)&prog, 0, 0)) {
spawned_child_fatal_error(
err_fd, "prctl(SECCOMP) failed, SECCOMP_FILTER is not available: your "
"kernel is too old.");
}
/* anything that happens from this point on gets filtered! */
}
static void run_initial_child(Session& session, const ScopedFd& error_fd,
const ScopedFd& sock_fd, int sock_fd_number,
const char* exe_path_cstr,
char* const argv_array[],
char* const envp_array[],
const struct sock_fprog& seccomp_prog) {
pid_t pid = getpid();
set_up_process(session, error_fd, sock_fd, sock_fd_number);
// The preceding code must run before sending SIGSTOP here,
// since after SIGSTOP replay emulates almost all syscalls, but
// we need the above syscalls to run "for real".
// Signal to tracer that we're configured.
::kill(pid, SIGSTOP);
// This code must run after rr has taken ptrace control.
set_up_seccomp_filter(seccomp_prog, error_fd);
// We do a small amount of dummy work here to retire
// some branches in order to ensure that the ticks value is
// non-zero. The tracer can then check the ticks value
// at the first ptrace-trap to see if it seems to be
// working.
int start = random() % 5;
int num_its = start + 5;
int sum = 0;
for (int i = start; i < num_its; ++i) {
sum += i;
}
syscall(SYS_write, -1, &sum, sizeof(sum));
CPUIDBugDetector::run_detection_code();
execve(exe_path_cstr, argv_array, envp_array);
switch (errno) {
case ENOENT:
spawned_child_fatal_error(
error_fd, "execve failed: '%s' (or interpreter) not found",
exe_path_cstr);
break;
default:
spawned_child_fatal_error(error_fd, "execve of '%s' failed",
exe_path_cstr);
break;
}
// Never returns!
}
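// Attach to `tid` with PTRACE_SEIZE, requesting the ptrace events rr needs
// (more of them when recording). PTRACE_O_EXITKILL is requested so the tracee
// dies if rr does, with a fallback for kernels too old to support it.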
long Task::ptrace_seize(pid_t tid, Session& session) {
intptr_t options = PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEFORK |
PTRACE_O_TRACECLONE;
if (!Flags::get().disable_ptrace_exit_events) {
options |= PTRACE_O_TRACEEXIT;
}
if (session.is_recording()) {
options |= PTRACE_O_TRACEVFORK | PTRACE_O_TRACESECCOMP | PTRACE_O_TRACEEXEC;
}
long ret =
ptrace((_ptrace_request)PTRACE_SEIZE, tid, nullptr, (void*)(options | PTRACE_O_EXITKILL));
if (ret < 0 && errno == EINVAL) {
// PTRACE_O_EXITKILL was added in kernel 3.8, and we only need
// it for more robust cleanup, so tolerate not having it.
ret = ptrace((_ptrace_request)PTRACE_SEIZE, tid, nullptr, (void*)options);
}
return ret;
}
/*static*/ Task* Task::spawn(Session& session, ScopedFd& error_fd,
ScopedFd* sock_fd_out,
ScopedFd* sock_fd_receiver_out,
int* tracee_socket_fd_number_out,
const std::string& exe_path,
const std::vector<std::string>& argv,
const std::vector<std::string>& envp,
pid_t rec_tid) {
DEBUG_ASSERT(session.tasks().size() == 0);
int sockets[2];
long ret = socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, sockets);
if (ret < 0) {
FATAL() << "socketpair failed";
}
*sock_fd_out = ScopedFd(sockets[0]);
*sock_fd_receiver_out = ScopedFd(sockets[1]);
// Find a usable FD number to dup to in the child. RR_RESERVED_SOCKET_FD
// might already be used by an outer rr.
int fd_number = RR_RESERVED_SOCKET_FD;
// We assume no other thread is mucking with this part of the fd address space.
while (true) {
ret = fcntl(fd_number, F_GETFD);
if (ret < 0) {
if (errno != EBADF) {
FATAL() << "Error checking fd";
}
break;
}
++fd_number;
}
*tracee_socket_fd_number_out = fd_number;
pid_t tid;
// After fork() in a multithreaded program, the child can safely call only
// async-signal-safe functions, and malloc is not one of them (breaks e.g.
// with tcmalloc).
// Doing the allocations before the fork duplicates the allocations, but
// prevents errors.
StringVectorToCharArray argv_array(argv);
StringVectorToCharArray envp_array(envp);
SeccompFilter<struct sock_filter> filter = create_seccomp_filter();
struct sock_fprog prog = {(unsigned short)filter.filters.size(),
filter.filters.data()};
do {
tid = fork();
// fork() can fail with EAGAIN due to temporary load issues. In such
// cases, retry the fork().
} while (0 > tid && errno == EAGAIN);
if (0 == tid) {
run_initial_child(session, error_fd, *sock_fd_receiver_out, fd_number, exe_path.c_str(),
argv_array.get(), envp_array.get(), prog);
// run_initial_child never returns
}
if (0 > tid) {
FATAL() << "Failed to fork";
}
// Make sure the child has the only reference to this side of the pipe.
error_fd.close();
// Sync with the child process.
// We minimize the code we run between fork()ing and PTRACE_SEIZE, because
// any abnormal exit of the rr process will leave the child paused and
// parented by the init process, i.e. effectively leaked. After PTRACE_SEIZE
// with PTRACE_O_EXITKILL, the tracee will die if rr dies.
if (getenv("RR_TEST_DELAY_SEIZE")) {
sleep(1);
}
ret = ptrace_seize(tid, session);
if (ret) {
// Note that although the tracee may have died due to some fatal error,
// we haven't reaped its exit code so there's no danger of killing
// (or PTRACE_SEIZEing) the wrong process.
int tmp_errno = errno;
::kill(tid, SIGKILL);
errno = tmp_errno;
string hint;
if (errno == EPERM) {
hint = "; child probably died before reaching SIGSTOP\n"
"Child's message: " +
session.read_spawned_task_error();
}
FATAL() << "PTRACE_SEIZE failed for tid " << tid << hint;
}
Task* t = session.new_task(tid, rec_tid, session.next_task_serial(),
NativeArch::arch(), "rr");
auto tg = session.create_initial_tg(t);
t->tg.swap(tg);
auto as = session.create_vm(t);
t->as.swap(as);
t->fds = FdTable::create(t);
setup_fd_table(t, *t->fds, fd_number);
// Install signal handler here, so that when creating the first RecordTask
// it sees the exact same signal state in the parent as will be in the child.
struct sigaction sa;
sa.sa_handler = handle_alarm_signal;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0; // No SA_RESTART, so waitpid() will be interrupted
sigaction(SIGALRM, &sa, nullptr);
if (!t->wait()) {
FATAL() << "Tracee died before reaching SIGSTOP";
}
if (t->ptrace_event() == PTRACE_EVENT_EXIT) {
t->proceed_to_exit();
FATAL() << "Tracee died before reaching SIGSTOP\n"
"Child's message: "
<< session.read_spawned_task_error();
}
// SIGSTOP can be reported as a signal-stop or group-stop depending on
// whether PTRACE_SEIZE happened before or after it was delivered.
if (SIGSTOP != t->status().stop_sig() &&
SIGSTOP != t->status().group_stop()) {
WaitStatus failed_status = t->status();
t->kill();
FATAL() << "Unexpected stop " << failed_status
<< "\nChild's message: "
<< session.read_spawned_task_error();
}
t->clear_wait_status();
t->open_mem_fd();
return t;
}
void* Task::preload_thread_locals() {
return preload_thread_locals_local_addr(*as);
}
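// Returns true if the mapping's file name (as reported in /proc/<pid>/maps)
// ends with the " (deleted)" suffix the kernel appends to unlinked files.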
static bool file_was_deleted(const string& s) {
static const char deleted[] = " (deleted)";
ssize_t find_deleted = s.size() - (sizeof(deleted) - 1);
return s.find(deleted) == size_t(find_deleted);
}
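// Recreate |km| in task |t|: map the backing file directly when it is a real,
// non-deleted file, otherwise map fresh anonymous memory at the same address;
// then record the mapping in t's AddressSpace.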
static void create_mapping(Task *t, AutoRemoteSyscalls &remote, const KernelMapping &km) {
string real_file_name;
dev_t device = KernelMapping::NO_DEVICE;
ino_t inode = KernelMapping::NO_INODE;
if (km.is_real_device() && !file_was_deleted(km.fsname())) {
struct stat real_file;
remote.finish_direct_mmap(km.start(), km.size(), km.prot(), km.flags(),
km.fsname(), O_RDONLY, km.file_offset_bytes(),
real_file, real_file_name);
} else {
auto ret = remote.infallible_mmap_syscall_if_alive(km.start(), km.size(), km.prot(),
km.flags() | MAP_FIXED | MAP_ANONYMOUS, -1,
0);
ASSERT(t, ret || t->vm()->task_set().size() == t->thread_group()->task_set().size())
<< "Not handling shared address spaces where one threadgroup unexpectedly dies";
}
t->vm()->map(t, km.start(), km.size(), km.prot(), km.flags(), km.file_offset_bytes(),
real_file_name, device, inode, nullptr, &km);
}
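// Install |map| as the remote task's memory map via prctl(PR_SET_MM,
// PR_SET_MM_MAP). We first ask the kernel (PR_SET_MM_MAP_SIZE) what struct
// size it expects, since a 64-bit kernel expects the 64-bit layout even when
// the tracer is a 32-bit process.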
static void apply_mm_map(AutoRemoteSyscalls& remote, const NativeArch::prctl_mm_map& map)
{
unsigned int expected_size = 0;
int result = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, &expected_size, 0, 0);
if (result != 0) {
FATAL() << "Failed to get expected MM_MAP_SIZE. Error was " << errno_name(-result);
}
const void* pmap = NULL;
int pmap_size = 0;
/* Expected size matches native prctl_mm_map */
if (expected_size == sizeof(map)) {
pmap = &map;
pmap_size = sizeof(map);
}
#if defined(__i386__)
/* A 64-bit kernel expects a "64-bit sized" prctl_mm_map
even from a 32-bit process. */
X64Arch::prctl_mm_map map64;
if (expected_size == sizeof(map64)) {
LOG(warn) << "Kernel expects different sized MM_MAP. Using 64-bit prctl_mm_map.";
memcpy(&map64, &map, sizeof(map));
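/* The leading fields have the same layout in both variants; only auxv's
   pointer width (and hence the fields after it) differs, so fix those up
   explicitly. */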
map64.auxv.val = map.auxv.val;
map64.auxv_size = map.auxv_size;
map64.exe_fd = map.exe_fd;
pmap = &map64;
pmap_size = sizeof(map64);
}
#endif
/* Are we prepared for the requested structure size? */
if (pmap == NULL || pmap_size == 0) {
FATAL() << "Kernel expects MM_MAP of size " << expected_size;
}
AutoRestoreMem remote_mm_map(remote, (const uint8_t*)pmap, pmap_size);
result = remote.syscall(syscall_number_for_prctl(remote.task()->arch()), PR_SET_MM,
PR_SET_MM_MAP, remote_mm_map.get().as_int(),
pmap_size);
if (result == -EINVAL &&
(map.start_brk <= map.end_data || map.brk <= map.end_data)) {
CLEAN_FATAL() << "The linux kernel prohibits duplication of this task's memory map," <<
" because the brk segment is located below the data segment. Sorry.";
}
else if (result != 0) {
FATAL() << "Failed to set target task memory map. Error was " << errno_name(-result);
}
}
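// Copy the contents of mapping |km| from |from|'s address space into |to|'s,
// using read_bytes_fallible so partially-backed file mappings don't fail.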
static void copy_mem_mapping(Task* from, Task* to, const KernelMapping& km) {
vector<char> buf;
buf.resize(km.size());
ssize_t bytes = from->read_bytes_fallible(km.start(), km.size(), buf.data());
// There can be mappings of files where the mapping starts beyond the end-of-file
// so no bytes will be read.
if (bytes > 0) {
// We may have a short read here if there are beyond-end-of-mapped-file pages
// in the mapping.
bool ok = true;
to->write_bytes_helper(km.start(), bytes, buf.data(), &ok);
ASSERT(to, ok);
}
}
// https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/fs/proc/task_mmu.c?h=v6.3#n1352
#define PM_PRESENT (1ULL << 63)
#define PM_SWAP (1ULL << 62)
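// Like copy_mem_mapping(), but consult /proc/<pid>/pagemap and copy only the
// pages that are actually present in RAM or in swap in |from|, batching runs
// of consecutive used pages into single copies. Returns false if pagemap
// can't be read, in which case the caller falls back to copy_mem_mapping().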
static bool copy_mem_mapping_just_used(Task* from, Task* to, const KernelMapping& km)
{
ScopedFd& fd = from->pagemap_fd();
if (!fd.is_open()) {
LOG(debug) << "Failed to open " << from->proc_pagemap_path();
return false;
}
size_t pagesize = page_size();
uint64_t pages_present = 0; // Just for logging
const int max_buf_size = 65536;
vector<uint64_t> buf;
for (uintptr_t page_offset = 0; page_offset < km.size() / pagesize; page_offset += max_buf_size) {
auto page_read_offset = (km.start().as_int() / pagesize + page_offset);
size_t page_read_count = min<size_t>(max_buf_size, km.size() / pagesize - page_offset);
buf.resize(page_read_count);
size_t bytes_read = pread(fd, buf.data(), page_read_count * sizeof(uint64_t), page_read_offset * sizeof(uint64_t));
ASSERT(from, bytes_read == page_read_count * sizeof(uint64_t));
// A chunk was read from pagemap above, now iterate through it to detect
// if memory is physically present (bit 63, PM_PRESENT) or in swap (bit 62, PM_SWAP) in Task "from".
// If yes, just transfer those pages to the new Task "to".
// Also try to find consecutive pages to copy them in one operation.
// The file /proc/PID/pagemap consists of 64-bit values, each describing
// the state of one page. See https://www.kernel.org/doc/Documentation/vm/pagemap.txt
for (size_t page = 0; page < page_read_count; ++page) {
if (buf[page] & (PM_PRESENT | PM_SWAP)) {
auto start = km.start() + (page_offset + page) * pagesize;
if (start >= km.end()) {
break;
}
++pages_present;
// Check for consecutive used pages
while (page + 1 < page_read_count &&
buf[page + 1] & (PM_PRESENT | PM_SWAP))
{
++page;
++pages_present;
}
auto end = km.start() + (page_offset + page + 1) * pagesize;
LOG(debug) << km << " copying start: 0x" << hex << start << " end: 0x" << end
<< dec << " pages: " << (end - start) / pagesize;
auto pages = km.subrange(start, end);
copy_mem_mapping(from, to, pages);
}
}
}
LOG(debug) << km << " pages_present: " << pages_present << " pages_total: " << km.size() / pagesize;
return true;
}
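// Move the tracee mapping at |src| to |dest| with
// mremap(MREMAP_MAYMOVE | MREMAP_FIXED), and mirror the move in rr's
// AddressSpace model. |message| is included in the assertion on failure.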
static void mremap_move(AutoRemoteSyscalls& remote, remote_ptr<void> src,
remote_ptr<void> dest, size_t size, const char* message) {
if (!size) {
return;
}
long ret = remote.syscall(syscall_number_for_mremap(remote.arch()),
src, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, dest);
ASSERT(remote.task(), remote_ptr<void>(ret) == dest)
<< "Failed to move from " << src << " to " << dest << " "
<< HEX(size) << " bytes, ret=" << ret << ", " << message;
remote.task()->vm()->remap(remote.task(), src, size, dest, size,
MREMAP_MAYMOVE | MREMAP_FIXED);
}
/* Remap VDSO and VVAR to the addresses used in the target process,
before they get unmapped.
Otherwise the kernel seems to put the address of the original
VDSO __kernel_rt_sigreturn function as the return address on the stack.
This might not affect x86_64, because there the __restore_rt
in libpthread.so.0 is used instead.
*/
static void move_vdso_and_vvar_mappings(AutoRemoteSyscalls& remote,
const KernelMapping& vdso_new, const KernelMapping& vvar_new) {
KernelMapping vdso_current;
KernelMapping vvar_current;
Task* t = remote.task();
for (const auto& m : t->vm()->maps()) {
if (m.map.is_vdso()) {
vdso_current = m.map;
} else if (m.map.is_vvar()) {
vvar_current = m.map;
}
}
ASSERT(t, vdso_current.size() == vdso_new.size())
<< "VDSO size mismatch";
ASSERT(t, vvar_current.size() == vvar_new.size() || !vvar_new.size())
<< "VVAR size mismatch";
// Handle case where old and new addresses overlap by finding a free range early in the
// address space we can use as a temporary buffer. VDSOs are always at fairly high
// addresses so this shouldn't introduce any new overlap issues.
// We move VDSO and VVAR to their temp addresses first, then move both of them to their
// final address, to avoid situations where current's VDSO overlaps target's VVAR or
// vice versa.
size_t temp_size = vdso_new.size() + vvar_new.size();
remote_ptr<void> vdso_temp_address = t->vm()->find_free_memory(t,
temp_size,
remote_ptr<void>(65536), AddressSpace::FindFreeMemoryPolicy::STRICT_SEARCH);
remote_ptr<void> vvar_temp_address = vdso_temp_address + vdso_new.size();
MemoryRange temp_range(vdso_temp_address, temp_size);
ASSERT(t, !temp_range.intersects(vdso_new))
<< "Free memory found overlaps new VDSO address";
ASSERT(t, !temp_range.intersects(vvar_new))
<< "Free memory found overlaps new VVAR address";
mremap_move(remote, vdso_current.start(), vdso_temp_address, vdso_new.size(),
"vdso_current.start() -> vdso_temp_address");
if (vvar_new.size()) {
mremap_move(remote, vvar_current.start(), vvar_temp_address, vvar_current.size(),
"vvar_current.start() -> vvar_temp_address");
} else {
bool ok = remote.infallible_munmap_syscall_if_alive(vvar_current.start(),
vvar_current.size());
ASSERT(t, ok) << "Duped task got killed?";
t->vm()->unmap(t, vvar_current.start(), vvar_current.size());
}
mremap_move(remote, vdso_temp_address, vdso_new.start(), vdso_new.size(),
"vdso_temp_address -> vdso_new.start()");
mremap_move(remote, vvar_temp_address, vvar_new.start(), vvar_new.size(),
"vvar_temp_address -> vvar_new.start()");
}
const int all_rlimits[] = {
(int)RLIMIT_AS, (int)RLIMIT_CORE, (int)RLIMIT_CPU, (int)RLIMIT_DATA,
(int)RLIMIT_FSIZE, (int)RLIMIT_LOCKS, (int)RLIMIT_MEMLOCK,
(int)RLIMIT_MSGQUEUE, (int)RLIMIT_NICE, (int)RLIMIT_NOFILE, (int)RLIMIT_NPROC,
(int)RLIMIT_RSS, (int)RLIMIT_RTTIME, (int)RLIMIT_SIGPENDING, (int)RLIMIT_STACK
};
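/**
 * Clone |other|'s state into this task: recreate and copy its memory mappings
 * (stack, VDSO/VVAR placement, private mappings), duplicate its file
 * descriptors and working directory, copy its rlimits and prctl mm map, then
 * copy its captured register state and activate preload thread locals.
 */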
void Task::dup_from(Task *other) {
std::vector<KernelMapping> mappings;
KernelMapping stack_mapping;
bool found_stack = false;
KernelMapping vdso_mapping;
KernelMapping vvar_mapping;
for (auto map : other->vm()->maps()) {
auto km = map.map;
if (map.flags != AddressSpace::Mapping::FLAG_NONE) {
if (map.flags & (AddressSpace::Mapping::IS_THREAD_LOCALS |
AddressSpace::Mapping::IS_RR_PAGE)) {
// While under rr control this task already has an rr page and
// a thread locals shared segment, don't mess with them.
continue;
}
// For rr private mappings, just make an anonymous segment of the same size
km = KernelMapping(km.start(), km.end(), string(), KernelMapping::NO_DEVICE,
KernelMapping::NO_INODE, km.prot(),
(km.flags() & ~MAP_SHARED) | MAP_PRIVATE, 0);
}
if (km.is_stack() && !found_stack) {
stack_mapping = km;
found_stack = true;
} else {
if (km.is_vdso()) {
vdso_mapping = km;
} else if (km.is_vvar()) {
vvar_mapping = km;
} else if (!km.is_vsyscall()) {
mappings.push_back(km);
}
}
}
ASSERT(this, found_stack);
// Copy address space
LOG(debug) << "Mapping rr page for " << tid;
{
AutoRemoteSyscalls remote(this);
this->vm()->map_rr_page(remote);
}
{
AutoRemoteSyscalls remote(this, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS);
move_vdso_and_vvar_mappings(remote, vdso_mapping, vvar_mapping);
LOG(debug) << "Unmapping memory for " << tid;
// TODO: Only do this if the rr page isn't already mapped
AddressSpace::UnmapOptions options;
options.exclude_vdso_vvar = true;
this->vm()->unmap_all_but_rr_mappings(remote, options);
LOG(debug) << "Creating stack mapping " << stack_mapping << " for " << tid;
create_mapping(this, remote, stack_mapping);
LOG(debug) << "Copying stack into " << tid;
copy_mem_mapping(other, this, stack_mapping);
}
{
AutoRemoteSyscalls remote_this(this);
for (auto &km : mappings) {
LOG(debug) << "Creating mapping " << km << " for " << tid;
create_mapping(this, remote_this, km);
LOG(debug) << "Copying mapping into " << tid;
if (!(km.flags() & MAP_SHARED)) {
// Only make the effort for larger mappings; copy smaller ones as a whole.
if ((km.flags() & MAP_ANONYMOUS) &&
km.size() >= 0x400000/*4MB*/)
{
LOG(debug) << "Using copy_mem_mapping_just_used";
if (copy_mem_mapping_just_used(other, this, km)) {
continue;
}
LOG(debug) << "Fallback to copy_mem_mapping";
}
copy_mem_mapping(other, this, km);
}
}
AutoRemoteSyscalls remote_other(other);
std::vector<int> all_fds = read_all_proc_fds(other->tid);
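// Duplicate each of other's open fds into this task at the same fd number,
// preserving its close-on-exec flag; skip rr's own tracee socket fd.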
for (int fd : all_fds) {
if (fd == session().tracee_fd_number()) {
continue;
}
// If this is a /proc/self/mem fd, rewrite it for the new task
FileMonitor *fd_monitor = other->fd_table()->get_monitor(fd);
ScopedFd here;
if (fd_monitor && fd_monitor->type() == FileMonitor::ProcMem &&
((ProcMemMonitor *)fd_monitor)->target_is_vm(other->vm().get())) {
here = ScopedFd(::dup(this->vm()->mem_fd().get()));
} else {
here = remote_other.retrieve_fd(fd);
}
int remote_fd_flags = remote_other.infallible_syscall(
syscall_number_for_fcntl(this->arch()), fd, F_GETFD);
int remote_fd = remote_this.infallible_send_fd_if_alive(here);
if (remote_fd >= 0) {
if (remote_fd != fd) {
remote_this.infallible_syscall(syscall_number_for_dup3(this->arch()), remote_fd, fd, 0);
remote_this.infallible_close_syscall_if_alive(remote_fd);
}
remote_this.infallible_syscall(
syscall_number_for_fcntl(this->arch()),
fd, F_SETFD, remote_fd_flags);
}
}
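// Copy other's current working directory into this task: open "." in other,
// transfer that fd here, and fchdir() to it.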
string path = ".";
AutoRestoreMem child_path(remote_other, path.c_str());
{
long child_fd =
remote_other.syscall(syscall_number_for_openat(other->arch()), AT_FDCWD,
child_path.get(), O_RDONLY);
ASSERT(other, child_fd != -1);
ScopedFd fd = remote_other.retrieve_fd(child_fd);
remote_other.infallible_close_syscall_if_alive(child_fd);
child_fd = remote_this.infallible_send_fd_if_alive(fd);
if (child_fd >= 0) {
remote_this.syscall(syscall_number_for_fchdir(this->arch()), child_fd);
remote_this.infallible_close_syscall_if_alive(child_fd);
}
}
// Copy rlimits
struct rlimit64 limit;
for (size_t i = 0; i < (sizeof(all_rlimits)/sizeof(all_rlimits[0])); ++i) {
int err = syscall(SYS_prlimit64, (uintptr_t)other->tid,
(uintptr_t)all_rlimits[i], (uintptr_t)NULL, (uintptr_t)&limit);
ASSERT(other, err == 0);
err = syscall(SYS_prlimit64, (uintptr_t)this->tid,
(uintptr_t)all_rlimits[i], (uintptr_t)&limit, (uintptr_t)NULL);
ASSERT(this, err == 0);
}
NativeArch::prctl_mm_map map;
memset(&map, 0, sizeof(map));
other->vm()->read_mm_map(other, &map);
apply_mm_map(remote_this, map);
}
copy_state(other->capture_state());
activate_preload_thread_locals();
}
/**
* Proceeds until the task is stopped at the next system call (which is then
* mid-execution).
* Returns false if did_waitpid failed because the task got SIGKILL
* or equivalent.
*/
static bool __ptrace_cont(Task* t, ResumeRequest resume_how,
SupportedArch syscall_arch, int expect_syscallno,
int expect_syscallno2 = -1, pid_t new_tid = -1) {
t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS);
while (true) {
// Do our own waiting instead of calling Task::wait() so we can detect and
// handle tid changes due to off-main-thread execve.
WaitOptions options(t->tid);
if (new_tid >= 0) {
options.unblock_on_other_tasks = true;
}
WaitResult result = WaitManager::wait_stop(options);
if (new_tid >= 0 && result.code == WAIT_NO_CHILD) {
// The tid change happened before our wait call. Try another wait.
options.tid = new_tid;
options.unblock_on_other_tasks = false;
result = WaitManager::wait_stop(options);
}
ASSERT(t, result.code == WAIT_OK);
if (new_tid >= 0) {
t->hpc.set_tid(new_tid);
t->tid = new_tid;
}
if (!t->did_waitpid(result.status)) {
return false;
}
if (ReplaySession::is_ignored_signal(t->status().stop_sig())) {
t->resume_execution(resume_how, RESUME_NONBLOCKING, RESUME_NO_TICKS);
} else {
break;
}
}
ASSERT(t, !t->stop_sig())
<< "Expected no pending signal, but got " << t->stop_sig();
/* check if we are synchronized with the trace -- should never fail */
int current_syscall = t->regs().original_syscallno();
ASSERT(t,
current_syscall == expect_syscallno ||
current_syscall == expect_syscallno2)
<< "Should be at " << syscall_name(expect_syscallno, syscall_arch)
<< ", but instead at " << syscall_name(current_syscall, syscall_arch);
return true;
}
void Task::did_handle_ptrace_exit_event() {
ASSERT(this, !handled_ptrace_exit_event_);
handled_ptrace_exit_event_ = true;
}
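/**
 * Make this task perform an execve of |filename| by diverting it to the
 * traced-syscall instruction: write the filename and a NULL argv/envp word
 * into the page at the task's stack pointer, set up registers for the execve,
 * then drive the task through syscall entry and exit.
 */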
void Task::os_exec(SupportedArch exec_arch, std::string filename)
{
// Setup memory and registers for the execve call. We may not have to save
// the old values since they're going to be wiped out by execve. We can
// determine this by checking if this address space has any tasks with a
// different tgid.
Task* memory_task = this;
for (auto task : vm()->task_set()) {
if (task->tgid() != tgid()) {
memory_task = task;
break;
}
}
// Old data if required
std::vector<uint8_t> saved_data;
// Set up everything
Registers regs = this->regs();
regs.set_ip(vm()->traced_syscall_ip());
remote_ptr<void> remote_mem = floor_page_size(regs.sp());
// Determine how much memory we'll need
size_t filename_size = filename.size() + 1;
size_t total_size = filename_size + sizeof(size_t);
if (memory_task != this) {
saved_data = read_mem(remote_mem.cast<uint8_t>(), total_size);
}
// We write a zero word of the host's word size, not t's word size, but that's
// OK, since the host word size is at least as large as t's.
// We pass no argv or envp, so exec params 2 and 3 just point to the NULL
// word.
write_mem(remote_mem.cast<size_t>(), size_t(0));
regs.set_arg2(remote_mem);
regs.set_arg3(remote_mem);
remote_ptr<void> filename_addr = remote_mem + sizeof(size_t);
write_bytes_helper(filename_addr, filename_size, filename.c_str());
regs.set_arg1(filename_addr);
/* The original_syscallno is execve in the old architecture. The kernel does
* not update the original_syscallno when the architecture changes across
* an exec.
* We're using the dedicated traced-syscall IP so its arch is t's arch.
*/
int expect_syscallno = syscall_number_for_execve(arch());
regs.set_syscallno(expect_syscallno);
regs.set_original_syscallno(expect_syscallno);
set_regs(regs);
LOG(debug) << "Beginning execve" << this->regs();
enter_syscall();
ASSERT(this, !stop_sig()) << "exec failed on entry";
/* Complete the syscall. The tid of the task will be the thread-group-leader
* tid, no matter what tid it was before.
*/
pid_t tgid = real_tgid();
bool ok = __ptrace_cont(this, RESUME_SYSCALL, arch(), expect_syscallno,
syscall_number_for_execve(exec_arch),
tgid == tid ? -1 : tgid);
ASSERT(this, ok) << "Task " << tid << " got killed while trying to exec";
LOG(debug) << this->status() << " " << this->regs();
if (this->regs().syscall_result()) {
errno = -this->regs().syscall_result();
if (access(filename.c_str(), 0) == -1 && errno == ENOENT &&
exec_arch == x86) {
FATAL() << "Cannot find " << filename
<< " to replay this 32-bit process; you probably built rr with "
"disable32bit";
}
errno = -this->regs().syscall_result();
ASSERT(this, false) << "Exec of " << filename << " failed";
}
// Restore any memory if required. We need to do this through memory_task,
// since the new task is now on the new address space. Do it now because
// later we may try to unmap this task's syscallbuf.
if (memory_task != this) {
memory_task->write_mem(remote_mem.cast<uint8_t>(), saved_data.data(),
saved_data.size());
}
}
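/**
 * On aarch64 ptrace does not report a separate "original" syscall number or
 * original arg1 the way x86 does, so at syscall entry capture them from the
 * current registers, along with the tick count and ip, for later use when
 * handling the syscall.
 */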
void Task::apply_syscall_entry_regs()
{
if (arch() == aarch64) {
registers.set_original_syscallno(registers.syscallno());
registers.set_orig_arg1(registers.arg1());
// Don't update registers_dirty here, because these registers are not part
// of the ptrace state tracked by that flag.
ticks_at_last_syscall_entry = tick_count();
ip_at_last_syscall_entry = registers.ip();
last_syscall_entry_recorded = false;
}
}
void Task::tgkill(int sig) {
LOG(debug) << "Sending " << sig << " to tid " << tid;
ASSERT(this, 0 == syscall(SYS_tgkill, real_tgid(), tid, sig));
}
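/**
 * Get this task into a signal-stop: send it the syscallbuf desched signal
 * with tgkill() and single-step (without advancing the ip) until the task
 * stops for that signal, swallowing any TIME_SLICE_SIGNALs along the way.
 * Returns false if the task dies instead.
 */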
bool Task::move_to_signal_stop()
{
LOG(debug) << " maybe not in signal-stop (status " << status()
<< "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)";
// Always send SYSCALLBUF_DESCHED_SIGNAL because other signals (except
// TIME_SLICE_SIGNAL) will be blocked by
// RecordTask::will_resume_execution().
// During record make sure to use the syscallbuf desched sig.
// During replay, it doesn't really matter, since we don't apply
// the signal mask to the replay task.
int sig = SYSCALLBUF_DEFAULT_DESCHED_SIGNAL;
if (session().is_recording()) {
sig = session().as_record()->syscallbuf_desched_sig();
}
// Note that this signal cannot be blocked by tracees.
this->tgkill(sig);
/* Now singlestep the task until we're in a signal-stop for the signal
* we've just sent. We must absorb and forget that signal here since we
* don't want it delivered to the task for real.
*/
auto old_ip = ip();
if (arch() == aarch64 && session().is_recording() && status().is_syscall() &&
static_cast<RecordTask*>(this)->at_may_restart_syscall()) {
// On aarch64, single step of an aborted syscall
// will cause us to move to before the syscall instruction
old_ip = old_ip.decrement_by_syscall_insn_length(arch());
}
do {
if (!resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) {
return false;
}
ASSERT(this, old_ip == ip())
<< "Singlestep actually advanced when we "
<< "just expected a signal; was at " << old_ip << " now at "
<< ip() << " with status " << status();
// Ignore any pending TIME_SLICE_SIGNALs and continue until we get our
// SYSCALLBUF_DESCHED_SIGNAL.
} while (stop_sig() == PerfCounters::TIME_SLICE_SIGNAL);
return true;
}
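/**
 * Decide whether |event_type| should divert this task to its rseq abort
 * handler. Reads the tracee's struct rseq and the registered rseq_cs
 * descriptor, validates them (setting *invalid_rseq_cs if the descriptor is
 * malformed or the abort signature doesn't match), and returns true with
 * *new_ip set to the abort ip when the current ip lies inside the critical
 * section and the relevant no-restart flag is not set.
 */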
bool Task::should_apply_rseq_abort(EventType event_type, remote_code_ptr* new_ip,
bool* invalid_rseq_cs) {
/* Syscallbuf flushes don't trigger rseq aborts ---
whatever triggered the syscallbuf flush might */
if (!rseq_state || event_type == EV_SYSCALLBUF_FLUSH) {
return false;
}
// We're relying on the fact that rseq_t is the same across architectures.
// These reads might fail if the task is dead and gone.
bool ok = true;
auto rseq = read_mem(rseq_state->ptr.cast<typename NativeArch::rseq_t>(), &ok);
if (!ok || !rseq.rseq_cs) {
return false;
}
auto rseq_cs = read_mem(remote_ptr<typename NativeArch::rseq_cs>(rseq.rseq_cs), &ok);
if (!ok || rseq_cs.version ||
rseq_cs.start_ip + rseq_cs.post_commit_offset < rseq_cs.start_ip ||
rseq_cs.abort_ip - rseq_cs.start_ip < rseq_cs.post_commit_offset) {
*invalid_rseq_cs = true;
return false;
}
if (ip().register_value() - rseq_cs.start_ip >= rseq_cs.post_commit_offset) {
return false;
}
uint32_t flag;
switch (event_type) {
case EV_SCHED:
flag = 1 << RR_RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT;
break;
case EV_SIGNAL:
flag = 1 << RR_RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT;
break;
default:
/* A system call inside the rseq region should SIGSEGV but we don't emulate that yet */
ASSERT(this, false) << "Unsupported event type";
return false;
}
if ((rseq.flags | rseq_cs.flags) & flag) {
return false;
}
uint32_t sig = read_mem(remote_ptr<uint32_t>(rseq_cs.abort_ip - 4), &ok);
if (!ok || sig != rseq_state->abort_prefix_signature) {
*invalid_rseq_cs = true;
return false;
}
*new_ip = remote_code_ptr(rseq_cs.abort_ip);
return true;
}
}