| /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ |
| |
| #include "RecordSession.h" |
| |
| #include <elf.h> |
| #include <limits.h> |
| #include <linux/capability.h> |
| #include <linux/futex.h> |
| #include <sys/types.h> |
| #include <sys/socket.h> |
| |
| #include <algorithm> |
| #include <sstream> |
| #include <string> |
| |
| #include "AutoRemoteSyscalls.h" |
| #include "ElfReader.h" |
| #include "Flags.h" |
| #include "RecordTask.h" |
| #include "TraceeAttentionSet.h" |
| #include "VirtualPerfCounterMonitor.h" |
| #include "WaitManager.h" |
| #include "core.h" |
| #include "ftrace.h" |
| #include "kernel_metadata.h" |
| #include "kernel_supplement.h" |
| #include "log.h" |
| #include "record_signal.h" |
| #include "record_syscall.h" |
| #include "seccomp-bpf.h" |
| |
| namespace rr { |
| |
| // Undef si_addr_lsb since it's an alias for a field name that doesn't exist, |
| // and we need to use the actual field name. |
| #ifdef si_addr_lsb |
| #undef si_addr_lsb |
| #endif |
| |
| using namespace rr; |
| using namespace std; |
| |
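| // The kernel's robust-futex ABI stores a flag in bit 0 of each list |
| // pointer (it marks PI futexes), so we must clear that bit to recover |
| // the actual entry address before dereferencing. |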
| template <typename T> static remote_ptr<T> mask_low_bit(remote_ptr<T> p) { |
| return p.as_int() & ~uintptr_t(1); |
| } |
| |
| template <typename Arch> |
| static void record_robust_futex_change( |
| RecordTask* t, const typename Arch::robust_list_head& head, |
| remote_ptr<void> base) { |
| if (base.is_null()) { |
| return; |
| } |
| remote_ptr<void> futex_void_ptr = base + head.futex_offset; |
| auto futex_ptr = futex_void_ptr.cast<uint32_t>(); |
| // We can't just record the current futex value because at this point |
| // in task exit the robust futex handling has not happened yet. So we have |
| // to emulate what the kernel will do! |
| bool ok = true; |
| uint32_t val = t->read_mem(futex_ptr, &ok); |
| if (!ok) { |
| return; |
| } |
| if (pid_t(val & FUTEX_TID_MASK) != t->own_namespace_rec_tid) { |
| return; |
| } |
| val = (val & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
| // Update memory now so that the kernel doesn't decide to do it later, at |
| // a time that might race with other tracee execution. |
| t->write_mem(futex_ptr, val); |
| t->record_local(futex_ptr, &val); |
| } |
| |
| /** |
| * Any user-space writes performed by robust futex handling are captured here. |
| * They must be emulated during replay; the kernel will not do it for us |
| * during replay because the TID value in each futex is the recorded |
| * TID, not the actual TID of the dying task. |
| */ |
| template <typename Arch> |
| static void record_robust_futex_changes_arch(RecordTask* t) { |
| if (t->did_record_robust_futex_changes) { |
| return; |
| } |
| t->did_record_robust_futex_changes = true; |
| |
| auto head_ptr = t->robust_list().cast<typename Arch::robust_list_head>(); |
| if (head_ptr.is_null()) { |
| return; |
| } |
| ASSERT(t, t->robust_list_len() == sizeof(typename Arch::robust_list_head)); |
| bool ok = true; |
| auto head = t->read_mem(head_ptr, &ok); |
| if (!ok) { |
| return; |
| } |
| record_robust_futex_change<Arch>(t, head, |
| mask_low_bit(head.list_op_pending.rptr())); |
| for (auto current = mask_low_bit(head.list.next.rptr()); |
| current.as_int() != head_ptr.as_int();) { |
| record_robust_futex_change<Arch>(t, head, current); |
| auto next = t->read_mem(current, &ok); |
| if (!ok) { |
| return; |
| } |
| current = mask_low_bit(next.next.rptr()); |
| } |
| } |
| |
| static void record_robust_futex_changes(RecordTask* t) { |
| RR_ARCH_FUNCTION(record_robust_futex_changes_arch, t->arch(), t); |
| } |
| |
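| // Write a task-exit event to the trace; if |t| is its thread-group |
| // leader, also record the exit status for the whole thread group. |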
| static void record_exit_trace_event(RecordTask* t, WaitStatus exit_status) { |
| t->session().trace_writer().write_task_event( |
| TraceTaskEvent::for_exit(t->tid, exit_status)); |
| if (t->thread_group()->tgid == t->tid) { |
| t->thread_group()->exit_status = exit_status; |
| } |
| } |
| |
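| /** |
| * Heuristically determine whether |t| appears to be stopped at a syscall |
| * entry. Used below to decide whether a syscall-entry event still needs |
| * to be recorded before the task exits. |
| */ |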
| static bool looks_like_syscall_entry(RecordTask* t) { |
| bool ok; |
| bool at_syscall = is_at_syscall_instruction(t, |
| t->regs().ip().decrement_by_syscall_insn_length(t->arch()), &ok); |
| // It's possible for the task to have died (e.g. if it got signaled twice |
| // in rapid succession). In that case, try to just go by register contents. |
| if (ok && !at_syscall) { |
| return false; |
| } |
| if (is_x86ish(t->arch())) { |
| // On x86, rax gets set to -ENOSYS on syscall entry; elsewhere this does |
| // not happen. Further, even if we did ask about the syscallno, it might |
| // have been reset by the signal handler. However, on non-x86 platforms we |
| // currently count all taken branches rather than only conditional ones, |
| // so it should be impossible to see the same syscall ip twice without |
| // intervening ticks, and the check following these conditions should be |
| // sufficient there. |
| return t->regs().original_syscallno() >= 0 && |
| t->regs().syscall_result_signed() == -ENOSYS; |
| } else if (t->arch() == aarch64) { |
| // We recorded when we saw the last syscall entry, |
| // so just use that to determine if we've already saved it in the trace. |
| if (t->ticks_at_last_syscall_entry == t->tick_count() && |
| t->ip_at_last_syscall_entry == t->regs().ip()) { |
| return !t->last_syscall_entry_recorded; |
| } |
| } |
| // Recording a sched event here is better than a spurious syscall event: |
| // syscall entry does not visibly modify registers, so when replay hits |
| // the sched event the register state will still match. |
| return ok; |
| } |
| |
| /** |
| * Return true if we handle a ptrace exit event for task t. When this returns |
| * true, t may have been deleted. |
| */ |
| static bool handle_ptrace_exit_event(RecordTask* t) { |
| if (t->was_reaped()) { |
| if (t->handled_ptrace_exit_event()) { |
| t->did_reach_zombie(); |
| return true; |
| } |
| } else if (t->ptrace_event() != PTRACE_EVENT_EXIT) { |
| return false; |
| } |
| |
| if (t->stable_exit || t->was_reaped()) { |
| LOG(debug) << "stable exit"; |
| } else { |
| if (!t->may_be_blocked()) { |
| // The task might have been hit by a SIGKILL or a SECCOMP_RET_KILL, in |
| // which case there might be some execution since its last recorded event |
| // that we need to replay. |
| // There's a weird case (in 4.13.5-200.fc26.x86_64 at least) where the |
| // task can enter the kernel but instead of receiving a syscall ptrace |
| // event, we receive a PTRACE_EVENT_EXIT due to a concurrent execve |
| // (and probably a concurrent SIGKILL could do the same). The task state |
| // has been updated to reflect syscall entry. If we record a SCHED in |
| // that state replay of the SCHED will fail. So detect that state and fix |
| // it up. |
| // If we got killed in an untraced syscall on AArch64, |
| // it is difficult/impossible to tell if the value of x0 has been overwritten |
| // with the syscall result/error number |
| // and it's even harder to recover the correct value of x0. |
| // Simply ignore these since we weren't going to record them anyway. |
| if (looks_like_syscall_entry(t) && !t->is_in_untraced_syscall()) { |
| // Either we're in a syscall, or we're immediately after a syscall |
| // and it exited. |
| if (t->ticks_at_last_recorded_syscall_exit == t->tick_count() && |
| t->regs().ip() == t->ip_at_last_recorded_syscall_exit) { |
| LOG(debug) << "Nothing to record after PTRACE_EVENT_EXIT"; |
| // It's the latter case; do nothing. |
| } else { |
| // It's the former case ... probably. Theoretically we could have |
| // re-executed a syscall without any ticks in between, but that seems |
| // highly improbable. |
| // Record the syscall-entry event that we otherwise failed to record. |
| t->canonicalize_regs(t->arch()); |
| auto r = t->regs(); |
| if (t->arch() == aarch64) { |
| // On AArch64, when we get here, there are 3 different cases, |
| // 1. EXIT before we hit the syscall entry stop |
| // 2. EXIT after syscall entry stop but |
| // before the result (X0) is overwritten |
| // 3. EXIT after syscall entry stop and |
| // after the result (X0) is overwritten |
| // (i.e. after the syscall but we got an EXIT |
| // before the syscall exit stop.) |
| |
| // We detect the first case based on `*_at_last_syscall_entry` |
| // set by `apply_syscall_entry_regs`, and trust the current |
| // values of `x0` and `x8`. |
| |
| // For the second and third cases, we rely on the syscall enter stop |
| // to set the orig_arg1 and original_syscallno correctly. |
| if (t->ticks_at_last_syscall_entry == t->tick_count() && |
| t->ip_at_last_syscall_entry == r.ip()) { |
| // We need to rely on the saved `orig_arg1` since in the third case |
| // the `x0` may already be overwritten. |
| // The assertion here assumes that |
| // `apply_syscall_entry_regs` is called when we enter the syscall |
| // and `x8` still holds the correct syscall number |
| // when we hit the process exit stop. |
| ASSERT(t, r.original_syscallno() == r.syscallno()) |
| << "syscallno not saved by syscall enter handler: " << r; |
| r.set_arg1(r.orig_arg1()); |
| } else { |
| r.set_original_syscallno(r.syscallno()); |
| } |
| } |
| // Assume it's a native-arch syscall. If it isn't, it doesn't matter |
| // all that much since we aren't actually going to do anything with it |
| // in this task. |
| // Avoid calling detect_syscall_arch here since it could fail if the |
| // task is already completely dead and gone. |
| SyscallEvent event(r.original_syscallno(), t->arch()); |
| event.state = ENTERING_SYSCALL; |
| // Don't try to reset the syscallbuf here. The task may be exiting |
| // while in arbitrary syscallbuf code. And of course, because it's |
| // exiting, it doesn't matter if we don't reset the syscallbuf. |
| t->record_event(event, RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::DONT_RESET_SYSCALLBUF, &r); |
| } |
| } else { |
| // Don't try to reset the syscallbuf here. The task may be exiting |
| // while in arbitrary syscallbuf code. And of course, because it's |
| // exiting, it doesn't matter if we don't reset the syscallbuf. |
| // XXX flushing the syscallbuf may be risky too... |
| auto event = Event::sched(); |
| // When replaying this SCHED, we won't proceed past the `syscall_hook` |
| // entry point. Code inside the syscallbuf may be in a bad state during |
| // replay because we didn't save buffered syscalls. |
| event.Sched().in_syscallbuf_syscall_hook = t->syscallbuf_code_layout.syscallbuf_syscall_hook; |
| t->record_event(event, RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::DONT_RESET_SYSCALLBUF); |
| } |
| } |
| /* XXX: We could try to find some tasks here to unmap our buffers, but it |
| * seems hardly worth it. |
| * Mark the buffers as gone only after recording events, since recording |
| * them may need to flush the syscallbuf. */ |
| t->destroy_buffers(nullptr, nullptr); |
| } |
| |
| WaitStatus exit_status; |
| if (t->was_reaped()) { |
| exit_status = t->status(); |
| } else { |
| record_robust_futex_changes(t); |
| |
| unsigned long msg = 0; |
| // If ptrace_if_stopped fails, then the task has been killed by SIGKILL |
| // or equivalent. |
| if (t->ptrace_if_stopped(PTRACE_GETEVENTMSG, nullptr, &msg)) { |
| exit_status = WaitStatus(msg); |
| } else { |
| exit_status = WaitStatus::for_fatal_sig(SIGKILL); |
| } |
| } |
| |
| t->did_handle_ptrace_exit_event(); |
| |
| // If we died because of a coredumping signal, that is a barrier event, and |
| // every task in the address space needs to pass its PTRACE_EXIT_EVENT before |
| // they proceed to (potentially hidden) zombie state, so we can't wait for |
| // that to happen. |
| // Similarly we can't wait for this task to exit if there are other |
| // tasks in its pid namespace that need to exit and this is the last thread |
| // of pid-1 in that namespace, because the kernel must reap them before |
| // letting this task complete its exit. |
| bool may_wait_exit = !t->was_reaped() && !is_coredumping_signal(exit_status.fatal_sig()) && |
| !t->waiting_for_pid_namespace_tasks_to_exit(); |
| record_exit_trace_event(t, exit_status); |
| t->record_exit_event( |
| (!t->was_reaped() && !may_wait_exit) ? RecordTask::WRITE_CHILD_TID : RecordTask::KERNEL_WRITES_CHILD_TID); |
| if (!t->was_reaped()) { |
| t->proceed_to_exit(may_wait_exit); |
| } |
| t->do_ptrace_exit_stop(exit_status); |
| if (may_wait_exit) { |
| t->did_reach_zombie(); |
| } else if (!t->was_reaped()) { |
| t->waiting_for_reap = true; |
| } |
| return true; |
| } |
| |
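| /** |
| * Mark the current syscall event as entering the syscall, saving the |
| * argument registers on a fresh entry (so a later restart can be |
| * detected) or restoring them on a restart. |
| */ |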
| static void note_entering_syscall(RecordTask* t) { |
| ASSERT(t, EV_SYSCALL == t->ev().type()); |
| t->ev().Syscall().state = ENTERING_SYSCALL; |
| if (!t->ev().Syscall().is_restart) { |
| /* Save a copy of the arg registers so that we |
| * can use them to detect later restarted |
| * syscalls, if this syscall ends up being |
| * restarted. We have to save the registers |
| * in this rather awkward place because we |
| * need the original registers; the restart |
| * (if it's not a SYS_restart_syscall restart) |
| * will use the original registers. */ |
| t->ev().Syscall().regs = t->regs(); |
| } else { |
| t->ev().Syscall().regs.set_syscallno(t->regs().syscallno()); |
| // We may have intentionally stored the syscall result here. |
| // Now that we're safely past the signal delivery, make the |
| // registers look like they did at the original syscall entry |
| // again. |
| t->ev().Syscall().regs.set_arg1(t->ev().Syscall().regs.orig_arg1()); |
| if (t->arch() == aarch64) { |
| // We probably got here with a PTRACE_SYSCALL. The x7 |
| // value will be wrong due to the aarch64 kernel bug. |
| // Get it from the syscall event. |
| Registers r = t->regs(); |
| r.set_x7(t->ev().Syscall().regs.x7()); |
| t->set_regs(r); |
| } |
| } |
| } |
| |
| #if defined (__x86_64__) |
| static bool is_in_vsyscall(remote_code_ptr ip) |
| { |
| // This is hardcoded by the Linux ABI |
| remote_code_ptr vsyscall_start = 0xffffffffff600000; |
| remote_code_ptr vsyscall_end = 0xffffffffff601000; |
| return vsyscall_start <= ip && ip < vsyscall_end; |
| } |
| #else |
| static bool is_in_vsyscall(remote_code_ptr) |
| { |
| return false; |
| } |
| #endif |
| |
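| /** |
| * Handle a seccomp stop for a traced syscall: either process the syscall |
| * entry now or arrange for the upcoming ptrace syscall trap to do so, |
| * depending on the kernel's seccomp/ptrace event ordering. |
| */ |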
| void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, |
| StepState* step_state, |
| RecordResult* result, |
| bool* did_enter_syscall) { |
| *did_enter_syscall = false; |
| |
| // Special case: If the tracee issues a vsyscall, we will get a seccomp trap, |
| // but no syscall traps whatsoever. In particular, we wouldn't see it during |
| // replay either. We try to monkeypatch the caller on the assumption that known |
| // callers of this (deprecated) interface all follow a common pattern. If we |
| // can't patch the caller, this is a fatal error, since the recording will |
| // otherwise be broken. |
| remote_code_ptr ip = t->regs().ip(); |
| if (is_in_vsyscall(ip)) { |
| remote_ptr<void> sp = t->regs().sp(); |
| // The kernel assumes the return address is on the stack - we do the same |
| remote_ptr<remote_code_ptr> ret_addr_addr = sp.cast<remote_code_ptr>(); |
| remote_code_ptr ret_addr = t->read_mem(ret_addr_addr); |
| |
| // Skip this syscall. We will attempt to patch it to the vdso entry and |
| // let the tracee retry there. |
| Registers regs = t->regs(); |
| regs.set_original_syscallno(-1); |
| // We can't modify the ip here, the kernel will kill the tracee with |
| // SIGSYS. Instead, we set a breakpoint at the return instruction. |
| t->set_regs(regs); |
| t->vm()->add_breakpoint(ret_addr, BKPT_INTERNAL); |
| while (true) { |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| // Tracee exited unexpectedly |
| return; |
| } |
| ASSERT(t, !t->ptrace_event()); |
| if (t->stop_sig() == syscallbuf_desched_sig()) { |
| continue; |
| } |
| if (t->stop_sig() == SIGTRAP && |
| is_kernel_trap(t->get_siginfo().si_code)) { |
| // Hit the breakpoint |
| break; |
| } |
| t->stash_sig(); |
| } |
| t->vm()->remove_breakpoint(ret_addr, BKPT_INTERNAL); |
| |
| ASSERT(t, t->regs().ip().undo_executed_bkpt(t->arch()) == ret_addr); |
| |
| // Now that we're in a sane state, ask the Monkeypatcher to try and patch |
| // that. |
| bool patch_ok = t->vm()->monkeypatcher().try_patch_vsyscall_caller(t, ret_addr); |
| ASSERT(t, patch_ok) << "The tracee issued a vsyscall to " << ip |
| << " but we failed to monkeypatch the caller (return address " |
| << ret_addr << ", sp=" << sp << "). Recording will not succeed. Exiting."; |
| |
| // Reset to the start of the region and continue |
| regs = t->regs(); |
| regs.set_ip(ret_addr.decrement_by_vsyscall_entry_length(t->arch())); |
| t->set_regs(regs); |
| |
| // We patched this syscall, record that |
| auto ev = Event::patch_syscall(); |
| ev.PatchSyscall().patch_vsyscall = true; |
| t->record_event(ev); |
| |
| step_state->continue_type = RecordSession::CONTINUE; |
| return; |
| } |
| |
| int syscallno = t->regs().original_syscallno(); |
| if (syscallno < 0) { |
| // negative syscall numbers after a SECCOMP event |
| // are treated as "skip this syscall". There will be one syscall event |
| // reported instead of two. So fake an enter-syscall event now. |
| // It doesn't really matter what the syscall-arch is. |
| t->canonicalize_regs(t->arch()); |
| if (syscall_seccomp_ordering_ == SECCOMP_BEFORE_PTRACE_SYSCALL) { |
| // If the ptrace entry stop hasn't happened yet, we're at a weird |
| // intermediate state where the behavior of the next PTRACE_SYSCALL |
| // will depend on the register state (i.e. whether we see an entry |
| // trap or proceed right to the exit trap). To make things easier |
| // on the rest of the system, do a fake syscall entry, then reset |
| // the register state. |
| Registers orig_regs = t->regs(); |
| Registers r = orig_regs; |
| r.set_original_syscallno(syscall_number_for_gettid(t->arch())); |
| t->set_regs(r); |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| // Tracee died unexpectedly. We did not enter a syscall. |
| // We shouldn't try to resume it now. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| return; |
| } |
| t->set_regs(orig_regs); |
| } |
| |
| // Don't continue yet. At the next iteration of record_step, we'll |
| // enter syscall_state_changed and that will trigger a continue to |
| // the syscall exit. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| if (!process_syscall_entry(t, step_state, result, t->arch())) { |
| return; |
| } |
| *did_enter_syscall = true; |
| return; |
| } |
| |
| if (syscall_seccomp_ordering_ == SECCOMP_BEFORE_PTRACE_SYSCALL) { |
| // The next continue needs to be a PTRACE_SYSCALL to observe |
| // the enter-syscall event. |
| step_state->continue_type = RecordSession::CONTINUE_SYSCALL; |
| } else { |
| ASSERT(t, syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP); |
| if (t->ev().is_syscall_event() && |
| t->ev().Syscall().state == PROCESSING_SYSCALL) { |
| // We did PTRACE_SYSCALL and already saw a syscall trap. Just ignore this. |
| LOG(debug) << "Ignoring SECCOMP syscall trap since we already got a " |
| "PTRACE_SYSCALL trap"; |
| // The next continue needs to be a PTRACE_SYSCALL to observe |
| // the exit-syscall event. |
| step_state->continue_type = RecordSession::CONTINUE_SYSCALL; |
| // Need to restore last_task_switchable since it will have been |
| // reset to PREVENT_SWITCH |
| last_task_switchable = t->ev().Syscall().switchable; |
| } else { |
| // We've already passed the PTRACE_SYSCALL trap for syscall entry, so |
| // we need to handle that now. |
| SupportedArch syscall_arch = t->detect_syscall_arch(); |
| t->canonicalize_regs(syscall_arch); |
| if (!process_syscall_entry(t, step_state, result, syscall_arch)) { |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| return; |
| } |
| *did_enter_syscall = true; |
| } |
| } |
| } |
| |
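| // Finish up after a SECCOMP_RET_TRAP'ed buffered syscall: mark the |
| // syscallbuf record as failed and aborted so that both the tracee's |
| // commit logic and replay know the syscall never ran. |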
| static void seccomp_trap_done(RecordTask* t) { |
| t->pop_seccomp_trap(); |
| |
| // It's safe to reset the syscall buffer now. |
| t->delay_syscallbuf_reset_for_seccomp_trap = false; |
| |
| t->write_and_record(REMOTE_PTR_FIELD(t->syscallbuf_child, failed_during_preparation), |
| (uint8_t)1); |
| if (EV_DESCHED == t->ev().type()) { |
| // Desched processing will do the rest for us |
| return; |
| } |
| |
| // Abort the current syscallbuf record, which corresponds to the syscall that |
| // wasn't actually executed due to seccomp. |
| t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, abort_commit), (uint8_t)1); |
| t->record_event(Event::syscallbuf_abort_commit()); |
| |
| // Run the syscall exit hook. This ensures we'll reset the buffer |
| // before we try to buffer another syscall. |
| t->write_mem( |
| REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit), |
| (uint8_t)1); |
| } |
| |
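| // Emulate the kernel's SECCOMP_RET_TRAP behavior: skip the syscall, |
| // synthesize the SIGSYS the kernel would deliver, and record enough |
| // state for replay to do the same. |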
| static void handle_seccomp_trap(RecordTask* t, |
| RecordSession::StepState* step_state, |
| uint16_t seccomp_data) { |
| // The architecture may be wrong, but that's ok, because an actual syscall |
| // entry did happen, so the registers are already updated according to the |
| // architecture of the system call. |
| t->canonicalize_regs(t->detect_syscall_arch()); |
| t->apply_syscall_entry_regs(); |
| |
| Registers r = t->regs(); |
| int syscallno = r.original_syscallno(); |
| // Cause kernel processing to skip the syscall |
| r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); |
| t->set_regs(r); |
| |
| bool syscall_entry_already_recorded = false; |
| if (t->ev().is_syscall_event()) { |
| // A syscall event was already pushed, probably because we did a |
| // PTRACE_SYSCALL to enter the syscall during handle_desched_event. Cancel |
| // that event now since the seccomp SIGSYS aborts it completely. |
| ASSERT(t, t->ev().Syscall().number == syscallno); |
| // Make sure any prepared syscall state is discarded and any temporary |
| // effects (e.g. redirecting pointers to scratch) undone. |
| rec_abort_prepared_syscall(t); |
| if (t->ev().type() == EV_SYSCALL_INTERRUPTION) { |
| // The event could be a syscall-interruption if it was pushed by |
| // `handle_desched_event`. In that case, it has not been recorded yet. |
| t->pop_syscall_interruption(); |
| } else { |
| t->pop_syscall(); |
| syscall_entry_already_recorded = true; |
| } |
| } |
| |
| if (t->is_in_untraced_syscall()) { |
| ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap); |
| // Don't reset the syscallbuf immediately after delivering the trap. We have |
| // to wait until this buffered syscall aborts completely before resetting |
| // the buffer. |
| t->delay_syscallbuf_reset_for_seccomp_trap = true; |
| |
| t->push_event(Event::seccomp_trap()); |
| |
| // desched may be armed but we're not going to execute the syscall, let |
| // alone block. If it fires, ignore it. |
| t->write_mem( |
| REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant), |
| (uint8_t)0); |
| } |
| |
| t->push_syscall_event(syscallno); |
| t->ev().Syscall().failed_during_preparation = true; |
| note_entering_syscall(t); |
| |
| if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) { |
| t->record_current_event(); |
| } |
| |
| // Use NativeArch here because different versions of system headers |
| // have inconsistent field naming. |
| union { |
| NativeArch::siginfo_t native_api; |
| siginfo_t linux_api; |
| } si; |
| memset(&si, 0, sizeof(si)); |
| si.native_api.si_signo = SIGSYS; |
| si.native_api.si_errno = seccomp_data; |
| si.native_api.si_code = SYS_SECCOMP; |
| si.native_api._sifields._sigsys._arch = to_audit_arch(r.arch()); |
| si.native_api._sifields._sigsys._syscall = syscallno; |
| // Documentation says that si_call_addr is the address of the syscall |
| // instruction, but in tests it's immediately after the syscall |
| // instruction. |
| si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr<void>(); |
| LOG(debug) << "Synthesizing " << si.linux_api; |
| t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG); |
| |
| // Tests show that the current registers are preserved (on x86, eax/rax |
| // retains the syscall number). |
| r.set_syscallno(syscallno); |
| t->set_regs(r); |
| t->maybe_restore_original_syscall_registers(); |
| |
| if (t->is_in_untraced_syscall()) { |
| // For buffered syscalls, go ahead and record the exit state immediately. |
| t->ev().Syscall().state = EXITING_SYSCALL; |
| t->record_current_event(); |
| t->pop_syscall(); |
| |
| // The tracee is currently in the seccomp ptrace-stop. Advance it to the |
| // syscall-exit stop so that when we try to deliver the SIGSYS via |
| // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. |
| // If this fails, that's fine, we're not going to deliver the SIGSYS. |
| t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); |
| } |
| |
| // Don't continue yet. At the next iteration of record_step, if we |
| // recorded the syscall-entry we'll enter syscall_state_changed and |
| // that will trigger a continue to the syscall exit. If we recorded the |
| // syscall-exit we'll go straight into signal delivery. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| } |
| |
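| // Emulate SECCOMP_RET_ERRNO: skip the syscall and force its result to |
| // -seccomp_data, as the kernel would. |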
| static void handle_seccomp_errno(RecordTask* t, |
| RecordSession::StepState* step_state, |
| uint16_t seccomp_data) { |
| t->canonicalize_regs(t->detect_syscall_arch()); |
| |
| Registers r = t->regs(); |
| int syscallno = r.original_syscallno(); |
| // Cause kernel processing to skip the syscall |
| r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); |
| t->set_regs(r); |
| |
| if (!t->is_in_untraced_syscall()) { |
| t->push_syscall_event(syscallno); |
| // Note that the syscall failed. prepare_clone() needs to know |
| // this during replay of the syscall entry. |
| t->ev().Syscall().failed_during_preparation = true; |
| note_entering_syscall(t); |
| } |
| |
| r.set_syscall_result(-seccomp_data); |
| t->set_regs(r); |
| // Don't continue yet. At the next iteration of record_step, if we |
| // recorded the syscall-entry we'll enter syscall_state_changed and |
| // that will trigger a continue to the syscall exit. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| } |
| |
| bool RecordSession::handle_ptrace_event(RecordTask** t_ptr, |
| StepState* step_state, |
| RecordResult* result, |
| bool* did_enter_syscall) { |
| *did_enter_syscall = false; |
| |
| RecordTask* t = *t_ptr; |
| if (t->status().group_stop() || t->has_stashed_group_stop()) { |
| t->clear_stashed_group_stop(); |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| return true; |
| } |
| |
| int event = t->ptrace_event(); |
| if (!event) { |
| return false; |
| } |
| |
| LOG(debug) << " " << t->tid << ": handle_ptrace_event " |
| << ptrace_event_name(event) << ": event " << t->ev(); |
| |
| switch (event) { |
| case PTRACE_EVENT_SECCOMP_OBSOLETE: |
| case PTRACE_EVENT_SECCOMP: { |
| if (syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { |
| syscall_seccomp_ordering_ = SECCOMP_BEFORE_PTRACE_SYSCALL; |
| } |
| |
| int seccomp_data = t->get_ptrace_eventmsg_seccomp_data(); |
| // We need to set the orig_* values before we let the process continue to exit |
| // since the handler for the exit event will need them. |
| // See `handle_ptrace_exit_event` above. |
| t->apply_syscall_entry_regs(); |
| if (seccomp_data < 0) { |
| // Process just died. Urk. Just wait for the exit event and pretend this stop never happened! |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| return true; |
| } |
| int syscallno = t->regs().original_syscallno(); |
| if (seccomp_data == SECCOMP_RET_DATA) { |
| LOG(debug) << " traced syscall entered: " |
| << syscall_name(syscallno, t->arch()); |
| handle_seccomp_traced_syscall(t, step_state, result, did_enter_syscall); |
| } else { |
| // Note that we make no attempt to patch the syscall site when the |
| // user's filter does not return ALLOW. Apart from the ERRNO case, |
| // handling these syscalls is necessarily slow anyway. |
| uint32_t real_result; |
| if (!seccomp_filter_rewriter().map_filter_data_to_real_result( |
| t, seccomp_data, &real_result)) { |
| LOG(debug) |
| << "Process terminated unexpectedly during PTRACE_GETEVENTMSG"; |
| step_state->continue_type = RecordSession::CONTINUE; |
| break; |
| } |
| uint16_t real_result_data = real_result & SECCOMP_RET_DATA; |
| switch (real_result & SECCOMP_RET_ACTION) { |
| case SECCOMP_RET_TRAP: |
| LOG(debug) << " seccomp trap for syscall: " |
| << syscall_name(syscallno, t->arch()); |
| handle_seccomp_trap(t, step_state, real_result_data); |
| break; |
| case SECCOMP_RET_ERRNO: |
| LOG(debug) << " seccomp errno " << errno_name(real_result_data) |
| << " for syscall: " |
| << syscall_name(syscallno, t->arch()); |
| handle_seccomp_errno(t, step_state, real_result_data); |
| break; |
| case SECCOMP_RET_KILL: |
| LOG(debug) << " seccomp kill for syscall: " |
| << syscall_name(syscallno, t->arch()); |
| t->tgkill(SIGKILL); |
| // Rely on the SIGKILL to bump us out of the ptrace stop. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| // Now wait for the tracee to actually exit its ptrace-stop and proceed |
| // to the PTRACE_EVENT_EXIT. This avoids the race where our |
| // PTRACE_CONT might kick it out of the PTRACE_EVENT_EXIT before |
| // we can process it. |
| // If this fails because of *another* SIGKILL, that's fine. |
| t->wait(); |
| break; |
| default: |
| ASSERT(t, false) << "Seccomp result not handled"; |
| break; |
| } |
| } |
| break; |
| } |
| |
| case PTRACE_EVENT_EXEC: { |
| if (t->thread_group()->task_set().size() > 1) { |
| // All tasks but the task that did the execve should have exited by |
| // now and notified us of their exits. However, it's possible that |
| // while running the thread-group leader, our PTRACE_CONT raced with its |
| // PTRACE_EVENT_EXIT and it exited, and the next event we got is this |
| // PTRACE_EVENT_EXEC after the exec'ing task changed its tid to the |
| // leader's tid. Or maybe there are kernel bugs; on |
| // 4.2.0-42-generic running exec_from_other_thread, we reproducibly |
| // enter PTRACE_EVENT_EXEC for the thread-group leader without seeing |
| // its PTRACE_EVENT_EXIT. |
| |
| // So, record this task's exit and destroy it. |
| // XXX We can't do record_robust_futex_changes here because the address |
| // space has already gone. That would only matter if some of them were |
| // in memory accessible to another process even after exec, i.e. a |
| // shared-memory mapping or two different thread-groups sharing the same |
| // address space. |
| pid_t tid = t->rec_tid; |
| WaitStatus status = t->status(); |
| record_exit_trace_event(t, WaitStatus(0)); |
| t->record_exit_event(); |
| // Don't call RecordTask::destroy() because we don't want to |
| // PTRACE_DETACH. |
| delete t; |
| // Steal the exec'ing task and make it the thread-group leader, and |
| // carry on! |
| t = revive_task_for_exec(tid); |
| scheduler().set_current(t); |
| *t_ptr = t; |
| // Tell t that it is actually stopped, because the stop we got is really |
| // for this task, not the old dead task. |
| if (!t->did_waitpid(status)) { |
| // This is totally untested and almost certainly broken, but if the |
| // task was SIGKILLed out of the EXEC stop then we should probably |
| // just pretend the exec never happened. |
| step_state->continue_type = CONTINUE_SYSCALL; |
| break; |
| } |
| } |
| t->post_exec(); |
| t->session().scheduler().did_exit_execve(t); |
| |
| // Forward ptrace exec notification |
| if (t->emulated_ptracer) { |
| if (t->emulated_ptrace_options & PTRACE_O_TRACEEXEC) { |
| t->emulate_ptrace_stop( |
| WaitStatus::for_ptrace_event(PTRACE_EVENT_EXEC)); |
| } else if (!t->emulated_ptrace_seized) { |
| // Inject legacy SIGTRAP-after-exec |
| t->tgkill(SIGTRAP); |
| } |
| } |
| |
| if (t->emulated_stop_pending) { |
| step_state->continue_type = DONT_CONTINUE; |
| } else { |
| // Skip past the ptrace event. |
| step_state->continue_type = CONTINUE_SYSCALL; |
| } |
| break; |
| } |
| |
| default: |
| ASSERT(t, false) << "Unhandled ptrace event " << ptrace_event_name(event) |
| << "(" << event << ")"; |
| break; |
| } |
| |
| return true; |
| } |
| |
| static void debug_exec_state(const char* msg, RecordTask* t) { |
| LOG(debug) << msg << ": status=" << t->status(); |
| } |
| |
| template <typename Arch> |
| static bool is_ptrace_any_singlestep_arch(int command) { |
| return command >= 0 && |
| (command == PTRACE_SINGLESTEP || command == Arch::PTRACE_SYSEMU_SINGLESTEP); |
| } |
| |
| static bool is_ptrace_any_singlestep(SupportedArch arch, int command) |
| { |
| RR_ARCH_FUNCTION(is_ptrace_any_singlestep_arch, arch, command); |
| } |
| |
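| /** |
| * Resume the scheduler's current task according to |step_state|, choosing |
| * the resume method (cont/syscall/singlestep) and a tick budget for the |
| * remaining timeslice. |
| */ |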
| void RecordSession::task_continue(const StepState& step_state) { |
| RecordTask* t = scheduler().current(); |
| |
| ASSERT(t, step_state.continue_type != DONT_CONTINUE); |
| // A task in an emulated ptrace-stop must really stay stopped |
| ASSERT(t, !t->emulated_stop_pending); |
| |
| bool may_restart = t->at_may_restart_syscall(); |
| |
| if (may_restart && t->seccomp_bpf_enabled) { |
| LOG(debug) << " PTRACE_SYSCALL to possibly-restarted " << t->ev(); |
| } |
| |
| if (!t->vm()->first_run_event()) { |
| t->vm()->set_first_run_event(trace_writer().time()); |
| } |
| |
| if (!t->thread_group()->first_run_event()) { |
| t->thread_group()->set_first_run_event(trace_writer().time()); |
| } |
| |
| TicksRequest ticks_request; |
| ResumeRequest resume; |
| if (step_state.continue_type == CONTINUE_SYSCALL) { |
| ticks_request = RESUME_NO_TICKS; |
| resume = RESUME_SYSCALL; |
| } else { |
| if (t->has_stashed_sig(PerfCounters::TIME_SLICE_SIGNAL)) { |
| // timeslice signal already stashed, no point in generating another one |
| // (and potentially slow) |
| ticks_request = RESUME_UNLIMITED_TICKS; |
| } else if (scheduler().may_use_unlimited_ticks()) { |
| ticks_request = RESUME_UNLIMITED_TICKS; |
| } else { |
| ticks_request = (TicksRequest)max<Ticks>( |
| 0, scheduler().current_timeslice_end() - t->tick_count()); |
| } |
| |
| // Clear any lingering state, then see if we need to stop earlier for a |
| // tracee-requested pmc interrupt on the virtualized performance counter. |
| t->next_pmc_interrupt_is_for_user = false; |
| if (auto vpmc = |
| VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(t)) { |
| ASSERT(t, vpmc->target_tuid() == t->tuid()); |
| |
| Ticks after = max<Ticks>(vpmc->target_ticks() - t->tick_count(), 0); |
| if ((uint64_t)after < (uint64_t)ticks_request) { |
| LOG(debug) << "ticks_request constrained from " << ticks_request |
| << " to " << after << " for vpmc"; |
| ticks_request = (TicksRequest)after; |
| t->next_pmc_interrupt_is_for_user = true; |
| } |
| } |
| |
| // Override requested by the tracee for testing purposes |
| if (t->tick_request_override != (TicksRequest)0) { |
| ASSERT(t, !t->next_pmc_interrupt_is_for_user); |
| ticks_request = t->tick_request_override; |
| t->tick_request_override = (TicksRequest)0; |
| } |
| |
| bool singlestep = is_ptrace_any_singlestep(t->arch(), |
| t->emulated_ptrace_cont_command); |
| if (singlestep && is_at_syscall_instruction(t, t->ip())) { |
| // We're about to singlestep into a syscall instruction. |
| // Act like we're NOT singlestepping since doing a PTRACE_SINGLESTEP would |
| // skip over the system call. |
| LOG(debug) |
| << "Clearing singlestep because we're about to enter a syscall"; |
| singlestep = false; |
| } |
| if (singlestep) { |
| resume = RESUME_SINGLESTEP; |
| } else { |
| /* We won't receive PTRACE_EVENT_SECCOMP events until |
| * the seccomp filter is installed by the |
| * syscall_buffer lib in the child, therefore we must |
| * record in the traditional way (with PTRACE_SYSCALL) |
| * until it is installed. */ |
| /* Kernel commit |
| https://github.com/torvalds/linux/commit/93e35efb8de45393cf61ed07f7b407629bf698ea |
| makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE |
| traps. |
| Detect and handle this. */ |
| if (!t->seccomp_bpf_enabled || may_restart || |
| syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { |
| resume = RESUME_SYSCALL; |
| } else { |
| /* When the seccomp filter is on, instead of capturing |
| * syscalls by using PTRACE_SYSCALL, the filter will |
| * generate the ptrace events. This means we allow the |
| * process to run using PTRACE_CONT, and rely on the |
| * seccomp filter to generate the special |
| * PTRACE_EVENT_SECCOMP event once a syscall happens. |
| * This event is handled here by simply allowing the |
| * process to continue to the actual entry point of |
| * the syscall (using cont_syscall_block()) and then |
| * using the same logic as before. */ |
| resume = RESUME_CONT; |
| } |
| } |
| } |
| t->resume_execution(resume, RESUME_NONBLOCKING, ticks_request); |
| } |
| |
| /** |
| * Step |t| forward until the tracee syscall that disarms the desched |
| * event. If a signal becomes pending in the interim, we stash it. |
| * This allows the caller to deliver the signal after this returns. |
| * (In reality the desched event will already have been disarmed before we |
| * enter this function.) |
| */ |
| static void advance_to_disarm_desched_syscall(RecordTask* t) { |
| int old_sig = 0; |
| |
| LOG(debug) << "desched: DISARMING_DESCHED_EVENT"; |
| /* TODO: send this through main loop. */ |
| /* TODO: mask off signals and avoid this loop. */ |
| do { |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS)) { |
| return; |
| } |
| if (t->status().is_syscall()) { |
| t->apply_syscall_entry_regs(); |
| } |
| /* We can safely ignore TIME_SLICE_SIGNAL while trying to |
| * reach the disarm-desched ioctl: once we reach it, |
| * the desched'd syscall will be "done" and the tracee |
| * will be at a preemption point. In fact, we *want* |
| * to ignore this signal. Syscalls like read() can |
| * have large buffers passed to them, and we have to |
| * copy-out the buffered out data to the user's |
| * buffer. This happens in the interval where we're |
| * reaching the disarm-desched ioctl, so that code is |
| * susceptible to receiving TIME_SLICE_SIGNAL. */ |
| int sig = t->stop_sig(); |
| if (PerfCounters::TIME_SLICE_SIGNAL == sig) { |
| continue; |
| } |
| // We should not receive SYSCALLBUF_DESCHED_SIGNAL since it should already |
| // have been disarmed. However, we observe these being received here when |
| // we arm the desched signal before we restart a blocking syscall, which |
| // completes successfully, then we disarm, then we see a desched signal |
| // here. |
| if (t->session().syscallbuf_desched_sig() == sig) { |
| continue; |
| } |
| if (sig && sig == old_sig) { |
| LOG(debug) << " coalescing pending " << signal_name(sig); |
| continue; |
| } |
| if (sig) { |
| LOG(debug) << " " << signal_name(sig) << " now pending"; |
| t->stash_sig(); |
| } |
| } while (!t->is_disarm_desched_event_syscall()); |
| |
| // Exit the syscall. If this fails, that's fine, we can ignore it. |
| t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); |
| } |
| |
| /** |
| * |t| is at a desched event and some relevant aspect of its state |
| * changed. (For now, any change except the original desched'd syscall |
| * being restarted.) |
| */ |
| void RecordSession::desched_state_changed(RecordTask* t) { |
| LOG(debug) << "desched: IN_SYSCALL"; |
| /* We need to ensure that the syscallbuf code doesn't |
| * try to commit the current record; we've already |
| * recorded that syscall. The following event sets |
| * the abort-commit bit. */ |
| t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, abort_commit), (uint8_t)1); |
| t->record_event(Event::syscallbuf_abort_commit()); |
| |
| advance_to_disarm_desched_syscall(t); |
| |
| t->pop_desched(); |
| |
| /* The tracee has just finished sanity-checking the |
| * aborted record, and won't touch the syscallbuf |
| * during this (aborted) transaction again. So now |
| * is a good time for us to reset the record counter. */ |
| t->delay_syscallbuf_reset_for_desched = false; |
| // Run the syscallbuf exit hook. This ensures we'll be able to reset |
| // the syscallbuf before trying to buffer another syscall. |
| t->write_mem( |
| REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit), |
| (uint8_t)1); |
| } |
| |
| static void syscall_not_restarted(RecordTask* t) { |
| LOG(debug) << " " << t->tid << ": popping abandoned interrupted " << t->ev() |
| << "; pending events:"; |
| if (IS_LOGGING(debug)) { |
| t->log_pending_events(); |
| } |
| t->pop_syscall_interruption(); |
| } |
| |
| /** |
| * "Thaw" a frozen interrupted syscall if |t| is restarting it. |
| * Return true if a syscall is indeed restarted. |
| * |
| * A postcondition of this function is that |t->ev| is no longer a |
| * syscall interruption, whether or not a syscall was restarted. |
| */ |
| static bool maybe_restart_syscall(RecordTask* t) { |
| if (is_restart_syscall_syscall(t->regs().original_syscallno(), t->arch())) { |
| LOG(debug) << " " << t->tid << ": SYS_restart_syscall'ing " << t->ev(); |
| } |
| if (t->is_syscall_restart()) { |
| t->ev().transform(EV_SYSCALL); |
| Registers regs = t->regs(); |
| regs.set_original_syscallno(t->ev().Syscall().regs.original_syscallno()); |
| t->set_regs(regs); |
| t->canonicalize_regs(t->arch()); |
| return true; |
| } |
| if (EV_SYSCALL_INTERRUPTION == t->ev().type()) { |
| syscall_not_restarted(t); |
| } |
| return false; |
| } |
| |
| /** |
| * After a SYS_sigreturn "exit" of task |t| with return value |ret|, |
| * check to see if there's an interrupted syscall that /won't/ be |
| * restarted, and if so, pop it off the pending event stack. |
| */ |
| static void maybe_discard_syscall_interruption(RecordTask* t, intptr_t ret) { |
| int syscallno; |
| |
| if (EV_SYSCALL_INTERRUPTION != t->ev().type()) { |
| /* We currently don't track syscalls interrupted with |
| * ERESTARTSYS or ERESTARTNOHAND, so it's possible for |
| * a sigreturn not to affect the event stack. */ |
| LOG(debug) << " (no interrupted syscall to retire)"; |
| return; |
| } |
| |
| syscallno = t->ev().Syscall().number; |
| if (0 > ret) { |
| syscall_not_restarted(t); |
| } else if (t->arch() == x86 || t->arch() == x86_64) { |
| // On x86, we expect the result register to have been restored to the |
| // syscallno. Since the syscallno is in a different register on other |
| // platforms, this assert does not apply there. |
| ASSERT(t, syscallno == ret) |
| << "Interrupted call was " << t->ev().Syscall().syscall_name() |
| << " and sigreturn claims to be restarting " |
| << syscall_name(ret, t->ev().Syscall().arch()); |
| } |
| } |
| |
| /** |
| * Copy the registers used for syscall arguments (not including |
| * syscall number) from |from| to |to|. |
| */ |
| static void copy_syscall_arg_regs(Registers* to, const Registers& from) { |
| to->set_orig_arg1(from.arg1()); |
| to->set_arg2(from.arg2()); |
| to->set_arg3(from.arg3()); |
| to->set_arg4(from.arg4()); |
| to->set_arg5(from.arg5()); |
| to->set_arg6(from.arg6()); |
| } |
| |
| static void maybe_trigger_emulated_ptrace_syscall_exit_stop(RecordTask* t) { |
| if (t->emulated_ptrace_cont_command == PTRACE_SYSCALL) { |
| t->emulate_ptrace_stop(WaitStatus::for_syscall(t), SYSCALL_EXIT_STOP); |
| } else if (is_ptrace_any_singlestep(t->arch(), t->emulated_ptrace_cont_command)) { |
| // Deliver the singlestep trap now that we've finished executing the |
| // syscall. |
| t->emulate_ptrace_stop(WaitStatus::for_stop_sig(SIGTRAP), SIGNAL_DELIVERY_STOP, nullptr, |
| SI_KERNEL); |
| } |
| } |
| |
| static void save_interrupted_syscall_ret_in_syscallbuf(RecordTask* t, |
| intptr_t retval) { |
| // Record storing the return value in the syscallbuf record, where |
| // we expect to find it during replay. |
| auto child_rec = t->next_syscallbuf_record(); |
| // Also store it there now so that our memory checksums are correct. |
| // It will be overwritten by the tracee's syscallbuf code. |
| t->write_and_record(REMOTE_PTR_FIELD(child_rec, ret), |
| static_cast<int64_t>(retval)); |
| } |
| |
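| // True if |t| is at the exit point of one of the privileged syscalls in |
| // the rr page. Those are issued by rr's own (preload) code, so they must |
| // not trigger an emulated ptrace syscall-exit stop. |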
| static bool is_in_privileged_syscall(RecordTask* t) { |
| auto type = AddressSpace::rr_page_syscall_from_exit_point(t->arch(), t->ip()); |
| return type && type->privileged == AddressSpace::PRIVILEGED; |
| } |
| |
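| /** |
| * Advance the state machine for |t|'s current syscall event (ptrace entry |
| * -> entry -> processing -> exit), recording events and choosing how the |
| * task should next be resumed via |step_state|. |
| */ |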
| void RecordSession::syscall_state_changed(RecordTask* t, |
| StepState* step_state) { |
| switch (t->ev().Syscall().state) { |
| case ENTERING_SYSCALL_PTRACE: |
| debug_exec_state("EXEC_SYSCALL_ENTRY_PTRACE", t); |
| step_state->continue_type = DONT_CONTINUE; |
| last_task_switchable = ALLOW_SWITCH; |
| if (t->emulated_stop_type != NOT_STOPPED) { |
| // Don't go any further. |
| return; |
| } |
| if (t->ev().Syscall().in_sysemu) { |
| // We'll have recorded just the ENTERING_SYSCALL_PTRACE event and |
| // nothing else. Resume with an invalid syscall to ensure no real |
| // syscall runs. |
| t->pop_syscall(); |
| Registers r = t->regs(); |
| Registers orig_regs = r; |
| r.set_original_syscallno(-1); |
| t->set_regs(r); |
| // If this fails because of premature exit, don't mess with the |
| // task anymore. |
| if (t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| ASSERT(t, t->ip() == r.ip()); |
| t->set_regs(orig_regs); |
| maybe_trigger_emulated_ptrace_syscall_exit_stop(t); |
| } |
| return; |
| } |
| last_task_switchable = PREVENT_SWITCH; |
| t->ev().Syscall().regs = t->regs(); |
| t->ev().Syscall().state = ENTERING_SYSCALL; |
| // The syscallno may have been changed by the ptracer |
| t->ev().Syscall().number = t->regs().original_syscallno(); |
| return; |
| |
| case ENTERING_SYSCALL: { |
| debug_exec_state("EXEC_SYSCALL_ENTRY", t); |
| ASSERT(t, !t->emulated_stop_pending); |
| |
| // Flush syscallbuf now so that anything recorded by |
| // rec_prepare_syscall is associated with the syscall event |
| t->maybe_flush_syscallbuf(); |
| |
| last_task_switchable = t->ev().Syscall().switchable = |
| rec_prepare_syscall(t); |
| t->record_event(t->ev(), RecordTask::DONT_FLUSH_SYSCALLBUF, |
| RecordTask::ALLOW_RESET_SYSCALLBUF, |
| &t->ev().Syscall().regs); |
| |
| debug_exec_state("after cont", t); |
| t->ev().Syscall().state = PROCESSING_SYSCALL; |
| |
| if (t->emulated_stop_pending) { |
| step_state->continue_type = DONT_CONTINUE; |
| } else { |
| // Resume the syscall execution in the kernel context. |
| step_state->continue_type = CONTINUE_SYSCALL; |
| } |
| |
| if (t->session().done_initial_exec() && Flags::get().check_cached_mmaps) { |
| t->vm()->verify(t); |
| } |
| |
| if (t->desched_rec() && t->is_in_untraced_syscall() && |
| t->has_stashed_sig()) { |
| // We have a signal to deliver but we're about to (re?)enter an untraced |
| // syscall that may block and the desched event has been disarmed. |
| // Rearm the desched event so if the syscall blocks, it will be |
| // interrupted and we'll have a chance to deliver our signal. |
| LOG(debug) << "Rearming desched event so we'll get a chance to deliver " |
| "stashed signal"; |
| arm_desched_event(t); |
| } |
| |
| if (t->detached_proxy) { |
| // We detached. Record that. |
| t->record_event(Event::exit(), RecordTask::DONT_FLUSH_SYSCALLBUF, |
| RecordTask::DONT_RESET_SYSCALLBUF); |
| t->session().trace_writer().write_task_event( |
| TraceTaskEvent::for_detach(t->tid)); |
| step_state->continue_type = DONT_CONTINUE; |
| } |
| |
| return; |
| } |
| |
| case PROCESSING_SYSCALL: |
| debug_exec_state("EXEC_IN_SYSCALL", t); |
| |
| // Linux kicks tasks out of syscalls before delivering |
| // signals. |
| ASSERT(t, !t->stop_sig()) << "Signal " << signal_name(t->stop_sig()) |
| << " pending while in syscall???"; |
| |
| t->ev().Syscall().state = EXITING_SYSCALL; |
| step_state->continue_type = DONT_CONTINUE; |
| return; |
| |
| case EXITING_SYSCALL: { |
| debug_exec_state("EXEC_SYSCALL_DONE", t); |
| |
| DEBUG_ASSERT(t->stop_sig() == 0); |
| |
| SupportedArch syscall_arch = t->ev().Syscall().arch(); |
| int syscallno = t->ev().Syscall().number; |
| intptr_t retval = t->regs().syscall_result_signed(); |
| |
| if (t->desched_rec()) { |
| // If we enabled the desched event above, disable it. |
| disarm_desched_event(t); |
| // Write syscall return value to the syscallbuf now. This lets replay |
| // get the correct value even though we're aborting the commit. This |
| // value affects register values in the preload code (which must be |
| // correct since register values may escape). |
| save_interrupted_syscall_ret_in_syscallbuf(t, retval); |
| } |
| |
| // sigreturn is a special snowflake, because it |
| // doesn't actually return. Instead, it undoes the |
| // setup for signal delivery, which possibly includes |
| // preparing the tracee for a restart-syscall. So we |
| // take this opportunity to possibly pop an |
| // interrupted-syscall event. |
| if (is_sigreturn(syscallno, syscall_arch)) { |
| if (is_x86ish(t->arch())) { |
| ASSERT(t, t->regs().original_syscallno() == -1); |
| } |
| rec_did_sigreturn(t); |
| t->record_current_event(); |
| t->pop_syscall(); |
| |
| // We've finished processing this signal now. |
| t->pop_signal_handler(); |
| t->invalidate_sigmask(); |
| |
| maybe_discard_syscall_interruption(t, retval); |
| |
| if (EV_SECCOMP_TRAP == t->ev().type()) { |
| LOG(debug) << " exiting seccomp trap"; |
| save_interrupted_syscall_ret_in_syscallbuf(t, retval); |
| seccomp_trap_done(t); |
| } |
| if (EV_DESCHED == t->ev().type()) { |
| LOG(debug) << " exiting desched critical section"; |
| // The signal handler could have modified the apparent syscall |
| // return value. Save that value into the syscallbuf again so |
| // replay will pick it up later. |
| save_interrupted_syscall_ret_in_syscallbuf(t, retval); |
| desched_state_changed(t); |
| } |
| } else { |
| LOG(debug) << " original_syscallno:" << t->regs().original_syscallno() |
| << " (" << syscall_name(syscallno, syscall_arch) |
| << "); return val:" << HEX(t->regs().syscall_result()); |
| |
| /* a syscall_restart ending is equivalent to the |
| * restarted syscall ending */ |
| if (t->ev().Syscall().is_restart) { |
| LOG(debug) << " exiting restarted " |
| << syscall_name(syscallno, syscall_arch); |
| } |
| |
| /* TODO: is there any reason a restart_syscall can't |
| * be interrupted by a signal and itself restarted? */ |
| bool may_restart = !is_restart_syscall_syscall(syscallno, t->arch()) |
| // SYS_pause is either interrupted or |
| // never returns. It doesn't restart. |
| && !is_pause_syscall(syscallno, t->arch()) && |
| t->regs().syscall_may_restart(); |
| /* No need to process the syscall if it's going to be |
| * restarted; that will be done on exit from the |
| * restart_syscall. */ |
| if (!may_restart) { |
| rec_process_syscall(t); |
| if (t->session().done_initial_exec() && |
| Flags::get().check_cached_mmaps) { |
| t->vm()->verify(t); |
| } |
| } else { |
| LOG(debug) << " may restart " |
| << syscall_name(syscallno, syscall_arch) |
| << " (from retval " << HEX(retval) << ")"; |
| |
| rec_prepare_restart_syscall(t); |
| /* If we may restart this syscall, we've most |
| * likely fudged some of the argument |
| * registers with scratch pointers. We don't |
| * want to record those fudged registers, |
| * because scratch doesn't exist in replay. |
| * So cover our tracks here. */ |
| Registers r = t->regs(); |
| copy_syscall_arg_regs(&r, t->ev().Syscall().regs); |
| t->set_regs(r); |
| // We need to track what the return value was on architectures |
| // where the kernel replaces the return value by the new arg1 |
| // on restart. |
| t->ev().Syscall().regs = r; |
| } |
| t->record_current_event(); |
| |
| /* If we're not going to restart this syscall, we're |
| * done with it. But if we are, "freeze" it on the |
| * event stack until the execution point where it |
| * might be restarted. */ |
| if (!may_restart) { |
| t->pop_syscall(); |
| if (EV_DESCHED == t->ev().type()) { |
| LOG(debug) << " exiting desched critical section"; |
| desched_state_changed(t); |
| } |
| } else { |
| t->ev().transform(EV_SYSCALL_INTERRUPTION); |
| t->ev().Syscall().is_restart = true; |
| } |
| |
| t->canonicalize_regs(syscall_arch); |
| |
| if (!may_restart) { |
| if (t->retry_syscall_patching) { |
| LOG(debug) << "Retrying deferred syscall patching"; |
| t->retry_syscall_patching = false; |
| if (t->vm()->monkeypatcher().try_patch_syscall(t, false)) { |
| // Syscall was patched. Emit event and continue execution. |
| auto ev = Event::patch_syscall(); |
| ev.PatchSyscall().patch_after_syscall = true; |
| t->record_event(ev); |
| } |
| } |
| } |
| } |
| |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| |
| if (!is_in_privileged_syscall(t)) { |
| maybe_trigger_emulated_ptrace_syscall_exit_stop(t); |
| } |
| return; |
| } |
| |
| default: |
| FATAL() << "Unknown exec state " << t->ev().Syscall().state; |
| } |
| } |
| |
| /** |
| * Check that the performance counters appear to be working during the |
| * initial exec. If they don't, mark |step_result| as a spawn failure. |
| */ |
| void RecordSession::check_initial_task_syscalls(RecordTask* t, |
| RecordResult* step_result) { |
| if (done_initial_exec()) { |
| return; |
| } |
| |
| if (is_write_syscall(t->ev().Syscall().number, t->arch()) && |
| t->regs().arg1_signed() == -1) { |
| Ticks ticks = t->tick_count(); |
| LOG(debug) << "ticks on entry to dummy write: " << ticks; |
| if (ticks == 0) { |
| step_result->status = RecordSession::STEP_SPAWN_FAILED; |
| step_result->failure_message = string( |
| "rr internal recorder error: Performance counter doesn't seem to " |
| "be working. Are you perhaps running rr in a VM but didn't enable " |
| "perf-counter virtualization?"); |
| } |
| } |
| |
| if (is_exit_group_syscall(t->ev().Syscall().number, t->arch())) { |
| step_result->status = RecordSession::STEP_SPAWN_FAILED; |
| step_result->failure_message = read_spawned_task_error(); |
| } |
| } |
| |
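| /** |
| * When a non-leader thread execs, the kernel switches its tid to the |
| * thread-group leader's. Find the exec'ing task via PTRACE_GETEVENTMSG |
| * (which holds its old tid), record the tid change as a clone-plus-exit |
| * in the trace, and update our bookkeeping to match. |
| */ |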
| RecordTask* RecordSession::revive_task_for_exec(pid_t rec_tid) { |
| unsigned long msg = 0; |
| int ret = |
| ptrace(_ptrace_request(PTRACE_GETEVENTMSG), rec_tid, nullptr, &msg); |
| if (ret < 0) { |
| FATAL() << "Can't get old tid for execve (leader=" << rec_tid << ")"; |
| } |
| RecordTask* t = find_task(msg); |
| if (!t) { |
| FATAL() << "Can't find old task for execve"; |
| } |
| ASSERT(t, rec_tid == t->tgid()); |
| pid_t own_namespace_tid = t->thread_group()->tgid_own_namespace; |
| |
| LOG(debug) << "Changing task tid from " << t->tid << " to " << rec_tid; |
| |
| // Pretend the old task cloned a new task with the right tid, and then exited |
| trace_writer().write_task_event(TraceTaskEvent::for_clone( |
| rec_tid, t->tid, own_namespace_tid, |
| CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | |
| CLONE_SYSVSEM)); |
| trace_writer().write_task_event( |
| TraceTaskEvent::for_exit(t->tid, WaitStatus::for_exit_code(0))); |
| |
| // Account for tid change |
| task_map.erase(t->tid); |
| task_map.insert(make_pair(rec_tid, t)); |
| // Update the serial as if this task was really created by cloning the old |
| // task. |
| t->set_tid_and_update_serial(rec_tid, own_namespace_tid); |
| |
| return t; |
| } |
| |
| /** |
| * Take a NativeArch::siginfo_t& here instead of siginfo_t because different |
| * versions of system headers have inconsistent field naming. |
| */ |
| template <typename Arch> |
| static void setup_sigframe_siginfo_arch(RecordTask* t, |
| const siginfo_t& siginfo) { |
| remote_ptr<typename Arch::siginfo_t> dest; |
| switch (Arch::arch()) { |
| case x86: { |
| auto p = t->regs().sp().cast<typename Arch::unsigned_word>() + 2; |
| dest = t->read_mem(p); |
| break; |
| } |
| case x86_64: |
| dest = t->regs().si(); |
| break; |
| case aarch64: |
| dest = t->regs().x1(); |
| break; |
| default: |
| DEBUG_ASSERT(0 && "Unknown architecture"); |
| break; |
| } |
| typename Arch::siginfo_t si = t->read_mem(dest); |
| set_arch_siginfo(siginfo, t->arch(), &si, sizeof(si)); |
| t->write_mem(dest, si); |
| } |
| |
| static void setup_sigframe_siginfo(RecordTask* t, const siginfo_t& siginfo) { |
| RR_ARCH_FUNCTION(setup_sigframe_siginfo_arch, t->arch(), t, siginfo); |
| } |
| |
| /** |
| * Get t into a state where resume_execution with a signal will actually work. |
| */ |
| static bool preinject_signal(RecordTask* t) { |
| int sig = t->ev().Signal().siginfo.si_signo; |
| |
| /* Signal injection is tricky. Per the ptrace(2) man page, injecting |
| * a signal while the task is not in a signal-stop is not guaranteed to work |
| * (and indeed, we see that the kernel sometimes ignores such signals). |
| * But some signals must be delayed until after the signal-stop that notified |
| * us of them. |
| * So, first we check if we're in a signal-stop that we can use to inject |
| * a signal. Some (all?) SIGTRAP stops are *not* usable for signal injection. |
| */ |
| if (t->stop_sig() && t->stop_sig() != SIGTRAP) { |
| LOG(debug) << " in signal-stop for " << signal_name(t->stop_sig()); |
| } else { |
| /* We're not in a usable signal-stop. Force a signal-stop by sending |
| * a new signal with tgkill (as the ptrace(2) man page recommends). |
| */ |
| LOG(debug) << " maybe not in signal-stop (status " << t->status() |
| << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)"; |
| if (!t->move_to_signal_stop()) { |
| /* We raced with an exit (e.g. due to a pending SIGKILL). */ |
| return false; |
| } |
| |
| ASSERT(t, t->stop_sig() == t->session().syscallbuf_desched_sig()) |
| << "Expected SYSCALLBUF_DESCHED_SIGNAL, got " << t->status(); |
| /* We're now in a signal-stop */ |
| } |
| |
| /* Now that we're in a signal-stop, we can inject our signal and advance |
| * to the signal handler with one single-step. |
| */ |
| LOG(debug) << " injecting signal " << signal_name(sig); |
| t->set_siginfo(t->ev().Signal().siginfo); |
| return true; |
| } |
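| |
| // For illustration only: stripped of rr's bookkeeping, preinject_signal() |
| // performs the standard ptrace(2) recipe for reliable signal injection. |
| // A sketch with error handling omitted; kThrowawaySig is a placeholder |
| // for rr's desched signal, not an rr name: |
| // |
| //   void inject_from_ptrace_stop(pid_t tgid, pid_t tid, int sig) { |
| //     // Queue a signal directed at this specific thread... |
| //     syscall(SYS_tgkill, tgid, tid, kThrowawaySig); |
| //     // ...and run the thread until it reports the signal-stop. |
| //     ptrace(PTRACE_CONT, tid, nullptr, 0); |
| //     int status; |
| //     waitpid(tid, &status, __WALL); |
| //     // From a signal-stop, the data argument of the restart request is |
| //     // delivered in place of the stopping signal, so the throwaway |
| //     // signal is discarded and |sig| is injected. |
| //     ptrace(PTRACE_SINGLESTEP, tid, nullptr, sig); |
| //   } |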
| |
| /** |
| * Returns true if the signal should be delivered. |
| * Returns false if this signal should not be delivered because another signal |
| * occurred during delivery or there was a premature exit. |
| * Must call t->stashed_signal_processed() once we're ready to unmask signals. |
| */ |
| static bool inject_handled_signal(RecordTask* t) { |
| if (!preinject_signal(t)) { |
| // Task prematurely exited. |
| return false; |
| } |
| // If there aren't any more stashed signals, it's OK to stop blocking all |
| // signals. |
| t->stashed_signal_processed(); |
| |
| int sig = t->ev().Signal().siginfo.si_signo; |
| do { |
| // We are ready to inject our signal. |
| // XXX we assume the kernel won't respond by notifying us of a different |
| // signal. We don't want to do this with signals blocked because that will |
| // save a bogus signal mask in the signal frame. |
| if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS, sig)) { |
| return false; |
| } |
| // Signal injection can change the sigmask due to sa_mask effects, lack of |
| // SA_NODEFER, and signal frame construction triggering a synchronous |
| // SIGSEGV. |
| t->invalidate_sigmask(); |
| // Repeat injection if we got a desched signal. We observe in Linux 4.14.12 |
| // that we get SYSCALLBUF_DESCHED_SIGNAL here once in a while. |
| } while (t->stop_sig() == t->session().syscallbuf_desched_sig()); |
| |
| if (t->stop_sig() == SIGSEGV) { |
| // Constructing the signal handler frame must have failed. Stash the signal |
| // to deliver it later. |
| t->stash_sig(); |
| if (sig == SIGSEGV) { |
| // The kernel will kill the process after this. Make sure we know to treat |
| // it as fatal when we inject it. Also disable the signal handler to match |
| // what the kernel does. |
| t->did_set_sig_handler_default(SIGSEGV); |
| t->thread_group()->received_sigframe_SIGSEGV = true; |
| } |
| return false; |
| } |
| |
| // We stepped into a user signal handler. |
| ASSERT(t, t->stop_sig() == SIGTRAP) |
| << "Got unexpected status " << t->status() << " trying to deliver " << sig |
| << " siginfo is " << t->get_siginfo(); |
| ASSERT(t, t->get_signal_user_handler(sig) == t->ip()) |
| << "Expected handler IP " << t->get_signal_user_handler(sig) << ", got " |
| << t->ip() |
| << "; actual signal mask=" << HEX(t->read_sigmask_from_process()) |
| << " (cached " << HEX(t->get_sigmask()) << ")"; |
| |
| if (t->signal_handler_takes_siginfo(sig)) { |
| // The kernel copied siginfo into userspace so it can pass a pointer to |
| // the signal handler. Replace the contents of that siginfo with |
| // the exact data we want to deliver. (We called Task::set_siginfo |
| // above to set that data, but the kernel sanitizes the passed-in data |
| // which wipes out certain fields; e.g. we can't set SI_KERNEL in si_code.) |
| setup_sigframe_siginfo(t, t->ev().Signal().siginfo); |
| } |
| |
| // The kernel clears the FPU state on entering the signal handler, but prior |
| // to 4.7 or thereabouts ptrace can still return stale values. Fix that here. |
| // This also sets bit 0 of the XINUSE register to 1 to avoid issues where it |
| // gets set to 1 nondeterministically. |
| ExtraRegisters e = t->extra_regs(); |
| e.reset(); |
| t->set_extra_regs(e); |
| |
| return true; |
| } |
| |
| /** |
| * |t| is being delivered a signal, and its state changed. |
| * Must call t->stashed_signal_processed() once we're ready to unmask signals. |
| */ |
| bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) { |
| int sig = t->ev().Signal().siginfo.si_signo; |
| |
| switch (t->ev().type()) { |
| case EV_SIGNAL: { |
| // This event is used by the replayer to advance to |
| // the point of signal delivery. |
| if (t->arch() == aarch64 && t->status().is_syscall() && |
| t->prev_ev() && t->prev_ev()->type() == EV_SYSCALL_INTERRUPTION) { |
| // On aarch64, replay expects the signal to be delivered before |
| // the syscall instruction, but during recording the current pc |
| // is after the syscall instruction, with arg1 clobbered by the |
| // return value of the aborted syscall. |
| auto regs = t->regs(); |
| auto &syscall_regs = t->prev_ev()->Syscall().regs; |
| regs.set_ip(syscall_regs.ip().decrement_by_syscall_insn_length(t->arch())); |
| regs.set_arg1(syscall_regs.orig_arg1()); |
| t->record_event(t->ev(), RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::ALLOW_RESET_SYSCALLBUF, ®s); |
| } else { |
| t->record_current_event(); |
| } |
| t->ev().transform(EV_SIGNAL_DELIVERY); |
| ssize_t sigframe_size = 0; |
| |
| bool has_handler = t->signal_has_user_handler(sig); |
| if (has_handler) { |
| LOG(debug) << " " << t->tid << ": " << signal_name(sig) |
| << " has user handler"; |
| |
| if (!inject_handled_signal(t)) { |
| // Signal delivery isn't happening. Prepare to process the new |
| // signal that aborted signal delivery. |
| t->signal_delivered(sig); |
| t->pop_signal_delivery(); |
| step_state->continue_type = DONT_CONTINUE; |
| last_task_switchable = PREVENT_SWITCH; |
| break; |
| } |
| |
| if (is_x86ish(t->arch())) { |
| // It's somewhat difficult engineering-wise to |
| // compute the sigframe size at compile time, |
| // and it can vary across kernel versions and CPU |
| // microarchitectures. So this size is an overestimate |
| // of the real size(s). |
| // |
| // If this size becomes too small in the |
| // future, and unit tests that use sighandlers |
| // are run with checksumming enabled, then |
| // they can catch errors here. |
| sigframe_size = 1152 /* Overestimate of kernel sigframe */ + |
| 128 /* Redzone */ + |
| /* this returns 512 when XSAVE unsupported */ |
| xsave_area_size(); |
| } else if (t->arch() == aarch64) { |
| sigframe_size = sizeof(ARM64Arch::rt_sigframe) + |
| sizeof(ARM64Arch::user_fpsimd_state); |
| } else { |
| DEBUG_ASSERT(0 && "Add sigframe size for your architecture here"); |
| } |
| |
| t->ev().transform(EV_SIGNAL_HANDLER); |
| t->signal_delivered(sig); |
| // We already continued! Don't continue now, and allow switching. |
| step_state->continue_type = DONT_CONTINUE; |
| last_task_switchable = ALLOW_SWITCH; |
| } else { |
| t->stashed_signal_processed(); |
| LOG(debug) << " " << t->tid << ": no user handler for " |
| << signal_name(sig); |
| // Don't do another task continue. We want to deliver the signal |
| // as the next thing that the task does. |
| step_state->continue_type = DONT_CONTINUE; |
| // If we didn't set up the sighandler frame, we need |
| // to ensure that this tracee is scheduled next so |
| // that we can deliver the signal normally. We have |
| // to do that because setting up the sighandler frame |
| // is synchronous, but delivery otherwise is async. |
| // But right after this, we may have to process some |
| // syscallbuf state, so we can't let the tracee race |
| // with us. |
| last_task_switchable = PREVENT_SWITCH; |
| } |
| |
| // We record this data even if sigframe_size is zero to simplify replay. |
| // Stop recording data if we run off the end of a writable mapping. |
| // Our sigframe size is conservative so we need to do this. |
| t->record_remote_writable(t->regs().sp(), sigframe_size); |
| |
| // This event is used by the replayer to set up the signal handler frame. |
| // But if we don't have a handler, we don't want to record the event |
| // until we deal with the EV_SIGNAL_DELIVERY. |
| if (has_handler) { |
| t->record_current_event(); |
| } |
| break; |
| } |
| |
| case EV_SIGNAL_DELIVERY: { |
| // A SIGSTOP requires us to allow switching to another task. |
| // So does a fatal, core-dumping signal, since we need to allow other |
| // tasks to proceed to their exit events. |
| bool is_deterministic = t->ev().Signal().deterministic == DETERMINISTIC_SIG; |
| // Signals that would normally be fatal are just ignored for init processes, |
| // unless they're deterministic. |
| bool is_fatal = t->ev().Signal().disposition == DISPOSITION_FATAL && |
| (!t->is_container_init() || is_deterministic); |
| Switchable can_switch = ((is_fatal && is_coredumping_signal(sig)) || sig == SIGSTOP) ? |
| ALLOW_SWITCH : PREVENT_SWITCH; |
| |
| // We didn't record this event above, so do that now. |
| // NB: If there is no handler, and we interrupted a syscall, and there are |
| // no more actionable signals, the kernel sets us up for a syscall |
| // restart. But it does that *after* the ptrace trap. To replay this |
| // correctly we need to fake those changes here. But we don't do this |
| // if we're going to switch away at the ptrace trap, and for the moment, |
| // 'can_switch' is actually 'will_switch'. |
| // This is essentially copied from do_signal in arch/x86/kernel/signal.c |
| bool has_other_signals = t->has_any_actionable_signal(); |
| auto r = t->regs(); |
| if (!is_fatal) { |
| Event *prev_ev = t->prev_ev(); |
| if (can_switch == PREVENT_SWITCH && !has_other_signals && prev_ev && |
| EV_SYSCALL_INTERRUPTION == prev_ev->type()) { |
| switch (prev_ev->Syscall().regs.syscall_result_signed()) { |
| case -ERESTARTNOHAND: |
| case -ERESTARTSYS: |
| case -ERESTARTNOINTR: |
| r.set_syscallno(r.original_syscallno()); |
| break; |
| case -ERESTART_RESTARTBLOCK: |
| r.set_syscallno(syscall_number_for_restart_syscall(t->arch())); |
| break; |
| } |
| // On aarch64, the kernel modifies the registers before the signal stop, |
| // so we should not decrement the pc again or we'll rerun the instruction |
| // before the syscall. |
| // [1] https://github.com/torvalds/linux/blob/caffb99b6929f41a69edbb5aef3a359bf45f3315/arch/arm64/kernel/signal.c#L855-L862 |
| if (t->arch() != aarch64) |
| r.set_ip(r.ip().decrement_by_syscall_insn_length(t->arch())); |
| // Now that we've mucked with the registers, we can't switch tasks. That |
| // could allow more signals to be generated, breaking our assumption |
| // that we are the last signal. |
| } else { |
| // But if we didn't touch the registers, switching here is ok. |
| can_switch = ALLOW_SWITCH; |
| } |
| } |
| |
| t->record_event(t->ev(), RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::ALLOW_RESET_SYSCALLBUF, &r); |
| // Don't actually set_regs(r), the kernel does these modifications. |
| |
| if (t->is_container_init() && is_fatal) { |
| // Nondeterministic signals were already filtered out. |
| ASSERT(t, is_deterministic); |
| // Usually, the kernel removes the killable-protection from an init process |
| // when a deterministic fatal signal is delivered, but (due to what is |
| // arguably a bug) when a ptracer is attached, this does not happen. |
| // If we try to inject it here, the kernel will just ignore it, |
| // and we'll go around again. As a hack, we detach here, in the |
| // expectation that the deterministic instruction will run again and |
| // actually kill the task now that it isn't under ptrace control anymore. |
| t->destroy_buffers(nullptr, nullptr); |
| WaitStatus exit_status = WaitStatus::for_fatal_sig(sig); |
| record_exit_trace_event(t, exit_status); |
| // Allow writing child_tid now because otherwise the write will race |
| t->record_exit_event(RecordTask::WRITE_CHILD_TID); |
| // On a real affected kernel, we probably would have never gotten here, |
| // since the signal we would be seeing was not deterministic, but let's |
| // be conservative and still try to emulate the ptrace stop. |
| t->do_ptrace_exit_stop(exit_status); |
| t->did_kill(); |
| t->detach(); |
| // Not really, but since we detached we're never going to see that event |
| // anyway, so just pretend we're there already. |
| t->did_reach_zombie(); |
| return true; |
| } |
| |
| // Only inject fatal signals. Non-fatal signals with signal handlers |
| // were taken care of above; for non-fatal signals without signal |
| // handlers, there is no need to deliver the signal at all. In fact, |
| // there is really no way to inject a non-fatal, non-handled signal |
| // without letting the task execute at least one instruction, which |
| // we don't want to do here. |
| bool inject_signal = is_fatal && sig != get_continue_through_sig(); |
| if (inject_signal) { |
| preinject_signal(t); |
| t->resume_execution(RESUME_CONT, RESUME_NONBLOCKING, RESUME_NO_TICKS, |
| sig); |
| } |
| |
| t->signal_delivered(sig); |
| if (!inject_signal || !is_coredumping_signal(sig)) { |
| /* Fatal signals may core-dump, so we don't consider the signal |
| * delivery complete until we've actually managed to advance past that |
| */ |
| t->pop_signal_delivery(); |
| } |
| |
| // Mark each task in this address space as expecting a ptrace exit |
| // to avoid causing any ptrace_exit races. |
| if (is_fatal && is_coredumping_signal(sig)) { |
| for (Task *ot : t->vm()->task_set()) { |
| if (t != ot) { |
| if (t->tgid() == ot->tgid() || coredumping_signal_takes_down_entire_vm()) { |
| ((RecordTask *)ot)->waiting_for_ptrace_exit = true; |
| } |
| } |
| } |
| } |
| |
| last_task_switchable = can_switch; |
| step_state->continue_type = DONT_CONTINUE; |
| break; |
| } |
| |
| default: |
| FATAL() << "Unhandled signal state " << t->ev().type(); |
| break; |
| } |
| return false; |
| } |
| |
| bool RecordSession::handle_signal_event(RecordTask* t, StepState* step_state) { |
| int sig = t->stop_sig(); |
| if (!sig) { |
| return false; |
| } |
| if (!done_initial_exec()) { |
| // If the initial tracee isn't prepared to handle |
| // signals yet, then us ignoring the ptrace |
| // notification here will have the side effect of |
| // declining to deliver the signal. |
| // |
| // This doesn't really occur in practice, only in |
| // tests that force a degenerately low time slice. |
| LOG(warn) << "Dropping " << signal_name(sig) |
| << " because it can't be delivered yet"; |
| // These signals might have effects on the sigmask. |
| t->invalidate_sigmask(); |
| // No events to be recorded, so no syscallbuf updates |
| // needed. |
| return true; |
| } |
| |
| if (sig == SIGTRAP && handle_syscallbuf_breakpoint(t)) { |
| return true; |
| } |
| |
| SignalDeterministic deterministic = is_deterministic_signal(t); |
| // The kernel might have forcibly unblocked the signal. Check whether it |
| // was blocked now, before we update our cached sigmask. |
| SignalBlocked signal_was_blocked = |
| t->is_sig_blocked(sig) ? SIG_BLOCKED : SIG_UNBLOCKED; |
| if (deterministic || sig == t->session().syscallbuf_desched_sig()) { |
| // Don't stash these signals; deliver them immediately. |
| // We don't want them to be reordered around other signals. |
| // invalidate_sigmask() must not be called before we reach handle_signal! |
| siginfo_t siginfo = t->get_siginfo(); |
| switch (handle_signal(t, &siginfo, deterministic, signal_was_blocked)) { |
| case SIGNAL_PTRACE_STOP: |
| // Emulated ptrace-stop. Don't run the task again yet. |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| return true; |
| case DEFER_SIGNAL: |
| ASSERT(t, false) << "Can't defer deterministic or internal signal " |
| << siginfo << " at ip " << t->ip(); |
| break; |
| case SIGNAL_HANDLED: |
| if (t->ptrace_event() == PTRACE_EVENT_SECCOMP) { |
| // `handle_desched_event` detected a spurious desched followed |
| // by a SECCOMP event, which it left pending. Handle that SECCOMP |
| // event now. |
| bool dummy_did_enter_syscall; |
| handle_ptrace_event(&t, step_state, nullptr, |
| &dummy_did_enter_syscall); |
| ASSERT(t, !dummy_did_enter_syscall); |
| } |
| if (t->ptrace_event() == PTRACE_EVENT_EXIT) { |
| // Tracee was nuked (probably SIGKILL) during desched processing. |
| return true; |
| } |
| break; |
| } |
| return false; |
| } |
| // Conservatively invalidate the sigmask in case just accepting a signal has |
| // sigmask effects. |
| t->invalidate_sigmask(); |
| if (sig == PerfCounters::TIME_SLICE_SIGNAL) { |
| if (t->next_pmc_interrupt_is_for_user) { |
| auto vpmc = |
| VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(t); |
| ASSERT(t, vpmc); |
| |
| // Synthesize the requested signal. |
| vpmc->synthesize_signal(t); |
| |
| t->next_pmc_interrupt_is_for_user = false; |
| return true; |
| } |
| |
| auto& si = t->get_siginfo(); |
| /* This implementation will of course fall over if rr tries to |
| * record itself. |
| * |
| * NB: we can't check that the tick count is >= the programmed |
| * target, because this signal may have become pending before |
| * we reset the HPC counters. There may be a way to handle that |
| * more elegantly, but that bridge will be crossed in due time. |
| * |
| * We can't check that the fd matches t->hpc.ticks_fd() because this |
| * signal could have been queued quite a long time ago and the PerfCounters |
| * might have been stopped (and restarted!), perhaps even more than once, |
| * since the signal was queued, possibly changing its fd. We could check |
| * against all fds the PerfCounters have ever used, but that seems like |
| * overkill. |
| */ |
| ASSERT(t, |
| PerfCounters::TIME_SLICE_SIGNAL == si.si_signo && |
| (RecordTask::SYNTHETIC_TIME_SLICE_SI_CODE == si.si_code || |
| POLL_IN == si.si_code)) |
| << "Tracee is using SIGSTKFLT??? (code=" << si.si_code |
| << ", fd=" << si.si_fd << ")"; |
| } |
| t->stash_sig(); |
| return true; |
| } |
| |
| template <typename Arch> |
| static bool is_ptrace_any_sysemu_arch(int command) { |
| return command >= 0 && |
| (command == Arch::PTRACE_SYSEMU || |
| command == Arch::PTRACE_SYSEMU_SINGLESTEP); |
| } |
| |
| static bool is_ptrace_any_sysemu(SupportedArch arch, int command) { |
| RR_ARCH_FUNCTION(is_ptrace_any_sysemu_arch, arch, command); |
| } |
| |
| bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state, |
| RecordResult* step_result, |
| SupportedArch syscall_arch) { |
| if (const RecordTask::StashedSignal* sig = t->stashed_sig_not_synthetic_SIGCHLD()) { |
| // The only four cases where we allow a stashed signal to be pending on |
| // syscall entry are: |
| // -- the signal is a ptrace-related signal, in which case if it's generated |
| // during a blocking syscall, it does not interrupt the syscall |
| // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and |
| // lets us dispatch signals afterward |
| // -- when we're entering a blocking untraced syscall. If it really blocks, |
| // we'll get the desched-signal notification and dispatch our stashed |
| // signal. |
| // -- when we're doing a privileged syscall that's internal to the preload |
| // logic |
| // We do not generally want to have stashed signals pending when we enter |
| // a syscall, because that will execute with a hacked signal mask |
| // (see RecordTask::will_resume_execution) which could make things go wrong. |
| ASSERT(t, |
| t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall( |
| t->regs().original_syscallno(), t->arch()) || |
| t->ip() == |
| t->vm() |
| ->privileged_traced_syscall_ip() |
| .increment_by_syscall_insn_length(t->arch())) |
| << "Stashed signal pending on syscall entry when it shouldn't be: " |
| << sig->siginfo << "; regs=" << t->regs() |
| << "; last_execution_resume=" << t->last_execution_resume() |
| << "; sig ip=" << sig->ip; |
| } |
| |
| // We just entered a syscall. |
| if (!maybe_restart_syscall(t)) { |
| if (syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN && |
| t->seccomp_bpf_enabled) { |
| // We received a PTRACE_SYSCALL notification before the seccomp |
| // notification. Ignore it and continue to the seccomp notification. |
| syscall_seccomp_ordering_ = PTRACE_SYSCALL_BEFORE_SECCOMP; |
| step_state->continue_type = CONTINUE; |
| return true; |
| } |
| |
| // Don't ever patch a sigreturn syscall. These can't go through the syscallbuf. |
| if (!is_sigreturn(t->regs().original_syscallno(), t->arch())) { |
| if (t->vm()->monkeypatcher().try_patch_syscall(t)) { |
| // Syscall was patched. Emit event and continue execution. |
| t->record_event(Event::patch_syscall()); |
| return true; |
| } |
| if (!t->is_stopped()) { |
| // The task exited while we were trying to patch it. |
| // Make sure that this exit event gets processed. |
| step_state->continue_type = DONT_CONTINUE; |
| return false; |
| } |
| } |
| |
| t->push_event(SyscallEvent(t->regs().original_syscallno(), syscall_arch)); |
| } |
| |
| check_initial_task_syscalls(t, step_result); |
| note_entering_syscall(t); |
| if ((t->emulated_ptrace_cont_command == PTRACE_SYSCALL || |
| is_ptrace_any_sysemu(t->arch(), |
| t->emulated_ptrace_cont_command)) && |
| !is_in_privileged_syscall(t)) { |
| t->ev().Syscall().state = ENTERING_SYSCALL_PTRACE; |
| t->emulate_ptrace_stop(WaitStatus::for_syscall(t), SYSCALL_ENTRY_STOP); |
| t->record_current_event(); |
| |
| t->ev().Syscall().in_sysemu = is_ptrace_any_sysemu(t->arch(), |
| t->emulated_ptrace_cont_command); |
| } |
| return true; |
| } |
| |
| /** |
| * The execution of |t| has just been resumed, and it most likely has |
| * a new event that needs to be processed. Prepare that new event. |
| */ |
| void RecordSession::runnable_state_changed(RecordTask* t, StepState* step_state, |
| RecordResult* step_result, |
| bool can_consume_wait_status) { |
| switch (t->ev().type()) { |
| case EV_NOOP: |
| t->pop_noop(); |
| return; |
| case EV_INSTRUCTION_TRAP: |
| t->record_current_event(); |
| t->pop_event(t->ev().type()); |
| return; |
| case EV_SENTINEL: |
| case EV_SIGNAL_HANDLER: |
| case EV_SYSCALL_INTERRUPTION: { |
| if (!can_consume_wait_status) { |
| return; |
| } |
| |
| SupportedArch syscall_arch = t->detect_syscall_arch(); |
| t->canonicalize_regs(syscall_arch); |
| t->apply_syscall_entry_regs(); |
| process_syscall_entry(t, step_state, step_result, syscall_arch); |
| return; |
| } |
| |
| default: |
| return; |
| } |
| } |
| |
| bool RecordSession::prepare_to_inject_signal(RecordTask* t, |
| StepState* step_state) { |
| if (!done_initial_exec() || step_state->continue_type != CONTINUE) { |
| return false; |
| } |
| |
| union { |
| NativeArch::siginfo_t native_api; |
| siginfo_t linux_api; |
| } si; |
| const RecordTask::StashedSignal* sig; |
| |
| while (true) { |
| sig = t->peek_stashed_sig_to_deliver(); |
| if (!sig) { |
| return false; |
| } |
| si.linux_api = sig->siginfo; |
| if (si.linux_api.si_signo == get_ignore_sig()) { |
| LOG(debug) << "Declining to deliver " |
| << signal_name(si.linux_api.si_signo) << " by user request"; |
| t->pop_stash_sig(sig); |
| t->stashed_signal_processed(); |
| } else { |
| break; |
| } |
| } |
| |
| if (sig->deterministic == DETERMINISTIC_SIG && |
| sig->siginfo.si_signo == SIGSYS && |
| t->is_sig_blocked(sig->siginfo.si_signo) == SIG_BLOCKED) { |
| // Our synthesized deterministic SIGSYS (seccomp trap) needs to match the |
| // kernel behavior of unblocking the signal and resetting disposition to |
| // default. |
| (void)t->unblock_signal(SIGSYS); |
| t->set_sig_handler_default(SIGSYS); |
| } |
| switch (handle_signal(t, &si.linux_api, sig->deterministic, SIG_UNBLOCKED)) { |
| case SIGNAL_PTRACE_STOP: |
| // Emulated ptrace-stop. Don't run the task again yet. |
| last_task_switchable = ALLOW_SWITCH; |
| LOG(debug) << signal_name(si.linux_api.si_signo) |
| << ", emulating ptrace stop"; |
| break; |
| case DEFER_SIGNAL: |
| LOG(debug) << signal_name(si.linux_api.si_signo) << " deferred"; |
| // Leave signal on the stack and continue task execution. We'll try again |
| // later. |
| return false; |
| case SIGNAL_HANDLED: |
| LOG(debug) << signal_name(si.linux_api.si_signo) << " handled"; |
| // Signal is now a pending event on |t|'s event stack |
| |
| if (t->ev().type() == EV_SCHED) { |
| if (t->maybe_in_spinlock()) { |
| LOG(debug) << "Detected possible spinlock, forcing one round-robin"; |
| scheduler().schedule_one_round_robin(t); |
| } |
| // Allow switching after a SCHED. We'll flush the SCHED if and only |
| // if we really do a switch. |
| last_task_switchable = ALLOW_SWITCH; |
| } |
| break; |
| } |
| step_state->continue_type = DONT_CONTINUE; |
| t->pop_stash_sig(sig); |
| if (t->ev().type() != EV_SIGNAL) { |
| t->stashed_signal_processed(); |
| } |
| return true; |
| } |
| |
| static void inject_ld_helper_library(vector<string>& env, |
| string env_var, |
| string value) { |
| // Our preload lib should come first if possible, because that will speed up |
| // the loading of the other libraries; it's also a good idea to put our audit |
| // library at the head of the list, since there are only sixteen possible link |
| // namespaces on glibc and each audit library uses up one. |
| // |
| // We supply a placeholder which is then mutated to the correct filename in |
| // Monkeypatcher::patch_after_exec. |
| auto env_assignment = env_var + "="; |
| auto it = env.begin(); |
| for (; it != env.end(); ++it) { |
| if (it->find(env_assignment) != 0) { |
| continue; |
| } |
| // Honor old preloads too. This may cause |
| // problems, but only in those libs, and |
| // that's the user's problem. |
| value += ":"; |
| value += it->substr(it->find("=") + 1); |
| break; |
| } |
| value = env_assignment + value; |
| if (it == env.end()) { |
| env.push_back(value); |
| } else { |
| *it = value; |
| } |
| } |
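| |
| // For example (illustrative values): if env contains "LD_PRELOAD=libfoo.so", |
| // then |
| //   inject_ld_helper_library(env, "LD_PRELOAD", "librrpreload.so"); |
| // rewrites that entry to "LD_PRELOAD=librrpreload.so:libfoo.so"; if no |
| // LD_PRELOAD entry exists, "LD_PRELOAD=librrpreload.so" is appended. |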
| |
| void strip_outer_ld_preload(vector<string>& env) { |
| auto env_assignment = "LD_PRELOAD="; |
| auto it = env.begin(); |
| for (; it != env.end(); ++it) { |
| if (it->find(env_assignment) != 0) { |
| continue; |
| } |
| size_t colon_pos = it->find(":"); |
| if (colon_pos != string::npos) { |
| // If the preload library is loaded at all, it must be first |
| size_t preload_pos = it->find("librrpreload"); |
| if (preload_pos < colon_pos) { |
| string new_ld_preload = it->substr(++colon_pos); |
| *it = env_assignment + new_ld_preload; |
| return; |
| } else { |
| DEBUG_ASSERT(preload_pos == string::npos); |
| } |
| } |
| } |
| } |
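| |
| // For example (illustrative values): an entry |
| // "LD_PRELOAD=/outer/librrpreload.so:libfoo.so" becomes |
| // "LD_PRELOAD=libfoo.so", while an LD_PRELOAD value that doesn't start |
| // with our preload library is left untouched. |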
| |
| static const MemoryRange asan_shadow(remote_ptr<void>((uintptr_t)0x00007fff7000LL), |
| remote_ptr<void>((uintptr_t)0x10007fff8000LL)); |
| static const MemoryRange asan_allocator_reserved(remote_ptr<void>((uintptr_t)0x600000000000LL), |
| remote_ptr<void>((uintptr_t)0x640000002000LL)); |
| |
| // See https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp |
| static const MemoryRange tsan_shadow(remote_ptr<void>((uintptr_t)0x008000000000LL), |
| remote_ptr<void>((uintptr_t)0x550000000000LL)); |
| // The memory area 0x7b0000000000-0x7c0000002000 is reserved for TSAN's |
| // custom heap allocator --- applications end up using it, but *we* can't use |
| // it. |
| static const MemoryRange tsan_exclude(remote_ptr<void>((uintptr_t)0x568000000000LL), |
| remote_ptr<void>((uintptr_t)0x7e8000000000LL)); |
| // It's only 1TB because tsan can't fit more |
| static const MemoryRange tsan_fixed_global_exclusion_range(remote_ptr<void>((uintptr_t)0x7e8000000000LL), |
| remote_ptr<void>((uintptr_t)0x7f8000000000LL)); |
| |
| struct ExeInfo { |
| ExeInfo() : arch(NativeArch::arch()) {} |
| SupportedArch arch; |
| // Empty if anything fails |
| string sanitizer_path; |
| vector<MemoryRange> sanitizer_exclude_memory_ranges; |
| // If non-empty, use this as the global exclusion range. |
| MemoryRange fixed_global_exclusion_range; |
| |
| void setup_asan_memory_ranges() { |
| if (!check_sanitizer_arch()) { |
| return; |
| } |
| sanitizer_exclude_memory_ranges.push_back(asan_shadow); |
| sanitizer_exclude_memory_ranges.push_back(asan_allocator_reserved); |
| } |
| void setup_tsan_memory_ranges() { |
| if (!check_sanitizer_arch()) { |
| return; |
| } |
| sanitizer_exclude_memory_ranges.push_back(tsan_shadow); |
| sanitizer_exclude_memory_ranges.push_back(tsan_exclude); |
| fixed_global_exclusion_range = tsan_fixed_global_exclusion_range; |
| } |
| private: |
| bool check_sanitizer_arch() { |
| switch (arch) { |
| case x86_64: |
| return true; |
| default: |
| // We don't know this sanitizer's memory layout on this architecture, so |
| // exclude all ranges, which disables mmap randomization when chaos mode |
| // is active. |
| sanitizer_exclude_memory_ranges.push_back(MemoryRange::all()); |
| return false; |
| } |
| } |
| }; |
| |
| static ExeInfo read_exe_info(const string& exe_file) { |
| ExeInfo ret; |
| ScopedFd fd(exe_file.c_str(), O_RDONLY); |
| if (!fd.is_open()) { |
| return ret; |
| } |
| ElfFileReader reader(fd); |
| ret.arch = reader.arch(); |
| |
| DynamicSection dynamic = reader.read_dynamic(); |
| for (auto& entry : dynamic.entries) { |
| if (entry.tag == DT_NEEDED && entry.val < dynamic.strtab.size()) { |
| const char* name = &dynamic.strtab[entry.val]; |
| if (!strncmp(name, "libasan", 7)) { |
| ret.sanitizer_path = string(name); |
| ret.setup_asan_memory_ranges(); |
| } else if (!strncmp(name, "libtsan", 7)) { |
| ret.sanitizer_path = string(name); |
| ret.setup_tsan_memory_ranges(); |
| } |
| } |
| } |
| |
| auto syms = reader.read_symbols(".dynsym", ".dynstr"); |
| for (size_t i = 0; i < syms.size(); ++i) { |
| if (syms.is_name(i, "__asan_init")) { |
| ret.setup_asan_memory_ranges(); |
| } else if (syms.is_name(i, "__tsan_init")) { |
| ret.setup_tsan_memory_ranges(); |
| } |
| } |
| |
| return ret; |
| } |
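| |
| // For example, a binary whose ASAN runtime is dynamically linked carries a |
| // DT_NEEDED entry such as "libasan.so.8" (version suffix illustrative), |
| // which the prefix test above matches; a statically linked sanitizer |
| // runtime typically still exports "__asan_init" in .dynsym, which the |
| // symbol scan catches instead. |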
| |
| static string lookup_by_path(const string& name) { |
| if (name.find('/') != string::npos) { |
| return name; |
| } |
| const char* env = getenv("PATH"); |
| if (!env) { |
| return name; |
| } |
| char* p = strdup(env); |
| char* s = p; |
| while (*s) { |
| char* next = strchr(s, ':'); |
| if (next) { |
| *next = 0; |
| } |
| string file = string(s) + "/" + name; |
| struct stat st; |
| if (!stat(file.c_str(), &st) && S_ISREG(st.st_mode) && |
| !access(file.c_str(), X_OK)) { |
| free(p); |
| return file; |
| } |
| if (!next) { |
| break; |
| } |
| s = next + 1; |
| } |
| free(p); |
| return name; |
| } |
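| |
| // For example, with PATH="/usr/local/bin:/usr/bin", lookup_by_path("ls") |
| // probes /usr/local/bin/ls and then /usr/bin/ls, returning the first |
| // regular, executable match; a name containing '/' is returned unchanged. |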
| |
| /*static*/ RecordSession::shr_ptr RecordSession::create( |
| const vector<string>& argv, const vector<string>& extra_env, |
| const DisableCPUIDFeatures& disable_cpuid_features, |
| SyscallBuffering syscallbuf, |
| unsigned char syscallbuf_desched_sig, |
| BindCPU bind_cpu, |
| const string& output_trace_dir, |
| const TraceUuid* trace_id, |
| bool use_audit, |
| bool unmap_vdso, |
| bool force_asan_active, |
| bool force_tsan_active) { |
| TraceeAttentionSet::initialize(); |
| |
| // The syscallbuf library interposes some critical |
| // external symbols like XShmQueryExtension(), so we |
| // preload it whether or not syscallbuf is enabled. Indicate here whether |
| // syscallbuf is enabled. |
| if (syscallbuf == DISABLE_SYSCALL_BUF) { |
| unsetenv(SYSCALLBUF_ENABLED_ENV_VAR); |
| } else { |
| setenv(SYSCALLBUF_ENABLED_ENV_VAR, "1", 1); |
| |
| if (!has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) && |
| !has_effective_caps(uint64_t(1) << CAP_PERFMON)) { |
| ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY); |
| if (fd.is_open()) { |
| char buf[100]; |
| ssize_t size = read(fd, buf, sizeof(buf) - 1); |
| if (size >= 0) { |
| buf[size] = 0; |
| int val = atoi(buf); |
| if (val > 1) { |
| fprintf(stderr, |
| "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is %d.\n" |
| "Change it to 1, or use 'rr record -n' (slow).\n" |
| "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n" |
| "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n" |
| "and 'man 5 sysctl.conf' (non-systemd systems) for more details.\n", |
| val); |
| exit(1); |
| } |
| } |
| } |
| } |
| } |
| |
| vector<string> env = current_env(); |
| |
| // Have extra_env override anything already in the environment |
| for (string extra : extra_env) { |
| string extra_var = extra.substr(0, extra.find('=')); |
| auto it = env.begin(); |
| for (; it != env.end(); ++it) { |
| if (it->find(extra_var) != 0) { |
| continue; |
| } |
| it = env.erase(it); |
| break; |
| } |
| } |
| env.insert(env.end(), extra_env.begin(), extra_env.end()); |
| |
| string full_path = lookup_by_path(argv[0]); |
| struct stat st; |
| if (stat(full_path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)) { |
| CLEAN_FATAL() << "Provided tracee '" << argv[0] << "' is a directory, not an executable"; |
| } |
| ExeInfo exe_info = read_exe_info(full_path); |
| if (exe_info.sanitizer_exclude_memory_ranges.empty()) { |
| if (force_asan_active) { |
| exe_info.setup_asan_memory_ranges(); |
| } else if (force_tsan_active) { |
| exe_info.setup_tsan_memory_ranges(); |
| } |
| } |
| |
| // Strip any LD_PRELOAD that an outer rr may have inserted |
| strip_outer_ld_preload(env); |
| |
| // LD_PRELOAD the syscall interception lib |
| string syscall_buffer_lib_path = find_helper_library(SYSCALLBUF_LIB_FILENAME); |
| if (!syscall_buffer_lib_path.empty()) { |
| string ld_preload = ""; |
| if (!exe_info.sanitizer_path.empty()) { |
| LOG(debug) << "Prepending " << exe_info.sanitizer_path << " to LD_PRELOAD"; |
| // Put an LD_PRELOAD entry for it before our preload library, because |
| // it checks that it's loaded first |
| ld_preload += exe_info.sanitizer_path + ":"; |
| } |
| ld_preload += syscall_buffer_lib_path + SYSCALLBUF_LIB_FILENAME_PADDED; |
| // When librrpreload is built against glibc 2.34 but runs in a process linking pre-2.34 glibc, |
| // its call to dlsym needs to search libdl before libc. When librrpreload found dlsym |
| // in libc at link time, pre-2.34 ld.so throws a fatal error if it searches for dlsym in libc and |
| // can't find it. |
| ld_preload += ":libdl.so.2"; |
| inject_ld_helper_library(env, "LD_PRELOAD", ld_preload); |
| } |
| |
| if (use_audit) { |
| string rtld_audit_lib_path = find_helper_library(RTLDAUDIT_LIB_FILENAME); |
| if (!rtld_audit_lib_path.empty()) { |
| string ld_audit = rtld_audit_lib_path + RTLDAUDIT_LIB_FILENAME_PADDED; |
| inject_ld_helper_library(env, "LD_AUDIT", ld_audit); |
| } |
| } |
| |
| env.push_back("RUNNING_UNDER_RR=1"); |
| // Stop Mesa using the GPU |
| env.push_back("LIBGL_ALWAYS_SOFTWARE=1"); |
| env.push_back("GBM_ALWAYS_SOFTWARE=1"); |
| env.push_back("SDL_RENDER_DRIVER=software"); |
| // Stop sssd from using shared-memory with its daemon |
| env.push_back("SSS_NSS_USE_MEMCACHE=NO"); |
| |
| // Disable Gecko's "wait for gdb to attach on process crash" behavior, since |
| // it is useless when running under rr. |
| env.push_back("MOZ_GDB_SLEEP=0"); |
| |
| // Avoid GVFS using separate processes that might run |
| // outside the recording but share memory mapped files. |
| env.push_back("GIO_USE_VFS=local"); |
| |
| // If we have CPUID faulting, don't use these environment hacks. We don't |
| // need them and the user might want to use them themselves for other reasons. |
| if (!Session::has_cpuid_faulting()) { |
| // OpenSSL uses RDRAND, but we can disable it. These bitmasks are inverted |
| // and ANDed with the results of CPUID. The number below is 2^62, which is the |
| // bit for RDRAND support. |
| env.push_back("OPENSSL_ia32cap=~4611686018427387904:0"); |
| // Disable Qt's use of RDRAND/RDSEED/RTM |
| env.push_back("QT_NO_CPU_FEATURE=rdrnd rdseed rtm"); |
| // Disable systemd's use of RDRAND |
| env.push_back("SYSTEMD_RDRAND=0"); |
| } |
| |
| shr_ptr session( |
| new RecordSession(full_path, argv, env, disable_cpuid_features, |
| syscallbuf, syscallbuf_desched_sig, bind_cpu, |
| output_trace_dir, trace_id, use_audit, unmap_vdso)); |
| session->excluded_ranges_ = std::move(exe_info.sanitizer_exclude_memory_ranges); |
| session->fixed_global_exclusion_range_ = std::move(exe_info.fixed_global_exclusion_range); |
| return session; |
| } |
| |
| RecordSession::RecordSession(const std::string& exe_path, |
| const std::vector<std::string>& argv, |
| const std::vector<std::string>& envp, |
| const DisableCPUIDFeatures& disable_cpuid_features, |
| SyscallBuffering syscallbuf, |
| int syscallbuf_desched_sig, |
| BindCPU bind_cpu, |
| const string& output_trace_dir, |
| const TraceUuid* trace_id, |
| bool use_audit, |
| bool unmap_vdso) |
| : trace_out(argv[0], output_trace_dir, ticks_semantics_), |
| scheduler_(*this), |
| trace_id(trace_id), |
| disable_cpuid_features_(disable_cpuid_features), |
| ignore_sig(0), |
| continue_through_sig(0), |
| last_task_switchable(PREVENT_SWITCH), |
| syscall_buffer_size_(1024 * 1024), |
| syscallbuf_desched_sig_(syscallbuf_desched_sig), |
| use_syscall_buffer_(syscallbuf == ENABLE_SYSCALL_BUF), |
| use_file_cloning_(true), |
| use_read_cloning_(true), |
| enable_chaos_(false), |
| wait_for_all_(false), |
| use_audit_(use_audit), |
| unmap_vdso_(unmap_vdso) { |
| if (!has_cpuid_faulting() && |
| disable_cpuid_features.any_features_disabled()) { |
| FATAL() << "CPUID faulting required to disable CPUID features"; |
| } |
| |
| if (rr::syscall_number_for_rrcall_init_preload(x86_64) != RR_CALL_BASE) { |
| FATAL() << "RR_CALL_BASE is incorrect"; |
| } |
| |
| trace_out.set_bound_cpu(choose_cpu(bind_cpu, cpu_lock)); |
| do_bind_cpu(); |
| ScopedFd error_fd = create_spawn_task_error_pipe(); |
| RecordTask* t = static_cast<RecordTask*>( |
| Task::spawn(*this, error_fd, &tracee_socket_fd(), |
| &tracee_socket_receiver_fd(), |
| &tracee_socket_fd_number, |
| exe_path, argv, envp)); |
| |
| if (NativeArch::is_x86ish()) { |
| // CPU affinity has been set. |
| trace_out.setup_cpuid_records(has_cpuid_faulting(), disable_cpuid_features_); |
| if (cpu_has_xsave_fip_fdp_quirk()) { |
| trace_out.set_xsave_fip_fdp_quirk(true); |
| // Clear FIP/FDP on every event to reduce the probability of this quirk |
| // causing divergence, especially when porting traces to Intel machines |
| trace_out.set_clear_fip_fdp(true); |
| } |
| if (cpu_has_fdp_exception_only_quirk()) { |
| trace_out.set_fdp_exception_only_quirk(true); |
| } |
| } |
| |
| initial_thread_group = t->thread_group(); |
| on_create(t); |
| } |
| |
| RecordSession::RecordResult RecordSession::record_step() { |
| RecordResult result; |
| |
| if (task_map.empty()) { |
| result.status = STEP_EXITED; |
| result.exit_status = initial_thread_group->exit_status; |
| return result; |
| } |
| |
| if (!wait_for_all_ && initial_thread_group->task_set().empty()) { |
| // SIGKILL any tasks we haven't already killed. |
| terminate_tracees(); |
| } |
| |
| result.status = STEP_CONTINUE; |
| |
| TaskUid prev_task_tuid; |
| if (scheduler().current()) { |
| prev_task_tuid = scheduler().current()->tuid(); |
| } |
| auto rescheduled = scheduler().reschedule(last_task_switchable); |
| if (rescheduled.interrupted_by_signal) { |
| // The scheduler was waiting for some task to become active, but was |
| // interrupted by a signal. Yield to our caller now to give the caller |
| // a chance to do something triggered by the signal |
| // (e.g. terminate the recording). |
| return result; |
| } |
| RecordTask* t = scheduler().current(); |
| if (!t) { |
| // No child to schedule. Yield to our caller to give it a chance |
| // to do something (e.g. terminate the recording). |
| return result; |
| } |
| // If the task has been reaped prematurely then it's not running |
| // and we can't get registers etc, so minimize what we do between here |
| // and handle_ptrace_exit_event(). |
| if (t->waiting_for_reap) { |
| // Give it another chance to be reaped |
| t->did_reach_zombie(); |
| return result; |
| } |
| RecordTask* prev_task = find_task(prev_task_tuid); |
| if (prev_task && prev_task->ev().type() == EV_SCHED) { |
| if (prev_task != t) { |
| // We did do a context switch, so record the SCHED event. Otherwise |
| // we'll just discard it. |
| prev_task->record_current_event(); |
| } |
| prev_task->pop_event(EV_SCHED); |
| } |
| |
| // Have to disable context-switching until we know it's safe |
| // to allow switching the context. |
| last_task_switchable = PREVENT_SWITCH; |
| |
| LOG(debug) << "trace time " << t->trace_time() << ": Active task is " |
| << t->tid << ". Events:"; |
| if (IS_LOGGING(debug)) { |
| t->log_pending_events(); |
| } |
| |
| if (handle_ptrace_exit_event(t)) { |
| // t may have been deleted. |
| last_task_switchable = ALLOW_SWITCH; |
| return result; |
| } |
| |
| if (rescheduled.started_new_timeslice) { |
| t->registers_at_start_of_last_timeslice = t->regs(); |
| t->time_at_start_of_last_timeslice = trace_writer().time(); |
| } |
| |
| StepState step_state(CONTINUE); |
| |
| ASSERT(t, t->is_stopped()) << "Somehow we're not stopped here; status=" |
| << t->status(); |
| bool did_enter_syscall; |
| if (rescheduled.by_waitpid && |
| handle_ptrace_event(&t, &step_state, &result, &did_enter_syscall)) { |
| if (result.status != STEP_CONTINUE || |
| step_state.continue_type == DONT_CONTINUE) { |
| last_task_switchable = ALLOW_SWITCH; |
| return result; |
| } |
| |
| if (did_enter_syscall && t->ev().type() == EV_SYSCALL) { |
| syscall_state_changed(t, &step_state); |
| } |
| } else { |
| ASSERT(t, t->is_stopped()) << "handle_ptrace_event left us in a not-stopped state"; |
| if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) { |
| // Tracee may have exited while processing descheds; handle that. |
| if (handle_ptrace_exit_event(t)) { |
| // t may have been deleted. |
| last_task_switchable = ALLOW_SWITCH; |
| return result; |
| } |
| } else { |
| ASSERT(t, t->is_stopped()) << "handle_signal_event left us in a not-stopped state"; |
| runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid); |
| |
| if (result.status != STEP_CONTINUE || |
| step_state.continue_type == DONT_CONTINUE) { |
| return result; |
| } |
| |
| switch (t->ev().type()) { |
| case EV_DESCHED: |
| desched_state_changed(t); |
| break; |
| case EV_SYSCALL: |
| syscall_state_changed(t, &step_state); |
| break; |
| case EV_SIGNAL: |
| case EV_SIGNAL_DELIVERY: |
| if (signal_state_changed(t, &step_state)) { |
| // t may have been deleted |
| return result; |
| } |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| |
| t->verify_signal_states(); |
| |
| // We try to inject a signal if there's one pending; otherwise we continue |
| // task execution. |
| if (!prepare_to_inject_signal(t, &step_state) && |
| step_state.continue_type != DONT_CONTINUE) { |
| // Ensure that we aren't allowing switches away from a running task. |
| // Only tasks blocked in a syscall can be switched away from, otherwise |
| // we have races. |
| ASSERT(t, |
| last_task_switchable == PREVENT_SWITCH || |
| t->may_be_blocked()); |
| |
| debug_exec_state("EXEC_START", t); |
| |
| task_continue(step_state); |
| } |
| |
| return result; |
| } |
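| |
| // For illustration only, a minimal driver loop over record_step(); a |
| // sketch, not rr's real frontend (RecordCommand), which also installs |
| // signal handlers and flushes buffers on termination: |
| // |
| //   RecordSession::shr_ptr session = RecordSession::create(...); |
| //   while (true) { |
| //     auto result = session->record_step(); |
| //     if (result.status == RecordSession::STEP_EXITED) { |
| //       break; // result.exit_status holds the initial group's status |
| //     } |
| //     if (result.status == RecordSession::STEP_SPAWN_FAILED) { |
| //       fprintf(stderr, "%s\n", result.failure_message.c_str()); |
| //       break; |
| //     } |
| //     // STEP_CONTINUE: keep going; a good point to poll for SIGINT etc. |
| //   } |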
| |
| void RecordSession::terminate_tracees() { |
| for (auto& v : task_map) { |
| RecordTask* t = static_cast<RecordTask*>(v.second); |
| if (!t->detached_proxy && !t->sent_shutdown_kill) { |
| LOG(debug) << "Terminating tracee " << t->tid; |
| ::kill(t->rec_tid, SIGKILL); |
| t->sent_shutdown_kill = true; |
| t->emulate_SIGCONT(); |
| } |
| } |
| } |
| |
| void RecordSession::forward_SIGTERM() { |
| if (!initial_thread_group->task_set().empty()) { |
| kill(initial_thread_group->tgid, SIGTERM); |
| } |
| } |
| |
| void RecordSession::term_detached_tasks() { |
| // Send SIGTERM to all detached child tasks first, so they may clean up |
| // in parallel. |
| for (auto& v : task_map) { |
| RecordTask* t = static_cast<RecordTask*>(v.second); |
| if (!t->detached_proxy) { |
| continue; |
| } |
| ::kill(t->rec_tid, SIGTERM); |
| } |
| for (auto it = task_map.begin(); it != task_map.end(); ) { |
| RecordTask* t = static_cast<RecordTask*>(it->second); |
| if (!t->detached_proxy) { |
| ++it; |
| continue; |
| } |
| WaitResult result = WaitManager::wait_exit(WaitOptions(t->rec_tid)); |
| if (result.code != WAIT_OK) { |
| LOG(warn) << "Wait failed"; |
| } else if (result.status.type() != WaitStatus::EXIT) { |
| LOG(warn) << "Unexpected wait status " << result.status << |
| " while waiting for detached child " << t->rec_tid; |
| } |
| ++it; |
| delete t; |
| } |
| } |
| |
| void RecordSession::close_trace_writer(TraceWriter::CloseStatus status) { |
| trace_out.close(status, trace_id.get()); |
| } |
| |
| Task* RecordSession::new_task(pid_t tid, pid_t, uint32_t serial, |
| SupportedArch a, const std::string&) { |
| return new RecordTask(*this, tid, serial, a); |
| } |
| |
| void RecordSession::on_create(Task* t) { |
| Session::on_create(t); |
| scheduler().on_create(static_cast<RecordTask*>(t)); |
| } |
| |
| void RecordSession::on_destroy(Task* t) { |
| RecordTask *rt = static_cast<RecordTask*>(t); |
| scheduler().on_destroy(rt); |
| if (rt->detached_proxy) { |
| detached_task_map.erase(rt->tid); |
| } |
| Session::on_destroy(t); |
| } |
| |
| RecordTask* RecordSession::find_task(pid_t rec_tid) const { |
| return static_cast<RecordTask*>(Session::find_task(rec_tid)); |
| } |
| |
| RecordTask* RecordSession::find_task(const TaskUid& tuid) const { |
| return static_cast<RecordTask*>(Session::find_task(tuid)); |
| } |
| |
| RecordTask* RecordSession::find_detached_proxy_task(pid_t proxy_tid) const { |
| auto it = detached_task_map.find(proxy_tid); |
| return detached_task_map.end() != it ? it->second : nullptr; |
| } |
| |
| void RecordSession::on_proxy_detach(RecordTask *t, pid_t new_tid) { |
| Session::on_destroy(t); |
| task_map[new_tid] = t; |
| detached_task_map[t->tid] = t; |
| } |
| |
| uint64_t RecordSession::rr_signal_mask() const { |
| return signal_bit(PerfCounters::TIME_SLICE_SIGNAL) | |
| signal_bit(syscallbuf_desched_sig_); |
| } |
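| |
| // Illustration (assuming signal_bit(sig) is uint64_t(1) << (sig - 1)): |
| // with TIME_SLICE_SIGNAL == SIGSTKFLT (16) and the default desched signal |
| // SIGPWR (30), this returns (1 << 15) | (1 << 29), the two signals rr |
| // reserves for itself. |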
| |
| static const uint32_t CPUID_RDRAND_FLAG = 1 << 30; |
| static const uint32_t CPUID_RTM_FLAG = 1 << 11; |
| static const uint32_t CPUID_RDSEED_FLAG = 1 << 18; |
| static const uint32_t CPUID_XSAVEOPT_FLAG = 1 << 0; |
| |
| void DisableCPUIDFeatures::amend_cpuid_data(uint32_t eax_in, uint32_t ecx_in, |
| CPUIDData* cpuid_data) const { |
| switch (eax_in) { |
| case CPUID_GETFEATURES: |
| cpuid_data->ecx &= ~(CPUID_RDRAND_FLAG | features_ecx); |
| cpuid_data->edx &= ~features_edx; |
| break; |
| case CPUID_GETEXTENDEDFEATURES: |
| if (ecx_in == 0) { |
| cpuid_data->ebx &= ~(CPUID_RDSEED_FLAG | CPUID_RTM_FLAG |
| | extended_features_ebx); |
| cpuid_data->ecx &= ~extended_features_ecx; |
| cpuid_data->edx &= ~extended_features_edx; |
| } |
| break; |
| case CPUID_GETXSAVE: |
| if (ecx_in == 1) { |
| // Always disable XSAVEOPT because it's nondeterministic, |
| // possibly depending on context switching behavior. Intel |
| // recommends not using it from user space. |
| cpuid_data->eax &= ~(CPUID_XSAVEOPT_FLAG | xsave_features_eax); |
| } |
| break; |
| default: |
| break; |
| } |
| } |
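| |
| // For example, with no extra features disabled: a tracee executing CPUID |
| // with EAX=1 sees ECX bit 30 (RDRAND) cleared, and with EAX=7, ECX=0 sees |
| // EBX bits 11 (RTM) and 18 (RDSEED) cleared, so well-behaved code falls |
| // back to deterministic paths instead of using those instructions. |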
| |
| } // namespace rr |