/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "record_signal.h"

#include <fcntl.h>
#include <linux/perf_event.h>
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/user.h>
#include <syscall.h>

#include "preload/preload_interface.h"

#include "AutoRemoteSyscalls.h"
#include "Flags.h"
#include "PerfCounters.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "TraceStream.h"
#include "VirtualPerfCounterMonitor.h"
#include "core.h"
#include "kernel_metadata.h"
#include "log.h"
#include "util.h"

using namespace std;

namespace rr {

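/**
 * If the application installed a non-default handler for |sig|, reinstall
 * it in the tracee from rr's saved copy of the sigaction. This undoes the
 * kernel resetting the disposition to SIG_DFL (see handle_signal below).
 */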
static void restore_sighandler_if_not_default(RecordTask* t, int sig) {
  if (t->sig_disposition(sig) != SIGNAL_DEFAULT) {
    LOG(debug) << "Restoring signal handler for " << signal_name(sig);
    AutoRemoteSyscalls remote(t);
    size_t sigset_size = sigaction_sigset_size(remote.arch());
    const vector<uint8_t>& sa = t->signal_action(sig);
    AutoRestoreMem child_sa(remote, sa.data(), sa.size());
    remote.infallible_syscall(syscall_number_for_rt_sigaction(remote.arch()),
                              sig, child_sa.get().as_int(), nullptr,
                              sigset_size);
  }
}

/**
 * Restore the blocked-ness and sigaction for |sig| from |t|'s local
 * copy.
 */
static void restore_signal_state(RecordTask* t, int sig,
                                 SignalBlocked signal_was_blocked) {
  restore_sighandler_if_not_default(t, sig);
  if (signal_was_blocked) {
    LOG(debug) << "Restoring signal blocked-ness for " << signal_name(sig);
    AutoRemoteSyscalls remote(t);
    size_t sigset_size = sigaction_sigset_size(remote.arch());
    vector<uint8_t> bytes;
    bytes.resize(sigset_size);
    memset(bytes.data(), 0, sigset_size);
    sig_set_t mask = signal_bit(sig);
    ASSERT(t, sigset_size >= sizeof(mask));
    memcpy(bytes.data(), &mask, sizeof(mask));
    AutoRestoreMem child_block(remote, bytes.data(), bytes.size());
    remote.infallible_syscall(syscall_number_for_rt_sigprocmask(remote.arch()),
                              SIG_BLOCK, child_block.get().as_int(), nullptr,
                              sigset_size);
    // We just changed the sigmask ourselves.
    t->invalidate_sigmask();
  }
}

/**
 * Return true if |t| was stopped because of a SIGSEGV resulting
 * from a disabled instruction (RDTSC/RDTSCP/CPUID) and |t| was updated
 * appropriately, false otherwise.
 */
static bool try_handle_trapped_instruction(RecordTask* t, siginfo_t* si) {
  ASSERT(t, si->si_signo == SIGSEGV);

  auto trapped_instruction = trapped_instruction_at(t, t->ip());
  switch (trapped_instruction) {
    case TrappedInstruction::RDTSC:
    case TrappedInstruction::RDTSCP:
      if (t->tsc_mode == PR_TSC_SIGSEGV) {
        return false;
      }
      break;
    case TrappedInstruction::CPUID:
      if (t->cpuid_mode == 0) {
        return false;
      }
      break;
    default:
      return false;
  }

  size_t len = trapped_instruction_len(trapped_instruction);
  ASSERT(t, len > 0);

  Registers r = t->regs();
  if (trapped_instruction == TrappedInstruction::RDTSC ||
      trapped_instruction == TrappedInstruction::RDTSCP) {
    if (trapped_instruction == TrappedInstruction::RDTSC &&
        t->vm()->monkeypatcher().try_patch_trapping_instruction(t, len, true)) {
      Event ev = Event::patch_syscall();
      ev.PatchSyscall().patch_trapping_instruction = true;
      t->record_event(ev);
      t->push_event(Event::noop());
      return true;
    }

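    // Emulate the instruction: execute rdtsc here in rr and hand the result
    // to the tracee through its rdtsc output registers.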
    unsigned long long current_time = rdtsc();
    r.set_rdtsc_output(current_time);

    LOG(debug) << " trapped for rdtsc: returning " << current_time;
  } else if (trapped_instruction == TrappedInstruction::CPUID) {
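    // CPUID inputs: on x86, syscallno() reads the eax register and cx()
    // reads ecx.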
    auto eax = r.syscallno();
    auto ecx = r.cx();
    auto cpuid_data = cpuid(eax, ecx);
    t->session().disable_cpuid_features()
        .amend_cpuid_data(eax, ecx, &cpuid_data);
    r.set_cpuid_output(cpuid_data.eax, cpuid_data.ebx, cpuid_data.ecx,
                       cpuid_data.edx);
    LOG(debug) << " trapped for cpuid: " << HEX(eax) << ":" << HEX(ecx);
  }

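  // Advance the tracee past the emulated instruction and record a
  // deterministic event so replay reproduces the same register effects.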
  r.set_ip(r.ip() + len);
  t->set_regs(r);
  t->record_event(Event::instruction_trap());

  if (t->retry_syscall_patching) {
    LOG(debug) << "Retrying deferred syscall patching";
    t->retry_syscall_patching = false;
    if (t->vm()->monkeypatcher().try_patch_trapping_instruction(t, len, false)) {
      // Instruction was patched. Emit event.
      auto ev = Event::patch_syscall();
      ev.PatchSyscall().patch_after_syscall = true;
      t->record_event(ev);
    }
  }

  t->push_event(Event::noop());
  return true;
}

/**
 * Return true if |t| was stopped because of a SIGSEGV and we want to retry
 * the instruction after emulating MAP_GROWSDOWN.
 */
static bool try_grow_map(RecordTask* t, siginfo_t* si) {
  ASSERT(t, si->si_signo == SIGSEGV);

  // Use kernel_abi to avoid odd inconsistencies between distros
  auto arch_si = reinterpret_cast<NativeArch::siginfo_t*>(si);
  auto addr = arch_si->_sifields._sigfault.si_addr_.rptr();

  if (t->vm()->has_mapping(addr)) {
    LOG(debug) << "try_grow_map " << addr << ": address already mapped";
    return false;
  }
  auto maps = t->vm()->maps_starting_at(floor_page_size(addr));
  auto it = maps.begin();
  if (it == maps.end()) {
    LOG(debug) << "try_grow_map " << addr << ": no later map to grow downward";
    return false;
  }
  if (!(it->map.flags() & MAP_GROWSDOWN)) {
    LOG(debug) << "try_grow_map " << addr << ": map is not MAP_GROWSDOWN ("
               << it->map << ")";
    return false;
  }
  if (addr >= page_size() && t->vm()->has_mapping(addr - page_size())) {
    LOG(debug) << "try_grow_map " << addr << ": address would be in guard page";
    return false;
  }
  remote_ptr<void> limit_bottom;
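  // Fetch the stack rlimit. On 32-bit x86 we call prlimit64 directly,
  // presumably because glibc's 32-bit rlim_t there could truncate large
  // limits.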
#if defined(__i386__)
  struct rlimit64 stack_limit;
  int ret = syscall(__NR_prlimit64, t->tid, RLIMIT_STACK, (void*)0, &stack_limit);
#else
  struct rlimit stack_limit;
  int ret = prlimit(t->tid, RLIMIT_STACK, NULL, &stack_limit);
#endif
  if (ret >= 0 && stack_limit.rlim_cur != RLIM_INFINITY) {
    limit_bottom = ceil_page_size(it->map.end() - stack_limit.rlim_cur);
    if (limit_bottom > addr) {
      LOG(debug) << "try_grow_map " << addr << ": RLIMIT_STACK exceeded";
      return false;
    }
  }

  // Try to grow by 64K at a time to reduce signal frequency.
  auto new_start = floor_page_size(addr);
  static const uintptr_t grow_size = 0x10000;
  if (it->map.start().as_int() >= grow_size) {
    auto possible_new_start = std::max(
        limit_bottom, std::min(new_start, it->map.start() - grow_size));
    // Ensure that no mapping exists between possible_new_start - page_size()
    // and new_start. If there is, possible_new_start is not valid, in which
    // case we just abandon the optimization.
    if (possible_new_start >= page_size() &&
        !t->vm()->has_mapping(possible_new_start - page_size()) &&
        t->vm()->maps_starting_at(possible_new_start - page_size())
                .begin()
                ->map.start() == it->map.start()) {
      new_start = possible_new_start;
    }
  }
  LOG(debug) << "try_grow_map " << addr << ": trying to grow map " << it->map;

  {
    AutoRemoteSyscalls remote(t, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS);
    remote.infallible_mmap_syscall_if_alive(
        new_start, it->map.start() - new_start, it->map.prot(),
        (it->map.flags() & ~MAP_GROWSDOWN) | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
  }

  KernelMapping km =
      t->vm()->map(t, new_start, it->map.start() - new_start, it->map.prot(),
                   it->map.flags() | MAP_ANONYMOUS, 0, string(),
                   KernelMapping::NO_DEVICE, KernelMapping::NO_INODE);
  t->trace_writer().write_mapped_region(t, km, km.fake_stat(), km.fsname(),
                                        vector<TraceRemoteFd>());
  // No need to flush syscallbuf here. It's safe to map these pages "early"
  // before they're really needed.
  t->record_event(Event::grow_map(), RecordTask::DONT_FLUSH_SYSCALLBUF);
  t->push_event(Event::noop());
  LOG(debug) << "try_grow_map " << addr << ": extended map "
             << t->vm()->mapping_of(addr).map;
  return true;
}

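/**
 * Disable the tracee's desched perf event so it stops generating desched
 * signals. No-op if the event fd isn't open.
 */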
void disarm_desched_event(RecordTask* t) {
  if (t->desched_fd.is_open() &&
      ioctl(t->desched_fd, PERF_EVENT_IOC_DISABLE, 0)) {
    FATAL() << "Failed to disarm desched event";
  }
}

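/**
 * Re-enable the tracee's desched perf event. No-op if the event fd isn't
 * open.
 */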
void arm_desched_event(RecordTask* t) {
  if (t->desched_fd.is_open() &&
      ioctl(t->desched_fd, PERF_EVENT_IOC_ENABLE, 0)) {
    FATAL() << "Failed to arm desched event";
  }
}

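/**
 * Read preload_thread_locals::stub_scratch_1 from the tracee. It holds the
 * code address the syscallbuf return stub will jump to (see
 * handle_syscallbuf_breakpoint below).
 */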
template <typename Arch>
static remote_code_ptr get_stub_scratch_1_arch(RecordTask* t) {
  auto remote_locals = AddressSpace::preload_thread_locals_start()
                           .cast<preload_thread_locals<Arch>>();
  auto remote_stub_scratch_1 = REMOTE_PTR_FIELD(remote_locals, stub_scratch_1);
  return t->read_mem(remote_stub_scratch_1).rptr().as_int();
}

static remote_code_ptr get_stub_scratch_1(RecordTask* t) {
  RR_ARCH_FUNCTION(get_stub_scratch_1_arch, t->arch(), t);
}

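/**
 * Copy |sz| bytes of preload_thread_locals::stub_scratch_2 from the tracee.
 * On aarch64 it holds the saved x15 and x30 values that the return stub
 * would restore.
 */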
template <typename Arch>
static void get_stub_scratch_2_arch(RecordTask* t, void* buff, size_t sz) {
  auto remote_locals = AddressSpace::preload_thread_locals_start()
                           .cast<preload_thread_locals<Arch>>();
  auto remote_stub_scratch_2 = REMOTE_PTR_FIELD(remote_locals, stub_scratch_2);
  t->read_bytes_helper(remote_stub_scratch_2, sz, buff);
}

static void get_stub_scratch_2(RecordTask* t, void* buff, size_t sz) {
  RR_ARCH_FUNCTION(get_stub_scratch_2_arch, t->arch(), t, buff, sz);
}

/**
 * This function is responsible for handling breakpoints we set in syscallbuf
 * code to detect sigprocmask calls and syscallbuf exit. It's called when we
 * get a SIGTRAP. Returns true if the SIGTRAP was caused by one of our
 * breakpoints and should be hidden from the application.
 * If it was triggered by one of our breakpoints, we have to call
 * restore_sighandler_if_not_default(t, SIGTRAP) to make sure the SIGTRAP
 * handler is properly restored if the kernel cleared it.
 */
bool handle_syscallbuf_breakpoint(RecordTask* t) {
  if (t->is_at_syscallbuf_final_instruction_breakpoint()) {
    LOG(debug) << "Reached final syscallbuf instruction, emulating jump to "
                  "enable signal dispatch";
    // Emulate the effect of the return from syscallbuf.
    // On x86, this is a single instruction that jumps to the location stored in
    // preload_thread_locals::stub_scratch_1.
    // On aarch64, the target of the jump is an instruction that restores
    // x15 and x30 and then jumps back to the syscall.
    // To minimize the surprise to the tracee if we decide to deliver a signal,
    // we'll emulate the register restore and return directly to the syscall site.
    // The address in stub_scratch_1 is already the correct address for this.
    if (t->arch() == aarch64) {
      uint64_t x15_x30[2];
      get_stub_scratch_2(t, x15_x30, 16);
      Registers r = t->regs();
      r.set_x15(x15_x30[0]);
      r.set_xlr(x15_x30[1]);
      t->set_regs(r);
      t->count_direct_jump();
    }
    t->emulate_jump(get_stub_scratch_1(t));

    restore_sighandler_if_not_default(t, SIGTRAP);
    // Now we're back in application code so any pending stashed signals
    // will be handled.
    return true;
  }

  if (t->is_at_syscallstub_exit_breakpoint()) {
    LOG(debug) << "Reached syscallstub exit instruction, emulating return to "
                  "enable signal dispatch";
    ASSERT(t, t->arch() == aarch64 && t->syscallstub_exit_breakpoint);
    auto retaddr_addr =
        t->syscallstub_exit_breakpoint.to_data_ptr<uint8_t>() + 3 * 4;
    uint64_t retaddr;
    t->read_bytes_helper(retaddr_addr, sizeof(retaddr), &retaddr);
    Registers r = t->regs();
    r.set_ip(retaddr);
    t->set_regs(r);
    t->count_direct_jump();
    t->syscallstub_exit_breakpoint = nullptr;
    restore_sighandler_if_not_default(t, SIGTRAP);
    // Now we're back in application code so any pending stashed signals
    // will be handled.
    return true;
  }

  if (!t->is_at_syscallbuf_syscall_entry_breakpoint()) {
    return false;
  }

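  // We hit one of our syscall-entry breakpoints. Back the ip up over the
  // breakpoint instruction that was just executed.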
  Registers r = t->regs();
  r.set_ip(r.ip().undo_executed_bkpt(t->arch()));
  t->set_regs(r);

  if (t->is_at_traced_syscall_entry()) {
    // We will automatically dispatch stashed signals now since this is an
    // allowed place to dispatch signals.
    LOG(debug) << "Allowing signal dispatch at traced-syscall breakpoint";
    restore_sighandler_if_not_default(t, SIGTRAP);
    return true;
  }

  // We're at an untraced-syscall entry point.
  // To allow an AutoRemoteSyscall, we need to make sure desched signals are
  // disarmed (and rearmed afterward).
  bool armed_desched_event = t->read_mem(
      REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant));
  if (armed_desched_event) {
    disarm_desched_event(t);
  }
  restore_sighandler_if_not_default(t, SIGTRAP);
  if (armed_desched_event) {
    arm_desched_event(t);
  }

  // This is definitely a native-arch syscall.
  if (is_rt_sigprocmask_syscall(r.syscallno(), t->arch())) {
    // Don't proceed with this syscall. Emulate it returning EAGAIN.
    // Syscallbuf logic will retry using a traced syscall instead.
    r.set_syscall_result(-EAGAIN);
    r.set_ip(r.ip().increment_by_syscall_insn_length(t->arch()));
    t->set_regs(r);
    t->canonicalize_regs(t->arch());
    LOG(debug) << "Emulated EAGAIN to avoid untraced sigprocmask with pending "
                  "stashed signal";
    // Leave breakpoints enabled since we want to break at the traced-syscall
    // fallback for rt_sigprocmask.
    return true;
  }

  // We can proceed with the untraced syscall. Either it will complete and
  // execution will continue until we reach some point where we can deliver our
  // signal, or it will block at which point we'll be able to deliver our
  // signal.
  LOG(debug) << "Disabling breakpoints at untraced syscalls";
  t->break_at_syscallbuf_untraced_syscalls = false;
  return true;
}

/**
 * Handle a desched event that fired for |t|. The tracee's execution may be
 * advanced to the next syscall boundary, and an appropriate event (a noop,
 * or a desched plus interrupted-syscall event) is pushed onto |t|'s event
 * stack.
 */
static void handle_desched_event(RecordTask* t) {
  /* If the tracee isn't in the critical section where a desched
   * event is relevant, we can ignore it. See the long comments
   * in syscall_buffer.c.
   *
   * It's OK if the tracee is in the critical section for a
   * may-block syscall B, but this signal was delivered by an
   * event programmed by a previous may-block syscall A.
   *
   * If we're running in a signal handler inside an interrupted syscallbuf
   * system call, never do anything here. Syscall buffering is disabled and
   * the desched_signal_may_be_relevant was set by the outermost syscallbuf
   * invocation.
   */
  if (!t->read_mem(REMOTE_PTR_FIELD(t->syscallbuf_child,
                                    desched_signal_may_be_relevant)) ||
      t->running_inside_desched()) {
    LOG(debug) << " (not entering may-block syscall; resuming)";
    /* We have to disarm the event just in case the tracee
     * has cleared the relevancy flag, but not yet
     * disarmed the event itself. */
    disarm_desched_event(t);
    t->push_event(Event::noop());
    return;
  }

  /* TODO: how can signals interrupt us here? */

  /* The desched event just fired. That implies that the
   * arm-desched ioctl went into effect, and that the
   * disarm-desched syscall didn't take effect. Since a signal
   * is pending for the tracee, if the tracee was in a
   * syscall, Linux has exited it with an -ERESTART* error code.
   * That means the tracee is about to (re-)enter either
   *
   *  1. buffered syscall
   *  2. disarm-desched ioctl syscall
   *
   * We can figure out which one by simply issuing a
   * ptrace(SYSCALL) and examining the tracee's registers.
   *
   * If the tracee enters the disarm-desched ioctl, it's going
   * to commit a record of the buffered syscall to the
   * syscallbuf, and we can safely send the tracee back on its
   * way, ignoring the desched completely.
   *
   * If it enters the buffered syscall, then the desched event
   * has served its purpose and we need to prepare the tracee to
   * be context-switched.
   *
   * An annoyance of the desched signal is that when the tracee
   * is descheduled in interval (C) above, we normally (see
   * below) see *two* signals. The current theory of what's
   * happening is
   *
   *  o child gets descheduled, bumps counter to i and schedules
   *    signal
   *  o signal notification "schedules" child, but it doesn't
   *    actually run any application code
   *  o child is being ptraced, so we "deschedule" child to
   *    notify parent and bump counter to i+1. (The parent
   *    hasn't had a chance to clear the counter yet.)
   *  o another counter signal is generated, but signal is
   *    already pending so this one is queued
   *  o parent is notified and sees counter value i+1
   *  o parent stops delivery of first signal and disarms
   *    counter
   *  o second signal dequeued and delivered, notifying parent
   *    (counter is disarmed now, so no pseudo-desched possible
   *    here)
   *  o parent is notified and sees counter value i+1 again
   *  o parent stops delivery of second signal and we continue on
   *
   * So we "work around" this by the tracer expecting two signal
   * notifications, and silently discarding both.
   *
   * One really fun edge case is that sometimes the desched
   * signal will interrupt the arm-desched syscall itself.
   * Continuing to the next syscall boundary seems to restart
   * the arm-desched syscall, and advancing to the boundary
   * again exits it and we start receiving desched signals
   * again.
   *
   * That may be a kernel bug, but we handle it by just
   * continuing until we've advanced past the arm-desched
   * syscall *and* stopped seeing signals. */

  const auto untraced_record_only_entry =
      uintptr_t(RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY);
  auto syscall_entry_ip = t->ip().decrement_by_syscall_insn_length(t->arch());
  if (syscall_entry_ip == remote_code_ptr(untraced_record_only_entry) &&
      t->regs().syscall_result_signed() == -EFAULT) {
    intptr_t syscallno;
    if (t->arch() == aarch64) {
      // Untraced syscall, we may not have set original_syscallno for this on
      // aarch64.
      syscallno = t->regs().syscallno();
    } else {
      // On x86, the syscall number is overwritten by the return value.
      ASSERT(t, is_x86ish(t->arch()));
      syscallno = t->regs().original_syscallno();
    }
    if (syscallno == syscall_number_for_getsockopt(t->arch())) {
      // We've observed interrupted getsockopt syscalls returning EFAULT
      // rather than the normal ERESTART*.
      // This is a kernel bug caused by CONFIG_BPFILTER_UMH.
      // Try to reduce the effect caused by rr-generated signals
      // by manually restarting the syscall
      // (since the previous syscall returned EFAULT,
      // we would in the worst case just get another EFAULT).
      // Note that setting the syscall result to ERESTART* wouldn't work on
      // aarch64 if arg1 has been overwritten by AutoRemoteSyscalls.
      auto r = t->regs();
      r.set_ip(syscall_entry_ip);
      if (t->arch() == aarch64) {
        // On aarch64, we need to restore arg1 from the stack argument saved
        // by the syscallbuf.
        auto orig_arg1_ptr = r.sp() + sizeof(long);
        auto orig_arg1 = t->read_mem(orig_arg1_ptr.cast<long>());
        r.set_arg1(orig_arg1);
      } else {
        ASSERT(t, is_x86ish(t->arch()));
        // On x86, we need to restore the syscall number.
        r.set_syscallno(syscallno);
      }
      t->set_regs(r);
    }
  }

  while (true) {
    // Prevent further desched notifications from firing
    // while we're advancing the tracee. We're going to
    // leave it in a consistent state anyway, so the event
    // is no longer useful. We have to do this in each
    // loop iteration because a restarted arm-desched
    // syscall may have re-armed the event.
    disarm_desched_event(t);

    if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT,
                             RESUME_UNLIMITED_TICKS)) {
      LOG(debug) << " (got exit, bailing out)";
      t->push_event(Event::noop());
      return;
    }

    if (t->status().is_syscall()) {
      t->apply_syscall_entry_regs();
      if (t->is_arm_desched_event_syscall()) {
        continue;
      }
      break;
    }
    if (t->ptrace_event() == PTRACE_EVENT_SECCOMP) {
      ASSERT(t,
             t->session().syscall_seccomp_ordering() ==
                 Session::SECCOMP_BEFORE_PTRACE_SYSCALL);
      // This is the old kernel event ordering. This must be a SECCOMP event
      // for the buffered syscall; it's not rr-generated because this is an
      // untraced syscall, but it could be generated by a tracee's
      // seccomp filter.
      break;
    }

    // Completely ignore spurious desched signals and
    // signals that aren't going to be delivered to the
    // tracee.
    //
    // Also ignore time-slice signals. If the tracee ends
    // up at the disarm-desched ioctl, we'll reschedule it
    // with the ticks interrupt still programmed. At worst,
    // the tracee will get an extra time-slice out of
    // this, on average, so we don't worry too much about
    // it.
    //
    // TODO: it's theoretically possible for this to
    // happen an unbounded number of consecutive times
    // and the tracee is never switched out.
    int sig = t->stop_sig();
    ASSERT(t, sig) << "expected stop-signal, got " << t->status();
    if (SIGTRAP == sig && handle_syscallbuf_breakpoint(t)) {
      // We stopped at a breakpoint on an untraced may-block syscall.
      // This can't be relevant to us since sigprocmask isn't may-block.
      LOG(debug) << " disabling breakpoints on untraced syscalls";
      continue;
    }
    if (t->session().syscallbuf_desched_sig() == sig ||
        PerfCounters::TIME_SLICE_SIGNAL == sig || t->is_sig_ignored(sig)) {
      LOG(debug) << " dropping ignored " << signal_name(sig);
      continue;
    }

    LOG(debug) << " stashing " << signal_name(sig);
    t->stash_sig();
  }

  if (t->is_disarm_desched_event_syscall()) {
    LOG(debug)
        << " (at disarm-desched, so finished buffered syscall; resuming)";
    t->push_event(Event::noop());
    return;
  }

  if (t->desched_rec()) {
    // We're already processing a desched. We probably reexecuted the
    // system call (e.g. because a signal was processed) and the syscall
    // blocked again. Carry on with the current desched.
  } else {
    /* This prevents the syscallbuf record counter from being
     * reset until we've finished guiding the tracee through this
     * interrupted call. We use the record counter for
     * assertions. */
    ASSERT(t, !t->delay_syscallbuf_reset_for_desched);
    t->delay_syscallbuf_reset_for_desched = true;
    LOG(debug) << "Desched initiated";

    /* The tracee is (re-)entering the buffered syscall. Stash
     * away this breadcrumb so that we can figure out what syscall
     * the tracee was in, and how much "scratch" space it carved
     * off the syscallbuf, if needed. */
    remote_ptr<const struct syscallbuf_record> desched_rec =
        t->next_syscallbuf_record();
    t->push_event(DeschedEvent(desched_rec));
    int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));

    /* The descheduled syscall was interrupted by a signal, like
     * all other may-restart syscalls, with the exception that
     * this one has already been restarted (which we'll detect
     * back in the main loop). */
    t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
    SyscallEvent& ev = t->ev().Syscall();
    ev.desched_rec = desched_rec;
  }

  SyscallEvent& ev = t->ev().Syscall();
  ev.regs = t->regs();
  /* For some syscalls (at least poll) but not all (at least not read),
   * repeated cont_syscall()s above of the same interrupted syscall
   * can set $orig_eax to 0 ... for unclear reasons. Fix that up here
   * otherwise we'll get a divergence during replay, which will not
   * encounter this problem.
   */
  int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));
  ev.regs.set_original_syscallno(call);
  t->set_regs(ev.regs);
  // runnable_state_changed will observe us entering this syscall and change
  // state to ENTERING_SYSCALL

  LOG(debug) << " resuming (and probably switching out) blocked `"
             << syscall_name(call, ev.arch()) << "'";
}

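/**
 * Return true if the signal described by |si| can be delivered to |t| at
 * its current execution point without corrupting syscallbuf state; false
 * means delivery must be deferred.
 */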
static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) {
  if (!t->is_in_syscallbuf()) {
    /* The tracee is outside the syscallbuf code,
     * so in most cases can't possibly affect
     * syscallbuf critical sections. The
     * exception is signal handlers "re-entering"
     * desched'd syscalls, which are OK. */
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because not in syscallbuf";
    return true;
  }

  // Note that this will never fire on aarch64 in a signal stop
  // since the ip has been moved to the syscall entry.
  // We will catch it in the traced_syscall_entry case below.
  // We will miss the exit for rrcall_notify_syscall_hook_exit
  // but that should not be a big problem.
  if (t->is_in_traced_syscall()) {
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because in traced syscall";
    return true;
  }

  // Don't deliver signals just before entering rrcall_notify_syscall_hook_exit.
  // At that point, notify_on_syscall_hook_exit will be set, but we have
  // passed the point at which syscallbuf code has checked that flag.
  // Replay will set notify_on_syscall_hook_exit when we replay towards the
  // rrcall_notify_syscall_hook_exit *after* handling this signal, but
  // that will be too late for syscallbuf to notice.
  // It's OK to delay signal delivery until after rrcall_notify_syscall_hook_exit
  // anyway.
  if (t->is_at_traced_syscall_entry() &&
      !is_rrcall_notify_syscall_hook_exit_syscall(t->regs().syscallno(),
                                                  t->arch())) {
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because at entry to traced syscall";
    return true;
  }

  // On aarch64, the untraced-syscall range here includes both the entry and
  // the exit if we are at a signal stop.
  if (t->is_in_untraced_syscall() && t->desched_rec()) {
    // Untraced syscalls always use the architecture of the process
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because tracee interrupted by desched of "
               << syscall_name(t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(),
                                                            syscallno)),
                               t->arch());
    return true;
  }

  if (t->is_in_untraced_syscall() && si->si_signo == SIGSYS &&
      si->si_code == SYS_SECCOMP) {
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because signal is seccomp trap.";
    return true;
  }

  // If the syscallbuf buffer hasn't been created yet, just delay the signal
  // with no need to set notify_on_syscall_hook_exit; the signal will be
  // delivered when rrcall_init_buffers is called.
  if (t->syscallbuf_child) {
    if (t->read_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, locked)) & 2) {
      LOG(debug) << "Safe to deliver signal at " << t->ip()
                 << " because the syscallbuf is locked";
      return true;
    }

    // A signal (e.g. seccomp SIGSYS) interrupted an untraced syscall in a
    // non-restartable way. Defer it until SYS_rrcall_notify_syscall_hook_exit.
    if (t->is_in_untraced_syscall()) {
      // Our emulation of SYS_rrcall_notify_syscall_hook_exit clears this flag.
      t->write_mem(
          REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit),
          (uint8_t)1);
    }
  }

  LOG(debug) << "Not safe to deliver signal at " << t->ip();
  return false;
}

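/**
 * Dispatch the signal described by |si| for |t|. Returns SIGNAL_HANDLED if
 * the signal was processed (recorded, emulated or absorbed), DEFER_SIGNAL
 * if it isn't currently safe to deliver it, or SIGNAL_PTRACE_STOP if an
 * emulated ptrace-stop was triggered instead.
 */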
SignalHandled handle_signal(RecordTask* t, siginfo_t* si,
                            SignalDeterministic deterministic,
                            SignalBlocked signal_was_blocked) {
  int sig = si->si_signo;
  LOG(debug) << t->tid << ": handling signal " << signal_name(sig)
             << " (pevent: " << ptrace_event_name(t->ptrace_event())
             << ", event: " << t->ev() << ")";

  // Conservatively invalidate the sigmask in case just accepting a signal has
  // sigmask effects.
  t->invalidate_sigmask();

  if (deterministic == DETERMINISTIC_SIG) {
    // When a deterministic signal is triggered, but the signal is currently
    // blocked or ignored, the kernel (in |force_sig_info|) unblocks it and
    // sets its disposition to SIG_DFL. It never undoes this (probably
    // because it expects the signal to be fatal, which it always would be
    // unless a ptracer intercepts the signal as we do). Therefore, if the
    // signal was generated for rr's purposes, we need to restore the signal
    // state ourselves.
    if (sig == SIGSEGV &&
        (try_handle_trapped_instruction(t, si) || try_grow_map(t, si))) {
      if (signal_was_blocked || t->is_sig_ignored(sig)) {
        restore_signal_state(t, sig, signal_was_blocked);
      }
      return SIGNAL_HANDLED;
    }

    // Since we're not undoing the kernel's changes, update our signal handler
    // state to match the kernel's.
    if (signal_was_blocked || t->is_sig_ignored(sig)) {
      t->did_set_sig_handler_default(sig);
    }
  }

  if (!VirtualPerfCounterMonitor::is_virtual_perf_counter_signal(si)) {
    /* We have to check for a desched event first, because for
     * those we *do not* want to (and cannot, most of the time)
     * step the tracee out of the syscallbuf code before
     * attempting to deliver the signal. */
    if (t->session().syscallbuf_desched_sig() == si->si_signo &&
        si->si_code == POLL_IN) {
      handle_desched_event(t);
      return SIGNAL_HANDLED;
    }

    if (!is_safe_to_deliver_signal(t, si)) {
      return DEFER_SIGNAL;
    }

    if (!t->set_siginfo_for_synthetic_SIGCHLD(si)) {
      return DEFER_SIGNAL;
    }

    if (sig == PerfCounters::TIME_SLICE_SIGNAL) {
      t->push_event(Event::sched());
      return SIGNAL_HANDLED;
    }
  } else {
    // Clear the magic flag so it doesn't leak into the program.
    si->si_errno = 0;
  }

  /* This signal was generated by the program or an external
   * source; record it normally. */

  if (t->emulate_ptrace_stop(WaitStatus::for_stop_sig(sig), si)) {
    // Record an event so that replay progresses the tracee to the
    // current point before we notify the tracer.
    // If the signal is deterministic, record it as an EV_SIGNAL so that
    // we replay it using the deterministic-signal replay path. This is
    // more efficient than emulate_async_signal. Also emulate_async_signal
    // currently assumes it won't encounter a deterministic SIGTRAP (due to
    // a hardcoded breakpoint in the tracee).
    if (deterministic == DETERMINISTIC_SIG) {
      t->record_event(Event(EV_SIGNAL, SignalEvent(*si, deterministic,
                                                   t->sig_resolved_disposition(
                                                       sig, deterministic))));
    } else {
      t->record_event(Event::sched());
    }
    // The ptracer has been notified, so don't deliver the signal now.
    // The signal won't be delivered for real until the ptracer calls
    // PTRACE_CONT with the signal number (which we don't support yet!).
    return SIGNAL_PTRACE_STOP;
  }

  t->push_event(Event(
      EV_SIGNAL, SignalEvent(*si, deterministic,
                             t->sig_resolved_disposition(sig, deterministic))));
  return SIGNAL_HANDLED;
}

} // namespace rr