/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "record_signal.h"

#include <fcntl.h>
#include <linux/perf_event.h>
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/user.h>
#include <syscall.h>

#include "preload/preload_interface.h"

#include "AutoRemoteSyscalls.h"
#include "Flags.h"
#include "PerfCounters.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "TraceStream.h"
#include "VirtualPerfCounterMonitor.h"
#include "core.h"
#include "kernel_metadata.h"
#include "log.h"
#include "util.h"

using namespace std;

namespace rr {

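/**
 * If the application installed a non-default handler for |sig|, reinstall
 * it in the tracee from rr's saved copy of the sigaction. This undoes the
 * kernel resetting the disposition to SIG_DFL (see handle_signal below).
 */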
static void restore_sighandler_if_not_default(RecordTask* t, int sig) {
  if (t->sig_disposition(sig) != SIGNAL_DEFAULT) {
    LOG(debug) << "Restoring signal handler for " << signal_name(sig);
    AutoRemoteSyscalls remote(t);
    size_t sigset_size = sigaction_sigset_size(remote.arch());
    const vector<uint8_t>& sa = t->signal_action(sig);
    AutoRestoreMem child_sa(remote, sa.data(), sa.size());
    remote.infallible_syscall(syscall_number_for_rt_sigaction(remote.arch()),
                              sig, child_sa.get().as_int(), nullptr,
                              sigset_size);
  }
}

/**
 * Restore the blocked-ness and sigaction for |sig| from |t|'s local
 * copy.
 */
static void restore_signal_state(RecordTask* t, int sig,
                                 SignalBlocked signal_was_blocked) {
  restore_sighandler_if_not_default(t, sig);
  if (signal_was_blocked) {
    LOG(debug) << "Restoring signal blocked-ness for " << signal_name(sig);
    AutoRemoteSyscalls remote(t);
    size_t sigset_size = sigaction_sigset_size(remote.arch());
    vector<uint8_t> bytes;
    bytes.resize(sigset_size);
    memset(bytes.data(), 0, sigset_size);
    sig_set_t mask = signal_bit(sig);
    ASSERT(t, sigset_size >= sizeof(mask));
    memcpy(bytes.data(), &mask, sizeof(mask));
    AutoRestoreMem child_block(remote, bytes.data(), bytes.size());
    remote.infallible_syscall(syscall_number_for_rt_sigprocmask(remote.arch()),
                              SIG_BLOCK, child_block.get().as_int(), nullptr,
                              sigset_size);
    // We just changed the sigmask ourselves.
    t->invalidate_sigmask();
  }
}

/**
 * Return true if |t| was stopped because of a SIGSEGV resulting
 * from a disabled instruction (RDTSC/RDTSCP/CPUID) and |t| was updated
 * appropriately, false otherwise.
 */
static bool try_handle_trapped_instruction(RecordTask* t, siginfo_t* si) {
  ASSERT(t, si->si_signo == SIGSEGV);

  auto trapped_instruction = trapped_instruction_at(t, t->ip());
  switch (trapped_instruction) {
    case TrappedInstruction::RDTSC:
    case TrappedInstruction::RDTSCP:
      if (t->tsc_mode == PR_TSC_SIGSEGV) {
        return false;
      }
      break;
    case TrappedInstruction::CPUID:
      if (t->cpuid_mode == 0) {
        return false;
      }
      break;
    default:
      return false;
  }

  size_t len = trapped_instruction_len(trapped_instruction);
  ASSERT(t, len > 0);

  Registers r = t->regs();
  if (trapped_instruction == TrappedInstruction::RDTSC ||
      trapped_instruction == TrappedInstruction::RDTSCP) {
    if (trapped_instruction == TrappedInstruction::RDTSC &&
        t->vm()->monkeypatcher().try_patch_trapping_instruction(t, len, true)) {
      Event ev = Event::patch_syscall();
      ev.PatchSyscall().patch_trapping_instruction = true;
      t->record_event(ev);
      t->push_event(Event::noop());
      return true;
    }

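    // Emulate the instruction: execute rdtsc here in rr and hand the result
    // to the tracee through its rdtsc output registers.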
    unsigned long long current_time = rdtsc();
    r.set_rdtsc_output(current_time);

    LOG(debug) << " trapped for rdtsc: returning " << current_time;
  } else if (trapped_instruction == TrappedInstruction::CPUID) {
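    // CPUID inputs: on x86, syscallno() reads the eax register and cx()
    // reads ecx.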
    auto eax = r.syscallno();
    auto ecx = r.cx();
    auto cpuid_data = cpuid(eax, ecx);
    t->session().disable_cpuid_features()
        .amend_cpuid_data(eax, ecx, &cpuid_data);
    r.set_cpuid_output(cpuid_data.eax, cpuid_data.ebx, cpuid_data.ecx,
                       cpuid_data.edx);
    LOG(debug) << " trapped for cpuid: " << HEX(eax) << ":" << HEX(ecx);
  }

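  // Advance the tracee past the emulated instruction and record a
  // deterministic event so replay reproduces the same register effects.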
  r.set_ip(r.ip() + len);
  t->set_regs(r);
  t->record_event(Event::instruction_trap());

  if (t->retry_syscall_patching) {
    LOG(debug) << "Retrying deferred syscall patching";
    t->retry_syscall_patching = false;
    if (t->vm()->monkeypatcher().try_patch_trapping_instruction(t, len, false)) {
      // Instruction was patched. Emit event.
      auto ev = Event::patch_syscall();
      ev.PatchSyscall().patch_after_syscall = true;
      t->record_event(ev);
    }
  }

  t->push_event(Event::noop());
  return true;
}

/**
 * Return true if |t| was stopped because of a SIGSEGV and we want to retry
 * the instruction after emulating MAP_GROWSDOWN.
 */
static bool try_grow_map(RecordTask* t, siginfo_t* si) {
  ASSERT(t, si->si_signo == SIGSEGV);

  // Use kernel_abi to avoid odd inconsistencies between distros
  auto arch_si = reinterpret_cast<NativeArch::siginfo_t*>(si);
  auto addr = arch_si->_sifields._sigfault.si_addr_.rptr();

  if (t->vm()->has_mapping(addr)) {
    LOG(debug) << "try_grow_map " << addr << ": address already mapped";
    return false;
  }
  auto maps = t->vm()->maps_starting_at(floor_page_size(addr));
  auto it = maps.begin();
  if (it == maps.end()) {
    LOG(debug) << "try_grow_map " << addr << ": no later map to grow downward";
    return false;
  }
  if (!(it->map.flags() & MAP_GROWSDOWN)) {
    LOG(debug) << "try_grow_map " << addr << ": map is not MAP_GROWSDOWN ("
               << it->map << ")";
    return false;
  }
  if (addr >= page_size() && t->vm()->has_mapping(addr - page_size())) {
    LOG(debug) << "try_grow_map " << addr << ": address would be in guard page";
    return false;
  }
  remote_ptr<void> limit_bottom;
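  // Fetch the stack rlimit. On 32-bit x86 we call prlimit64 directly,
  // presumably because glibc's 32-bit rlim_t there could truncate large
  // limits.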
#if defined(__i386__)
  struct rlimit64 stack_limit;
  int ret = syscall(__NR_prlimit64, t->tid, RLIMIT_STACK, (void*)0, &stack_limit);
#else
  struct rlimit stack_limit;
  int ret = prlimit(t->tid, RLIMIT_STACK, NULL, &stack_limit);
#endif
  if (ret >= 0 && stack_limit.rlim_cur != RLIM_INFINITY) {
    limit_bottom = ceil_page_size(it->map.end() - stack_limit.rlim_cur);
    if (limit_bottom > addr) {
      LOG(debug) << "try_grow_map " << addr << ": RLIMIT_STACK exceeded";
      return false;
    }
  }

  // Try to grow by 64K at a time to reduce signal frequency.
  auto new_start = floor_page_size(addr);
  static const uintptr_t grow_size = 0x10000;
  if (it->map.start().as_int() >= grow_size) {
    auto possible_new_start = std::max(
        limit_bottom, std::min(new_start, it->map.start() - grow_size));
    // Ensure that no mapping exists between possible_new_start - page_size()
    // and new_start. If there is, possible_new_start is not valid, in which
    // case we just abandon the optimization.
    if (possible_new_start >= page_size() &&
        !t->vm()->has_mapping(possible_new_start - page_size()) &&
        t->vm()->maps_starting_at(possible_new_start - page_size())
                .begin()
                ->map.start() == it->map.start()) {
      new_start = possible_new_start;
    }
  }
  LOG(debug) << "try_grow_map " << addr << ": trying to grow map " << it->map;

  {
    AutoRemoteSyscalls remote(t, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS);
    remote.infallible_mmap_syscall_if_alive(
        new_start, it->map.start() - new_start, it->map.prot(),
        (it->map.flags() & ~MAP_GROWSDOWN) | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
  }

  KernelMapping km =
      t->vm()->map(t, new_start, it->map.start() - new_start, it->map.prot(),
                   it->map.flags() | MAP_ANONYMOUS, 0, string(),
                   KernelMapping::NO_DEVICE, KernelMapping::NO_INODE);
  t->trace_writer().write_mapped_region(t, km, km.fake_stat(), km.fsname(),
                                        vector<TraceRemoteFd>());
  // No need to flush syscallbuf here. It's safe to map these pages "early"
  // before they're really needed.
  t->record_event(Event::grow_map(), RecordTask::DONT_FLUSH_SYSCALLBUF);
  t->push_event(Event::noop());
  LOG(debug) << "try_grow_map " << addr << ": extended map "
             << t->vm()->mapping_of(addr).map;
  return true;
}

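/**
 * Disable the tracee's desched perf event so it stops generating desched
 * signals. No-op if the event fd isn't open.
 */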
void disarm_desched_event(RecordTask* t) {
  if (t->desched_fd.is_open() &&
      ioctl(t->desched_fd, PERF_EVENT_IOC_DISABLE, 0)) {
    FATAL() << "Failed to disarm desched event";
  }
}

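/**
 * Re-enable the tracee's desched perf event. No-op if the event fd isn't
 * open.
 */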
void arm_desched_event(RecordTask* t) {
  if (t->desched_fd.is_open() &&
      ioctl(t->desched_fd, PERF_EVENT_IOC_ENABLE, 0)) {
    FATAL() << "Failed to arm desched event";
  }
}

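/**
 * Read preload_thread_locals::stub_scratch_1 from the tracee. It holds the
 * code address the syscallbuf return stub will jump to (see
 * handle_syscallbuf_breakpoint below).
 */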
template <typename Arch>
static remote_code_ptr get_stub_scratch_1_arch(RecordTask* t) {
  auto remote_locals = AddressSpace::preload_thread_locals_start()
                           .cast<preload_thread_locals<Arch>>();
  auto remote_stub_scratch_1 = REMOTE_PTR_FIELD(remote_locals, stub_scratch_1);
  return t->read_mem(remote_stub_scratch_1).rptr().as_int();
}

static remote_code_ptr get_stub_scratch_1(RecordTask* t) {
  RR_ARCH_FUNCTION(get_stub_scratch_1_arch, t->arch(), t);
}

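/**
 * Copy |sz| bytes of preload_thread_locals::stub_scratch_2 from the tracee.
 * On aarch64 it holds the saved x15 and x30 values that the return stub
 * would restore.
 */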
template <typename Arch>
static void get_stub_scratch_2_arch(RecordTask* t, void* buff, size_t sz) {
  auto remote_locals = AddressSpace::preload_thread_locals_start()
                           .cast<preload_thread_locals<Arch>>();
  auto remote_stub_scratch_2 = REMOTE_PTR_FIELD(remote_locals, stub_scratch_2);
  t->read_bytes_helper(remote_stub_scratch_2, sz, buff);
}

static void get_stub_scratch_2(RecordTask* t, void* buff, size_t sz) {
  RR_ARCH_FUNCTION(get_stub_scratch_2_arch, t->arch(), t, buff, sz);
}

/**
 * This function is responsible for handling breakpoints we set in syscallbuf
 * code to detect sigprocmask calls and syscallbuf exit. It's called when we
 * get a SIGTRAP. Returns true if the SIGTRAP was caused by one of our
 * breakpoints and should be hidden from the application.
 * If it was triggered by one of our breakpoints, we have to call
 * restore_sighandler_if_not_default(t, SIGTRAP) to make sure the SIGTRAP
 * handler is properly restored if the kernel cleared it.
 */
bool handle_syscallbuf_breakpoint(RecordTask* t) {
  if (t->is_at_syscallbuf_final_instruction_breakpoint()) {
    LOG(debug) << "Reached final syscallbuf instruction, emulating jump to "
                  "enable signal dispatch";
    // Emulate the effect of the return from syscallbuf.
    // On x86, this is a single instruction that jumps to the location stored in
    // preload_thread_locals::stub_scratch_1.
    // On aarch64, the target of the jump is an instruction that restores
    // x15 and x30 and then jumps back to the syscall.
    // To minimize the surprise to the tracee if we decide to deliver a signal,
    // we'll emulate the register restore and return directly to the syscall site.
    // The address in stub_scratch_1 is already the correct address for this.
    if (t->arch() == aarch64) {
      uint64_t x15_x30[2];
      get_stub_scratch_2(t, x15_x30, 16);
      Registers r = t->regs();
      r.set_x15(x15_x30[0]);
      r.set_xlr(x15_x30[1]);
      t->set_regs(r);
      t->count_direct_jump();
    }
    t->emulate_jump(get_stub_scratch_1(t));

    restore_sighandler_if_not_default(t, SIGTRAP);
    // Now we're back in application code so any pending stashed signals
    // will be handled.
    return true;
  }

  if (t->is_at_syscallstub_exit_breakpoint()) {
    LOG(debug) << "Reached syscallstub exit instruction, emulating return to "
                  "enable signal dispatch";
    ASSERT(t, t->arch() == aarch64 && t->syscallstub_exit_breakpoint);
    auto retaddr_addr =
        t->syscallstub_exit_breakpoint.to_data_ptr<uint8_t>() + 3 * 4;
    uint64_t retaddr;
    t->read_bytes_helper(retaddr_addr, sizeof(retaddr), &retaddr);
    Registers r = t->regs();
    r.set_ip(retaddr);
    t->set_regs(r);
    t->count_direct_jump();
    t->syscallstub_exit_breakpoint = nullptr;
    restore_sighandler_if_not_default(t, SIGTRAP);
    // Now we're back in application code so any pending stashed signals
    // will be handled.
    return true;
  }

  if (!t->is_at_syscallbuf_syscall_entry_breakpoint()) {
    return false;
  }

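  // We hit one of our syscall-entry breakpoints. Back the ip up over the
  // breakpoint instruction that was just executed.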
  Registers r = t->regs();
  r.set_ip(r.ip().undo_executed_bkpt(t->arch()));
  t->set_regs(r);

  if (t->is_at_traced_syscall_entry()) {
    // We will automatically dispatch stashed signals now since this is an
    // allowed place to dispatch signals.
    LOG(debug) << "Allowing signal dispatch at traced-syscall breakpoint";
    restore_sighandler_if_not_default(t, SIGTRAP);
    return true;
  }

  // We're at an untraced-syscall entry point.
  // To allow an AutoRemoteSyscall, we need to make sure desched signals are
  // disarmed (and rearmed afterward).
  bool armed_desched_event = t->read_mem(
      REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant));
  if (armed_desched_event) {
    disarm_desched_event(t);
  }
  restore_sighandler_if_not_default(t, SIGTRAP);
  if (armed_desched_event) {
    arm_desched_event(t);
  }

  // This is definitely a native-arch syscall.
  if (is_rt_sigprocmask_syscall(r.syscallno(), t->arch())) {
    // Don't proceed with this syscall. Emulate it returning EAGAIN.
    // Syscallbuf logic will retry using a traced syscall instead.
    r.set_syscall_result(-EAGAIN);
    r.set_ip(r.ip().increment_by_syscall_insn_length(t->arch()));
    t->set_regs(r);
    t->canonicalize_regs(t->arch());
    LOG(debug) << "Emulated EAGAIN to avoid untraced sigprocmask with pending "
                  "stashed signal";
    // Leave breakpoints enabled since we want to break at the traced-syscall
    // fallback for rt_sigprocmask.
    return true;
  }

  // We can proceed with the untraced syscall. Either it will complete and
  // execution will continue until we reach some point where we can deliver our
  // signal, or it will block at which point we'll be able to deliver our
  // signal.
  LOG(debug) << "Disabling breakpoints at untraced syscalls";
  t->break_at_syscallbuf_untraced_syscalls = false;
  return true;
}

/**
 * Handle a desched event that fired for |t|. The tracee's execution may be
 * advanced to the next syscall boundary, and an appropriate event (a noop,
 * or a desched plus interrupted-syscall event) is pushed onto |t|'s event
 * stack.
 */
static void handle_desched_event(RecordTask* t) {
  /* If the tracee isn't in the critical section where a desched
   * event is relevant, we can ignore it. See the long comments
   * in syscall_buffer.c.
   *
   * It's OK if the tracee is in the critical section for a
   * may-block syscall B, but this signal was delivered by an
   * event programmed by a previous may-block syscall A.
   *
   * If we're running in a signal handler inside an interrupted syscallbuf
   * system call, never do anything here. Syscall buffering is disabled and
   * the desched_signal_may_be_relevant was set by the outermost syscallbuf
   * invocation.
   */
  if (!t->read_mem(REMOTE_PTR_FIELD(t->syscallbuf_child,
                                    desched_signal_may_be_relevant)) ||
      t->running_inside_desched()) {
    LOG(debug) << " (not entering may-block syscall; resuming)";
    /* We have to disarm the event just in case the tracee
     * has cleared the relevancy flag, but not yet
     * disarmed the event itself. */
    disarm_desched_event(t);
    t->push_event(Event::noop());
    return;
  }

  /* TODO: how can signals interrupt us here? */

  /* The desched event just fired. That implies that the
   * arm-desched ioctl went into effect, and that the
   * disarm-desched syscall didn't take effect. Since a signal
   * is pending for the tracee, if the tracee was in a
   * syscall, Linux has exited it with an -ERESTART* error code.
   * That means the tracee is about to (re-)enter either
   *
   *  1. buffered syscall
   *  2. disarm-desched ioctl syscall
   *
   * We can figure out which one by simply issuing a
   * ptrace(SYSCALL) and examining the tracee's registers.
   *
   * If the tracee enters the disarm-desched ioctl, it's going
   * to commit a record of the buffered syscall to the
   * syscallbuf, and we can safely send the tracee back on its
   * way, ignoring the desched completely.
   *
   * If it enters the buffered syscall, then the desched event
   * has served its purpose and we need to prepare the tracee to
   * be context-switched.
   *
   * An annoyance of the desched signal is that when the tracee
   * is descheduled in interval (C) above, we normally (see
   * below) see *two* signals. The current theory of what's
   * happening is
   *
   *  o child gets descheduled, bumps counter to i and schedules
   *    signal
   *  o signal notification "schedules" child, but it doesn't
   *    actually run any application code
   *  o child is being ptraced, so we "deschedule" child to
   *    notify parent and bump counter to i+1. (The parent
   *    hasn't had a chance to clear the counter yet.)
   *  o another counter signal is generated, but signal is
   *    already pending so this one is queued
   *  o parent is notified and sees counter value i+1
   *  o parent stops delivery of first signal and disarms
   *    counter
   *  o second signal dequeued and delivered, notifying parent
   *    (counter is disarmed now, so no pseudo-desched possible
   *    here)
   *  o parent is notified and sees counter value i+1 again
   *  o parent stops delivery of second signal and we continue on
   *
   * So we "work around" this by the tracer expecting two signal
   * notifications, and silently discarding both.
   *
   * One really fun edge case is that sometimes the desched
   * signal will interrupt the arm-desched syscall itself.
   * Continuing to the next syscall boundary seems to restart
   * the arm-desched syscall, and advancing to the boundary
   * again exits it and we start receiving desched signals
   * again.
   *
   * That may be a kernel bug, but we handle it by just
   * continuing until we've advanced past the arm-desched
   * syscall *and* stopped seeing signals. */

  const auto untraced_record_only_entry =
      uintptr_t(RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY);
  auto syscall_entry_ip = t->ip().decrement_by_syscall_insn_length(t->arch());
  if (syscall_entry_ip == remote_code_ptr(untraced_record_only_entry) &&
      t->regs().syscall_result_signed() == -EFAULT) {
    intptr_t syscallno;
    if (t->arch() == aarch64) {
      // Untraced syscall, we may not have set original_syscallno for this on
      // aarch64.
      syscallno = t->regs().syscallno();
    } else {
      // On x86, the syscall number is overwritten by the return value.
      ASSERT(t, is_x86ish(t->arch()));
      syscallno = t->regs().original_syscallno();
    }
    if (syscallno == syscall_number_for_getsockopt(t->arch())) {
      // We've observed interrupted getsockopt syscalls returning EFAULT
      // rather than the normal ERESTART*.
      // This is a kernel bug caused by CONFIG_BPFILTER_UMH.
      // Try to reduce the effect caused by rr-generated signals
      // by manually restarting the syscall
      // (since the previous syscall returned EFAULT,
      // we would in the worst case just get another EFAULT).
      // Note that setting the syscall result to ERESTART* wouldn't work on
      // aarch64 if arg1 has been overwritten by AutoRemoteSyscalls.
      auto r = t->regs();
      r.set_ip(syscall_entry_ip);
      if (t->arch() == aarch64) {
        // On aarch64, we need to restore arg1 from the stack argument saved
        // by the syscallbuf.
        auto orig_arg1_ptr = r.sp() + sizeof(long);
        auto orig_arg1 = t->read_mem(orig_arg1_ptr.cast<long>());
        r.set_arg1(orig_arg1);
      } else {
        ASSERT(t, is_x86ish(t->arch()));
        // On x86, we need to restore the syscall number.
        r.set_syscallno(syscallno);
      }
      t->set_regs(r);
    }
  }

  while (true) {
    // Prevent further desched notifications from firing
    // while we're advancing the tracee. We're going to
    // leave it in a consistent state anyway, so the event
    // is no longer useful. We have to do this in each
    // loop iteration because a restarted arm-desched
    // syscall may have re-armed the event.
    disarm_desched_event(t);

    if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT,
                             RESUME_UNLIMITED_TICKS)) {
      LOG(debug) << " (got exit, bailing out)";
      t->push_event(Event::noop());
      return;
    }

    if (t->status().is_syscall()) {
      t->apply_syscall_entry_regs();
      if (t->is_arm_desched_event_syscall()) {
        continue;
      }
      break;
    }
    if (t->ptrace_event() == PTRACE_EVENT_SECCOMP) {
      ASSERT(t,
             t->session().syscall_seccomp_ordering() ==
                 Session::SECCOMP_BEFORE_PTRACE_SYSCALL);
      // This is the old kernel event ordering. This must be a SECCOMP event
      // for the buffered syscall; it's not rr-generated because this is an
      // untraced syscall, but it could be generated by a tracee's
      // seccomp filter.
      break;
    }

    // Completely ignore spurious desched signals and
    // signals that aren't going to be delivered to the
    // tracee.
    //
    // Also ignore time-slice signals. If the tracee ends
    // up at the disarm-desched ioctl, we'll reschedule it
    // with the ticks interrupt still programmed. At worst,
    // the tracee will get an extra time-slice out of
    // this, on average, so we don't worry too much about
    // it.
    //
    // TODO: it's theoretically possible for this to
    // happen an unbounded number of consecutive times
    // and the tracee is never switched out.
    int sig = t->stop_sig();
    ASSERT(t, sig) << "expected stop-signal, got " << t->status();
    if (SIGTRAP == sig && handle_syscallbuf_breakpoint(t)) {
      // We stopped at a breakpoint on an untraced may-block syscall.
      // This can't be relevant to us since sigprocmask isn't may-block.
      LOG(debug) << " disabling breakpoints on untraced syscalls";
      continue;
    }
    if (t->session().syscallbuf_desched_sig() == sig ||
        PerfCounters::TIME_SLICE_SIGNAL == sig || t->is_sig_ignored(sig)) {
      LOG(debug) << " dropping ignored " << signal_name(sig);
      continue;
    }

    LOG(debug) << " stashing " << signal_name(sig);
    t->stash_sig();
  }

  if (t->is_disarm_desched_event_syscall()) {
    LOG(debug)
        << " (at disarm-desched, so finished buffered syscall; resuming)";
    t->push_event(Event::noop());
    return;
  }

  if (t->desched_rec()) {
    // We're already processing a desched. We probably reexecuted the
    // system call (e.g. because a signal was processed) and the syscall
    // blocked again. Carry on with the current desched.
  } else {
    /* This prevents the syscallbuf record counter from being
     * reset until we've finished guiding the tracee through this
     * interrupted call. We use the record counter for
     * assertions. */
    ASSERT(t, !t->delay_syscallbuf_reset_for_desched);
    t->delay_syscallbuf_reset_for_desched = true;
    LOG(debug) << "Desched initiated";

    /* The tracee is (re-)entering the buffered syscall. Stash
     * away this breadcrumb so that we can figure out what syscall
     * the tracee was in, and how much "scratch" space it carved
     * off the syscallbuf, if needed. */
    remote_ptr<const struct syscallbuf_record> desched_rec =
        t->next_syscallbuf_record();
    t->push_event(DeschedEvent(desched_rec));
    int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));

    /* The descheduled syscall was interrupted by a signal, like
     * all other may-restart syscalls, with the exception that
     * this one has already been restarted (which we'll detect
     * back in the main loop). */
    t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
    SyscallEvent& ev = t->ev().Syscall();
    ev.desched_rec = desched_rec;
  }

  SyscallEvent& ev = t->ev().Syscall();
  ev.regs = t->regs();
  /* For some syscalls (at least poll) but not all (at least not read),
   * repeated cont_syscall()s above of the same interrupted syscall
   * can set $orig_eax to 0 ... for unclear reasons. Fix that up here
   * otherwise we'll get a divergence during replay, which will not
   * encounter this problem.
   */
  int call = t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(), syscallno));
  ev.regs.set_original_syscallno(call);
  t->set_regs(ev.regs);
  // runnable_state_changed will observe us entering this syscall and change
  // state to ENTERING_SYSCALL

  LOG(debug) << " resuming (and probably switching out) blocked `"
             << syscall_name(call, ev.arch()) << "'";
}

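/**
 * Return true if the signal described by |si| can be delivered to |t| at
 * its current execution point without corrupting syscallbuf state; false
 * means delivery must be deferred.
 */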
static bool is_safe_to_deliver_signal(RecordTask* t, siginfo_t* si) {
  if (!t->is_in_syscallbuf()) {
    /* The tracee is outside the syscallbuf code,
     * so in most cases can't possibly affect
     * syscallbuf critical sections. The
     * exception is signal handlers "re-entering"
     * desched'd syscalls, which are OK. */
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because not in syscallbuf";
    return true;
  }

  // Note that this will never fire on aarch64 in a signal stop
  // since the ip has been moved to the syscall entry.
  // We will catch it in the traced_syscall_entry case below.
  // We will miss the exit for rrcall_notify_syscall_hook_exit
  // but that should not be a big problem.
  if (t->is_in_traced_syscall()) {
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because in traced syscall";
    return true;
  }

  // Don't deliver signals just before entering rrcall_notify_syscall_hook_exit.
  // At that point, notify_on_syscall_hook_exit will be set, but we have
  // passed the point at which syscallbuf code has checked that flag.
  // Replay will set notify_on_syscall_hook_exit when we replay towards the
  // rrcall_notify_syscall_hook_exit *after* handling this signal, but
  // that will be too late for syscallbuf to notice.
  // It's OK to delay signal delivery until after rrcall_notify_syscall_hook_exit
  // anyway.
  if (t->is_at_traced_syscall_entry() &&
      !is_rrcall_notify_syscall_hook_exit_syscall(t->regs().syscallno(),
                                                  t->arch())) {
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because at entry to traced syscall";
    return true;
  }

  // On aarch64, the untraced-syscall range here includes both the entry and
  // the exit if we are at a signal stop.
  if (t->is_in_untraced_syscall() && t->desched_rec()) {
    // Untraced syscalls always use the architecture of the process
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because tracee interrupted by desched of "
               << syscall_name(t->read_mem(REMOTE_PTR_FIELD(t->desched_rec(),
                                                            syscallno)),
                               t->arch());
    return true;
  }

  if (t->is_in_untraced_syscall() && si->si_signo == SIGSYS &&
      si->si_code == SYS_SECCOMP) {
    LOG(debug) << "Safe to deliver signal at " << t->ip()
               << " because signal is seccomp trap.";
    return true;
  }

  // If the syscallbuf buffer hasn't been created yet, just delay the signal
  // with no need to set notify_on_syscall_hook_exit; the signal will be
  // delivered when rrcall_init_buffers is called.
  if (t->syscallbuf_child) {
    if (t->read_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, locked)) & 2) {
      LOG(debug) << "Safe to deliver signal at " << t->ip()
                 << " because the syscallbuf is locked";
      return true;
    }

    // A signal (e.g. seccomp SIGSYS) interrupted an untraced syscall in a
    // non-restartable way. Defer it until SYS_rrcall_notify_syscall_hook_exit.
    if (t->is_in_untraced_syscall()) {
      // Our emulation of SYS_rrcall_notify_syscall_hook_exit clears this flag.
      t->write_mem(
          REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit),
          (uint8_t)1);
    }
  }

  LOG(debug) << "Not safe to deliver signal at " << t->ip();
  return false;
}

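/**
 * Dispatch the signal described by |si| for |t|. Returns SIGNAL_HANDLED if
 * the signal was processed (recorded, emulated or absorbed), DEFER_SIGNAL
 * if it isn't currently safe to deliver it, or SIGNAL_PTRACE_STOP if an
 * emulated ptrace-stop was triggered instead.
 */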
SignalHandled handle_signal(RecordTask* t, siginfo_t* si,
                            SignalDeterministic deterministic,
                            SignalBlocked signal_was_blocked) {
  int sig = si->si_signo;
  LOG(debug) << t->tid << ": handling signal " << signal_name(sig)
             << " (pevent: " << ptrace_event_name(t->ptrace_event())
             << ", event: " << t->ev() << ")";

  // Conservatively invalidate the sigmask in case just accepting a signal has
  // sigmask effects.
  t->invalidate_sigmask();

  if (deterministic == DETERMINISTIC_SIG) {
    // When a deterministic signal is triggered, but the signal is currently
    // blocked or ignored, the kernel (in |force_sig_info|) unblocks it and
    // sets its disposition to SIG_DFL. It never undoes this (probably
    // because it expects the signal to be fatal, which it always would be
    // unless a ptracer intercepts the signal as we do). Therefore, if the
    // signal was generated for rr's purposes, we need to restore the signal
    // state ourselves.
    if (sig == SIGSEGV &&
        (try_handle_trapped_instruction(t, si) || try_grow_map(t, si))) {
      if (signal_was_blocked || t->is_sig_ignored(sig)) {
        restore_signal_state(t, sig, signal_was_blocked);
      }
      return SIGNAL_HANDLED;
    }

    // Since we're not undoing the kernel's changes, update our signal handler
    // state to match the kernel's.
    if (signal_was_blocked || t->is_sig_ignored(sig)) {
      t->did_set_sig_handler_default(sig);
    }
  }

  if (!VirtualPerfCounterMonitor::is_virtual_perf_counter_signal(si)) {
    /* We have to check for a desched event first, because for
     * those we *do not* want to (and cannot, most of the time)
     * step the tracee out of the syscallbuf code before
     * attempting to deliver the signal. */
    if (t->session().syscallbuf_desched_sig() == si->si_signo &&
        si->si_code == POLL_IN) {
      handle_desched_event(t);
      return SIGNAL_HANDLED;
    }

    if (!is_safe_to_deliver_signal(t, si)) {
      return DEFER_SIGNAL;
    }

    if (!t->set_siginfo_for_synthetic_SIGCHLD(si)) {
      return DEFER_SIGNAL;
    }

    if (sig == PerfCounters::TIME_SLICE_SIGNAL) {
      t->push_event(Event::sched());
      return SIGNAL_HANDLED;
    }
  } else {
    // Clear the magic flag so it doesn't leak into the program.
    si->si_errno = 0;
  }

  /* This signal was generated by the program or an external
   * source; record it normally. */

  if (t->emulate_ptrace_stop(WaitStatus::for_stop_sig(sig), si)) {
    // Record an event so that replay progresses the tracee to the
    // current point before we notify the tracer.
    // If the signal is deterministic, record it as an EV_SIGNAL so that
    // we replay it using the deterministic-signal replay path. This is
    // more efficient than emulate_async_signal. Also emulate_async_signal
    // currently assumes it won't encounter a deterministic SIGTRAP (due to
    // a hardcoded breakpoint in the tracee).
    if (deterministic == DETERMINISTIC_SIG) {
      t->record_event(Event(EV_SIGNAL, SignalEvent(*si, deterministic,
                                                   t->sig_resolved_disposition(
                                                       sig, deterministic))));
    } else {
      t->record_event(Event::sched());
    }
    // The ptracer has been notified, so don't deliver the signal now.
    // The signal won't be delivered for real until the ptracer calls
    // PTRACE_CONT with the signal number (which we don't support yet!).
    return SIGNAL_PTRACE_STOP;
  }

  t->push_event(Event(
      EV_SIGNAL, SignalEvent(*si, deterministic,
                             t->sig_resolved_disposition(sig, deterministic))));
  return SIGNAL_HANDLED;
}

} // namespace rr