| /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ |
| |
| #include "RecordSession.h" |
| |
| #include <elf.h> |
| #include <limits.h> |
| #include <linux/capability.h> |
| #include <linux/futex.h> |
| #include <sys/types.h> |
| #include <sys/socket.h> |
| |
| #include <algorithm> |
| #include <sstream> |
| #include <string> |
| |
| #include "AutoRemoteSyscalls.h" |
| #include "ElfReader.h" |
| #include "Flags.h" |
| #include "RecordTask.h" |
| #include "TraceeAttentionSet.h" |
| #include "VirtualPerfCounterMonitor.h" |
| #include "WaitManager.h" |
| #include "core.h" |
| #include "ftrace.h" |
| #include "kernel_metadata.h" |
| #include "kernel_supplement.h" |
| #include "log.h" |
| #include "record_signal.h" |
| #include "record_syscall.h" |
| #include "seccomp-bpf.h" |
| |
| namespace rr { |
| |
| // Undef si_addr_lsb since it's an alias for a field name that doesn't exist, |
| // and we need to use the actual field name. |
| #ifdef si_addr_lsb |
| #undef si_addr_lsb |
| #endif |
| |
| using namespace rr; |
| using namespace std; |
| |
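| // The kernel's robust-futex ABI stores a flag in bit 0 of each list |
| // pointer (it marks PI futexes), so we must clear that bit to recover |
| // the actual entry address before dereferencing. |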
| template <typename T> static remote_ptr<T> mask_low_bit(remote_ptr<T> p) { |
| return p.as_int() & ~uintptr_t(1); |
| } |
| |
| template <typename Arch> |
| static void record_robust_futex_change( |
| RecordTask* t, const typename Arch::robust_list_head& head, |
| remote_ptr<void> base) { |
| if (base.is_null()) { |
| return; |
| } |
| remote_ptr<void> futex_void_ptr = base + head.futex_offset; |
| auto futex_ptr = futex_void_ptr.cast<uint32_t>(); |
| // We can't just record the current futex value because at this point |
| // in task exit the robust futex handling has not happened yet. So we have |
| // to emulate what the kernel will do! |
| bool ok = true; |
| uint32_t val = t->read_mem(futex_ptr, &ok); |
| if (!ok) { |
| return; |
| } |
| if (pid_t(val & FUTEX_TID_MASK) != t->own_namespace_rec_tid) { |
| return; |
| } |
| val = (val & FUTEX_WAITERS) | FUTEX_OWNER_DIED; |
| // Update memory now so that the kernel doesn't decide to do it later, at |
| // a time that might race with other tracee execution. |
| t->write_mem(futex_ptr, val); |
| t->record_local(futex_ptr, &val); |
| } |
| |
| /** |
| * Any user-space writes performed by robust futex handling are captured here. |
| * They must be emulated during replay; the kernel will not do it for us |
| * during replay because the TID value in each futex is the recorded |
| * TID, not the actual TID of the dying task. |
| */ |
| template <typename Arch> |
| static void record_robust_futex_changes_arch(RecordTask* t) { |
| if (t->did_record_robust_futex_changes) { |
| return; |
| } |
| t->did_record_robust_futex_changes = true; |
| |
| auto head_ptr = t->robust_list().cast<typename Arch::robust_list_head>(); |
| if (head_ptr.is_null()) { |
| return; |
| } |
| ASSERT(t, t->robust_list_len() == sizeof(typename Arch::robust_list_head)); |
| bool ok = true; |
| auto head = t->read_mem(head_ptr, &ok); |
| if (!ok) { |
| return; |
| } |
| record_robust_futex_change<Arch>(t, head, |
| mask_low_bit(head.list_op_pending.rptr())); |
| for (auto current = mask_low_bit(head.list.next.rptr()); |
| current.as_int() != head_ptr.as_int();) { |
| record_robust_futex_change<Arch>(t, head, current); |
| auto next = t->read_mem(current, &ok); |
| if (!ok) { |
| return; |
| } |
| current = mask_low_bit(next.next.rptr()); |
| } |
| } |
| |
| static void record_robust_futex_changes(RecordTask* t) { |
| RR_ARCH_FUNCTION(record_robust_futex_changes_arch, t->arch(), t); |
| } |
| |
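| // Write a task-exit event to the trace; if |t| is its thread-group |
| // leader, also record the exit status for the whole thread group. |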
| static void record_exit_trace_event(RecordTask* t, WaitStatus exit_status) { |
| t->session().trace_writer().write_task_event( |
| TraceTaskEvent::for_exit(t->tid, exit_status)); |
| if (t->thread_group()->tgid == t->tid) { |
| t->thread_group()->exit_status = exit_status; |
| } |
| } |
| |
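| /** |
| * Heuristically determine whether |t| appears to be stopped at a syscall |
| * entry. Used below to decide whether a syscall-entry event still needs |
| * to be recorded before the task exits. |
| */ |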
| static bool looks_like_syscall_entry(RecordTask* t) { |
| bool ok; |
| bool at_syscall = is_at_syscall_instruction(t, |
| t->regs().ip().decrement_by_syscall_insn_length(t->arch()), &ok); |
| // It's possible for the task to have died (e.g. if it got signaled twice |
| // in rapid succession). In that case, try to just go by register contents. |
| if (ok && !at_syscall) { |
| return false; |
| } |
| if (is_x86ish(t->arch())) { |
| // On x86, rax gets set to -ENOSYS on syscall entry; elsewhere this does |
| // not happen. Further, even if we did ask about the syscallno, it might |
| // have been reset by the signal handler. However, on non-x86 platforms we |
| // currently count all taken branches rather than only conditional ones, |
| // so it should be impossible to see the same syscall ip twice without |
| // intervening ticks, and the check following these conditions should be |
| // sufficient there. |
| return t->regs().original_syscallno() >= 0 && |
| t->regs().syscall_result_signed() == -ENOSYS; |
| } else if (t->arch() == aarch64) { |
| // We recorded when we saw the last syscall entry, |
| // so just use that to determine if we've already saved it in the trace. |
| if (t->ticks_at_last_syscall_entry == t->tick_count() && |
| t->ip_at_last_syscall_entry == t->regs().ip()) { |
| return !t->last_syscall_entry_recorded; |
| } |
| } |
| // Recording a sched event here is better than a spurious syscall event: |
| // syscall entry does not visibly modify registers, so when replay hits |
| // the sched event the register state will still match. |
| return ok; |
| } |
| |
| /** |
| * Return true if we handle a ptrace exit event for task t. When this returns |
| * true, t may have been deleted. |
| */ |
| static bool handle_ptrace_exit_event(RecordTask* t) { |
| if (t->was_reaped()) { |
| if (t->handled_ptrace_exit_event()) { |
| t->did_reach_zombie(); |
| return true; |
| } |
| } else if (t->ptrace_event() != PTRACE_EVENT_EXIT) { |
| return false; |
| } |
| |
| if (t->stable_exit || t->was_reaped()) { |
| LOG(debug) << "stable exit"; |
| } else { |
| if (!t->may_be_blocked()) { |
| // The task might have been hit by a SIGKILL or a SECCOMP_RET_KILL, in |
| // which case there might be some execution since its last recorded event |
| // that we need to replay. |
| // There's a weird case (in 4.13.5-200.fc26.x86_64 at least) where the |
| // task can enter the kernel but instead of receiving a syscall ptrace |
| // event, we receive a PTRACE_EVENT_EXIT due to a concurrent execve |
| // (and probably a concurrent SIGKILL could do the same). The task state |
| // has been updated to reflect syscall entry. If we record a SCHED in |
| // that state replay of the SCHED will fail. So detect that state and fix |
| // it up. |
| // If we got killed in an untraced syscall on AArch64, |
| // it is difficult/impossible to tell if the value of x0 has been overwritten |
| // with the syscall result/error number |
| // and it's even harder to recover the correct value of x0. |
| // Simply ignore these since we weren't going to record them anyway. |
| if (looks_like_syscall_entry(t) && !t->is_in_untraced_syscall()) { |
| // Either we're in a syscall, or we're immediately after a syscall |
| // and it exited. |
| if (t->ticks_at_last_recorded_syscall_exit == t->tick_count() && |
| t->regs().ip() == t->ip_at_last_recorded_syscall_exit) { |
| LOG(debug) << "Nothing to record after PTRACE_EVENT_EXIT"; |
| // It's the latter case; do nothing. |
| } else { |
| // It's the former case ... probably. Theoretically we could have |
| // re-executed a syscall without any ticks in between, but that seems |
| // highly improbable. |
| // Record the syscall-entry event that we otherwise failed to record. |
| t->canonicalize_regs(t->arch()); |
| auto r = t->regs(); |
| if (t->arch() == aarch64) { |
| // On AArch64, when we get here, there are 3 different cases, |
| // 1. EXIT before we hit the syscall entry stop |
| // 2. EXIT after syscall entry stop but |
| // before the result (X0) is overwritten |
| // 3. EXIT after syscall entry stop and |
| // after the result (X0) is overwritten |
| // (i.e. after the syscall but we got an EXIT |
| // before the syscall exit stop.) |
| |
| // We detect the first case based on `*_at_last_syscall_entry` |
| // set by `apply_syscall_entry_regs`, and trust the current |
| // values of `x0` and `x8`. |
| |
| // For the second and third cases, we rely on the syscall enter stop |
| // to set the orig_arg1 and original_syscallno correctly. |
| if (t->ticks_at_last_syscall_entry == t->tick_count() && |
| t->ip_at_last_syscall_entry == r.ip()) { |
| // We need to rely on the saved `orig_arg1` since in the third case |
| // the `x0` may already be overwritten. |
| // The assertion here assumes that |
| // `apply_syscall_entry_regs` is called when we enter the syscall |
| // and `x8` still holds the correct syscall number |
| // when we hit the process exit stop. |
| ASSERT(t, r.original_syscallno() == r.syscallno()) |
| << "syscallno not saved by syscall enter handler: " << r; |
| r.set_arg1(r.orig_arg1()); |
| } else { |
| r.set_original_syscallno(r.syscallno()); |
| } |
| } |
| // Assume it's a native-arch syscall. If it isn't, it doesn't matter |
| // all that much since we aren't actually going to do anything with it |
| // in this task. |
| // Avoid calling detect_syscall_arch here since it could fail if the |
| // task is already completely dead and gone. |
| SyscallEvent event(r.original_syscallno(), t->arch()); |
| event.state = ENTERING_SYSCALL; |
| // Don't try to reset the syscallbuf here. The task may be exiting |
| // while in arbitrary syscallbuf code. And of course, because it's |
| // exiting, it doesn't matter if we don't reset the syscallbuf. |
| t->record_event(event, RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::DONT_RESET_SYSCALLBUF, &r); |
| } |
| } else { |
| // Don't try to reset the syscallbuf here. The task may be exiting |
| // while in arbitrary syscallbuf code. And of course, because it's |
| // exiting, it doesn't matter if we don't reset the syscallbuf. |
| // XXX flushing the syscallbuf may be risky too... |
| auto event = Event::sched(); |
| // When replaying this SCHED, we won't proceed past the `syscall_hook` |
| // entry point. Code inside the syscallbuf may be in a bad state during |
| // replay because we didn't save buffered syscalls. |
| event.Sched().in_syscallbuf_syscall_hook = t->syscallbuf_code_layout.syscallbuf_syscall_hook; |
| t->record_event(event, RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::DONT_RESET_SYSCALLBUF); |
| } |
| } |
| /* XXX: We could try to find some tasks here to unmap our buffers, but it |
| * seems hardly worth it. |
| * Mark the buffers as gone only after recording events, since recording |
| * them may need to flush the syscallbuf. */ |
| t->destroy_buffers(nullptr, nullptr); |
| } |
| |
| WaitStatus exit_status; |
| if (t->was_reaped()) { |
| exit_status = t->status(); |
| } else { |
| record_robust_futex_changes(t); |
| |
| unsigned long msg = 0; |
| // If ptrace_if_stopped fails, then the task has been killed by SIGKILL |
| // or equivalent. |
| if (t->ptrace_if_stopped(PTRACE_GETEVENTMSG, nullptr, &msg)) { |
| exit_status = WaitStatus(msg); |
| } else { |
| exit_status = WaitStatus::for_fatal_sig(SIGKILL); |
| } |
| } |
| |
| t->did_handle_ptrace_exit_event(); |
| |
| // If we died because of a coredumping signal, that is a barrier event, and |
| // every task in the address space needs to pass its PTRACE_EXIT_EVENT before |
| // they proceed to (potentially hidden) zombie state, so we can't wait for |
| // that to happen. |
| // Similarly we can't wait for this task to exit if there are other |
| // tasks in its pid namespace that need to exit and this is the last thread |
| // of pid-1 in that namespace, because the kernel must reap them before |
| // letting this task complete its exit. |
| bool may_wait_exit = !t->was_reaped() && !is_coredumping_signal(exit_status.fatal_sig()) && |
| !t->waiting_for_pid_namespace_tasks_to_exit(); |
| record_exit_trace_event(t, exit_status); |
| t->record_exit_event( |
| (!t->was_reaped() && !may_wait_exit) ? RecordTask::WRITE_CHILD_TID : RecordTask::KERNEL_WRITES_CHILD_TID); |
| if (!t->was_reaped()) { |
| t->proceed_to_exit(may_wait_exit); |
| } |
| t->do_ptrace_exit_stop(exit_status); |
| if (may_wait_exit) { |
| t->did_reach_zombie(); |
| } else if (!t->was_reaped()) { |
| t->waiting_for_reap = true; |
| } |
| return true; |
| } |
| |
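| /** |
| * Mark the current syscall event as entering the syscall, saving the |
| * argument registers on a fresh entry (so a later restart can be |
| * detected) or restoring them on a restart. |
| */ |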
| static void note_entering_syscall(RecordTask* t) { |
| ASSERT(t, EV_SYSCALL == t->ev().type()); |
| t->ev().Syscall().state = ENTERING_SYSCALL; |
| if (!t->ev().Syscall().is_restart) { |
| /* Save a copy of the arg registers so that we |
| * can use them to detect later restarted |
| * syscalls, if this syscall ends up being |
| * restarted. We have to save the registers |
| * in this rather awkward place because we |
| * need the original registers; the restart |
| * (if it's not a SYS_restart_syscall restart) |
| * will use the original registers. */ |
| t->ev().Syscall().regs = t->regs(); |
| } else { |
| t->ev().Syscall().regs.set_syscallno(t->regs().syscallno()); |
| // We may have intentionally stored the syscall result here. |
| // Now that we're safely past the signal delivery, make the |
| // registers look like they did at the original syscall entry |
| // again. |
| t->ev().Syscall().regs.set_arg1(t->ev().Syscall().regs.orig_arg1()); |
| if (t->arch() == aarch64) { |
| // We probably got here with a PTRACE_SYSCALL. The x7 |
| // value will be wrong due to the aarch64 kernel bug. |
| // Get it from the syscall event. |
| Registers r = t->regs(); |
| r.set_x7(t->ev().Syscall().regs.x7()); |
| t->set_regs(r); |
| } |
| } |
| } |
| |
| #if defined (__x86_64__) |
| static bool is_in_vsyscall(remote_code_ptr ip) |
| { |
| // This is hardcoded by the Linux ABI |
| remote_code_ptr vsyscall_start = 0xffffffffff600000; |
| remote_code_ptr vsyscall_end = 0xffffffffff601000; |
| return vsyscall_start <= ip && ip < vsyscall_end; |
| } |
| #else |
| static bool is_in_vsyscall(remote_code_ptr) |
| { |
| return false; |
| } |
| #endif |
| |
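| /** |
| * Handle a seccomp stop for a traced syscall: either process the syscall |
| * entry now or arrange for the upcoming ptrace syscall trap to do so, |
| * depending on the kernel's seccomp/ptrace event ordering. |
| */ |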
| void RecordSession::handle_seccomp_traced_syscall(RecordTask* t, |
| StepState* step_state, |
| RecordResult* result, |
| bool* did_enter_syscall) { |
| *did_enter_syscall = false; |
| |
| // Special case: If the tracee issues a vsyscall, we will get a seccomp trap, |
| // but no syscall traps whatsoever. In particular, we wouldn't see it during |
| // replay either. We try to monkeypatch the caller on the assumption that known |
| // callers of this (deprecated) interface all follow a common pattern. If we |
| // can't patch the caller, this is a fatal error, since the recording will |
| // otherwise be broken. |
| remote_code_ptr ip = t->regs().ip(); |
| if (is_in_vsyscall(ip)) { |
| remote_ptr<void> sp = t->regs().sp(); |
| // The kernel assumes the return address is on the stack - we do the same |
| remote_ptr<remote_code_ptr> ret_addr_addr = sp.cast<remote_code_ptr>(); |
| remote_code_ptr ret_addr = t->read_mem(ret_addr_addr); |
| |
| // Skip this syscall. We will attempt to patch it to the vdso entry and |
| // let the tracee retry there. |
| Registers regs = t->regs(); |
| regs.set_original_syscallno(-1); |
| // We can't modify the ip here, the kernel will kill the tracee with |
| // SIGSYS. Instead, we set a breakpoint at the return instruction. |
| t->set_regs(regs); |
| t->vm()->add_breakpoint(ret_addr, BKPT_INTERNAL); |
| while (true) { |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| // Tracee exited unexpectedly |
| return; |
| } |
| ASSERT(t, !t->ptrace_event()); |
| if (t->stop_sig() == syscallbuf_desched_sig()) { |
| continue; |
| } |
| if (t->stop_sig() == SIGTRAP && |
| is_kernel_trap(t->get_siginfo().si_code)) { |
| // Hit the breakpoint |
| break; |
| } |
| t->stash_sig(); |
| } |
| t->vm()->remove_breakpoint(ret_addr, BKPT_INTERNAL); |
| |
| ASSERT(t, t->regs().ip().undo_executed_bkpt(t->arch()) == ret_addr); |
| |
| // Now that we're in a sane state, ask the Monkeypatcher to try and patch |
| // that. |
| bool patch_ok = t->vm()->monkeypatcher().try_patch_vsyscall_caller(t, ret_addr); |
| ASSERT(t, patch_ok) << "The tracee issued a vsyscall to " << ip |
| << " but we failed to monkeypatch the caller (return address " |
| << ret_addr << ", sp=" << sp << "). Recording will not succeed. Exiting."; |
| |
| // Reset to the start of the region and continue |
| regs = t->regs(); |
| regs.set_ip(ret_addr.decrement_by_vsyscall_entry_length(t->arch())); |
| t->set_regs(regs); |
| |
| // We patched this syscall, record that |
| auto ev = Event::patch_syscall(); |
| ev.PatchSyscall().patch_vsyscall = true; |
| t->record_event(ev); |
| |
| step_state->continue_type = RecordSession::CONTINUE; |
| return; |
| } |
| |
| int syscallno = t->regs().original_syscallno(); |
| if (syscallno < 0) { |
| // negative syscall numbers after a SECCOMP event |
| // are treated as "skip this syscall". There will be one syscall event |
| // reported instead of two. So fake an enter-syscall event now. |
| // It doesn't really matter what the syscall-arch is. |
| t->canonicalize_regs(t->arch()); |
| if (syscall_seccomp_ordering_ == SECCOMP_BEFORE_PTRACE_SYSCALL) { |
| // If the ptrace entry stop hasn't happened yet, we're at a weird |
| // intermediate state where the behavior of the next PTRACE_SYSCALL |
| // will depend on the register state (i.e. whether we see an entry |
| // trap or proceed right to the exit trap). To make things easier |
| // on the rest of the system, do a fake syscall entry, then reset |
| // the register state. |
| Registers orig_regs = t->regs(); |
| Registers r = orig_regs; |
| r.set_original_syscallno(syscall_number_for_gettid(t->arch())); |
| t->set_regs(r); |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| // Tracee died unexpectedly. We did not enter a syscall. |
| // We shouldn't try to resume it now. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| return; |
| } |
| t->set_regs(orig_regs); |
| } |
| |
| // Don't continue yet. At the next iteration of record_step, we'll |
| // enter syscall_state_changed and that will trigger a continue to |
| // the syscall exit. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| if (!process_syscall_entry(t, step_state, result, t->arch())) { |
| return; |
| } |
| *did_enter_syscall = true; |
| return; |
| } |
| |
| if (syscall_seccomp_ordering_ == SECCOMP_BEFORE_PTRACE_SYSCALL) { |
| // The next continue needs to be a PTRACE_SYSCALL to observe |
| // the enter-syscall event. |
| step_state->continue_type = RecordSession::CONTINUE_SYSCALL; |
| } else { |
| ASSERT(t, syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP); |
| if (t->ev().is_syscall_event() && |
| t->ev().Syscall().state == PROCESSING_SYSCALL) { |
| // We did PTRACE_SYSCALL and already saw a syscall trap. Just ignore this. |
| LOG(debug) << "Ignoring SECCOMP syscall trap since we already got a " |
| "PTRACE_SYSCALL trap"; |
| // The next continue needs to be a PTRACE_SYSCALL to observe |
| // the exit-syscall event. |
| step_state->continue_type = RecordSession::CONTINUE_SYSCALL; |
| // Need to restore last_task_switchable since it will have been |
| // reset to PREVENT_SWITCH |
| last_task_switchable = t->ev().Syscall().switchable; |
| } else { |
| // We've already passed the PTRACE_SYSCALL trap for syscall entry, so |
| // we need to handle that now. |
| SupportedArch syscall_arch = t->detect_syscall_arch(); |
| t->canonicalize_regs(syscall_arch); |
| if (!process_syscall_entry(t, step_state, result, syscall_arch)) { |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| return; |
| } |
| *did_enter_syscall = true; |
| } |
| } |
| } |
| |
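| // Finish up after a SECCOMP_RET_TRAP'ed buffered syscall: mark the |
| // syscallbuf record as failed and aborted so that both the tracee's |
| // commit logic and replay know the syscall never ran. |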
| static void seccomp_trap_done(RecordTask* t) { |
| t->pop_seccomp_trap(); |
| |
| // It's safe to reset the syscall buffer now. |
| t->delay_syscallbuf_reset_for_seccomp_trap = false; |
| |
| t->write_and_record(REMOTE_PTR_FIELD(t->syscallbuf_child, failed_during_preparation), |
| (uint8_t)1); |
| if (EV_DESCHED == t->ev().type()) { |
| // Desched processing will do the rest for us |
| return; |
| } |
| |
| // Abort the current syscallbuf record, which corresponds to the syscall that |
| // wasn't actually executed due to seccomp. |
| t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, abort_commit), (uint8_t)1); |
| t->record_event(Event::syscallbuf_abort_commit()); |
| |
| // Run the syscall exit hook. This ensures we'll reset the buffer |
| // before we try to buffer another syscall. |
| t->write_mem( |
| REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit), |
| (uint8_t)1); |
| } |
| |
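| // Emulate the kernel's SECCOMP_RET_TRAP behavior: skip the syscall, |
| // synthesize the SIGSYS the kernel would deliver, and record enough |
| // state for replay to do the same. |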
| static void handle_seccomp_trap(RecordTask* t, |
| RecordSession::StepState* step_state, |
| uint16_t seccomp_data) { |
| // The architecture may be wrong, but that's ok, because an actual syscall |
| // entry did happen, so the registers are already updated according to the |
| // architecture of the system call. |
| t->canonicalize_regs(t->detect_syscall_arch()); |
| t->apply_syscall_entry_regs(); |
| |
| Registers r = t->regs(); |
| int syscallno = r.original_syscallno(); |
| // Cause kernel processing to skip the syscall |
| r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); |
| t->set_regs(r); |
| |
| bool syscall_entry_already_recorded = false; |
| if (t->ev().is_syscall_event()) { |
| // A syscall event was already pushed, probably because we did a |
| // PTRACE_SYSCALL to enter the syscall during handle_desched_event. Cancel |
| // that event now since the seccomp SIGSYS aborts it completely. |
| ASSERT(t, t->ev().Syscall().number == syscallno); |
| // Make sure any prepared syscall state is discarded and any temporary |
| // effects (e.g. redirecting pointers to scratch) undone. |
| rec_abort_prepared_syscall(t); |
| if (t->ev().type() == EV_SYSCALL_INTERRUPTION) { |
| // The event could be a syscall-interruption if it was pushed by |
| // `handle_desched_event`. In that case, it has not been recorded yet. |
| t->pop_syscall_interruption(); |
| } else { |
| t->pop_syscall(); |
| syscall_entry_already_recorded = true; |
| } |
| } |
| |
| if (t->is_in_untraced_syscall()) { |
| ASSERT(t, !t->delay_syscallbuf_reset_for_seccomp_trap); |
| // Don't reset the syscallbuf immediately after delivering the trap. We have |
| // to wait until this buffered syscall aborts completely before resetting |
| // the buffer. |
| t->delay_syscallbuf_reset_for_seccomp_trap = true; |
| |
| t->push_event(Event::seccomp_trap()); |
| |
| // desched may be armed but we're not going to execute the syscall, let |
| // alone block. If it fires, ignore it. |
| t->write_mem( |
| REMOTE_PTR_FIELD(t->syscallbuf_child, desched_signal_may_be_relevant), |
| (uint8_t)0); |
| } |
| |
| t->push_syscall_event(syscallno); |
| t->ev().Syscall().failed_during_preparation = true; |
| note_entering_syscall(t); |
| |
| if (t->is_in_untraced_syscall() && !syscall_entry_already_recorded) { |
| t->record_current_event(); |
| } |
| |
| // Use NativeArch here because different versions of system headers |
| // have inconsistent field naming. |
| union { |
| NativeArch::siginfo_t native_api; |
| siginfo_t linux_api; |
| } si; |
| memset(&si, 0, sizeof(si)); |
| si.native_api.si_signo = SIGSYS; |
| si.native_api.si_errno = seccomp_data; |
| si.native_api.si_code = SYS_SECCOMP; |
| si.native_api._sifields._sigsys._arch = to_audit_arch(r.arch()); |
| si.native_api._sifields._sigsys._syscall = syscallno; |
| // Documentation says that si_call_addr is the address of the syscall |
| // instruction, but in tests it's immediately after the syscall |
| // instruction. |
| si.native_api._sifields._sigsys._call_addr = t->ip().to_data_ptr<void>(); |
| LOG(debug) << "Synthesizing " << si.linux_api; |
| t->stash_synthetic_sig(si.linux_api, DETERMINISTIC_SIG); |
| |
| // Tests show that the current registers are preserved (on x86, eax/rax |
| // retains the syscall number). |
| r.set_syscallno(syscallno); |
| t->set_regs(r); |
| t->maybe_restore_original_syscall_registers(); |
| |
| if (t->is_in_untraced_syscall()) { |
| // For buffered syscalls, go ahead and record the exit state immediately. |
| t->ev().Syscall().state = EXITING_SYSCALL; |
| t->record_current_event(); |
| t->pop_syscall(); |
| |
| // The tracee is currently in the seccomp ptrace-stop. Advance it to the |
| // syscall-exit stop so that when we try to deliver the SIGSYS via |
| // PTRACE_SINGLESTEP, that doesn't trigger a SIGTRAP stop. |
| // If this fails, that's fine, we're not going to deliver the SIGSYS. |
| t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); |
| } |
| |
| // Don't continue yet. At the next iteration of record_step, if we |
| // recorded the syscall-entry we'll enter syscall_state_changed and |
| // that will trigger a continue to the syscall exit. If we recorded the |
| // syscall-exit we'll go straight into signal delivery. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| } |
| |
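| // Emulate SECCOMP_RET_ERRNO: skip the syscall and force its result to |
| // -seccomp_data, as the kernel would. |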
| static void handle_seccomp_errno(RecordTask* t, |
| RecordSession::StepState* step_state, |
| uint16_t seccomp_data) { |
| t->canonicalize_regs(t->detect_syscall_arch()); |
| |
| Registers r = t->regs(); |
| int syscallno = r.original_syscallno(); |
| // Cause kernel processing to skip the syscall |
| r.set_original_syscallno(SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO); |
| t->set_regs(r); |
| |
| if (!t->is_in_untraced_syscall()) { |
| t->push_syscall_event(syscallno); |
| // Note that the syscall failed. prepare_clone() needs to know |
| // this during replay of the syscall entry. |
| t->ev().Syscall().failed_during_preparation = true; |
| note_entering_syscall(t); |
| } |
| |
| r.set_syscall_result(-seccomp_data); |
| t->set_regs(r); |
| // Don't continue yet. At the next iteration of record_step, if we |
| // recorded the syscall-entry we'll enter syscall_state_changed and |
| // that will trigger a continue to the syscall exit. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| } |
| |
| bool RecordSession::handle_ptrace_event(RecordTask** t_ptr, |
| StepState* step_state, |
| RecordResult* result, |
| bool* did_enter_syscall) { |
| *did_enter_syscall = false; |
| |
| RecordTask* t = *t_ptr; |
| if (t->status().group_stop() || t->has_stashed_group_stop()) { |
| t->clear_stashed_group_stop(); |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| return true; |
| } |
| |
| int event = t->ptrace_event(); |
| if (!event) { |
| return false; |
| } |
| |
| LOG(debug) << " " << t->tid << ": handle_ptrace_event " |
| << ptrace_event_name(event) << ": event " << t->ev(); |
| |
| switch (event) { |
| case PTRACE_EVENT_SECCOMP_OBSOLETE: |
| case PTRACE_EVENT_SECCOMP: { |
| if (syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { |
| syscall_seccomp_ordering_ = SECCOMP_BEFORE_PTRACE_SYSCALL; |
| } |
| |
| int seccomp_data = t->get_ptrace_eventmsg_seccomp_data(); |
| // We need to set the orig_* values before we let the process continue to exit |
| // since the handler for the exit event will need them. |
| // See `handle_ptrace_exit_event` above. |
| t->apply_syscall_entry_regs(); |
| if (seccomp_data < 0) { |
| // Process just died. Urk. Just wait for the exit event and pretend this stop never happened! |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| return true; |
| } |
| int syscallno = t->regs().original_syscallno(); |
| if (seccomp_data == SECCOMP_RET_DATA) { |
| LOG(debug) << " traced syscall entered: " |
| << syscall_name(syscallno, t->arch()); |
| handle_seccomp_traced_syscall(t, step_state, result, did_enter_syscall); |
| } else { |
| // Note that we make no attempt to patch the syscall site when the |
| // user's filter does not return ALLOW. Apart from the ERRNO case, |
| // handling these syscalls is necessarily slow anyway. |
| uint32_t real_result; |
| if (!seccomp_filter_rewriter().map_filter_data_to_real_result( |
| t, seccomp_data, &real_result)) { |
| LOG(debug) |
| << "Process terminated unexpectedly during PTRACE_GETEVENTMSG"; |
| step_state->continue_type = RecordSession::CONTINUE; |
| break; |
| } |
| uint16_t real_result_data = real_result & SECCOMP_RET_DATA; |
| switch (real_result & SECCOMP_RET_ACTION) { |
| case SECCOMP_RET_TRAP: |
| LOG(debug) << " seccomp trap for syscall: " |
| << syscall_name(syscallno, t->arch()); |
| handle_seccomp_trap(t, step_state, real_result_data); |
| break; |
| case SECCOMP_RET_ERRNO: |
| LOG(debug) << " seccomp errno " << errno_name(real_result_data) |
| << " for syscall: " |
| << syscall_name(syscallno, t->arch()); |
| handle_seccomp_errno(t, step_state, real_result_data); |
| break; |
| case SECCOMP_RET_KILL: |
| LOG(debug) << " seccomp kill for syscall: " |
| << syscall_name(syscallno, t->arch()); |
| t->tgkill(SIGKILL); |
| // Rely on the SIGKILL to bump us out of the ptrace stop. |
| step_state->continue_type = RecordSession::DONT_CONTINUE; |
| // Now wait for the tracee to actually exit its ptrace-stop and proceed |
| // to the PTRACE_EVENT_EXIT. This avoids the race where our |
| // PTRACE_CONT might kick it out of the PTRACE_EVENT_EXIT before |
| // we can process it. |
| // If this fails because of *another* SIGKILL, that's fine. |
| t->wait(); |
| break; |
| default: |
| ASSERT(t, false) << "Seccomp result not handled"; |
| break; |
| } |
| } |
| break; |
| } |
| |
| case PTRACE_EVENT_EXEC: { |
| if (t->thread_group()->task_set().size() > 1) { |
| // All tasks but the task that did the execve should have exited by |
| // now and notified us of their exits. However, it's possible that |
| // while running the thread-group leader, our PTRACE_CONT raced with its |
| // PTRACE_EVENT_EXIT and it exited, and the next event we got is this |
| // PTRACE_EVENT_EXEC after the exec'ing task changed its tid to the |
| // leader's tid. Or maybe there are kernel bugs; on |
| // 4.2.0-42-generic running exec_from_other_thread, we reproducibly |
| // enter PTRACE_EVENT_EXEC for the thread-group leader without seeing |
| // its PTRACE_EVENT_EXIT. |
| |
| // So, record this task's exit and destroy it. |
| // XXX We can't do record_robust_futex_changes here because the address |
| // space has already gone. That would only matter if some of them were |
| // in memory accessible to another process even after exec, i.e. a |
| // shared-memory mapping or two different thread-groups sharing the same |
| // address space. |
| pid_t tid = t->rec_tid; |
| WaitStatus status = t->status(); |
| record_exit_trace_event(t, WaitStatus(0)); |
| t->record_exit_event(); |
| // Don't call RecordTask::destroy() because we don't want to |
| // PTRACE_DETACH. |
| delete t; |
| // Steal the exec'ing task and make it the thread-group leader, and |
| // carry on! |
| t = revive_task_for_exec(tid); |
| scheduler().set_current(t); |
| *t_ptr = t; |
| // Tell t that it is actually stopped, because the stop we got is really |
| // for this task, not the old dead task. |
| if (!t->did_waitpid(status)) { |
| // This is totally untested and almost certainly broken, but if the |
| // task was SIGKILLed out of the EXEC stop then we should probably |
| // just pretend the exec never happened. |
| step_state->continue_type = CONTINUE_SYSCALL; |
| break; |
| } |
| } |
| t->post_exec(); |
| t->session().scheduler().did_exit_execve(t); |
| |
| // Forward ptrace exec notification |
| if (t->emulated_ptracer) { |
| if (t->emulated_ptrace_options & PTRACE_O_TRACEEXEC) { |
| t->emulate_ptrace_stop( |
| WaitStatus::for_ptrace_event(PTRACE_EVENT_EXEC)); |
| } else if (!t->emulated_ptrace_seized) { |
| // Inject legacy SIGTRAP-after-exec |
| t->tgkill(SIGTRAP); |
| } |
| } |
| |
| if (t->emulated_stop_pending) { |
| step_state->continue_type = DONT_CONTINUE; |
| } else { |
| // Skip past the ptrace event. |
| step_state->continue_type = CONTINUE_SYSCALL; |
| } |
| break; |
| } |
| |
| default: |
| ASSERT(t, false) << "Unhandled ptrace event " << ptrace_event_name(event) |
| << "(" << event << ")"; |
| break; |
| } |
| |
| return true; |
| } |
| |
| static void debug_exec_state(const char* msg, RecordTask* t) { |
| LOG(debug) << msg << ": status=" << t->status(); |
| } |
| |
| template <typename Arch> |
| static bool is_ptrace_any_singlestep_arch(int command) { |
| return command >= 0 && |
| (command == PTRACE_SINGLESTEP || command == Arch::PTRACE_SYSEMU_SINGLESTEP); |
| } |
| |
| static bool is_ptrace_any_singlestep(SupportedArch arch, int command) |
| { |
| RR_ARCH_FUNCTION(is_ptrace_any_singlestep_arch, arch, command); |
| } |
| |
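| /** |
| * Resume the scheduler's current task according to |step_state|, choosing |
| * the resume method (cont/syscall/singlestep) and a tick budget for the |
| * remaining timeslice. |
| */ |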
| void RecordSession::task_continue(const StepState& step_state) { |
| RecordTask* t = scheduler().current(); |
| |
| ASSERT(t, step_state.continue_type != DONT_CONTINUE); |
| // A task in an emulated ptrace-stop must really stay stopped |
| ASSERT(t, !t->emulated_stop_pending); |
| |
| bool may_restart = t->at_may_restart_syscall(); |
| |
| if (may_restart && t->seccomp_bpf_enabled) { |
| LOG(debug) << " PTRACE_SYSCALL to possibly-restarted " << t->ev(); |
| } |
| |
| if (!t->vm()->first_run_event()) { |
| t->vm()->set_first_run_event(trace_writer().time()); |
| } |
| |
| if (!t->thread_group()->first_run_event()) { |
| t->thread_group()->set_first_run_event(trace_writer().time()); |
| } |
| |
| TicksRequest ticks_request; |
| ResumeRequest resume; |
| if (step_state.continue_type == CONTINUE_SYSCALL) { |
| ticks_request = RESUME_NO_TICKS; |
| resume = RESUME_SYSCALL; |
| } else { |
| if (t->has_stashed_sig(PerfCounters::TIME_SLICE_SIGNAL)) { |
| // timeslice signal already stashed, no point in generating another one |
| // (and potentially slow) |
| ticks_request = RESUME_UNLIMITED_TICKS; |
| } else if (scheduler().may_use_unlimited_ticks()) { |
| ticks_request = RESUME_UNLIMITED_TICKS; |
| } else { |
| ticks_request = (TicksRequest)max<Ticks>( |
| 0, scheduler().current_timeslice_end() - t->tick_count()); |
| } |
| |
| // Clear any lingering state, then see if we need to stop earlier for a |
| // tracee-requested pmc interrupt on the virtualized performance counter. |
| t->next_pmc_interrupt_is_for_user = false; |
| if (auto vpmc = |
| VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(t)) { |
| ASSERT(t, vpmc->target_tuid() == t->tuid()); |
| |
| Ticks after = max<Ticks>(vpmc->target_ticks() - t->tick_count(), 0); |
| if ((uint64_t)after < (uint64_t)ticks_request) { |
| LOG(debug) << "ticks_request constrained from " << ticks_request |
| << " to " << after << " for vpmc"; |
| ticks_request = (TicksRequest)after; |
| t->next_pmc_interrupt_is_for_user = true; |
| } |
| } |
| |
| // Override requested by the tracee for testing purposes |
| if (t->tick_request_override != (TicksRequest)0) { |
| ASSERT(t, !t->next_pmc_interrupt_is_for_user); |
| ticks_request = t->tick_request_override; |
| t->tick_request_override = (TicksRequest)0; |
| } |
| |
| bool singlestep = is_ptrace_any_singlestep(t->arch(), |
| t->emulated_ptrace_cont_command); |
| if (singlestep && is_at_syscall_instruction(t, t->ip())) { |
| // We're about to singlestep into a syscall instruction. |
| // Act like we're NOT singlestepping since doing a PTRACE_SINGLESTEP would |
| // skip over the system call. |
| LOG(debug) |
| << "Clearing singlestep because we're about to enter a syscall"; |
| singlestep = false; |
| } |
| if (singlestep) { |
| resume = RESUME_SINGLESTEP; |
| } else { |
| /* We won't receive PTRACE_EVENT_SECCOMP events until |
| * the seccomp filter is installed by the |
| * syscall_buffer lib in the child, therefore we must |
| * record in the traditional way (with PTRACE_SYSCALL) |
| * until it is installed. */ |
| /* Kernel commit |
| https://github.com/torvalds/linux/commit/93e35efb8de45393cf61ed07f7b407629bf698ea |
| makes PTRACE_SYSCALL traps be delivered *before* seccomp RET_TRACE |
| traps. |
| Detect and handle this. */ |
| if (!t->seccomp_bpf_enabled || may_restart || |
| syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN) { |
| resume = RESUME_SYSCALL; |
| } else { |
| /* When the seccomp filter is on, instead of capturing |
| * syscalls by using PTRACE_SYSCALL, the filter will |
| * generate the ptrace events. This means we allow the |
| * process to run using PTRACE_CONT, and rely on the |
| * seccomp filter to generate the special |
| * PTRACE_EVENT_SECCOMP event once a syscall happens. |
| * This event is handled here by simply allowing the |
| * process to continue to the actual entry point of |
| * the syscall (using cont_syscall_block()) and then |
| * using the same logic as before. */ |
| resume = RESUME_CONT; |
| } |
| } |
| } |
| t->resume_execution(resume, RESUME_NONBLOCKING, ticks_request); |
| } |
| |
| /** |
| * Step |t| forward until the tracee syscall that disarms the desched |
| * event. If a signal becomes pending in the interim, we stash it. |
| * This allows the caller to deliver the signal after this returns. |
| * (In reality the desched event will already have been disarmed before we |
| * enter this function.) |
| */ |
| static void advance_to_disarm_desched_syscall(RecordTask* t) { |
| int old_sig = 0; |
| |
| LOG(debug) << "desched: DISARMING_DESCHED_EVENT"; |
| /* TODO: send this through main loop. */ |
| /* TODO: mask off signals and avoid this loop. */ |
| do { |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_UNLIMITED_TICKS)) { |
| return; |
| } |
| if (t->status().is_syscall()) { |
| t->apply_syscall_entry_regs(); |
| } |
| /* We can safely ignore TIME_SLICE_SIGNAL while trying to |
| * reach the disarm-desched ioctl: once we reach it, |
| * the desched'd syscall will be "done" and the tracee |
| * will be at a preemption point. In fact, we *want* |
| * to ignore this signal. Syscalls like read() can |
| * have large buffers passed to them, and we have to |
| * copy-out the buffered out data to the user's |
| * buffer. This happens in the interval where we're |
| * reaching the disarm-desched ioctl, so that code is |
| * susceptible to receiving TIME_SLICE_SIGNAL. */ |
| int sig = t->stop_sig(); |
| if (PerfCounters::TIME_SLICE_SIGNAL == sig) { |
| continue; |
| } |
| // We should not receive SYSCALLBUF_DESCHED_SIGNAL since it should already |
| // have been disarmed. However, we observe these being received here when |
| // we arm the desched signal before we restart a blocking syscall, which |
| // completes successfully, then we disarm, then we see a desched signal |
| // here. |
| if (t->session().syscallbuf_desched_sig() == sig) { |
| continue; |
| } |
| if (sig && sig == old_sig) { |
| LOG(debug) << " coalescing pending " << signal_name(sig); |
| continue; |
| } |
| if (sig) { |
| LOG(debug) << " " << signal_name(sig) << " now pending"; |
| t->stash_sig(); |
| } |
| } while (!t->is_disarm_desched_event_syscall()); |
| |
| // Exit the syscall. If this fails, that's fine, we can ignore it. |
| t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS); |
| } |
| |
| /** |
| * |t| is at a desched event and some relevant aspect of its state |
| * changed. (For now, any change except the original desched'd syscall |
| * being restarted.) |
| */ |
| void RecordSession::desched_state_changed(RecordTask* t) { |
| LOG(debug) << "desched: IN_SYSCALL"; |
| /* We need to ensure that the syscallbuf code doesn't |
| * try to commit the current record; we've already |
| * recorded that syscall. The following event sets |
| * the abort-commit bit. */ |
| t->write_mem(REMOTE_PTR_FIELD(t->syscallbuf_child, abort_commit), (uint8_t)1); |
| t->record_event(Event::syscallbuf_abort_commit()); |
| |
| advance_to_disarm_desched_syscall(t); |
| |
| t->pop_desched(); |
| |
| /* The tracee has just finished sanity-checking the |
| * aborted record, and won't touch the syscallbuf |
| * during this (aborted) transaction again. So now |
| * is a good time for us to reset the record counter. */ |
| t->delay_syscallbuf_reset_for_desched = false; |
| // Run the syscallbuf exit hook. This ensures we'll be able to reset |
| // the syscallbuf before trying to buffer another syscall. |
| t->write_mem( |
| REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit), |
| (uint8_t)1); |
| } |
| |
| static void syscall_not_restarted(RecordTask* t) { |
| LOG(debug) << " " << t->tid << ": popping abandoned interrupted " << t->ev() |
| << "; pending events:"; |
| if (IS_LOGGING(debug)) { |
| t->log_pending_events(); |
| } |
| t->pop_syscall_interruption(); |
| } |
| |
| /** |
| * "Thaw" a frozen interrupted syscall if |t| is restarting it. |
| * Return true if a syscall is indeed restarted. |
| * |
| * A postcondition of this function is that |t->ev| is no longer a |
| * syscall interruption, whether or not a syscall was restarted. |
| */ |
| static bool maybe_restart_syscall(RecordTask* t) { |
| if (is_restart_syscall_syscall(t->regs().original_syscallno(), t->arch())) { |
| LOG(debug) << " " << t->tid << ": SYS_restart_syscall'ing " << t->ev(); |
| } |
| if (t->is_syscall_restart()) { |
| t->ev().transform(EV_SYSCALL); |
| Registers regs = t->regs(); |
| regs.set_original_syscallno(t->ev().Syscall().regs.original_syscallno()); |
| t->set_regs(regs); |
| t->canonicalize_regs(t->arch()); |
| return true; |
| } |
| if (EV_SYSCALL_INTERRUPTION == t->ev().type()) { |
| syscall_not_restarted(t); |
| } |
| return false; |
| } |
| |
| /** |
| * After a SYS_sigreturn "exit" of task |t| with return value |ret|, |
| * check to see if there's an interrupted syscall that /won't/ be |
| * restarted, and if so, pop it off the pending event stack. |
| */ |
| static void maybe_discard_syscall_interruption(RecordTask* t, intptr_t ret) { |
| int syscallno; |
| |
| if (EV_SYSCALL_INTERRUPTION != t->ev().type()) { |
| /* We currently don't track syscalls interrupted with |
| * ERESTARTSYS or ERESTARTNOHAND, so it's possible for |
| * a sigreturn not to affect the event stack. */ |
| LOG(debug) << " (no interrupted syscall to retire)"; |
| return; |
| } |
| |
| syscallno = t->ev().Syscall().number; |
| if (0 > ret) { |
| syscall_not_restarted(t); |
| } else if (t->arch() == x86 || t->arch() == x86_64) { |
| // On x86, we expect the result register to have been restored to the |
| // syscallno. Since the syscallno is in a different register on other |
| // platforms, this assert does not apply there. |
| ASSERT(t, syscallno == ret) |
| << "Interrupted call was " << t->ev().Syscall().syscall_name() |
| << " and sigreturn claims to be restarting " |
| << syscall_name(ret, t->ev().Syscall().arch()); |
| } |
| } |
| |
| /** |
| * Copy the registers used for syscall arguments (not including |
| * syscall number) from |from| to |to|. |
| */ |
| static void copy_syscall_arg_regs(Registers* to, const Registers& from) { |
| to->set_orig_arg1(from.arg1()); |
| to->set_arg2(from.arg2()); |
| to->set_arg3(from.arg3()); |
| to->set_arg4(from.arg4()); |
| to->set_arg5(from.arg5()); |
| to->set_arg6(from.arg6()); |
| } |
| |
| static void maybe_trigger_emulated_ptrace_syscall_exit_stop(RecordTask* t) { |
| if (t->emulated_ptrace_cont_command == PTRACE_SYSCALL) { |
| t->emulate_ptrace_stop(WaitStatus::for_syscall(t), SYSCALL_EXIT_STOP); |
| } else if (is_ptrace_any_singlestep(t->arch(), t->emulated_ptrace_cont_command)) { |
| // Deliver the singlestep trap now that we've finished executing the |
| // syscall. |
| t->emulate_ptrace_stop(WaitStatus::for_stop_sig(SIGTRAP), SIGNAL_DELIVERY_STOP, nullptr, |
| SI_KERNEL); |
| } |
| } |
| |
| static void save_interrupted_syscall_ret_in_syscallbuf(RecordTask* t, |
| intptr_t retval) { |
| // Record storing the return value in the syscallbuf record, where |
| // we expect to find it during replay. |
| auto child_rec = t->next_syscallbuf_record(); |
| // Also store it there now so that our memory checksums are correct. |
| // It will be overwritten by the tracee's syscallbuf code. |
| t->write_and_record(REMOTE_PTR_FIELD(child_rec, ret), |
| static_cast<int64_t>(retval)); |
| } |
| |
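| // True if |t| is at the exit point of one of the privileged syscalls in |
| // the rr page. Those are issued by rr's own (preload) code, so they must |
| // not trigger an emulated ptrace syscall-exit stop. |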
| static bool is_in_privileged_syscall(RecordTask* t) { |
| auto type = AddressSpace::rr_page_syscall_from_exit_point(t->arch(), t->ip()); |
| return type && type->privileged == AddressSpace::PRIVILEGED; |
| } |
| |
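| /** |
| * Advance the state machine for |t|'s current syscall event (ptrace entry |
| * -> entry -> processing -> exit), recording events and choosing how the |
| * task should next be resumed via |step_state|. |
| */ |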
| void RecordSession::syscall_state_changed(RecordTask* t, |
| StepState* step_state) { |
| switch (t->ev().Syscall().state) { |
| case ENTERING_SYSCALL_PTRACE: |
| debug_exec_state("EXEC_SYSCALL_ENTRY_PTRACE", t); |
| step_state->continue_type = DONT_CONTINUE; |
| last_task_switchable = ALLOW_SWITCH; |
| if (t->emulated_stop_type != NOT_STOPPED) { |
| // Don't go any further. |
| return; |
| } |
| if (t->ev().Syscall().in_sysemu) { |
| // We'll have recorded just the ENTERING_SYSCALL_PTRACE event and |
| // nothing else. Resume with an invalid syscall to ensure no real |
| // syscall runs. |
| t->pop_syscall(); |
| Registers r = t->regs(); |
| Registers orig_regs = r; |
| r.set_original_syscallno(-1); |
| t->set_regs(r); |
| // If this fails because of premature exit, don't mess with the |
| // task anymore. |
| if (t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| ASSERT(t, t->ip() == r.ip()); |
| t->set_regs(orig_regs); |
| maybe_trigger_emulated_ptrace_syscall_exit_stop(t); |
| } |
| return; |
| } |
| last_task_switchable = PREVENT_SWITCH; |
| t->ev().Syscall().regs = t->regs(); |
| t->ev().Syscall().state = ENTERING_SYSCALL; |
| // The syscallno may have been changed by the ptracer |
| t->ev().Syscall().number = t->regs().original_syscallno(); |
| return; |
| |
| case ENTERING_SYSCALL: { |
| debug_exec_state("EXEC_SYSCALL_ENTRY", t); |
| ASSERT(t, !t->emulated_stop_pending); |
| |
| // Flush syscallbuf now so that anything recorded by |
| // rec_prepare_syscall is associated with the syscall event |
| t->maybe_flush_syscallbuf(); |
| |
| last_task_switchable = t->ev().Syscall().switchable = |
| rec_prepare_syscall(t); |
| t->record_event(t->ev(), RecordTask::DONT_FLUSH_SYSCALLBUF, |
| RecordTask::ALLOW_RESET_SYSCALLBUF, |
| &t->ev().Syscall().regs); |
| |
| debug_exec_state("after cont", t); |
| t->ev().Syscall().state = PROCESSING_SYSCALL; |
| |
| if (t->emulated_stop_pending) { |
| step_state->continue_type = DONT_CONTINUE; |
| } else { |
| // Resume the syscall execution in the kernel context. |
| step_state->continue_type = CONTINUE_SYSCALL; |
| } |
| |
| if (t->session().done_initial_exec() && Flags::get().check_cached_mmaps) { |
| t->vm()->verify(t); |
| } |
| |
| if (t->desched_rec() && t->is_in_untraced_syscall() && |
| t->has_stashed_sig()) { |
| // We have a signal to deliver but we're about to (re?)enter an untraced |
| // syscall that may block and the desched event has been disarmed. |
| // Rearm the desched event so if the syscall blocks, it will be |
| // interrupted and we'll have a chance to deliver our signal. |
| LOG(debug) << "Rearming desched event so we'll get a chance to deliver " |
| "stashed signal"; |
| arm_desched_event(t); |
| } |
| |
| if (t->detached_proxy) { |
| // We detached. Record that. |
| t->record_event(Event::exit(), RecordTask::DONT_FLUSH_SYSCALLBUF, |
| RecordTask::DONT_RESET_SYSCALLBUF); |
| t->session().trace_writer().write_task_event( |
| TraceTaskEvent::for_detach(t->tid)); |
| step_state->continue_type = DONT_CONTINUE; |
| } |
| |
| return; |
| } |
| |
| case PROCESSING_SYSCALL: |
| debug_exec_state("EXEC_IN_SYSCALL", t); |
| |
| // Linux kicks tasks out of syscalls before delivering |
| // signals. |
| ASSERT(t, !t->stop_sig()) << "Signal " << signal_name(t->stop_sig()) |
| << " pending while in syscall???"; |
| |
| t->ev().Syscall().state = EXITING_SYSCALL; |
| step_state->continue_type = DONT_CONTINUE; |
| return; |
| |
| case EXITING_SYSCALL: { |
| debug_exec_state("EXEC_SYSCALL_DONE", t); |
| |
| DEBUG_ASSERT(t->stop_sig() == 0); |
| |
| SupportedArch syscall_arch = t->ev().Syscall().arch(); |
| int syscallno = t->ev().Syscall().number; |
| intptr_t retval = t->regs().syscall_result_signed(); |
| |
| if (t->desched_rec()) { |
| // If we enabled the desched event above, disable it. |
| disarm_desched_event(t); |
| // Write syscall return value to the syscallbuf now. This lets replay |
| // get the correct value even though we're aborting the commit. This |
| // value affects register values in the preload code (which must be |
| // correct since register values may escape). |
| save_interrupted_syscall_ret_in_syscallbuf(t, retval); |
| } |
| |
| // sigreturn is a special snowflake, because it |
| // doesn't actually return. Instead, it undoes the |
| // setup for signal delivery, which possibly includes |
| // preparing the tracee for a restart-syscall. So we |
| // take this opportunity to possibly pop an |
| // interrupted-syscall event. |
| if (is_sigreturn(syscallno, syscall_arch)) { |
| if (is_x86ish(t->arch())) { |
| ASSERT(t, t->regs().original_syscallno() == -1); |
| } |
| rec_did_sigreturn(t); |
| t->record_current_event(); |
| t->pop_syscall(); |
| |
| // We've finished processing this signal now. |
| t->pop_signal_handler(); |
| t->invalidate_sigmask(); |
| |
| maybe_discard_syscall_interruption(t, retval); |
| |
| if (EV_SECCOMP_TRAP == t->ev().type()) { |
| LOG(debug) << " exiting seccomp trap"; |
| save_interrupted_syscall_ret_in_syscallbuf(t, retval); |
| seccomp_trap_done(t); |
| } |
| if (EV_DESCHED == t->ev().type()) { |
| LOG(debug) << " exiting desched critical section"; |
| // The signal handler could have modified the apparent syscall |
| // return value. Save that value into the syscallbuf again so |
| // replay will pick it up later. |
| save_interrupted_syscall_ret_in_syscallbuf(t, retval); |
| desched_state_changed(t); |
| } |
| } else { |
| LOG(debug) << " original_syscallno:" << t->regs().original_syscallno() |
| << " (" << syscall_name(syscallno, syscall_arch) |
| << "); return val:" << HEX(t->regs().syscall_result()); |
| |
| /* a syscall_restart ending is equivalent to the |
| * restarted syscall ending */ |
| if (t->ev().Syscall().is_restart) { |
| LOG(debug) << " exiting restarted " |
| << syscall_name(syscallno, syscall_arch); |
| } |
| |
| /* TODO: is there any reason a restart_syscall can't |
| * be interrupted by a signal and itself restarted? */ |
| bool may_restart = !is_restart_syscall_syscall(syscallno, t->arch()) |
| // SYS_pause is either interrupted or |
| // never returns. It doesn't restart. |
| && !is_pause_syscall(syscallno, t->arch()) && |
| t->regs().syscall_may_restart(); |
| /* No need to process the syscall if it's going to be |
| * restarted; that will be done on exit from the |
| * restart_syscall. */ |
| if (!may_restart) { |
| rec_process_syscall(t); |
| if (t->session().done_initial_exec() && |
| Flags::get().check_cached_mmaps) { |
| t->vm()->verify(t); |
| } |
| } else { |
| LOG(debug) << " may restart " |
| << syscall_name(syscallno, syscall_arch) |
| << " (from retval " << HEX(retval) << ")"; |
| |
| rec_prepare_restart_syscall(t); |
| /* If we may restart this syscall, we've most |
| * likely fudged some of the argument |
| * registers with scratch pointers. We don't |
| * want to record those fudged registers, |
| * because scratch doesn't exist in replay. |
| * So cover our tracks here. */ |
| Registers r = t->regs(); |
| copy_syscall_arg_regs(&r, t->ev().Syscall().regs); |
| t->set_regs(r); |
| // We need to track what the return value was on architectures |
| // where the kernel replaces the return value by the new arg1 |
| // on restart. |
| t->ev().Syscall().regs = r; |
| } |
| t->record_current_event(); |
| |
| /* If we're not going to restart this syscall, we're |
| * done with it. But if we are, "freeze" it on the |
| * event stack until the execution point where it |
| * might be restarted. */ |
| if (!may_restart) { |
| t->pop_syscall(); |
| if (EV_DESCHED == t->ev().type()) { |
| LOG(debug) << " exiting desched critical section"; |
| desched_state_changed(t); |
| } |
| } else { |
| t->ev().transform(EV_SYSCALL_INTERRUPTION); |
| t->ev().Syscall().is_restart = true; |
| } |
| |
| t->canonicalize_regs(syscall_arch); |
| |
| if (!may_restart) { |
| if (t->retry_syscall_patching) { |
| LOG(debug) << "Retrying deferred syscall patching"; |
| t->retry_syscall_patching = false; |
| if (t->vm()->monkeypatcher().try_patch_syscall(t, false)) { |
| // Syscall was patched. Emit event and continue execution. |
| auto ev = Event::patch_syscall(); |
| ev.PatchSyscall().patch_after_syscall = true; |
| t->record_event(ev); |
| } |
| } |
| } |
| } |
| |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| |
| if (!is_in_privileged_syscall(t)) { |
| maybe_trigger_emulated_ptrace_syscall_exit_stop(t); |
| } |
| return; |
| } |
| |
| default: |
| FATAL() << "Unknown exec state " << t->ev().Syscall().state; |
| } |
| } |
| |
| /** |
| * Check that the performance counters appear to be working during the |
| * initial exec. If they don't, mark |step_result| as a spawn failure. |
| */ |
| void RecordSession::check_initial_task_syscalls(RecordTask* t, |
| RecordResult* step_result) { |
| if (done_initial_exec()) { |
| return; |
| } |
| |
| if (is_write_syscall(t->ev().Syscall().number, t->arch()) && |
| t->regs().arg1_signed() == -1) { |
| Ticks ticks = t->tick_count(); |
| LOG(debug) << "ticks on entry to dummy write: " << ticks; |
| if (ticks == 0) { |
| step_result->status = RecordSession::STEP_SPAWN_FAILED; |
| step_result->failure_message = string( |
| "rr internal recorder error: Performance counter doesn't seem to " |
| "be working. Are you perhaps running rr in a VM but didn't enable " |
| "perf-counter virtualization?"); |
| } |
| } |
| |
| if (is_exit_group_syscall(t->ev().Syscall().number, t->arch())) { |
| step_result->status = RecordSession::STEP_SPAWN_FAILED; |
| step_result->failure_message = read_spawned_task_error(); |
| } |
| } |
| |
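| /** |
| * When a non-leader thread execs, the kernel switches its tid to the |
| * thread-group leader's. Find the exec'ing task via PTRACE_GETEVENTMSG |
| * (which holds its old tid), record the tid change as a clone-plus-exit |
| * in the trace, and update our bookkeeping to match. |
| */ |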
| RecordTask* RecordSession::revive_task_for_exec(pid_t rec_tid) { |
| unsigned long msg = 0; |
| int ret = |
| ptrace(_ptrace_request(PTRACE_GETEVENTMSG), rec_tid, nullptr, &msg); |
| if (ret < 0) { |
| FATAL() << "Can't get old tid for execve (leader=" << rec_tid << ")"; |
| } |
| RecordTask* t = find_task(msg); |
| if (!t) { |
| FATAL() << "Can't find old task for execve"; |
| } |
| ASSERT(t, rec_tid == t->tgid()); |
| pid_t own_namespace_tid = t->thread_group()->tgid_own_namespace; |
| |
| LOG(debug) << "Changing task tid from " << t->tid << " to " << rec_tid; |
| |
| // Pretend the old task cloned a new task with the right tid, and then exited |
| trace_writer().write_task_event(TraceTaskEvent::for_clone( |
| rec_tid, t->tid, own_namespace_tid, |
| CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | |
| CLONE_SYSVSEM)); |
| trace_writer().write_task_event( |
| TraceTaskEvent::for_exit(t->tid, WaitStatus::for_exit_code(0))); |
| |
| // Account for tid change |
| task_map.erase(t->tid); |
| task_map.insert(make_pair(rec_tid, t)); |
| // Update the serial as if this task was really created by cloning the old |
| // task. |
| t->set_tid_and_update_serial(rec_tid, own_namespace_tid); |
| |
| return t; |
| } |
| |
| /** |
| * Take a NativeArch::siginfo_t& here instead of siginfo_t because different |
| * versions of system headers have inconsistent field naming. |
| */ |
| template <typename Arch> |
| static void setup_sigframe_siginfo_arch(RecordTask* t, |
| const siginfo_t& siginfo) { |
| remote_ptr<typename Arch::siginfo_t> dest; |
| switch (Arch::arch()) { |
| case x86: { |
| auto p = t->regs().sp().cast<typename Arch::unsigned_word>() + 2; |
| dest = t->read_mem(p); |
| break; |
| } |
| case x86_64: |
| dest = t->regs().si(); |
| break; |
| case aarch64: |
| dest = t->regs().x1(); |
| break; |
| default: |
| DEBUG_ASSERT(0 && "Unknown architecture"); |
| break; |
| } |
| typename Arch::siginfo_t si = t->read_mem(dest); |
| set_arch_siginfo(siginfo, t->arch(), &si, sizeof(si)); |
| t->write_mem(dest, si); |
| } |
| |
| static void setup_sigframe_siginfo(RecordTask* t, const siginfo_t& siginfo) { |
| RR_ARCH_FUNCTION(setup_sigframe_siginfo_arch, t->arch(), t, siginfo); |
| } |
| |
| /** |
| * Get t into a state where resume_execution with a signal will actually work. |
| */ |
| static bool preinject_signal(RecordTask* t) { |
| int sig = t->ev().Signal().siginfo.si_signo; |
| |
| /* Signal injection is tricky. Per the ptrace(2) man page, injecting |
| * a signal while the task is not in a signal-stop is not guaranteed to work |
| * (and indeed, we see that the kernel sometimes ignores such signals). |
| * But some signals must be delayed until after the signal-stop that notified |
| * us of them. |
| * So, first we check if we're in a signal-stop that we can use to inject |
| * a signal. Some (all?) SIGTRAP stops are *not* usable for signal injection. |
| */ |
| if (t->stop_sig() && t->stop_sig() != SIGTRAP) { |
| LOG(debug) << " in signal-stop for " << signal_name(t->stop_sig()); |
| } else { |
| /* We're not in a usable signal-stop. Force a signal-stop by sending |
| * a new signal with tgkill (as the ptrace(2) man page recommends). |
| */ |
| LOG(debug) << " maybe not in signal-stop (status " << t->status() |
| << "); doing tgkill(SYSCALLBUF_DESCHED_SIGNAL)"; |
| if (!t->move_to_signal_stop()) { |
| /* We raced with an exit (e.g. due to a pending SIGKILL). */ |
| return false; |
| } |
| |
| ASSERT(t, t->stop_sig() == t->session().syscallbuf_desched_sig()) |
| << "Expected SYSCALLBUF_DESCHED_SIGNAL, got " << t->status(); |
| /* We're now in a signal-stop */ |
| } |
| |
| /* Now that we're in a signal-stop, we can inject our signal and advance |
| * to the signal handler with one single-step. |
| */ |
| LOG(debug) << " injecting signal " << signal_name(sig); |
| t->set_siginfo(t->ev().Signal().siginfo); |
| return true; |
| } |
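| |
| // For illustration only: stripped of rr's bookkeeping, preinject_signal() |
| // performs the standard ptrace(2) recipe for reliable signal injection. |
| // A sketch with error handling omitted; kThrowawaySig is a placeholder |
| // for rr's desched signal, not an rr name: |
| // |
| //   void inject_from_ptrace_stop(pid_t tgid, pid_t tid, int sig) { |
| //     // Queue a signal directed at this specific thread... |
| //     syscall(SYS_tgkill, tgid, tid, kThrowawaySig); |
| //     // ...and run the thread until it reports the signal-stop. |
| //     ptrace(PTRACE_CONT, tid, nullptr, 0); |
| //     int status; |
| //     waitpid(tid, &status, __WALL); |
| //     // From a signal-stop, the data argument of the restart request is |
| //     // delivered in place of the stopping signal, so the throwaway |
| //     // signal is discarded and |sig| is injected. |
| //     ptrace(PTRACE_SINGLESTEP, tid, nullptr, sig); |
| //   } |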
| |
| /** |
| * Returns true if the signal should be delivered. |
| * Returns false if this signal should not be delivered because another signal |
| * occurred during delivery or there was a premature exit. |
| * Must call t->stashed_signal_processed() once we're ready to unmask signals. |
| */ |
| static bool inject_handled_signal(RecordTask* t) { |
| if (!preinject_signal(t)) { |
| // Task prematurely exited. |
| return false; |
| } |
| // If there aren't any more stashed signals, it's OK to stop blocking all |
| // signals. |
| t->stashed_signal_processed(); |
| |
| int sig = t->ev().Signal().siginfo.si_signo; |
| do { |
| // We are ready to inject our signal. |
| // XXX we assume the kernel won't respond by notifying us of a different |
| // signal. We don't want to do this with signals blocked because that will |
| // save a bogus signal mask in the signal frame. |
| if (!t->resume_execution(RESUME_SINGLESTEP, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS, sig)) { |
| return false; |
| } |
| // Signal injection can change the sigmask due to sa_mask effects, lack of |
| // SA_NODEFER, and signal frame construction triggering a synchronous |
| // SIGSEGV. |
| t->invalidate_sigmask(); |
| // Repeat injection if we got a desched signal. We observe in Linux 4.14.12 |
| // that we get SYSCALLBUF_DESCHED_SIGNAL here once in a while. |
| } while (t->stop_sig() == t->session().syscallbuf_desched_sig()); |
| |
| if (t->stop_sig() == SIGSEGV) { |
| // Constructing the signal handler frame must have failed. Stash the signal |
| // to deliver it later. |
| t->stash_sig(); |
| if (sig == SIGSEGV) { |
| // The kernel will kill the process after this. Make sure we know to treat |
| // it as fatal when we inject it. Also disable the signal handler to match |
| // what the kernel does. |
| t->did_set_sig_handler_default(SIGSEGV); |
| t->thread_group()->received_sigframe_SIGSEGV = true; |
| } |
| return false; |
| } |
| |
| // We stepped into a user signal handler. |
| ASSERT(t, t->stop_sig() == SIGTRAP) |
| << "Got unexpected status " << t->status() << " trying to deliver " << sig |
| << " siginfo is " << t->get_siginfo(); |
| ASSERT(t, t->get_signal_user_handler(sig) == t->ip()) |
| << "Expected handler IP " << t->get_signal_user_handler(sig) << ", got " |
| << t->ip() |
| << "; actual signal mask=" << HEX(t->read_sigmask_from_process()) |
| << " (cached " << HEX(t->get_sigmask()) << ")"; |
| |
| if (t->signal_handler_takes_siginfo(sig)) { |
| // The kernel copied siginfo into userspace so it can pass a pointer to |
| // the signal handler. Replace the contents of that siginfo with |
| // the exact data we want to deliver. (We called Task::set_siginfo |
| // above to set that data, but the kernel sanitizes the passed-in data |
| // which wipes out certain fields; e.g. we can't set SI_KERNEL in si_code.) |
| setup_sigframe_siginfo(t, t->ev().Signal().siginfo); |
| } |
| |
| // The kernel clears the FPU state on entering the signal handler, but prior |
| // to 4.7 or thereabouts ptrace can still return stale values. Fix that here. |
| // This also sets bit 0 of the XINUSE register to 1 to avoid issues where it |
| // gets set to 1 nondeterministically. |
| ExtraRegisters e = t->extra_regs(); |
| e.reset(); |
| t->set_extra_regs(e); |
| |
| return true; |
| } |
| |
| /** |
| * |t| is being delivered a signal, and its state changed. |
| * Must call t->stashed_signal_processed() once we're ready to unmask signals. |
| */ |
| bool RecordSession::signal_state_changed(RecordTask* t, StepState* step_state) { |
| int sig = t->ev().Signal().siginfo.si_signo; |
| |
| switch (t->ev().type()) { |
| case EV_SIGNAL: { |
| // This event is used by the replayer to advance to |
| // the point of signal delivery. |
| if (t->arch() == aarch64 && t->status().is_syscall() && |
| t->prev_ev() && t->prev_ev()->type() == EV_SYSCALL_INTERRUPTION) { |
| // On aarch64, replay expects the signal to be delivered before |
| // the syscall instruction, but during recording the current pc |
| // is after the syscall instruction, with arg1 clobbered by the |
| // return value of the aborted syscall. |
| auto regs = t->regs(); |
| auto &syscall_regs = t->prev_ev()->Syscall().regs; |
| regs.set_ip(syscall_regs.ip().decrement_by_syscall_insn_length(t->arch())); |
| regs.set_arg1(syscall_regs.orig_arg1()); |
| t->record_event(t->ev(), RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::ALLOW_RESET_SYSCALLBUF, ®s); |
| } else { |
| t->record_current_event(); |
| } |
| t->ev().transform(EV_SIGNAL_DELIVERY); |
| ssize_t sigframe_size = 0; |
| |
| bool has_handler = t->signal_has_user_handler(sig); |
| if (has_handler) { |
| LOG(debug) << " " << t->tid << ": " << signal_name(sig) |
| << " has user handler"; |
| |
| if (!inject_handled_signal(t)) { |
| // Signal delivery isn't happening. Prepare to process the new |
| // signal that aborted signal delivery. |
| t->signal_delivered(sig); |
| t->pop_signal_delivery(); |
| step_state->continue_type = DONT_CONTINUE; |
| last_task_switchable = PREVENT_SWITCH; |
| break; |
| } |
| |
| if (is_x86ish(t->arch())) { |
| // It's somewhat difficult engineering-wise to |
| // compute the sigframe size at compile time, |
| // and it can vary across kernel versions and CPU |
| // microarchitectures. So this size is an overestimate |
| // of the real size(s). |
| // |
| // If this size becomes too small in the |
| // future, and unit tests that use sighandlers |
| // are run with checksumming enabled, then |
| // they can catch errors here. |
| sigframe_size = 1152 /* Overestimate of kernel sigframe */ + |
| 128 /* Redzone */ + |
| /* this returns 512 when XSAVE unsupported */ |
| xsave_area_size(); |
| } else if (t->arch() == aarch64) { |
| sigframe_size = sizeof(ARM64Arch::rt_sigframe) + |
| sizeof(ARM64Arch::user_fpsimd_state); |
| } else { |
| DEBUG_ASSERT(0 && "Add sigframe size for your architecture here"); |
| } |
| |
| t->ev().transform(EV_SIGNAL_HANDLER); |
| t->signal_delivered(sig); |
| // We already continued! Don't continue now, and allow switching. |
| step_state->continue_type = DONT_CONTINUE; |
| last_task_switchable = ALLOW_SWITCH; |
| } else { |
| t->stashed_signal_processed(); |
| LOG(debug) << " " << t->tid << ": no user handler for " |
| << signal_name(sig); |
| // Don't do another task continue. We want to deliver the signal |
| // as the next thing that the task does. |
| step_state->continue_type = DONT_CONTINUE; |
| // If we didn't set up the sighandler frame, we need |
| // to ensure that this tracee is scheduled next so |
| // that we can deliver the signal normally. We have |
| // to do that because setting up the sighandler frame |
| // is synchronous, but delivery otherwise is async. |
| // But right after this, we may have to process some |
| // syscallbuf state, so we can't let the tracee race |
| // with us. |
| last_task_switchable = PREVENT_SWITCH; |
| } |
| |
| // We record this data even if sigframe_size is zero to simplify replay. |
| // Stop recording data if we run off the end of a writable mapping. |
| // Our sigframe size is conservative so we need to do this. |
| t->record_remote_writable(t->regs().sp(), sigframe_size); |
| |
| // This event is used by the replayer to set up the signal handler frame. |
| // But if we don't have a handler, we don't want to record the event |
| // until we deal with the EV_SIGNAL_DELIVERY. |
| if (has_handler) { |
| t->record_current_event(); |
| } |
| break; |
| } |
| |
| case EV_SIGNAL_DELIVERY: { |
| // A SIGSTOP requires us to allow switching to another task. |
| // So does a fatal, core-dumping signal, since we need to allow other |
| // tasks to proceed to their exit events. |
| bool is_deterministic = t->ev().Signal().deterministic == DETERMINISTIC_SIG; |
| // Signals that would normally be fatal are just ignored for init processes, |
| // unless they're deterministic. |
| bool is_fatal = t->ev().Signal().disposition == DISPOSITION_FATAL && |
| (!t->is_container_init() || is_deterministic); |
| Switchable can_switch = ((is_fatal && is_coredumping_signal(sig)) || sig == SIGSTOP) ? |
| ALLOW_SWITCH : PREVENT_SWITCH; |
| |
| // We didn't record this event above, so do that now. |
| // NB: If there is no handler, and we interrupted a syscall, and there are |
| // no more actionable signals, the kernel sets us up for a syscall |
| // restart. But it does that *after* the ptrace trap. To replay this |
| // correctly we need to fake those changes here. But we don't do this |
| // if we're going to switch away at the ptrace trap, and for the moment, |
| // 'can_switch' is actually 'will_switch'. |
| // This is essentially copied from do_signal in arch/x86/kernel/signal.c |
| bool has_other_signals = t->has_any_actionable_signal(); |
| auto r = t->regs(); |
| if (!is_fatal) { |
| Event *prev_ev = t->prev_ev(); |
| if (can_switch == PREVENT_SWITCH && !has_other_signals && prev_ev && |
| EV_SYSCALL_INTERRUPTION == prev_ev->type()) { |
| switch (prev_ev->Syscall().regs.syscall_result_signed()) { |
| case -ERESTARTNOHAND: |
| case -ERESTARTSYS: |
| case -ERESTARTNOINTR: |
| r.set_syscallno(r.original_syscallno()); |
| break; |
| case -ERESTART_RESTARTBLOCK: |
| r.set_syscallno(syscall_number_for_restart_syscall(t->arch())); |
| break; |
| } |
| // On aarch64, the kernel modifies the registers before the signal stop, |
| // so we should not decrement the pc again or we'll rerun the instruction |
| // before the syscall. |
| // [1] https://github.com/torvalds/linux/blob/caffb99b6929f41a69edbb5aef3a359bf45f3315/arch/arm64/kernel/signal.c#L855-L862 |
| if (t->arch() != aarch64) |
| r.set_ip(r.ip().decrement_by_syscall_insn_length(t->arch())); |
| // Now that we've mucked with the registers, we can't switch tasks. That |
| // could allow more signals to be generated, breaking our assumption |
| // that we are the last signal. |
| } else { |
| // But if we didn't touch the registers, switching here is ok. |
| can_switch = ALLOW_SWITCH; |
| } |
| } |
| |
| t->record_event(t->ev(), RecordTask::FLUSH_SYSCALLBUF, |
| RecordTask::ALLOW_RESET_SYSCALLBUF, &r); |
| // Don't actually set_regs(r), the kernel does these modifications. |
| |
| if (t->is_container_init() && is_fatal) { |
| // Nondeterministic signals were already filtered out. |
| ASSERT(t, is_deterministic); |
| // Usually, the kernel removes the killable-protection from an init process |
| // when a deterministic fatal signal is delivered, but (due to what is |
| // arguably a bug) when a ptracer is attached, this does not happen. |
| // If we try to inject it here, the kernel will just ignore it, |
| // and we'll go around again. As a hack, we detach here, in the |
| // expectation that the deterministic instruction will run again and |
| // actually kill the task now that it isn't under ptrace control anymore. |
| t->destroy_buffers(nullptr, nullptr); |
| WaitStatus exit_status = WaitStatus::for_fatal_sig(sig); |
| record_exit_trace_event(t, exit_status); |
| // Allow writing child_tid now because otherwise the write will race |
| t->record_exit_event(RecordTask::WRITE_CHILD_TID); |
| // On a real affected kernel, we probably would have never gotten here, |
| // since the signal we would be seeing was not deterministic, but let's |
| // be conservative and still try to emulate the ptrace stop. |
| t->do_ptrace_exit_stop(exit_status); |
| t->did_kill(); |
| t->detach(); |
| // Not really, but since we detached we're never going to see that event |
| // anyway, so just pretend we're there already. |
| t->did_reach_zombie(); |
| return true; |
| } |
| |
| // Only inject fatal signals. Non-fatal signals with signal handlers |
| // were taken care of above; for non-fatal signals without signal |
| // handlers, there is no need to deliver the signal at all. In fact, |
| // there is really no way to inject a non-fatal, non-handled signal |
| // without letting the task execute at least one instruction, which |
| // we don't want to do here. |
| bool inject_signal = is_fatal && sig != get_continue_through_sig(); |
| if (inject_signal) { |
| preinject_signal(t); |
| t->resume_execution(RESUME_CONT, RESUME_NONBLOCKING, RESUME_NO_TICKS, |
| sig); |
| } |
| |
| t->signal_delivered(sig); |
| if (!inject_signal || !is_coredumping_signal(sig)) { |
| /* Fatal signals may core-dump, so we don't consider the signal |
| * delivery complete until we've actually managed to advance past that |
| */ |
| t->pop_signal_delivery(); |
| } |
| |
| // Mark each task in this address space as expecting a ptrace exit |
| // to avoid causing any ptrace_exit races. |
| if (is_fatal && is_coredumping_signal(sig)) { |
| for (Task *ot : t->vm()->task_set()) { |
| if (t != ot) { |
| if (t->tgid() == ot->tgid() || coredumping_signal_takes_down_entire_vm()) { |
| ((RecordTask *)ot)->waiting_for_ptrace_exit = true; |
| } |
| } |
| } |
| } |
| |
| last_task_switchable = can_switch; |
| step_state->continue_type = DONT_CONTINUE; |
| break; |
| } |
| |
| default: |
| FATAL() << "Unhandled signal state " << t->ev().type(); |
| break; |
| } |
| return false; |
| } |
| |
| bool RecordSession::handle_signal_event(RecordTask* t, StepState* step_state) { |
| int sig = t->stop_sig(); |
| if (!sig) { |
| return false; |
| } |
| if (!done_initial_exec()) { |
| // If the initial tracee isn't prepared to handle |
| // signals yet, then us ignoring the ptrace |
| // notification here will have the side effect of |
| // declining to deliver the signal. |
| // |
| // This doesn't really occur in practice, only in |
| // tests that force a degenerately low time slice. |
| LOG(warn) << "Dropping " << signal_name(sig) |
| << " because it can't be delivered yet"; |
| // These signals might have effects on the sigmask. |
| t->invalidate_sigmask(); |
| // No events to be recorded, so no syscallbuf updates |
| // needed. |
| return true; |
| } |
| |
| if (sig == SIGTRAP && handle_syscallbuf_breakpoint(t)) { |
| return true; |
| } |
| |
| SignalDeterministic deterministic = is_deterministic_signal(t); |
| // The kernel might have forcibly unblocked the signal. Check whether it |
| // was blocked now, before we update our cached sigmask. |
| SignalBlocked signal_was_blocked = |
| t->is_sig_blocked(sig) ? SIG_BLOCKED : SIG_UNBLOCKED; |
| if (deterministic || sig == t->session().syscallbuf_desched_sig()) { |
| // Don't stash these signals; deliver them immediately. |
| // We don't want them to be reordered around other signals. |
| // invalidate_sigmask() must not be called before we reach handle_signal! |
| siginfo_t siginfo = t->get_siginfo(); |
| switch (handle_signal(t, &siginfo, deterministic, signal_was_blocked)) { |
| case SIGNAL_PTRACE_STOP: |
| // Emulated ptrace-stop. Don't run the task again yet. |
| last_task_switchable = ALLOW_SWITCH; |
| step_state->continue_type = DONT_CONTINUE; |
| return true; |
| case DEFER_SIGNAL: |
| ASSERT(t, false) << "Can't defer deterministic or internal signal " |
| << siginfo << " at ip " << t->ip(); |
| break; |
| case SIGNAL_HANDLED: |
| if (t->ptrace_event() == PTRACE_EVENT_SECCOMP) { |
| // `handle_desched_event` detected a spurious desched followed |
| // by a SECCOMP event, which it left pending. Handle that SECCOMP |
| // event now. |
| bool dummy_did_enter_syscall; |
| handle_ptrace_event(&t, step_state, nullptr, |
| &dummy_did_enter_syscall); |
| ASSERT(t, !dummy_did_enter_syscall); |
| } |
| if (t->ptrace_event() == PTRACE_EVENT_EXIT) { |
| // Tracee was nuked (probably SIGKILL) during desched processing. |
| return true; |
| } |
| break; |
| } |
| return false; |
| } |
| // Conservatively invalidate the sigmask in case just accepting a signal has |
| // sigmask effects. |
| t->invalidate_sigmask(); |
| if (sig == PerfCounters::TIME_SLICE_SIGNAL) { |
| if (t->next_pmc_interrupt_is_for_user) { |
| auto vpmc = |
| VirtualPerfCounterMonitor::interrupting_virtual_pmc_for_task(t); |
| ASSERT(t, vpmc); |
| |
| // Synthesize the requested signal. |
| vpmc->synthesize_signal(t); |
| |
| t->next_pmc_interrupt_is_for_user = false; |
| return true; |
| } |
| |
| auto& si = t->get_siginfo(); |
| /* This implementation will of course fall over if rr tries to |
| * record itself. |
| * |
| * NB: we can't check that the tick count is >= the programmed |
| * target, because this signal may have become pending before |
| * we reset the HPC counters. There may be a way to handle that |
| * more elegantly, but that bridge will be crossed in due time. |
| * |
| * We can't check that the fd matches t->hpc.ticks_fd() because this |
| * signal could have been queued quite a long time ago and the PerfCounters |
| * might have been stopped (and restarted!), perhaps even more than once, |
| * since the signal was queued, possibly changing its fd. We could check |
| * against all fds the PerfCounters have ever used, but that seems like |
| * overkill. |
| */ |
| ASSERT(t, |
| PerfCounters::TIME_SLICE_SIGNAL == si.si_signo && |
| (RecordTask::SYNTHETIC_TIME_SLICE_SI_CODE == si.si_code || |
| POLL_IN == si.si_code)) |
| << "Tracee is using SIGSTKFLT??? (code=" << si.si_code |
| << ", fd=" << si.si_fd << ")"; |
| } |
| t->stash_sig(); |
| return true; |
| } |
| |
| template <typename Arch> |
| static bool is_ptrace_any_sysemu_arch(int command) { |
| return command >= 0 && |
| (command == Arch::PTRACE_SYSEMU || |
| command == Arch::PTRACE_SYSEMU_SINGLESTEP); |
| } |
| |
| static bool is_ptrace_any_sysemu(SupportedArch arch, int command) { |
| RR_ARCH_FUNCTION(is_ptrace_any_sysemu_arch, arch, command); |
| } |
| |
| bool RecordSession::process_syscall_entry(RecordTask* t, StepState* step_state, |
| RecordResult* step_result, |
| SupportedArch syscall_arch) { |
| if (const RecordTask::StashedSignal* sig = t->stashed_sig_not_synthetic_SIGCHLD()) { |
| // The only four cases where we allow a stashed signal to be pending on |
| // syscall entry are: |
| // -- the signal is a ptrace-related signal, in which case if it's generated |
| // during a blocking syscall, it does not interrupt the syscall |
| // -- rrcall_notify_syscall_hook_exit, which is effectively a noop and |
| // lets us dispatch signals afterward |
| // -- when we're entering a blocking untraced syscall. If it really blocks, |
| // we'll get the desched-signal notification and dispatch our stashed |
| // signal. |
| // -- when we're doing a privileged syscall that's internal to the preload |
| // logic |
| // We do not generally want to have stashed signals pending when we enter |
| // a syscall, because that will execute with a hacked signal mask |
| // (see RecordTask::will_resume_execution) which could make things go wrong. |
| ASSERT(t, |
| t->desched_rec() || is_rrcall_notify_syscall_hook_exit_syscall( |
| t->regs().original_syscallno(), t->arch()) || |
| t->ip() == |
| t->vm() |
| ->privileged_traced_syscall_ip() |
| .increment_by_syscall_insn_length(t->arch())) |
| << "Stashed signal pending on syscall entry when it shouldn't be: " |
| << sig->siginfo << "; regs=" << t->regs() |
| << "; last_execution_resume=" << t->last_execution_resume() |
| << "; sig ip=" << sig->ip; |
| } |
| |
| // We just entered a syscall. |
| if (!maybe_restart_syscall(t)) { |
| if (syscall_seccomp_ordering_ == PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN && |
| t->seccomp_bpf_enabled) { |
| // We received a PTRACE_SYSCALL notification before the seccomp |
| // notification. Ignore it and continue to the seccomp notification. |
| syscall_seccomp_ordering_ = PTRACE_SYSCALL_BEFORE_SECCOMP; |
| step_state->continue_type = CONTINUE; |
| return true; |
| } |
| |
| // Don't ever patch a sigreturn syscall. These can't go through the syscallbuf. |
| if (!is_sigreturn(t->regs().original_syscallno(), t->arch())) { |
| if (t->vm()->monkeypatcher().try_patch_syscall(t)) { |
| // Syscall was patched. Emit event and continue execution. |
| t->record_event(Event::patch_syscall()); |
| return true; |
| } |
| if (!t->is_stopped()) { |
| // The task exited while we were trying to patch it. |
| // Make sure that this exit event gets processed. |
| step_state->continue_type = DONT_CONTINUE; |
| return false; |
| } |
| } |
| |
| t->push_event(SyscallEvent(t->regs().original_syscallno(), syscall_arch)); |
| } |
| |
| check_initial_task_syscalls(t, step_result); |
| note_entering_syscall(t); |
| if ((t->emulated_ptrace_cont_command == PTRACE_SYSCALL || |
| is_ptrace_any_sysemu(t->arch(), |
| t->emulated_ptrace_cont_command)) && |
| !is_in_privileged_syscall(t)) { |
| t->ev().Syscall().state = ENTERING_SYSCALL_PTRACE; |
| t->emulate_ptrace_stop(WaitStatus::for_syscall(t), SYSCALL_ENTRY_STOP); |
| t->record_current_event(); |
| |
| t->ev().Syscall().in_sysemu = is_ptrace_any_sysemu(t->arch(), |
| t->emulated_ptrace_cont_command); |
| } |
| return true; |
| } |
| |
| /** |
| * The execution of |t| has just been resumed, and it most likely has |
| * a new event that needs to be processed. Prepare that new event. |
| */ |
| void RecordSession::runnable_state_changed(RecordTask* t, StepState* step_state, |
| RecordResult* step_result, |
| bool can_consume_wait_status) { |
| switch (t->ev().type()) { |
| case EV_NOOP: |
| t->pop_noop(); |
| return; |
| case EV_INSTRUCTION_TRAP: |
| t->record_current_event(); |
| t->pop_event(t->ev().type()); |
| return; |
| case EV_SENTINEL: |
| case EV_SIGNAL_HANDLER: |
| case EV_SYSCALL_INTERRUPTION: { |
| if (!can_consume_wait_status) { |
| return; |
| } |
| |
| SupportedArch syscall_arch = t->detect_syscall_arch(); |
| t->canonicalize_regs(syscall_arch); |
| t->apply_syscall_entry_regs(); |
| process_syscall_entry(t, step_state, step_result, syscall_arch); |
| return; |
| } |
| |
| default: |
| return; |
| } |
| } |
| |
| bool RecordSession::prepare_to_inject_signal(RecordTask* t, |
| StepState* step_state) { |
| if (!done_initial_exec() || step_state->continue_type != CONTINUE) { |
| return false; |
| } |
| |
| union { |
| NativeArch::siginfo_t native_api; |
| siginfo_t linux_api; |
| } si; |
| const RecordTask::StashedSignal* sig; |
| |
| while (true) { |
| sig = t->peek_stashed_sig_to_deliver(); |
| if (!sig) { |
| return false; |
| } |
| si.linux_api = sig->siginfo; |
| if (si.linux_api.si_signo == get_ignore_sig()) { |
| LOG(debug) << "Declining to deliver " |
| << signal_name(si.linux_api.si_signo) << " by user request"; |
| t->pop_stash_sig(sig); |
| t->stashed_signal_processed(); |
| } else { |
| break; |
| } |
| } |
| |
| if (sig->deterministic == DETERMINISTIC_SIG && |
| sig->siginfo.si_signo == SIGSYS && |
| t->is_sig_blocked(sig->siginfo.si_signo) == SIG_BLOCKED) { |
| // Our synthesized deterministic SIGSYS (seccomp trap) needs to match the |
| // kernel behavior of unblocking the signal and resetting disposition to |
| // default. |
| (void)t->unblock_signal(SIGSYS); |
| t->set_sig_handler_default(SIGSYS); |
| } |
| switch (handle_signal(t, &si.linux_api, sig->deterministic, SIG_UNBLOCKED)) { |
| case SIGNAL_PTRACE_STOP: |
| // Emulated ptrace-stop. Don't run the task again yet. |
| last_task_switchable = ALLOW_SWITCH; |
| LOG(debug) << signal_name(si.linux_api.si_signo) |
| << ", emulating ptrace stop"; |
| break; |
| case DEFER_SIGNAL: |
| LOG(debug) << signal_name(si.linux_api.si_signo) << " deferred"; |
| // Leave signal on the stack and continue task execution. We'll try again |
| // later. |
| return false; |
| case SIGNAL_HANDLED: |
| LOG(debug) << signal_name(si.linux_api.si_signo) << " handled"; |
| // Signal is now a pending event on |t|'s event stack |
| |
| if (t->ev().type() == EV_SCHED) { |
| if (t->maybe_in_spinlock()) { |
| LOG(debug) << "Detected possible spinlock, forcing one round-robin"; |
| scheduler().schedule_one_round_robin(t); |
| } |
| // Allow switching after a SCHED. We'll flush the SCHED if and only |
| // if we really do a switch. |
| last_task_switchable = ALLOW_SWITCH; |
| } |
| break; |
| } |
| step_state->continue_type = DONT_CONTINUE; |
| t->pop_stash_sig(sig); |
| if (t->ev().type() != EV_SIGNAL) { |
| t->stashed_signal_processed(); |
| } |
| return true; |
| } |
| |
| static void inject_ld_helper_library(vector<string>& env, |
| string env_var, |
| string value) { |
| // Our preload lib should come first if possible, because that will speed up |
| // the loading of the other libraries; it's also a good idea to put our audit |
| // library at the head of the list, since there are only sixteen possible link |
| // namespaces on glibc and each audit library uses up one. |
| // |
| // We supply a placeholder which is then mutated to the correct filename in |
| // Monkeypatcher::patch_after_exec. |
| auto env_assignment = env_var + "="; |
| auto it = env.begin(); |
| for (; it != env.end(); ++it) { |
| if (it->find(env_assignment) != 0) { |
| continue; |
| } |
| // Honor old preloads too. This may cause |
| // problems, but only in those libs, and |
| // that's the user's problem. |
| value += ":"; |
| value += it->substr(it->find("=") + 1); |
| break; |
| } |
| value = env_assignment + value; |
| if (it == env.end()) { |
| env.push_back(value); |
| } else { |
| *it = value; |
| } |
| } |
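| |
| // For example (illustrative values): if env contains "LD_PRELOAD=libfoo.so", |
| // then |
| //   inject_ld_helper_library(env, "LD_PRELOAD", "librrpreload.so"); |
| // rewrites that entry to "LD_PRELOAD=librrpreload.so:libfoo.so"; if no |
| // LD_PRELOAD entry exists, "LD_PRELOAD=librrpreload.so" is appended. |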
| |
| void strip_outer_ld_preload(vector<string>& env) { |
| auto env_assignment = "LD_PRELOAD="; |
| auto it = env.begin(); |
| for (; it != env.end(); ++it) { |
| if (it->find(env_assignment) != 0) { |
| continue; |
| } |
| size_t colon_pos = it->find(":"); |
| if (colon_pos != string::npos) { |
| // If the preload library is loaded at all, it must be first |
| size_t preload_pos = it->find("librrpreload"); |
| if (preload_pos < colon_pos) { |
| string new_ld_preload = it->substr(++colon_pos); |
| *it = env_assignment + new_ld_preload; |
| return; |
| } else { |
| DEBUG_ASSERT(preload_pos == string::npos); |
| } |
| } |
| } |
| } |
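| |
| // For example (illustrative values): an entry |
| // "LD_PRELOAD=/outer/librrpreload.so:libfoo.so" becomes |
| // "LD_PRELOAD=libfoo.so", while an LD_PRELOAD value that doesn't start |
| // with our preload library is left untouched. |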
| |
| static const MemoryRange asan_shadow(remote_ptr<void>((uintptr_t)0x00007fff7000LL), |
| remote_ptr<void>((uintptr_t)0x10007fff8000LL)); |
| static const MemoryRange asan_allocator_reserved(remote_ptr<void>((uintptr_t)0x600000000000LL), |
| remote_ptr<void>((uintptr_t)0x640000002000LL)); |
| |
| // See https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp |
| static const MemoryRange tsan_shadow(remote_ptr<void>((uintptr_t)0x008000000000LL), |
| remote_ptr<void>((uintptr_t)0x550000000000LL)); |
| // The memory area 0x7b0000000000-0x7c0000002000 is reserved for TSAN's |
| // custom heap allocator --- applications end up using it, but *we* can't use |
| // it. |
| static const MemoryRange tsan_exclude(remote_ptr<void>((uintptr_t)0x568000000000LL), |
| remote_ptr<void>((uintptr_t)0x7e8000000000LL)); |
| // It's only 1TB because tsan can't fit more |
| static const MemoryRange tsan_fixed_global_exclusion_range(remote_ptr<void>((uintptr_t)0x7e8000000000LL), |
| remote_ptr<void>((uintptr_t)0x7f8000000000LL)); |
| |
| struct ExeInfo { |
| ExeInfo() : arch(NativeArch::arch()) {} |
| SupportedArch arch; |
| // Empty if anything fails |
| string sanitizer_path; |
| vector<MemoryRange> sanitizer_exclude_memory_ranges; |
| // If non-empty, use this as the global exclusion range. |
| MemoryRange fixed_global_exclusion_range; |
| |
| void setup_asan_memory_ranges() { |
| if (!check_sanitizer_arch()) { |
| return; |
| } |
| sanitizer_exclude_memory_ranges.push_back(asan_shadow); |
| sanitizer_exclude_memory_ranges.push_back(asan_allocator_reserved); |
| } |
| void setup_tsan_memory_ranges() { |
| if (!check_sanitizer_arch()) { |
| return; |
| } |
| sanitizer_exclude_memory_ranges.push_back(tsan_shadow); |
| sanitizer_exclude_memory_ranges.push_back(tsan_exclude); |
| fixed_global_exclusion_range = tsan_fixed_global_exclusion_range; |
| } |
| private: |
| bool check_sanitizer_arch() { |
| switch (arch) { |
| case x86_64: |
| return true; |
| default: |
| // We don't know this sanitizer's memory layout on this architecture, so |
| // exclude all ranges, which disables mmap randomization when chaos mode |
| // is active. |
| sanitizer_exclude_memory_ranges.push_back(MemoryRange::all()); |
| return false; |
| } |
| } |
| }; |
| |
| static ExeInfo read_exe_info(const string& exe_file) { |
| ExeInfo ret; |
| ScopedFd fd(exe_file.c_str(), O_RDONLY); |
| if (!fd.is_open()) { |
| return ret; |
| } |
| ElfFileReader reader(fd); |
| ret.arch = reader.arch(); |
| |
| DynamicSection dynamic = reader.read_dynamic(); |
| for (auto& entry : dynamic.entries) { |
| if (entry.tag == DT_NEEDED && entry.val < dynamic.strtab.size()) { |
| const char* name = &dynamic.strtab[entry.val]; |
| if (!strncmp(name, "libasan", 7)) { |
| ret.sanitizer_path = string(name); |
| ret.setup_asan_memory_ranges(); |
| } else if (!strncmp(name, "libtsan", 7)) { |
| ret.sanitizer_path = string(name); |
| ret.setup_tsan_memory_ranges(); |
| } |
| } |
| } |
| |
| auto syms = reader.read_symbols(".dynsym", ".dynstr"); |
| for (size_t i = 0; i < syms.size(); ++i) { |
| if (syms.is_name(i, "__asan_init")) { |
| ret.setup_asan_memory_ranges(); |
| } else if (syms.is_name(i, "__tsan_init")) { |
| ret.setup_tsan_memory_ranges(); |
| } |
| } |
| |
| return ret; |
| } |
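| |
| // For example, a binary whose ASAN runtime is dynamically linked carries a |
| // DT_NEEDED entry such as "libasan.so.8" (version suffix illustrative), |
| // which the prefix test above matches; a statically linked sanitizer |
| // runtime typically still exports "__asan_init" in .dynsym, which the |
| // symbol scan catches instead. |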
| |
| static string lookup_by_path(const string& name) { |
| if (name.find('/') != string::npos) { |
| return name; |
| } |
| const char* env = getenv("PATH"); |
| if (!env) { |
| return name; |
| } |
| char* p = strdup(env); |
| char* s = p; |
| while (*s) { |
| char* next = strchr(s, ':'); |
| if (next) { |
| *next = 0; |
| } |
| string file = string(s) + "/" + name; |
| struct stat st; |
| if (!stat(file.c_str(), &st) && S_ISREG(st.st_mode) && |
| !access(file.c_str(), X_OK)) { |
| free(p); |
| return file; |
| } |
| if (!next) { |
| break; |
| } |
| s = next + 1; |
| } |
| free(p); |
| return name; |
| } |
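| |
| // For example, with PATH="/usr/local/bin:/usr/bin", lookup_by_path("ls") |
| // probes /usr/local/bin/ls and then /usr/bin/ls, returning the first |
| // regular, executable match; a name containing '/' is returned unchanged. |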
| |
| /*static*/ RecordSession::shr_ptr RecordSession::create( |
| const vector<string>& argv, const vector<string>& extra_env, |
| const DisableCPUIDFeatures& disable_cpuid_features, |
| SyscallBuffering syscallbuf, |
| unsigned char syscallbuf_desched_sig, |
| BindCPU bind_cpu, |
| const string& output_trace_dir, |
| const TraceUuid* trace_id, |
| bool use_audit, |
| bool unmap_vdso, |
| bool force_asan_active, |
| bool force_tsan_active) { |
| TraceeAttentionSet::initialize(); |
| |
| // The syscallbuf library interposes some critical |
| // external symbols like XShmQueryExtension(), so we |
| // preload it whether or not syscallbuf is enabled. Indicate here whether |
| // syscallbuf is enabled. |
| if (syscallbuf == DISABLE_SYSCALL_BUF) { |
| unsetenv(SYSCALLBUF_ENABLED_ENV_VAR); |
| } else { |
| setenv(SYSCALLBUF_ENABLED_ENV_VAR, "1", 1); |
| |
| if (!has_effective_caps(uint64_t(1) << CAP_SYS_ADMIN) && |
| !has_effective_caps(uint64_t(1) << CAP_PERFMON)) { |
| ScopedFd fd("/proc/sys/kernel/perf_event_paranoid", O_RDONLY); |
| if (fd.is_open()) { |
| char buf[100]; |
| ssize_t size = read(fd, buf, sizeof(buf) - 1); |
| if (size >= 0) { |
| buf[size] = 0; |
| int val = atoi(buf); |
| if (val > 1) { |
| fprintf(stderr, |
| "rr needs /proc/sys/kernel/perf_event_paranoid <= 1, but it is %d.\n" |
| "Change it to 1, or use 'rr record -n' (slow).\n" |
| "Consider putting 'kernel.perf_event_paranoid = 1' in /etc/sysctl.d/10-rr.conf.\n" |
| "See 'man 8 sysctl', 'man 5 sysctl.d' (systemd systems)\n" |
| "and 'man 5 sysctl.conf' (non-systemd systems) for more details.\n", |
| val); |
| exit(1); |
| } |
| } |
| } |
| } |
| } |
| |
| vector<string> env = current_env(); |
| |
| // Have extra_env override anything already in the environment |
| for (string extra : extra_env) { |
| string extra_var = extra.substr(0, extra.find('=')); |
| auto it = env.begin(); |
| for (; it != env.end(); ++it) { |
| if (it->find(extra_var) != 0) { |
| continue; |
| } |
| it = env.erase(it); |
| break; |
| } |
| } |
| env.insert(env.end(), extra_env.begin(), extra_env.end()); |
| |
| string full_path = lookup_by_path(argv[0]); |
| struct stat st; |
| if (stat(full_path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)) { |
| CLEAN_FATAL() << "Provided tracee '" << argv[0] << "' is a directory, not an executable"; |
| } |
| ExeInfo exe_info = read_exe_info(full_path); |
| if (exe_info.sanitizer_exclude_memory_ranges.empty()) { |
| if (force_asan_active) { |
| exe_info.setup_asan_memory_ranges(); |
| } else if (force_tsan_active) { |
| exe_info.setup_tsan_memory_ranges(); |
| } |
| } |
| |
| // Strip any LD_PRELOAD that an outer rr may have inserted |
| strip_outer_ld_preload(env); |
| |
| // LD_PRELOAD the syscall interception lib |
| string syscall_buffer_lib_path = find_helper_library(SYSCALLBUF_LIB_FILENAME); |
| if (!syscall_buffer_lib_path.empty()) { |
| string ld_preload = ""; |
| if (!exe_info.sanitizer_path.empty()) { |
| LOG(debug) << "Prepending " << exe_info.sanitizer_path << " to LD_PRELOAD"; |
| // Put an LD_PRELOAD entry for it before our preload library, because |
| // it checks that it's loaded first |
| ld_preload += exe_info.sanitizer_path + ":"; |
| } |
| ld_preload += syscall_buffer_lib_path + SYSCALLBUF_LIB_FILENAME_PADDED; |
| // When librrpreload is built against glibc 2.34 but runs in a process linking pre-2.34 glibc, |
| // its call to dlsym needs to search libdl before libc. When librrpreload found dlsym |
| // in libc at link time, pre-2.34 ld.so throws a fatal error if it searches for dlsym in libc and |
| // can't find it. |
| ld_preload += ":libdl.so.2"; |
| inject_ld_helper_library(env, "LD_PRELOAD", ld_preload); |
| } |
| |
| if (use_audit) { |
| string rtld_audit_lib_path = find_helper_library(RTLDAUDIT_LIB_FILENAME); |
| if (!rtld_audit_lib_path.empty()) { |
| string ld_audit = rtld_audit_lib_path + RTLDAUDIT_LIB_FILENAME_PADDED; |
| inject_ld_helper_library(env, "LD_AUDIT", ld_audit); |
| } |
| } |
| |
| env.push_back("RUNNING_UNDER_RR=1"); |
| // Stop Mesa using the GPU |
| env.push_back("LIBGL_ALWAYS_SOFTWARE=1"); |
| env.push_back("GBM_ALWAYS_SOFTWARE=1"); |
| env.push_back("SDL_RENDER_DRIVER=software"); |
| // Stop sssd from using shared-memory with its daemon |
| env.push_back("SSS_NSS_USE_MEMCACHE=NO"); |
| |
| // Disable Gecko's "wait for gdb to attach on process crash" behavior, since |
| // it is useless when running under rr. |
| env.push_back("MOZ_GDB_SLEEP=0"); |
| |
| // Avoid GVFS using separate processes that might run |
| // outside the recording but share memory mapped files. |
| env.push_back("GIO_USE_VFS=local"); |
| |
| // If we have CPUID faulting, don't use these environment hacks. We don't |
| // need them and the user might want to use them themselves for other reasons. |
| if (!Session::has_cpuid_faulting()) { |
| // OpenSSL uses RDRAND, but we can disable it. These bitmasks are inverted |
| // and ANDed with the results of CPUID. The number below is 2^62, which is the |
| // bit for RDRAND support. |
| env.push_back("OPENSSL_ia32cap=~4611686018427387904:0"); |
| // Disable Qt's use of RDRAND/RDSEED/RTM |
| env.push_back("QT_NO_CPU_FEATURE=rdrnd rdseed rtm"); |
| // Disable systemd's use of RDRAND |
| env.push_back("SYSTEMD_RDRAND=0"); |
| } |
| |
| shr_ptr session( |
| new RecordSession(full_path, argv, env, disable_cpuid_features, |
| syscallbuf, syscallbuf_desched_sig, bind_cpu, |
| output_trace_dir, trace_id, use_audit, unmap_vdso)); |
| session->excluded_ranges_ = std::move(exe_info.sanitizer_exclude_memory_ranges); |
| session->fixed_global_exclusion_range_ = std::move(exe_info.fixed_global_exclusion_range); |
| return session; |
| } |
| |
| RecordSession::RecordSession(const std::string& exe_path, |
| const std::vector<std::string>& argv, |
| const std::vector<std::string>& envp, |
| const DisableCPUIDFeatures& disable_cpuid_features, |
| SyscallBuffering syscallbuf, |
| int syscallbuf_desched_sig, |
| BindCPU bind_cpu, |
| const string& output_trace_dir, |
| const TraceUuid* trace_id, |
| bool use_audit, |
| bool unmap_vdso) |
| : trace_out(argv[0], output_trace_dir, ticks_semantics_), |
| scheduler_(*this), |
| trace_id(trace_id), |
| disable_cpuid_features_(disable_cpuid_features), |
| ignore_sig(0), |
| continue_through_sig(0), |
| last_task_switchable(PREVENT_SWITCH), |
| syscall_buffer_size_(1024 * 1024), |
| syscallbuf_desched_sig_(syscallbuf_desched_sig), |
| use_syscall_buffer_(syscallbuf == ENABLE_SYSCALL_BUF), |
| use_file_cloning_(true), |
| use_read_cloning_(true), |
| enable_chaos_(false), |
| wait_for_all_(false), |
| use_audit_(use_audit), |
| unmap_vdso_(unmap_vdso) { |
| if (!has_cpuid_faulting() && |
| disable_cpuid_features.any_features_disabled()) { |
| FATAL() << "CPUID faulting required to disable CPUID features"; |
| } |
| |
| if (rr::syscall_number_for_rrcall_init_preload(x86_64) != RR_CALL_BASE) { |
| FATAL() << "RR_CALL_BASE is incorrect"; |
| } |
| |
| trace_out.set_bound_cpu(choose_cpu(bind_cpu, cpu_lock)); |
| do_bind_cpu(); |
| ScopedFd error_fd = create_spawn_task_error_pipe(); |
| RecordTask* t = static_cast<RecordTask*>( |
| Task::spawn(*this, error_fd, &tracee_socket_fd(), |
| &tracee_socket_receiver_fd(), |
| &tracee_socket_fd_number, |
| exe_path, argv, envp)); |
| |
| if (NativeArch::is_x86ish()) { |
| // CPU affinity has been set. |
| trace_out.setup_cpuid_records(has_cpuid_faulting(), disable_cpuid_features_); |
| if (cpu_has_xsave_fip_fdp_quirk()) { |
| trace_out.set_xsave_fip_fdp_quirk(true); |
| // Clear FIP/FDP on every event to reduce the probability of this quirk |
| // causing divergence, especially when porting traces to Intel machines |
| trace_out.set_clear_fip_fdp(true); |
| } |
| if (cpu_has_fdp_exception_only_quirk()) { |
| trace_out.set_fdp_exception_only_quirk(true); |
| } |
| } |
| |
| initial_thread_group = t->thread_group(); |
| on_create(t); |
| } |
| |
| RecordSession::RecordResult RecordSession::record_step() { |
| RecordResult result; |
| |
| if (task_map.empty()) { |
| result.status = STEP_EXITED; |
| result.exit_status = initial_thread_group->exit_status; |
| return result; |
| } |
| |
| if (!wait_for_all_ && initial_thread_group->task_set().empty()) { |
| // SIGKILL any tasks we haven't already killed. |
| terminate_tracees(); |
| } |
| |
| result.status = STEP_CONTINUE; |
| |
| TaskUid prev_task_tuid; |
| if (scheduler().current()) { |
| prev_task_tuid = scheduler().current()->tuid(); |
| } |
| auto rescheduled = scheduler().reschedule(last_task_switchable); |
| if (rescheduled.interrupted_by_signal) { |
| // The scheduler was waiting for some task to become active, but was |
| // interrupted by a signal. Yield to our caller now to give the caller |
| // a chance to do something triggered by the signal |
| // (e.g. terminate the recording). |
| return result; |
| } |
| RecordTask* t = scheduler().current(); |
| if (!t) { |
| // No child to schedule. Yield to our caller to give it a chance |
| // to do something (e.g. terminate the recording). |
| return result; |
| } |
| // If the task has been reaped prematurely then it's not running |
| // and we can't get registers etc, so minimize what we do between here |
| // and handle_ptrace_exit_event(). |
| if (t->waiting_for_reap) { |
| // Give it another chance to be reaped |
| t->did_reach_zombie(); |
| return result; |
| } |
| RecordTask* prev_task = find_task(prev_task_tuid); |
| if (prev_task && prev_task->ev().type() == EV_SCHED) { |
| if (prev_task != t) { |
| // We did do a context switch, so record the SCHED event. Otherwise |
| // we'll just discard it. |
| prev_task->record_current_event(); |
| } |
| prev_task->pop_event(EV_SCHED); |
| } |
| |
| // Have to disable context-switching until we know it's safe |
| // to allow switching the context. |
| last_task_switchable = PREVENT_SWITCH; |
| |
| LOG(debug) << "trace time " << t->trace_time() << ": Active task is " |
| << t->tid << ". Events:"; |
| if (IS_LOGGING(debug)) { |
| t->log_pending_events(); |
| } |
| |
| if (handle_ptrace_exit_event(t)) { |
| // t may have been deleted. |
| last_task_switchable = ALLOW_SWITCH; |
| return result; |
| } |
| |
| if (rescheduled.started_new_timeslice) { |
| t->registers_at_start_of_last_timeslice = t->regs(); |
| t->time_at_start_of_last_timeslice = trace_writer().time(); |
| } |
| |
| StepState step_state(CONTINUE); |
| |
| ASSERT(t, t->is_stopped()) << "Somehow we're not stopped here; status=" |
| << t->status(); |
| bool did_enter_syscall; |
| if (rescheduled.by_waitpid && |
| handle_ptrace_event(&t, &step_state, &result, &did_enter_syscall)) { |
| if (result.status != STEP_CONTINUE || |
| step_state.continue_type == DONT_CONTINUE) { |
| last_task_switchable = ALLOW_SWITCH; |
| return result; |
| } |
| |
| if (did_enter_syscall && t->ev().type() == EV_SYSCALL) { |
| syscall_state_changed(t, &step_state); |
| } |
| } else { |
| ASSERT(t, t->is_stopped()) << "handle_ptrace_event left us in a not-stopped state"; |
| if (rescheduled.by_waitpid && handle_signal_event(t, &step_state)) { |
| // Tracee may have exited while processing descheds; handle that. |
| if (handle_ptrace_exit_event(t)) { |
| // t may have been deleted. |
| last_task_switchable = ALLOW_SWITCH; |
| return result; |
| } |
| } else { |
| ASSERT(t, t->is_stopped()) << "handle_signal_event left us in a not-stopped state"; |
| runnable_state_changed(t, &step_state, &result, rescheduled.by_waitpid); |
| |
| if (result.status != STEP_CONTINUE || |
| step_state.continue_type == DONT_CONTINUE) { |
| return result; |
| } |
| |
| switch (t->ev().type()) { |
| case EV_DESCHED: |
| desched_state_changed(t); |
| break; |
| case EV_SYSCALL: |
| syscall_state_changed(t, &step_state); |
| break; |
| case EV_SIGNAL: |
| case EV_SIGNAL_DELIVERY: |
| if (signal_state_changed(t, &step_state)) { |
| // t may have been deleted |
| return result; |
| } |
| break; |
| default: |
| break; |
| } |
| } |
| } |
| |
| t->verify_signal_states(); |
| |
| // We try to inject a signal if there's one pending; otherwise we continue |
| // task execution. |
| if (!prepare_to_inject_signal(t, &step_state) && |
| step_state.continue_type != DONT_CONTINUE) { |
| // Ensure that we aren't allowing switches away from a running task. |
| // Only tasks blocked in a syscall can be switched away from, otherwise |
| // we have races. |
| ASSERT(t, |
| last_task_switchable == PREVENT_SWITCH || |
| t->may_be_blocked()); |
| |
| debug_exec_state("EXEC_START", t); |
| |
| task_continue(step_state); |
| } |
| |
| return result; |
| } |
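| |
| // For illustration only, a minimal driver loop over record_step(); a |
| // sketch, not rr's real frontend (RecordCommand), which also installs |
| // signal handlers and flushes buffers on termination: |
| // |
| //   RecordSession::shr_ptr session = RecordSession::create(...); |
| //   while (true) { |
| //     auto result = session->record_step(); |
| //     if (result.status == RecordSession::STEP_EXITED) { |
| //       break; // result.exit_status holds the initial group's status |
| //     } |
| //     if (result.status == RecordSession::STEP_SPAWN_FAILED) { |
| //       fprintf(stderr, "%s\n", result.failure_message.c_str()); |
| //       break; |
| //     } |
| //     // STEP_CONTINUE: keep going; a good point to poll for SIGINT etc. |
| //   } |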
| |
| void RecordSession::terminate_tracees() { |
| for (auto& v : task_map) { |
| RecordTask* t = static_cast<RecordTask*>(v.second); |
| if (!t->detached_proxy && !t->sent_shutdown_kill) { |
| LOG(debug) << "Terminating tracee " << t->tid; |
| ::kill(t->rec_tid, SIGKILL); |
| t->sent_shutdown_kill = true; |
| t->emulate_SIGCONT(); |
| } |
| } |
| } |
| |
| void RecordSession::forward_SIGTERM() { |
| if (!initial_thread_group->task_set().empty()) { |
| kill(initial_thread_group->tgid, SIGTERM); |
| } |
| } |
| |
| void RecordSession::term_detached_tasks() { |
| // Send SIGTERM to all detached child tasks first, so they may clean up |
| // in parallel. |
| for (auto& v : task_map) { |
| RecordTask* t = static_cast<RecordTask*>(v.second); |
| if (!t->detached_proxy) { |
| continue; |
| } |
| ::kill(t->rec_tid, SIGTERM); |
| } |
| for (auto it = task_map.begin(); it != task_map.end(); ) { |
| RecordTask* t = static_cast<RecordTask*>(it->second); |
| if (!t->detached_proxy) { |
| ++it; |
| continue; |
| } |
| WaitResult result = WaitManager::wait_exit(WaitOptions(t->rec_tid)); |
| if (result.code != WAIT_OK) { |
| LOG(warn) << "Wait failed"; |
| } else if (result.status.type() != WaitStatus::EXIT) { |
| LOG(warn) << "Unexpected wait status " << result.status << |
| " while waiting for detached child " << t->rec_tid; |
| } |
| ++it; |
| delete t; |
| } |
| } |
| |
| void RecordSession::close_trace_writer(TraceWriter::CloseStatus status) { |
| trace_out.close(status, trace_id.get()); |
| } |
| |
| Task* RecordSession::new_task(pid_t tid, pid_t, uint32_t serial, |
| SupportedArch a, const std::string&) { |
| return new RecordTask(*this, tid, serial, a); |
| } |
| |
| void RecordSession::on_create(Task* t) { |
| Session::on_create(t); |
| scheduler().on_create(static_cast<RecordTask*>(t)); |
| } |
| |
| void RecordSession::on_destroy(Task* t) { |
| RecordTask *rt = static_cast<RecordTask*>(t); |
| scheduler().on_destroy(rt); |
| if (rt->detached_proxy) { |
| detached_task_map.erase(rt->tid); |
| } |
| Session::on_destroy(t); |
| } |
| |
| RecordTask* RecordSession::find_task(pid_t rec_tid) const { |
| return static_cast<RecordTask*>(Session::find_task(rec_tid)); |
| } |
| |
| RecordTask* RecordSession::find_task(const TaskUid& tuid) const { |
| return static_cast<RecordTask*>(Session::find_task(tuid)); |
| } |
| |
| RecordTask* RecordSession::find_detached_proxy_task(pid_t proxy_tid) const { |
| auto it = detached_task_map.find(proxy_tid); |
| return detached_task_map.end() != it ? it->second : nullptr; |
| } |
| |
| void RecordSession::on_proxy_detach(RecordTask *t, pid_t new_tid) { |
| Session::on_destroy(t); |
| task_map[new_tid] = t; |
| detached_task_map[t->tid] = t; |
| } |
| |
| uint64_t RecordSession::rr_signal_mask() const { |
| return signal_bit(PerfCounters::TIME_SLICE_SIGNAL) | |
| signal_bit(syscallbuf_desched_sig_); |
| } |
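| |
| // Illustration (assuming signal_bit(sig) is uint64_t(1) << (sig - 1)): |
| // with TIME_SLICE_SIGNAL == SIGSTKFLT (16) and the default desched signal |
| // SIGPWR (30), this returns (1 << 15) | (1 << 29), the two signals rr |
| // reserves for itself. |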
| |
| static const uint32_t CPUID_RDRAND_FLAG = 1 << 30; |
| static const uint32_t CPUID_RTM_FLAG = 1 << 11; |
| static const uint32_t CPUID_RDSEED_FLAG = 1 << 18; |
| static const uint32_t CPUID_XSAVEOPT_FLAG = 1 << 0; |
| |
| void DisableCPUIDFeatures::amend_cpuid_data(uint32_t eax_in, uint32_t ecx_in, |
| CPUIDData* cpuid_data) const { |
| switch (eax_in) { |
| case CPUID_GETFEATURES: |
| cpuid_data->ecx &= ~(CPUID_RDRAND_FLAG | features_ecx); |
| cpuid_data->edx &= ~features_edx; |
| break; |
| case CPUID_GETEXTENDEDFEATURES: |
| if (ecx_in == 0) { |
| cpuid_data->ebx &= ~(CPUID_RDSEED_FLAG | CPUID_RTM_FLAG |
| | extended_features_ebx); |
| cpuid_data->ecx &= ~extended_features_ecx; |
| cpuid_data->edx &= ~extended_features_edx; |
| } |
| break; |
| case CPUID_GETXSAVE: |
| if (ecx_in == 1) { |
| // Always disable XSAVEOPT because it's nondeterministic, |
| // possibly depending on context switching behavior. Intel |
| // recommends not using it from user space. |
| cpuid_data->eax &= ~(CPUID_XSAVEOPT_FLAG | xsave_features_eax); |
| } |
| break; |
| default: |
| break; |
| } |
| } |
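| |
| // For example, with no extra features disabled: a tracee executing CPUID |
| // with EAX=1 sees ECX bit 30 (RDRAND) cleared, and with EAX=7, ECX=0 sees |
| // EBX bits 11 (RTM) and 18 (RDSEED) cleared, so well-behaved code falls |
| // back to deterministic paths instead of using those instructions. |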
| |
| } // namespace rr |