| /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ |
| |
| #include <arpa/inet.h> |
| #include <dirent.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <limits.h> |
| #include <linux/auxvec.h> |
| #include <linux/capability.h> |
| #include <linux/cdrom.h> |
| #include <linux/elf.h> |
| #include <linux/ethtool.h> |
| #include <linux/fb.h> |
| #include <linux/fiemap.h> |
| #include <linux/fs.h> |
| #include <linux/futex.h> |
| #include <linux/hidraw.h> |
| #include <linux/if.h> |
| #include <linux/if_bridge.h> |
| #include <linux/if_packet.h> |
| #include <linux/if_tun.h> |
| #include <linux/input.h> |
| #include <linux/ipc.h> |
| #include <linux/joystick.h> |
| #include <linux/kd.h> |
| #include <linux/msdos_fs.h> |
| #include <linux/msg.h> |
| #include <linux/net.h> |
| #include <linux/netlink.h> |
| #include <linux/perf_event.h> |
| #include <linux/personality.h> |
| #include <linux/prctl.h> |
| #include <linux/random.h> |
| #include <linux/seccomp.h> |
| #include <linux/sem.h> |
| #include <linux/shm.h> |
| #include <linux/sockios.h> |
| #include <linux/videodev2.h> |
| #include <linux/vt.h> |
| #include <linux/wireless.h> |
| #include <poll.h> |
| #include <sched.h> |
| #include <scsi/sg.h> |
| #include <sound/asound.h> |
| #include <sys/epoll.h> |
| #include <sys/ioctl.h> |
| #include <sys/mman.h> |
| #include <sys/quota.h> |
| #include <sys/resource.h> |
| #include <sys/socket.h> |
| #include <sys/syscall.h> |
| #include <sys/sysinfo.h> |
| #include <sys/time.h> |
| #include <sys/times.h> |
| #include <sys/un.h> |
| #include <sys/utsname.h> |
| #include <sys/vfs.h> |
| #include <sys/wait.h> |
| #include <sys/xattr.h> |
| #include <termios.h> |
| |
| #include <limits> |
| #include <sstream> |
| #include <utility> |
| #include <unordered_set> |
| |
| #include <rr/rr.h> |
| |
| #include "record_syscall.h" |
| |
| #include "preload/preload_interface.h" |
| |
| #include "AutoRemoteSyscalls.h" |
| #include "BpfMapMonitor.h" |
| #include "DiversionSession.h" |
| #include "ElfReader.h" |
| #include "FileMonitor.h" |
| #include "Flags.h" |
| #include "MmappedFileMonitor.h" |
| #include "NonvirtualPerfCounterMonitor.h" |
| #include "ODirectFileMonitor.h" |
| #include "ProcFdDirMonitor.h" |
| #include "ProcMemMonitor.h" |
| #include "ProcStatMonitor.h" |
| #include "RRPageMonitor.h" |
| #include "RecordSession.h" |
| #include "RecordTask.h" |
| #include "Scheduler.h" |
| #include "StdioMonitor.h" |
| #include "SysCpuMonitor.h" |
| #include "TraceStream.h" |
| #include "VirtualPerfCounterMonitor.h" |
| #include "cpp_supplement.h" |
| #include "ftrace.h" |
| #include "kernel_abi.h" |
| #include "kernel_metadata.h" |
| #include "kernel_supplement.h" |
| #include "log.h" |
| #include "util.h" |
| |
| using namespace std; |
| |
| namespace rr { |
| |
| union _semun { |
| int val; |
| struct semid64_ds* buf; |
| unsigned short int* array; |
| struct seminfo* __buf; |
| }; |
| |
| /* We can't include <sys/shm.h> to get shmctl because it clashes with |
| * linux/shm.h. |
| */ |
| static int _shmctl(int shmid, int cmd, shmid64_ds* buf) { |
| #ifdef SYS_shmctl |
| int ret = syscall(SYS_shmctl, shmid, cmd, buf); |
| if (ret >= 0 || errno != ENOSYS) { |
| return ret; |
| } |
| #endif |
| #ifdef SYS_ipc |
| if (sizeof(void*) == 4) { |
| cmd |= IPC_64; |
| } |
| return syscall(SYS_ipc, SHMCTL, shmid, cmd, 0, buf); |
| #else |
| return ret; |
| #endif |
| } |
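| /* Likewise for semctl: prefer the dedicated syscall, falling back to the |
| * legacy multiplexed SYS_ipc entry point on ABIs that only provide it. */ |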
| static int _semctl(int semid, int semnum, int cmd, _semun un_arg) { |
| #ifdef SYS_semctl |
| int ret = syscall(SYS_semctl, semid, semnum, cmd, un_arg); |
| if (ret >= 0 || errno != ENOSYS) { |
| return ret; |
| } |
| #endif |
| #ifdef SYS_ipc |
| if (sizeof(void*) == 4) { |
| cmd |= IPC_64; |
| } |
| return syscall(SYS_ipc, SEMCTL, semid, semnum, cmd, &un_arg); |
| #else |
| return ret; |
| #endif |
| } |
| |
| /** |
| * Modes used to register a syscall memory parameter with TaskSyscallState. |
| */ |
| enum ArgMode { |
| // Syscall memory parameter is an in-parameter only. |
| // This is only important when we want to move the buffer to scratch memory |
| // so we can modify it without making the modifications potentially visible |
| // to user code. Otherwise, such parameters can be ignored. |
| IN, |
| // Syscall memory parameter is an out-parameter only. |
| OUT, |
| // Syscall memory parameter is an in-out parameter. |
| IN_OUT, |
| // Syscall memory parameter is an in-out parameter but we must not use |
| // scratch (e.g. for futexes, we must use the actual memory word). |
| IN_OUT_NO_SCRATCH |
| }; |
| |
| /** |
| * Specifies how to determine the size of a syscall memory |
| * parameter. There is usually an incoming size determined before the syscall |
| * executes (which we need in order to allocate scratch memory), combined |
| * with an optional final size taken from the syscall result or a specific |
| * memory location after the syscall has executed. The minimum of the incoming |
| * and final sizes is used, if both are present. |
| */ |
| struct ParamSize { |
| ParamSize() : incoming_size(size_t(-1)), from_syscall_multiplier(0) {} |
| // Clamp incoming_size to INTPTR_MAX. No system call can read more data |
| // than that in practice (to a single output parameter). |
| ParamSize(size_t incoming_size) |
| : incoming_size(min<size_t>(INTPTR_MAX, incoming_size)), |
| from_syscall_multiplier(0) {} |
| /** |
| * p points to a tracee location that is already initialized with a |
| * "maximum buffer size" passed in by the tracee, and which will be filled |
| * in with the size of the data by the kernel when the syscall exits. |
| */ |
| template <typename T> |
| static ParamSize from_initialized_mem(RecordTask* t, remote_ptr<T> p) { |
| ParamSize r(p.is_null() ? size_t(0) : size_t(t->read_mem(p))); |
| r.mem_ptr = p; |
| r.read_size = sizeof(T); |
| return r; |
| } |
| /** |
| * p points to a tracee location which will be filled in with the size of |
| * the data by the kernel when the syscall exits, but the location |
| * is uninitialized before the syscall. |
| */ |
| template <typename T> static ParamSize from_mem(remote_ptr<T> p) { |
| ParamSize r; |
| r.mem_ptr = p; |
| r.read_size = sizeof(T); |
| return r; |
| } |
| /** |
| * When the syscall exits, the syscall result will be of type T and contain |
| * the size of the data (scaled by 'multiplier', if given). 'incoming_size', |
| * if present, is a bound on the size of the data. |
| */ |
| template <typename T> static ParamSize from_syscall_result() { |
| ParamSize r; |
| r.from_syscall_multiplier = 1; |
| r.read_size = sizeof(T); |
| return r; |
| } |
| template <typename T> |
| static ParamSize from_syscall_result(size_t incoming_size, uint32_t multiplier = 1) { |
| ParamSize r(incoming_size); |
| r.from_syscall_multiplier = multiplier; |
| r.read_size = sizeof(T); |
| return r; |
| } |
| /** |
| * Indicate that the size will be at most 'max'. |
| */ |
| ParamSize limit_size(size_t max) const { |
| ParamSize r(*this); |
| r.incoming_size = min(r.incoming_size, max); |
| return r; |
| } |
| |
| /** |
| * Return true if 'other' takes its dynamic size from the same source as |
| * this. |
| * When multiple syscall memory parameters take their dynamic size from the |
| * same source, the source size is distributed among them, with the first |
| * registered parameter taking up to its max_size bytes, followed by the next, |
| * etc. This lets us efficiently record iovec buffers. |
| */ |
| bool is_same_source(const ParamSize& other) const { |
| return ((!mem_ptr.is_null() && other.mem_ptr == mem_ptr) || |
| (from_syscall_multiplier && other.from_syscall_multiplier)) && |
| (read_size == other.read_size); |
| } |
| /** |
| * Compute the actual size after the syscall has executed. |
| * 'already_consumed' bytes are subtracted from the syscall-result/ |
| * memory-location part of the size. |
| */ |
| size_t eval(RecordTask* t, size_t already_consumed) const; |
| |
| size_t incoming_size; |
| /** If non-null, the size is limited by the value at this location after |
| * the syscall. */ |
| remote_ptr<void> mem_ptr; |
| /** Size of the value at mem_ptr or in the syscall result register. */ |
| size_t read_size; |
| /** If from_syscall_multiplier > 0, the size is limited by the value of |
| * the syscall result * from_syscall_multiplier. */ |
| uint32_t from_syscall_multiplier; |
| }; |
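| |
| /* Illustrative sketch only (compare the SYS_RECV handling below): for a |
| * read(2)-style syscall whose output buffer is in arg 2 and whose capacity |
| * is in arg 3, the buffer would be registered roughly as |
| * |
| * syscall_state.reg_parameter( |
| * 2, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| * size_t(t->regs().arg3()))); |
| * |
| * The incoming size bounds the scratch allocation, and the syscall result |
| * then clamps how many bytes are written back and recorded. */ |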
| |
| size_t ParamSize::eval(RecordTask* t, size_t already_consumed) const { |
| size_t s = incoming_size; |
| if (!mem_ptr.is_null()) { |
| size_t mem_size; |
| switch (read_size) { |
| case 4: |
| mem_size = t->read_mem(mem_ptr.cast<uint32_t>()); |
| break; |
| case 8: |
| mem_size = t->read_mem(mem_ptr.cast<uint64_t>()); |
| break; |
| default: |
| ASSERT(t, false) << "Unknown read_size"; |
| return 0; |
| } |
| ASSERT(t, already_consumed <= mem_size); |
| s = min(s, mem_size - already_consumed); |
| } |
| if (from_syscall_multiplier) { |
| size_t syscall_size = max<ssize_t>(0, t->regs().syscall_result_signed()) |
| * from_syscall_multiplier; |
| switch (read_size) { |
| case 4: |
| syscall_size = uint32_t(syscall_size); |
| break; |
| case 8: |
| syscall_size = uint64_t(syscall_size); |
| break; |
| default: |
| ASSERT(t, false) << "Unknown read_size"; |
| return 0; |
| } |
| ASSERT(t, already_consumed <= syscall_size); |
| s = min(s, syscall_size - already_consumed); |
| } |
| ASSERT(t, s < size_t(-1)); |
| return s; |
| } |
| |
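| /* Optional hook run on an IN parameter before the syscall executes. It |
| * receives the task, the parameter's (possibly scratch-relocated) address, |
| * and a buffer to save the original bytes into (null when scratch already |
| * preserves them). Returning false means nothing was modified and no |
| * cleanup is needed on unwind. */ |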
| typedef bool (*ArgMutator)(RecordTask*, remote_ptr<void>, void*); |
| |
| /** |
| * When tasks enter syscalls that may block and so must be |
| * prepared for a context-switch, and the syscall params |
| * include (in)outparams that point to buffers, we need to |
| * redirect those arguments to scratch memory. This allows rr |
| * to serialize execution of what may be multiple blocked |
| * syscalls completing "simultaneously" (from rr's |
| * perspective). After the syscall exits, we restore the data |
| * saved in scratch memory to the original buffers. |
| * |
| * Then during replay, we simply restore the saved data to the |
| * tracee's passed-in buffer args and continue on. |
| * |
| * This is implemented by having rec_prepare_syscall_arch set up |
| * a record in param_list for each syscall in-memory parameter (whether |
| * "in" or "out"). Then done_preparing is called, which does the actual |
| * scratch setup. process_syscall_results is called when the syscall is |
| * done, to write back scratch results to the real parameters and |
| * clean everything up. |
| * |
| * ... a fly in this ointment is may-block buffered syscalls. |
| * If a task blocks in one of those, it will look like it just |
| * entered a syscall that needs a scratch buffer. However, |
| * it's too late at that point to fudge the syscall args, |
| * because processing of the syscall has already begun in the |
| * kernel. But that's OK: the syscallbuf code has already |
| * swapped out the original buffer-pointers for pointers into |
| * the syscallbuf (which acts as its own scratch memory). We |
| * just have to worry about setting things up properly for |
| * replay. |
| * |
| * The descheduled syscall will "abort" its commit into the |
| * syscallbuf, so the outparam data won't actually be saved |
| * there (and thus, won't be restored during replay). During |
| * replay, we have to restore them like we restore the |
| * non-buffered-syscall scratch data. This is done by recording |
| * the relevant syscallbuf record data in rec_process_syscall_arch. |
| */ |
| struct TaskSyscallState : TaskSyscallStateBase { |
| static TaskSyscallState& get(RecordTask* t) { |
| auto base = t->syscall_state.get(); |
| ASSERT(t, base) << "Expected syscall-state but didn't find one"; |
| return *static_cast<TaskSyscallState*>(base); |
| } |
| static TaskSyscallState* maybe_get(RecordTask* t) { |
| auto base = t->syscall_state.get(); |
| return static_cast<TaskSyscallState*>(base); |
| } |
| |
| void init(RecordTask* t) { |
| if (preparation_done) { |
| return; |
| } |
| this->t = t; |
| scratch = t->scratch_ptr; |
| } |
| |
| /** |
| * Identify a syscall memory parameter whose address is in register 'arg' |
| * with type T. |
| * Returns a remote_ptr to the data in the child (before scratch relocation) |
| * or null if parameters have already been prepared (the syscall is |
| * resuming). |
| */ |
| template <typename T> |
| remote_ptr<T> reg_parameter(int arg, ArgMode mode = OUT, |
| ArgMutator mutator = nullptr) { |
| return reg_parameter(arg, sizeof(T), mode, mutator).cast<T>(); |
| } |
| /** |
| * Identify a syscall memory parameter whose address is in register 'arg' |
| * with size 'size'. |
| * Returns a remote_ptr to the data in the child (before scratch relocation) |
| * or null if parameters have already been prepared (the syscall is |
| * resuming). |
| */ |
| remote_ptr<void> reg_parameter(int arg, const ParamSize& size, |
| ArgMode mode = OUT, |
| ArgMutator mutator = nullptr); |
| /** |
| * Identify a syscall memory parameter whose address is in memory at |
| * location 'addr_of_buf_ptr' with type T. |
| * Returns a remote_ptr to the data in the child (before scratch relocation) |
| * or null if parameters have already been prepared (the syscall is |
| * resuming). |
| * addr_of_buf_ptr must lie within a buffer already identified by a |
| * reg_parameter or mem_ptr_parameter call. |
| */ |
| template <typename T> |
| remote_ptr<T> mem_ptr_parameter(remote_ptr<void> addr_of_buf_ptr, |
| ArgMode mode = OUT, |
| ArgMutator mutator = nullptr) { |
| return mem_ptr_parameter(addr_of_buf_ptr, sizeof(T), mode, mutator) |
| .cast<T>(); |
| } |
| /** |
| * Identify a syscall memory parameter whose address is in memory at |
| * location 'addr_of_buf_ptr', with its size inferred from the referent |
| * type of the pointer stored there. |
| * Returns a remote_ptr to the data in the child (before scratch relocation) |
| * or null if parameters have already been prepared (the syscall is |
| * resuming). |
| * addr_of_buf_ptr must lie within a buffer already identified by a |
| * reg_parameter or mem_ptr_parameter call. |
| */ |
| template <typename Ptr> |
| remote_ptr<typename Ptr::Referent> mem_ptr_parameter_inferred( |
| remote_ptr<Ptr> addr_of_buf_ptr, ArgMode mode = OUT, |
| ArgMutator mutator = nullptr) { |
| remote_ptr<void> p = |
| mem_ptr_parameter(addr_of_buf_ptr, Ptr::referent_size(), mode, mutator); |
| return p.cast<typename Ptr::Referent>(); |
| } |
| /** |
| * Identify a syscall memory parameter whose address is in memory at |
| * location 'addr_of_buf_ptr' with size 'size'. |
| * Returns a remote_ptr to the data in the child (before scratch relocation) |
| * or null if parameters have already been prepared (the syscall is |
| * resuming). |
| * addr_of_buf_ptr must lie within a buffer already identified by a |
| * reg_parameter or mem_ptr_parameter call. |
| */ |
| remote_ptr<void> mem_ptr_parameter(remote_ptr<void> addr_of_buf_ptr, |
| const ParamSize& size, ArgMode mode = OUT, |
| ArgMutator mutator = nullptr); |
| |
| typedef void (*AfterSyscallAction)(RecordTask* t); |
| // Register a callback to run when the syscall has completed. |
| // This runs after parameters have been restored. |
| void after_syscall_action(AfterSyscallAction action) { |
| after_syscall_actions.push_back(action); |
| } |
| |
| void emulate_result(uint64_t result) { |
| ASSERT(t, !preparation_done); |
| ASSERT(t, !should_emulate_result); |
| should_emulate_result = true; |
| emulated_result = result; |
| } |
| |
| /** |
| * Internal method that takes 'ptr', an address within some memory parameter, |
| * and relocates it to the parameter's location in scratch memory. |
| */ |
| remote_ptr<void> relocate_pointer_to_scratch(remote_ptr<void> ptr); |
| /** |
| * Internal method that takes the index of a MemoryParam and a vector |
| * containing the actual sizes assigned to each param < param_index, and |
| * computes the actual size to use for parameter param_index. |
| */ |
| size_t eval_param_size(size_t param_index, vector<size_t>& actual_sizes); |
| /** |
| * Called when all memory parameters have been identified. If 'sw' is |
| * ALLOW_SWITCH, sets up scratch memory and updates registers etc as |
| * necessary. |
| * If scratch can't be used for some reason, returns PREVENT_SWITCH, |
| * otherwise returns 'sw'. |
| */ |
| Switchable done_preparing(Switchable sw); |
| Switchable done_preparing_internal(Switchable sw); |
| enum WriteBack { WRITE_BACK, NO_WRITE_BACK }; |
| /** |
| * Called when a syscall exits to copy results from scratch memory to their |
| * original destinations, update registers, etc. |
| */ |
| void process_syscall_results(); |
| /** |
| * Called when a syscall has been completely aborted to undo any changes we |
| * made. |
| */ |
| void abort_syscall_results(); |
| |
| /** |
| * Upon successful syscall completion, each MemoryParam in param_list |
| * consumes num_bytes from the t->scratch_ptr buffer, copying the data |
| * to 'dest' and recording the data at 'dest'. If ptr_in_reg is greater |
| * than zero, the task's ptr_in_reg register is restored to 'dest'. |
| * If ptr_in_memory is non-null, the ptr_in_memory location is restored |
| * to the value 'dest'. |
| */ |
| struct MemoryParam { |
| MemoryParam() : ptr_in_reg(0) {} |
| |
| remote_ptr<void> dest; |
| remote_ptr<void> scratch; |
| ParamSize num_bytes; |
| remote_ptr<void> ptr_in_memory; |
| int ptr_in_reg; |
| ArgMode mode; |
| ArgMutator mutator; |
| }; |
| |
| RecordTask* t; |
| |
| vector<MemoryParam> param_list; |
| /** Tracks the position in t's scratch_ptr buffer where we should allocate |
| * the next scratch area. |
| */ |
| remote_ptr<void> scratch; |
| |
| vector<AfterSyscallAction> after_syscall_actions; |
| |
| std::unique_ptr<TraceTaskEvent> exec_saved_event; |
| |
| RecordTask* emulate_wait_for_child; |
| |
| /** Saved syscall-entry registers, used by code paths that modify the |
| * registers temporarily. |
| */ |
| Registers syscall_entry_registers; |
| |
| /** When nonzero, the syscall is expected to return the given errno and we |
| * should die if it does not. This is set when we detect an error condition |
| * during syscall-enter preparation. |
| */ |
| int expect_errno; |
| |
| /** When should_emulate_result is true, syscall result should be adjusted to |
| * be emulated_result. */ |
| bool should_emulate_result; |
| uint64_t emulated_result; |
| |
| /** Records whether the syscall is switchable. Only valid when |
| * preparation_done is true. */ |
| Switchable switchable; |
| |
| /** Whether we should write back the syscall results from scratch. Only |
| * valid when preparation_done is true. */ |
| WriteBack write_back; |
| |
| /** When true, this syscall has already been prepared and should not |
| * be set up again. |
| */ |
| bool preparation_done; |
| /** When true, the scratch area is enabled, otherwise we're letting |
| * syscall outputs be written directly to their destinations. |
| * Only valid when preparation_done is true. |
| */ |
| bool scratch_enabled; |
| |
| /** Miscellaneous saved data that can be used by particular syscalls */ |
| vector<uint8_t> saved_data; |
| |
| TaskSyscallState() |
| : t(nullptr), |
| emulate_wait_for_child(nullptr), |
| expect_errno(0), |
| should_emulate_result(false), |
| preparation_done(false), |
| scratch_enabled(false) {} |
| }; |
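| |
| /* A rough sketch of the intended lifecycle (the real flow is driven by |
| * rec_prepare_syscall_arch and rec_process_syscall_arch later in this |
| * file): |
| * |
| * auto& syscall_state = TaskSyscallState::get(t); |
| * // At syscall entry, register each memory parameter... |
| * syscall_state.reg_parameter<typename Arch::timespec>(2, OUT); |
| * // ...then set up scratch and decide switchability. |
| * Switchable sw = syscall_state.done_preparing(ALLOW_SWITCH); |
| * // At syscall exit, write scratch data back and record outputs. |
| * syscall_state.process_syscall_results(); |
| */ |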
| |
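| /* Read or write a pointer in tracee memory using the tracee's word size, |
| * so 32-bit tracees get 4-byte pointer accesses even under a 64-bit rr. */ |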
| template <typename Arch> |
| static void set_remote_ptr_arch(RecordTask* t, remote_ptr<void> addr, |
| remote_ptr<void> value) { |
| auto typed_addr = addr.cast<typename Arch::unsigned_word>(); |
| t->write_mem(typed_addr, (typename Arch::unsigned_word)value.as_int()); |
| } |
| |
| static void set_remote_ptr(RecordTask* t, remote_ptr<void> addr, |
| remote_ptr<void> value) { |
| RR_ARCH_FUNCTION(set_remote_ptr_arch, t->arch(), t, addr, value); |
| } |
| |
| template <typename Arch> |
| static remote_ptr<void> get_remote_ptr_arch(RecordTask* t, |
| remote_ptr<void> addr) { |
| auto typed_addr = addr.cast<typename Arch::unsigned_word>(); |
| auto old = t->read_mem(typed_addr); |
| return remote_ptr<void>(old); |
| } |
| |
| static remote_ptr<void> get_remote_ptr(RecordTask* t, remote_ptr<void> addr) { |
| RR_ARCH_FUNCTION(get_remote_ptr_arch, t->arch(), t, addr); |
| } |
| |
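| /* Round *scratch up to a multiple of 'amount', which must be a power of |
| * two. */ |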
| static void align_scratch(remote_ptr<void>* scratch, uintptr_t amount = 8) { |
| *scratch = (scratch->as_int() + amount - 1) & ~(amount - 1); |
| } |
| |
| remote_ptr<void> TaskSyscallState::reg_parameter(int arg, const ParamSize& size, |
| ArgMode mode, |
| ArgMutator mutator) { |
| if (preparation_done) { |
| return remote_ptr<void>(); |
| } |
| |
| MemoryParam param; |
| param.dest = syscall_entry_registers.arg(arg); |
| if (param.dest.is_null()) { |
| return remote_ptr<void>(); |
| } |
| param.num_bytes = size; |
| param.mode = mode; |
| param.mutator = mutator; |
| ASSERT(t, !mutator || mode == IN); |
| if (mode != IN_OUT_NO_SCRATCH) { |
| param.scratch = scratch; |
| scratch += param.num_bytes.incoming_size; |
| align_scratch(&scratch); |
| param.ptr_in_reg = arg; |
| } |
| param_list.push_back(param); |
| return param.dest; |
| } |
| |
| remote_ptr<void> TaskSyscallState::mem_ptr_parameter( |
| remote_ptr<void> addr_of_buf_ptr, const ParamSize& size, ArgMode mode, |
| ArgMutator mutator) { |
| if (preparation_done || addr_of_buf_ptr.is_null()) { |
| return remote_ptr<void>(); |
| } |
| |
| MemoryParam param; |
| param.dest = get_remote_ptr(t, addr_of_buf_ptr); |
| if (param.dest.is_null()) { |
| return remote_ptr<void>(); |
| } |
| param.num_bytes = size; |
| param.mode = mode; |
| param.mutator = mutator; |
| ASSERT(t, !mutator || mode == IN); |
| if (mode != IN_OUT_NO_SCRATCH) { |
| param.scratch = scratch; |
| scratch += param.num_bytes.incoming_size; |
| align_scratch(&scratch); |
| param.ptr_in_memory = addr_of_buf_ptr; |
| } |
| param_list.push_back(param); |
| return param.dest; |
| } |
| |
| remote_ptr<void> TaskSyscallState::relocate_pointer_to_scratch( |
| remote_ptr<void> ptr) { |
| int num_relocations = 0; |
| remote_ptr<void> result; |
| for (auto& param : param_list) { |
| if (param.dest <= ptr && ptr < param.dest + param.num_bytes.incoming_size) { |
| result = param.scratch + (ptr - param.dest); |
| ++num_relocations; |
| } |
| } |
| DEBUG_ASSERT( |
| num_relocations > 0 && |
| "Pointer in non-scratch memory being updated to point to scratch?"); |
| DEBUG_ASSERT(num_relocations <= 1 && |
| "Overlapping buffers containing relocated pointer?"); |
| return result; |
| } |
| |
| Switchable TaskSyscallState::done_preparing_internal(Switchable sw) { |
| ASSERT(t, !preparation_done); |
| |
| preparation_done = true; |
| write_back = WRITE_BACK; |
| switchable = sw; |
| |
| if (!t->scratch_ptr) { |
| return switchable; |
| } |
| |
| ASSERT(t, scratch >= t->scratch_ptr); |
| |
| if (sw == ALLOW_SWITCH && |
| scratch > t->scratch_ptr + t->usable_scratch_size()) { |
| LOG(warn) |
| << "`" << t->ev().Syscall().syscall_name() |
| << "' needed a scratch buffer of size " << scratch - t->scratch_ptr |
| << ", but only " << t->usable_scratch_size() |
| << " was available. Allowing the syscall to proceed without scratch, which may race."; |
| return switchable; |
| } |
| if (switchable == PREVENT_SWITCH || param_list.empty()) { |
| return switchable; |
| } |
| |
| scratch_enabled = true; |
| |
| // Step 1: Copy all IN/IN_OUT parameters to their scratch areas |
| for (auto& param : param_list) { |
| ASSERT(t, param.num_bytes.incoming_size < size_t(-1)); |
| if (param.mode == IN_OUT || param.mode == IN) { |
| // Initialize scratch buffer with input data |
| std::unique_ptr<uint8_t[]> buf( |
| new uint8_t[param.num_bytes.incoming_size]); |
| t->read_bytes_helper(param.dest, param.num_bytes.incoming_size, |
| buf.get()); |
| t->write_bytes_helper(param.scratch, param.num_bytes.incoming_size, |
| buf.get()); |
| } |
| } |
| // Step 2: Update pointers in registers/memory to point to scratch areas |
| { |
| Registers r = t->regs(); |
| for (auto& param : param_list) { |
| if (param.ptr_in_reg) { |
| r.set_arg(param.ptr_in_reg, param.scratch.as_int()); |
| } |
| if (!param.ptr_in_memory.is_null()) { |
| // Pointers being relocated must themselves be in scratch memory. |
| // We don't want to modify non-scratch memory. Find the pointer's |
| // location in scratch memory. |
| auto p = relocate_pointer_to_scratch(param.ptr_in_memory); |
| // Update pointer to point to scratch. |
| // Note that this can only happen after step 1 is complete and all |
| // parameter data has been copied to scratch memory. |
| set_remote_ptr(t, p, param.scratch); |
| } |
| // If the number of bytes to record is coming from a memory location, |
| // update that location to scratch. |
| if (!param.num_bytes.mem_ptr.is_null()) { |
| param.num_bytes.mem_ptr = |
| relocate_pointer_to_scratch(param.num_bytes.mem_ptr); |
| } |
| } |
| t->set_regs(r); |
| } |
| return switchable; |
| } |
| |
| Switchable TaskSyscallState::done_preparing(Switchable sw) { |
| if (preparation_done) { |
| return switchable; |
| } |
| |
| sw = done_preparing_internal(sw); |
| ASSERT(t, sw == switchable); |
| |
| // Step 3: Execute mutators. This must run even if the scratch steps do not. |
| for (auto& param : param_list) { |
| if (param.mutator) { |
| // Mutated parameters must be IN. If we have scratch space, we don't need |
| // to save anything. |
| void* saved_data_loc = nullptr; |
| if (!scratch_enabled) { |
| auto prev_size = saved_data.size(); |
| saved_data.resize(prev_size + param.num_bytes.incoming_size); |
| saved_data_loc = saved_data.data() + prev_size; |
| } |
| if (!(*param.mutator)(t, scratch_enabled ? param.scratch : param.dest, |
| saved_data_loc)) { |
| // Nothing was modified, no need to clean up when we unwind. |
| param.mutator = nullptr; |
| if (!scratch_enabled) { |
| saved_data.resize(saved_data.size() - param.num_bytes.incoming_size); |
| } |
| } |
| } |
| } |
| return switchable; |
| } |
| |
| size_t TaskSyscallState::eval_param_size(size_t i, |
| vector<size_t>& actual_sizes) { |
| DEBUG_ASSERT(actual_sizes.size() == i); |
| |
| size_t already_consumed = 0; |
| for (size_t j = 0; j < i; ++j) { |
| if (param_list[j].num_bytes.is_same_source(param_list[i].num_bytes)) { |
| already_consumed += actual_sizes[j]; |
| } |
| } |
| size_t size = param_list[i].num_bytes.eval(t, already_consumed); |
| actual_sizes.push_back(size); |
| return size; |
| } |
| |
| void TaskSyscallState::process_syscall_results() { |
| ASSERT(t, preparation_done); |
| |
| // XXX what's the best way to handle failed syscalls? Currently we just |
| // record everything as if it succeeded. That handles failed syscalls that |
| // wrote partial results, but doesn't handle syscalls that failed with |
| // EFAULT. |
| vector<size_t> actual_sizes; |
| if (scratch_enabled) { |
| size_t scratch_num_bytes = scratch - t->scratch_ptr; |
| auto data = t->read_mem(t->scratch_ptr.cast<uint8_t>(), scratch_num_bytes); |
| Registers r = t->regs(); |
| // Step 1: compute actual sizes of all buffers and copy outputs |
| // from scratch back to their origin |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| auto& param = param_list[i]; |
| size_t size = eval_param_size(i, actual_sizes); |
| if (write_back == WRITE_BACK && |
| (param.mode == IN_OUT || param.mode == OUT)) { |
| const uint8_t* d = data.data() + (param.scratch - t->scratch_ptr); |
| t->write_bytes_helper(param.dest, size, d); |
| } |
| } |
| bool memory_cleaned_up = false; |
| // Step 2: restore modified in-memory pointers and registers |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| auto& param = param_list[i]; |
| if (param.ptr_in_reg) { |
| r.set_orig_arg(param.ptr_in_reg, param.dest.as_int()); |
| } |
| if (!param.ptr_in_memory.is_null()) { |
| memory_cleaned_up = true; |
| set_remote_ptr(t, param.ptr_in_memory, param.dest); |
| } |
| } |
| if (write_back == WRITE_BACK) { |
| // Step 3: record all output memory areas |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| auto& param = param_list[i]; |
| size_t size = actual_sizes[i]; |
| if (param.mode == IN_OUT_NO_SCRATCH) { |
| t->record_remote(param.dest, size); |
| } else if (param.mode == IN_OUT || param.mode == OUT) { |
| // If pointers in memory were fixed up in step 2, then record |
| // from tracee memory to ensure we record such fixes. Otherwise we |
| // can record from our local data. |
| // XXX This optimization can be improved if necessary... |
| if (memory_cleaned_up) { |
| t->record_remote(param.dest, size); |
| } else { |
| const uint8_t* d = data.data() + (param.scratch - t->scratch_ptr); |
| t->record_local(param.dest, size, d); |
| } |
| } |
| } |
| } |
| t->set_regs(r); |
| } else { |
| // Step 1: Determine the size of all output memory areas |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| eval_param_size(i, actual_sizes); |
| } |
| // Step 2: restore all mutated memory |
| for (auto& param : param_list) { |
| if (param.mutator) { |
| size_t size = param.num_bytes.incoming_size; |
| ASSERT(t, saved_data.size() >= size); |
| // If this intersects an output region, we need to be careful not to |
| // clobber what the kernel gave us. |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| auto& param2 = param_list[i]; |
| size_t param2_size = actual_sizes[i]; |
| if (param2.mode == IN) { |
| continue; |
| } |
| MemoryRange intersection = MemoryRange(param2.dest, param2_size). |
| intersect(MemoryRange(param.dest, size)); |
| if (intersection.size() != 0) { |
| // Just update the saved data we already have. We could try |
| // splitting the range and only writing what needs to still change, |
| // but we'd probably just end up doing the exact same number of |
| // syscalls and this is simpler. |
| t->read_bytes_helper(intersection.start(), intersection.size(), |
| saved_data.data() + (intersection.start() - param.dest)); |
| } |
| } |
| t->write_bytes_helper(param.dest, size, saved_data.data()); |
| saved_data.erase(saved_data.begin(), saved_data.begin() + size); |
| } |
| } |
| ASSERT(t, saved_data.empty()); |
| // Step 3: record all output memory areas |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| auto& param = param_list[i]; |
| size_t size = actual_sizes[i]; |
| if (param.mode != IN) { |
| t->record_remote(param.dest, size); |
| } |
| } |
| } |
| |
| if (should_emulate_result) { |
| Registers r = t->regs(); |
| r.set_syscall_result(emulated_result); |
| t->set_regs(r); |
| } |
| |
| for (auto& action : after_syscall_actions) { |
| action(t); |
| } |
| } |
| |
| void TaskSyscallState::abort_syscall_results() { |
| ASSERT(t, preparation_done); |
| |
| if (scratch_enabled) { |
| Registers r = t->regs(); |
| // restore modified in-memory pointers and registers |
| for (size_t i = 0; i < param_list.size(); ++i) { |
| auto& param = param_list[i]; |
| if (param.ptr_in_reg) { |
| r.set_arg(param.ptr_in_reg, param.dest.as_int()); |
| } |
| if (!param.ptr_in_memory.is_null()) { |
| set_remote_ptr(t, param.ptr_in_memory, param.dest); |
| } |
| } |
| t->set_regs(r); |
| } else { |
| for (auto& param : param_list) { |
| if (param.mutator) { |
| size_t size = param.num_bytes.incoming_size; |
| ASSERT(t, saved_data.size() >= size); |
| t->write_bytes_helper(param.dest, size, saved_data.data()); |
| saved_data.erase(saved_data.begin(), saved_data.begin() + size); |
| } |
| } |
| } |
| } |
| |
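| /** |
| * Register the memory parameters reachable from a recvmsg-style msghdr: |
| * msg_name and msg_control (sized by their length fields) plus each iovec |
| * base. 'io_size' is the total incoming-data size; because every iovec |
| * shares it as a size source, it is distributed across the iovecs in |
| * registration order. |
| */ |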
| template <typename Arch> |
| static void prepare_recvmsg(RecordTask* t, TaskSyscallState& syscall_state, |
| remote_ptr<typename Arch::msghdr> msgp, |
| const ParamSize& io_size) { |
| auto namelen_ptr = REMOTE_PTR_FIELD(msgp, msg_namelen); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(msgp, msg_name), |
| ParamSize::from_initialized_mem(t, namelen_ptr)); |
| |
| auto msg = t->read_mem(msgp); |
| remote_ptr<void> iovecsp_void = syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(msgp, msg_iov), |
| sizeof(typename Arch::iovec) * msg.msg_iovlen, IN); |
| auto iovecsp = iovecsp_void.cast<typename Arch::iovec>(); |
| auto iovecs = t->read_mem(iovecsp, msg.msg_iovlen); |
| for (size_t i = 0; i < msg.msg_iovlen; ++i) { |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(iovecsp + i, iov_base), |
| io_size.limit_size(iovecs[i].iov_len)); |
| } |
| |
| auto controllen_ptr = REMOTE_PTR_FIELD(msgp, msg_controllen); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(msgp, msg_control), |
| ParamSize::from_initialized_mem(t, controllen_ptr)); |
| } |
| |
| template <typename Arch> |
| static void prepare_recvmmsg(RecordTask* t, TaskSyscallState& syscall_state, |
| remote_ptr<typename Arch::mmsghdr> mmsgp, |
| unsigned int vlen) { |
| for (unsigned int i = 0; i < vlen; ++i) { |
| auto msgp = mmsgp + i; |
| prepare_recvmsg<Arch>(t, syscall_state, REMOTE_PTR_FIELD(msgp, msg_hdr), |
| ParamSize::from_mem(REMOTE_PTR_FIELD(msgp, msg_len))); |
| } |
| } |
| |
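| /** |
| * Returns true, and emulates failure, for setsockopt options rr refuses to |
| * allow: the packet/netlink ring options would map buffers that the kernel |
| * writes asynchronously, which rr could not record faithfully. |
| */ |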
| static bool block_sock_opt(int level, int optname, |
| TaskSyscallState& syscall_state) { |
| switch (level) { |
| case SOL_PACKET: |
| switch (optname) { |
| case PACKET_RX_RING: |
| case PACKET_TX_RING: |
| syscall_state.emulate_result(-ENOPROTOOPT); |
| return true; |
| } |
| break; |
| case SOL_NETLINK: |
| switch (optname) { |
| case NETLINK_RX_RING: |
| case NETLINK_TX_RING: |
| syscall_state.emulate_result(-ENOPROTOOPT); |
| return true; |
| } |
| break; |
| } |
| return false; |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_setsockopt(RecordTask* t, |
| TaskSyscallState& syscall_state, |
| typename Arch::setsockopt_args& args) { |
| if (block_sock_opt(args.level, args.optname, syscall_state)) { |
| Registers r = t->regs(); |
| r.set_arg1(-1); |
| t->set_regs(r); |
| } else { |
| switch (args.level) { |
| case IPPROTO_IP: |
| case IPPROTO_IPV6: |
| switch (args.optname) { |
| case SO_SET_REPLACE: { |
| if (args.optlen < (ssize_t)sizeof(typename Arch::ipt_replace)) { |
| break; |
| } |
| auto repl_ptr = |
| args.optval.rptr().template cast<typename Arch::ipt_replace>(); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(repl_ptr, counters), |
| t->read_mem(REMOTE_PTR_FIELD(repl_ptr, num_counters)) * |
| sizeof(typename Arch::xt_counters)); |
| break; |
| } |
| default: |
| break; |
| } |
| break; |
| default: |
| break; |
| } |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| template <typename Arch> |
| static Switchable maybe_blacklist_connect(RecordTask* t, |
| remote_ptr<void> addr_ptr, |
| socklen_t addrlen) { |
| struct sockaddr_un addr; |
| memset(&addr, 0, sizeof(addr)); |
| t->read_bytes_fallible(addr_ptr, min<socklen_t>(sizeof(addr), addrlen), |
| &addr); |
| // Ensure null termination. |
| addr.sun_path[sizeof(addr.sun_path) - 1] = 0; |
| if (addr.sun_family == AF_UNIX && is_blacklisted_socket(addr.sun_path)) { |
| LOG(warn) << "Cowardly refusing to connect to " << addr.sun_path; |
| // Hijack the syscall. |
| Registers r = t->regs(); |
| r.set_original_syscallno(Arch::gettid); |
| t->set_regs(r); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_socketcall(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| /* int socketcall(int call, unsigned long *args) { |
| * long a[6]; |
| * copy_from_user(a, args); |
| * sys_recv(a[0], (void __user *)a[1], a[2], a[3]); |
| * } |
| * |
| * (from http://lxr.linux.no/#linux+v3.6.3/net/socket.c#L2354) |
| */ |
| switch ((int)t->regs().arg1_signed()) { |
| /* int socket(int domain, int type, int protocol); */ |
| case SYS_SOCKET: |
| /* int bind(int sockfd, const struct sockaddr *addr, socklen_t addrlen); */ |
| case SYS_BIND: |
| /* int listen(int sockfd, int backlog) */ |
| case SYS_LISTEN: |
| /* int shutdown(int socket, int how) */ |
| case SYS_SHUTDOWN: |
| break; |
| |
| /* int connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen); |
| */ |
| case SYS_CONNECT: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::connect_args>(2, IN); |
| auto args = t->read_mem(argsp); |
| return maybe_blacklist_connect<Arch>(t, args.addr.rptr(), args.addrlen); |
| } |
| |
| /* ssize_t send(int sockfd, const void *buf, size_t len, int flags) */ |
| case SYS_SEND: |
| /* ssize_t sendto(int sockfd, const void *buf, size_t len, int flags, const |
| * struct sockaddr *dest_addr, socklen_t addrlen); */ |
| case SYS_SENDTO: |
| // These can block when socket buffers are full. |
| return ALLOW_SWITCH; |
| |
| /* int setsockopt(int sockfd, int level, int optname, const void *optval, |
| * socklen_t optlen); */ |
| case SYS_SETSOCKOPT: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::setsockopt_args>(2, IN); |
| auto args = t->read_mem(argsp); |
| return prepare_setsockopt<Arch>(t, syscall_state, args); |
| } |
| |
| /* int getsockopt(int sockfd, int level, int optname, const void *optval, |
| * socklen_t* optlen); |
| */ |
| case SYS_GETSOCKOPT: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::getsockopt_args>(2, IN); |
| auto optlen_ptr = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, optlen), IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, optval), |
| ParamSize::from_initialized_mem(t, optlen_ptr)); |
| break; |
| } |
| |
| /* int socketpair(int domain, int type, int protocol, int sv[2]); |
| * |
| * values returned in sv |
| */ |
| case SYS_SOCKETPAIR: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::socketpair_args>(2, IN); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, sv), |
| sizeof(int) * 2); |
| break; |
| } |
| |
| /* int getpeername(int sockfd, struct sockaddr *addr, socklen_t *addrlen); |
| */ |
| case SYS_GETPEERNAME: |
| /* int getsockname(int sockfd, struct sockaddr *addr, socklen_t *addrlen); |
| */ |
| case SYS_GETSOCKNAME: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::getsockname_args>(2, IN); |
| auto addrlen_ptr = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, addrlen), IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, addr), |
| ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| break; |
| } |
| |
| /* ssize_t recv([int sockfd, void *buf, size_t len, int flags]) */ |
| case SYS_RECV: { |
| auto argsp = syscall_state.reg_parameter<typename Arch::recv_args>(2, IN); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, buf), |
| ParamSize::from_syscall_result<typename Arch::ssize_t>(args.len)); |
| return ALLOW_SWITCH; |
| } |
| |
| /* int accept([int sockfd, struct sockaddr *addr, socklen_t *addrlen]) */ |
| case SYS_ACCEPT: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::accept_args>(2, IN); |
| auto addrlen_ptr = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, addrlen), IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, addr), |
| ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| return ALLOW_SWITCH; |
| } |
| |
| /* int accept4([int sockfd, struct sockaddr *addr, socklen_t *addrlen, int |
| * flags]) */ |
| case SYS_ACCEPT4: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::accept4_args>(2, IN); |
| auto addrlen_ptr = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, addrlen), IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, addr), |
| ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| return ALLOW_SWITCH; |
| } |
| |
| case SYS_RECVFROM: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::recvfrom_args>(2, IN); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, buf), |
| ParamSize::from_syscall_result<typename Arch::ssize_t>(args.len)); |
| auto addrlen_ptr = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, addrlen), IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, src_addr), |
| ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| return ALLOW_SWITCH; |
| } |
| |
| case SYS_RECVMSG: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::recvmsg_args>(2, IN); |
| auto msgp = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, msg), IN_OUT); |
| prepare_recvmsg<Arch>( |
| t, syscall_state, msgp, |
| ParamSize::from_syscall_result<typename Arch::ssize_t>()); |
| |
| auto args = t->read_mem(argsp); |
| if (!(args.flags & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| break; |
| } |
| |
| case SYS_RECVMMSG: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::recvmmsg_args>(2, IN); |
| auto args = t->read_mem(argsp); |
| remote_ptr<void> mmsgp_void = syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, msgvec), |
| sizeof(typename Arch::mmsghdr) * args.vlen, IN_OUT); |
| auto mmsgp = mmsgp_void.cast<typename Arch::mmsghdr>(); |
| prepare_recvmmsg<Arch>(t, syscall_state, mmsgp, args.vlen); |
| if (!(args.flags & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| break; |
| } |
| |
| /* ssize_t sendmsg(int sockfd, const struct msghdr *msg, int flags) */ |
| case SYS_SENDMSG: { |
| auto argsp = remote_ptr<typename Arch::sendmsg_args>(t->regs().arg2()); |
| auto args = t->read_mem(argsp); |
| if (!(args.flags & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| break; |
| } |
| |
| case SYS_SENDMMSG: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::sendmmsg_args>(2, IN); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, msgvec), |
| sizeof(typename Arch::mmsghdr) * args.vlen, IN_OUT); |
| if (!(args.flags & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| break; |
| } |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_msgctl(TaskSyscallState& syscall_state, int cmd, |
| int ptr_reg) { |
| switch (cmd) { |
| case IPC_STAT: |
| case MSG_STAT: |
| syscall_state.reg_parameter<typename Arch::msqid64_ds>(ptr_reg); |
| break; |
| case IPC_INFO: |
| case MSG_INFO: |
| syscall_state.reg_parameter<typename Arch::msginfo>(ptr_reg); |
| break; |
| |
| case IPC_SET: |
| case IPC_RMID: |
| break; |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_shmctl(TaskSyscallState& syscall_state, int cmd, |
| int ptr_reg) { |
| switch (cmd) { |
| case IPC_SET: |
| case IPC_RMID: |
| case SHM_LOCK: |
| case SHM_UNLOCK: |
| break; |
| |
| case IPC_STAT: |
| case SHM_STAT: |
| syscall_state.reg_parameter<typename Arch::shmid64_ds>(ptr_reg); |
| break; |
| |
| case IPC_INFO: |
| syscall_state.reg_parameter<typename Arch::shminfo64>(ptr_reg); |
| break; |
| |
| case SHM_INFO: |
| syscall_state.reg_parameter<typename Arch::shm_info>(ptr_reg); |
| break; |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| enum SemctlDereference { DEREFERENCE, USE_DIRECTLY }; |
| |
| template <typename Arch> |
| static Switchable prepare_semctl(RecordTask* t, TaskSyscallState& syscall_state, |
| int semid, int cmd, int ptr_reg, |
| SemctlDereference deref) { |
| switch (cmd) { |
| case IPC_SET: |
| case IPC_RMID: |
| case GETNCNT: |
| case GETPID: |
| case GETVAL: |
| case GETZCNT: |
| case SETALL: |
| case SETVAL: |
| break; |
| |
| case IPC_STAT: |
| case SEM_STAT: |
| if (deref == DEREFERENCE) { |
| syscall_state.mem_ptr_parameter<typename Arch::semid64_ds>( |
| syscall_state.reg_parameter<typename Arch::unsigned_long>(ptr_reg)); |
| } else { |
| syscall_state.reg_parameter<typename Arch::semid64_ds>(ptr_reg); |
| } |
| break; |
| |
| case IPC_INFO: |
| case SEM_INFO: |
| if (deref == DEREFERENCE) { |
| syscall_state.mem_ptr_parameter<typename Arch::seminfo>( |
| syscall_state.reg_parameter<typename Arch::unsigned_long>(ptr_reg)); |
| } else { |
| syscall_state.reg_parameter<typename Arch::seminfo>(ptr_reg); |
| } |
| break; |
| |
| case GETALL: { |
| semid64_ds ds; |
| _semun un_arg; |
| un_arg.buf = &ds; |
| int ret = _semctl(semid, 0, IPC_STAT, un_arg); |
| msan_unpoison(&ds, sizeof(semid64_ds)); |
| ASSERT(t, ret == 0); |
| |
| ParamSize size = sizeof(unsigned short) * ds.sem_nsems; |
| if (deref == DEREFERENCE) { |
| syscall_state.mem_ptr_parameter( |
| syscall_state.reg_parameter<typename Arch::unsigned_long>(ptr_reg), |
| size); |
| } else { |
| syscall_state.reg_parameter(ptr_reg, size); |
| } |
| break; |
| } |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| /** |
| * A change has been made to file 'fd' in task t. If the file has been mmapped |
| * somewhere in t's address space, record the changes. |
| * We check for matching files by comparing file names. This may not be |
| * reliable but hopefully it's good enough for the cases where we need this. |
| * This doesn't currently handle shared mappings very well. A file mapped |
| * shared in multiple locations will be recorded once per location. |
| * This doesn't handle mappings of the file into other address spaces. |
| */ |
| static void record_file_change(RecordTask* t, int fd, uint64_t offset, |
| uint64_t length) { |
| string file_name = t->file_name_of_fd(fd); |
| |
| for (const auto& m : t->vm()->maps()) { |
| if (m.map.fsname() == file_name) { |
| uint64_t start = max(offset, uint64_t(m.map.file_offset_bytes())); |
| uint64_t end = min(offset + length, |
| uint64_t(m.map.file_offset_bytes()) + m.map.size()); |
| if (start < end) { |
| t->record_remote(m.map.start() + (start - m.map.file_offset_bytes()), |
| end - start); |
| } |
| } |
| } |
| } |
| |
| template <typename Arch> |
| static void record_v4l2_buffer_contents(RecordTask* t) { |
| remote_ptr<typename Arch::v4l2_buffer> bufp = t->regs().arg3(); |
| auto buf = t->read_mem(bufp); |
| |
| switch (buf.memory) { |
| case V4L2_MEMORY_MMAP: |
| record_file_change(t, (int)t->regs().arg1_signed(), buf.m.offset, |
| buf.length); |
| return; |
| |
| default: |
| ASSERT(t, false) << "Unhandled V4L2 memory type " << buf.memory; |
| return; |
| } |
| } |
| |
| template <typename Arch> static void record_usbdevfs_reaped_urb(RecordTask* t) { |
| if (t->regs().syscall_failed()) { |
| return; |
| } |
| |
| remote_ptr<typename Arch::unsigned_word> pp = t->regs().arg3(); |
| remote_ptr<typename Arch::usbdevfs_urb> p = t->read_mem(pp); |
| t->record_remote(p); |
| auto urb = t->read_mem(p); |
| size_t length; |
| if (urb.type == USBDEVFS_URB_TYPE_ISO) { |
| auto iso_frame_descs_ptr = REMOTE_PTR_FIELD(p, iso_frame_desc[0]); |
| auto iso_frame_descs = |
| t->read_mem(iso_frame_descs_ptr, urb.number_of_packets); |
| length = 0; |
| for (auto& f : iso_frame_descs) { |
| length += f.length; |
| } |
| t->record_local(iso_frame_descs_ptr, iso_frame_descs.data(), |
| iso_frame_descs.size()); |
| } else { |
| length = urb.buffer_length; |
| } |
| // It's tempting to use actual_length here but in some cases the kernel |
| // writes back more data than that. |
| t->record_remote(urb.buffer, length); |
| } |
| |
| static void record_page_below_stack_ptr(RecordTask* t) { |
| /* Record the page just below |t|'s stack pointer. The SIOC* ioctls |
| * have been observed to write beyond the end of tracees' stacks, as |
| * if they had allocated scratch space for themselves. All we can do |
| * for now is try to record the scratch data. |
| */ |
| t->record_remote(t->regs().sp() - page_size(), page_size()); |
| } |
| |
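| /* Strip the _IOC_SIZE field out of an ioctl request number, so requests |
| * that encode a variable payload size can be matched with a single case |
| * label. */ |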
| #define IOCTL_MASK_SIZE(v) ((v) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT)) |
| |
| typedef ethtool_gstrings GStrings; |
| |
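| /** |
| * Finishes an emulated SIOCETHTOOL ETHTOOL_GSTRINGS request after the |
| * nullified syscall returns: issues an ETHTOOL_GSSET_INFO via |
| * AutoRemoteSyscalls to learn the string count, then replays the original |
| * ETHTOOL_GSTRINGS ioctl in the tracee and records the result. |
| */ |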
| template <typename Arch> void get_ethtool_gstrings_arch(RecordTask* t) { |
| auto& syscall_state = TaskSyscallState::get(t); |
| Registers& regs = syscall_state.syscall_entry_registers; |
| bool ok = true; |
| auto ifreq = t->read_mem(remote_ptr<typename Arch::ifreq>(regs.arg3()), &ok); |
| Registers new_regs = t->regs(); |
| if (!ok) { |
| new_regs.set_syscall_result(-EFAULT); |
| t->set_regs(new_regs); |
| return; |
| } |
| remote_ptr<void> p = ifreq.ifr_ifru.ifru_data.rptr(); |
| auto orig_gstrings = p.cast<ethtool_gstrings>(); |
| auto et_gstrings = t->read_mem(orig_gstrings, &ok); |
| if (!ok) { |
| new_regs.set_syscall_result(-EFAULT); |
| t->set_regs(new_regs); |
| return; |
| } |
| if (et_gstrings.string_set >= 64) { |
| new_regs.set_syscall_result(-EOPNOTSUPP); |
| t->set_regs(new_regs); |
| return; |
| } |
| |
| AutoRemoteSyscalls remote(t); |
| |
| // Do an ETHTOOL_GSSET_INFO to get the number of strings |
| ethtool_sset_info et; |
| et.cmd = ETHTOOL_GSSET_INFO; |
| et.reserved = 0; |
| et.sset_mask = 1 << et_gstrings.string_set; |
| std::vector<uint8_t> buffer; |
| buffer.resize(sizeof(et) + sizeof(uint32_t)); |
| memcpy(buffer.data(), &et, sizeof(et)); |
| memset(buffer.data() + sizeof(et), 0, sizeof(uint32_t)); |
| AutoRestoreMem et_mem(remote, buffer.data(), buffer.size()); |
| |
| ifreq.ifr_ifru.ifru_data = et_mem.get(); |
| AutoRestoreMem ifr_mem(remote, &ifreq, sizeof(ifreq)); |
| |
| long ret = remote.syscall(regs.original_syscallno(), regs.arg1(), |
| SIOCETHTOOL, ifr_mem.get()); |
| if (ret < 0) { |
| remote.regs().set_syscall_result(ret); |
| return; |
| } |
| |
| uint32_t data = t->read_mem((et_mem.get() + sizeof(et)).cast<uint32_t>()); |
| // Now do the ETHTOOL_GSTRINGS call |
| ret = remote.syscall(regs.original_syscallno(), regs.arg1(), SIOCETHTOOL, |
| regs.arg3()); |
| remote.regs().set_syscall_result(ret); |
| if (ret < 0) { |
| return; |
| } |
| t->record_remote(orig_gstrings, sizeof(ethtool_gstrings) + ETH_GSTRING_LEN*data); |
| } |
| |
| static void get_ethtool_gstrings(RecordTask* t) { |
| RR_ARCH_FUNCTION(get_ethtool_gstrings_arch, t->arch(), t); |
| } |
| |
| template <typename Arch> void prepare_ethtool_ioctl(RecordTask* t, TaskSyscallState& syscall_state) { |
| auto ifrp = syscall_state.reg_parameter<typename Arch::ifreq>(3, IN); |
| bool ok = true; |
| auto ifreq = t->read_mem(ifrp, &ok); |
| if (!ok) { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| remote_ptr<void> payload = REMOTE_PTR_FIELD(ifrp, ifr_ifru.ifru_data); |
| remote_ptr<void> buf_ptr = ifreq.ifr_ifru.ifru_data.rptr(); |
| uint32_t cmd = t->read_mem(buf_ptr.cast<uint32_t>(), &ok); |
| if (!ok) { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| |
| switch (cmd) { |
| case ETHTOOL_GSET: |
| syscall_state.mem_ptr_parameter<ethtool_cmd>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GDRVINFO: |
| syscall_state.mem_ptr_parameter<ethtool_drvinfo>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GWOL: |
| syscall_state.mem_ptr_parameter<ethtool_wolinfo>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GLINK: |
| syscall_state.mem_ptr_parameter<ethtool_value>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GRXRINGS: |
| syscall_state.mem_ptr_parameter<typename Arch::ethtool_rxnfc>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GREGS: { |
| auto buf = t->read_mem(buf_ptr.cast<ethtool_regs>(), &ok); |
| if (ok) { |
| syscall_state.mem_ptr_parameter(payload, ParamSize(sizeof(buf) + buf.len), IN_OUT); |
| } else { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| break; |
| } |
| case ETHTOOL_GMODULEEEPROM: |
| case ETHTOOL_GEEPROM: { |
| auto buf = t->read_mem(buf_ptr.cast<ethtool_eeprom>(), &ok); |
| if (ok) { |
| syscall_state.mem_ptr_parameter(payload, ParamSize(sizeof(buf) + buf.len), IN_OUT); |
| } else { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| break; |
| } |
| case ETHTOOL_GEEE: |
| syscall_state.mem_ptr_parameter<ethtool_eee>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GMODULEINFO: |
| syscall_state.mem_ptr_parameter<ethtool_modinfo>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GCOALESCE: |
| syscall_state.mem_ptr_parameter<ethtool_coalesce>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GRINGPARAM: |
| syscall_state.mem_ptr_parameter<ethtool_ringparam>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GCHANNELS: |
| syscall_state.mem_ptr_parameter<ethtool_channels>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GPAUSEPARAM: |
| syscall_state.mem_ptr_parameter<ethtool_pauseparam>(payload, IN_OUT); |
| break; |
| case ETHTOOL_GSSET_INFO: { |
| auto buf = t->read_mem(buf_ptr.cast<ethtool_sset_info>(), &ok); |
| if (ok) { |
| int bits = pop_count(buf.sset_mask); |
| syscall_state.mem_ptr_parameter(payload, ParamSize(sizeof(buf) + bits*sizeof(uint32_t)), IN_OUT); |
| } else { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| break; |
| } |
| case ETHTOOL_GSTRINGS: { |
| // These are an enormous pain because to know how much data will be written |
| // back by the kernel, we have to perform an ETHTOOL_GSSET_INFO first. |
| // We can't do that right here because we've already entered the kernel. |
| // So, we emulate this. |
| Registers r = t->regs(); |
| r.set_arg1(-1); |
| t->set_regs(r); |
| syscall_state.after_syscall_action(get_ethtool_gstrings); |
| break; |
| } |
| case ETHTOOL_GFEATURES: { |
| auto buf = t->read_mem(buf_ptr.cast<ethtool_gfeatures>(), &ok); |
| if (ok) { |
| syscall_state.mem_ptr_parameter(payload, ParamSize(sizeof(buf) + buf.size*sizeof(ethtool_get_features_block)), IN_OUT); |
| } else { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| break; |
| } |
| case ETHTOOL_GPERMADDR: { |
| auto buf = t->read_mem(buf_ptr.cast<ethtool_perm_addr>(), &ok); |
| if (ok) { |
| syscall_state.mem_ptr_parameter(payload, ParamSize(sizeof(buf) + buf.size), IN_OUT); |
| } else { |
| syscall_state.expect_errno = EFAULT; |
| return; |
| } |
| break; |
| } |
| case ETHTOOL_SSET: |
| case ETHTOOL_SWOL: |
| case ETHTOOL_SEEPROM: |
| case ETHTOOL_SEEE: |
| case ETHTOOL_SCOALESCE: |
| case ETHTOOL_SRINGPARAM: |
| case ETHTOOL_SCHANNELS: |
| case ETHTOOL_SPAUSEPARAM: |
| case ETHTOOL_SFEATURES: |
| break; |
| default: |
| LOG(debug) << "Unknown ETHTOOL cmd " << cmd; |
| syscall_state.expect_errno = EINVAL; |
| return; |
| } |
| syscall_state.after_syscall_action(record_page_below_stack_ptr); |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_ioctl(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| int fd = t->regs().arg1(); |
| uint64_t result; |
| if (t->fd_table()->emulate_ioctl(fd, t, &result)) { |
| // Don't perform this syscall. |
| Registers r = t->regs(); |
| r.set_arg1(-1); |
| t->set_regs(r); |
| syscall_state.emulate_result(result); |
| return PREVENT_SWITCH; |
| } |
| |
| unsigned int request = t->regs().arg2(); |
| int type = _IOC_TYPE(request); |
| int nr = _IOC_NR(request); |
| int dir = _IOC_DIR(request); |
| int size = _IOC_SIZE(request); |
| |
| LOG(debug) << "handling ioctl(" << HEX(request) << "): type:" << HEX(type) |
| << " nr:" << HEX(nr) << " dir:" << HEX(dir) << " size:" << size; |
| |
| ASSERT(t, !t->is_desched_event_syscall()) |
| << "Failed to skip past desched ioctl()"; |
| |
| /* Some ioctl()s are irregular and don't follow the _IOC() |
| * conventions. Special case them here. */ |
| switch (request) { |
| case 0xc020462a: // Nvidia driver ioctl |
| syscall_state.emulate_result(-ENOTTY); |
| return PREVENT_SWITCH; |
| |
| case SIOCETHTOOL: |
| prepare_ethtool_ioctl<Arch>(t, syscall_state); |
| return PREVENT_SWITCH; |
| |
| case SIOCGIFCONF: { |
| auto ifconfp = |
| syscall_state.reg_parameter<typename Arch::ifconf>(3, IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(ifconfp, ifc_ifcu.ifcu_buf), |
| ParamSize::from_initialized_mem(t, |
| REMOTE_PTR_FIELD(ifconfp, ifc_len))); |
| syscall_state.after_syscall_action(record_page_below_stack_ptr); |
| return PREVENT_SWITCH; |
| } |
| |
| /* Privileged ioctls */ |
| case SIOCSIFADDR: |
| case SIOCSIFDSTADDR: |
| case SIOCSIFBRDADDR: |
| case SIOCSIFHWADDR: |
| case SIOCSIFFLAGS: |
| case SIOCSIFPFLAGS: |
| case SIOCSIFTXQLEN: |
| case SIOCSIFMTU: |
| case SIOCSIFNAME: |
| case SIOCSIFNETMASK: |
| case SIOCSIFMETRIC: |
| case SIOCSIFHWBROADCAST: |
| case SIOCSIFMAP: |
| case SIOCADDMULTI: |
| case SIOCDELMULTI: |
| /* Bridge ioctls */ |
| case SIOCBRADDBR: |
| case SIOCBRDELBR: |
| case SIOCBRADDIF: |
| case SIOCBRDELIF: |
| /* Routing table ioctls */ |
| case SIOCADDRT: |
| case SIOCDELRT: |
| return PREVENT_SWITCH; |
| |
| case SIOCBONDINFOQUERY: { |
| auto ifrp = syscall_state.reg_parameter<typename Arch::ifreq>(3, IN); |
| syscall_state.mem_ptr_parameter<typename Arch::ifbond>( |
| REMOTE_PTR_FIELD(ifrp, ifr_ifru.ifru_data)); |
| syscall_state.after_syscall_action(record_page_below_stack_ptr); |
| return PREVENT_SWITCH; |
| } |
| |
| case SIOCGIFADDR: |
| case SIOCGIFDSTADDR: |
| case SIOCGIFBRDADDR: |
| case SIOCGIFHWADDR: |
| case SIOCGIFFLAGS: |
| case SIOCGIFPFLAGS: |
| case SIOCGIFTXQLEN: |
| case SIOCGIFINDEX: |
| case SIOCGIFMTU: |
| case SIOCGIFNAME: |
| case SIOCGIFNETMASK: |
| case SIOCGIFMETRIC: |
| case SIOCGIFMAP: |
| syscall_state.reg_parameter<typename Arch::ifreq>(3); |
| syscall_state.after_syscall_action(record_page_below_stack_ptr); |
| return PREVENT_SWITCH; |
| |
| // https://github.com/torvalds/linux/blob/254ec036db1123b10e23e1412c191a3cf70dce71/net/bridge/br_ioctl.c#L316-L369 |
| case SIOCGIFBR: { |
| auto params = syscall_state.reg_parameter<unsigned long>(3, IN); |
| auto op = t->read_mem(params + 0); |
| switch (op) { |
| // does not mutate memory |
| case BRCTL_GET_VERSION: |
| break; |
| case BRCTL_GET_BRIDGES: { |
| auto len = t->read_mem(params + 2); |
| syscall_state.mem_ptr_parameter(params + 1, len * sizeof(int)); |
| break; |
| } |
| case BRCTL_ADD_BRIDGE: |
| case BRCTL_DEL_BRIDGE: |
| syscall_state.mem_ptr_parameter<uint8_t[IFNAMSIZ]>(params + 1, IN); |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| // These haven't been observed to write beyond |
| // tracees' stacks, but we record a stack page here |
| // just in case the behavior is driver-dependent. |
| case SIOCGIWFREQ: |
| case SIOCGIWMODE: |
| case SIOCGIWNAME: |
| case SIOCGIWRATE: |
| case SIOCGIWSENS: |
| syscall_state.reg_parameter<typename Arch::iwreq>(3); |
| syscall_state.after_syscall_action(record_page_below_stack_ptr); |
| return PREVENT_SWITCH; |
| case SIOCGIWESSID: { |
| auto argsp = syscall_state.reg_parameter<typename Arch::iwreq>(3, IN_OUT); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, u.essid.pointer), |
| args.u.essid.length); |
| syscall_state.after_syscall_action(record_page_below_stack_ptr); |
| return PREVENT_SWITCH; |
| } |
| |
| case SIOCGSTAMP: |
| syscall_state.reg_parameter<typename Arch::timeval>(3); |
| return PREVENT_SWITCH; |
| |
| case SIOCGSTAMPNS: |
| syscall_state.reg_parameter<typename Arch::timespec>(3); |
| return PREVENT_SWITCH; |
| |
| case TCGETS: |
| case TIOCGLCKTRMIOS: |
| syscall_state.reg_parameter<typename Arch::termios>(3); |
| return PREVENT_SWITCH; |
| |
| case TCGETA: |
| syscall_state.reg_parameter<typename Arch::termio>(3); |
| return PREVENT_SWITCH; |
| |
| case BLKSSZGET: |
| case BLKALIGNOFF: |
| case KDGKBMODE: |
| case RNDGETENTCNT: |
| case TIOCINQ: |
| case TIOCOUTQ: |
| case TIOCGETD: |
| case VT_OPENQRY: |
| syscall_state.reg_parameter<int>(3); |
| return PREVENT_SWITCH; |
| |
| case BLKROGET: |
| case BLKIOMIN: |
| case BLKIOOPT: |
| case BLKPBSZGET: |
| case BLKDISCARDZEROES: |
| syscall_state.reg_parameter<unsigned int>(3); |
| return PREVENT_SWITCH; |
| |
| case BLKGETSIZE: |
| syscall_state.reg_parameter<typename Arch::unsigned_long>(3); |
| return PREVENT_SWITCH; |
| |
| case BLKRAGET: |
| case BLKFRAGET: |
| syscall_state.reg_parameter<typename Arch::signed_long>(3); |
| return PREVENT_SWITCH; |
| |
| case BLKSECTGET: |
| case BLKROTATIONAL: |
| syscall_state.reg_parameter<typename Arch::unsigned_short>(3); |
| return PREVENT_SWITCH; |
| |
| case TIOCGWINSZ: |
| syscall_state.reg_parameter<typename Arch::winsize>(3); |
| return PREVENT_SWITCH; |
| |
| case TIOCGPGRP: |
| case TIOCGSID: |
| syscall_state.reg_parameter<typename Arch::pid_t>(3); |
| return PREVENT_SWITCH; |
| |
| case TIOCGSERIAL: |
| syscall_state.reg_parameter<typename Arch::serial_struct>(3); |
| return PREVENT_SWITCH; |
| case TIOCSSERIAL: |
| return PREVENT_SWITCH; |
| |
| case SNDRV_CTL_IOCTL_PVERSION: |
| syscall_state.reg_parameter<int>(3); |
| return PREVENT_SWITCH; |
| |
| case SNDRV_CTL_IOCTL_CARD_INFO: |
| syscall_state.reg_parameter<typename Arch::snd_ctl_card_info>(3); |
| return PREVENT_SWITCH; |
| |
| case HCIGETDEVINFO: |
| syscall_state.reg_parameter<typename Arch::hci_dev_info>(3); |
| return PREVENT_SWITCH; |
| |
| case HCIGETDEVLIST: |
| syscall_state.reg_parameter<typename Arch::hci_dev_list_req>(3); |
| return PREVENT_SWITCH; |
| |
| case SG_EMULATED_HOST: |
| case SG_GET_RESERVED_SIZE: |
| case SG_GET_SG_TABLESIZE: |
| case SG_GET_VERSION_NUM: |
| syscall_state.reg_parameter<typename Arch::signed_int>(3); |
| return PREVENT_SWITCH; |
| |
| case SG_IO: { |
| auto argsp = syscall_state.reg_parameter<typename Arch::sg_io_hdr>(3, IN_OUT); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, dxferp), args.dxfer_len); |
| //cmdp: The user memory pointed to is only read (not written to). |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, sbp), args.mx_sb_len); |
| //usr_ptr: This value is not acted upon by the sg driver. |
| return PREVENT_SWITCH; |
| } |
| |
| case VT_GETSTATE: |
| syscall_state.reg_parameter<typename Arch::vt_stat>(3); |
| return PREVENT_SWITCH; |
| |
| case FBIOGET_FSCREENINFO: |
| syscall_state.reg_parameter<typename Arch::fb_fix_screeninfo>(3); |
| return PREVENT_SWITCH; |
| |
| case FBIOGET_VSCREENINFO: |
| syscall_state.reg_parameter<typename Arch::fb_var_screeninfo>(3); |
| return PREVENT_SWITCH; |
| |
| case CDROMREADTOCHDR: |
| syscall_state.reg_parameter<typename Arch::cdrom_tochdr>(3); |
| return PREVENT_SWITCH; |
| |
| case CDROMREADTOCENTRY: |
| syscall_state.reg_parameter<typename Arch::cdrom_tocentry>(3); |
| return PREVENT_SWITCH; |
| } |
| |
| /* In ioctl language, "_IOC_READ" means "outparam". Both |
| * READ and WRITE can be set for inout params. |
| * USBDEVFS ioctls seem to be mostly backwards in their interpretation of the |
| * read/write bits :-(. |
| */ |
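| /* For example, EVIOCGVERSION is _IOR('E', 0x01, int): the _IOC_READ bit |
| * says the kernel writes an int back to user space, so recording |
| * _IOC_SIZE(request) bytes at arg3 suffices for well-behaved requests. */ |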
| if (!(_IOC_READ & dir)) { |
| switch (IOCTL_MASK_SIZE(request)) { |
| // Order by value |
| // Older ioctls don't use IOC macros at all, so don't mask size for them |
| case TCSETS: |
| case TCSETSW: |
| case TCSETSF: |
| case TCSETA: |
| case TCSETAW: |
| case TCSETAF: |
| case TIOCSLCKTRMIOS: |
| case TCSBRK: |
| case TCSBRKP: |
| case TIOCSBRK: |
| case TIOCCBRK: |
| case TCXONC: |
| case TCFLSH: |
| case TIOCEXCL: |
| case TIOCNXCL: |
| case TIOCSCTTY: |
| case TIOCNOTTY: |
| case TIOCSPGRP: |
| case TIOCSTI: |
| case TIOCSWINSZ: |
| // No test for TIOCCONS because if run as root it would do bad things |
| case TIOCCONS: |
| case TIOCPKT: |
| case FIONBIO: |
| case FIOASYNC: |
| case TIOCSETD: |
| case IOCTL_MASK_SIZE(TIOCSPTLCK): |
| case IOCTL_MASK_SIZE(TIOCGPTPEER): |
| case FIOCLEX: |
| case FIONCLEX: |
| case IOCTL_MASK_SIZE(BTRFS_IOC_CLONE): |
| case IOCTL_MASK_SIZE(BTRFS_IOC_CLONE_RANGE): |
| case IOCTL_MASK_SIZE(USBDEVFS_DISCARDURB): |
| case IOCTL_MASK_SIZE(USBDEVFS_RESET): |
| case IOCTL_MASK_SIZE(TUNSETNOCSUM): |
| case IOCTL_MASK_SIZE(TUNSETDEBUG): |
| case IOCTL_MASK_SIZE(TUNSETPERSIST): |
| case IOCTL_MASK_SIZE(TUNSETOWNER): |
| case IOCTL_MASK_SIZE(TUNSETLINK): |
| case IOCTL_MASK_SIZE(TUNSETGROUP): |
| case IOCTL_MASK_SIZE(TUNSETOFFLOAD): |
| case IOCTL_MASK_SIZE(TUNSETTXFILTER): |
| case IOCTL_MASK_SIZE(TUNSETSNDBUF): |
| case IOCTL_MASK_SIZE(TUNATTACHFILTER): |
| case IOCTL_MASK_SIZE(TUNDETACHFILTER): |
| case IOCTL_MASK_SIZE(TUNSETVNETHDRSZ): |
| case IOCTL_MASK_SIZE(TUNSETQUEUE): |
| case IOCTL_MASK_SIZE(TUNSETIFINDEX): |
| case IOCTL_MASK_SIZE(TUNSETVNETLE): |
| case IOCTL_MASK_SIZE(TUNSETVNETBE): |
| return PREVENT_SWITCH; |
| case IOCTL_MASK_SIZE(USBDEVFS_GETDRIVER): |
| // Reads and writes its parameter despite not having the _IOC_READ bit. |
| syscall_state.reg_parameter(3, size); |
| return PREVENT_SWITCH; |
| case IOCTL_MASK_SIZE(USBDEVFS_REAPURB): |
| case IOCTL_MASK_SIZE(USBDEVFS_REAPURBNDELAY): |
| syscall_state.reg_parameter(3, size); |
| syscall_state.after_syscall_action(record_usbdevfs_reaped_urb<Arch>); |
| return ALLOW_SWITCH; |
| case IOCTL_MASK_SIZE(TUNSETIFF): |
| // Reads and writes its parameter despite not having the _IOC_READ |
| // bit... |
| // And the parameter is an ifreq, not an int as in the ioctl definition! |
| syscall_state.reg_parameter<typename Arch::ifreq>(3); |
| return PREVENT_SWITCH; |
| } |
| |
| switch (type) { |
| case 0x54: // TIO* |
| case 0x89: // SIO* |
| case 0x8B: // SIO* wireless interface ioctls |
| // These ioctls are known to be irregular and don't usually have the |
| // correct |dir| bits. They must be handled above. |
| syscall_state.expect_errno = EINVAL; |
| return PREVENT_SWITCH; |
| } |
| |
| /* If the kernel isn't going to write any data back to |
| * us, we hope and pray that the result of the ioctl |
| * (observable to the tracee) is deterministic. |
| * We're also assuming it doesn't block. |
| * This is risky! Many ioctls use irregular ioctl codes |
| * that do not have the _IOC_READ bit set but actually do write to |
| * user-space! */ |
| LOG(debug) << " (presumed ignorable ioctl, nothing to do)"; |
| return PREVENT_SWITCH; |
| } |
| |
| /* There are lots of ioctl values for EVIOCGBIT */ |
| if (type == 'E' && nr >= 0x20 && nr <= 0x7f) { |
| syscall_state.reg_parameter(3, size); |
| return PREVENT_SWITCH; |
| } |
| |
| /* The following are thought to be "regular" ioctls, the |
| * processing of which is only known to (observably) write to |
| * the bytes in the structure passed to the kernel. So all we |
| * need is to record |size| bytes. |
| * Since the size may vary across architectures we mask it out here to check |
| * only the type + number. */ |
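| // IOCTL_MASK_SIZE (an rr helper macro) clears the _IOC_SIZE field, |
| // roughly request & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT), so one case |
| // label matches the same ioctl across ABIs with different struct sizes. |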
| switch (IOCTL_MASK_SIZE(request)) { |
| case IOCTL_MASK_SIZE(VIDIOC_QUERYCAP): |
| case IOCTL_MASK_SIZE(VIDIOC_ENUM_FMT): |
| case IOCTL_MASK_SIZE(VIDIOC_ENUM_FRAMESIZES): |
| case IOCTL_MASK_SIZE(VIDIOC_ENUM_FRAMEINTERVALS): |
| case IOCTL_MASK_SIZE(VIDIOC_ENUMINPUT): |
| case IOCTL_MASK_SIZE(VIDIOC_G_FMT): |
| case IOCTL_MASK_SIZE(VIDIOC_S_FMT): |
| case IOCTL_MASK_SIZE(VIDIOC_TRY_FMT): |
| case IOCTL_MASK_SIZE(VIDIOC_G_PARM): |
| case IOCTL_MASK_SIZE(VIDIOC_S_PARM): |
| case IOCTL_MASK_SIZE(VIDIOC_REQBUFS): |
| case IOCTL_MASK_SIZE(VIDIOC_QUERYBUF): |
| case IOCTL_MASK_SIZE(VIDIOC_QUERYCTRL): |
| case IOCTL_MASK_SIZE(VIDIOC_QBUF): |
| case IOCTL_MASK_SIZE(VIDIOC_G_CTRL): |
| case IOCTL_MASK_SIZE(VIDIOC_G_OUTPUT): |
| case IOCTL_MASK_SIZE(VIDIOC_S_CTRL): |
| case IOCTL_MASK_SIZE(VIDIOC_G_INPUT): |
| case IOCTL_MASK_SIZE(VIDIOC_QUERY_EXT_CTRL): |
| case IOCTL_MASK_SIZE(VIDIOC_G_PRIORITY): |
| case IOCTL_MASK_SIZE(VFAT_IOCTL_READDIR_BOTH): |
| syscall_state.reg_parameter(3, size, IN_OUT); |
| return PREVENT_SWITCH; |
| |
| case IOCTL_MASK_SIZE(TIOCGPTN): |
| case IOCTL_MASK_SIZE(TIOCGPKT): |
| case IOCTL_MASK_SIZE(TIOCGPTLCK): |
| case IOCTL_MASK_SIZE(TIOCGEXCL): |
| case IOCTL_MASK_SIZE(USBDEVFS_GET_CAPABILITIES): |
| case IOCTL_MASK_SIZE(FS_IOC_FSGETXATTR): |
| // FS_IOC_GETVERSION has the same number as VIDIOCGCAP (but a different |
| // size); the same treatment works for both. |
| case IOCTL_MASK_SIZE(FS_IOC_GETVERSION): |
| case IOCTL_MASK_SIZE(FS_IOC_GETFLAGS): |
| case IOCTL_MASK_SIZE(TUNGETFEATURES): |
| case IOCTL_MASK_SIZE(TUNGETSNDBUF): |
| case IOCTL_MASK_SIZE(TUNGETVNETHDRSZ): |
| case IOCTL_MASK_SIZE(TUNGETVNETLE): |
| case IOCTL_MASK_SIZE(TUNGETVNETBE): |
| case IOCTL_MASK_SIZE(EVIOCGVERSION): |
| case IOCTL_MASK_SIZE(EVIOCGID): |
| case IOCTL_MASK_SIZE(EVIOCGREP): |
| case IOCTL_MASK_SIZE(EVIOCGKEYCODE): /* also covers EVIOCGKEYCODE_V2 */ |
| case IOCTL_MASK_SIZE(EVIOCGNAME(0)): |
| case IOCTL_MASK_SIZE(EVIOCGPHYS(0)): |
| case IOCTL_MASK_SIZE(EVIOCGUNIQ(0)): |
| case IOCTL_MASK_SIZE(EVIOCGPROP(0)): |
| case IOCTL_MASK_SIZE(EVIOCGMTSLOTS(0)): |
| case IOCTL_MASK_SIZE(EVIOCGKEY(0)): |
| case IOCTL_MASK_SIZE(EVIOCGLED(0)): |
| case IOCTL_MASK_SIZE(EVIOCGSND(0)): |
| case IOCTL_MASK_SIZE(EVIOCGSW(0)): |
| case IOCTL_MASK_SIZE(EVIOCGEFFECTS): |
| case IOCTL_MASK_SIZE(EVIOCGMASK): |
| case IOCTL_MASK_SIZE(JSIOCGVERSION): |
| case IOCTL_MASK_SIZE(JSIOCGAXES): |
| case IOCTL_MASK_SIZE(JSIOCGBUTTONS): |
| // This gets a list of js_corr structures whose length we don't know without |
| // querying the device ourselves. |
| // case IOCTL_MASK_SIZE(JSIOCGCORR): |
| case IOCTL_MASK_SIZE(JSIOCGAXMAP): |
| case IOCTL_MASK_SIZE(JSIOCGBTNMAP): |
| case IOCTL_MASK_SIZE(JSIOCGNAME(0)): |
| case IOCTL_MASK_SIZE(HIDIOCGRAWINFO): |
| case IOCTL_MASK_SIZE(HIDIOCGRAWNAME(0)): |
| case IOCTL_MASK_SIZE(BLKBSZGET): |
| case IOCTL_MASK_SIZE(BLKGETDISKSEQ): |
| syscall_state.reg_parameter(3, size); |
| return PREVENT_SWITCH; |
| |
| case IOCTL_MASK_SIZE(PERF_EVENT_IOC_ID): |
| syscall_state.reg_parameter<uint64_t>(3); |
| return PREVENT_SWITCH; |
| |
| case IOCTL_MASK_SIZE(USBDEVFS_ALLOC_STREAMS): |
| case IOCTL_MASK_SIZE(USBDEVFS_CLAIMINTERFACE): |
| case IOCTL_MASK_SIZE(USBDEVFS_CLEAR_HALT): |
| case IOCTL_MASK_SIZE(USBDEVFS_DISCONNECT_CLAIM): |
| case IOCTL_MASK_SIZE(USBDEVFS_FREE_STREAMS): |
| case IOCTL_MASK_SIZE(USBDEVFS_RELEASEINTERFACE): |
| case IOCTL_MASK_SIZE(USBDEVFS_SETCONFIGURATION): |
| case IOCTL_MASK_SIZE(USBDEVFS_SETINTERFACE): |
| case IOCTL_MASK_SIZE(USBDEVFS_SUBMITURB): |
| // Doesn't actually seem to write to userspace |
| return PREVENT_SWITCH; |
| |
| case IOCTL_MASK_SIZE(BLKGETSIZE64): |
| // The ioctl definition says "size_t" but it's actually a uint64! |
| syscall_state.reg_parameter<uint64_t>(3); |
| return PREVENT_SWITCH; |
| |
| case IOCTL_MASK_SIZE(TUNGETIFF): |
| // The ioctl definition says "unsigned int" but it's actually a |
| // struct ifreq! |
| syscall_state.reg_parameter<typename Arch::ifreq>(3); |
| return PREVENT_SWITCH; |
| case IOCTL_MASK_SIZE(TUNGETFILTER): |
| // The ioctl definition says "struct sock_fprog" but there is no kernel |
| // compat code so a 32-bit task on a 64-bit kernel needs to use the |
| // 64-bit type. |
| if (sizeof(void*) == 8) { |
| // 64-bit rr build. We must be on a 64-bit kernel so use the 64-bit |
| // sock_fprog type. |
| syscall_state.reg_parameter<typename NativeArch::sock_fprog>(3); |
| } else { |
| FATAL() << "TUNGETFILTER not supported on 32-bit since its behavior " |
| "depends on 32-bit vs 64-bit kernel"; |
| } |
| return PREVENT_SWITCH; |
| |
| case IOCTL_MASK_SIZE(USBDEVFS_IOCTL): { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::usbdevfs_ioctl>(3, IN); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, data), |
| _IOC_SIZE(args.ioctl_code)); |
| return PREVENT_SWITCH; |
| } |
| case IOCTL_MASK_SIZE(USBDEVFS_CONTROL): { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::usbdevfs_ctrltransfer>(3, |
| IN); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, data), |
| args.wLength); |
| return PREVENT_SWITCH; |
| } |
| case IOCTL_MASK_SIZE(FS_IOC_FIEMAP): { |
| auto argsp = remote_ptr<typename Arch::fiemap>(t->regs().arg3()); |
| auto args = t->read_mem(argsp); |
| size = sizeof(typename Arch::fiemap) + |
| sizeof(typename Arch::fiemap_extent) * args.fm_extent_count; |
| syscall_state.reg_parameter(3, size, IN_OUT); |
| return PREVENT_SWITCH; |
| } |
| } |
| |
| /* These ioctls are mostly regular but require additional recording. */ |
| switch (IOCTL_MASK_SIZE(request)) { |
| case IOCTL_MASK_SIZE(VIDIOC_DQBUF): { |
| if (size == sizeof(typename Arch::v4l2_buffer)) { |
| syscall_state.reg_parameter(3, size, IN_OUT); |
| syscall_state.after_syscall_action(record_v4l2_buffer_contents<Arch>); |
| // VIDIOC_DQBUF can block. It can't if the fd was opened O_NONBLOCK, |
| // but we don't try to determine that. |
| // Note that we're exposed to potential race conditions here because |
| // VIDIOC_DQBUF (blocking or not) assumes the driver has filled |
| // the mmapped data region at some point since the buffer was queued |
| // with VIDIOC_QBUF, and we don't/can't know exactly when that |
| // happened. Replay could fail if this thread or another thread reads |
| // the contents of mmapped buffers queued with the driver. |
| return ALLOW_SWITCH; |
| } |
| } |
| } |
| |
| syscall_state.expect_errno = EINVAL; |
| return PREVENT_SWITCH; |
| } |
| |
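| // Look up the BpfMapMonitor tracking the map fd named in the bpf_attr; |
| // it knows the map's key and value sizes, which tell us how many bytes |
| // the kernel will write for lookup/get-next-key commands. |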
| template <typename Arch> static BpfMapMonitor* bpf_map_monitor(RecordTask* t, |
| TaskSyscallState& syscall_state, remote_ptr<typename Arch::bpf_attr>* argsp_out) { |
| auto argsp = syscall_state.reg_parameter<typename Arch::bpf_attr>(2, IN); |
| auto args = t->read_mem(argsp); |
| FileMonitor* monitor = t->fd_table()->get_monitor(args.map_fd); |
| ASSERT(t, monitor) << "We need a BpfMapMonitor to handle this, but couldn't find it for fd " << args.map_fd; |
| ASSERT(t, monitor->type() == FileMonitor::BpfMap); |
| *argsp_out = argsp; |
| return static_cast<BpfMapMonitor*>(monitor); |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_bpf(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| int cmd = t->regs().arg1(); |
| switch (cmd) { |
| case BPF_MAP_CREATE: |
| case BPF_MAP_UPDATE_ELEM: |
| case BPF_MAP_DELETE_ELEM: |
| break; |
| case BPF_OBJ_GET: |
| return ALLOW_SWITCH; |
| case BPF_PROG_LOAD: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::bpf_attr>(2, IN); |
| auto args = t->read_mem(argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, log_buf), |
| args.log_size); |
| break; |
| } |
| case BPF_MAP_LOOKUP_ELEM: { |
| remote_ptr<typename Arch::bpf_attr> argsp; |
| BpfMapMonitor* monitor = bpf_map_monitor<Arch>(t, syscall_state, &argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, value), |
| monitor->value_size()); |
| break; |
| } |
| case BPF_MAP_GET_NEXT_KEY: { |
| remote_ptr<typename Arch::bpf_attr> argsp; |
| BpfMapMonitor* monitor = bpf_map_monitor<Arch>(t, syscall_state, &argsp); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(argsp, next_key), |
| monitor->key_size()); |
| break; |
| } |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| static bool maybe_emulate_wait(RecordTask* t, TaskSyscallState& syscall_state, |
| int options) { |
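| // Prefer emulated-ptrace tracees that already have a stop pending; |
| // otherwise scan real child processes for one whose emulated stop this |
| // wait could consume. |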
| for (RecordTask* child : t->emulated_ptrace_tracees) { |
| if (t->is_waiting_for_ptrace(child) && child->emulated_stop_pending) { |
| syscall_state.emulate_wait_for_child = child; |
| return true; |
| } |
| } |
| for (ThreadGroup* child_process : t->thread_group()->children()) { |
| for (Task* child : child_process->task_set()) { |
| auto rchild = static_cast<RecordTask*>(child); |
| if (rchild->emulated_stop_type == NOT_STOPPED) { |
| continue; |
| } |
| if (!(options & WUNTRACED) && rchild->emulated_stop_type != CHILD_STOP) { |
| continue; |
| } |
| if (!rchild->emulated_stop_pending || !t->is_waiting_for(rchild)) { |
| continue; |
| } |
| syscall_state.emulate_wait_for_child = rchild; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| static bool maybe_pause_instead_of_waiting(RecordTask* t, int options) { |
| if (t->in_wait_type != WAIT_TYPE_PID || (options & WNOHANG)) { |
| return false; |
| } |
| RecordTask* child = t->session().find_task(t->in_wait_pid); |
| if (!child) { |
| LOG(debug) << "Child " << t->in_wait_pid << " not found!"; |
| } |
| if (!child || !t->is_waiting_for_ptrace(child) || t->is_waiting_for(child)) { |
| return false; |
| } |
| // OK, t is waiting for a ptrace child by tid, but since t is not really |
| // ptracing child, entering a real wait syscall will not actually wait for |
| // the child, so the kernel may error out with ECHILD (non-ptracers can't |
| // wait on specific threads of another process, or for non-child processes). |
| // To avoid this problem, we'll replace the wait syscall with a pause() |
| // syscall. |
| // It would be nice if we didn't have to do this, but I can't see a better |
| // way. |
| Registers r = t->regs(); |
| // pause() would be sufficient here, but we don't have that on all |
| // architectures, so use ppoll(NULL, 0, NULL, NULL), which is what |
| // glibc uses to implement pause() on architectures where the former |
| // doesn't exist |
| r.set_original_syscallno(syscall_number_for_ppoll(t->arch())); |
| r.set_arg1(0); |
| r.set_arg2(0); |
| r.set_arg3(0); |
| r.set_arg4(0); |
| t->set_regs(r); |
| return true; |
| } |
| |
| static RecordTask* verify_ptrace_target(RecordTask* tracer, |
| TaskSyscallState& syscall_state, |
| pid_t pid, |
| bool require_stopped = true) { |
| RecordTask* tracee = tracer->session().find_task(pid); |
| if (!tracee) { |
| LOG(debug) << "tracee pid " << pid << " is unknown to rr"; |
| syscall_state.emulate_result(-ESRCH); |
| return nullptr; |
| } |
| if (tracee->emulated_ptracer != tracer) { |
| LOG(debug) << pid << " is not traced by " << tracer->tid; |
| syscall_state.emulate_result(-ESRCH); |
| return nullptr; |
| } |
| if (require_stopped && tracee->emulated_stop_type == NOT_STOPPED) { |
| LOG(debug) << pid << " is not in a ptrace stop"; |
| syscall_state.emulate_result(-ESRCH); |
| return nullptr; |
| } |
| return tracee; |
| } |
| |
| static void prepare_ptrace_cont(RecordTask* tracee, int sig, int command) { |
| if (sig) { |
| siginfo_t si = tracee->take_ptrace_signal_siginfo(sig); |
| LOG(debug) << "Doing ptrace resume with signal " << signal_name(sig); |
| // Treat signal as nondeterministic; it won't happen just by |
| // replaying the tracee. |
| tracee->push_event( |
| Event(EV_SIGNAL, SignalEvent(si, NONDETERMINISTIC_SIG, |
| tracee->sig_resolved_disposition( |
| si.si_signo, NONDETERMINISTIC_SIG)))); |
| } |
| |
| tracee->emulated_stop_type = NOT_STOPPED; |
| tracee->emulated_stop_pending = false; |
| tracee->emulated_stop_code = WaitStatus(); |
| tracee->emulated_ptrace_cont_command = command; |
| |
| if (tracee->ev().is_syscall_event() && |
| PROCESSING_SYSCALL == tracee->ev().Syscall().state) { |
| // Continue the task since we didn't in enter_syscall |
| tracee->resume_execution(RESUME_SYSCALL, RESUME_NONBLOCKING, |
| RESUME_NO_TICKS); |
| } |
| } |
| |
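| // PTRACE_PEEKUSR reads registers through byte buffers whose width |
| // depends on the register; these helpers widen such a buffer to 64 bits |
| // with the appropriate signedness. |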
| static uint64_t widen_buffer_unsigned(const void* buf, size_t size) { |
| switch (size) { |
| case 1: |
| return *reinterpret_cast<const uint8_t*>(buf); |
| case 2: |
| return *reinterpret_cast<const uint16_t*>(buf); |
| case 4: |
| return *reinterpret_cast<const uint32_t*>(buf); |
| case 8: |
| return *reinterpret_cast<const uint64_t*>(buf); |
| default: |
| DEBUG_ASSERT(0 && "Unsupported size"); |
| return 0; |
| } |
| } |
| |
| static int64_t widen_buffer_signed(const void* buf, size_t size) { |
| switch (size) { |
| case 1: |
| return *reinterpret_cast<const int8_t*>(buf); |
| case 2: |
| return *reinterpret_cast<const int16_t*>(buf); |
| case 4: |
| return *reinterpret_cast<const int32_t*>(buf); |
| case 8: |
| return *reinterpret_cast<const int64_t*>(buf); |
| default: |
| DEBUG_ASSERT(0 && "Unsupported size"); |
| return 0; |
| } |
| } |
| |
| static uint64_t path_inode_number(const char* path) { |
| struct stat st; |
| int ret = stat(path, &st); |
| DEBUG_ASSERT(ret == 0); |
| return st.st_ino; |
| } |
| |
| static bool is_same_namespace(const char* name, pid_t tid1, pid_t tid2) { |
| char path1[PATH_MAX]; |
| char path2[PATH_MAX]; |
| sprintf(path1, "/proc/%d/ns/%s", tid1, name); |
| sprintf(path2, "/proc/%d/ns/%s", tid2, name); |
| return path_inode_number(path1) == path_inode_number(path2); |
| } |
| |
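| // Implements the PTRACE_GETREGSET iovec protocol: clamp iov_len to the |
| // data we have, copy the register bytes to the tracee's iov_base, and |
| // write the clamped length back so the tracer sees how much it got. |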
| template <typename Arch> |
| static void ptrace_get_reg_set(RecordTask* t, TaskSyscallState& syscall_state, |
| const vector<uint8_t>& regs) { |
| auto piov = syscall_state.reg_parameter<typename Arch::iovec>(4, IN_OUT); |
| auto iov = t->read_mem(piov); |
| iov.iov_len = min<size_t>(iov.iov_len, regs.size()); |
| t->write_mem(piov, iov); |
| auto data = syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(piov, iov_base), |
| iov.iov_len); |
| t->write_bytes_helper(data, iov.iov_len, regs.data()); |
| syscall_state.emulate_result(0); |
| } |
| |
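| // For PTRACE_SETREGSET we only validate that the iovec covers at least |
| // the minimum register-set size here; the actual register writes are |
| // performed at syscall exit (see Task::on_syscall_exit_arch). |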
| template <typename Arch> |
| static void ptrace_verify_set_reg_set(RecordTask* t, size_t min_size, |
| TaskSyscallState& syscall_state) { |
| auto iov = t->read_mem(remote_ptr<typename Arch::iovec>(t->regs().arg4())); |
| if (iov.iov_len < min_size) { |
| syscall_state.emulate_result(-EIO); |
| return; |
| } |
| syscall_state.emulate_result(0); |
| } |
| |
| static bool verify_ptrace_options(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| // We "support" PTRACE_O_SYSGOOD because we don't support PTRACE_SYSCALL yet |
| static const int supported_ptrace_options = |
| PTRACE_O_TRACESYSGOOD | PTRACE_O_TRACEEXIT | PTRACE_O_TRACEFORK | |
| PTRACE_O_TRACECLONE | PTRACE_O_TRACEVFORK | PTRACE_O_TRACEEXEC | PTRACE_O_TRACEVFORKDONE; |
| |
| if ((int)t->regs().arg4() & ~supported_ptrace_options) { |
| LOG(debug) << "Unsupported ptrace options " << HEX(t->regs().arg4()); |
| syscall_state.emulate_result(-EINVAL); |
| return false; |
| } |
| return true; |
| } |
| |
| static bool check_ptracer_compatible(RecordTask* tracer, RecordTask* tracee) { |
| // Don't allow a 32-bit process to trace a 64-bit process. That doesn't |
| // make much sense (manipulating registers gets crazy), and would be hard to |
| // support. |
| if (tracee->emulated_ptracer || tracee->tgid() == tracer->tgid() || |
| (tracer->arch() == x86 && tracee->arch() == x86_64)) { |
| return false; |
| } |
| return true; |
| } |
| |
| static RecordTask* get_ptrace_partner(RecordTask* t, pid_t pid) { |
| // To simplify things, require that a ptracer be in the same pid |
| // namespace as rr itself. I.e., tracee tasks sandboxed in a pid |
| // namespace can't use ptrace. This is normally a requirement of |
| // sandboxes anyway. |
| // This could be supported, but would require some work to translate |
| // rr's pids to/from the ptracer's pid namespace. |
| ASSERT(t, is_same_namespace("pid", t->tid, getpid())); |
| RecordTask* partner = t->session().find_task(pid); |
| if (!partner) { |
| // XXX This prevents a tracee from attaching to a process which isn't |
| // under rr's control. We could support this but it would complicate |
| // things. |
| return nullptr; |
| } |
| return partner; |
| } |
| |
| static RecordTask* prepare_ptrace_attach(RecordTask* t, pid_t pid, |
| TaskSyscallState& syscall_state) { |
| RecordTask* tracee = get_ptrace_partner(t, pid); |
| if (!tracee) { |
| syscall_state.emulate_result(-ESRCH); |
| return nullptr; |
| } |
| if (!check_ptracer_compatible(t, tracee)) { |
| syscall_state.emulate_result(-EPERM); |
| return nullptr; |
| } |
| return tracee; |
| } |
| |
| static RecordTask* prepare_ptrace_traceme(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| RecordTask* tracer = get_ptrace_partner(t, t->get_parent_pid()); |
| if (!tracer) { |
| syscall_state.emulate_result(-ESRCH); |
| return nullptr; |
| } |
| if (!check_ptracer_compatible(tracer, t)) { |
| syscall_state.emulate_result(-EPERM); |
| return nullptr; |
| } |
| return tracer; |
| } |
| |
| static void ptrace_attach_to_already_stopped_task(RecordTask* t) { |
| ASSERT(t, t->emulated_stop_type == GROUP_STOP); |
| // tracee is already stopped because of a group-stop signal. |
| // Sending a SIGSTOP won't work, but we don't need to. |
| t->force_emulate_ptrace_stop(WaitStatus::for_stop_sig(SIGSTOP), t->emulated_stop_type); |
| siginfo_t si; |
| memset(&si, 0, sizeof(si)); |
| si.si_signo = SIGSTOP; |
| si.si_code = SI_USER; |
| t->save_ptrace_signal_siginfo(si); |
| } |
| |
| /** |
| * The PTRACE_GETREGS/PTRACE_SETREGS commands, as well as various PEEK_* and |
| * POKE_* ptrace commands, are legacy and not implemented on newer architectures. |
| * We split them out here, since these newer platforms |
| * will not define the requisite data structure to serve them. |
| */ |
| template <typename Arch> |
| static void prepare_ptrace_legacy(RecordTask* t, |
| TaskSyscallState& syscall_state) |
| { |
| pid_t pid = (pid_t)t->regs().arg2_signed(); |
| int command = (int)t->regs().arg1_signed(); |
| switch (command) { |
| case Arch::PTRACE_PEEKUSR: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| // The actual syscall returns the data via the 'data' out-parameter. |
| // The behavior of returning the data as the system call result is |
| // provided by the glibc wrapper. |
| size_t addr = t->regs().arg3(); |
| typename Arch::unsigned_word data; |
| if ((addr & (sizeof(data) - 1)) || |
| addr >= sizeof(typename Arch::user)) { |
| syscall_state.emulate_result(-EIO); |
| break; |
| } |
| |
| auto datap = |
| syscall_state.reg_parameter<typename Arch::unsigned_word>(4); |
| if (addr < sizeof(typename Arch::user_regs_struct)) { |
| uint8_t buf[Registers::MAX_SIZE]; |
| bool defined; |
| size_t size = |
| tracee->regs().read_register_by_user_offset(buf, addr, &defined); |
| if (defined) { |
| // For unclear reasons, all 32-bit user_regs_struct members are |
| // signed while all 64-bit user_regs_struct members are unsigned. |
| if (Arch::arch() == x86) { |
| data = widen_buffer_signed(buf, size); |
| } else { |
| data = widen_buffer_unsigned(buf, size); |
| } |
| } else { |
| data = 0; |
| } |
| } else if (addr >= offsetof(typename Arch::user, u_debugreg[0]) && |
| addr < offsetof(typename Arch::user, u_debugreg[8])) { |
| size_t regno = (addr - offsetof(typename Arch::user, u_debugreg[0])) / |
| sizeof(data); |
| data = tracee->get_debug_reg(regno); |
| } else { |
| data = 0; |
| } |
| |
| t->write_mem(datap, data); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_POKEUSR: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| // The actual syscall returns the data via the 'data' out-parameter. |
| // The behavior of returning the data as the system call result is |
| // provided by the glibc wrapper. |
| size_t addr = t->regs().arg3(); |
| if ((addr & (sizeof(typename Arch::unsigned_word) - 1)) || |
| addr >= sizeof(typename Arch::user)) { |
| syscall_state.emulate_result(-EIO); |
| break; |
| } |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_GETREGS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto data = |
| syscall_state.reg_parameter<typename Arch::user_regs_struct>(4); |
| auto regs = tracee->regs().get_ptrace_for_arch(Arch::arch()); |
| ASSERT(t, regs.size() == data.referent_size()); |
| t->write_bytes_helper(data, regs.size(), regs.data()); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_GETFPREGS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto data = |
| syscall_state.reg_parameter<typename Arch::user_fpregs_struct>(4); |
| auto regs = tracee->extra_regs().get_user_fpregs_struct(Arch::arch()); |
| ASSERT(t, regs.size() == data.referent_size()); |
| t->write_bytes_helper(data, regs.size(), regs.data()); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_GETFPXREGS: { |
| if (Arch::arch() != x86) { |
| // GETFPXREGS is x86-32 only |
| syscall_state.expect_errno = EIO; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto data = |
| syscall_state.reg_parameter<X86Arch::user_fpxregs_struct>(4); |
| auto regs = tracee->extra_regs().get_user_fpxregs_struct(); |
| t->write_mem(data, regs); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_SETREGS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| // The actual register effects are performed by |
| // Task::on_syscall_exit_arch |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_SETFPREGS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| // The actual register effects are performed by |
| // Task::on_syscall_exit_arch |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case Arch::PTRACE_SETFPXREGS: { |
| if (Arch::arch() != x86) { |
| // SETFPXREGS is x86-32 only |
| syscall_state.expect_errno = EIO; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| // The actual register effects are performed by |
| // Task::on_syscall_exit_arch |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| } |
| } |
| |
| template <> |
| void prepare_ptrace_legacy<ARM64Arch>(RecordTask*, TaskSyscallState&) { |
| // Nothing to do - unimplemented on this architecture |
| } |
| |
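| // Arch::PTRACE_* requests that don't exist on the current architecture |
| // are (presumably) defined as negative sentinels; mapping negative |
| // user-supplied commands to INT32_MAX keeps them from matching those |
| // case labels below. |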
| static int non_negative_command(int command) { return command < 0 ? INT32_MAX : command; } |
| |
| template <typename Arch> |
| static Switchable prepare_ptrace(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| pid_t pid = (pid_t)t->regs().arg2_signed(); |
| bool emulate = true; |
| int command = (int)t->regs().arg1_signed(); |
| switch (non_negative_command(command)) { |
| case PTRACE_ATTACH: { |
| RecordTask* tracee = prepare_ptrace_attach(t, pid, syscall_state); |
| if (!tracee) { |
| break; |
| } |
| tracee->set_emulated_ptracer(t); |
| tracee->emulated_ptrace_seized = false; |
| tracee->emulated_ptrace_options = 0; |
| syscall_state.emulate_result(0); |
| if (tracee->emulated_stop_type == NOT_STOPPED) { |
| // Send SIGSTOP to this specific thread. Otherwise the kernel might |
| // deliver SIGSTOP to some other thread of the process, and we won't |
| // generate any ptrace event if that thread isn't being ptraced. |
| tracee->tgkill(SIGSTOP); |
| } else { |
| ptrace_attach_to_already_stopped_task(tracee); |
| } |
| break; |
| } |
| case PTRACE_TRACEME: { |
| RecordTask* tracer = prepare_ptrace_traceme(t, syscall_state); |
| if (!tracer) { |
| break; |
| } |
| t->set_emulated_ptracer(tracer); |
| t->emulated_ptrace_seized = false; |
| t->emulated_ptrace_options = 0; |
| syscall_state.emulate_result(0); |
| break; |
| } |
| case PTRACE_SEIZE: { |
| RecordTask* tracee = prepare_ptrace_attach(t, pid, syscall_state); |
| if (!tracee) { |
| break; |
| } |
| if (t->regs().arg3()) { |
| syscall_state.emulate_result(-EIO); |
| break; |
| } |
| if (!verify_ptrace_options(t, syscall_state)) { |
| break; |
| } |
| tracee->set_emulated_ptracer(t); |
| tracee->emulated_ptrace_seized = true; |
| tracee->emulated_ptrace_options = (int)t->regs().arg4(); |
| if (tracee->emulated_stop_type == GROUP_STOP) { |
| ptrace_attach_to_already_stopped_task(tracee); |
| } |
| syscall_state.emulate_result(0); |
| break; |
| } |
| case Arch::PTRACE_OLDSETOPTIONS: |
| RR_FALLTHROUGH; |
| case PTRACE_SETOPTIONS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| if (!verify_ptrace_options(t, syscall_state)) { |
| break; |
| } |
| tracee->emulated_ptrace_options = (int)t->regs().arg4(); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_GETEVENTMSG: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto datap = |
| syscall_state.reg_parameter<typename Arch::unsigned_long>(4); |
| t->write_mem( |
| datap, |
| (typename Arch::unsigned_long)tracee->emulated_ptrace_event_msg); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_GETSIGINFO: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto datap = syscall_state.reg_parameter<typename Arch::siginfo_t>(4); |
| typename Arch::siginfo_t dest; |
| memset(&dest, 0, sizeof(dest)); |
| set_arch_siginfo(tracee->get_saved_ptrace_siginfo(), Arch::arch(), |
| &dest, sizeof(dest)); |
| t->write_mem(datap, dest); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_GETREGSET: { |
| switch ((int)t->regs().arg3()) { |
| case NT_PRSTATUS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto regs = tracee->regs().get_ptrace_for_arch(tracee->arch()); |
| ptrace_get_reg_set<Arch>(t, syscall_state, regs); |
| } |
| break; |
| } |
| case NT_PRFPREG: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| auto regs = |
| tracee->extra_regs().get_user_fpregs_struct(tracee->arch()); |
| ptrace_get_reg_set<Arch>(t, syscall_state, regs); |
| } |
| break; |
| } |
| case NT_ARM_SYSTEM_CALL: { |
| if (Arch::arch() != aarch64) { |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| int syscallno = tracee->regs().original_syscallno(); |
| uint8_t *data = (uint8_t*)&syscallno; |
| vector<uint8_t> regs(data, data+sizeof(syscallno)); |
| ptrace_get_reg_set<Arch>(t, syscall_state, regs); |
| } |
| break; |
| } |
| case NT_ARM_HW_BREAK: |
| case NT_ARM_HW_WATCH: { |
| if (Arch::arch() != aarch64) { |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| ARM64Arch::user_hwdebug_state bps; |
| bool ok = tracee->get_aarch64_debug_regs((int)t->regs().arg3(), &bps); |
| ASSERT(tracee, ok); |
| uint8_t *data = (uint8_t*)&bps; |
| vector<uint8_t> regs(data, data+sizeof(bps)); |
| ptrace_get_reg_set<Arch>(t, syscall_state, regs); |
| } |
| break; |
| } |
| case NT_X86_XSTATE: { |
| if (!Arch::is_x86ish()) { |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| switch (tracee->extra_regs().format()) { |
| case ExtraRegisters::XSAVE: |
| ptrace_get_reg_set<Arch>(t, syscall_state, |
| tracee->extra_regs().data()); |
| break; |
| default: |
| syscall_state.emulate_result(-EINVAL); |
| break; |
| } |
| } |
| break; |
| } |
| default: |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| break; |
| } |
| case PTRACE_SETREGSET: { |
| // The actual register effects are performed by |
| // Task::on_syscall_exit_arch |
| switch ((int)t->regs().arg3()) { |
| case NT_PRSTATUS: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| ptrace_verify_set_reg_set<Arch>( |
| t, user_regs_struct_size(tracee->arch()), syscall_state); |
| } |
| break; |
| } |
| case NT_PRFPREG: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| ptrace_verify_set_reg_set<Arch>( |
| t, user_fpregs_struct_size(tracee->arch()), syscall_state); |
| } |
| break; |
| } |
| case NT_ARM_SYSTEM_CALL: { |
| if (Arch::arch() != aarch64) { |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| ptrace_verify_set_reg_set<Arch>( |
| t, sizeof(int), syscall_state); |
| } |
| break; |
| } |
| case NT_ARM_HW_WATCH: |
| case NT_ARM_HW_BREAK: { |
| if (Arch::arch() != aarch64) { |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| ptrace_verify_set_reg_set<Arch>( |
| t, offsetof(ARM64Arch::user_hwdebug_state, dbg_regs[0]), |
| syscall_state); |
| } |
| break; |
| } |
| case NT_X86_XSTATE: { |
| if (!Arch::is_x86ish()) { |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| switch (tracee->extra_regs().format()) { |
| case ExtraRegisters::XSAVE: |
| ptrace_verify_set_reg_set<Arch>( |
| t, tracee->extra_regs().data_size(), syscall_state); |
| break; |
| default: |
| syscall_state.emulate_result(-EINVAL); |
| break; |
| } |
| } |
| break; |
| } |
| default: |
| syscall_state.expect_errno = EINVAL; |
| emulate = false; |
| break; |
| } |
| break; |
| } |
| case PTRACE_SYSCALL: |
| case PTRACE_SINGLESTEP: |
| case Arch::PTRACE_SYSEMU: |
| case Arch::PTRACE_SYSEMU_SINGLESTEP: |
| case PTRACE_CONT: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| // If the tracer wants to observe syscall entries, we can't use the |
| // syscallbuf, because the tracer may want to change syscall numbers |
| // which the syscallbuf code is not prepared to handle. We also lock |
| // the syscallbuf for PTRACE_SINGLESTEP, since we usually try to avoid |
| // delivering signals (e.g. PTRACE_SINGLESTEP's SIGTRAP) inside |
| // syscallbuf code. However, if the syscallbuf is locked, delivering |
| // them there should be safe. |
| if (tracee) { |
| if (!((unsigned int)t->regs().arg4() < _NSIG)) { |
| // Invalid signals in ptrace resume cause EIO |
| syscall_state.emulate_result(-EIO); |
| break; |
| } |
| tracee->set_syscallbuf_locked(command != PTRACE_CONT); |
| prepare_ptrace_cont(tracee, t->regs().arg4(), command); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_DETACH: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| tracee->set_syscallbuf_locked(0); |
| tracee->emulated_ptrace_options = 0; |
| tracee->emulated_ptrace_cont_command = 0; |
| tracee->emulated_stop_pending = false; |
| prepare_ptrace_cont(tracee, t->regs().arg4(), 0); |
| tracee->set_emulated_ptracer(nullptr); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_KILL: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| tracee->kill_if_alive(); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_INTERRUPT: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid, false); |
| if (tracee) { |
| if (!tracee->is_stopped()) { |
| // Running in a blocked syscall. Forward the PTRACE_INTERRUPT. |
| // Regular syscall exit handling will take over from here. |
| errno = 0; |
| tracee->fallible_ptrace(PTRACE_INTERRUPT, nullptr, nullptr); |
| syscall_state.emulate_result(-errno); |
| } else if (tracee->status().is_syscall()) { |
| tracee->emulate_ptrace_stop(tracee->status(), SYSCALL_EXIT_STOP); |
| } else if (tracee->emulated_stop_type == NOT_STOPPED) { |
| // The tracee is stopped from our perspective, but not stopped from |
| // the perspective of the ptracer. Emulate a stop now. |
| tracee->apply_group_stop(SIGSTOP); |
| } |
| // Otherwise, there's nothing to do. |
| syscall_state.emulate_result(0); |
| } |
| break; |
| } |
| case PTRACE_GET_SYSCALL_INFO: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| remote_ptr<uint8_t> remote_addr(t->regs().arg4()); |
| bool ok = true; |
| typename Arch::ptrace_syscall_info info; |
| memset(&info, 0, sizeof(info)); |
| info.op = |
| tracee->emulated_stop_type == SYSCALL_ENTRY_STOP ? PTRACE_SYSCALL_INFO_ENTRY : |
| tracee->emulated_stop_type == SYSCALL_EXIT_STOP ? PTRACE_SYSCALL_INFO_EXIT : |
| tracee->emulated_stop_type == SECCOMP_STOP ? PTRACE_SYSCALL_INFO_SECCOMP : |
| PTRACE_SYSCALL_INFO_NONE; |
| info.arch = to_audit_arch(tracee->arch()); |
| info.instruction_pointer = tracee->ip().register_value(); |
| info.stack_pointer = tracee->regs().sp().as_int(); |
| size_t max_size = 0; |
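| // The kernel returns the size of the populated record and copies out |
| // at most the user's buffer size; mirror that by measuring up to the |
| // end of the union member we filled in. |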
| if (info.op == PTRACE_SYSCALL_INFO_ENTRY) { |
| info.entry.nr = tracee->regs().original_syscallno(); |
| for (int i = 0; i < 6; ++i) { |
| info.entry.args[i] = tracee->regs().arg(i+1); |
| } |
| max_size = ((char*)&info.entry.args[6] - (char*)&info); |
| } else if (info.op == PTRACE_SYSCALL_INFO_EXIT) { |
| info.exit.rval = tracee->regs().syscall_result_signed(); |
| info.exit.is_error = tracee->regs().syscall_result_signed() < 0; |
| max_size = ((char*)&info.exit.is_error - (char*)&info) + 1; |
| } else if (info.op == PTRACE_SYSCALL_INFO_SECCOMP) { |
| ASSERT(tracee, false) << "Unimplemented: PTRACE_SYSCALL_INFO_SECCOMP"; |
| } |
| size_t user_size = t->regs().arg3(); |
| size_t to_write = min(user_size, max_size); |
| t->write_mem(remote_addr, (uint8_t*)&info, to_write, &ok); |
| if (!ok) { |
| syscall_state.emulate_result(-EFAULT); |
| break; |
| } |
| t->record_local(remote_addr, (uint8_t*)&info, to_write); |
| syscall_state.emulate_result(max_size); |
| } |
| break; |
| } |
| case Arch::PTRACE_GET_THREAD_AREA: |
| case Arch::PTRACE_SET_THREAD_AREA: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| if (tracee->arch() != SupportedArch::x86) { |
| // This syscall should fail if the tracee is not x86 |
| syscall_state.expect_errno = EIO; |
| emulate = false; |
| break; |
| } |
| remote_ptr<X86Arch::user_desc> remote_addr(t->regs().arg4()); |
| bool ok = true; |
| struct X86Arch::user_desc desc; |
| memset(&desc, 0, sizeof(struct X86Arch::user_desc)); |
| // Do the ptrace request ourselves |
| if (command == Arch::PTRACE_GET_THREAD_AREA) { |
| int ret = -tracee->emulate_get_thread_area(t->regs().arg3(), desc); |
| if (ret == 0) { |
| t->write_mem(remote_addr, desc, &ok); |
| if (!ok) { |
| syscall_state.emulate_result(-EFAULT); |
| break; |
| } |
| t->record_local(remote_addr, &desc); |
| } |
| syscall_state.emulate_result(ret); |
| } else { |
| desc = t->read_mem(remote_addr, &ok); |
| if (!ok) { |
| syscall_state.emulate_result(-EFAULT); |
| break; |
| } |
| syscall_state.emulate_result( |
| -tracee->emulate_set_thread_area((int)t->regs().arg3(), desc)); |
| } |
| } |
| break; |
| } |
| case Arch::PTRACE_ARCH_PRCTL: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| if (tracee->arch() != SupportedArch::x86_64) { |
| // This syscall should fail if the tracee is not |
| // x86_64 |
| syscall_state.expect_errno = EIO; |
| emulate = false; |
| break; |
| } |
| int code = (int)t->regs().arg4(); |
| switch (code) { |
| case ARCH_GET_FS: |
| case ARCH_GET_GS: { |
| bool ok = true; |
| remote_ptr<uint64_t> addr(t->regs().arg3()); |
| uint64_t data = code == ARCH_GET_FS ? |
| tracee->regs().fs_base() : tracee->regs().gs_base(); |
| t->write_mem(addr, data, &ok); |
| if (ok) { |
| t->record_local(addr, &data); |
| syscall_state.emulate_result(0); |
| } else { |
| syscall_state.emulate_result(-EIO); |
| } |
| break; |
| } |
| case ARCH_SET_FS: |
| case ARCH_SET_GS: |
| syscall_state.emulate_result(0); |
| break; |
| default: |
| syscall_state.emulate_result(-EINVAL); |
| break; |
| } |
| } |
| break; |
| } |
| case PTRACE_PEEKTEXT: |
| case PTRACE_PEEKDATA: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| // The actual syscall returns the data via the 'data' out-parameter. |
| // The behavior of returning the data as the system call result is |
| // provided by the glibc wrapper. |
| auto datap = |
| syscall_state.reg_parameter<typename Arch::unsigned_word>(4); |
| remote_ptr<typename Arch::unsigned_word> addr = t->regs().arg3(); |
| bool ok = true; |
| auto v = tracee->read_mem(addr, &ok); |
| if (ok) { |
| t->write_mem(datap, v); |
| syscall_state.emulate_result(0); |
| } else { |
| syscall_state.emulate_result(-EIO); |
| } |
| } |
| break; |
| } |
| case PTRACE_POKETEXT: |
| case PTRACE_POKEDATA: { |
| RecordTask* tracee = verify_ptrace_target(t, syscall_state, pid); |
| if (tracee) { |
| remote_ptr<typename Arch::unsigned_word> addr = t->regs().arg3(); |
| typename Arch::unsigned_word data = t->regs().arg4(); |
| bool ok = true; |
| tracee->write_mem(addr, data, &ok); |
| if (ok) { |
| // Since we're recording data that might not be for |t|, we have to |
| // handle this specially during replay. |
| tracee->record_local(addr, &data); |
| syscall_state.emulate_result(0); |
| } else { |
| syscall_state.emulate_result(-EIO); |
| } |
| } |
| break; |
| } |
| case Arch::PTRACE_PEEKUSR: |
| case Arch::PTRACE_POKEUSR: |
| case Arch::PTRACE_GETREGS: |
| case Arch::PTRACE_GETFPREGS: |
| case Arch::PTRACE_GETFPXREGS: |
| case Arch::PTRACE_SETREGS: |
| case Arch::PTRACE_SETFPREGS: |
| case Arch::PTRACE_SETFPXREGS: |
| prepare_ptrace_legacy<Arch>(t, syscall_state); |
| break; |
| default: |
| syscall_state.expect_errno = EIO; |
| emulate = false; |
| break; |
| } |
| if (emulate) { |
| Registers r = t->regs(); |
| r.set_arg1((intptr_t)-1); |
| t->set_regs(r); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| static void check_signals_while_exiting(RecordTask* t) { |
| const RecordTask::StashedSignal* s = t->peek_stashed_sig_to_deliver(); |
| if (s) { |
| // An unblockable signal (SIGKILL, SIGSTOP) might be received |
| // and stashed. Since these signals are unblockable they take |
| // effect no matter what and we don't need to deliver them to an exiting |
| // thread. |
| int sig = s->siginfo.si_signo; |
| ASSERT(t, sig == SIGKILL || sig == SIGSTOP) |
| << "Got unexpected signal " << s->siginfo |
| << " (should have been blocked)"; |
| } |
| } |
| |
| static bool send_signal_during_init_buffers() { |
| static bool send = getenv("RR_INIT_BUFFERS_SEND_SIGNAL") != nullptr; |
| return send; |
| } |
| |
| /** |
| * At thread exit time, undo the work that init_buffers() did. |
| * |
| * Call this when the tracee has already entered SYS_exit/SYS_exit_group. The |
| * tracee will be left in a state in which it has entered (or |
| * re-entered) SYS_exit/SYS_exit_group. |
| */ |
| static void prepare_exit(RecordTask* t) { |
| // RecordSession is responsible for ensuring we don't get here with |
| // pending signals. |
| ASSERT(t, !t->has_stashed_sig()); |
| |
| t->stable_exit = true; |
| t->session().scheduler().in_stable_exit(t); |
| |
| Registers r = t->regs(); |
| Registers exit_regs = r; |
| ASSERT(t, |
| is_exit_syscall(exit_regs.original_syscallno(), |
| t->ev().Syscall().arch()) || |
| is_exit_group_syscall(exit_regs.original_syscallno(), |
| t->ev().Syscall().arch())) |
| << "Tracee should have been at exit/exit_group, but instead at " |
| << t->ev().Syscall().syscall_name(); |
| |
| // The first thing we need to do is to block all signals to prevent |
| // a signal being delivered to the thread (since it's going to exit and |
| // won't be able to handle any more signals). |
| // |
| // The tracee is at the entry to SYS_exit/SYS_exit_group, but hasn't started |
| // the call yet. We can't directly start injecting syscalls |
| // because the tracee is still in the kernel. And obviously, |
| // if we finish the SYS_exit/SYS_exit_group syscall, the tracee isn't around |
| // anymore. |
| // |
| // So hijack this SYS_exit call and rewrite it into a SYS_rt_sigprocmask. |
| r.set_original_syscallno(syscall_number_for_rt_sigprocmask(t->arch())); |
| r.set_arg1(SIG_BLOCK); |
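| // arg2 points at a preexisting run of 0xff bytes in the rr page, i.e. |
| // a sigset with every bit set, so this blocks everything blockable. |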
| r.set_arg2(AddressSpace::rr_page_record_ff_bytes()); |
| r.set_arg3(0); |
| r.set_arg4(sizeof(sig_set_t)); |
| t->set_regs(r); |
| // This exits the SYS_rt_sigprocmask. Now the tracee is ready to do our |
| // bidding. |
| t->exit_syscall(); |
| check_signals_while_exiting(t); |
| |
| // Do the actual buffer and fd cleanup. |
| t->destroy_buffers(); |
| |
| check_signals_while_exiting(t); |
| |
| // Restore these regs to what they would have been just before |
| // the tracee trapped at SYS_exit/SYS_exit_group. When we've finished |
| // cleanup, we'll restart the call. |
| exit_regs.set_syscallno(exit_regs.original_syscallno()); |
| exit_regs.set_original_syscallno(-1); |
| exit_regs.set_ip(exit_regs.ip() - syscall_instruction_length(t->arch())); |
| ASSERT(t, is_at_syscall_instruction(t, exit_regs.ip())) |
| << "Tracee should have entered through int $0x80."; |
| // Restart the SYS_exit call. |
| t->set_regs(exit_regs); |
| t->enter_syscall(); |
| check_signals_while_exiting(t); |
| |
| if (t->emulated_ptrace_options & PTRACE_O_TRACEEXIT) { |
| t->emulate_ptrace_stop(WaitStatus::for_ptrace_event(PTRACE_EVENT_EXIT)); |
| } |
| } |
| |
| static void prepare_mmap_register_params(RecordTask* t) { |
| Registers r = t->regs(); |
| |
| FileMonitor* monitor = t->fd_table()->get_monitor(r.arg5_signed()); |
| if (monitor) { |
| switch (monitor->type()) { |
| case FileMonitor::VirtualPerfCounter: |
| case FileMonitor::NonvirtualPerfCounter: |
| LOG(info) << "Faking failure of mmap for perf event counter"; |
| // Force mmap to fail by setting fd to our tracee socket |
| r.set_arg5(t->session().tracee_fd_number()); |
| t->set_regs(r); |
| return; |
| default: |
| break; |
| } |
| } |
| |
| intptr_t mask_flag = MAP_FIXED; |
| #ifdef MAP_32BIT |
| mask_flag |= MAP_32BIT; |
| #endif |
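| // Only randomize placement when the tracee left it up to the kernel; |
| // MAP_FIXED (and MAP_32BIT where it exists) constrain the address, so |
| // we must not override them. |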
| if (t->enable_chaos_memory_allocations() && !(r.arg4_signed() & mask_flag)) { |
| // Not MAP_FIXED. Randomize the allocation address. |
| remote_ptr<void> hint = floor_page_size(r.arg1()); |
| size_t orig_len = ceil_page_size(r.arg1() + r.arg2()) - hint.as_int(); |
| size_t len = orig_len; |
| if (r.arg4_signed() & MAP_GROWSDOWN) { |
| // Ensure stacks can grow to the minimum size we choose |
| len = max<size_t>(AddressSpace::chaos_mode_min_stack_size(), len); |
| } |
| remote_ptr<void> addr = t->vm()->chaos_mode_find_free_memory(t, len, hint); |
| if (addr.is_null()) { |
| // force ENOMEM if other flags are valid |
| r.set_arg2(uintptr_t(1) << (word_size(t->arch())*8 - 1)); |
| t->set_regs(r); |
| return; |
| } |
| // We don't set MAP_FIXED. The new map *should* land at the address we request, |
| // because we tried to choose a free address. |
| // If that fails because there is something mapped there, that's an rr bug |
| // but we don't want to wipe out that mapping. Better to just carry on. |
| // This may mean the mapping lands in an area we tried to exclude; that's |
| // probably better than failing to record. |
| // We could use MAP_FIXED_NOREPLACE at some point (after the kernels |
| // that shipped the broken version (< 4.19) are no longer relevant). |
| r.set_arg1(addr + len - orig_len); |
| LOG(debug) << "Chaos mode selected address " << HEX(r.arg1()); |
| } |
| r.set_arg4(r.arg4_signed() & ~MAP_GROWSDOWN); |
| t->set_regs(r); |
| } |
| |
| enum ScratchAddrType { FIXED_ADDRESS, DYNAMIC_ADDRESS }; |
| /* Pointer used when running RR in WINE. Memory below this address is |
| unmapped by WINE immediately after exec, so start the scratch buffer |
| here. */ |
| static const uintptr_t FIXED_SCRATCH_PTR = 0x68000000; |
| |
| static void init_scratch_memory(RecordTask* t, |
| ScratchAddrType addr_type = DYNAMIC_ADDRESS) { |
| const int scratch_size = 512 * page_size(); |
| size_t sz = scratch_size; |
| // The PROT_EXEC looks scary, and it is, but it's to prevent |
| // this region from being coalesced with another anonymous |
| // segment mapped just after this one. If we named this |
| // segment, we could remove this hack. |
| int prot = PROT_READ | PROT_WRITE | PROT_EXEC; |
| int flags = MAP_PRIVATE | MAP_ANONYMOUS; |
| { |
| /* initialize the scratchpad for blocking system calls */ |
| AutoRemoteSyscalls remote(t); |
| |
| if (addr_type == DYNAMIC_ADDRESS) { |
| t->scratch_ptr = remote.infallible_mmap_syscall_if_alive(remote_ptr<void>(), sz, |
| prot, flags, -1, 0); |
| } else { |
| t->scratch_ptr = |
| remote.infallible_mmap_syscall_if_alive(remote_ptr<void>(FIXED_SCRATCH_PTR), |
| sz, prot, flags | MAP_FIXED, -1, 0); |
| } |
| t->scratch_size = scratch_size; |
| } |
| |
| t->setup_preload_thread_locals(); |
| |
| // record this mmap for the replay |
| Registers r = t->regs(); |
| uintptr_t saved_result = r.syscall_result(); |
| r.set_syscall_result(t->scratch_ptr); |
| t->set_regs(r); |
| |
| KernelMapping km = |
| t->vm()->map(t, t->scratch_ptr, sz, prot, flags, 0, string()); |
| struct stat stat; |
| memset(&stat, 0, sizeof(stat)); |
| auto record_in_trace = t->trace_writer().write_mapped_region(t, |
| km, stat, km.fsname(), vector<TraceRemoteFd>()); |
| ASSERT(t, record_in_trace == TraceWriter::DONT_RECORD_IN_TRACE); |
| |
| r.set_syscall_result(saved_result); |
| t->set_regs(r); |
| } |
| |
| static int ptrace_option_for_event(int ptrace_event) { |
| switch (ptrace_event) { |
| case PTRACE_EVENT_FORK: |
| return PTRACE_O_TRACEFORK; |
| case PTRACE_EVENT_CLONE: |
| return PTRACE_O_TRACECLONE; |
| case PTRACE_EVENT_VFORK: |
| return PTRACE_O_TRACEVFORK; |
| default: |
| FATAL() << "Unsupported ptrace event"; |
| return 0; |
| } |
| } |
| |
| template <typename Arch> |
| static Switchable prepare_clone(RecordTask* t, TaskSyscallState& syscall_state) { |
| uintptr_t flags; |
| CloneParameters params; |
| Registers r = t->regs(); |
| int original_syscall = r.original_syscallno(); |
| int ptrace_event; |
| int termination_signal = SIGCHLD; |
| |
| if (is_clone_syscall(original_syscall, r.arch())) { |
| params = extract_clone_parameters(t); |
| flags = r.arg1(); |
| r.set_arg1(flags & ~uintptr_t(CLONE_UNTRACED)); |
| t->set_regs(r); |
| termination_signal = flags & 0xff; |
| if (flags & CLONE_VFORK) { |
| ptrace_event = PTRACE_EVENT_VFORK; |
| } else if (termination_signal == SIGCHLD) { |
| ptrace_event = PTRACE_EVENT_FORK; |
| } else { |
| ptrace_event = PTRACE_EVENT_CLONE; |
| } |
| } else if (is_vfork_syscall(original_syscall, r.arch())) { |
| ptrace_event = PTRACE_EVENT_VFORK; |
| flags = CLONE_VM | CLONE_VFORK | SIGCHLD; |
| } else { |
| ptrace_event = PTRACE_EVENT_FORK; |
| flags = SIGCHLD; |
| } |
| |
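| // Manually run the tracee up to the fork/clone ptrace event (or to a |
| // failed syscall exit); we need the new task to exist before we can |
| // finish preparing this syscall. |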
| while (true) { |
| if (!t->resume_execution(RESUME_SYSCALL, RESUME_WAIT_NO_EXIT, RESUME_NO_TICKS)) { |
| // Tracee died unexpectedly during clone. |
| return ALLOW_SWITCH; |
| } |
| // XXX handle stray signals? |
| if (t->ptrace_event()) { |
| break; |
| } |
| ASSERT(t, !t->stop_sig()); |
| ASSERT(t, t->regs().syscall_result_signed() < 0); |
| if (!t->regs().syscall_may_restart()) { |
| LOG(debug) << "clone failed, returning " |
| << errno_name(-t->regs().syscall_result_signed()); |
| syscall_state.emulate_result(t->regs().syscall_result()); |
| // clone failed and we're exiting the syscall with an error. Reenter |
| // the syscall so that we're in the same state as the normal execution |
| // path. |
| t->ev().Syscall().failed_during_preparation = true; |
| // Restore register we might have changed |
| r.set_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_syscallno(Arch::gettid); |
| r.set_ip(r.ip().decrement_by_syscall_insn_length(r.arch())); |
| t->set_regs(r); |
| t->enter_syscall(); |
| r.set_ip(t->regs().ip()); |
| r.set_syscallno(original_syscall); |
| r.set_original_syscallno(original_syscall); |
| t->set_regs(r); |
| t->canonicalize_regs(t->arch()); |
| return ALLOW_SWITCH; |
| } |
| // Reenter the syscall. If we try to return an ERESTART* error using the |
| // code path above, our set_syscallno(SYS_gettid) fails to take effect and |
| // we actually do the clone, and things get horribly confused. |
| r.set_syscallno(r.original_syscallno()); |
| r.set_ip(r.ip().decrement_by_syscall_insn_length(r.arch())); |
| t->set_regs(r); |
| t->enter_syscall(); |
| } |
| |
| ASSERT(t, t->ptrace_event() == ptrace_event); |
| |
| // Ideally we'd just use t->get_ptrace_eventmsg_pid() here, but |
| // kernels failed to translate that value from other pid namespaces to |
| // our pid namespace until June 2014: |
| // https://github.com/torvalds/linux/commit/4e52365f279564cef0ddd41db5237f0471381093 |
| pid_t new_tid; |
| if (flags & CLONE_THREAD) { |
| new_tid = t->find_newborn_thread(); |
| } else { |
| new_tid = t->find_newborn_process(flags & CLONE_PARENT ? t->get_parent_pid() |
| : t->real_tgid()); |
| } |
| RecordTask* new_task = static_cast<RecordTask*>( |
| t->session().clone(t, clone_flags_to_task_flags(flags), params.stack, |
| params.tls, params.ctid, new_tid)); |
| |
| // Restore modified registers in cloned task |
| Registers new_r = new_task->regs(); |
| new_r.set_original_syscallno( |
| syscall_state.syscall_entry_registers.original_syscallno()); |
| new_r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| new_task->set_regs(new_r); |
| new_task->canonicalize_regs(new_task->arch()); |
| new_task->set_termination_signal(termination_signal); |
| // If the task got killed right away, we need to treat this as if we
| // had just finished a syscall, so as not to trigger the logic in
| // handle_ptrace_exit_event(). Otherwise we'd capture the current
| // registers as if they were a syscall entry, which would look like a
| // clone/fork/vfork syscall on the dead thread. During replay, we'd see
| // this ghost clone, fail to find the thread resulting from it, and crash.
| new_task->ip_at_last_recorded_syscall_exit = new_r.ip(); |
| |
| /* record child id here */ |
| if (is_clone_syscall(original_syscall, r.arch())) { |
| CloneParameters child_params = extract_clone_parameters(new_task); |
| t->record_remote_even_if_null(params.ptid); |
| |
| if (Arch::clone_tls_type == Arch::UserDescPointer) { |
| t->record_remote_even_if_null( |
| params.tls.cast<typename Arch::user_desc>()); |
| new_task->record_remote_even_if_null( |
| child_params.tls.cast<typename Arch::user_desc>()); |
| } else { |
| DEBUG_ASSERT(Arch::clone_tls_type == Arch::PthreadStructurePointer); |
| } |
| new_task->record_remote_even_if_null(child_params.ptid); |
| new_task->record_remote_even_if_null(child_params.ctid); |
| } |
| t->session().trace_writer().write_task_event(TraceTaskEvent::for_clone( |
| new_task->tid, t->tid, new_task->own_namespace_rec_tid, flags)); |
| |
| init_scratch_memory(new_task); |
| |
| Switchable switchable = (flags & CLONE_VFORK) ? ALLOW_SWITCH : PREVENT_SWITCH; |
| if ((t->emulated_ptrace_options & ptrace_option_for_event(ptrace_event)) && |
| !(flags & CLONE_UNTRACED)) { |
| new_task->set_emulated_ptracer(t->emulated_ptracer); |
| new_task->emulated_ptrace_seized = t->emulated_ptrace_seized; |
| new_task->emulated_ptrace_options = t->emulated_ptrace_options; |
| t->emulated_ptrace_event_msg = new_task->rec_tid; |
| t->emulate_ptrace_stop(WaitStatus::for_ptrace_event(ptrace_event)); |
| // The ptrace(2) man page says that SIGSTOP is used here, but it's really
| // SIGTRAP (in 4.4.4-301.fc23.x86_64 anyway).
| new_task->apply_group_stop(SIGTRAP); |
| switchable = ALLOW_SWITCH; |
| } |
| |
| // Restore our register modifications now, so that the emulated ptracer will |
| // see the original registers without our modifications if it inspects them |
| // in the ptrace event. |
| r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_original_syscallno( |
| syscall_state.syscall_entry_registers.original_syscallno()); |
| t->set_regs(r); |
| t->canonicalize_regs(t->arch()); |
| |
| // We're in a PTRACE_EVENT_FORK/VFORK/CLONE so the next PTRACE_SYSCALL for |
| // |t| will go to the exit of the syscall, as expected. |
| |
| // For non-vfork cases, resume the cloning thread, not the new thread. |
| // In some applications the new thread must wait for the cloning thread to |
| // exit a critical section. Allowing the cloning thread to run reduces the |
| // likelihood that the new thread will have to block on that wait. |
| return switchable; |
| } |
| |
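| // Clear the signals rr reserves for itself (t->session().rr_signal_mask())
| // out of a tracee-supplied sigset so the tracee can't block them. Returns
| // true if the set was modified; the original set is copied to |save| (if
| // non-null) so it can be restored when the syscall exits.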
| static bool protect_rr_sigs(RecordTask* t, remote_ptr<void> p, void* save) { |
| remote_ptr<sig_set_t> setp = p.cast<sig_set_t>(); |
| if (setp.is_null()) { |
| return false; |
| } |
| |
| auto sig_set = t->read_mem(setp); |
| auto new_sig_set = sig_set; |
| new_sig_set &= ~t->session().rr_signal_mask(); |
| if (sig_set == new_sig_set) { |
| return false; |
| } |
| |
| t->write_mem(setp, new_sig_set); |
| if (save) { |
| memcpy(save, &sig_set, sizeof(sig_set)); |
| } |
| |
| return true; |
| } |
| |
| template <typename Arch> |
| static bool protect_rr_sigs_sa_mask_arch(RecordTask* t, remote_ptr<void> p, |
| void* save) { |
| remote_ptr<typename Arch::kernel_sigaction> sap = |
| p.cast<typename Arch::kernel_sigaction>(); |
| if (sap.is_null()) { |
| return false; |
| } |
| |
| auto sa = t->read_mem(sap); |
| auto new_sig_set = sa.sa_mask; |
| // Don't let the tracee block TIME_SLICE_SIGNAL or |
| // SYSCALLBUF_DESCHED_SIGNAL. |
| new_sig_set.__val[0] &= ~t->session().rr_signal_mask(); |
| |
| if (!memcmp(&sa.sa_mask, &new_sig_set, sizeof(new_sig_set))) { |
| return false; |
| } |
| |
| if (save) { |
| memcpy(save, &sa, sizeof(sa)); |
| } |
| sa.sa_mask = new_sig_set; |
| t->write_mem(sap, sa); |
| |
| return true; |
| } |
| |
| static bool protect_rr_sigs_sa_mask(RecordTask* t, remote_ptr<void> p, |
| void* save) { |
| RR_ARCH_FUNCTION(protect_rr_sigs_sa_mask_arch, t->arch(), t, p, save); |
| } |
| |
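| // Record tracee memory from |ranges|, in order, up to a total of |size|
| // bytes. Used to save the data produced by emulated reads.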
| static void record_ranges(RecordTask* t, |
| const vector<FileMonitor::Range>& ranges, |
| size_t size) { |
| size_t s = size; |
| for (auto& r : ranges) { |
| size_t bytes = min(s, r.length); |
| if (bytes > 0) { |
| t->record_remote(r.data, bytes); |
| s -= bytes; |
| } |
| } |
| } |
| |
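| // Detach |t| from rr by "teleporting" it: spawn a fresh task in a one-off
| // DiversionSession, exec the old task's binary, copy the old task's state
| // into it via dup_from(), then detach so the new task runs free of rr.
| // Returns the new task's tid; the caller turns |t| into a proxy for the
| // detached task.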
| static pid_t do_detach_teleport(RecordTask *t) |
| { |
| DiversionSession session(t->session().cpu_binding()); |
| // Use the old task's exe path to make sure that /proc/<pid>/exe looks right |
| // for the teleported task. |
| std::string exe_path(t->proc_exe_path()); |
| std::vector<std::string> argv, env; |
| ScopedFd error_fd; |
| int tracee_fd_number = t->session().tracee_fd_number(); |
| Task *new_t = Task::spawn(session, error_fd, &session.tracee_socket_fd(), |
| &session.tracee_socket_receiver_fd(), |
| &tracee_fd_number, |
| exe_path, argv, env, |
| -1); |
| pid_t new_tid = new_t->tid; |
| LOG(debug) << "Detached task with tid " << new_tid; |
| session.on_create(new_t); |
| session.set_tracee_fd_number(tracee_fd_number); |
| new_t->os_exec(t->arch(), exe_path); |
| session.post_exec(); |
| new_t->post_exec(exe_path); |
| new_t->post_exec_syscall(exe_path); |
| new_t->dup_from(t); |
| // Emulate the success of the syscall in the new task |
| Registers regs = new_t->regs(); |
| regs.set_arg1(0); |
| new_t->set_regs(regs); |
| // Disable syscall buffering. XXX: We could also try to unpatch syscalls here |
| new_t->hpc.stop(); |
| new_t->set_in_diversion(true); |
| // Just clean up some additional state |
| new_t->reenable_cpuid_tsc(); |
| { |
| AutoRemoteSyscalls remote(new_t, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS); |
| remote.infallible_close_syscall_if_alive(tracee_fd_number); |
| } |
| t->vm()->monkeypatcher().unpatch_syscalls_in(new_t); |
| // Try to reset the scheduler affinity that we enforced upon the task. |
| // XXX: It would be nice to track what affinity the tracee requested and |
| // restore that. |
| // For now honor whatever affinity rr itself has (e.g. for running on P-cores |
| // on Alder Lake). |
| cpu_set_t mask = t->session().original_affinity(); |
| syscall(SYS_sched_setaffinity, new_t->tid, sizeof(mask), &mask); |
| // Task::spawn may leave the task in a group-stop if the task SIGSTOPs itself
| // before we can PTRACE_SEIZE it. Kick it out of that group-stop now.
| ::kill(new_tid, SIGCONT); |
| new_t->detach(); |
| new_t->did_kill(); |
| delete new_t; |
| return new_tid; |
| } |
| |
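| // A FileMonitor has emulated a read for us. Record the emulated result and
| // data, then neuter the kernel syscall: for pread64/preadv (or when nothing
| // was read) just make it fail on a bad fd; otherwise rewrite it into
| // lseek(fd, result, SEEK_CUR) so the kernel's file offset advances as if
| // the read had really happened.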
| template <typename Arch> |
| static Switchable did_emulate_read(int syscallno, RecordTask* t, |
| const std::vector<FileMonitor::Range>& ranges, |
| uint64_t result, |
| TaskSyscallState& syscall_state) |
| { |
| syscall_state.emulate_result(result); |
| record_ranges(t, ranges, result); |
| if (syscallno == Arch::pread64 || syscallno == Arch::preadv || result <= 0) { |
| // Don't perform this syscall. |
| Registers r = t->regs(); |
| r.set_arg1(-1); |
| t->set_regs(r); |
| } else { |
| // Turn this into an lseek to emulate the advance of the fd ptr |
| Registers r = t->regs(); |
| r.set_original_syscallno(Arch::lseek); |
| r.set_arg2(result); |
| r.set_arg3(SEEK_CUR); |
| t->set_regs(r); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| template <typename Arch> |
| static Switchable rec_prepare_syscall_arch(RecordTask* t, |
| TaskSyscallState& syscall_state, |
| const Registers& regs) { |
| int syscallno = t->ev().Syscall().number; |
| |
| if (t->regs().original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO) { |
| // rr vetoed this syscall. Don't do any pre-processing. |
| return PREVENT_SWITCH; |
| } |
| |
| syscall_state.syscall_entry_registers = regs; |
| |
| if (t->desched_rec()) { |
| /* |t| was descheduled while in a buffered syscall. We normally don't |
| * use scratch memory for the call, because the syscallbuf itself |
| * is serving that purpose. More importantly, we *can't* set up |
| * scratch for |t|, because it's already in the syscall. Instead, we will |
| * record the syscallbuf memory in rec_process_syscall_arch. |
| * |
| * However there is one case where we use scratch memory: when |
| * sys_read's block-cloning path is interrupted. In that case, record |
| * the scratch memory. |
| */ |
| if (syscallno == Arch::read && |
| remote_ptr<void>(regs.arg2()) == t->scratch_ptr) { |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| (size_t)regs.arg3()), |
| IN_OUT_NO_SCRATCH); |
| } |
| return ALLOW_SWITCH; |
| } |
| |
| if (syscallno < 0) { |
| // Invalid syscall. Don't let it accidentally match a case below whose
| // syscall number is undefined on this architecture.
| if (!Arch::is_x86ish()) { |
| // On architectures where arg1 is shared with the return value, the kernel
| // may not set -ENOSYS for us; instead it copies arg1 to the return
| // value (frankly, this is probably a kernel bug, but there's likely
| // nothing we can do about it).
| Registers new_regs = regs; |
| new_regs.set_syscall_result(-ENOSYS); |
| t->set_regs(new_regs); |
| } |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| |
| switch (syscallno) { |
| // All the regular syscalls are handled here. |
| #include "SyscallRecordCase.generated" |
| |
| case Arch::splice: { |
| syscall_state.reg_parameter<loff_t>(2, IN_OUT); |
| syscall_state.reg_parameter<loff_t>(4, IN_OUT); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::sendfile: { |
| syscall_state.reg_parameter<typename Arch::off_t>(3, IN_OUT); |
| return ALLOW_SWITCH; |
| } |
| case Arch::sendfile64: { |
| syscall_state.reg_parameter<typename Arch::off64_t>(3, IN_OUT); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::capget: { |
| auto hdr = t->read_mem( |
| syscall_state.reg_parameter<typename Arch::__user_cap_header_struct>( |
| 1, IN_OUT)); |
| int struct_count; |
| switch (hdr.version) { |
| case _LINUX_CAPABILITY_VERSION_1: |
| struct_count = _LINUX_CAPABILITY_U32S_1; |
| break; |
| case _LINUX_CAPABILITY_VERSION_2: |
| struct_count = _LINUX_CAPABILITY_U32S_2; |
| break; |
| case _LINUX_CAPABILITY_VERSION_3: |
| struct_count = _LINUX_CAPABILITY_U32S_3; |
| break; |
| default: |
| struct_count = 0; |
| break; |
| } |
| if (struct_count > 0) { |
| syscall_state.reg_parameter( |
| 2, sizeof(typename Arch::__user_cap_data_struct) * struct_count, |
| OUT); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::vfork: |
| case Arch::fork: |
| case Arch::clone: |
| return prepare_clone<Arch>(t, syscall_state); |
| |
| case Arch::exit: |
| prepare_exit(t); |
| return ALLOW_SWITCH; |
| |
| case Arch::exit_group: |
| if (t->thread_group()->task_set().size() == 1) { |
| prepare_exit(t); |
| return ALLOW_SWITCH; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::execve: |
| case Arch::execveat: { |
| t->session().scheduler().did_enter_execve(t); |
| vector<string> cmd_line; |
| remote_ptr<typename Arch::unsigned_word> argv; |
| string raw_filename; |
| t->did_execveat = syscallno == Arch::execveat; |
| bool ok = true; |
| if (t->did_execveat) { |
| argv = regs.arg3(); |
| raw_filename = t->read_c_str(regs.arg2(), &ok); |
| } else { |
| argv = regs.arg2(); |
| raw_filename = t->read_c_str(regs.arg1(), &ok); |
| } |
| if (!ok) { |
| syscall_state.expect_errno = EFAULT; |
| return ALLOW_SWITCH; |
| } |
| |
| while (true) { |
| auto p = t->read_mem(argv, &ok); |
| if (!ok) { |
| syscall_state.expect_errno = EFAULT; |
| return ALLOW_SWITCH; |
| } |
| if (!p) { |
| break; |
| } |
| cmd_line.push_back(t->read_c_str(p, &ok)); |
| if (!ok) { |
| syscall_state.expect_errno = EFAULT; |
| return ALLOW_SWITCH; |
| } |
| argv++; |
| } |
| |
| // Save the event. We can't record it here because the exec might fail. |
| syscall_state.exec_saved_event = |
| unique_ptr<TraceTaskEvent>(new TraceTaskEvent( |
| TraceTaskEvent::for_exec(t->tid, raw_filename, cmd_line))); |
| |
| // This can trigger exits of non-main threads, so we have to |
| // allow them to be handled. |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::fcntl: |
| case Arch::fcntl64: { |
| int fd = regs.arg1(); |
| uint64_t result; |
| if (t->fd_table()->emulate_fcntl(fd, t, &result)) { |
| // Don't perform this syscall. |
| Registers r = regs; |
| r.set_arg1(-1); |
| t->set_regs(r); |
| syscall_state.emulate_result(result); |
| return PREVENT_SWITCH; |
| } |
| switch ((int)regs.arg2_signed()) { |
| case Arch::DUPFD: |
| case Arch::DUPFD_CLOEXEC: |
| case Arch::GETFD: |
| case Arch::GETFL: |
| case Arch::SETFL: |
| case Arch::SETLK: |
| case Arch::SETLK64: |
| case Arch::OFD_SETLK: |
| case Arch::SETOWN: |
| case Arch::SETOWN_EX: |
| case Arch::GETSIG: |
| case Arch::SETSIG: |
| case Arch::NOTIFY: |
| case Arch::SETPIPE_SZ: |
| case Arch::GETPIPE_SZ: |
| case Arch::ADD_SEALS: |
| case Arch::GET_SEALS: |
| case Arch::SET_RW_HINT: |
| case Arch::SET_FILE_RW_HINT: |
| break; |
| |
| case Arch::SETFD: |
| if (t->fd_table()->is_rr_fd(fd)) { |
| // Don't let tracee set FD_CLOEXEC on this fd. Disable the syscall, |
| // but emulate a successful return. |
| Registers r = regs; |
| r.set_arg1(-1); |
| t->set_regs(r); |
| syscall_state.emulate_result(0); |
| } |
| break; |
| |
| case Arch::GETLK: |
| syscall_state.reg_parameter<typename Arch::_flock>(3, IN_OUT); |
| break; |
| |
| case Arch::OFD_GETLK: |
| case Arch::GETLK64: |
| // struct flock and struct flock64 had better be declared differently on
| // 32-bit architectures; on 64-bit architectures it's OK if they're the same.
| static_assert( |
| sizeof(typename Arch::_flock) < sizeof(typename Arch::flock64) || |
| Arch::elfclass == ELFCLASS64, |
| "struct flock64 not declared differently from struct flock"); |
| syscall_state.reg_parameter<typename Arch::flock64>(3, IN_OUT); |
| break; |
| |
| case Arch::GETOWN_EX: |
| syscall_state.reg_parameter<typename Arch::f_owner_ex>(3); |
| break; |
| |
| case Arch::SETLKW: |
| case Arch::SETLKW64: |
| case Arch::OFD_SETLKW: |
| // SETLKW blocks, but doesn't write any |
| // outparam data to the |struct flock| |
| // argument, so no need for scratch. |
| return ALLOW_SWITCH; |
| |
| case Arch::GET_RW_HINT: |
| case Arch::GET_FILE_RW_HINT: |
| syscall_state.reg_parameter<int64_t>(3); |
| break; |
| |
| default: |
| // Unknown command should trigger EINVAL. |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| /* futex parameters are in-out but they can't be moved to scratch |
| * addresses. */ |
| case Arch::futex_time64: |
| case Arch::futex: { |
| int op = regs.arg2_signed(); |
| switch (op & FUTEX_CMD_MASK) { |
| case FUTEX_WAIT: |
| case FUTEX_WAIT_BITSET: |
| return ALLOW_SWITCH; |
| |
| case FUTEX_REQUEUE: |
| case FUTEX_CMP_REQUEUE: |
| case FUTEX_WAKE_OP: |
| syscall_state.reg_parameter<int>(5, IN_OUT_NO_SCRATCH); |
| break; |
| |
| case FUTEX_WAKE: |
| case FUTEX_WAKE_BITSET: |
| break; |
| |
| case FUTEX_LOCK_PI: |
| case FUTEX_UNLOCK_PI: |
| case FUTEX_TRYLOCK_PI: |
| case FUTEX_CMP_REQUEUE_PI: |
| case FUTEX_WAIT_REQUEUE_PI: { |
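| // We don't support the priority-inheritance futex operations. Setting
| // the op to an invalid value forces the kernel call to fail; we report
| // ENOSYS to the tracee.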
| Registers r = regs; |
| r.set_arg2(-1); |
| t->set_regs(r); |
| syscall_state.emulate_result(-ENOSYS); |
| break; |
| } |
| |
| default: |
| syscall_state.expect_errno = ENOSYS; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::getrandom: |
| syscall_state.reg_parameter( |
| 1, ParamSize::from_syscall_result<int>((size_t)regs.arg2())); |
| return (GRND_NONBLOCK & regs.arg3()) ? PREVENT_SWITCH : ALLOW_SWITCH; |
| |
| case Arch::get_thread_area: |
| case Arch::set_thread_area: |
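| // The user_desc is IN_OUT for both: get_thread_area fills it in, and
| // set_thread_area writes back the allocated entry_number when the tracee
| // passes entry_number == -1.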
| syscall_state.reg_parameter<typename Arch::user_desc>(1, IN_OUT); |
| return PREVENT_SWITCH; |
| |
| case Arch::ipc: |
| switch ((int)regs.arg1_signed()) { |
| case MSGGET: |
| case SHMDT: |
| case SHMGET: |
| case SEMGET: |
| break; |
| |
| case MSGCTL: { |
| int cmd = (int)regs.arg3_signed() & ~IPC_64; |
| return prepare_msgctl<Arch>(syscall_state, cmd, 5); |
| } |
| |
| case MSGSND: |
| case SEMOP: |
| case SEMTIMEDOP: |
| return ALLOW_SWITCH; |
| |
| case MSGRCV: { |
| size_t msgsize = regs.arg3(); |
| auto kluge_args = |
| syscall_state.reg_parameter<typename Arch::ipc_kludge_args>(5, |
| IN); |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(kluge_args, msgbuf), |
| sizeof(typename Arch::signed_long) + |
| msgsize); |
| return ALLOW_SWITCH; |
| } |
| |
| case SHMAT: { |
| // Insane legacy feature: ipc SHMAT returns its pointer via an |
| // in-memory out parameter. |
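| // Roughly what a libc wrapper does (a sketch for illustration, not rr
| // code; |shmid|, |shmflg| and |shmaddr| are the tracee's arguments):
| //   unsigned long raw_addr;
| //   int ret = syscall(SYS_ipc, SHMAT, shmid, shmflg, &raw_addr, shmaddr);
| //   void* attached = ret < 0 ? (void*)-1 : (void*)raw_addr;
| // Hence arg 4 is the out parameter we need to record.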
| syscall_state.reg_parameter<typename Arch::unsigned_long>(4); |
| return PREVENT_SWITCH; |
| } |
| |
| case SHMCTL: { |
| int cmd = (int)regs.arg3_signed() & ~IPC_64; |
| return prepare_shmctl<Arch>(syscall_state, cmd, 5); |
| } |
| |
| case SEMCTL: { |
| int cmd = (int)regs.arg4_signed() & ~IPC_64; |
| return prepare_semctl<Arch>(t, syscall_state, (int)regs.arg2_signed(), |
| cmd, 5, DEREFERENCE); |
| } |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::msgctl: |
| return prepare_msgctl<Arch>(syscall_state, (int)regs.arg2_signed(), 3); |
| |
| case Arch::msgrcv: { |
| size_t msgsize = regs.arg3(); |
| syscall_state.reg_parameter(2, |
| sizeof(typename Arch::signed_long) + msgsize); |
| return ALLOW_SWITCH; |
| } |
| |
| // Various syscalls that can block but don't otherwise have behavior we need |
| // to record. |
| case Arch::fdatasync: |
| case Arch::fsync: |
| case Arch::msgsnd: |
| case Arch::msync: |
| case Arch::open: |
| case Arch::openat: |
| case Arch::semop: |
| case Arch::semtimedop_time64: |
| case Arch::semtimedop: |
| case Arch::sync: |
| case Arch::sync_file_range: |
| case Arch::syncfs: |
| return ALLOW_SWITCH; |
| |
| case Arch::sysfs: { |
| int option = regs.arg1(); |
| switch (option) { |
| case 1: |
| case 3: |
| break; |
| case 2: { |
| remote_ptr<char> buf(regs.arg3()); |
| // Assume no filesystem type name is more than 1K |
| char tmp[1024]; |
| ssize_t bytes = t->read_bytes_fallible(buf, sizeof(tmp), tmp); |
| if (bytes > 0) { |
| syscall_state.reg_parameter(3, bytes); |
| } |
| break; |
| } |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::socketcall: |
| return prepare_socketcall<Arch>(t, syscall_state); |
| |
| case Arch::select: |
| case Arch::_newselect: |
| if (syscallno == Arch::select && |
| Arch::select_semantics == Arch::SelectStructArguments) { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::select_args>(1, IN); |
| syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, read_fds), IN_OUT); |
| syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, write_fds), IN_OUT); |
| syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, except_fds), IN_OUT); |
| syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, timeout), IN_OUT); |
| } else { |
| syscall_state.reg_parameter<typename Arch::fd_set>(2, IN_OUT); |
| syscall_state.reg_parameter<typename Arch::fd_set>(3, IN_OUT); |
| syscall_state.reg_parameter<typename Arch::fd_set>(4, IN_OUT); |
| syscall_state.reg_parameter<typename Arch::timeval>(5, IN_OUT); |
| } |
| return ALLOW_SWITCH; |
| |
| case Arch::pselect6_time64: |
| case Arch::pselect6: { |
| syscall_state.reg_parameter<typename Arch::fd_set>(2, IN_OUT); |
| syscall_state.reg_parameter<typename Arch::fd_set>(3, IN_OUT); |
| syscall_state.reg_parameter<typename Arch::fd_set>(4, IN_OUT); |
| if (syscallno == Arch::pselect6) { |
| syscall_state.reg_parameter<typename Arch::timespec>(5, IN_OUT); |
| } else { |
| syscall_state.reg_parameter<typename Arch::Arch64::timespec>(5, IN_OUT); |
| } |
| auto arg6p = |
| syscall_state.reg_parameter<typename Arch::pselect6_arg6>(6, IN); |
| syscall_state.mem_ptr_parameter_inferred(REMOTE_PTR_FIELD(arg6p, ss), IN, |
| protect_rr_sigs); |
| t->invalidate_sigmask(); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::recvfrom: { |
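| // The sockaddr buffer (arg 5) is sized by *addrlen (arg 6), which is
| // IN_OUT: the tracee initializes it and the kernel updates it, so the
| // recorded size comes from that memory rather than from a register.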
| syscall_state.reg_parameter( |
| 2, |
| ParamSize::from_syscall_result<typename Arch::ssize_t>(regs.arg3())); |
| auto addrlen_ptr = |
| syscall_state.reg_parameter<typename Arch::socklen_t>(6, IN_OUT); |
| syscall_state.reg_parameter( |
| 5, ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::recvmsg: { |
| auto msgp = syscall_state.reg_parameter<typename Arch::msghdr>(2, IN_OUT); |
| prepare_recvmsg<Arch>( |
| t, syscall_state, msgp, |
| ParamSize::from_syscall_result<typename Arch::ssize_t>()); |
| if (!((int)regs.arg3() & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::recvmmsg_time64: |
| case Arch::recvmmsg: { |
| auto vlen = (unsigned int)regs.arg3(); |
| auto mmsgp = |
| syscall_state |
| .reg_parameter(2, sizeof(typename Arch::mmsghdr) * vlen, IN_OUT) |
| .cast<typename Arch::mmsghdr>(); |
| prepare_recvmmsg<Arch>(t, syscall_state, mmsgp, vlen); |
| if (!((unsigned int)regs.arg4() & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::sendto: |
| if (!((unsigned int)regs.arg4() & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::sendmsg: |
| if (!((unsigned int)regs.arg3() & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::sendmmsg: { |
| auto vlen = (unsigned int)regs.arg3(); |
| syscall_state.reg_parameter(2, sizeof(typename Arch::mmsghdr) * vlen, |
| IN_OUT); |
| if (!((unsigned int)regs.arg4() & MSG_DONTWAIT)) { |
| return ALLOW_SWITCH; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::getsockname: |
| case Arch::getpeername: { |
| auto addrlen_ptr = |
| syscall_state.reg_parameter<typename Arch::socklen_t>(3, IN_OUT); |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::setsockopt: { |
| typename Arch::setsockopt_args args; |
| args.sockfd = regs.arg1(); |
| args.level = regs.arg2(); |
| args.optname = regs.arg3(); |
| args.optval = remote_ptr<void>(regs.arg4()); |
| args.optlen = regs.arg5(); |
| return prepare_setsockopt<Arch>(t, syscall_state, args); |
| } |
| |
| case Arch::getsockopt: { |
| auto optlen_ptr = |
| syscall_state.reg_parameter<typename Arch::socklen_t>(5, IN_OUT); |
| syscall_state.reg_parameter( |
| 4, ParamSize::from_initialized_mem(t, optlen_ptr)); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::pread64: |
| /* ssize_t read(int fd, void *buf, size_t count); */ |
| case Arch::read: { |
| int fd = regs.arg1(); |
| uint64_t result; |
| vector<FileMonitor::Range> ranges; |
| ranges.push_back(FileMonitor::Range(regs.arg2(), regs.arg3())); |
| FileMonitor::LazyOffset offset(t, regs, syscallno); |
| if (t->fd_table()->emulate_read(fd, t, ranges, offset, &result)) { |
| return did_emulate_read<Arch>(syscallno, t, ranges, result, syscall_state); |
| } |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| (size_t)regs.arg3())); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::accept: |
| case Arch::accept4: { |
| auto addrlen_ptr = |
| syscall_state.reg_parameter<typename Arch::socklen_t>(3, IN_OUT); |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_initialized_mem(t, addrlen_ptr)); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::getcwd: |
| syscall_state.reg_parameter( |
| 1, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| (size_t)regs.arg2())); |
| return PREVENT_SWITCH; |
| |
| case Arch::getdents: |
| case Arch::getdents64: |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<int>((unsigned int)regs.arg3())); |
| return PREVENT_SWITCH; |
| |
| case Arch::readlink: |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| (size_t)regs.arg3())); |
| return PREVENT_SWITCH; |
| |
| case Arch::readlinkat: |
| syscall_state.reg_parameter( |
| 3, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| (size_t)regs.arg4())); |
| return PREVENT_SWITCH; |
| |
| case Arch::close_range: |
| case Arch::clone3: |
| case Arch::io_uring_setup: |
| case Arch::io_setup: { |
| // Prevent the various syscalls that we don't support from being used by |
| // applications and fake an ENOSYS return. |
| Registers r = regs; |
| r.set_arg2(0); |
| t->set_regs(r); |
| syscall_state.emulate_result(-ENOSYS); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::rseq: { |
| auto rseq = remote_ptr<typename Arch::rseq_t>(regs.arg1()); |
| uint32_t rseq_len = regs.arg2(); |
| int flags = regs.arg3(); |
| uint32_t sig = regs.arg4(); |
| |
| // Clear size to ensure syscall fails. |
| Registers r = regs; |
| r.set_arg2(0); |
| t->set_regs(r); |
| |
| if (flags & RR_RSEQ_FLAG_UNREGISTER) { |
| if ((flags & ~RR_RSEQ_FLAG_UNREGISTER) || !t->rseq_state || |
| t->rseq_state->ptr != rseq || rseq_len != sizeof(typename Arch::rseq_t)) { |
| syscall_state.emulate_result(-EINVAL); |
| } else if (t->rseq_state->abort_prefix_signature != sig) { |
| syscall_state.emulate_result(-EPERM); |
| } else { |
| auto addr = REMOTE_PTR_FIELD(rseq, cpu_id); |
| uint32_t cpu_id = RR_RSEQ_CPU_ID_UNINITIALIZED; |
| t->write_mem(addr, cpu_id); |
| t->record_local(addr, &cpu_id); |
| addr = REMOTE_PTR_FIELD(rseq, cpu_id_start); |
| uint32_t cpu_id_start = 0; |
| t->write_mem(addr, cpu_id_start); |
| t->record_local(addr, &cpu_id_start); |
| t->rseq_state = nullptr; |
| syscall_state.emulate_result(0); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| if (flags) { |
| syscall_state.emulate_result(-EINVAL); |
| return PREVENT_SWITCH; |
| } |
| |
| if (t->rseq_state) { |
| if (t->rseq_state->ptr != rseq || rseq_len != sizeof(typename Arch::rseq_t)) { |
| syscall_state.emulate_result(-EINVAL); |
| } else if (t->rseq_state->abort_prefix_signature != sig) { |
| syscall_state.emulate_result(-EPERM); |
| } else { |
| syscall_state.emulate_result(-EBUSY); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
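| // The rseq ABI requires the struct to be 32-byte aligned, and current
| // kernels require rseq_len to be exactly sizeof(struct rseq).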
| if ((rseq.as_int() & 31) || rseq_len != sizeof(typename Arch::rseq_t)) { |
| syscall_state.emulate_result(-EINVAL); |
| return PREVENT_SWITCH; |
| } |
| |
| t->rseq_state = make_unique<RseqState>(rseq, sig); |
| ASSERT(t, t->session().trace_writer().bound_to_cpu() >= 0) << "rseq not supported with unbound tasks"; |
| uint32_t cpu_id = t->session().trace_writer().bound_to_cpu(); |
| auto addr = REMOTE_PTR_FIELD(rseq, cpu_id); |
| // We can only support rseq when the tracee is bound to a specific CPU;
| // otherwise the cpu_id_start and cpu_id fields would need to be managed
| // by rr and would not match reality.
| t->write_mem(addr, cpu_id); |
| t->record_local(addr, &cpu_id); |
| addr = REMOTE_PTR_FIELD(rseq, cpu_id_start); |
| t->write_mem(addr, cpu_id); |
| t->record_local(addr, &cpu_id); |
| |
| auto remote_locals = AddressSpace::preload_thread_locals_start() |
| .cast<preload_thread_locals<Arch>>(); |
| if (remote_locals) { |
| auto rseq_called_ptr = REMOTE_PTR_FIELD(remote_locals, rseq_called); |
| int32_t rseq_called = 1; |
| t->write_mem(rseq_called_ptr, rseq_called); |
| t->record_local(rseq_called_ptr, &rseq_called); |
| } |
| |
| syscall_state.emulate_result(0); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::userfaultfd: { |
| // Pretend the kernel doesn't support this. |
| Registers r = regs; |
| r.set_arg1(0xffffffff); |
| t->set_regs(r); |
| syscall_state.emulate_result(-ENOSYS); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::memfd_create: { |
| string name = t->read_c_str(remote_ptr<char>(regs.arg1())); |
| if (is_blacklisted_memfd(name.c_str())) { |
| LOG(warn) << "Cowardly refusing to memfd_create " << name; |
| Registers r = regs; |
| r.set_arg1(0); |
| t->set_regs(r); |
| syscall_state.emulate_result(-ENOSYS); |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::getgroups: |
| // We could record a little less data by restricting the recorded data |
| // to the syscall result * sizeof(Arch::legacy_gid_t), but that would |
| // require more infrastructure and it's not worth worrying about. |
| syscall_state.reg_parameter( |
| 2, (int)regs.arg1_signed() * sizeof(typename Arch::legacy_gid_t)); |
| return PREVENT_SWITCH; |
| |
| case Arch::getgroups32: |
| // We could record a little less data by restricting the recorded data |
| // to the syscall result * sizeof(Arch::gid_t), but that would |
| // require more infrastructure and it's not worth worrying about. |
| syscall_state.reg_parameter( |
| 2, (int)regs.arg1_signed() * sizeof(typename Arch::gid_t)); |
| return PREVENT_SWITCH; |
| |
| case Arch::write: |
| case Arch::writev: { |
| int fd = (int)regs.arg1_signed(); |
| return t->fd_table()->will_write(t, fd); |
| } |
| |
| case Arch::copy_file_range: { |
| syscall_state.reg_parameter<typename Arch::loff_t>(2, IN_OUT); |
| syscall_state.reg_parameter<typename Arch::loff_t>(4, IN_OUT); |
| int in_fd = (int)regs.arg1_signed(); |
| int out_fd = (int)regs.arg3_signed(); |
| ASSERT(t, !t->fd_table()->is_monitoring(in_fd) && |
| !t->fd_table()->is_monitoring(out_fd)) |
| << "copy_file_range for monitored fds not supported yet"; |
| return ALLOW_SWITCH; |
| } |
| |
| /* ssize_t readv(int fd, const struct iovec *iov, int iovcnt); */ |
| case Arch::readv: |
| /* ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, |
| off_t offset); */ |
| case Arch::preadv: { |
| int fd = (int)regs.arg1_signed(); |
| int iovcnt = (int)regs.arg3_signed(); |
| remote_ptr<void> iovecsp_void = syscall_state.reg_parameter( |
| 2, sizeof(typename Arch::iovec) * iovcnt, IN); |
| auto iovecsp = iovecsp_void.cast<typename Arch::iovec>(); |
| auto iovecs = t->read_mem(iovecsp, iovcnt); |
| uint64_t result; |
| vector<FileMonitor::Range> ranges; |
| ranges.reserve(iovcnt); |
| for (int i = 0; i < iovcnt; ++i) { |
| ranges.push_back( |
| FileMonitor::Range(iovecs[i].iov_base, iovecs[i].iov_len)); |
| } |
| FileMonitor::LazyOffset offset(t, regs, syscallno); |
| if (t->fd_table()->emulate_read(fd, t, ranges, offset, &result)) { |
| return did_emulate_read<Arch>(syscallno, t, ranges, result, syscall_state); |
| } |
| ParamSize io_size = |
| ParamSize::from_syscall_result<typename Arch::ssize_t>(); |
| for (int i = 0; i < iovcnt; ++i) { |
| syscall_state.mem_ptr_parameter(REMOTE_PTR_FIELD(iovecsp + i, iov_base), |
| io_size.limit_size(iovecs[i].iov_len)); |
| } |
| return ALLOW_SWITCH; |
| } |
| |
| /* pid_t waitpid(pid_t pid, int *status, int options); */ |
| /* pid_t wait4(pid_t pid, int *status, int options, struct rusage |
| * *rusage); |
| */ |
| case Arch::waitpid: |
| case Arch::wait4: { |
| Switchable should_switch = ALLOW_SWITCH; |
| pid_t pid = (pid_t)regs.arg1_signed(); |
| if (pid < -1) { |
| t->in_wait_type = WAIT_TYPE_PGID; |
| t->in_wait_pid = -pid; |
| } else if (pid == -1) { |
| t->in_wait_type = WAIT_TYPE_ANY; |
| } else if (pid == 0) { |
| t->in_wait_type = WAIT_TYPE_SAME_PGID; |
| } else { |
| t->in_wait_type = WAIT_TYPE_PID; |
| t->in_wait_pid = pid; |
| } |
| int options = (int)regs.arg3(); |
| bool pausing = false; |
| if (maybe_emulate_wait(t, syscall_state, options)) { |
| Registers r = regs; |
| // Set options to an invalid value to force the syscall to fail.
| r.set_arg3(0xffffffff); |
| t->set_regs(r); |
| should_switch = PREVENT_SWITCH; |
| } else if (maybe_pause_instead_of_waiting(t, options)) { |
| pausing = true; |
| } |
| // When pausing, we've modified the registers and will emulate the |
| // memory changes on syscall exit. We avoid modifying these registers |
| // with pointers to scratch memory, so mark them _NO_SCRATCH if we're |
| // pausing. |
| syscall_state.reg_parameter<int>(2, pausing ? IN_OUT_NO_SCRATCH : IN_OUT); |
| if (syscallno == Arch::wait4) { |
| syscall_state.reg_parameter<typename Arch::rusage>(4, |
| pausing ? IN_OUT_NO_SCRATCH : OUT); |
| } |
| return should_switch; |
| } |
| |
| case Arch::waitid: { |
| id_t wait_pid = (id_t)regs.arg2(); |
| switch ((uint32_t)regs.arg1()) { |
| case P_ALL: |
| t->in_wait_type = WAIT_TYPE_ANY; |
| break; |
| case P_PID: |
| t->in_wait_type = WAIT_TYPE_PID; |
| break; |
| case P_PGID: |
| t->in_wait_type = WAIT_TYPE_PGID; |
| break; |
| case P_PIDFD: |
| wait_pid = t->pid_of_pidfd(regs.arg2()); |
| if (!t->session().find_task(wait_pid)) { |
| // Waiting on a non-tracee; just let it happen as normal.
| // We also take this path if pid_of_pidfd returns -1 because
| // the fd is not a pidfd.
| syscall_state.reg_parameter<typename Arch::siginfo_t>(3); |
| return ALLOW_SWITCH; |
| } |
| t->in_wait_type = WAIT_TYPE_PID; |
| break; |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| Switchable should_switch = ALLOW_SWITCH; |
| t->in_wait_pid = wait_pid; |
| int options = (int)regs.arg4(); |
| bool pausing = false; |
| if (maybe_emulate_wait(t, syscall_state, options)) { |
| Registers r = regs; |
| // Set options to an invalid value to force the syscall to fail.
| r.set_arg4(0xffffffff); |
| t->set_regs(r); |
| should_switch = PREVENT_SWITCH; |
| } else { |
| pausing = maybe_pause_instead_of_waiting(t, options); |
| } |
| syscall_state.reg_parameter<typename Arch::siginfo_t>(3, |
| pausing ? IN_OUT_NO_SCRATCH : IN_OUT); |
| return should_switch; |
| } |
| |
| case Arch::setpriority: |
| // The syscall might fail due to insufficient |
| // permissions (e.g. while trying to decrease the nice value |
| // while not root). |
| // We'll choose to honor the new value anyway since we'd like |
| // to be able to test configurations where a child thread |
| // has a lower nice value than its parent, which requires |
| // lowering the child's nice value. |
| if ((int)regs.arg1_signed() == PRIO_PROCESS) { |
| RecordTask* target = |
| (int)regs.arg2_signed() |
| ? t->session().find_task((int)regs.arg2_signed()) |
| : t; |
| if (target) { |
| LOG(debug) << "Setting nice value for tid " << t->tid << " to " |
| << regs.arg3(); |
| target->session().scheduler().update_task_priority( |
| target, (int)regs.arg3_signed()); |
| } |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::pause: |
| return ALLOW_SWITCH; |
| |
| case Arch::ppoll_time64: |
| case Arch::ppoll: |
| /* The raw syscall modifies this with the time remaining; libc does not
| expose this functionality, however. */
| if (syscallno == Arch::ppoll) { |
| syscall_state.reg_parameter<typename Arch::timespec>(3, IN_OUT); |
| } else { |
| syscall_state.reg_parameter<typename Arch::Arch64::timespec>(3, IN_OUT); |
| } |
| syscall_state.reg_parameter<typename Arch::kernel_sigset_t>( |
| 4, IN, protect_rr_sigs); |
| t->invalidate_sigmask(); |
| RR_FALLTHROUGH; |
| case Arch::poll: { |
| auto nfds = (nfds_t)regs.arg2(); |
| syscall_state.reg_parameter(1, sizeof(typename Arch::pollfd) * nfds, |
| IN_OUT); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::perf_event_open: { |
| RecordTask* target = t->session().find_task((pid_t)regs.arg2_signed()); |
| int cpu = regs.arg3_signed(); |
| unsigned long flags = regs.arg5(); |
| int allowed_perf_flags = PERF_FLAG_FD_CLOEXEC; |
| if (target && cpu == -1 && !(flags & ~allowed_perf_flags)) { |
| auto attr = |
| t->read_mem(remote_ptr<struct perf_event_attr>(regs.arg1())); |
| if (VirtualPerfCounterMonitor::should_virtualize(attr)) { |
| Registers r = regs; |
| // Turn this into an inotify_init() syscall. This just gives us an |
| // allocated fd. Syscalls using this fd will be emulated (except for |
| // close()). |
| r.set_original_syscallno(Arch::inotify_init1); |
| int in_flags = (flags & PERF_FLAG_FD_CLOEXEC) ? O_CLOEXEC : 0; |
| r.set_arg1(in_flags); |
| t->set_regs(r); |
| } |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::connect: |
| return maybe_blacklist_connect<Arch>(t, regs.arg2(), regs.arg3()); |
| |
| case Arch::close: |
| if (t->fd_table()->is_rr_fd((int)regs.arg1())) { |
| // Don't let processes close this fd. Abort with EBADF by setting the
| // fd to -1, as if the fd were already closed.
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::dup2: |
| case Arch::dup3: |
| if (t->fd_table()->is_rr_fd((int)regs.arg2())) { |
| // Don't let processes dup over this fd. Abort with EBADF by setting |
| // oldfd to -1. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| } |
| return PREVENT_SWITCH; |
| |
| /* int prctl(int option, unsigned long arg2, unsigned long arg3, unsigned |
| * long arg4, unsigned long arg5); */ |
| case Arch::prctl: |
| switch ((int)regs.arg1_signed()) { |
| case PR_GET_CHILD_SUBREAPER: |
| case PR_GET_ENDIAN: |
| case PR_GET_FPEMU: |
| case PR_GET_FPEXC: |
| case PR_GET_PDEATHSIG: |
| case PR_GET_UNALIGN: |
| syscall_state.reg_parameter<int>(2); |
| break; |
| |
| case PR_GET_KEEPCAPS: |
| case PR_GET_NO_NEW_PRIVS: |
| case PR_GET_TIMERSLACK: |
| case PR_MCE_KILL: |
| case PR_MCE_KILL_GET: |
| case PR_SET_CHILD_SUBREAPER: |
| case PR_SET_KEEPCAPS: |
| case PR_SET_NAME: |
| case PR_SET_PDEATHSIG: |
| case PR_SET_TIMERSLACK: |
| case PR_CAP_AMBIENT: |
| case PR_CAPBSET_DROP: |
| case PR_CAPBSET_READ: |
| case PR_GET_SPECULATION_CTRL: |
| case PR_SET_SPECULATION_CTRL: |
| case PR_GET_THP_DISABLE: |
| case PR_SET_THP_DISABLE: |
| case PR_SET_SECUREBITS: |
| case PR_GET_SECUREBITS: |
| break; |
| |
| case PR_SET_DUMPABLE: |
| if (regs.arg2() == 0) { |
| // Don't let processes make themselves undumpable. If a process |
| // becomes undumpable, calling perf_event_open on it fails. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| syscall_state.emulate_result(0); |
| t->thread_group()->dumpable = false; |
| } else if (regs.arg2() == 1) { |
| t->thread_group()->dumpable = true; |
| } |
| break; |
| |
| case PR_GET_DUMPABLE: |
| syscall_state.emulate_result(t->thread_group()->dumpable); |
| break; |
| |
| case PR_GET_SECCOMP: |
| syscall_state.emulate_result(t->prctl_seccomp_status); |
| break; |
| |
| case PR_GET_TSC: { |
| // Prevent the actual GET_TSC call and return our emulated state. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| syscall_state.emulate_result(0); |
| t->write_mem(syscall_state.reg_parameter<int>(2, IN_OUT_NO_SCRATCH), |
| t->tsc_mode); |
| break; |
| } |
| |
| case PR_SET_TSC: { |
| // Prevent the actual SET_TSC call. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| int val = (int)regs.arg2(); |
| if (val != PR_TSC_ENABLE && val != PR_TSC_SIGSEGV) { |
| syscall_state.emulate_result(-EINVAL); |
| } else { |
| syscall_state.emulate_result(0); |
| t->tsc_mode = val; |
| } |
| break; |
| } |
| |
| case PR_GET_NAME: |
| syscall_state.reg_parameter(2, 16); |
| break; |
| |
| case PR_SET_NO_NEW_PRIVS: |
| if ((unsigned long)regs.arg2() != 1) { |
| syscall_state.expect_errno = EINVAL; |
| } |
| break; |
| |
| case PR_SET_SECCOMP: |
| // Allow all known seccomp calls. We must allow the seccomp call |
| // that rr triggers when spawning the initial tracee. |
| switch ((unsigned long)regs.arg2()) { |
| case SECCOMP_MODE_STRICT: |
| break; |
| case SECCOMP_MODE_FILTER: { |
| // If we're bootstrapping then this must be rr's own syscall |
| // filter, so just install it normally now. |
| if (t->session().done_initial_exec()) { |
| // Prevent the actual prctl call. We'll fix this up afterwards. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| } |
| break; |
| } |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| break; |
| |
| case PR_SET_PTRACER: { |
| // Prevent any PR_SET_PTRACER call, but pretend it succeeded, since |
| // we don't want any interference with our ptracing. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| syscall_state.emulate_result(0); |
| break; |
| } |
| |
| case PR_SET_MM: { |
| switch ((unsigned long)regs.arg2()) { |
| case PR_SET_MM_MAP_SIZE: |
| syscall_state.reg_parameter(3, sizeof(unsigned int)); |
| break; |
| |
| case PR_SET_MM_ARG_START: |
| case PR_SET_MM_ARG_END: |
| break; |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| break; |
| } |
| |
| case PR_SET_VMA: { |
| switch (regs.arg2()) { |
| case PR_SET_VMA_ANON_NAME: |
| // PR_SET_VMA_ANON_NAME is used to communicate additional details |
| // about the VMA to the kernel. VMAs with different anonymous |
| // names are not merged by the kernel. None of this affects rr, |
| // and this prctl has no outparams. |
| break; |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| break; |
| } |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::arch_prctl: |
| switch ((int)regs.arg1_signed()) { |
| case ARCH_SET_FS: |
| case ARCH_SET_GS: |
| break; |
| |
| case ARCH_GET_FS: |
| case ARCH_GET_GS: |
| syscall_state.reg_parameter<typename Arch::unsigned_long>(2); |
| break; |
| |
| case ARCH_SET_CPUID: { |
| if (!t->session().has_cpuid_faulting()) { |
| // Not supported or not working. Allow the call to go |
| // through to get the right result. |
| break; |
| } |
| |
| // Prevent the actual SET_CPUID call. |
| Registers r = t->regs(); |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| int val = (int)t->regs().arg2(); |
| t->cpuid_mode = !!val; |
| syscall_state.emulate_result(0); |
| break; |
| } |
| |
| case ARCH_GET_CPUID: { |
| if (!t->session().has_cpuid_faulting()) { |
| // Not supported or not working. Allow the call to go |
| // through to get the right result. |
| break; |
| } |
| |
| // Prevent the actual GET_CPUID call and return our emulated state. |
| Registers r = t->regs(); |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| syscall_state.emulate_result(t->cpuid_mode); |
| break; |
| } |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::ioctl: |
| return prepare_ioctl<Arch>(t, syscall_state); |
| |
| case Arch::bpf: |
| return prepare_bpf<Arch>(t, syscall_state); |
| |
| case Arch::_sysctl: { |
| auto argsp = |
| syscall_state.reg_parameter<typename Arch::__sysctl_args>(1, IN); |
| auto oldlenp = syscall_state.mem_ptr_parameter_inferred( |
| REMOTE_PTR_FIELD(argsp, oldlenp), IN_OUT); |
| syscall_state.mem_ptr_parameter( |
| REMOTE_PTR_FIELD(argsp, oldval), |
| ParamSize::from_initialized_mem(t, oldlenp)); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::quotactl: |
| switch (regs.arg1() >> SUBCMDSHIFT) { |
| case Q_GETQUOTA: |
| syscall_state.reg_parameter<typename Arch::dqblk>(4); |
| break; |
| case Q_GETINFO: |
| syscall_state.reg_parameter<typename Arch::dqinfo>(4); |
| break; |
| case Q_GETFMT: |
| syscall_state.reg_parameter<int>(4); |
| break; |
| case Q_SETQUOTA: |
| FATAL() << "Trying to set disk quota usage, this may interfere with " |
| "rr recording"; |
| // not reached; the break is just to silence fallthrough warnings |
| break; |
| case Q_QUOTAON: |
| case Q_QUOTAOFF: |
| case Q_SETINFO: |
| case Q_SYNC: |
| break; |
| default: |
| // Don't set expect_errno here because quotactl can fail with |
| // various error codes before checking the command |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::keyctl: |
| switch ((int)regs.arg1_signed()) { |
| case KEYCTL_GET_KEYRING_ID: |
| case KEYCTL_JOIN_SESSION_KEYRING: |
| case KEYCTL_UPDATE: |
| case KEYCTL_REVOKE: |
| case KEYCTL_CHOWN: |
| case KEYCTL_SETPERM: |
| case KEYCTL_CLEAR: |
| case KEYCTL_LINK: |
| case KEYCTL_UNLINK: |
| case KEYCTL_SEARCH: |
| case KEYCTL_INSTANTIATE: |
| case KEYCTL_INSTANTIATE_IOV: |
| case KEYCTL_NEGATE: |
| case KEYCTL_REJECT: |
| case KEYCTL_SET_REQKEY_KEYRING: |
| case KEYCTL_SET_TIMEOUT: |
| case KEYCTL_ASSUME_AUTHORITY: |
| case KEYCTL_SESSION_TO_PARENT: |
| case KEYCTL_INVALIDATE: |
| case KEYCTL_GET_PERSISTENT: |
| break; |
| |
| case KEYCTL_DESCRIBE: |
| case KEYCTL_READ: |
| case KEYCTL_GET_SECURITY: |
| case KEYCTL_DH_COMPUTE: |
| syscall_state.reg_parameter( |
| 3, ParamSize::from_syscall_result<typename Arch::signed_long>( |
| regs.arg4())); |
| break; |
| |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| /* int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int |
| * timeout); */ |
| case Arch::epoll_wait: |
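| // The result is the number of ready events; record
| // result * sizeof(epoll_event), capped at maxevents (arg 3) events.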
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<int>(sizeof(typename Arch::epoll_event) * regs.arg3_signed(), |
| sizeof(typename Arch::epoll_event))); |
| return ALLOW_SWITCH; |
| |
| case Arch::epoll_pwait2: |
| case Arch::epoll_pwait: { |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<int>(sizeof(typename Arch::epoll_event) * regs.arg3_signed(), |
| sizeof(typename Arch::epoll_event))); |
| t->invalidate_sigmask(); |
| return ALLOW_SWITCH; |
| } |
| |
| /* The following two syscalls enable context switching not for |
| * liveness/correctness reasons, but rather because if we |
| * didn't context-switch away, rr might end up busy-waiting |
| * needlessly. In addition, albeit far less likely, the |
| * client program may have carefully optimized its own context |
| * switching and we should take the hint. */ |
| |
| case Arch::nanosleep: |
| syscall_state.reg_parameter<typename Arch::timespec>(2); |
| return ALLOW_SWITCH; |
| |
| case Arch::clock_nanosleep: |
| syscall_state.reg_parameter<typename Arch::timespec>(4); |
| return ALLOW_SWITCH; |
| |
| case Arch::clock_nanosleep_time64: |
| syscall_state.reg_parameter<typename Arch::Arch64::timespec>(4); |
| return ALLOW_SWITCH; |
| |
| case Arch::sched_yield: |
| t->session().scheduler().schedule_one_round_robin(t); |
| return ALLOW_SWITCH; |
| |
| case Arch::rt_sigpending: |
| syscall_state.reg_parameter(1, (size_t)regs.arg2()); |
| return PREVENT_SWITCH; |
| |
| case Arch::rt_sigtimedwait_time64: |
| case Arch::rt_sigtimedwait: |
| syscall_state.reg_parameter<typename Arch::siginfo_t>(2); |
| return ALLOW_SWITCH; |
| |
| case Arch::rt_sigprocmask: |
| case Arch::sigprocmask: { |
| syscall_state.reg_parameter<typename Arch::kernel_sigset_t>(3); |
| syscall_state.reg_parameter<typename Arch::kernel_sigset_t>( |
| 2, IN, protect_rr_sigs); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::sigaction: |
| case Arch::rt_sigaction: { |
| syscall_state.reg_parameter<typename Arch::kernel_sigaction>( |
| 2, IN, protect_rr_sigs_sa_mask); |
| syscall_state.reg_parameter<typename Arch::kernel_sigaction>(3, OUT); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::getxattr: |
| case Arch::lgetxattr: |
| case Arch::fgetxattr: |
| syscall_state.reg_parameter( |
| 3, ParamSize::from_syscall_result<ssize_t>(regs.arg4())); |
| return PREVENT_SWITCH; |
| |
| case Arch::listxattr: |
| case Arch::llistxattr: |
| case Arch::flistxattr: |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<ssize_t>(regs.arg3())); |
| return PREVENT_SWITCH; |
| |
| case Arch::sched_getattr: { |
| syscall_state.reg_parameter(2, ParamSize(regs.arg3())); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::sched_setaffinity: { |
| // Ignore all sched_setaffinity syscalls. They might interfere |
| // with our own affinity settings. |
| Registers r = regs; |
| // Set arg1 to an invalid PID to ensure this syscall is ignored. |
| r.set_arg1(-1); |
| t->set_regs(r); |
| syscall_state.emulate_result(0); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::sched_getaffinity: |
| syscall_state.reg_parameter(3, ParamSize(regs.arg2())); |
| return PREVENT_SWITCH; |
| |
| case Arch::ptrace: |
| return prepare_ptrace<Arch>(t, syscall_state); |
| |
| case Arch::mincore: |
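| // The output vector has one byte per page of the region, so round the
| // length (arg 2) up to whole pages: e.g. with 4KiB pages, a length of
| // 8193 covers 3 pages and needs 3 bytes.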
| syscall_state.reg_parameter( |
| 3, (regs.arg2() + page_size() - 1) / page_size()); |
| return PREVENT_SWITCH; |
| |
| case Arch::shmctl: |
| return prepare_shmctl<Arch>(syscall_state, (int)regs.arg2_signed(), 3); |
| |
| case Arch::semctl: |
| return prepare_semctl<Arch>(t, syscall_state, (int)regs.arg1_signed(), |
| (int)regs.arg3_signed(), 4, USE_DIRECTLY); |
| |
| case Arch::seccomp: |
| switch ((unsigned int)regs.arg1()) { |
| case SECCOMP_SET_MODE_STRICT: |
| case SECCOMP_GET_ACTION_AVAIL: |
| break; |
| case SECCOMP_SET_MODE_FILTER: { |
| // Prevent the actual seccomp call. We'll fix this up afterwards. |
| Registers r = regs; |
| r.set_arg1(intptr_t(-1)); |
| t->set_regs(r); |
| break; |
| } |
| case SECCOMP_GET_NOTIF_SIZES: |
| syscall_state.reg_parameter<typename Arch::seccomp_notif_sizes>(3); |
| break; |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::get_mempolicy: { |
| syscall_state.reg_parameter(1, sizeof(int)); |
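| // The kernel copies out maxnode bits of nodemask, rounded up to whole
| // machine words. A worked example, assuming a 64-bit arch: maxnode = 65
| // -> align_mask = 63 -> aligned_maxnode = 128 -> 16 bytes recorded.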
| unsigned long maxnode = t->regs().arg3(); |
| unsigned long align_mask = 8 * sizeof(typename Arch::unsigned_long) - 1; |
| unsigned long aligned_maxnode = (maxnode + align_mask) & ~align_mask; |
| syscall_state.reg_parameter(2, aligned_maxnode / 8); |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::madvise: |
| switch ((int)regs.arg3()) { |
| case MADV_NORMAL: |
| case MADV_RANDOM: |
| case MADV_SEQUENTIAL: |
| case MADV_WILLNEED: |
| case MADV_DONTNEED: |
| case MADV_REMOVE: |
| case MADV_DONTFORK: |
| case MADV_DOFORK: |
| case MADV_SOFT_OFFLINE: |
| case MADV_HWPOISON: |
| case MADV_MERGEABLE: |
| case MADV_UNMERGEABLE: |
| case MADV_HUGEPAGE: |
| case MADV_NOHUGEPAGE: |
| case MADV_DONTDUMP: |
| case MADV_DODUMP: |
| case MADV_WIPEONFORK: |
| case MADV_KEEPONFORK: |
| break; |
| case MADV_FREE: { |
| // MADV_FREE introduces nondeterminism --- the kernel zeroes the |
| // pages when under memory pressure. So we don't allow it. |
| Registers r = regs; |
| r.set_arg3(-1); |
| t->set_regs(r); |
| break; |
| } |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::personality: { |
| int p = regs.arg1(); |
| if (p == -1) { |
| // A special argument that only returns the existing personality. |
| return PREVENT_SWITCH; |
| } |
| switch ((uint8_t)p) { |
| case PER_LINUX32: |
| case PER_LINUX: |
| // The default personality requires no handling. |
| break; |
| default: |
| syscall_state.expect_errno = EINVAL; |
| break; |
| } |
| |
| if (t->session().enable_chaos()) { |
| // XXX fix this to actually disable chaos mode ASLR? |
| ASSERT(t, |
| !(p & (ADDR_COMPAT_LAYOUT | ADDR_NO_RANDOMIZE | |
| ADDR_LIMIT_32BIT | ADDR_LIMIT_3GB))) |
| << "Personality value " << HEX(p) |
| << " not compatible with chaos mode address-space randomization"; |
| } |
| if (p & 0xffffff00 & |
| ~(ADDR_COMPAT_LAYOUT | ADDR_NO_RANDOMIZE | ADDR_LIMIT_32BIT | |
| ADDR_LIMIT_3GB | FDPIC_FUNCPTRS | MMAP_PAGE_ZERO | SHORT_INODE | |
| STICKY_TIMEOUTS | UNAME26 | WHOLE_SECONDS | READ_IMPLIES_EXEC)) { |
| syscall_state.expect_errno = EINVAL; |
| } |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::mmap: |
| switch (Arch::mmap_semantics) { |
| case Arch::StructArguments: { |
| auto args = |
| t->read_mem(remote_ptr<typename Arch::mmap_args>(regs.arg1())); |
| // XXX fix these unsupported features?
| // Only the most ancient code should be using old-style mmap on 32-bit;
| // modern glibc uses mmap2.
| FileMonitor* monitor = t->fd_table()->get_monitor(args.fd); |
| if (monitor) { |
| FileMonitor::Type monitor_type = monitor->type(); |
| ASSERT(t, monitor_type != FileMonitor::VirtualPerfCounter && |
| monitor_type != FileMonitor::NonvirtualPerfCounter); |
| } |
| ASSERT(t, !(args.flags & MAP_GROWSDOWN)); |
| break; |
| } |
| case Arch::RegisterArguments: { |
| prepare_mmap_register_params(t); |
| break; |
| } |
| } |
| return PREVENT_SWITCH; |
| |
| case Arch::mmap2: |
| prepare_mmap_register_params(t); |
| return PREVENT_SWITCH; |
| |
| case Arch::pkey_mprotect: |
| case Arch::mprotect: |
| // Since we're stripping MAP_GROWSDOWN from kernel mmap calls, we need |
| // to implement PROT_GROWSDOWN ourselves. |
| t->vm()->fixup_mprotect_growsdown_parameters(t); |
| return PREVENT_SWITCH; |
| |
| case SYS_rrcall_notify_control_msg: |
| case SYS_rrcall_init_preload: |
| case SYS_rrcall_notify_stap_semaphore_added: |
| case SYS_rrcall_notify_stap_semaphore_removed: |
| syscall_state.emulate_result(0); |
| return PREVENT_SWITCH; |
| |
| // This normally won't be executed, but it can be if an RDTSC traps into
| // the syscallbuf as a fake rrcall_rdtsc that we then can't buffer
| // because the buffer is full or disabled.
| case SYS_rrcall_rdtsc: { |
| syscall_state.emulate_result(0); |
| uint64_t tsc = rdtsc(); |
| remote_ptr<uint64_t> addr(t->regs().arg1()); |
| t->write_mem(addr, tsc); |
| t->record_local(addr, &tsc); |
| return PREVENT_SWITCH; |
| } |
| |
| case SYS_rrcall_init_buffers: |
| // This is purely for testing purposes. See signal_during_preload_init. |
| if (send_signal_during_init_buffers()) { |
| syscall(SYS_tgkill, t->tgid(), t->tid, SIGCHLD); |
| } |
| syscall_state.reg_parameter<rrcall_init_buffers_params<Arch>>(1, IN_OUT); |
| return PREVENT_SWITCH; |
| |
| case SYS_rrcall_check_presence: { |
| // Since this is "user" facing, we follow best practices for regular |
| // syscalls and make sure that unused arguments (in this case all of them) |
| // are zero. |
| bool arguments_are_zero = true; |
| Registers r = t->regs(); |
| for (int i = 1; i <= 6; ++i) { |
| arguments_are_zero &= r.arg(i) == 0; |
| } |
| syscall_state.emulate_result(arguments_are_zero ? 0 : (uintptr_t)-EINVAL); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| |
| case SYS_rrcall_detach_teleport: { |
| bool arguments_are_zero = true; |
| Registers r = t->regs(); |
| for (int i = 1; i <= 6; ++i) { |
| arguments_are_zero &= r.arg(i) == 0; |
| } |
| if (!arguments_are_zero) { |
| syscall_state.emulate_result((uintptr_t)-EINVAL); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| |
| t->exit_syscall(); |
| pid_t new_tid = do_detach_teleport(t); |
| |
| // Leave the proxy where it is --- it has just exited the detach_teleport
| // syscall. We won't resume it again until we kill it in ~RecordTask.
| t->detached_proxy = true; |
| // Just have the same task object represent both the zombie task |
| // and be able to receive death notices for the detached tracee |
| // to forward. |
| t->session().on_proxy_detach(t, new_tid); |
| t->rec_tid = new_tid; |
| |
| return ALLOW_SWITCH; |
| } |
| |
| case SYS_rrcall_arm_time_slice: { |
| Registers r = t->regs(); |
| bool arguments_are_zero = true; |
| for (int i = 2; i <= 6; ++i) { |
| arguments_are_zero &= r.arg(i) == 0; |
| } |
| // A ticks request of zero is currently invalid for this syscall.
| // In the future we may want it to mean "simulate a timeslice expiry
| // at the end of this syscall", but we have no use for that yet.
| if (r.arg(1) == 0 || r.arg(1) > (uintptr_t)MAX_TICKS_REQUEST || |
| !arguments_are_zero) { |
| syscall_state.emulate_result((uintptr_t)-EINVAL); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| t->tick_request_override = (TicksRequest)r.arg(1); |
| syscall_state.emulate_result(0); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| |
| case SYS_rrcall_freeze_tid: { |
| Registers r = t->regs(); |
| bool arguments_are_zero = true; |
| for (int i = 3; i <= 6; ++i) { |
| arguments_are_zero &= r.arg(i) == 0; |
| } |
| pid_t tid = r.arg(1); |
| int enable = r.arg(2); |
| if (!arguments_are_zero || (enable != 0 && enable != 1)) { |
| syscall_state.emulate_result((uintptr_t)-EINVAL); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| RecordTask* requested_task = t->session().find_task(tid);
| if (!requested_task) { |
| syscall_state.emulate_result((uintptr_t)-ESRCH); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| requested_task->schedule_frozen = enable; |
| syscall_state.emulate_result(0); |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| |
| case Arch::brk: |
| case Arch::munmap: |
| case Arch::process_vm_readv: |
| case Arch::process_vm_writev: |
| case SYS_rrcall_notify_syscall_hook_exit: |
| case Arch::mremap: |
| case Arch::shmat: |
| case Arch::shmdt: |
| return PREVENT_SWITCH; |
| |
| case Arch::sigsuspend: |
| case Arch::rt_sigsuspend: |
| t->invalidate_sigmask(); |
| return ALLOW_SWITCH; |
| |
| case Arch::sigreturn: |
| case Arch::rt_sigreturn: |
| // If a sigreturn/rt_sigreturn ever comes through the syscallbuf, we |
| // have switched to the syscallbuf stack (which does not contain any of the |
| // kernel's sigframe data) and we are about to explode (when the kernel restores |
| // the program's registers to random garbage from the syscallbuf stack). Die now |
| // with a useful error message. |
| // |
| // We decline to patch syscall instructions when they're invoked with the |
| // sigreturn/rt_sigreturn syscall. This covers the kernel's inserted sigreturn |
| // trampolines. If the program intentionally invokes these syscalls through a |
| // generic wrapper like syscall(2), it'll have to be recorded with the syscallbuf |
| // disabled. |
| ASSERT(t, !t->is_in_rr_page()) << |
| "sigreturn/rt_sigreturn syscalls cannot be processed through the syscallbuf " |
| "because the stack pointer will be wrong. Is this program invoking them " |
| "through the glibc `syscall` wrapper?\nrerecord with -n to fix this"; |
| if (t->arch() == aarch64) { |
| // This is a bit of a hack, but we don't really have a |
| // good way to do this otherwise. We need to record the |
| // restored x7 register, but the kernel will lie to us |
| // about it. |
| remote_ptr<ARM64Arch::rt_sigframe> frame = t->regs().sp().cast<ARM64Arch::rt_sigframe>(); |
| auto x_regs_arr = REMOTE_PTR_FIELD(REMOTE_PTR_FIELD( |
| REMOTE_PTR_FIELD(REMOTE_PTR_FIELD(frame, uc), uc_mcontext), |
| regs),x); |
| auto x7 = t->read_mem(x_regs_arr.field((uintptr_t*)nullptr, |
| 7 * sizeof(uintptr_t))); |
| syscall_state.syscall_entry_registers.set_x7(x7); |
| } |
| t->invalidate_sigmask(); |
| return PREVENT_SWITCH; |
| |
| case Arch::mq_timedreceive_time64: |
| case Arch::mq_timedreceive: { |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<typename Arch::ssize_t>( |
| (size_t)regs.arg3())); |
| syscall_state.reg_parameter<unsigned>(4); |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::modify_ldt: { |
| int func = regs.arg1(); |
| if (func == 0 || func == 2) { |
| syscall_state.reg_parameter( |
| 2, ParamSize::from_syscall_result<int>((size_t)regs.arg3())); |
| } |
| // N.B. Unlike set_thread_area, the entry number is not written |
| // for (func == 1 || func == 0x11) |
| return ALLOW_SWITCH; |
| } |
| |
| case Arch::name_to_handle_at: { |
| syscall_state.reg_parameter(3, |
| sizeof(typename Arch::file_handle) + MAX_HANDLE_SZ); |
| syscall_state.reg_parameter(4, sizeof(int)); |
| return ALLOW_SWITCH; |
| } |
| |
| default: |
| // Invalid syscalls return -ENOSYS. Assume any such |
| // result means the syscall was completely ignored by the |
| // kernel so it's OK for us to not do anything special. |
| // Other results mean we probably need to understand this |
| // syscall, but we don't. |
| syscall_state.expect_errno = ENOSYS; |
| return PREVENT_SWITCH; |
| } |
| } |
| |
| static Switchable rec_prepare_syscall_internal( |
| RecordTask* t, TaskSyscallState& syscall_state) { |
| SupportedArch arch = t->ev().Syscall().arch(); |
| return with_converted_registers<Switchable>( |
| t->regs(), arch, [&](const Registers& regs) -> Switchable { |
| RR_ARCH_FUNCTION(rec_prepare_syscall_arch, arch, t, syscall_state, |
| regs); |
| }); |
| } |
| |
| Switchable rec_prepare_syscall(RecordTask* t) { |
| t->syscall_state = make_unique<TaskSyscallState>(); |
| auto& syscall_state = TaskSyscallState::get(t); |
| syscall_state.init(t); |
| |
| Switchable s = rec_prepare_syscall_internal(t, syscall_state); |
| return syscall_state.done_preparing(s); |
| } |
| |
| void rec_abort_prepared_syscall(RecordTask* t) { |
| auto syscall_state = TaskSyscallState::maybe_get(t); |
| if (syscall_state) { |
| syscall_state->abort_syscall_results(); |
| t->syscall_state = nullptr; |
| } |
| } |
| |
| static void aarch64_kernel_bug_workaround(RecordTask* t,
| const TaskSyscallState& syscall_state)
| { |
| if (syscall_state.syscall_entry_registers.arch() == aarch64) { |
| // The kernel lies about the real register state during syscall exits. |
| // Try to fix that up to retain some measure of sanity (otherwise we |
| // might leak an incorrect register into userspace, causing an |
| // un-recorded divergence). I'm really hoping to get this fixed |
| // in the kernel. |
| Registers r = t->regs(); |
| r.set_x7(syscall_state.syscall_entry_registers.x7()); |
| t->set_regs(r); |
| } |
| } |
| |
| template <typename Arch> |
| static void rec_prepare_restart_syscall_arch(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| int syscallno = t->ev().Syscall().number; |
| aarch64_kernel_bug_workaround(t, syscall_state); |
| switch (syscallno) { |
| case Arch::nanosleep: |
| case Arch::clock_nanosleep: |
| case Arch::clock_nanosleep_time64: |
| /* Hopefully uniquely among syscalls, nanosleep()/clock_nanosleep() |
| * requires writing to its remaining-time outparam |
| * *only if* the syscall fails with -EINTR. When a |
| * nanosleep() is interrupted by a signal, we don't |
| * know a priori whether it's going to be eventually |
| * restarted or not. (Not easily, anyway.) So we |
| * don't know whether it will eventually return -EINTR |
| * and would need the outparam written. To resolve |
| * that, we do what the kernel does, and update the |
| * outparam at the -ERESTART_RESTART interruption |
| * regardless. */ |
| syscall_state.process_syscall_results(); |
| break; |
| case Arch::ppoll: |
| case Arch::ppoll_time64: |
| case Arch::pselect6: |
| case Arch::pselect6_time64: |
| case Arch::sigsuspend: |
| case Arch::rt_sigsuspend: |
| t->invalidate_sigmask(); |
| break; |
| case Arch::wait4: |
| case Arch::waitid: |
| case Arch::waitpid: { |
| Registers r = t->regs(); |
| r.set_original_syscallno( |
| syscall_state.syscall_entry_registers.original_syscallno()); |
| t->set_regs(r); |
| t->canonicalize_regs(t->arch()); |
| t->in_wait_type = WAIT_TYPE_NONE; |
| break; |
| } |
| } |
| } |
| |
| static void rec_prepare_restart_syscall_internal( |
| RecordTask* t, TaskSyscallState& syscall_state) { |
| RR_ARCH_FUNCTION(rec_prepare_restart_syscall_arch, t->arch(), t, |
| syscall_state); |
| } |
| |
| void rec_prepare_restart_syscall(RecordTask* t) { |
| auto& syscall_state = TaskSyscallState::get(t); |
| rec_prepare_restart_syscall_internal(t, syscall_state); |
| t->syscall_state = nullptr; |
| } |
| |
| static const char* dropped_privs_warning = |
| "[WARNING] rr: Executed file with setuid or file capabilities set.\n" |
| " Capabilities did not take effect. Errors may follow.\n" |
| " To record this execution faithfully, re-run rr as:\n" |
| "\n" |
| " sudo -EP --preserve-env=HOME rr record --setuid-sudo\n" |
| "\n"; |
| |
| static bool is_privileged_executable(RecordTask* t, const string& path) { |
| struct vfs_cap_data actual, empty; |
| memset(actual.data, 0, sizeof(actual.data)); |
| memset(empty.data, 0, sizeof(empty.data)); |
| if (-1 != getxattr(path.c_str(), "security.capability", &actual, |
| sizeof(vfs_cap_data))) { |
| if (memcmp(&actual, &empty, sizeof(actual.data)) != 0) { |
| return true; |
| } |
| } else { |
| ASSERT(t, errno == ENODATA || errno == ENOTSUP); |
| struct stat buf; |
| stat(path.c_str(), &buf); |
| if (buf.st_mode & (S_ISUID | S_ISGID)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| static bool in_same_mount_namespace_as(RecordTask* t) { |
| char proc_ns_mount[PATH_MAX]; |
| snprintf(proc_ns_mount, sizeof(proc_ns_mount), "/proc/%d/ns/mnt", t->tid); |
| struct stat my_buf, their_buf; |
| ASSERT(t, stat("/proc/self/ns/mnt", &my_buf) == 0); |
| ASSERT(t, stat(proc_ns_mount, &their_buf) == 0); |
| return my_buf.st_ino == their_buf.st_ino; |
| } |
| |
| static void check_privileged_exe(RecordTask* t) { |
| // Check if the executable we just execed has setuid bits or file
| // capabilities set. If so (and rr doesn't have CAP_SYS_ADMIN, which would
| // have let us avoid setting no_new_privs), they may have been ignored due
| // to our no_new_privs setting in the tracee. That's most likely not what
| // the user intended (and setuid applications may not handle not being root
| // particularly gracefully - after all, under usual circumstances it would
| // be an exec-time error). Give a loud warning to tell the user what
| // happened, but continue anyway.
| static bool gave_stern_warning = false; |
| if (!in_same_mount_namespace_as(t)) { |
| // We could try to enter the mount namespace and perform the below check |
| // there, but don't bother. We know we must have privileges over the mount |
| // namespaces (either because it's an unprivileged user namespace, in which |
| // case we have full privileges, or because at some point one of our |
| // tracees had to have CAP_SYS_ADMIN/CAP_SETUID to create the mount |
| // namespace - as a result we must have at least as much privilege). |
| // Nevertheless, we still need to stop the hpc counters, since |
| // the executable may be privileged with respect to its namespace. |
| t->hpc.stop(); |
| } else if (is_privileged_executable(t, t->vm()->exe_image())) { |
| if (has_effective_caps(1 << CAP_SYS_ADMIN)) { |
| // perf_events may have decided to stop counting for security reasons. |
| // To be safe, close all perf counters now, to force re-opening the |
| // perf file descriptors the next time we resume the task. |
| t->hpc.stop(); |
| } else { |
| // Only issue the warning once. If it's a problem, the user will likely |
| // find out soon enough. If not, no need to keep bothering them. |
| if (!gave_stern_warning) { |
| fputs(dropped_privs_warning, stderr); |
| gave_stern_warning = true; |
| } |
| } |
| } |
| } |
| |
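| // Read a word of `wsize` bytes from `buf`, zero-extended to 64 bits.
| // (All architectures rr supports are little-endian, so this matches the
| // layout of auxv entries read from the tracee.)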
| static uint64_t word_at(uint8_t* buf, size_t wsize) { |
| union { |
| uint8_t buf[8]; |
| uint64_t v; |
| } u; |
| memcpy(u.buf, buf, wsize); |
| memset(u.buf + wsize, 0, 8 - wsize); |
| return u.v; |
| } |
| |
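| // Scan the tracee's auxv for the AT_ENTRY (executable entry point) and
| // AT_BASE (interpreter load base) entries. auxv is a flat array of
| // (key, value) word pairs.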
| static pair<remote_ptr<void>, remote_ptr<void>> get_exe_entry_interp_base(Task* t) { |
| remote_ptr<void> exe_entry; |
| remote_ptr<void> interp_base; |
| vector<uint8_t> v = read_auxv(t); |
| size_t i = 0; |
| size_t wsize = word_size(t->arch()); |
| while ((i + 1)*wsize*2 <= v.size()) { |
| if (word_at(v.data() + i*2*wsize, wsize) == AT_ENTRY) { |
| exe_entry = word_at(v.data() + (i*2 + 1)*wsize, wsize); |
| } else if (word_at(v.data() + i*2*wsize, wsize) == AT_BASE) { |
| interp_base = word_at(v.data() + (i*2 + 1)*wsize, wsize); |
| } |
| ++i; |
| } |
| return make_pair(exe_entry, interp_base); |
| } |
| |
| /** |
| * Given `file_name`, where `file_name` is relative to our root directory |
| * but is in the mount namespace of `t`, try to make it a file we can read. |
| */ |
| static string try_make_process_file_name(RecordTask* t, |
| const std::string& file_name) { |
| char proc_root[32]; |
| // /proc/<pid>/root has magical properties; not only is it a link, but |
| // it links to a view of the filesystem as the process sees it, taking into |
| // account the process mount namespace etc. |
| snprintf(proc_root, sizeof(proc_root), "/proc/%d/root", t->tid); |
| char root[PATH_MAX]; |
| ssize_t ret = readlink(proc_root, root, sizeof(root) - 1); |
| ASSERT(t, ret >= 0); |
| root[ret] = 0; |
| |
| if (strncmp(root, file_name.c_str(), ret)) { |
| LOG(debug) << "File " << file_name << " is outside known root " |
| << proc_root; |
| return file_name; |
| } |
| return string(proc_root) + (ret == 1 ? file_name : file_name.substr(ret)); |
| } |
| |
| |
| static void process_execve(RecordTask* t, TaskSyscallState& syscall_state) { |
| Registers r = t->regs(); |
| if (r.syscall_failed()) { |
| // Otherwise we would have done this during PTRACE_EVENT_EXEC |
| t->session().scheduler().did_exit_execve(t); |
| return; |
| } |
| |
| string interp_name; |
| { |
| std::string exe_path = t->proc_exe_path(); |
| ScopedFd fd(exe_path.c_str(), O_RDONLY); |
| ASSERT(t, fd.is_open()); |
| |
| if (ElfFileReader::is_x32_abi(fd)) { |
| // NB: We opened proc_exe_path because exe_path may not be correct in |
| // this namespace, but we want to report something more useful than |
| // /proc/<pid>/exe in the event of a failure. |
| FATAL() << "rr does not support the x32 ABI, but " << t->exe_path() |
| << " is an x32 ABI program."; |
| } |
| |
| ElfFileReader reader(fd); |
| interp_name = reader.read_interp(); |
| } |
| |
| t->post_exec_syscall(t->exe_path()); |
| t->ev().Syscall().exec_fds_to_close = |
| t->fd_table()->fds_to_close_after_exec(t); |
| |
| check_privileged_exe(t); |
| |
| KernelMapping rr_page_mapping = |
| t->vm()->mapping_of(AddressSpace::rr_page_start()).map; |
| auto mode = t->trace_writer().write_mapped_region( |
| t, rr_page_mapping, rr_page_mapping.fake_stat(), |
| rr_page_mapping.fsname(), |
| vector<TraceRemoteFd>(), |
| TraceWriter::RR_BUFFER_MAPPING); |
| ASSERT(t, mode == TraceWriter::DONT_RECORD_IN_TRACE); |
| |
| KernelMapping preload_thread_locals_mapping = |
| t->vm()->mapping_of(AddressSpace::preload_thread_locals_start()).map; |
| mode = t->trace_writer().write_mapped_region( |
| t, preload_thread_locals_mapping, |
| preload_thread_locals_mapping.fake_stat(), |
| preload_thread_locals_mapping.fsname(), |
| vector<TraceRemoteFd>(), |
| TraceWriter::RR_BUFFER_MAPPING); |
| ASSERT(t, mode == TraceWriter::DONT_RECORD_IN_TRACE); |
| |
| KernelMapping vvar; |
| KernelMapping vdso; |
| |
| // Get the remote executable's entry point; with that pointer we can find
| // out which mapping is the executable.
| auto auxv_pointers = get_exe_entry_interp_base(t); |
| auto exe_entry = auxv_pointers.first; |
| auto interp_base = auxv_pointers.second; |
| // NB: A binary is not required to have an interpreter. |
| |
| // Write out stack mappings first since during replay we need to set up the |
| // stack before any files get mapped. |
| vector<KernelMapping> stacks; |
| for (const auto& m : t->vm()->maps()) { |
| auto& km = m.map; |
| if (km.is_stack()) { |
| stacks.push_back(km); |
| } else if (km.is_vvar()) { |
| vvar = km; |
| } else if (km.is_vdso()) { |
| vdso = km; |
| } |
| |
| // If the entry point falls within this mapping, it is the executable.
| if (km.start() <= exe_entry && exe_entry < km.end()) { |
| ASSERT(t, km.prot() & PROT_EXEC) << "Entry point not in executable code?"; |
| syscall_state.exec_saved_event->set_exe_base(km.start()); |
| } |
| if (km.start() == interp_base) { |
| t->vm()->set_interp_base(interp_base); |
| syscall_state.exec_saved_event->set_interp_base(interp_base); |
| t->vm()->set_interp_name(interp_name); |
| syscall_state.exec_saved_event->set_interp_name(interp_name); |
| } |
| } |
| |
| t->session().trace_writer().write_task_event(*syscall_state.exec_saved_event); |
| |
| { |
| AutoRemoteSyscalls remote(t, AutoRemoteSyscalls::DISABLE_MEMORY_PARAMS); |
| |
| if (vvar.size()) { |
| // We're not going to map [vvar] during replay --- that wouldn't |
| // make sense, since it contains data from the kernel that isn't correct |
| // for replay, and we patch out the vdso syscalls that would use it. |
| // Unmapping it now makes recording look more like replay. |
| // Also note that under 4.0.7-300.fc22.x86_64 (at least) /proc/<pid>/mem |
| // can't read the contents of [vvar]. |
| remote.infallible_syscall(syscall_number_for_munmap(remote.arch()), |
| vvar.start(), vvar.size()); |
| t->vm()->unmap(t, vvar.start(), vvar.size()); |
| } |
| |
| if (t->session().unmap_vdso() && vdso.size()) { |
| remote.infallible_syscall(syscall_number_for_munmap(remote.arch()), |
| vdso.start(), vdso.size()); |
| t->vm()->unmap(t, vdso.start(), vdso.size()); |
| } |
| |
| for (auto& km : stacks) { |
| mode = t->trace_writer().write_mapped_region(t, km, km.fake_stat(), |
| km.fsname(), vector<TraceRemoteFd>(), |
| TraceWriter::EXEC_MAPPING); |
| ASSERT(t, mode == TraceWriter::RECORD_IN_TRACE); |
| auto buf = t->read_mem(km.start().cast<uint8_t>(), km.size()); |
| t->trace_writer().write_raw(t->rec_tid, buf.data(), km.size(), |
| km.start()); |
| |
| // Remove MAP_GROWSDOWN from stacks by remapping the memory and |
| // writing the contents back. |
| int flags = (km.flags() & ~MAP_GROWSDOWN) | MAP_ANONYMOUS; |
| remote.infallible_syscall(syscall_number_for_munmap(remote.arch()), |
| km.start(), km.size()); |
| if (!t->vm()->has_mapping(km.start() - page_size())) { |
| // Unmap an extra page at the start; this seems to be necessary |
| // to properly wipe out the growsdown mapping. Doing it as a separate |
| // munmap call also seems to be necessary. |
| remote.infallible_syscall(syscall_number_for_munmap(remote.arch()), |
| km.start() - page_size(), page_size()); |
| } |
| remote.infallible_mmap_syscall_if_alive(km.start(), km.size(), km.prot(), flags, |
| -1, 0); |
| t->write_mem(km.start().cast<uint8_t>(), buf.data(), buf.size()); |
| } |
| } |
| |
| // The kernel may modify some of the pages in the mapping according to |
| // ELF BSS metadata. We use /proc/<pid>/pagemap to observe which pages |
| // have been changed and mark them for recording. |
| ScopedFd& pagemap = t->pagemap_fd(); |
| ASSERT(t, pagemap.is_open()); |
| vector<remote_ptr<void>> pages_to_record; |
| |
| for (const auto& m : t->vm()->maps()) { |
| auto& km = m.map; |
| if (km.start() == AddressSpace::rr_page_start() || |
| km.start() == AddressSpace::preload_thread_locals_start()) { |
| continue; |
| } |
| if (km.is_stack() || km.is_vsyscall()) { |
| // [stack] has already been handled. |
| // [vsyscall] can't be read via /proc/<pid>/mem, *should* |
| // be the same across all execs, and can't be munmapped so we can't fix |
| // it even if it does vary. Plus no-one should be using it anymore. |
| continue; |
| } |
| struct stat st; |
| string file_name = try_make_process_file_name(t, km.fsname()); |
| if (stat(file_name.c_str(), &st) != 0) { |
| st = km.fake_stat(); |
| // The size is not real; don't let it confuse the logic below.
| st.st_size = 0; |
| } |
| if (t->trace_writer().write_mapped_region(t, km, st, file_name, vector<TraceRemoteFd>(), |
| TraceWriter::EXEC_MAPPING) == |
| TraceWriter::RECORD_IN_TRACE) { |
| if (st.st_size > 0) { |
| off64_t end = (off64_t)st.st_size - km.file_offset_bytes(); |
| t->record_remote(km.start(), min(end, (off64_t)km.size())); |
| } else { |
| // st_size is not valid. Some device files are mmappable but have zero |
| // size. We also take this path if there's no file at all (vdso etc). |
| t->record_remote(km.start(), km.size()); |
| } |
| } else { |
| auto ptr = km.start(); |
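| // Each /proc/<pid>/pagemap entry is 8 bytes per virtual page.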
| auto r = lseek(pagemap.get(), ptr.as_int() / page_size() * 8, SEEK_SET); |
| ASSERT(t, r >= 0); |
| while (ptr != km.end()) { |
| uint64_t pfn; |
| r = read(pagemap.get(), &pfn, sizeof(pfn)); |
| ASSERT(t, r == sizeof(pfn)); |
| // If the page is physically present (bit 63) or in swap (bit 62) |
| // then it was modified by the kernel and we need to record it. |
| if (pfn & ((1ULL << 63) | (1ULL << 62))) { |
| pages_to_record.push_back(ptr); |
| } |
| ptr += page_size(); |
| } |
| } |
| } |
| |
| for (auto& p : pages_to_record) { |
| t->record_remote(p, page_size()); |
| } |
| |
| // Patch LD_PRELOAD and VDSO after saving the mappings. Replay will apply |
| // patches to the saved mappings. |
| t->vm()->monkeypatcher().patch_after_exec(t); |
| |
| init_scratch_memory(t, FIXED_ADDRESS); |
| } |
| |
| static bool is_writable(RecordTask* t, int fd) { |
| struct stat lst = t->lstat_fd(fd); |
| return (lst.st_mode & S_IWUSR) != 0; |
| } |
| |
| // Returns true if the fd used to map the file is writable and thus needs |
| // monitoring. |
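| // We scan /proc/<tid>/fd for every task in the session to find all other
| // open fds that refer to the same file; writable ones are returned in
| // extra_fds so they can be monitored too.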
| static bool monitor_fd_for_mapping(RecordTask* mapped_t, int mapped_fd, const struct stat& file, |
| vector<TraceRemoteFd>& extra_fds) { |
| unordered_set<FdTable*> tables; |
| bool found_our_mapping = false; |
| bool our_mapping_writable = false; |
| auto mapped_table = mapped_t->fd_table(); |
| for (auto& ts : mapped_t->session().tasks()) { |
| auto rt = static_cast<RecordTask*>(ts.second); |
| if (rt->already_exited()) { |
| continue; |
| } |
| auto table = rt->fd_table(); |
| if (tables.find(table.get()) != tables.end()) { |
| continue; |
| } |
| |
| char buf[100]; |
| sprintf(buf, "/proc/%d/fd", rt->tid); |
| DIR* dir = opendir(buf); |
| if (!dir && (errno == EACCES || errno == ENOENT)) { |
| LOG(warn) << "Task must have exited out from underneath us. Skipping it"; |
| continue; |
| } |
| ASSERT(rt, dir) << "Can't open fd directory " << buf; |
| tables.insert(table.get()); |
| |
| struct dirent* d; |
| errno = 0; |
| vector<string> names; |
| while ((d = readdir(dir)) != nullptr) { |
| char* end; |
| int fd = strtol(d->d_name, &end, 10); |
| if (*end) { |
| // Some kind of parse error |
| continue; |
| } |
| struct stat fd_stat = rt->stat_fd(fd); |
| if (fd_stat.st_dev != file.st_dev || fd_stat.st_ino != file.st_ino) { |
| // Not our file |
| continue; |
| } |
| bool writable = is_writable(rt, fd); |
| if (table == mapped_table && fd == mapped_fd) { |
| // This is what we're using to do the mmap. Don't put it in extra_fds. |
| found_our_mapping = true; |
| our_mapping_writable = writable; |
| continue; |
| } |
| if (!writable) { |
| // Ignore non-writable fds since they can't modify memory |
| continue; |
| } |
| extra_fds.push_back({ rt->tid, fd }); |
| } |
| ASSERT(rt, !errno) << "Can't read fd directory " << buf; |
| closedir(dir); |
| } |
| ASSERT(mapped_t, found_our_mapping) << "Can't find fd for mapped file"; |
| return our_mapping_writable; |
| } |
| |
| // The returned hole offsets are relative to 'offset' |
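| // We enumerate holes with lseek(SEEK_HOLE)/lseek(SEEK_DATA) so that sparse
| // regions of the file don't have to be recorded as literal data.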
| static vector<WriteHole> find_holes(RecordTask* t, int desc, uint64_t offset, uint64_t size) { |
| vector<WriteHole> ret; |
| ScopedFd fd = t->open_fd(desc, O_RDONLY); |
| if (!fd.is_open()) { |
| return ret; |
| } |
| uint64_t file_start = offset; |
| uint64_t file_end = offset + size; |
| while (offset < file_end) { |
| off64_t r = lseek(fd, offset, SEEK_HOLE); |
| if (r < 0) { |
| // SEEK_HOLE not supported? |
| return ret; |
| } |
| uint64_t hole = (uint64_t)r; |
| ASSERT(t, hole >= offset) << "Found hole at " << hole << " which is before " << offset; |
| if (hole >= file_end) { |
| return ret; |
| } |
| r = lseek(fd, hole, SEEK_DATA); |
| if (r < 0) { |
| if (errno == ENXIO) { |
| r = file_end; |
| } else { |
| return ret; |
| } |
| } else if (r == 0) { |
| // The file has no data. |
| r = file_end; |
| } |
| uint64_t data = min((uint64_t)r, file_end); |
| ASSERT(t, data > hole) << "Found data at " << data << " which should be after hole " << hole |
| << "; file end " << file_end << ", data offset " << r; |
| ret.push_back({ hole - file_start, data - hole }); |
| offset = data; |
| } |
| return ret; |
| } |
| |
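| // Handle a completed, successful file-backed or anonymous mmap: update our
| // address-space model, emit the mapping (and its data, when needed) to the
| // trace, and arrange monitoring of writable fds for MAP_SHARED files.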
| static void process_mmap(RecordTask* t, size_t length, int prot, int flags, |
| int fd, off64_t offset) { |
| if (t->regs().syscall_failed()) { |
| // We purely emulate failed mmaps. |
| return; |
| } |
| |
| size_t size = ceil_page_size(length); |
| remote_ptr<void> addr = t->regs().syscall_result(); |
| if (flags & MAP_ANONYMOUS) { |
| KernelMapping km; |
| if (!(flags & MAP_SHARED)) { |
| // Anonymous mappings are by definition not backed by any file-like |
| // object, and are initialized to zero, so there's no nondeterminism to |
| // record. |
| km = t->vm()->map(t, addr, size, prot, flags, 0, string()); |
| } else { |
| ASSERT(t, !(flags & MAP_GROWSDOWN)); |
| // Read the kernel's mapping. There doesn't seem to be any other way to |
| // get the correct device/inode numbers. Fortunately anonymous shared |
| // mappings are rare. |
| KernelMapping kernel_info = t->vm()->read_kernel_mapping(t, addr); |
| km = t->vm()->map(t, addr, size, prot, flags, 0, kernel_info.fsname(), |
| kernel_info.device(), kernel_info.inode()); |
| } |
| auto d = t->trace_writer().write_mapped_region(t, km, km.fake_stat(), km.fsname(), vector<TraceRemoteFd>()); |
| ASSERT(t, d == TraceWriter::DONT_RECORD_IN_TRACE); |
| return; |
| } |
| |
| ASSERT(t, fd >= 0) << "Valid fd required for file mapping"; |
| ASSERT(t, !(flags & MAP_GROWSDOWN)); |
| |
| bool effectively_anonymous = false; |
| auto st = t->stat_fd(fd); |
| string our_file_name = t->proc_fd_path(fd);
| string tracee_file_name = t->file_name_of_fd(fd); |
| if (MAJOR(st.st_rdev) == 1 && |
| MINOR(st.st_rdev) == 5) { |
| // mmapping /dev/zero is equivalent to MAP_ANONYMOUS, just more annoying. |
| // Grab the device/inode from the kernel mapping so that it will be unique.
| KernelMapping kernel_synthetic_info = t->vm()->read_kernel_mapping(t, addr); |
| st.st_dev = kernel_synthetic_info.device(); |
| st.st_ino = kernel_synthetic_info.inode(); |
| our_file_name = tracee_file_name = kernel_synthetic_info.fsname(); |
| effectively_anonymous = true; |
| } |
| |
| KernelMapping km = |
| t->vm()->map(t, addr, size, prot, flags, offset, tracee_file_name, st.st_dev, |
| st.st_ino, unique_ptr<struct stat>(new struct stat(st))); |
| |
| bool adjusted_size = false; |
| if (!st.st_size && !S_ISREG(st.st_mode)) { |
| // Some device files are mmappable but have zero size. Increasing the |
| // size here is safe even if the mapped size is greater than the real size. |
| st.st_size = offset + size; |
| adjusted_size = true; |
| } |
| |
| vector<TraceRemoteFd> extra_fds; |
| bool monitor_this_fd = false; |
| if ((flags & MAP_SHARED) && !effectively_anonymous) { |
| monitor_this_fd = monitor_fd_for_mapping(t, fd, st, extra_fds); |
| } |
| if (t->trace_writer().write_mapped_region(t, km, st, our_file_name, extra_fds, |
| TraceWriter::SYSCALL_MAPPING, |
| !monitor_this_fd) == |
| TraceWriter::RECORD_IN_TRACE) { |
| off64_t end = (off64_t)st.st_size - km.file_offset_bytes(); |
| off64_t nbytes = min(end, (off64_t)km.size()); |
| vector<WriteHole> holes = find_holes(t, fd, km.file_offset_bytes(), (uint64_t)nbytes); |
| ssize_t nread = t->record_remote_fallible(addr, nbytes, holes); |
| if (!adjusted_size && nread != nbytes) { |
| // If we adjusted the size, we're not guaranteed that the bytes we're |
| // reading are actually valid (it could actually have been a zero-sized |
| // file). |
| auto st2 = t->stat_fd(fd); |
| AddressSpace::print_process_maps(t); |
| if (system("df -h")) { |
| // Couldn't run 'df'... |
| } |
| ASSERT(t, false) << "Failed to read expected mapped data at " << km |
| << "; expected " << nbytes << " bytes, got " << nread << " bytes," |
| << " got file size " << st.st_size << " before and " << st2.st_size |
| << " after; is filesystem full?"; |
| } |
| } |
| if ((flags & MAP_SHARED) && !effectively_anonymous) { |
| // Setting up MmappedFileMonitor may trigger updates to syscallbuf_fds_disabled |
| // in the tracee, recording memory records. Those should be recorded now, after the |
| // memory region data itself. Needs to be consistent with replay_syscall. |
| if (monitor_this_fd) { |
| extra_fds.push_back({ t->tid, fd });
| } |
| for (auto& f : extra_fds) { |
| auto rt = t->session().find_task(f.tid); |
| if (rt->fd_table()->is_monitoring(f.fd)) { |
| auto type = rt->fd_table()->get_monitor(f.fd)->type(); |
| if (type == FileMonitor::Type::Mmapped) { |
| ((MmappedFileMonitor*)rt->fd_table()->get_monitor(f.fd))->revive(); |
| } else if (type == FileMonitor::Type::ODirect) { |
| rt->fd_table()->replace_monitor(rt, f.fd, new MmappedFileMonitor(rt, f.fd)); |
| } else { |
| ASSERT(rt, false) |
| << "Expected monitor type Mmapped | ODirect for fd " << f.fd << ", got monitor type " |
| << file_monitor_type_name(type); |
| } |
| } else { |
| rt->fd_table()->add_monitor(rt, f.fd, new MmappedFileMonitor(rt, f.fd)); |
| } |
| } |
| |
| if ((prot & PROT_WRITE)) { |
| LOG(debug) << tracee_file_name << |
| " is SHARED|writable; that's not handled " |
| "correctly yet. Optimistically hoping it's not " |
| "written by programs outside the rr tracee " |
| "tree."; |
| } |
| } |
| |
| // We don't want to patch MAP_SHARED files. In the best case we'd end up
| // crashing at an assertion; in the worst case we'd end up modifying the
| // underlying file.
| if (!(flags & MAP_SHARED)) { |
| t->vm()->monkeypatcher().patch_after_mmap(t, addr, size, offset, fd, |
| Monkeypatcher::MMAP_SYSCALL); |
| } |
| |
| if ((prot & (PROT_WRITE | PROT_READ)) == PROT_READ && (flags & MAP_SHARED) && |
| !effectively_anonymous) { |
| MonitoredSharedMemory::maybe_monitor(t, tracee_file_name, |
| t->vm()->mapping_of(addr), fd, offset); |
| } |
| } |
| |
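| // Handle a completed, successful mremap: update the address-space model and
| // make sure the trace records the mapping at its new location.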
| static void process_mremap(RecordTask* t, remote_ptr<void> old_addr, |
| size_t old_length, size_t new_length, int flags) { |
| if (t->regs().syscall_failed()) { |
| // We purely emulate failed mremaps. |
| return; |
| } |
| |
| size_t old_size = ceil_page_size(old_length); |
| size_t new_size = ceil_page_size(new_length); |
| remote_ptr<void> new_addr = t->regs().syscall_result(); |
| |
| t->vm()->remap(t, old_addr, old_size, new_addr, new_size, flags); |
| AddressSpace::Mapping m = t->vm()->mapping_of(new_addr); |
| KernelMapping km = |
| m.map.subrange(new_addr, new_addr + min(new_size, old_size)); |
| struct stat st = m.mapped_file_stat ? *m.mapped_file_stat : km.fake_stat(); |
| |
| // Make sure that the trace records the mapping at the new location, even |
| // if the mapping didn't grow. |
| auto r = t->trace_writer().write_mapped_region(t, km, st, km.fsname(), |
| vector<TraceRemoteFd>(), |
| TraceWriter::REMAP_MAPPING); |
| ASSERT(t, r == TraceWriter::DONT_RECORD_IN_TRACE); |
| if (old_size >= new_size) { |
| return; |
| } |
| |
| // Now record the new part of the mapping. |
| km = m.map.subrange(new_addr + old_size, new_addr + new_size); |
| if (!st.st_size) { |
| // Some device files are mmappable but have zero size. Increasing the |
| // size here is safe even if the mapped size is greater than the real size. |
| st.st_size = m.map.file_offset_bytes() + new_size; |
| } |
| |
| if (t->trace_writer().write_mapped_region(t, km, st, km.fsname(), |
| vector<TraceRemoteFd>()) == |
| TraceWriter::RECORD_IN_TRACE) { |
| off64_t end = max<off64_t>(st.st_size - km.file_offset_bytes(), 0); |
| // Allow failure; the underlying file may have true zero size, in which |
| // case this may try to record unmapped memory. |
| t->record_remote_fallible(km.start(), min(end, (off64_t)km.size())); |
| } |
| |
| // If the original mapping was monitored, we'll continue monitoring it |
| // automatically. |
| } |
| |
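| // Handle a completed, successful shmat: map the SysV shm segment into our
| // model and record its current contents in the trace.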
| static void process_shmat(RecordTask* t, int shmid, int shm_flags, |
| remote_ptr<void> addr) { |
| if (t->regs().syscall_failed()) { |
| // We purely emulate failed shmats. |
| return; |
| } |
| |
| struct shmid64_ds ds; |
| int ret = _shmctl(shmid, IPC_STAT, &ds); |
| msan_unpoison(&ds, sizeof(ds));
| ASSERT(t, !ret) << "shmid should be readable by rr since rr has the same " |
| "UID as tracees"; |
| size_t size = ceil_page_size(ds.shm_segsz); |
| |
| int prot = shm_flags_to_mmap_prot(shm_flags); |
| int flags = MAP_SHARED; |
| |
| // Read the kernel's mapping for the shm segment. There doesn't seem to be |
| // any other way to get the correct device number. (The inode number seems to |
| // be the shm key.) This should be OK since SysV shmem is not used very much |
| // and reading /proc/<pid>/maps should be reasonably cheap. |
| KernelMapping kernel_info = t->vm()->read_kernel_mapping(t, addr); |
| KernelMapping km = |
| t->vm()->map(t, addr, size, prot, flags, 0, kernel_info.fsname(), |
| kernel_info.device(), kernel_info.inode()); |
| t->vm()->set_shm_size(km.start(), km.size()); |
| auto disposition = |
| t->trace_writer().write_mapped_region(t, km, km.fake_stat(), km.fsname(), vector<TraceRemoteFd>()); |
| ASSERT(t, disposition == TraceWriter::RECORD_IN_TRACE); |
| t->record_remote(addr, size); |
| |
| LOG(debug) << "Optimistically hoping that SysV segment is not used outside " |
| "of tracees"; |
| } |
| |
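| // After a syscall that produced a connected socket, capture the socket's
| // local and remote addresses (via read_proc_net_socket_addresses) and
| // stash them on the syscall event.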
| static void maybe_process_new_socket(RecordTask* t, int fd) { |
| if (t->regs().syscall_failed()) { |
| return; |
| } |
| |
| std::array<typename NativeArch::sockaddr_storage, 2> socket_addresses; |
| if (!read_proc_net_socket_addresses(t, fd, socket_addresses)) { |
| return; |
| } |
| |
| auto& syscall = t->ev().Syscall(); |
| syscall.socket_addrs = make_shared<std::array<typename NativeArch::sockaddr_storage, 2>>(std::move(socket_addresses)); |
| } |
| |
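| // Build extra human-readable detail for the expect_errno assertion failure
| // message, identifying which unknown operation was requested.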
| template <typename Arch> |
| static string extra_expected_errno_info(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| stringstream ss; |
| switch (syscall_state.expect_errno) { |
| case ENOSYS: |
| ss << "; execution of syscall unsupported by rr"; |
| break; |
| case EINVAL: |
| switch (t->regs().original_syscallno()) { |
| case Arch::ioctl: { |
| int request = (int)t->regs().arg2_signed(); |
| if (request == SIOCETHTOOL) { |
| auto ifreq_ptr = remote_ptr<typename Arch::ifreq>(t->regs().arg3()); |
| auto ifreq = t->read_mem(ifreq_ptr); |
| remote_ptr<void> cmd_ptr = ifreq.ifr_ifru.ifru_data.rptr(); |
| auto cmd = t->read_mem(cmd_ptr.cast<uint32_t>()); |
| ss << "; unknown ETHTOOL command " << cmd; |
| } else { |
| int type = _IOC_TYPE(request); |
| int nr = _IOC_NR(request); |
| int dir = _IOC_DIR(request); |
| int size = _IOC_SIZE(request); |
| ss << "; Unknown ioctl(" << HEX(request) << "): type:" << HEX(type) |
| << " nr:" << HEX(nr) << " dir:" << HEX(dir) << " size:" << size |
| << " addr:" << HEX(t->regs().arg3()); |
| } |
| break; |
| } |
| case Arch::fcntl: |
| case Arch::fcntl64: |
| ss << "; unknown fcntl(" << HEX((int)t->regs().arg2_signed()) << ")"; |
| break; |
| case Arch::prctl: { |
| int request = (int)t->regs().arg1_signed(); |
| if (request == PR_SET_MM) { |
| ss << "; unknown prctl(PR_SET_MM, " << HEX((int)t->regs().arg2_signed()) << ")"; |
| } else { |
| ss << "; unknown prctl(" << HEX(request) << ")"; |
| } |
| break; |
| } |
| case Arch::arch_prctl: |
| ss << "; unknown arch_prctl(" << HEX((int)t->regs().arg1_signed()) |
| << ")"; |
| break; |
| case Arch::keyctl: |
| ss << "; unknown keyctl(" << HEX((int)t->regs().arg1_signed()) << ")"; |
| break; |
| case Arch::socketcall: |
| ss << "; unknown socketcall(" << HEX((int)t->regs().arg1_signed()) |
| << ")"; |
| break; |
| case Arch::ipc: |
| ss << "; unknown ipc(" << HEX((int)t->regs().arg1_signed()) << ")"; |
| break; |
| case Arch::futex_time64: |
| case Arch::futex: |
| ss << "; unknown futex(" |
| << HEX((int)t->regs().arg2_signed() & FUTEX_CMD_MASK) << ")"; |
| break; |
| case Arch::waitid: |
| ss << "; unknown waitid(" << HEX((idtype_t)t->regs().arg1()) << ")"; |
| break; |
| case Arch::seccomp: |
| ss << "; unknown seccomp(" << HEX((unsigned int)t->regs().arg1()) |
| << ")"; |
| break; |
| case Arch::madvise: |
| ss << "; unknown madvise(" << (int)t->regs().arg3() << ")"; |
| break; |
| case Arch::bpf: |
| ss << "; unknown bpf(cmd=" << (int)t->regs().arg1() << ")"; |
| break; |
| } |
| break; |
| case EIO: |
| switch (t->regs().original_syscallno()) { |
| case Arch::ptrace: |
| ss << "; unsupported ptrace(" << HEX((int)t->regs().arg1()) << " [" |
| << ptrace_req_name<Arch>((int)t->regs().arg1_signed()) << "])"; |
| break; |
| } |
| break; |
| } |
| return ss.str(); |
| } |
| |
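| // True if `fd` is a terminal and its tty name matches `pathname`.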
| static bool is_rr_fd_terminal(int fd, const string& pathname) { |
| char buf[PATH_MAX]; |
| if (ttyname_r(fd, buf, sizeof(buf))) { |
| return false; |
| } |
| buf[sizeof(buf) - 1] = 0; |
| return pathname == buf; |
| } |
| |
| static bool is_rr_terminal(const string& pathname) { |
| if (pathname == "/dev/tty") { |
| // XXX the tracee's /dev/tty could refer to a tty other than |
| // the recording tty, in which case output should not be |
| // redirected. That's not too bad, replay will still work, just |
| // with some spurious echoes. |
| return true; |
| } |
| |
| return is_rr_fd_terminal(STDIN_FILENO, pathname) || |
| is_rr_fd_terminal(STDOUT_FILENO, pathname) || |
| is_rr_fd_terminal(STDERR_FILENO, pathname); |
| } |
| |
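| // Lazily open (and cache) a write-only fd to rr's own /dev/tty, used by
| // StdioMonitor to echo tracee terminal writes.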
| static int dev_tty_fd() { |
| static int fd = -1; |
| if (fd < 0) { |
| fd = open("/dev/tty", O_WRONLY); |
| } |
| return fd; |
| } |
| |
| template <typename Arch> |
| static void record_iovec_output(RecordTask* t, RecordTask* dest, |
| remote_ptr<typename Arch::iovec> piov, |
| uint32_t iov_cnt) { |
| // Ignore the syscall result; the kernel may have written more data than it
| // indicates.
| // See https://bugzilla.kernel.org/show_bug.cgi?id=113541 |
| auto iovs = t->read_mem(piov, iov_cnt); |
| for (auto& iov : iovs) { |
| dest->record_remote_writable(iov.iov_base, iov.iov_len); |
| } |
| } |
| |
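| // True if every task sharing this address space has already exited; such
| // stale address spaces shouldn't count for is_mapped_shared below.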
| static bool all_tasks_exited(AddressSpace* vm) { |
| for (Task* t : vm->task_set()) { |
| if (!t->already_exited()) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| static bool is_mapped_shared(RecordTask* t, const struct stat& st) { |
| for (AddressSpace* vm : t->session().vms()) { |
| if (all_tasks_exited(vm)) { |
| continue; |
| } |
| for (auto& m : vm->maps()) { |
| if ((m.map.flags() & MAP_SHARED) && |
| m.mapped_file_stat && m.mapped_file_stat->st_dev == st.st_dev && |
| m.mapped_file_stat->st_ino == st.st_ino) { |
| LOG(debug) << "is_mapped_shared is shared: " |
| << st.st_dev << " " << st.st_ino; |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| // Returns the opened file's path. The path may name a blacklisted file, in
| // which case the caller must not do anything with it.
| static string handle_opened_file(RecordTask* t, int fd, int flags) { |
| string pathname = t->file_name_of_fd(fd); |
| struct stat st = t->stat_fd(fd); |
| |
| // This must be kept in sync with replay_syscall's handle_opened_files. |
| FileMonitor* file_monitor = nullptr; |
| if (is_writable(t, fd) && is_mapped_shared(t, st)) { |
| // This is quite subtle. Because open(2) is ALLOW_SWITCH, we could have been |
| // descheduled after entering the syscall we're now exiting. If that happened, |
| // and another task did a shared mapping of this file while we were suspended, |
| // it would have trawled the proc filesystem looking for other open fds for |
| // the same file. If this syscall had been completed in the kernel by then, |
| // it will have already installed this monitor for us. So we must allow this |
| // benign race. |
| if (t->fd_table()->is_monitoring(fd)) { |
| LOG(debug) << "Already monitoring " << fd; |
| ASSERT(t, |
| t->fd_table()->get_monitor(fd)->type() == |
| FileMonitor::Type::Mmapped); |
| } else { |
| // The normal case, we are unmonitored because we are a new file. |
| LOG(info) << "Installing MmappedFileMonitor for " << fd; |
| file_monitor = new MmappedFileMonitor(t, fd); |
| } |
| } else if (is_rr_terminal(pathname)) { |
| // This will let rr event annotations echo to the terminal. It will also |
| // ensure writes to this fd are not syscall-buffered. |
| LOG(info) << "Installing StdioMonitor for " << fd; |
| file_monitor = new StdioMonitor(dev_tty_fd()); |
| pathname = "terminal"; |
| } else if (is_proc_mem_file(pathname.c_str())) { |
| LOG(info) << "Installing ProcMemMonitor for " << fd; |
| file_monitor = new ProcMemMonitor(t, pathname); |
| } else if (is_proc_fd_dir(pathname.c_str())) { |
| LOG(info) << "Installing ProcFdDirMonitor for " << fd; |
| file_monitor = new ProcFdDirMonitor(t, pathname); |
| } else if (is_sys_cpu_online_file(pathname.c_str())) { |
| LOG(info) << "Installing SysCpuMonitor for " << fd; |
| file_monitor = new SysCpuMonitor(t, pathname); |
| } else if (is_proc_stat_file(pathname.c_str())) { |
| LOG(info) << "Installing ProcStatMonitor for " << fd; |
| file_monitor = new ProcStatMonitor(t, pathname); |
| } else if (is_rr_page_lib(pathname.c_str())) { |
| LOG(info) << "Installing RRPageMonitor for " << fd; |
| file_monitor = new RRPageMonitor(); |
| } else if (flags & O_DIRECT) { |
| // O_DIRECT can impose unknown alignment requirements, in which case |
| // syscallbuf records will not be properly aligned and will cause I/O |
| // to fail. Disable syscall buffering for O_DIRECT files. |
| LOG(info) << "Installing ODirectFileMonitor for O_DIRECT " << fd; |
| file_monitor = new ODirectFileMonitor(); |
| } |
| |
| if (file_monitor) { |
| // Write absolute file name |
| auto& syscall = t->ev().Syscall(); |
| syscall.opened.push_back({ pathname, fd, st.st_dev, st.st_ino }); |
| t->fd_table()->add_monitor(t, fd, file_monitor); |
| } |
| return pathname; |
| } |
| |
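| // Walk the control-message chain of a received msghdr looking for
| // SCM_RIGHTS messages; every fd passed in via SCM_RIGHTS is treated like a
| // freshly opened file so the appropriate monitors get installed.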
| template <typename Arch> |
| static void check_scm_rights_fd(RecordTask* t, typename Arch::msghdr& msg) { |
| if (msg.msg_controllen < sizeof(typename Arch::cmsghdr)) { |
| return; |
| } |
| auto data = t->read_mem(msg.msg_control.rptr().template cast<uint8_t>(), |
| msg.msg_controllen); |
| size_t index = 0; |
| while (true) { |
| auto cmsg = reinterpret_cast<typename Arch::cmsghdr*>(data.data() + index); |
| if (cmsg->cmsg_len < sizeof(*cmsg) || |
| index + Arch::cmsg_align(cmsg->cmsg_len) > data.size()) { |
| break; |
| } |
| if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { |
| int* fds = static_cast<int*>(Arch::cmsg_data(cmsg)); |
| int fd_count = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(int); |
| for (int i = 0; i < fd_count; ++i) { |
| handle_opened_file(t, fds[i], 0); |
| } |
| } |
| index += Arch::cmsg_align(cmsg->cmsg_len); |
| if (index + sizeof(*cmsg) > data.size()) { |
| break; |
| } |
| } |
| } |
| |
| static void fake_gcrypt_file(RecordTask* t, Registers* r) { |
| // We hijacked this call to deal with /etc/gcrypt/hwf.deny. |
| char fname[255]; |
| snprintf(fname, sizeof(fname), "rr-gcrypt-hwf-deny-%d", getpid()); |
| ScopedFd fd(open_memory_file(fname)); |
| |
| struct stat dummy; |
| if (!stat("/etc/gcrypt/hwf.deny", &dummy)) { |
| // Copy the contents into our temporary file |
| ScopedFd existing("/etc/gcrypt/hwf.deny", O_RDONLY); |
| if (!copy_file(fd, existing)) { |
| FATAL() << "Can't copy file"; |
| } |
| } |
| |
| static const char disable_rdrand[] = "\nintel-rdrand\n"; |
| write_all(fd, disable_rdrand, sizeof(disable_rdrand) - 1); |
| |
| // Now open the file in the child. |
| int child_fd; |
| { |
| AutoRemoteSyscalls remote(t); |
| lseek(fd, 0, SEEK_SET); |
| child_fd = remote.infallible_send_fd_if_alive(fd); |
| if (child_fd < 0) { |
| // Tracee died. |
| return; |
| } |
| } |
| |
| // And hand out our fake file |
| r->set_syscall_result(child_fd); |
| } |
| |
| template <typename Arch> |
| static void rec_process_syscall_arch(RecordTask* t, |
| TaskSyscallState& syscall_state) { |
| int syscallno = t->ev().Syscall().number; |
| |
| if (t->regs().original_syscallno() == SECCOMP_MAGIC_SKIP_ORIGINAL_SYSCALLNO) { |
| // rr vetoed this syscall. Don't do any post-processing. |
| return; |
| } |
| |
| LOG(debug) << t->tid << ": processing: " << t->ev() |
| << " -- time: " << t->trace_time(); |
| |
| if (remote_ptr<const struct syscallbuf_record> rec = t->desched_rec()) { |
| // If the syscallbuf has already been unmapped, there's no need to record |
| // the entry. |
| if (t->syscallbuf_child) { |
| t->record_remote(REMOTE_PTR_FIELD(rec, extra_data), |
| t->read_mem(REMOTE_PTR_FIELD(rec, size)) - |
| sizeof(struct syscallbuf_record)); |
| } |
| return; |
| } |
| |
| if (syscall_state.expect_errno) { |
| if (syscall_state.expect_errno == EINVAL && syscallno == Arch::ioctl && |
| t->regs().syscall_result_signed() == -ENOTTY) { |
| // An ioctl not supported by rr was called, but the device doesn't
| // support it either, so we can safely ignore it.
| return; |
| } |
| ASSERT(t, t->regs().syscall_result_signed() == -syscall_state.expect_errno) |
| << "Expected " << errno_name(syscall_state.expect_errno) << " for '" |
| << syscall_name(syscallno, Arch::arch()) << "' but got result " |
| << t->regs().syscall_result_signed() << " (errno " |
| << errno_name(-t->regs().syscall_result_signed()) << ")" |
| << extra_expected_errno_info<Arch>(t, syscall_state); |
| if (syscallno == Arch::execve || syscallno == Arch::execveat) { |
| t->session().scheduler().did_exit_execve(t); |
| } |
| return; |
| } |
| |
| // Here we handle syscalls that need work that can only happen after the |
| // syscall completes --- and that our TaskSyscallState infrastructure can't |
| // handle. |
| switch (syscallno) { |
| case Arch::vfork: |
| case Arch::fork: |
| case Arch::clone: |
| if ((syscallno == Arch::vfork || |
| (syscallno == Arch::clone && (t->regs().orig_arg1() & CLONE_VFORK))) && |
| (t->emulated_ptrace_options & PTRACE_O_TRACEVFORKDONE)) { |
| t->emulate_ptrace_stop( |
| WaitStatus::for_ptrace_event(PTRACE_EVENT_VFORK_DONE)); |
| } |
| if (Arch::is_x86ish()) { |
| // On a 3.19.0-39-generic #44-Ubuntu kernel we have observed clone() |
| // clearing the parity flag internally. |
| Registers r = t->regs(); |
| r.set_flags(syscall_state.syscall_entry_registers.flags()); |
| t->set_regs(r); |
| } |
| break; |
| |
| case Arch::execve: |
| case Arch::execveat: |
| process_execve(t, syscall_state); |
| break; |
| |
| case Arch::brk: { |
| remote_ptr<void> old_brk = ceil_page_size(t->vm()->current_brk()); |
| remote_ptr<void> new_brk = ceil_page_size(t->regs().syscall_result()); |
| KernelMapping km; |
| if (old_brk < new_brk) { |
| // Read the kernel's mapping. There doesn't seem to be any other way to |
| // get the correct prot bits for heaps. Usually it's READ|WRITE but |
| // there seem to be exceptions depending on system settings. |
| KernelMapping kernel_info = t->vm()->read_kernel_mapping(t, old_brk); |
| ASSERT(t, kernel_info.device() == KernelMapping::NO_DEVICE); |
| ASSERT(t, kernel_info.inode() == KernelMapping::NO_INODE); |
| km = kernel_info.subrange(old_brk, new_brk); |
| } else { |
| // Write a dummy KernelMapping that indicates an unmap |
| km = KernelMapping(new_brk, old_brk, string(), KernelMapping::NO_DEVICE, |
| KernelMapping::NO_INODE, 0, 0, 0); |
| } |
| auto d = t->trace_writer().write_mapped_region(t, km, km.fake_stat(), km.fsname(), vector<TraceRemoteFd>()); |
| ASSERT(t, d == TraceWriter::DONT_RECORD_IN_TRACE); |
| t->vm()->brk(t, t->regs().syscall_result(), km.prot()); |
| break; |
| } |
| |
| case Arch::mmap: |
| switch (Arch::mmap_semantics) { |
| case Arch::StructArguments: { |
| auto args = t->read_mem( |
| remote_ptr<typename Arch::mmap_args>(t->regs().orig_arg1())); |
| process_mmap(t, args.len, args.prot, args.flags, args.fd, |
| args.offset); |
| break; |
| } |
| case Arch::RegisterArguments: { |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_arg4(syscall_state.syscall_entry_registers.arg4_signed()); |
| process_mmap(t, (size_t)r.arg2(), (int)r.arg3_signed(), |
| (int)r.arg4_signed(), (int)r.arg5_signed(), |
| ((off_t)r.arg6_signed())); |
| r.set_arg2(syscall_state.syscall_entry_registers.arg2_signed()); |
| r.set_arg3(syscall_state.syscall_entry_registers.arg3_signed()); |
| r.set_arg5(syscall_state.syscall_entry_registers.arg5_signed()); |
| t->set_regs(r); |
| break; |
| } |
| } |
| break; |
| |
| case Arch::mmap2: { |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_arg4(syscall_state.syscall_entry_registers.arg4_signed()); |
| process_mmap(t, (size_t)r.arg2(), (int)r.arg3_signed(), |
| (int)r.arg4_signed(), (int)r.arg5_signed(), |
| (off_t)r.arg6_signed() * 4096); |
| r.set_arg2(syscall_state.syscall_entry_registers.arg2_signed()); |
| r.set_arg3(syscall_state.syscall_entry_registers.arg3_signed()); |
| r.set_arg5(syscall_state.syscall_entry_registers.arg5_signed()); |
| t->set_regs(r); |
| break; |
| } |
| |
| case Arch::mremap: |
| process_mremap(t, t->regs().orig_arg1(), t->regs().arg2(), t->regs().arg3(), |
| (int)t->regs().arg4_signed()); |
| break; |
| |
| case Arch::shmat: |
| process_shmat(t, (int)t->regs().orig_arg1_signed(), |
| (int)t->regs().arg3_signed(), t->regs().syscall_result()); |
| break; |
| |
| case Arch::ipc: |
| switch ((int)t->regs().orig_arg1_signed()) { |
| case SHMAT: { |
| auto out_ptr = t->read_mem( |
| remote_ptr<typename Arch::unsigned_long>(t->regs().arg4())); |
| process_shmat(t, (int)t->regs().arg2_signed(), |
| (int)t->regs().arg3_signed(), out_ptr); |
| break; |
| } |
| default: |
| break; |
| } |
| break; |
| |
| case Arch::bpf: |
| if (!t->regs().syscall_failed()) { |
| switch ((int)t->regs().orig_arg1()) { |
| case BPF_MAP_CREATE: { |
| int fd = t->regs().syscall_result_signed(); |
| auto attr = t->read_mem(remote_ptr<typename Arch::bpf_attr>(t->regs().arg2())); |
| t->fd_table()->add_monitor(t, fd, new BpfMapMonitor(attr.key_size, attr.value_size)); |
| break; |
| } |
| default: |
| break; |
| } |
| } |
| break; |
| |
| case Arch::clock_nanosleep: |
| case Arch::nanosleep: { |
| /* If the sleep completes, the kernel doesn't |
| * write back to the remaining-time |
| * argument. */ |
| if (!(int)t->regs().syscall_result_signed()) { |
| syscall_state.write_back = TaskSyscallState::NO_WRITE_BACK; |
| } |
| break; |
| } |
| |
| case Arch::perf_event_open: |
| if (!t->regs().syscall_failed()) { |
| int fd = t->regs().syscall_result_signed(); |
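| // An original syscallno of inotify_init1 here means the entry-time code
| // rewrote a virtualized perf_event_open into inotify_init1 to obtain a
| // dummy fd; restore the original registers and attach a
| // VirtualPerfCounterMonitor to that fd.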
| if (t->regs().original_syscallno() == Arch::inotify_init1) { |
| Registers r = t->regs(); |
| r.set_original_syscallno( |
| syscall_state.syscall_entry_registers.original_syscallno()); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| t->set_regs(r); |
| auto attr = |
| t->read_mem(remote_ptr<struct perf_event_attr>(t->regs().orig_arg1())); |
| t->fd_table()->add_monitor(t, |
| fd, new VirtualPerfCounterMonitor( |
| t, t->session().find_task((pid_t)t->regs().arg2_signed()), |
| attr)); |
| } else if (t->ip() != t->vm()->privileged_traced_syscall_ip().increment_by_syscall_insn_length(t->arch())) {
| // This perf_event_open was not issued by the syscallbuf, so monitor the
| // new fd. perf_event_open calls from the syscallbuf are ignored here;
| // we'll attach a PreserveFileMonitor to such an fd if it stays open.
| t->fd_table()->add_monitor(t, fd, new NonvirtualPerfCounterMonitor()); |
| } |
| } |
| break; |
| |
| case Arch::connect: { |
| // Restore the registers that we may have altered. |
| Registers r = t->regs(); |
| if (r.original_syscallno() == Arch::gettid) { |
| // We hijacked this call to deal with blacklisted sockets |
| r.set_original_syscallno(Arch::connect); |
| r.set_syscall_result(-EACCES); |
| t->set_regs(r); |
| } |
| maybe_process_new_socket(t, r.orig_arg1()); |
| break; |
| } |
| |
| case Arch::accept: |
| case Arch::accept4: { |
| Registers r = t->regs(); |
| maybe_process_new_socket(t, r.syscall_result()); |
| break; |
| } |
| |
| case Arch::open: |
| case Arch::openat: { |
| Registers r = t->regs(); |
| if (r.syscall_failed()) { |
| uintptr_t path = syscallno == Arch::openat ? r.arg2() : r.orig_arg1(); |
| string pathname = t->read_c_str(remote_ptr<char>(path)); |
| if (is_gcrypt_deny_file(pathname.c_str())) { |
| fake_gcrypt_file(t, &r); |
| t->set_regs(r); |
| } |
| } else { |
| int fd = r.syscall_result_signed(); |
| int flags = syscallno == Arch::openat ? r.arg3() : r.arg2(); |
| string pathname = handle_opened_file(t, fd, flags); |
| bool gcrypt = is_gcrypt_deny_file(pathname.c_str()); |
| if (gcrypt || is_blacklisted_filename(pathname.c_str())) { |
| { |
| AutoRemoteSyscalls remote(t); |
| remote.infallible_close_syscall_if_alive(fd); |
| } |
| if (gcrypt) { |
| fake_gcrypt_file(t, &r); |
| } else { |
| LOG(warn) << "Cowardly refusing to open " << pathname; |
| r.set_syscall_result(-ENOENT); |
| } |
| t->set_regs(r); |
| } |
| } |
| break; |
| } |
| |
| case SYS_rrcall_notify_control_msg: { |
| auto msg = |
| t->read_mem(remote_ptr<typename Arch::msghdr>(t->regs().orig_arg1())); |
| check_scm_rights_fd<Arch>(t, msg); |
| break; |
| } |
| |
| case Arch::recvmsg: |
| if (!t->regs().syscall_failed()) { |
| auto msg = |
| t->read_mem(remote_ptr<typename Arch::msghdr>(t->regs().arg2())); |
| check_scm_rights_fd<Arch>(t, msg); |
| } |
| break; |
| |
| case Arch::recvmmsg_time64: |
| case Arch::recvmmsg: |
| if (!t->regs().syscall_failed()) { |
| int msg_count = (int)t->regs().syscall_result_signed(); |
| auto msgs = t->read_mem( |
| remote_ptr<typename Arch::mmsghdr>(t->regs().arg2()), msg_count); |
| for (auto& m : msgs) { |
| check_scm_rights_fd<Arch>(t, m.msg_hdr); |
| } |
| } |
| break; |
| |
| case Arch::sched_getaffinity: { |
| pid_t pid = (pid_t)t->regs().orig_arg1(); |
| if (!t->regs().syscall_failed() && (pid == 0 || pid == t->rec_tid)) { |
| if (t->regs().syscall_result() > sizeof(cpu_set_t)) { |
| LOG(warn) << "Don't understand kernel's sched_getaffinity result"; |
| } else { |
| t->write_bytes_helper(t->regs().arg3(), t->regs().syscall_result(), |
| &t->session().scheduler().pretend_affinity_mask()); |
| } |
| } |
| break; |
| } |
| |
| case Arch::setsockopt: { |
| // restore possibly-modified regs |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| t->set_regs(r); |
| break; |
| } |
| |
| case Arch::socketcall: { |
| // restore possibly-modified regs |
| Registers r = t->regs(); |
| if (r.original_syscallno() == Arch::gettid) { |
| // `connect` was suppressed |
| r.set_syscall_result(-EACCES); |
| } |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_original_syscallno( |
| syscall_state.syscall_entry_registers.original_syscallno()); |
| t->set_regs(r); |
| |
| if (!t->regs().syscall_failed()) { |
| switch ((int)t->regs().orig_arg1_signed()) { |
| case SYS_RECVMSG: { |
| auto args = t->read_mem( |
| remote_ptr<typename Arch::recvmsg_args>(t->regs().arg2())); |
| auto msg = t->read_mem(args.msg.rptr()); |
| check_scm_rights_fd<Arch>(t, msg); |
| break; |
| } |
| case SYS_RECVMMSG: { |
| auto args = t->read_mem( |
| remote_ptr<typename Arch::recvmmsg_args>(t->regs().arg2())); |
| int msg_count = (int)t->regs().syscall_result_signed(); |
| auto msgs = t->read_mem(args.msgvec.rptr(), msg_count); |
| for (auto& m : msgs) { |
| check_scm_rights_fd<Arch>(t, m.msg_hdr); |
| } |
| break; |
| } |
| case SYS_CONNECT: { |
| auto args = t->read_mem( |
| remote_ptr<typename Arch::connect_args>(t->regs().arg2())); |
| maybe_process_new_socket(t, args.sockfd); |
| break; |
| } |
| case SYS_ACCEPT: |
| case SYS_ACCEPT4: { |
| Registers r = t->regs(); |
| maybe_process_new_socket(t, r.syscall_result()); |
| break; |
| } |
| } |
| } |
| break; |
| } |
| |
| case Arch::process_vm_readv: |
| record_iovec_output<Arch>(t, t, t->regs().arg2(), t->regs().arg3()); |
| break; |
| |
| case Arch::process_vm_writev: { |
| RecordTask* dest = t->session().find_task(t->regs().orig_arg1()); |
| if (dest) { |
| record_iovec_output<Arch>(t, dest, t->regs().arg4(), t->regs().arg5()); |
| } |
| break; |
| } |
| |
| case Arch::fcntl: |
| case Arch::fcntl64: { |
| // Restore the registers that we may have altered. |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_arg2(syscall_state.syscall_entry_registers.arg2()); |
| r.set_arg3(syscall_state.syscall_entry_registers.arg3()); |
| t->set_regs(r); |
| |
| if (!r.syscall_failed() && r.arg3() == O_DIRECT) { |
| int fd = r.orig_arg1(); |
| // O_DIRECT can impose unknown alignment requirements, in which case |
| // syscallbuf records will not be properly aligned and will cause I/O |
| // to fail. Disable syscall buffering for O_DIRECT files. |
| // If it already has a monitor (e.g. somebody tries to O_DIRECT |
| // /proc/pid/mem or something) then we don't need to do anything,
| // since syscall buffering is already disabled.
| if (!t->fd_table()->get_monitor(fd)) { |
| LOG(info) << "Installing ODirectFileMonitor for O_DIRECT " << fd; |
| FileMonitor* file_monitor = new ODirectFileMonitor(); |
| t->fd_table()->add_monitor(t, fd, file_monitor); |
| } |
| } |
| break; |
| } |
| |
| case Arch::clone3: |
| case Arch::close_range: |
| case Arch::close: |
| case Arch::dup2: |
| case Arch::dup3: |
| case Arch::futex_time64: |
| case Arch::futex: |
| case Arch::ioctl: |
| case Arch::io_setup: |
| case Arch::io_uring_setup: |
| case Arch::madvise: |
| case Arch::memfd_create: |
| case Arch::mprotect: |
| case Arch::pkey_mprotect: |
| case Arch::pread64: |
| case Arch::preadv: |
| case Arch::ptrace: |
| case Arch::read: |
| case Arch::readv: |
| case Arch::rseq: |
| case Arch::sched_setaffinity: |
| case Arch::userfaultfd: { |
| // Restore the registers that we may have altered. |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_arg2(syscall_state.syscall_entry_registers.arg2()); |
| r.set_arg3(syscall_state.syscall_entry_registers.arg3()); |
| t->set_regs(r); |
| break; |
| } |
| |
| case Arch::getdents: |
| case Arch::getdents64: { |
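| // If this fd has a monitor that filters directory listings (e.g. a |
| // ProcFdDirMonitor on a tracee's /proc/<pid>/fd), let it strip out |
| // entries the tracee shouldn't see, such as rr's own private fds. |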
| Registers r = t->regs(); |
| int fd = r.orig_arg1(); |
| t->fd_table()->filter_getdents(fd, t); |
| break; |
| } |
| |
| case Arch::waitpid: |
| case Arch::wait4: |
| case Arch::waitid: { |
| t->in_wait_type = WAIT_TYPE_NONE; |
| // Restore possibly-modified registers |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| r.set_arg2(syscall_state.syscall_entry_registers.arg2()); |
| r.set_arg3(syscall_state.syscall_entry_registers.arg3()); |
| r.set_arg4(syscall_state.syscall_entry_registers.arg4()); |
| r.set_original_syscallno( |
| syscall_state.syscall_entry_registers.original_syscallno()); |
| t->set_regs(r); |
| |
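| // If this wait was satisfied by a stop that rr merely emulated (a |
| // ptrace-stop or stop-signal of an emulated ptracee), the kernel |
| // didn't report anything useful; synthesize the pid/status/siginfo |
| // results here instead. |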
| RecordTask* tracee = syscall_state.emulate_wait_for_child; |
| if (tracee) { |
| // Finish emulation of ptrace result or stop-signal |
| Registers r = t->regs(); |
| r.set_syscall_result(syscallno == Arch::waitid ? 0 : tracee->tid); |
| t->set_regs(r); |
| if (syscallno == Arch::waitid) { |
| remote_ptr<typename Arch::siginfo_t> sip = r.arg3(); |
| if (!sip.is_null()) { |
| typename Arch::siginfo_t si; |
| memset(&si, 0, sizeof(si)); |
| si.si_signo = SIGCHLD; |
| tracee->set_siginfo_for_waited_task<Arch>(&si); |
| t->write_mem(sip, si); |
| } |
| } else { |
| remote_ptr<int> statusp = r.arg2(); |
| if (!statusp.is_null()) { |
| t->write_mem(statusp, tracee->emulated_stop_code.get()); |
| } |
| } |
| if (syscallno == Arch::waitid && (r.arg4() & WNOWAIT)) { |
| // Leave the child in a waitable state |
| } else { |
| if (tracee->emulated_ptracer == t) { |
| tracee->emulated_stop_pending = false; |
| } else { |
| for (Task* thread : tracee->thread_group()->task_set()) { |
| auto rt = static_cast<RecordTask*>(thread); |
| rt->emulated_stop_pending = false; |
| } |
| } |
| if (tracee->detached_proxy && |
| (tracee->emulated_stop_code.type() == WaitStatus::EXIT || |
| tracee->emulated_stop_code.type() == WaitStatus::FATAL_SIGNAL)) { |
| // parent has reaped the proxy, so we're done with this task. |
| // This kills the proxy. |
| delete tracee; |
| tracee = nullptr; |
| } |
| } |
| if (tracee && tracee->already_exited()) { |
| // Have another go at reaping the task |
| tracee->did_reach_zombie(); |
| } |
| } |
| break; |
| } |
| |
| case Arch::prctl: { |
| // Restore arg1 in case we modified it to disable the syscall |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| t->set_regs(r); |
| switch ((int)t->regs().orig_arg1()) { |
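| // PR_SET_SECCOMP installs a seccomp filter; see the Arch::seccomp |
| // case below for why the filter must be patched before installation. |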
| case PR_SET_SECCOMP: |
| if (t->session().done_initial_exec()) { |
| t->session() |
| .as_record() |
| ->seccomp_filter_rewriter() |
| .install_patched_seccomp_filter(t); |
| } |
| break; |
| } |
| break; |
| } |
| |
| case Arch::arch_prctl: { |
| // Restore arg1 in case we modified it to disable the syscall |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| t->set_regs(r); |
| break; |
| } |
| |
| case Arch::quotactl: |
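| // The subcommands above are known and need no extra work here; any |
| // other subcommand must have failed with one of the expected errnos, |
| // which the assertion below verifies. |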
| switch (t->regs().orig_arg1() >> SUBCMDSHIFT) { |
| case Q_GETQUOTA: |
| case Q_GETINFO: |
| case Q_GETFMT: |
| case Q_SETQUOTA: |
| case Q_QUOTAON: |
| case Q_QUOTAOFF: |
| case Q_SETINFO: |
| case Q_SYNC: |
| break; |
| default: { |
| auto ret = t->regs().syscall_result_signed(); |
| ASSERT(t, |
| ret == -ENOENT || ret == -ENODEV || ret == -ENOTBLK || |
| ret == -EINVAL) |
| << " unknown quotactl(" << HEX(t->regs().orig_arg1() >> SUBCMDSHIFT) |
| << ")"; |
| break; |
| } |
| } |
| break; |
| |
| case Arch::seccomp: { |
| // Restore arg1 in case we modified it to disable the syscall |
| Registers r = t->regs(); |
| r.set_orig_arg1(syscall_state.syscall_entry_registers.arg1()); |
| t->set_regs(r); |
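| // The tracee's filter is not installed verbatim: the rewriter patches |
| // it so that filter decisions are reported to rr via ptrace, allowing |
| // rr to record and faithfully emulate the filter's effects. |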
| if (t->regs().orig_arg1() == SECCOMP_SET_MODE_FILTER) { |
| ASSERT(t, t->session().done_initial_exec()) |
| << "no seccomp calls during spawn"; |
| t->session() |
| .as_record() |
| ->seccomp_filter_rewriter() |
| .install_patched_seccomp_filter(t); |
| } |
| break; |
| } |
| |
| case SYS_rrcall_init_buffers: |
| t->init_buffers(); |
| break; |
| |
| case SYS_rrcall_init_preload: { |
| t->at_preload_init(); |
| break; |
| } |
| |
| case SYS_rrcall_notify_syscall_hook_exit: { |
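| // The preload library requested notification when its syscall hook |
| // exits. Acknowledge by clearing (and recording) the flag, then |
| // recover the hooked syscall's result and number from the parameters |
| // the preload code saved just above the tracee's stack pointer. |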
| remote_ptr<uint8_t> child_addr = |
| REMOTE_PTR_FIELD(t->syscallbuf_child, notify_on_syscall_hook_exit); |
| t->write_mem(child_addr, (uint8_t)0); |
| t->record_remote(child_addr); |
| |
| struct rrcall_params { |
| typename Arch::unsigned_word result; |
| typename Arch::unsigned_word original_syscallno; |
| }; |
| Registers r = t->regs(); |
| auto params_ptr = r.sp() + sizeof(typename Arch::unsigned_word); |
| auto params = t->read_mem(params_ptr.cast<rrcall_params>()); |
| r.set_syscall_result((uintptr_t)params.result); |
| r.set_original_syscallno((intptr_t)params.original_syscallno); |
| t->set_regs(r); |
| break; |
| } |
| } |
| } |
| |
| /* N.B.: `arch` is the architecture of the syscall, which may differ |
| from the architecture of the task making the call (e.g. an x86_64 |
| task may invoke x86 syscalls). |
| */ |
| static void rec_process_syscall_internal(RecordTask* t, SupportedArch arch, |
| TaskSyscallState& syscall_state) { |
| RR_ARCH_FUNCTION(rec_process_syscall_arch, arch, t, syscall_state) |
| } |
| |
| void rec_did_sigreturn(RecordTask *t) { |
| auto& syscall_state = TaskSyscallState::get(t); |
| aarch64_kernel_bug_workaround(t, syscall_state); |
| t->syscall_state = nullptr; |
| } |
| |
| void rec_process_syscall(RecordTask* t) { |
| auto& syscall_state = TaskSyscallState::get(t); |
| const SyscallEvent& sys_ev = t->ev().Syscall(); |
| if (sys_ev.arch() != t->arch()) { |
| static bool did_warn = false; |
| if (!did_warn) { |
| LOG(warn) |
| << "Cross-architecture syscall detected; support is best-effort"; |
| did_warn = true; |
| } |
| } |
| rec_process_syscall_internal(t, sys_ev.arch(), syscall_state); |
| syscall_state.process_syscall_results(); |
| |
| aarch64_kernel_bug_workaround(t, syscall_state); |
| |
| t->on_syscall_exit(sys_ev.number, sys_ev.arch(), t->regs()); |
| t->syscall_state = nullptr; |
| |
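| // Memory shared with processes outside the recording can change |
| // without generating any tracee event; scan all monitored shared |
| // mappings now and record any updates. |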
| MonitoredSharedMemory::check_all(t); |
| } |
| |
| } // namespace rr |