| /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ |
| |
| #include "Monkeypatcher.h" |
| |
| #include <limits.h> |
| #include <linux/auxvec.h> |
| |
| #include <sstream> |
| |
| #include "AddressSpace.h" |
| #include "AutoRemoteSyscalls.h" |
| #include "ElfReader.h" |
| #include "Flags.h" |
| #include "RecordSession.h" |
| #include "RecordTask.h" |
| #include "ReplaySession.h" |
| #include "ScopedFd.h" |
| #include "core.h" |
| #include "kernel_abi.h" |
| #include "kernel_metadata.h" |
| #include "log.h" |
| |
| using namespace std; |
| |
| namespace rr { |
| |
| #include "AssemblyTemplates.generated" |
| |
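/**
 * Write `buf` to the tracee at `child_addr` and record the written bytes in
 * the trace so that replay reproduces the write. If `ok` is non-null, a
 * failed write is reported through it and nothing is recorded on failure.
 */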
| static void write_and_record_bytes(RecordTask* t, remote_ptr<void> child_addr, |
| size_t size, const void* buf, bool* ok = nullptr) { |
| t->write_bytes_helper(child_addr, size, buf, ok); |
| if (!ok || *ok) { |
| t->record_local(child_addr, size, buf); |
| } |
| } |
| |
| template <size_t N> |
| static void write_and_record_bytes(RecordTask* t, remote_ptr<void> child_addr, |
| const uint8_t (&buf)[N], bool* ok = nullptr) { |
| write_and_record_bytes(t, child_addr, N, buf, ok); |
| } |
| |
| template <typename T> |
| static void write_and_record_mem(RecordTask* t, remote_ptr<T> child_addr, |
| const T* val, int count) { |
  t->write_bytes_helper(child_addr, sizeof(T) * count,
                        static_cast<const void*>(val));
  t->record_local(child_addr, sizeof(T) * count, val);
| } |
| |
| /** |
| * RecordSession sets up an LD_PRELOAD environment variable with an entry |
| * SYSCALLBUF_LIB_FILENAME_PADDED (and, if enabled, an LD_AUDIT environment |
| * variable with an entry RTLDAUDIT_LIB_FILENAME_PADDED) which is big enough to |
| * hold either the 32-bit or 64-bit preload/audit library file names. |
| * Immediately after exec we enter this function, which patches the environment |
| * variable value with the correct library name for the task's architecture. |
| * |
| * It's possible for this to fail if a tracee alters the LD_PRELOAD value |
| * and then does an exec. That's just too bad. If we ever have to handle that, |
| * we should modify the environment passed to the exec call. This function |
| * failing isn't necessarily fatal; a tracee might not rely on the functions |
| * overridden by the preload library, or might override them itself (e.g. |
| * because we're recording an rr replay). |
| */ |
| #define setup_library_path(arch, env_var, soname, task) \ |
| setup_library_path_arch<arch>(task, env_var, soname ## _BASE, \ |
| soname ## _PADDED, soname ## _32) |
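// For example, setup_library_path(Arch, "LD_PRELOAD", SYSCALLBUF_LIB_FILENAME,
// t) expands to setup_library_path_arch<Arch>(t, "LD_PRELOAD",
// SYSCALLBUF_LIB_FILENAME_BASE, SYSCALLBUF_LIB_FILENAME_PADDED,
// SYSCALLBUF_LIB_FILENAME_32).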
| |
| template <typename Arch> |
| static void setup_library_path_arch(RecordTask* t, const char* env_var, |
| const char* soname_base, |
| const char* soname_padded, |
| const char* soname_32) { |
| const char* lib_name = |
| sizeof(typename Arch::unsigned_word) < sizeof(uintptr_t) |
| ? soname_32 |
| : soname_padded; |
| auto env_assignment = string(env_var) + "="; |
| |
| auto p = t->regs().sp().cast<typename Arch::unsigned_word>(); |
| auto argc = t->read_mem(p); |
  p += 1 + argc + 1; // skip argc, the argc argv pointers, and the trailing NULL
| while (true) { |
| auto envp = t->read_mem(p); |
| if (!envp) { |
| LOG(debug) << env_var << " not found"; |
| return; |
| } |
| string env = t->read_c_str(envp); |
| if (env.find(env_assignment) != 0) { |
| ++p; |
| continue; |
| } |
| size_t lib_pos = env.find(soname_base); |
| if (lib_pos == string::npos) { |
| LOG(debug) << soname_base << " not found in " << env_var; |
| return; |
| } |
| size_t next_colon = env.find(':', lib_pos); |
| if (next_colon != string::npos) { |
| while ((next_colon + 1 < env.length()) && |
| (env[next_colon + 1] == ':' || env[next_colon + 1] == 0)) { |
| ++next_colon; |
| } |
      if (next_colon + 1 < lib_pos + strlen(soname_padded)) {
| LOG(debug) << "Insufficient space for " << lib_name |
| << " in " << env_var << " before next ':'"; |
| return; |
| } |
| } |
    if (env.length() < lib_pos + strlen(soname_padded)) {
| LOG(debug) << "Insufficient space for " << lib_name |
| << " in " << env_var << " before end of string"; |
| return; |
| } |
| remote_ptr<void> dest = envp + lib_pos; |
| write_and_record_mem(t, dest.cast<char>(), lib_name, strlen(soname_padded)); |
| return; |
| } |
| } |
| |
| template <typename Arch> static void setup_preload_library_path(RecordTask* t) { |
| static_assert(sizeof(SYSCALLBUF_LIB_FILENAME_PADDED) == |
| sizeof(SYSCALLBUF_LIB_FILENAME_32), |
| "filename length mismatch"); |
| setup_library_path(Arch, "LD_PRELOAD", SYSCALLBUF_LIB_FILENAME, t); |
| } |
| |
| template <typename Arch> static void setup_audit_library_path(RecordTask* t) { |
| static_assert(sizeof(RTLDAUDIT_LIB_FILENAME_PADDED) == |
| sizeof(RTLDAUDIT_LIB_FILENAME_32), |
| "filename length mismatch"); |
| if (t->session().use_audit()) { |
| setup_library_path(Arch, "LD_AUDIT", RTLDAUDIT_LIB_FILENAME, t); |
| } |
| } |
| |
| void Monkeypatcher::init_dynamic_syscall_patching( |
| RecordTask* t, int syscall_patch_hook_count, |
| remote_ptr<struct syscall_patch_hook> syscall_patch_hooks) { |
| if (syscall_patch_hook_count && syscall_hooks.empty()) { |
| syscall_hooks = t->read_mem(syscall_patch_hooks, syscall_patch_hook_count); |
| } |
| } |
| |
| template <typename Arch> |
| static bool patch_syscall_with_hook_arch(Monkeypatcher& patcher, RecordTask* t, |
| const syscall_patch_hook& hook, |
| remote_code_ptr ip_of_instruction, |
| size_t instruction_length, |
| uint32_t fake_syscall_number); |
| |
| template <typename StubPatch> |
| static void substitute(uint8_t* buffer, uint64_t return_addr, |
| uint32_t trampoline_relative_addr); |
| |
| template <typename ExtendedJumpPatch> |
| static void substitute_extended_jump(uint8_t* buffer, uint64_t patch_addr, |
| uint64_t return_addr, |
| uint64_t target_addr, |
| uint32_t fake_syscall_number); |
| |
| template <> |
| void substitute_extended_jump<X86SyscallStubExtendedJump>( |
| uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, |
| uint64_t target_addr, uint32_t) { |
| int64_t offset = |
| target_addr - |
| (patch_addr + X86SyscallStubExtendedJump::trampoline_relative_addr_end); |
| // An offset that appears to be > 2GB is OK here, since EIP will just |
| // wrap around. |
| X86SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, |
| (uint32_t)offset); |
| } |
| |
| template <> |
| void substitute_extended_jump<X64SyscallStubExtendedJump>( |
| uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr, |
| uint32_t) { |
| X64SyscallStubExtendedJump::substitute(buffer, (uint32_t)return_addr, |
| (uint32_t)(return_addr >> 32), |
| target_addr); |
| } |
| |
| template <> |
| void substitute_extended_jump<X86TrapInstructionStubExtendedJump>( |
| uint8_t* buffer, uint64_t patch_addr, uint64_t return_addr, |
| uint64_t target_addr, uint32_t fake_syscall_number) { |
| int64_t offset = |
| target_addr - |
| (patch_addr + X86SyscallStubExtendedJump::trampoline_relative_addr_end); |
| // An offset that appears to be > 2GB is OK here, since EIP will just |
| // wrap around. |
| X86TrapInstructionStubExtendedJump::substitute(buffer, (uint32_t)return_addr, |
| fake_syscall_number, (uint32_t)offset); |
| } |
| |
| template <> |
| void substitute_extended_jump<X64TrapInstructionStubExtendedJump>( |
| uint8_t* buffer, uint64_t, uint64_t return_addr, uint64_t target_addr, |
| uint32_t fake_syscall_number) { |
| X64TrapInstructionStubExtendedJump::substitute(buffer, (uint32_t)return_addr, |
| (uint32_t)(return_addr >> 32), |
| fake_syscall_number, |
| target_addr); |
| } |
| |
| /** |
| * Allocate an extended jump in an extended jump page and return its address. |
| * The resulting address must be within 2G of from_end, and the instruction |
| * there must jump to to_start. |
| */ |
| template <typename ExtendedJumpPatch> |
| static remote_ptr<uint8_t> allocate_extended_jump_x86ish( |
| RecordTask* t, vector<Monkeypatcher::ExtendedJumpPage>& pages, |
| remote_ptr<uint8_t> from_end) { |
| Monkeypatcher::ExtendedJumpPage* page = nullptr; |
| for (auto& p : pages) { |
| remote_ptr<uint8_t> page_jump_start = p.addr + p.allocated; |
| int64_t offset = page_jump_start - from_end; |
| if ((int32_t)offset == offset && |
| p.allocated + ExtendedJumpPatch::size <= page_size()) { |
| page = &p; |
| break; |
| } |
| } |
| |
| if (!page) { |
| // We're looking for a gap of three pages --- one page to allocate and |
| // a page on each side as a guard page. |
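    // Only the middle page is actually mapped below; the two guard pages
    // are simply left unmapped.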
| uint32_t required_space = 3 * page_size(); |
| remote_ptr<void> free_mem = |
| t->vm()->find_free_memory(t, required_space, |
| // Find free space after the patch site. |
| t->vm()->mapping_of(from_end).map.start()); |
| if (!free_mem) { |
| LOG(debug) << "Can't find free memory anywhere after the jump"; |
| return nullptr; |
| } |
| |
| remote_ptr<uint8_t> addr = (free_mem + page_size()).cast<uint8_t>(); |
| int64_t offset = addr - from_end; |
| if ((int32_t)offset != offset) { |
| LOG(debug) << "Can't find space close enough for the jump"; |
| return nullptr; |
| } |
| |
| { |
| AutoRemoteSyscalls remote(t); |
| int prot = PROT_READ | PROT_EXEC; |
| int flags = MAP_ANONYMOUS | MAP_FIXED | MAP_PRIVATE; |
| auto ret = remote.infallible_mmap_syscall_if_alive(addr, page_size(), prot, flags, -1, 0); |
| if (!ret) { |
| /* Tracee died */ |
| return nullptr; |
| } |
| KernelMapping recorded(addr, addr + page_size(), string(), |
| KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, |
| prot, flags); |
| t->vm()->map(t, addr, page_size(), prot, flags, 0, string(), |
| KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, nullptr, |
| &recorded); |
| t->vm()->mapping_flags_of(addr) |= AddressSpace::Mapping::IS_PATCH_STUBS; |
| t->trace_writer().write_mapped_region(t, recorded, recorded.fake_stat(), |
| recorded.fsname(), |
| vector<TraceRemoteFd>(), |
| TraceWriter::PATCH_MAPPING); |
| } |
| |
| pages.push_back(Monkeypatcher::ExtendedJumpPage(addr)); |
| page = &pages.back(); |
| } |
| |
| remote_ptr<uint8_t> jump_addr = page->addr + page->allocated; |
| page->allocated += ExtendedJumpPatch::size; |
| return jump_addr; |
| } |
| |
| /** |
| * Encode the standard movz|movk sequence for moving constant `v` into register `reg` |
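 *
 * For example, encode_immediate_aarch64(buff, 30, 0x7f0012345678) emits
 *   movz x30, #0x7f00, lsl #32
 *   movk x30, #0x1234, lsl #16
 *   movk x30, #0x5678
 * since the all-zero 16-bit chunk at lsl #48 is skipped.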
| */ |
| static void encode_immediate_aarch64(std::vector<uint32_t> &buff, |
| uint8_t reg, uint64_t v) |
| { |
| DEBUG_ASSERT(reg < 31); |
| const uint32_t movz_inst = 0xd2800000; |
| const uint32_t movk_inst = 0xf2800000; |
| uint32_t mov_inst = movz_inst; |
| for (int lsl = 3; lsl >= 0; lsl--) { |
| uint32_t bits = (v >> (lsl * 16)) & 0xffff; |
| if (bits == 0 && !(lsl == 0 && mov_inst == movz_inst)) { |
| // Skip zero bits unless it's the only instruction, i.e. v == 0 |
| continue; |
| } |
| // movz|movk x[reg], #bits, LSL #lsl |
| buff.push_back(mov_inst | (uint32_t(lsl) << 21) | (bits << 5) | reg); |
| mov_inst = movk_inst; |
| } |
| } |
| |
| /** |
| * Encode the following assembly. |
| * |
| * cmp x8, 1024 |
| * b.hi .Lnosys |
| * movk x8, preload_thread_locals >> 16, lsl 16 |
| * stp x15, x30, [x8, stub_scratch_2 - preload_thread_locals] |
| * movz x30, #:abs_g3:_syscall_hook_trampoline |
| * movk x30, #:abs_g2_nc:_syscall_hook_trampoline |
| * movk x30, #:abs_g1_nc:_syscall_hook_trampoline |
| * movk x30, #:abs_g0_nc:_syscall_hook_trampoline // Might be shorter depending on the address |
| * blr x30 |
| * ldp x15, x30, [x15] |
 * .Lreturn:
 * b syscall_return_address
 * .Lnosys:
| * svc 0x0 // the test relies on invalid syscall triggering an event. |
| * // mov x0, -ENOSYS |
| * b .Lreturn |
| * .long <syscall return address> |
| * |
| * And return the instruction index of `.Lreturn`. |
| * The branch instruction following that label will not be encoded |
| * since it depends on the address of this code. |
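 *
 * The encoded stub is at most 13 instructions (encode_immediate_aarch64
 * emits one to four instructions for the target address) plus the 8-byte
 * return address; unpatch_syscalls_arch<ARM64Arch> relies on this when it
 * sizes its buffers as 13 * 4 and 15 * 4 bytes.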
| */ |
| static uint32_t encode_extended_jump_aarch64(std::vector<uint32_t> &buff, |
| uint64_t target, uint64_t return_addr, |
| uint32_t *_retaddr_idx = nullptr) |
| { |
| // cmp x8, 1024 |
| buff.push_back(0xf110011f); |
| uint32_t b_hi_idx = buff.size(); |
| buff.push_back(0); // place holder |
| // movk x8, preload_thread_locals >> 16, lsl 16 |
| buff.push_back(0xf2ae0028); |
| // stp x15, x30, [x8, #104] |
| buff.push_back(0xa906f90f); |
| encode_immediate_aarch64(buff, 30, target); |
| // blr x30 |
| buff.push_back(0xd63f03c0); |
| // ldp x15, x30, [x15] |
| buff.push_back(0xa94079ef); |
| uint32_t ret_idx = buff.size(); |
| buff.push_back(0); // place holder |
  // b.hi .Lnosys (the svc below), a forward offset of
  // (ret_idx + 1 - b_hi_idx) instructions
| buff[b_hi_idx] = 0x54000000 | ((ret_idx + 1 - b_hi_idx) << 5) | 0x8; |
| // movn x0, (ENOSYS - 1), i.e. mov x0, -ENOSYS |
| // buff.push_back(0x92800000 | ((ENOSYS - 1) << 5) | 0); |
| buff.push_back(0xd4000001); // svc 0 |
| // b .-2 |
| buff.push_back(0x17fffffe); |
| uint32_t retaddr_idx = buff.size(); |
| if (_retaddr_idx) |
| *_retaddr_idx = retaddr_idx; |
| buff.resize(retaddr_idx + 2); |
| memcpy(&buff[retaddr_idx], &return_addr, 8); |
| return ret_idx; |
| } |
| |
// b and bl have a 26-bit signed immediate in units of 4 bytes
| constexpr int32_t aarch64_b_max_offset = ((1 << 25) - 1) * 4; |
| constexpr int32_t aarch64_b_min_offset = (1 << 25) * -4; |
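// For example, a forward branch of 0x1000 bytes encodes as
// 0x14000000 | (0x1000 >> 2) = 0x14000400.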
| |
| static remote_ptr<uint8_t> allocate_extended_jump_aarch64( |
| RecordTask* t, vector<Monkeypatcher::ExtendedJumpPage>& pages, |
| remote_ptr<uint8_t> svc_ip, uint64_t to, std::vector<uint32_t> &inst_buff) { |
| uint64_t return_addr = svc_ip.as_int() + 4; |
| auto ret_idx = encode_extended_jump_aarch64(inst_buff, to, return_addr); |
| auto total_patch_size = inst_buff.size() * 4; |
| |
| Monkeypatcher::ExtendedJumpPage* page = nullptr; |
| |
  // There are two jumps whose offsets we need to worry about: the jump to
  // the stub and the jump back. (There is actually a third, the jump back
  // after unpatching, but its requirement is always more relaxed than the
  // combination of these two.)
  // The jump to the stub has offset `stub - syscall` and the jump back has
  // offset `syscall + 4 - (stub + ret_idx * 4)`.
  // We need to make sure both are within the offset range, i.e.
  // * aarch64_b_min_offset <= stub - syscall <= aarch64_b_max_offset
  // * aarch64_b_min_offset <= syscall + 4 - (stub + ret_idx * 4) <= aarch64_b_max_offset
  // or, rewriting the second constraint in terms of `stub - syscall`:
  // * aarch64_b_min_offset <= stub - syscall <= aarch64_b_max_offset
  // * -aarch64_b_max_offset + 4 - ret_idx * 4 <= stub - syscall <= -aarch64_b_min_offset + 4 - ret_idx * 4
| |
| int64_t patch_offset_min = std::max(aarch64_b_min_offset, |
| -aarch64_b_max_offset + 4 - int(ret_idx) * 4); |
| int64_t patch_offset_max = std::min(aarch64_b_max_offset, |
| -aarch64_b_min_offset + 4 - int(ret_idx) * 4); |
| for (auto& p : pages) { |
| remote_ptr<uint8_t> page_jump_start = p.addr + p.allocated; |
| int64_t offset = page_jump_start - svc_ip; |
| if (offset <= patch_offset_max && offset >= patch_offset_min && |
| p.allocated + total_patch_size <= page_size()) { |
| page = &p; |
| break; |
| } |
| } |
| |
| if (!page) { |
| // We're looking for a gap of three pages --- one page to allocate and |
| // a page on each side as a guard page. |
| uint32_t required_space = 3 * page_size(); |
| remote_ptr<void> free_mem = |
| t->vm()->find_free_memory(t, required_space, |
| // Find free space after the patch site. |
| t->vm()->mapping_of(svc_ip).map.start()); |
| if (!free_mem) { |
| LOG(debug) << "Can't find free memory anywhere after the jump"; |
| return nullptr; |
| } |
| |
| remote_ptr<uint8_t> addr = (free_mem + page_size()).cast<uint8_t>(); |
| int64_t offset = addr - svc_ip; |
| if (offset > patch_offset_max || offset < patch_offset_min) { |
| LOG(debug) << "Can't find space close enough for the jump"; |
| return nullptr; |
| } |
| |
| { |
| AutoRemoteSyscalls remote(t); |
| int prot = PROT_READ | PROT_EXEC; |
| int flags = MAP_ANONYMOUS | MAP_FIXED | MAP_PRIVATE; |
| auto ret = remote.infallible_mmap_syscall_if_alive(addr, page_size(), prot, flags, -1, 0); |
| if (!ret) { |
| /* Tracee died */ |
| return nullptr; |
| } |
| KernelMapping recorded(addr, addr + page_size(), string(), |
| KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, |
| prot, flags); |
| t->vm()->map(t, addr, page_size(), prot, flags, 0, string(), |
| KernelMapping::NO_DEVICE, KernelMapping::NO_INODE, nullptr, |
| &recorded); |
| t->vm()->mapping_flags_of(addr) |= AddressSpace::Mapping::IS_PATCH_STUBS; |
| t->trace_writer().write_mapped_region(t, recorded, recorded.fake_stat(), |
| recorded.fsname(), |
| vector<TraceRemoteFd>(), |
| TraceWriter::PATCH_MAPPING); |
| } |
| |
| pages.push_back(Monkeypatcher::ExtendedJumpPage(addr)); |
| page = &pages.back(); |
| } |
| |
| remote_ptr<uint8_t> jump_addr = page->addr + page->allocated; |
| |
| const uint64_t reverse_jump_addr = jump_addr.as_int() + ret_idx * 4; |
| const int64_t reverse_offset = int64_t(return_addr - reverse_jump_addr); |
| const uint32_t offset_imm26 = (reverse_offset >> 2) & 0x03ffffff; |
| inst_buff[ret_idx] = 0x14000000 | offset_imm26; |
| |
| page->allocated += total_patch_size; |
| |
| return jump_addr; |
| } |
| |
| bool Monkeypatcher::is_jump_stub_instruction(remote_code_ptr ip, bool include_safearea) { |
| remote_ptr<uint8_t> pp = ip.to_data_ptr<uint8_t>(); |
| auto it = syscallbuf_stubs.upper_bound(pp); |
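  // syscallbuf_stubs is keyed by stub start address, so the entry at or
  // before pp (if any) is the only one that could contain it.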
| if (it == syscallbuf_stubs.begin()) { |
| return false; |
| } |
| --it; |
| auto begin = it->first; |
| auto end = begin + it->second.size; |
| if (!include_safearea) { |
| begin += it->second.safe_prefix; |
| end -= it->second.safe_suffix; |
| } |
| return begin <= pp && pp < end; |
| } |
| |
| remote_code_ptr Monkeypatcher::get_jump_stub_exit_breakpoint(remote_code_ptr ip, |
| RecordTask *t) { |
| if (t->arch() != aarch64) { |
| return nullptr; |
| } |
| remote_ptr<uint8_t> pp = ip.to_data_ptr<uint8_t>(); |
| auto it = syscallbuf_stubs.upper_bound(pp); |
| if (it == syscallbuf_stubs.begin()) { |
| return nullptr; |
| } |
| --it; |
| auto bp = it->first + it->second.size - it->second.safe_suffix; |
| if (pp == bp || pp == bp - 4) { |
| return remote_code_ptr(bp.as_int()); |
| } |
| return nullptr; |
| } |
| |
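/**
 * A hook whose patch region is at least as large as the jump patch and
 * consists entirely of nop instructions can be patched even if other code
 * branches into the patch region: landing on one of the patched-over nops
 * has the same effect as landing on the start of the patch region (see
 * patch_syscall_with_hook_x86ish below).
 */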
| static bool hook_can_ignore_interfering_branches(const syscall_patch_hook& hook, size_t jump_patch_size) { |
| return hook.patch_region_length >= jump_patch_size && |
| (hook.flags & (PATCH_IS_MULTIPLE_INSTRUCTIONS | PATCH_IS_NOP_INSTRUCTIONS)) == PATCH_IS_NOP_INSTRUCTIONS; |
| } |
| |
| /** |
| * Some functions make system calls while storing local variables in memory |
| * below the stack pointer. We need to decrement the stack pointer by |
| * some "safety zone" amount to get clear of those variables before we make |
| * a call instruction. So, we allocate a stub per patched callsite, and jump |
| * from the callsite to the stub. The stub decrements the stack pointer, |
| * calls the appropriate syscall hook function, reincrements the stack pointer, |
| * and jumps back to immediately after the patched callsite. |
| * |
| * It's important that gdb stack traces work while a thread is stopped in the |
| * syscallbuf code. To ensure that the above manipulations don't foil gdb's |
| * stack walking code, we add CFI data to all the stubs. To ease that, the |
| * stubs are written in assembly and linked into the preload library. |
| * |
| * On x86-64 with ASLR, we need to be able to patch a call to a stub from |
| * sites more than 2^31 bytes away. We only have space for a 5-byte jump |
| * instruction. So, we allocate "extender pages" --- pages of memory within |
| * 2GB of the patch site, that contain the stub code. We don't really need this |
| * on x86, but we do it there too for consistency. |
| * |
| * If fake_syscall_number > 0 then we'll ensure AX is set to that number |
| * by the stub code. |
| */ |
| template <typename JumpPatch, typename ExtendedJumpPatch, typename FakeSyscallExtendedJumpPatch> |
| static bool patch_syscall_with_hook_x86ish(Monkeypatcher& patcher, |
| RecordTask* t, |
| const syscall_patch_hook& hook, |
| remote_code_ptr ip_of_instruction, |
| size_t instruction_length, |
| uint32_t fake_syscall_number) { |
| size_t patch_region_size = instruction_length + hook.patch_region_length; |
| uint8_t jump_patch[patch_region_size]; |
| // We're patching in a relative jump, so we need to compute the offset from |
| // the end of the jump to our actual destination. |
| remote_ptr<uint8_t> jump_patch_start = ip_of_instruction.to_data_ptr<uint8_t>(); |
| if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { |
| jump_patch_start -= hook.patch_region_length; |
| } |
| remote_ptr<uint8_t> jump_patch_end = jump_patch_start + JumpPatch::size; |
| remote_ptr<uint8_t> return_addr = |
| jump_patch_start + patch_region_size; |
| |
| remote_ptr<uint8_t> extended_jump_start; |
| if (fake_syscall_number) { |
| extended_jump_start = allocate_extended_jump_x86ish<FakeSyscallExtendedJumpPatch>( |
| t, patcher.extended_jump_pages, jump_patch_end); |
| } else { |
| extended_jump_start = allocate_extended_jump_x86ish<ExtendedJumpPatch>( |
| t, patcher.extended_jump_pages, jump_patch_end); |
| } |
| if (extended_jump_start.is_null()) { |
| return false; |
| } |
| |
| if (fake_syscall_number) { |
| uint8_t stub_patch[FakeSyscallExtendedJumpPatch::size]; |
| substitute_extended_jump<FakeSyscallExtendedJumpPatch>(stub_patch, |
| extended_jump_start.as_int(), |
| return_addr.as_int(), |
| hook.hook_address, |
| fake_syscall_number); |
| write_and_record_bytes(t, extended_jump_start, stub_patch); |
| |
| patcher.syscallbuf_stubs[extended_jump_start] = { &hook, FakeSyscallExtendedJumpPatch::size }; |
| } else { |
| uint8_t stub_patch[ExtendedJumpPatch::size]; |
| substitute_extended_jump<ExtendedJumpPatch>(stub_patch, |
| extended_jump_start.as_int(), |
| return_addr.as_int(), |
| hook.hook_address, |
| 0); |
| write_and_record_bytes(t, extended_jump_start, stub_patch); |
| |
| patcher.syscallbuf_stubs[extended_jump_start] = { &hook, ExtendedJumpPatch::size }; |
| } |
| |
| intptr_t jump_offset = extended_jump_start - jump_patch_end; |
| int32_t jump_offset32 = (int32_t)jump_offset; |
| ASSERT(t, jump_offset32 == jump_offset) |
| << "allocate_extended_jump_x86ish didn't work"; |
| |
| // pad with NOPs to the next instruction |
| static const uint8_t NOP = 0x90; |
| memset(jump_patch, NOP, sizeof(jump_patch)); |
| if (hook_can_ignore_interfering_branches(hook, JumpPatch::size)) { |
    // If the patch region is long enough to contain the entire jump and
    // consists only of nops, replace its final bytes with a short jump back
    // to the start of the patch region. This allows us to ignore (likely
    // spurious, but nevertheless possible) interfering branches: whether
    // such a branch lands on one of the nops or on the start of the patch
    // region, the effect is the same.
    jump_patch[patch_region_size-2] = 0xeb; // jmp rel8
    jump_patch[patch_region_size-1] = (int8_t)-patch_region_size;
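    // E.g. for a 7-byte patch region the final two bytes become EB F9:
    // a jmp rel8 with displacement -7 from the end of the jmp, landing
    // back on the first patched byte.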
| } |
| JumpPatch::substitute(jump_patch, jump_offset32); |
| bool ok = true; |
| write_and_record_bytes(t, jump_patch_start, sizeof(jump_patch), jump_patch, &ok); |
| if (!ok) { |
| LOG(warn) << "Couldn't write patch; errno=" << errno; |
| } |
| return ok; |
| } |
| |
| template <> |
| bool patch_syscall_with_hook_arch<X86Arch>(Monkeypatcher& patcher, |
| RecordTask* t, |
| const syscall_patch_hook& hook, |
| remote_code_ptr ip_of_instruction, |
| size_t instruction_length, |
| uint32_t fake_syscall_number) { |
| return patch_syscall_with_hook_x86ish<X86SysenterVsyscallSyscallHook, |
| X86SyscallStubExtendedJump, |
| X86TrapInstructionStubExtendedJump>(patcher, t, |
| hook, |
| ip_of_instruction, |
| instruction_length, |
| fake_syscall_number); |
| } |
| |
| template <> |
| bool patch_syscall_with_hook_arch<X64Arch>(Monkeypatcher& patcher, |
| RecordTask* t, |
| const syscall_patch_hook& hook, |
| remote_code_ptr ip_of_instruction, |
| size_t instruction_length, |
| uint32_t fake_syscall_number) { |
| return patch_syscall_with_hook_x86ish<X64JumpMonkeypatch, |
| X64SyscallStubExtendedJump, |
| X64TrapInstructionStubExtendedJump>(patcher, t, |
| hook, |
| ip_of_instruction, |
| instruction_length, |
| fake_syscall_number); |
| } |
| |
| template <> |
| bool patch_syscall_with_hook_arch<ARM64Arch>(Monkeypatcher& patcher, |
| RecordTask *t, |
| const syscall_patch_hook &hook, |
| remote_code_ptr, |
| size_t, |
| uint32_t) { |
| Registers r = t->regs(); |
| remote_ptr<uint8_t> svc_ip = r.ip().to_data_ptr<uint8_t>(); |
| std::vector<uint32_t> inst_buff; |
| |
| remote_ptr<uint8_t> extended_jump_start = |
| allocate_extended_jump_aarch64( |
| t, patcher.extended_jump_pages, svc_ip, hook.hook_address, inst_buff); |
| if (extended_jump_start.is_null()) { |
| return false; |
| } |
| LOG(debug) << "Allocated stub size " << inst_buff.size() * sizeof(uint32_t) |
| << " bytes at " << extended_jump_start << " for syscall at " |
| << svc_ip; |
| |
| auto total_patch_size = inst_buff.size() * 4; |
| write_and_record_bytes(t, extended_jump_start, total_patch_size, &inst_buff[0]); |
| |
| patcher.syscallbuf_stubs[extended_jump_start] = { |
| &hook, total_patch_size, |
| /** |
| * safe_prefix: |
| * We have not modified any registers yet in the first two instructions. |
| * More importantly, we may bail out and return to user code without |
| * hitting the breakpoint in syscallbuf |
| */ |
| 2 * 4, |
| /** |
| * safe_suffix: |
| * We've returned from syscallbuf and continue execution |
| * won't hit syscallbuf breakpoint |
| * (this also include the 8 bytes that stores the return address) |
| * Note that the 4th last instruction also belongs to the syscallbuf return path |
| * However, since it is still using the scratch memory, |
| * it doesn't belong to the safe area. |
| * The caller needs to have special handling for that instruction. |
| */ |
| 3 * 4 + 8 |
| }; |
| |
| intptr_t jump_offset = extended_jump_start - svc_ip; |
| ASSERT(t, jump_offset <= aarch64_b_max_offset && jump_offset >= aarch64_b_min_offset) |
| << "allocate_extended_jump_aarch64 didn't work"; |
| |
| const uint32_t offset_imm26 = (jump_offset >> 2) & 0x03ffffff; |
| const uint32_t b_inst = 0x14000000 | offset_imm26; |
| bool ok = true; |
| write_and_record_bytes(t, svc_ip, 4, &b_inst, &ok); |
| if (!ok) { |
| LOG(warn) << "Couldn't write patch; errno=" << errno; |
| } |
| return ok; |
| } |
| |
| |
| static bool patch_syscall_with_hook(Monkeypatcher& patcher, RecordTask* t, |
| const syscall_patch_hook& hook, |
| remote_code_ptr ip_of_instruction, |
| size_t instruction_length, |
| uint32_t fake_syscall_number) { |
| RR_ARCH_FUNCTION(patch_syscall_with_hook_arch, t->arch(), patcher, t, hook, |
| ip_of_instruction, instruction_length, fake_syscall_number); |
| } |
| |
| template <typename ExtendedJumpPatch> |
| static bool match_extended_jump_patch(Task* t, |
| uint8_t patch[], uint64_t* return_addr, vector<uint8_t>* instruction); |
| |
| template <> |
| bool match_extended_jump_patch<X64SyscallStubExtendedJump>( |
| Task*, uint8_t patch[], uint64_t* return_addr, vector<uint8_t>* instruction) { |
| uint32_t return_addr_lo, return_addr_hi; |
| uint64_t jmp_target; |
| if (!X64SyscallStubExtendedJump::match(patch, &return_addr_lo, &return_addr_hi, &jmp_target)) { |
| return false; |
| } |
| *instruction = rr::syscall_instruction(x86_64); |
| *return_addr = return_addr_lo | (((uint64_t)return_addr_hi) << 32); |
| return true; |
| } |
| |
| template <> |
| bool match_extended_jump_patch<X64TrapInstructionStubExtendedJump>( |
| Task* t, uint8_t patch[], uint64_t* return_addr, vector<uint8_t>* instruction) { |
| uint32_t return_addr_lo, return_addr_hi, fake_syscall_no; |
| uint64_t jmp_target; |
| if (!X64TrapInstructionStubExtendedJump::match(patch, &return_addr_lo, &return_addr_hi, |
| &fake_syscall_no, &jmp_target)) { |
| return false; |
| } |
| *return_addr = return_addr_lo | (((uint64_t)return_addr_hi) << 32); |
| if ((int)fake_syscall_no == t->session().syscall_number_for_rrcall_rdtsc()) { |
| instruction->resize(sizeof(rdtsc_insn)); |
| memcpy(instruction->data(), rdtsc_insn, instruction->size()); |
| } else { |
| ASSERT(t, false) << "Unknown fake-syscall number " << fake_syscall_no; |
| } |
| return true; |
| } |
| |
| template <> |
| bool match_extended_jump_patch<X86SyscallStubExtendedJump>( |
| Task*, uint8_t patch[], uint64_t* return_addr, vector<uint8_t>* instruction) { |
| uint32_t return_addr_32, jmp_target_relative; |
| if (!X86SyscallStubExtendedJump::match(patch, &return_addr_32, &jmp_target_relative)) { |
| return false; |
| } |
| *return_addr = return_addr_32; |
| *instruction = rr::syscall_instruction(x86); |
| return true; |
| } |
| |
| template <typename ReplacementPatch> |
| static void substitute_replacement_patch(uint8_t *buffer, uint64_t patch_addr, |
| uint64_t jmp_target); |
| |
| template <> |
| void substitute_replacement_patch<X64SyscallStubRestore>(uint8_t *buffer, uint64_t patch_addr, |
| uint64_t jmp_target) { |
| (void)patch_addr; |
| X64SyscallStubRestore::substitute(buffer, jmp_target); |
| } |
| |
| template <> |
| void substitute_replacement_patch<X86SyscallStubRestore>(uint8_t *buffer, uint64_t patch_addr, |
| uint64_t jmp_target) { |
| int64_t offset = |
| jmp_target - |
| (patch_addr + X86SyscallStubRestore::trampoline_relative_addr_end); |
| // An offset that appears to be > 2GB is OK here, since EIP will just |
| // wrap around. |
| X86SyscallStubRestore::substitute(buffer, (uint32_t)offset); |
| } |
| |
| template <typename ExtendedJumpPatch, typename FakeSyscallExtendedJumpPatch, typename ReplacementPatch> |
| static void unpatch_extended_jumps(Monkeypatcher& patcher, |
| Task* t) { |
| static_assert(ExtendedJumpPatch::size < FakeSyscallExtendedJumpPatch::size, |
| "If these were the same size then the logic below wouldn't work"); |
| for (auto patch : patcher.syscallbuf_stubs) { |
| const syscall_patch_hook &hook = *patch.second.hook; |
| uint8_t bytes[FakeSyscallExtendedJumpPatch::size]; |
| t->read_bytes_helper(patch.first, patch.second.size, bytes); |
| uint64_t return_addr = 0; |
| vector<uint8_t> syscall; |
| if (patch.second.size == ExtendedJumpPatch::size) { |
| if (!match_extended_jump_patch<ExtendedJumpPatch>( |
| t, bytes, &return_addr, &syscall)) { |
| ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; |
| return; |
| } |
| } else if (patch.second.size == FakeSyscallExtendedJumpPatch::size) { |
| if (!match_extended_jump_patch<FakeSyscallExtendedJumpPatch>( |
| t, bytes, &return_addr, &syscall)) { |
| ASSERT(t, false) << "Failed to match trap-instruction extended jump patch at " << patch.first; |
| return; |
| } |
| } else { |
| ASSERT(t, false) << "Unknown patch size " << patch.second.size; |
| } |
| |
| // Replace with |
| // extended_jump: |
| // <syscall> (unless PATCH_SYSCALL_INSTRUCTION_IS_LAST) |
| // <original bytes> |
| // <syscall> (if PATCH_SYSCALL_INSTRUCTION_IS_LAST) |
| // jmp *(return_addr) |
    // As long as there are no relative branches or anything similar in the
    // original bytes, this should always be correct.
| size_t new_patch_size = hook.patch_region_length + syscall.size() + ReplacementPatch::size; |
| ASSERT(t, new_patch_size <= sizeof(bytes)); |
| uint8_t* ptr = bytes; |
| if (!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST)) { |
| memcpy(ptr, syscall.data(), syscall.size()); |
| ptr += syscall.size(); |
| } |
| memcpy(ptr, hook.patch_region_bytes, hook.patch_region_length); |
| ptr += hook.patch_region_length; |
| if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { |
| memcpy(ptr, syscall.data(), syscall.size()); |
| ptr += syscall.size(); |
| } |
| substitute_replacement_patch<ReplacementPatch>(ptr, |
| patch.first.as_int() + hook.patch_region_length + syscall.size(), return_addr); |
| t->write_bytes_helper(patch.first, new_patch_size, bytes); |
| } |
| } |
| |
| template <typename Arch> |
| static void unpatch_syscalls_arch(Monkeypatcher &patcher, Task *t); |
| |
| template <> |
| void unpatch_syscalls_arch<X86Arch>(Monkeypatcher &patcher, Task *t) { |
| // There is no 32-bit equivalent to X64TrapInstructionStubExtendedJump. |
| // We just pass the X64TrapInstructionStubExtendedJump; its length |
| // will never match any jump stub for 32-bit. |
| return unpatch_extended_jumps<X86SyscallStubExtendedJump, |
| X64TrapInstructionStubExtendedJump, |
| X86SyscallStubRestore>(patcher, t); |
| } |
| |
| template <> |
| void unpatch_syscalls_arch<X64Arch>(Monkeypatcher &patcher, Task *t) { |
| return unpatch_extended_jumps<X64SyscallStubExtendedJump, |
| X64TrapInstructionStubExtendedJump, |
| X64SyscallStubRestore>(patcher, t); |
| } |
| |
| template <> |
| void unpatch_syscalls_arch<ARM64Arch>(Monkeypatcher &patcher, Task *t) { |
| for (auto patch : patcher.syscallbuf_stubs) { |
| const syscall_patch_hook &hook = *patch.second.hook; |
| std::vector<uint32_t> hook_prefix; |
| uint32_t prefix_ninst; |
| encode_extended_jump_aarch64(hook_prefix, hook.hook_address, 0, &prefix_ninst); |
| uint32_t prefix_size = prefix_ninst * 4; |
| DEBUG_ASSERT(prefix_size <= 13 * 4); |
| ASSERT(t, patch.second.size >= prefix_size + 8); |
| uint8_t bytes[15 * 4]; |
| t->read_bytes_helper(patch.first, prefix_size + 8, bytes); |
    // The 3rd-last instruction is the one that jumps back, so it won't
    // match; only compare the instructions before the last three.
| if (memcmp(&hook_prefix[0], bytes, prefix_size - 3 * 4) != 0) { |
| ASSERT(t, false) << "Failed to match extended jump patch at " << patch.first; |
| return; |
| } |
| |
| uint64_t return_addr; |
| memcpy(&return_addr, &bytes[prefix_size], 8); |
| |
| uint32_t svc_inst = 0xd4000001; |
| memcpy(bytes, &svc_inst, 4); |
| |
| uint64_t reverse_jump_addr = patch.first.as_int() + 4; |
| int64_t reverse_offset = int64_t(return_addr - reverse_jump_addr); |
| ASSERT(t, reverse_offset <= aarch64_b_max_offset && |
| reverse_offset >= aarch64_b_min_offset) |
| << "Cannot encode b instruction to jump back"; |
| uint32_t offset_imm26 = (reverse_offset >> 2) & 0x03ffffff; |
| uint32_t binst = 0x14000000 | offset_imm26; |
| memcpy(&bytes[4], &binst, 4); |
| |
| t->write_bytes_helper(patch.first, 4 * 2, bytes); |
| } |
| } |
| |
| void Monkeypatcher::unpatch_syscalls_in(Task *t) { |
| RR_ARCH_FUNCTION(unpatch_syscalls_arch, t->arch(), *this, t); |
| } |
| |
| static string bytes_to_string(uint8_t* bytes, size_t size) { |
| stringstream ss; |
| for (size_t i = 0; i < size; ++i) { |
| if (i > 0) { |
| ss << ' '; |
| } |
| ss << HEX(bytes[i]); |
| } |
| return ss.str(); |
| } |
| |
| static bool task_safe_for_syscall_patching(RecordTask* t, remote_code_ptr start, |
| remote_code_ptr end) { |
| if (t->is_stopped()) { |
| remote_code_ptr ip = t->ip(); |
| if (start <= ip && ip < end) { |
| return false; |
| } |
| } |
| for (auto& e : t->pending_events) { |
| if (e.is_syscall_event()) { |
| remote_code_ptr ip = e.Syscall().regs.ip(); |
| if (start <= ip && ip < end) { |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| static bool safe_for_syscall_patching(remote_code_ptr start, |
| remote_code_ptr end, |
| RecordTask* exclude) { |
| for (auto& p : exclude->session().tasks()) { |
| RecordTask* rt = static_cast<RecordTask*>(p.second); |
| if (rt != exclude && !task_safe_for_syscall_patching(rt, start, end)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| bool Monkeypatcher::try_patch_vsyscall_caller(RecordTask* t, remote_code_ptr ret_addr) |
| { |
| // Emit FLUSH_SYSCALLBUF if there's one pending. |
| // We want our mmap records to be associated with the next (PATCH_SYSCALL) |
| // event, not a FLUSH_SYSCALLBUF event. |
| t->maybe_flush_syscallbuf(); |
| |
| uint8_t bytes[X64VSyscallEntry::size]; |
| remote_ptr<uint8_t> patch_start = ret_addr.to_data_ptr<uint8_t>() - sizeof(bytes); |
| size_t bytes_count = t->read_bytes_fallible(patch_start, sizeof(bytes), bytes); |
| if (bytes_count < sizeof(bytes)) { |
| return false; |
| } |
| uint32_t target_addr = 0; |
| if (!X64VSyscallEntry::match(bytes, &target_addr)) { |
| return false; |
| } |
| uint64_t target_addr_sext = (uint64_t)(int32_t)target_addr; |
| int syscallno = 0; |
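  // The legacy x86-64 vsyscall page maps its entry points at fixed
  // addresses: gettimeofday at 0xffffffffff600000, time at +0x400 and
  // getcpu at +0x800.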
| switch (target_addr_sext) { |
| case 0xffffffffff600000: |
| syscallno = X64Arch::gettimeofday; |
| break; |
| case 0xffffffffff600400: |
| syscallno = X64Arch::time; |
| break; |
| case 0xffffffffff600800: |
| syscallno = X64Arch::getcpu; |
| break; |
| default: |
| return false; |
| } |
| X64VSyscallReplacement::substitute(bytes, syscallno); |
| write_and_record_bytes(t, patch_start, bytes); |
| LOG(debug) << "monkeypatched vsyscall caller at " << patch_start; |
| return true; |
| } |
| |
| static uint64_t jump_patch_size(SupportedArch arch) |
| { |
| switch (arch) { |
| case x86: return X86SysenterVsyscallSyscallHook::size; |
| case x86_64: return X64JumpMonkeypatch::size; |
| case aarch64: return 2*rr::syscall_instruction_length(arch); |
| default: |
| FATAL() << "Unimplemented for this architecture"; |
| return 0; |
| } |
| } |
| |
| const syscall_patch_hook* Monkeypatcher::find_syscall_hook(RecordTask* t, |
| remote_code_ptr ip, |
| bool entering_syscall, |
| size_t instruction_length) { |
| /* we need to inspect this many bytes before the start of the instruction, |
| to find every short jump that might land after it. Conservative. */ |
| static const intptr_t LOOK_BACK = 0x80; |
| /* we need to inspect this many bytes after the start of the instruction, |
| to find every short jump that might land after it into the patch area. |
| Conservative. */ |
| static const intptr_t LOOK_FORWARD = 15 + 15 + 0x80; |
| uint8_t bytes[LOOK_BACK + LOOK_FORWARD]; |
| memset(bytes, 0, sizeof(bytes)); |
| |
| // Split reading the code into separate reads for each page, so that if we can't read |
| // from one page, we still get the data from the other page. |
| ASSERT(t, sizeof(bytes) < page_size()); |
| remote_ptr<uint8_t> code_start = ip.to_data_ptr<uint8_t>() - LOOK_BACK; |
| size_t buf_valid_start_offset = 0; |
| size_t buf_valid_end_offset = sizeof(bytes); |
| ssize_t first_page_bytes = min<size_t>(ceil_page_size(code_start) - code_start, sizeof(bytes)); |
| if (t->read_bytes_fallible(code_start, first_page_bytes, bytes) < first_page_bytes) { |
| buf_valid_start_offset = first_page_bytes; |
| } |
| if (first_page_bytes < (ssize_t)sizeof(bytes)) { |
| if (t->read_bytes_fallible(code_start + first_page_bytes, sizeof(bytes) - first_page_bytes, |
| bytes + first_page_bytes) < (ssize_t)sizeof(bytes) - first_page_bytes) { |
| buf_valid_end_offset = first_page_bytes; |
| } |
| } |
| |
| if (buf_valid_start_offset > LOOK_BACK || |
| buf_valid_end_offset < LOOK_BACK + instruction_length) { |
| ASSERT(t, false) |
| << "Can't read memory containing patchable instruction, why are we trying this?"; |
| } |
| |
| uint8_t* following_bytes = &bytes[LOOK_BACK + instruction_length]; |
| size_t following_bytes_count = buf_valid_end_offset - (LOOK_BACK + instruction_length); |
| size_t preceding_bytes_count = LOOK_BACK - buf_valid_start_offset; |
| |
| for (const auto& hook : syscall_hooks) { |
| bool matches_hook = false; |
| if ((!(hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) && |
| following_bytes_count >= hook.patch_region_length && |
| memcmp(following_bytes, hook.patch_region_bytes, |
| hook.patch_region_length) == 0)) { |
| matches_hook = true; |
| } else if ((hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) && |
| hook.patch_region_length <= preceding_bytes_count && |
| memcmp(bytes + LOOK_BACK - hook.patch_region_length, |
| hook.patch_region_bytes, |
| hook.patch_region_length) == 0) { |
| if (entering_syscall) { |
| // A patch that uses bytes before the syscall can't be done when |
| // entering the syscall, it must be done when exiting. So set a flag on |
| // the Task that tells us to come back later. |
| t->retry_syscall_patching = true; |
| LOG(debug) << "Deferring syscall patching at " << ip << " in " << t |
| << " until syscall exit."; |
| return nullptr; |
| } |
| matches_hook = true; |
| } |
| |
| if (!matches_hook) { |
| continue; |
| } |
| |
| if (!hook_can_ignore_interfering_branches(hook, jump_patch_size(t->arch()))) { |
      // Search for a following short-jump instruction that targets an
      // instruction after the syscall. False positives are OK.
      // glibc-2.23.1-8.fc24.x86_64's __clock_nanosleep needs this.
| bool found_potential_interfering_branch = false; |
| for (size_t i = buf_valid_start_offset; i + 2 <= buf_valid_end_offset; ++i) { |
| uint8_t b = bytes[i]; |
| // Check for short conditional or unconditional jump |
| if (b == 0xeb || (b >= 0x70 && b < 0x80)) { |
| int offset_from_instruction_end = (int)i + 2 + (int8_t)bytes[i + 1] - |
| (LOOK_BACK + instruction_length); |
| if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { |
| if (hook.flags & PATCH_IS_MULTIPLE_INSTRUCTIONS) { |
| found_potential_interfering_branch = |
| offset_from_instruction_end <= -(ssize_t)instruction_length && |
| offset_from_instruction_end > -(ssize_t)(instruction_length + hook.patch_region_length); |
| } else { |
| found_potential_interfering_branch = offset_from_instruction_end == -(ssize_t)instruction_length; |
| } |
| } else { |
| if (hook.flags & PATCH_IS_MULTIPLE_INSTRUCTIONS) { |
| found_potential_interfering_branch = |
| offset_from_instruction_end >= 0 && offset_from_instruction_end < hook.patch_region_length; |
| } else { |
| found_potential_interfering_branch = offset_from_instruction_end == 0; |
| } |
| } |
| if (found_potential_interfering_branch) { |
| LOG(debug) << "Found potential interfering branch at " |
| << ip.to_data_ptr<uint8_t>() - LOOK_BACK + i; |
| break; |
| } |
| } |
| } |
| if (found_potential_interfering_branch) { |
| continue; |
| } |
| } |
| |
| remote_code_ptr start_range, end_range; |
| if (hook.flags & PATCH_SYSCALL_INSTRUCTION_IS_LAST) { |
| start_range = ip - hook.patch_region_length; |
| // if a thread has its RIP at the end of our range, |
| // it could be immediately after a syscall instruction that |
| // will need to be restarted. Patching out that instruction will |
| // prevent the kernel from restarting it. So, extend our range by |
| // one byte to detect such threads. |
| end_range = ip + instruction_length + 1; |
| } else { |
| start_range = ip; |
| end_range = ip + instruction_length + hook.patch_region_length; |
| } |
| if (!safe_for_syscall_patching(start_range, end_range, t)) { |
| LOG(debug) |
| << "Temporarily declining to patch syscall at " << ip |
| << " because a different task has its ip in the patched range"; |
| return nullptr; |
| } |
| LOG(debug) << "Trying to patch bytes " |
| << bytes_to_string( |
| following_bytes, |
| min<size_t>(following_bytes_count, |
| sizeof(syscall_patch_hook::patch_region_bytes))); |
| |
| return &hook; |
| } |
| |
| LOG(debug) << "Failed to find a syscall hook for bytes " |
| << bytes_to_string( |
| following_bytes, |
| min<size_t>(following_bytes_count, |
| sizeof(syscall_patch_hook::patch_region_bytes))); |
| |
| return nullptr; |
| } |
| |
| // Syscalls can be patched either on entry or exit. For most syscall |
| // instruction code patterns we can steal bytes after the syscall instruction |
| // and thus we patch on entry, but some patterns require using bytes from |
| // before the syscall instruction itself and thus can only be patched on exit. |
| // The `entering_syscall` flag tells us whether or not we're at syscall entry. |
| // If we are, and we find a pattern that can only be patched at exit, we'll |
| // set a flag on the RecordTask telling it to try again after syscall exit. |
| bool Monkeypatcher::try_patch_syscall_x86ish(RecordTask* t, bool entering_syscall, |
| SupportedArch arch) { |
| Registers r = t->regs(); |
| remote_code_ptr ip = r.ip(); |
| |
| ASSERT(t, is_x86ish(arch)) << "Unsupported architecture"; |
| |
| size_t instruction_length = rr::syscall_instruction_length(arch); |
| const syscall_patch_hook* hook_ptr = find_syscall_hook(t, ip - instruction_length, |
| entering_syscall, instruction_length); |
| bool success = false; |
| intptr_t syscallno = r.original_syscallno(); |
| if (hook_ptr) { |
| // Get out of executing the current syscall before we patch it. |
| if (entering_syscall && !t->exit_syscall_and_prepare_restart()) { |
| return false; |
| } |
| |
| LOG(debug) << "Patching syscall at " << ip << " syscall " |
| << syscall_name(syscallno, t->arch()) << " tid " << t->tid; |
| |
| success = patch_syscall_with_hook(*this, t, *hook_ptr, ip - instruction_length, instruction_length, 0); |
| if (!success && entering_syscall) { |
| // Need to reenter the syscall to undo exit_syscall_and_prepare_restart |
| t->enter_syscall(); |
| } |
| } |
| |
| if (!success) { |
| if (!t->retry_syscall_patching) { |
| LOG(debug) << "Failed to patch syscall at " << ip << " syscall " |
| << syscall_name(syscallno, t->arch()) << " tid " << t->tid; |
| tried_to_patch_syscall_addresses.insert(ip); |
| } |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Monkeypatcher::try_patch_syscall_aarch64(RecordTask* t, bool entering_syscall) { |
| Registers r = t->regs(); |
| remote_code_ptr ip = r.ip() - 4; |
| |
| uint32_t inst[2] = {0, 0}; |
| size_t bytes_count = t->read_bytes_fallible(ip.to_data_ptr<uint8_t>() - 4, 8, &inst); |
| if (bytes_count < sizeof(inst) || inst[1] != 0xd4000001) { |
| LOG(debug) << "Declining to patch syscall at " |
| << ip << " for unexpected instruction"; |
| tried_to_patch_syscall_addresses.insert(ip); |
| return false; |
| } |
  // movz x8, #0xdc (0xdc == 220 == __NR_clone on aarch64)
  if (inst[0] == 0xd2801b88) {
    // Clone may either make the new and the old task share a stack (vfork)
    // or run the new task on its own stack (pthread_create), and requires
    // special handling in the caller.
    // Our syscall hook cannot do that, so this has to be a raw syscall.
    // We can handle this at runtime, but if we know the call is definitely
    // a clone, we can avoid patching it here.
| LOG(debug) << "Declining to patch clone syscall at " << ip; |
| tried_to_patch_syscall_addresses.insert(ip); |
| return false; |
| } |
| |
| ASSERT(t, (syscall_hooks.size() == 1 && syscall_hooks[0].patch_region_length == 4 && |
| memcmp(syscall_hooks[0].patch_region_bytes, &inst[1], 4) == 0)) |
| << "Unknown syscall hook"; |
| |
| if (!safe_for_syscall_patching(ip, ip + 4, t)) { |
| LOG(debug) |
| << "Temporarily declining to patch syscall at " << ip |
| << " because a different task has its ip in the patched range"; |
| return false; |
| } |
| |
| // Get out of executing the current syscall before we patch it. |
| if (entering_syscall && !t->exit_syscall_and_prepare_restart()) { |
| return false; |
| } |
| |
| LOG(debug) << "Patching syscall at " << ip << " syscall " |
| << syscall_name(r.original_syscallno(), aarch64) << " tid " << t->tid; |
| |
| auto success = patch_syscall_with_hook(*this, t, syscall_hooks[0], ip, 4, 0); |
| if (!success && entering_syscall) { |
| // Need to reenter the syscall to undo exit_syscall_and_prepare_restart |
| if (!t->enter_syscall()) { |
| return false; |
| } |
| } |
| |
| if (!success) { |
| LOG(debug) << "Failed to patch syscall at " << ip << " syscall " |
| << syscall_name(r.original_syscallno(), aarch64) << " tid " << t->tid; |
| tried_to_patch_syscall_addresses.insert(ip); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool Monkeypatcher::try_patch_syscall(RecordTask* t, bool entering_syscall) { |
| if (syscall_hooks.empty()) { |
| // Syscall hooks not set up yet. Don't spew warnings, and don't |
| // fill tried_to_patch_syscall_addresses with addresses that we might be |
| // able to patch later. |
| return false; |
| } |
| if (t->emulated_ptracer) { |
    // Syscall patching can confuse ptracers, which may be surprised to see
    // a syscall instruction at the current IP but then, when running
    // forwards, find that the syscall occurs deep in the preload library
    // instead.
| return false; |
| } |
| if (t->is_in_traced_syscall()) { |
| // Never try to patch the traced-syscall in our preload library! |
| return false; |
| } |
| |
| Registers r = t->regs(); |
| remote_code_ptr ip = r.ip(); |
| // We should not get here for untraced syscalls or anything else from the rr page. |
| // These should be normally prevented by our seccomp filter |
| // and in the case of syscalls interrupted by signals, |
| // the check for the syscall restart should prevent us from reaching here. |
| DEBUG_ASSERT(ip.to_data_ptr<void>() < AddressSpace::rr_page_start() || |
| ip.to_data_ptr<void>() >= AddressSpace::rr_page_end()); |
| if (tried_to_patch_syscall_addresses.count(ip) || is_jump_stub_instruction(ip, true)) { |
| return false; |
| } |
| |
| // We could examine the current syscall number and if it's not one that |
| // we support syscall buffering for, refuse to patch the syscall instruction. |
| // This would, on the face of it, reduce overhead since patching the |
| // instruction just means a useless trip through the syscall buffering logic. |
| // However, it actually wouldn't help much since we'd still do a switch |
| // on the syscall number in this function instead, and due to context |
| // switching costs any overhead saved would be insignificant. |
| // Also, implementing that would require keeping a buffered-syscalls |
| // list in sync with the preload code, which is unnecessary complexity. |
| |
| SupportedArch arch; |
| if (!get_syscall_instruction_arch( |
| t, ip.decrement_by_syscall_insn_length(t->arch()), &arch) || |
| arch != t->arch()) { |
| LOG(debug) << "Declining to patch cross-architecture syscall at " << ip; |
| tried_to_patch_syscall_addresses.insert(ip); |
| return false; |
| } |
| |
| // Emit FLUSH_SYSCALLBUF if there's one pending. |
| // We want our mmap records to be associated with the next (PATCH_SYSCALL) |
| // event, not a FLUSH_SYSCALLBUF event. |
| t->maybe_flush_syscallbuf(); |
| if (!t->is_stopped()) { |
| // Tracee was unexpectedly kicked out of a ptrace-stop by SIGKILL or |
| // equivalent. Abort trying to patch. |
| return false; |
| } |
| |
| if (arch == aarch64) { |
| return try_patch_syscall_aarch64(t, entering_syscall); |
| } |
| return try_patch_syscall_x86ish(t, entering_syscall, arch); |
| } |
| |
| bool Monkeypatcher::try_patch_trapping_instruction(RecordTask* t, size_t instruction_length, |
| bool before_instruction) { |
| if (syscall_hooks.empty()) { |
| // Syscall hooks not set up yet. Don't spew warnings, and don't |
| // fill tried_to_patch_syscall_addresses with addresses that we might be |
| // able to patch later. |
| return false; |
| } |
| if (t->emulated_ptracer) { |
| // Patching can confuse ptracers. |
| return false; |
| } |
| |
| Registers r = t->regs(); |
| remote_code_ptr ip_of_instruction = r.ip() - (before_instruction ? 0 : instruction_length); |
| if (tried_to_patch_syscall_addresses.count(ip_of_instruction + instruction_length)) { |
| return false; |
| } |
| |
| // Emit FLUSH_SYSCALLBUF if there's one pending. |
| // We want our mmap records to be associated with the next (PATCH_SYSCALL) |
| // event, not a FLUSH_SYSCALLBUF event. |
| t->maybe_flush_syscallbuf(); |
| |
| const syscall_patch_hook* hook_ptr = |
| find_syscall_hook(t, ip_of_instruction, before_instruction, instruction_length); |
| bool success = false; |
| if (hook_ptr) { |
| LOG(debug) << "Patching trapping instruction at " << ip_of_instruction << " tid " << t->tid; |
| |
| success = patch_syscall_with_hook(*this, t, *hook_ptr, ip_of_instruction, |
| instruction_length, SYS_rrcall_rdtsc); |
| } |
| |
| if (!success) { |
| if (!t->retry_syscall_patching) { |
| LOG(debug) << "Failed to patch trapping instruction at " << ip_of_instruction << " tid " << t->tid; |
| tried_to_patch_syscall_addresses.insert(ip_of_instruction + instruction_length); |
| } |
| return false; |
| } |
| |
| return true; |
| } |
| |
// VDSOs are filled with overhead-critical functions related to getting the
// time and the current CPU. We need to ensure that these functions get
// redirected into actual trap-into-the-kernel syscalls so rr can intercept
// them.
| |
| template <typename Arch> |
| static void patch_after_exec_arch(RecordTask* t, Monkeypatcher& patcher); |
| |
| template <typename Arch> |
| static void patch_at_preload_init_arch(RecordTask* t, Monkeypatcher& patcher); |
| |
| template <> |
| void patch_after_exec_arch<X86Arch>(RecordTask* t, Monkeypatcher& patcher) { |
| (void)patcher; |
| setup_preload_library_path<X86Arch>(t); |
| setup_audit_library_path<X86Arch>(t); |
| |
| if (!t->vm()->has_vdso()) { |
| patch_auxv_vdso(t, AT_SYSINFO_EHDR, AT_IGNORE); |
| } else { |
| size_t librrpage_base = RR_PAGE_ADDR - AddressSpace::RRPAGE_RECORD_PAGE_OFFSET*PRELOAD_LIBRARY_PAGE_SIZE; |
| patch_auxv_vdso(t, AT_SYSINFO_EHDR, librrpage_base); |
| patch_auxv_vdso(t, X86Arch::RR_AT_SYSINFO, librrpage_base + |
| AddressSpace::RRVDSO_PAGE_OFFSET*PRELOAD_LIBRARY_PAGE_SIZE); |
| } |
| } |
| |
| // Monkeypatch x86 vsyscall hook only after the preload library |
| // has initialized. The vsyscall hook expects to be able to use the syscallbuf. |
| // Before the preload library has initialized, the regular vsyscall code |
| // will trigger ptrace traps and be handled correctly by rr. |
| template <> |
| void patch_at_preload_init_arch<X86Arch>(RecordTask* t, |
| Monkeypatcher& patcher) { |
| auto params = t->read_mem( |
| remote_ptr<rrcall_init_preload_params<X86Arch>>(t->regs().arg1())); |
| if (!params.syscallbuf_enabled) { |
| return; |
| } |
| |
| patcher.init_dynamic_syscall_patching(t, params.syscall_patch_hook_count, |
| params.syscall_patch_hooks); |
| } |
| |
| template <> |
| void patch_after_exec_arch<X64Arch>(RecordTask* t, Monkeypatcher& patcher) { |
| setup_preload_library_path<X64Arch>(t); |
| setup_audit_library_path<X64Arch>(t); |
| |
| for (const auto& m : t->vm()->maps()) { |
| auto& km = m.map; |
| patcher.patch_after_mmap(t, km.start(), km.size(), |
| km.file_offset_bytes(), -1, |
| Monkeypatcher::MMAP_EXEC); |
| } |
| |
| if (!t->vm()->has_vdso()) { |
| patch_auxv_vdso(t, AT_SYSINFO_EHDR, AT_IGNORE); |
| } else { |
| size_t librrpage_base = RR_PAGE_ADDR - AddressSpace::RRPAGE_RECORD_PAGE_OFFSET*PRELOAD_LIBRARY_PAGE_SIZE; |
| patch_auxv_vdso(t, AT_SYSINFO_EHDR, librrpage_base); |
| } |
| } |
| |
| template <> |
| void patch_after_exec_arch<ARM64Arch>(RecordTask* t, Monkeypatcher& patcher) { |
| setup_preload_library_path<ARM64Arch>(t); |
| setup_audit_library_path<ARM64Arch>(t); |
| |
| for (const auto& m : t->vm()->maps()) { |
| auto& km = m.map; |
| patcher.patch_after_mmap(t, km.start(), km.size(), |
| km.file_offset_bytes(), -1, |
| Monkeypatcher::MMAP_EXEC); |
| } |
| |
| if (!t->vm()->has_vdso()) { |
| patch_auxv_vdso(t, AT_SYSINFO_EHDR, AT_IGNORE); |
| } else { |
| size_t librrpage_base = RR_PAGE_ADDR - AddressSpace::RRPAGE_RECORD_PAGE_OFFSET*PRELOAD_LIBRARY_PAGE_SIZE; |
| patch_auxv_vdso(t, AT_SYSINFO_EHDR, librrpage_base); |
| } |
| } |
| |
| template <> |
| void patch_at_preload_init_arch<X64Arch>(RecordTask* t, |
| Monkeypatcher& patcher) { |
| auto params = t->read_mem( |
| remote_ptr<rrcall_init_preload_params<X64Arch>>(t->regs().arg1())); |
| if (!params.syscallbuf_enabled) { |
| return; |
| } |
| |
| patcher.init_dynamic_syscall_patching(t, params.syscall_patch_hook_count, |
| params.syscall_patch_hooks); |
| } |
| |
| template <> |
| void patch_at_preload_init_arch<ARM64Arch>(RecordTask* t, |
| Monkeypatcher& patcher) { |
| auto params = t->read_mem( |
| remote_ptr<rrcall_init_preload_params<ARM64Arch>>(t->regs().orig_arg1())); |
| if (!params.syscallbuf_enabled) { |
| return; |
| } |
| |
| patcher.init_dynamic_syscall_patching(t, params.syscall_patch_hook_count, |
| params.syscall_patch_hooks); |
| } |
| |
| void Monkeypatcher::patch_after_exec(RecordTask* t) { |
| ASSERT(t, 1 == t->vm()->task_set().size()) |
| << "Can't have multiple threads immediately after exec!"; |
| |
| RR_ARCH_FUNCTION(patch_after_exec_arch, t->arch(), t, *this); |
| } |
| |
| void Monkeypatcher::patch_at_preload_init(RecordTask* t) { |
| // NB: the tracee can't be interrupted with a signal while |
| // we're processing the rrcall, because it's masked off all |
| // signals. |
| RR_ARCH_FUNCTION(patch_at_preload_init_arch, t->arch(), t, *this); |
| } |
| |
| static remote_ptr<void> resolve_address(ElfReader& reader, uintptr_t elf_addr, |
| remote_ptr<void> map_start, |
| size_t map_size, |
| uintptr_t map_offset) { |
| uintptr_t file_offset; |
  if (!reader.addr_to_offset(elf_addr, file_offset)) {
    LOG(warn) << "ELF address " << HEX(elf_addr) << " not in file";
    return nullptr;
  }
| if (file_offset < map_offset || file_offset + 32 > map_offset + map_size) { |
| // The value(s) to be set are outside the mapped range. This happens |
| // because code and data can be mapped in separate, partial mmaps in which |
| // case some symbols will be outside the mapped range. |
| return nullptr; |
| } |
| return map_start + uintptr_t(file_offset - map_offset); |
| } |
| |
| static void set_and_record_bytes(RecordTask* t, ElfReader& reader, |
| uintptr_t elf_addr, const void* bytes, |
| size_t size, remote_ptr<void> map_start, |
| size_t map_size, size_t map_offset) { |
| remote_ptr<void> addr = |
| resolve_address(reader, elf_addr, map_start, map_size, map_offset); |
| if (!addr) { |
| return; |
| } |
| bool ok = true; |
| t->write_bytes_helper(addr, size, bytes, &ok); |
| // Writing can fail when the value appears to be in the mapped range, but it |
| // actually is beyond the file length. |
| if (ok) { |
| t->record_local(addr, size, bytes); |
| } |
| } |
| |
| /** |
| * Patch _dl_runtime_resolve_(fxsave,xsave,xsavec) to clear "FDP Data Pointer" |
| * register so that CPU-specific behaviors involving that register don't leak |
| * into stack memory. |
| */ |
| static void patch_dl_runtime_resolve(Monkeypatcher& patcher, |
| RecordTask* t, ElfReader& reader, |
| uintptr_t elf_addr, |
| remote_ptr<void> map_start, |
| size_t map_size, |
| size_t map_offset) { |
| if (t->arch() != x86_64) { |
| return; |
| } |
| remote_ptr<void> addr = |
| resolve_address(reader, elf_addr, map_start, map_size, map_offset); |
| if (!addr) { |
| return; |
| } |
| |
| uint8_t impl[X64DLRuntimeResolve::size + X64EndBr::size]; |
| uint8_t *impl_start = impl; |
| t->read_bytes(addr, impl); |
| if (X64EndBr::match(impl) || X86EndBr::match(impl)) { |
    DEBUG_ASSERT(X64EndBr::size == X86EndBr::size);
| LOG(debug) << "Starts with endbr, skipping"; |
| addr += X64EndBr::size; |
| impl_start += X64EndBr::size; |
| } |
| |
| if (!X64DLRuntimeResolve::match(impl_start) && |
| !X64DLRuntimeResolve2::match(impl_start)) { |
| LOG(warn) << "_dl_runtime_resolve implementation doesn't look right"; |
| return; |
| } |
| |
| uint8_t call_patch[X64CallMonkeypatch::size]; |
| // We're patching in a relative call, so we need to compute the offset from |
| // the end of the call to our actual destination. |
| auto call_patch_start = addr.cast<uint8_t>(); |
| auto call_patch_end = call_patch_start + sizeof(call_patch); |
| |
| remote_ptr<uint8_t> extended_call_start = |
| allocate_extended_jump_x86ish<X64DLRuntimeResolvePrelude>( |
| t, patcher.extended_jump_pages, call_patch_end); |
| if (extended_call_start.is_null()) { |
| return; |
| } |
| uint8_t stub_patch[X64DLRuntimeResolvePrelude::size]; |
| X64DLRuntimeResolvePrelude::substitute(stub_patch); |
| write_and_record_bytes(t, extended_call_start, stub_patch); |
| |
| intptr_t call_offset = extended_call_start - call_patch_end; |
| int32_t call_offset32 = (int32_t)call_offset; |
| ASSERT(t, call_offset32 == call_offset) |
| << "allocate_extended_jump_x86ish didn't work"; |
| X64CallMonkeypatch::substitute(call_patch, call_offset32); |
| write_and_record_bytes(t, call_patch_start, call_patch); |
| |
| // pad with NOPs to the next instruction |
| static const uint8_t NOP = 0x90; |
| uint8_t nops[X64DLRuntimeResolve::size - sizeof(call_patch)]; |
| memset(nops, NOP, sizeof(nops)); |
| write_and_record_mem(t, call_patch_start + sizeof(call_patch), nops, |
| sizeof(nops)); |
| } |
| |
| static bool file_may_need_instrumentation(const AddressSpace::Mapping& map) { |
| size_t file_part = map.map.fsname().rfind('/'); |
| if (file_part == string::npos) { |
| file_part = 0; |
| } else { |
| ++file_part; |
| } |
| const string& fsname = map.map.fsname(); |
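  // For example, "libpthread-2.27.so" and "ld-linux-x86-64.so.2" both
  // match. False positives (any basename containing "ld") are harmless;
  // they just cost an extra ELF symbol scan in patch_after_mmap.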
| return fsname.find("libpthread", file_part) != string::npos || |
| fsname.find("ld", file_part) != string::npos; |
| } |
| |
| void Monkeypatcher::patch_after_mmap(RecordTask* t, remote_ptr<void> start, |
| size_t size, size_t offset_bytes, |
| int child_fd, MmapMode mode) { |
| const auto& map = t->vm()->mapping_of(start); |
| if (file_may_need_instrumentation(map) && |
| (t->arch() == x86 || t->arch() == x86_64)) { |
| ScopedFd open_fd; |
| if (child_fd >= 0) { |
| open_fd = t->open_fd(child_fd, O_RDONLY); |
| ASSERT(t, open_fd.is_open()) << "Failed to open child fd " << child_fd; |
| } else { |
| char buf[100]; |
| sprintf(buf, "/proc/%d/map_files/%llx-%llx", t->tid, |
| (long long)start.as_int(), (long long)start.as_int() + size); |
| // Reading these directly requires CAP_SYS_ADMIN, so open the link target |
| // instead. |
| char link[PATH_MAX]; |
| int ret = readlink(buf, link, sizeof(link) - 1); |
| if (ret < 0) { |
| return; |
| } |
| link[ret] = 0; |
| open_fd = ScopedFd(link, O_RDONLY); |
| if (!open_fd.is_open()) { |
| return; |
| } |
| } |
| ElfFileReader reader(open_fd, t->arch()); |
| // Check for symbols first in the library itself, regardless of whether |
| // there is a debuglink. For example, on Fedora 26, the .symtab and |
| // .strtab sections are stripped from the debuginfo file for |
| // libpthread.so. |
| SymbolTable syms = reader.read_symbols(".symtab", ".strtab"); |
| if (syms.size() == 0) { |
| ScopedFd debug_fd = reader.open_debug_file(map.map.fsname()); |
| if (debug_fd.is_open()) { |
| ElfFileReader debug_reader(debug_fd, t->arch()); |
| syms = debug_reader.read_symbols(".symtab", ".strtab"); |
| } |
| } |
| for (size_t i = 0; i < syms.size(); ++i) { |
| if (syms.is_name(i, "__elision_aconf")) { |
| static const int zero = 0; |
| // Setting __elision_aconf.retry_try_xbegin to zero means that |
| // pthread rwlocks don't try to use elision at all. See ELIDE_LOCK |
| // in glibc's elide.h. |
| set_and_record_bytes(t, reader, syms.addr(i) + 8, &zero, sizeof(zero), |
| start, size, offset_bytes); |
| } |
| if (syms.is_name(i, "elision_init")) { |
| // Make elision_init return without doing anything. This means |
| // the __elision_available and __pthread_force_elision flags will |
| // remain zero, disabling elision for mutexes. See glibc's |
| // elision-conf.c. |
| static const uint8_t ret = 0xC3; |
| set_and_record_bytes(t, reader, syms.addr(i), &ret, sizeof(ret), start, |
| size, offset_bytes); |
| } |
| // The following operations can only be applied once because after the |
| // patch is applied the code no longer matches the expected template. |
| // For replaying a replay to work, we need to only apply these changes |
| // during a real exec, not during the mmap operations performed when rr |
| // replays an exec. |
| if (mode == MMAP_EXEC && |
| (syms.is_name(i, "_dl_runtime_resolve_fxsave") || |
| syms.is_name(i, "_dl_runtime_resolve_xsave") || |
| syms.is_name(i, "_dl_runtime_resolve_xsavec"))) { |
| patch_dl_runtime_resolve(*this, t, reader, syms.addr(i), start, size, |
| offset_bytes); |
| } |
| } |
| } |
| } |
| |
| } // namespace rr |