/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
#include "AddressSpace.h"
#include <limits.h>
#include <linux/kdev_t.h>
#include <linux/prctl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <limits>
#include "rr/rr.h"
#include "preload/preload_interface.h"
#include "AutoRemoteSyscalls.h"
#include "MonitoredSharedMemory.h"
#include "RecordSession.h"
#include "RecordTask.h"
#include "Session.h"
#include "Task.h"
#include "core.h"
#include "log.h"
using namespace std;
namespace rr {
static const uint8_t x86_breakpoint_insn[] = { 0xcc }; // int $3
static const uint8_t arm64_breakpoint_insn[4] = {0x0, 0x0, 0x20, 0xd4}; // brk #0
static const uint8_t *breakpoint_insn(SupportedArch arch) {
switch (arch) {
case x86:
case x86_64:
return x86_breakpoint_insn;
case aarch64:
return arm64_breakpoint_insn;
default:
DEBUG_ASSERT(0 && "Must define breakpoint insn for this architecture");
return nullptr;
}
}
/**
* Advance *str to skip leading blank characters.
*/
static const char* trim_leading_blanks(const char* str) {
const char* trimmed = str;
while (isblank(*trimmed)) {
++trimmed;
}
return trimmed;
}
/**
* Returns true if a task in t's thread-group other than t is doing an exec.
*/
static bool thread_group_in_exec(Task* t) {
if (!t->session().is_recording()) {
return false;
}
for (Task* tt : t->thread_group()->task_set()) {
if (tt == t || tt->already_exited()) {
continue;
}
RecordTask* rt = static_cast<RecordTask*>(tt);
Event& ev = rt->ev();
if (ev.is_syscall_event() && ev.Syscall().is_exec()) {
return true;
}
}
return false;
}
KernelMapIterator::KernelMapIterator(Task* t, bool* ok)
: tid(t->tid) {
// See https://lkml.org/lkml/2016/9/21/423
ASSERT(t, !thread_group_in_exec(t)) << "Task-group in execve, so reading "
"/proc/.../maps may trigger kernel "
"deadlock!";
init(ok);
}
KernelMapIterator::~KernelMapIterator() {
if (maps_file) {
fclose(maps_file);
}
}
void KernelMapIterator::init(bool* ok) {
char maps_path[PATH_MAX];
sprintf(maps_path, "/proc/%d/maps", tid);
if (ok) {
*ok = true;
}
if (!(maps_file = fopen(maps_path, "r"))) {
if (ok) {
*ok = false;
} else {
FATAL() << "Failed to open " << maps_path;
}
}
++*this;
}
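/**
* Parse the next line of /proc/<tid>/maps into |km|. At end of file the
* maps file is closed and nulled out, which is the at_end() state.
*/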
void KernelMapIterator::operator++() {
char line[PATH_MAX * 2];
if (!fgets(line, sizeof(line), maps_file)) {
fclose(maps_file);
maps_file = nullptr;
return;
}
uint64_t start, end, offset, inode;
int dev_major, dev_minor;
char flags[32];
int chars_scanned;
int nparsed = sscanf(line, "%" SCNx64 "-%" SCNx64 " %31s %" SCNx64
" %x:%x %" SCNu64 " %n",
&start, &end, flags, &offset, &dev_major, &dev_minor,
&inode, &chars_scanned);
DEBUG_ASSERT(8 /*number of info fields*/ == nparsed ||
7 /*num fields if name is blank*/ == nparsed);
// trim trailing newline, if any
int last_char = strlen(line) - 1;
if (line[last_char] == '\n') {
line[last_char] = 0;
}
raw_line = line;
const char* name = trim_leading_blanks(line + chars_scanned);
#if defined(__i386__)
if (start > numeric_limits<uint32_t>::max() ||
end > numeric_limits<uint32_t>::max() ||
strcmp(name, "[vsyscall]") == 0) {
// We manually read the exe link here because
// this helper is used to set
// |t->vm()->exe_image()|, so we can't rely on
// that being correct yet.
char proc_exe[PATH_MAX];
char exe[PATH_MAX];
snprintf(proc_exe, sizeof(proc_exe), "/proc/%d/exe", tid);
ssize_t size = readlink(proc_exe, exe, sizeof(exe));
if (size < 0) {
FATAL() << "readlink failed";
}
FATAL() << "Sorry, tracee " << tid << " has x86-64 image " << exe
<< " and that's not supported with a 32-bit rr.";
}
#endif
int prot = (strchr(flags, 'r') ? PROT_READ : 0) |
(strchr(flags, 'w') ? PROT_WRITE : 0) |
(strchr(flags, 'x') ? PROT_EXEC : 0);
int f = (strchr(flags, 'p') ? MAP_PRIVATE : 0) |
(strchr(flags, 's') ? MAP_SHARED : 0);
string tmp_name;
if (strchr(name, '\\')) {
// Unescape any '\012' sequences
while (*name) {
if (strncmp(name, "\\012", 4) == 0) {
tmp_name.push_back('\n');
name += 4;
} else {
tmp_name.push_back(*name);
++name;
}
}
name = tmp_name.c_str();
}
km = KernelMapping(start, end, name, MKDEV(dev_major, dev_minor), inode, prot,
f, offset);
}
static KernelMapping read_kernel_mapping(pid_t tid, remote_ptr<void> addr) {
MemoryRange range(addr, 1);
bool ok;
KernelMapIterator it(tid, &ok);
if (!ok) {
return KernelMapping();
}
for (; !it.at_end(); ++it) {
const KernelMapping& km = it.current();
if (km.contains(range)) {
return km;
}
}
return KernelMapping();
}
KernelMapping AddressSpace::read_kernel_mapping(Task* t,
remote_ptr<void> addr) {
return rr::read_kernel_mapping(t->tid, addr);
}
KernelMapping AddressSpace::read_local_kernel_mapping(uint8_t* addr) {
return rr::read_kernel_mapping(getpid(), remote_ptr<void>((uintptr_t)addr));
}
/**
* Cat the /proc/[t->tid]/maps file to stderr, line by line.
*/
void AddressSpace::print_process_maps(Task* t) {
for (KernelMapIterator it(t); !it.at_end(); ++it) {
string line;
it.current(&line);
cerr << line << '\n';
}
}
AddressSpace::Mapping::Mapping(const KernelMapping& map,
const KernelMapping& recorded_map,
EmuFile::shr_ptr emu_file,
std::unique_ptr<struct stat> mapped_file_stat,
void* local_addr,
shared_ptr<MonitoredSharedMemory>&& monitored)
: map(map),
recorded_map(recorded_map),
emu_file(emu_file),
mapped_file_stat(std::move(mapped_file_stat)),
local_addr(static_cast<uint8_t*>(local_addr)),
monitored_shared_memory(std::move(monitored)),
flags(FLAG_NONE) {}
static unique_ptr<struct stat> clone_stat(
const unique_ptr<struct stat>& other) {
return other ? unique_ptr<struct stat>(new struct stat(*other)) : nullptr;
}
AddressSpace::Mapping::Mapping(const Mapping& other)
: map(other.map),
recorded_map(other.recorded_map),
emu_file(other.emu_file),
mapped_file_stat(clone_stat(other.mapped_file_stat)),
local_addr(other.local_addr),
monitored_shared_memory(other.monitored_shared_memory),
flags(other.flags) {}
AddressSpace::Mapping::~Mapping() {}
AddressSpace::~AddressSpace() {
for (auto& m : mem) {
if (m.second.local_addr) {
int ret = munmap(m.second.local_addr, m.second.map.size());
if (ret < 0) {
FATAL() << "Can't munmap";
}
}
}
session_->on_destroy(this);
}
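/**
* Scan a copy of the vdso for a syscall instruction and return its offset,
* or 0 if none was found. (The scan starts at offset 1, so 0 is never a
* valid hit and can serve as the "not found" value.)
*/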
static uint32_t find_offset_of_syscall_instruction_in(SupportedArch arch,
uint8_t* vdso_data,
size_t vdso_len) {
auto instruction = syscall_instruction(arch);
for (uint32_t i = 1; i < vdso_len - instruction.size(); ++i) {
if (memcmp(vdso_data + i, instruction.data(), instruction.size()) == 0) {
return i;
}
}
return 0;
}
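// Per-architecture cache of the syscall-instruction offset within the vdso,
// filled in lazily by find_syscall_instruction().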
uint32_t AddressSpace::offset_to_syscall_in_vdso[SupportedArch_MAX + 1];
remote_code_ptr AddressSpace::find_syscall_instruction(Task* t) {
SupportedArch arch = t->arch();
// This assert holds even when --unmap-vdso is passed, because this is only
// called at the start of process_execve, before we unmap the vdso. After
// the rr page is mapped in, we use the syscall instructions it contains.
ASSERT(t, has_vdso()) << "Kernel with vDSO disabled?";
if (!offset_to_syscall_in_vdso[arch]) {
auto vdso_data = t->read_mem(vdso().start().cast<uint8_t>(), vdso().size());
offset_to_syscall_in_vdso[arch] = find_offset_of_syscall_instruction_in(
arch, vdso_data.data(), vdso_data.size());
ASSERT(t, offset_to_syscall_in_vdso[arch])
<< "No syscall instruction found in VDSO";
}
return remote_code_ptr(
(vdso().start().cast<uint8_t>() + offset_to_syscall_in_vdso[arch])
.as_int());
}
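/**
* Map the rr page from the librrpage library into the tracee at its fixed
* address. The library contains both recording and replay variants of the
* page; the right one is selected via the file offset. During recording we
* also map the library pages preceding the rr page, and query the initial
* program break while we have a remote-syscall context available.
*/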
void AddressSpace::map_rr_page(AutoRemoteSyscalls& remote) {
int prot = PROT_EXEC | PROT_READ;
int flags = MAP_PRIVATE | MAP_FIXED;
string file_name;
Task* t = remote.task();
SupportedArch arch = t->arch();
const char *fname = nullptr;
switch (t->arch()) {
case x86_64:
case aarch64:
fname = RRPAGE_LIB_FILENAME;
break;
case x86:
#if defined(__x86_64__)
fname = RRPAGE_LIB_FILENAME_32;
#else
fname = RRPAGE_LIB_FILENAME;
#endif
break;
}
string path = find_helper_library(fname);
if (path.empty()) {
FATAL() << "Failed to locate " << fname;
}
path += fname;
size_t offset_pages = t->session().is_recording() ?
RRPAGE_RECORD_PAGE_OFFSET : RRPAGE_REPLAY_PAGE_OFFSET;
size_t offset_bytes = offset_pages * PRELOAD_LIBRARY_PAGE_SIZE;
{
ScopedFd page(path.c_str(), O_RDONLY);
ASSERT(t, page.is_open()) << "Failed to open rrpage library " << path;
int child_fd = remote.infallible_send_fd_if_alive(page);
if (child_fd >= 0) {
if (t->session().is_recording()) {
remote.infallible_mmap_syscall_if_alive(rr_page_start() - offset_bytes, offset_bytes, prot, flags,
child_fd, 0);
}
remote.infallible_mmap_syscall_if_alive(rr_page_start(), PRELOAD_LIBRARY_PAGE_SIZE, prot, flags,
child_fd, offset_bytes);
struct stat fstat = t->stat_fd(child_fd);
file_name = t->file_name_of_fd(child_fd);
remote.infallible_close_syscall_if_alive(child_fd);
map(t, rr_page_start(), PRELOAD_LIBRARY_PAGE_SIZE, prot, flags,
offset_bytes, file_name,
fstat.st_dev, fstat.st_ino);
mapping_flags_of(rr_page_start()) = Mapping::IS_RR_PAGE;
if (t->session().is_recording()) {
map(t, rr_page_start() - offset_bytes, offset_bytes, prot, flags,
0, file_name,
fstat.st_dev, fstat.st_ino);
}
}
}
if (t->session().is_recording()) {
// brk() will not have been called yet so the brk area is empty.
brk_start = brk_end =
remote.infallible_syscall(syscall_number_for_brk(arch), 0);
ASSERT(t, !brk_end.is_null());
}
traced_syscall_ip_ = rr_page_syscall_entry_point(
TRACED, UNPRIVILEGED, RECORDING_AND_REPLAY, t->arch());
privileged_traced_syscall_ip_ = rr_page_syscall_entry_point(
TRACED, PRIVILEGED, RECORDING_AND_REPLAY, t->arch());
}
void AddressSpace::unmap_all_but_rr_mappings(AutoRemoteSyscalls& remote,
UnmapOptions options) {
vector<MemoryRange> unmaps;
for (const auto& m : maps()) {
// Do not attempt to unmap [vsyscall] --- it doesn't work.
if (m.map.start() != AddressSpace::rr_page_start() &&
m.map.start() != AddressSpace::preload_thread_locals_start() &&
!m.map.is_vsyscall() &&
(!options.exclude_vdso_vvar || (!m.map.is_vdso() && !m.map.is_vvar()))) {
unmaps.push_back(m.map);
}
}
for (auto& m : unmaps) {
remote.infallible_syscall(syscall_number_for_munmap(remote.task()->arch()),
m.start(), m.size());
unmap(remote.task(), m.start(), m.size());
}
}
/**
* Must match generate_rr_page.py
*/
static const AddressSpace::SyscallType entry_points[] = {
{ AddressSpace::TRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY },
{ AddressSpace::TRACED, AddressSpace::PRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY },
{ AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY },
{ AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::REPLAY_ONLY },
{ AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::RECORDING_ONLY },
{ AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
AddressSpace::RECORDING_AND_REPLAY },
{ AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
AddressSpace::REPLAY_ONLY },
{ AddressSpace::UNTRACED, AddressSpace::PRIVILEGED,
AddressSpace::RECORDING_ONLY },
{ AddressSpace::UNTRACED, AddressSpace::UNPRIVILEGED,
AddressSpace::REPLAY_ASSIST },
};
static int rr_page_syscall_stub_size(SupportedArch arch) {
int val = 0;
switch (arch) {
case x86:
case x86_64:
val = 3;
break;
case aarch64:
val = 8;
break;
default:
FATAL() << "Syscall stub size not defined for this architecture";
}
if (arch == NativeArch::arch()) {
DEBUG_ASSERT(val == RR_PAGE_SYSCALL_STUB_SIZE);
}
return val;
}
static int rr_page_syscall_instruction_end(SupportedArch arch) {
int val = 0;
switch (arch) {
case x86:
case x86_64:
val = 2;
break;
case aarch64:
val = 4;
break;
default:
FATAL() << "Syscall stub size not defined for this architecture";
}
if (arch == NativeArch::arch()) {
DEBUG_ASSERT(val == RR_PAGE_SYSCALL_INSTRUCTION_END);
}
return val;
}
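/**
* Syscall stubs are laid out contiguously in the rr page starting at
* RR_PAGE_ADDR, in the order given by |entry_points|. A stub's "exit point"
* is the address immediately after its syscall instruction.
*/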
static remote_code_ptr entry_ip_from_index(SupportedArch arch, size_t i) {
return remote_code_ptr(RR_PAGE_ADDR + rr_page_syscall_stub_size(arch) * i);
}
static remote_code_ptr exit_ip_from_index(SupportedArch arch, size_t i) {
return remote_code_ptr(RR_PAGE_ADDR + rr_page_syscall_stub_size(arch) * i +
rr_page_syscall_instruction_end(arch));
}
remote_code_ptr AddressSpace::rr_page_syscall_exit_point(Traced traced,
Privileged privileged,
Enabled enabled,
SupportedArch arch) {
for (auto& e : entry_points) {
if (e.traced == traced && e.privileged == privileged &&
e.enabled == enabled) {
return exit_ip_from_index(arch, &e - entry_points);
}
}
return nullptr;
}
remote_code_ptr AddressSpace::rr_page_syscall_entry_point(Traced traced,
Privileged privileged,
Enabled enabled,
SupportedArch arch) {
for (auto& e : entry_points) {
if (e.traced == traced && e.privileged == privileged &&
e.enabled == enabled) {
return entry_ip_from_index(arch, &e - entry_points);
}
}
return nullptr;
}
const AddressSpace::SyscallType* AddressSpace::rr_page_syscall_from_exit_point(
SupportedArch arch, remote_code_ptr ip) {
for (size_t i = 0; i < array_length(entry_points); ++i) {
if (exit_ip_from_index(arch, i) == ip) {
return &entry_points[i];
}
}
return nullptr;
}
const AddressSpace::SyscallType* AddressSpace::rr_page_syscall_from_entry_point(
SupportedArch arch, remote_code_ptr ip) {
for (size_t i = 0; i < array_length(entry_points); ++i) {
if (entry_ip_from_index(arch, i) == ip) {
return &entry_points[i];
}
}
return nullptr;
}
vector<AddressSpace::SyscallType> AddressSpace::rr_page_syscalls() {
vector<SyscallType> result;
for (auto& e : entry_points) {
result.push_back(e);
}
return result;
}
void AddressSpace::save_auxv(Task* t) {
saved_auxv_ = read_auxv(t);
save_interpreter_base(t, saved_auxv());
}
void AddressSpace::save_interpreter_base(Task* t, std::vector<uint8_t> auxv) {
saved_interpreter_base_ = read_interpreter_base(auxv);
save_ld_path(t, saved_interpreter_base());
}
void AddressSpace::save_ld_path(Task* t, remote_ptr<void> interpreter_base) {
saved_ld_path_ = read_ld_path(t, interpreter_base);
}
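/**
* Fill in a prctl_mm_map (the structure taken by prctl(PR_SET_MM_MAP)) by
* parsing /proc/<pid>/stat for the code/data/stack/brk/arg/env boundaries.
* exe_fd and auxv are handled elsewhere.
*/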
void AddressSpace::read_mm_map(Task* t, NativeArch::prctl_mm_map* map) {
char buf[PATH_MAX+1024];
{
string proc_stat = t->proc_stat_path();
ScopedFd fd(proc_stat.c_str(), O_RDONLY);
memset(buf, 0, sizeof(buf));
int err = read_to_end(fd, 0, buf, sizeof(buf)-1);
if (err < 0) {
FATAL() << "Failed to read /proc/<pid>/stat";
}
}
// The last close-paren indicates the end of the comm and the
// start of the fixed-width area
char* fixed = strrchr(buf, ')');
// We don't change /proc/pid/exe, since we're unlikely to have CAP_SYS_ADMIN
map->exe_fd = -1;
// auxv is restored separately
map->auxv.val = 0;
map->auxv_size = 0;
// Fields of /proc/pid/stat that we don't (currently) use
char state;
pid_t ppid;
pid_t pgrp;
int session;
int tty_nr;
int tpgid;
unsigned int flags;
unsigned long minflt, cminflt, majflt, cmajflt, utime, stime;
long cutime, cstime, priority, nice, num_threads, itrealvalue;
unsigned long long starttime;
unsigned long vsize;
long rss;
unsigned long rsslim, kstkesp, kstkeip, signal;
unsigned long blocked, sigignore, sigcatch, wchan, nswap, cnswap;
int exit_signal, processor;
unsigned int rt_priority, policy;
unsigned long long delayacct_blkio_ticks;
unsigned long guest_time;
long cguest_time;
int exit_code;
// See the proc(5) man page for the correct scan codes for these
size_t n = sscanf(fixed + 1,
// state ppid pgrp session tty_nr tpgid
" %c %d %d %d %d %d"
// flags minflt cminflt majflt cmajflt utime stime cutime cstime
" %u %lu %lu %lu %lu %lu %lu %ld %ld"
// priority nice num_threads itrealvalue starttime vsize rss
" %ld %ld %ld %ld %llu %lu %ld"
// rsslim startcode endcode startstack kstkesp kstkeip signal
" %lu %lu %lu %lu %lu %lu %lu"
// blocked sigignore sigcatch wchan nswap cnswap exit_signal
" %lu %lu %lu %lu %lu %lu %d"
// processor rt_priority policy delayacct_blkio_ticks guest_time cguest_time
" %d %u %u %llu %lu %ld "
// start_data end_data start_brk arg_start arg_end env_start env_end exit_code
" %lu %lu %lu %lu %lu %lu %lu %d",
&state, &ppid, &pgrp, &session, &tty_nr, &tpgid,
&flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime, &cutime, &cstime,
&priority, &nice, &num_threads, &itrealvalue, &starttime, &vsize, &rss,
&rsslim, (unsigned long *)&map->start_code, (unsigned long *)&map->end_code,
(unsigned long *)&map->start_stack, &kstkesp, &kstkeip, &signal,
&blocked, &sigignore, &sigcatch, &wchan, &nswap, &cnswap, &exit_signal,
&processor, &rt_priority, &policy, &delayacct_blkio_ticks, &guest_time,
&cguest_time, (unsigned long *)&map->start_data, (unsigned long *)&map->end_data,
(unsigned long *)&map->start_brk, (unsigned long *)&map->arg_start,
(unsigned long *)&map->arg_end, (unsigned long *)&map->env_start,
(unsigned long *)&map->env_end, &exit_code);
ASSERT(t, n == 50);
// Fill in brk end
ASSERT(t, map->start_brk == this->brk_start.as_int());
map->brk = this->brk_end.as_int();
}
void AddressSpace::post_exec_syscall(Task* t) {
// First locate a syscall instruction we can use for remote syscalls.
traced_syscall_ip_ = find_syscall_instruction(t);
privileged_traced_syscall_ip_ = nullptr;
do_breakpoint_fault_addr_ = nullptr;
stopping_breakpoint_table_ = nullptr;
stopping_breakpoint_table_entry_size_ = 0;
// Now remote syscalls work, we can open_mem_fd.
t->open_mem_fd();
// Set up AutoRemoteSyscalls again now that the mem-fd is open.
AutoRemoteSyscalls remote(t);
// Now we can set up the "rr page" at its fixed address. This gives
// us traced and untraced syscall instructions at known, fixed addresses.
map_rr_page(remote);
// Set up the preload_thread_locals shared area.
t->session().create_shared_mmap(remote, PRELOAD_THREAD_LOCALS_SIZE,
preload_thread_locals_start(),
"preload_thread_locals");
mapping_flags_of(preload_thread_locals_start()) |=
AddressSpace::Mapping::IS_THREAD_LOCALS;
}
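/**
* Update our model of the heap after a brk(): grow the [heap] mapping when
* the break moves up, or unmap the freed pages when it moves down.
*/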
void AddressSpace::brk(Task* t, remote_ptr<void> addr, int prot) {
LOG(debug) << "brk(" << addr << ")";
remote_ptr<void> old_brk = ceil_page_size(brk_end);
remote_ptr<void> new_brk = ceil_page_size(addr);
if (old_brk < new_brk) {
map(t, old_brk, new_brk - old_brk, prot, MAP_ANONYMOUS | MAP_PRIVATE, 0,
"[heap]");
} else {
unmap(t, new_brk, old_brk - new_brk);
}
brk_end = addr;
}
static const char* stringify_flags(int flags) {
switch (flags) {
case AddressSpace::Mapping::FLAG_NONE:
return "";
case AddressSpace::Mapping::IS_SYSCALLBUF:
return " [syscallbuf]";
case AddressSpace::Mapping::IS_THREAD_LOCALS:
return " [thread_locals]";
case AddressSpace::Mapping::IS_PATCH_STUBS:
return " [patch_stubs]";
default:
return "[unknown_flags]";
}
}
void AddressSpace::dump() const {
fprintf(stderr, " (heap: %p-%p)\n", (void*)brk_start.as_int(),
(void*)brk_end.as_int());
for (auto it = mem.begin(); it != mem.end(); ++it) {
const KernelMapping& m = it->second.map;
fprintf(stderr, "%s%s\n", m.str().c_str(),
stringify_flags(it->second.flags));
}
}
SupportedArch AddressSpace::arch() const {
return (*task_set().begin())->arch();
}
BreakpointType AddressSpace::get_breakpoint_type_for_retired_insn(
remote_code_ptr ip) {
remote_code_ptr addr = ip.undo_executed_bkpt(arch());
return get_breakpoint_type_at_addr(addr);
}
BreakpointType AddressSpace::get_breakpoint_type_at_addr(remote_code_ptr addr) {
auto it = breakpoints.find(addr);
return it == breakpoints.end() ? BKPT_NONE : it->second.type();
}
bool AddressSpace::is_exec_watchpoint(remote_code_ptr addr) {
for (auto& kv : watchpoints) {
if (kv.first.contains(addr.to_data_ptr<void>()) &&
(kv.second.watched_bits() & EXEC_BIT)) {
return true;
}
}
return false;
}
bool AddressSpace::is_breakpoint_in_private_read_only_memory(
remote_code_ptr addr) {
for (const auto& m : maps_containing_or_after(addr.to_data_ptr<void>())) {
if (m.map.start() >=
addr.increment_by_bkpt_insn_length(arch()).to_data_ptr<void>()) {
break;
}
if ((m.map.prot() & PROT_WRITE) || (m.map.flags() & MAP_SHARED)) {
return false;
}
}
return true;
}
void AddressSpace::replace_breakpoints_with_original_values(
uint8_t* dest, size_t length, remote_ptr<uint8_t> addr) {
for (auto& it : breakpoints) {
remote_ptr<uint8_t> bkpt_location = it.first.to_data_ptr<uint8_t>();
remote_ptr<uint8_t> start = max(addr, bkpt_location);
remote_ptr<uint8_t> end =
min(addr + length, bkpt_location + bkpt_instruction_length(arch()));
if (start < end) {
memcpy(dest + (start - addr),
it.second.original_data() + (start - bkpt_location), end - start);
}
}
}
bool AddressSpace::is_breakpoint_instruction(Task* t, remote_code_ptr ip) {
bool ok = true;
uint8_t data[MAX_BKPT_INSTRUCTION_LENGTH];
t->read_bytes_helper(ip.to_data_ptr<uint8_t>(),
bkpt_instruction_length(t->arch()), data, &ok);
return memcmp(data, breakpoint_insn(t->arch()),
bkpt_instruction_length(t->arch())) == 0 && ok;
}
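/**
* Remove from |ranges| everything intersecting |range|, splitting any
* partially-overlapping ranges so that the parts outside |range| survive.
*/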
static void remove_range(set<MemoryRange>& ranges, const MemoryRange& range) {
if (ranges.empty()) {
return;
}
auto start = ranges.lower_bound(range);
// An earlier range might extend into range, so check for that.
if (start != ranges.begin()) {
--start;
if (start->end() <= range.start()) {
++start;
}
}
auto end = start;
auto prev_end = start;
while (end != ranges.end() && end->start() < range.end()) {
prev_end = end;
++end;
}
if (start == end) {
return;
}
MemoryRange start_range = *start;
MemoryRange end_range = *prev_end;
ranges.erase(start, end);
if (start_range.start() < range.start()) {
ranges.insert(MemoryRange(start_range.start(), range.start()));
}
if (range.end() < end_range.end()) {
ranges.insert(MemoryRange(range.end(), end_range.end()));
}
}
static void add_range(set<MemoryRange>& ranges, const MemoryRange& range) {
// Remove overlapping ranges
remove_range(ranges, range);
ranges.insert(range);
// We could coalesce adjacent ranges, but there's probably no need.
}
KernelMapping AddressSpace::map(Task* t, remote_ptr<void> addr,
size_t num_bytes, int prot, int flags,
off64_t offset_bytes, const string& fsname,
dev_t device, ino_t inode,
unique_ptr<struct stat> mapped_file_stat,
const KernelMapping* recorded_map,
EmuFile::shr_ptr emu_file, void* local_addr,
shared_ptr<MonitoredSharedMemory> monitored) {
LOG(debug) << "mmap(" << addr << ", " << num_bytes << ", " << HEX(prot)
<< ", " << HEX(flags) << ", " << HEX(offset_bytes) << ")";
num_bytes = ceil_page_size(num_bytes);
KernelMapping m(addr, addr + num_bytes, fsname, device, inode, prot, flags,
offset_bytes);
if (!num_bytes) {
return m;
}
remove_range(dont_fork, MemoryRange(addr, num_bytes));
remove_range(wipe_on_fork, MemoryRange(addr, num_bytes));
// The mmap() man page doesn't specifically describe
// what should happen if an existing map is
// "overwritten" by a new map (of the same resource).
// In testing, the behavior seems to be as if the
// overlapping region is unmapped and then remapped
// per the arguments to the second call.
unmap_internal(t, addr, num_bytes);
const KernelMapping& actual_recorded_map = recorded_map ? *recorded_map : m;
map_and_coalesce(t, m, actual_recorded_map, emu_file,
std::move(mapped_file_stat),
std::move(local_addr),
std::move(monitored));
// During an emulated exec, we will explicitly map in (a copy of) the VDSO
// at the recorded address.
if (actual_recorded_map.is_vdso()) {
vdso_start_addr = addr;
}
return m;
}
template <typename Arch> void AddressSpace::at_preload_init_arch(Task* t) {
auto params = t->read_mem(
remote_ptr<rrcall_init_preload_params<Arch>>(t->regs().orig_arg1()));
if (t->session().is_recording()) {
ASSERT(t,
t->session().as_record()->use_syscall_buffer() ==
params.syscallbuf_enabled)
<< "Tracee thinks syscallbuf is "
<< (params.syscallbuf_enabled ? "en" : "dis")
<< "abled, but tracer thinks "
<< (t->session().as_record()->use_syscall_buffer() ? "en" : "dis")
<< "abled";
} else {
if (params.breakpoint_table_entry_size == -1) {
do_breakpoint_fault_addr_ = params.breakpoint_instr_addr.rptr().as_int();
} else {
stopping_breakpoint_table_ = params.breakpoint_table.rptr().as_int();
stopping_breakpoint_table_entry_size_ =
params.breakpoint_table_entry_size;
}
}
if (!params.syscallbuf_enabled) {
return;
}
syscallbuf_enabled_ = true;
if (t->session().is_recording()) {
monkeypatch_state->patch_at_preload_init(static_cast<RecordTask*>(t));
}
}
void AddressSpace::at_preload_init(Task* t) {
RR_ARCH_FUNCTION(at_preload_init_arch, t->arch(), t);
}
const AddressSpace::Mapping& AddressSpace::mapping_of(
remote_ptr<void> addr) const {
MemoryRange range(floor_page_size(addr), 1);
auto it = mem.find(range);
DEBUG_ASSERT(it != mem.end());
DEBUG_ASSERT(it->second.map.contains(range));
return it->second;
}
uint32_t& AddressSpace::mapping_flags_of(remote_ptr<void> addr) {
return const_cast<AddressSpace::Mapping&>(
static_cast<const AddressSpace*>(this)->mapping_of(addr))
.flags;
}
uint8_t* AddressSpace::local_mapping(remote_ptr<void> addr, size_t size) {
MemoryRange range(floor_page_size(addr), 1);
auto it = mem.find(range);
if (it == mem.end()) {
return nullptr;
}
DEBUG_ASSERT(it->second.map.contains(range));
const Mapping& map = it->second;
// Fall back to the slow path if we can't get the entire region
if (size > static_cast<size_t>(map.map.end() - addr)) {
return nullptr;
}
if (map.local_addr != nullptr) {
size_t offset = addr - map.map.start();
return static_cast<uint8_t*>(map.local_addr) + offset;
}
return nullptr;
}
void* AddressSpace::detach_local_mapping(remote_ptr<void> addr) {
auto& m = const_cast<AddressSpace::Mapping&>(mapping_of(addr));
void* p = m.local_addr;
m.local_addr = nullptr;
return p;
}
bool AddressSpace::has_mapping(remote_ptr<void> addr) const {
if (addr + page_size() < addr) {
// Assume the last byte in the address space is never mapped; avoid overflow
return false;
}
MemoryRange m(floor_page_size(addr), 1);
auto it = mem.find(m);
return it != mem.end() && it->first.contains(m);
}
bool AddressSpace::has_rr_page() const {
MemoryRange m(RR_PAGE_ADDR, 1);
auto it = mem.find(m);
return it != mem.end() && (it->second.flags & Mapping::IS_RR_PAGE);
}
void AddressSpace::protect(Task* t, remote_ptr<void> addr, size_t num_bytes,
int prot) {
LOG(debug) << "mprotect(" << addr << ", " << num_bytes << ", " << HEX(prot)
<< ")";
MemoryRange last_overlap;
auto protector = [this, prot, &last_overlap](Mapping m,
MemoryRange rem) {
LOG(debug) << " protecting (" << rem << ") ...";
remove_from_map(m.map);
// PROT_GROWSDOWN means that if this is a grows-down segment
// (which for us means "stack") then the change should be
// extended to the start of the segment.
// We don't try to handle the analogous PROT_GROWSUP, because we
// don't understand the idea of a grows-up segment.
remote_ptr<void> new_start;
if ((m.map.start() < rem.start()) && (prot & PROT_GROWSDOWN)) {
new_start = m.map.start();
LOG(debug) << " PROT_GROWSDOWN: expanded region down to " << new_start;
} else {
new_start = rem.start();
}
LOG(debug) << " erased (" << m.map << ")";
// If the first segment we protect underflows the
// region, remap the underflow region with previous
// prot.
auto monitored = m.monitored_shared_memory;
if (m.map.start() < new_start) {
Mapping underflow(
m.map.subrange(m.map.start(), rem.start()),
m.recorded_map.subrange(m.recorded_map.start(), rem.start()),
m.emu_file, clone_stat(m.mapped_file_stat), m.local_addr,
std::move(monitored));
underflow.flags = m.flags;
add_to_map(underflow);
}
// Remap the overlapping region with the new prot.
remote_ptr<void> new_end = min(rem.end(), m.map.end());
int new_prot = prot & (PROT_READ | PROT_WRITE | PROT_EXEC);
Mapping overlap(
m.map.subrange(new_start, new_end).set_prot(new_prot),
m.recorded_map.subrange(new_start, new_end).set_prot(new_prot),
m.emu_file, clone_stat(m.mapped_file_stat),
m.local_addr ? m.local_addr + (new_start - m.map.start()) : 0,
m.monitored_shared_memory
? m.monitored_shared_memory->subrange(new_start - m.map.start(),
new_end - new_start)
: nullptr);
overlap.flags = m.flags;
add_to_map(overlap);
last_overlap = overlap.map;
// If the last segment we protect overflows the
// region, remap the overflow region with previous
// prot.
if (rem.end() < m.map.end()) {
Mapping overflow(
m.map.subrange(rem.end(), m.map.end()),
m.recorded_map.subrange(rem.end(), m.map.end()), m.emu_file,
clone_stat(m.mapped_file_stat),
m.local_addr ? m.local_addr + (rem.end() - m.map.start()) : 0,
m.monitored_shared_memory
? m.monitored_shared_memory->subrange(rem.end() - m.map.start(),
m.map.end() - rem.end())
: nullptr);
overflow.flags = m.flags;
add_to_map(overflow);
}
};
for_each_in_range(addr, num_bytes, protector, ITERATE_CONTIGUOUS);
if (last_overlap.size()) {
// All mappings that we altered which might need coalescing
// are adjacent to |last_overlap|.
coalesce_around(t, mem.find(last_overlap));
}
}
void AddressSpace::fixup_mprotect_growsdown_parameters(Task* t) {
ASSERT(t, !(t->regs().arg3() & PROT_GROWSUP));
if (t->regs().arg3() & PROT_GROWSDOWN) {
Registers r = t->regs();
if (r.arg1() == floor_page_size(r.arg1()) && has_mapping(r.arg1())) {
auto& km = mapping_of(r.arg1()).map;
if (km.flags() & MAP_GROWSDOWN) {
auto new_start = km.start();
r.set_arg2(remote_ptr<void>(r.arg1()) + size_t(r.arg2()) - new_start);
r.set_arg1(new_start);
r.set_arg3(r.arg3() & ~PROT_GROWSDOWN);
t->set_regs(r);
}
}
}
}
void AddressSpace::remap(Task* t, remote_ptr<void> old_addr,
size_t old_num_bytes, remote_ptr<void> new_addr,
size_t new_num_bytes, int flags) {
LOG(debug) << "mremap(" << old_addr << ", " << old_num_bytes << ", "
<< new_addr << ", " << new_num_bytes << ")";
old_num_bytes = ceil_page_size(old_num_bytes);
Mapping mr = mapping_of(old_addr);
DEBUG_ASSERT(!mr.monitored_shared_memory);
KernelMapping km = mr.map.subrange(old_addr, min(mr.map.end(), old_addr + old_num_bytes));
unmap_internal(t, old_addr, old_num_bytes);
if (flags & MREMAP_DONTUNMAP) {
// This can only ever be an anonymous private mapping.
map(t, old_addr, old_num_bytes, km.prot(), km.flags(), 0, string());
}
if (0 == new_num_bytes) {
return;
}
new_num_bytes = ceil_page_size(new_num_bytes);
auto it = dont_fork.lower_bound(MemoryRange(old_addr, old_num_bytes));
if (it != dont_fork.end() && it->start() < old_addr + old_num_bytes) {
// mremap fails if some but not all pages are marked DONTFORK
DEBUG_ASSERT(*it == MemoryRange(old_addr, old_num_bytes));
remove_range(dont_fork, MemoryRange(old_addr, old_num_bytes));
add_range(dont_fork, MemoryRange(new_addr, new_num_bytes));
} else {
remove_range(dont_fork, MemoryRange(old_addr, old_num_bytes));
remove_range(dont_fork, MemoryRange(new_addr, new_num_bytes));
}
it = wipe_on_fork.lower_bound(MemoryRange(old_addr, old_num_bytes));
if (it != wipe_on_fork.end() && it->start() < old_addr + old_num_bytes) {
// Hopefully mremap fails if some but not all pages are marked WIPEONFORK
DEBUG_ASSERT(*it == MemoryRange(old_addr, old_num_bytes));
remove_range(wipe_on_fork, MemoryRange(old_addr, old_num_bytes));
add_range(wipe_on_fork, MemoryRange(new_addr, new_num_bytes));
} else {
remove_range(wipe_on_fork, MemoryRange(old_addr, old_num_bytes));
remove_range(wipe_on_fork, MemoryRange(new_addr, new_num_bytes));
}
unmap_internal(t, new_addr, new_num_bytes);
remote_ptr<void> new_end = new_addr + new_num_bytes;
map_and_coalesce(t, km.set_range(new_addr, new_end),
mr.recorded_map.set_range(new_addr, new_end), mr.emu_file,
clone_stat(mr.mapped_file_stat), nullptr, nullptr);
}
void AddressSpace::remove_breakpoint(remote_code_ptr addr,
BreakpointType type) {
auto it = breakpoints.find(addr);
if (it == breakpoints.end() || it->second.unref(type) > 0) {
return;
}
destroy_breakpoint(it);
}
bool AddressSpace::add_breakpoint(remote_code_ptr addr, BreakpointType type) {
auto it = breakpoints.find(addr);
if (it == breakpoints.end()) {
uint8_t overwritten_data[MAX_BKPT_INSTRUCTION_LENGTH];
ssize_t bkpt_size = bkpt_instruction_length(arch());
// Grab a random task from the VM so we can use its
// read/write_mem() helpers.
Task* t = first_running_task();
if (!t ||
bkpt_size !=
t->read_bytes_fallible(addr.to_data_ptr<uint8_t>(),
bkpt_size, overwritten_data)) {
return false;
}
t->write_bytes_helper(addr.to_data_ptr<uint8_t>(), bkpt_size,
breakpoint_insn(arch()), nullptr,
Task::IS_BREAKPOINT_RELATED);
auto it_and_is_new = breakpoints.insert(make_pair(addr, Breakpoint()));
DEBUG_ASSERT(it_and_is_new.second);
memcpy(it_and_is_new.first->second.overwritten_data,
overwritten_data, sizeof(overwritten_data));
it = it_and_is_new.first;
}
it->second.ref(type);
return true;
}
void AddressSpace::remove_all_breakpoints() {
while (!breakpoints.empty()) {
destroy_breakpoint(breakpoints.begin());
}
}
void AddressSpace::suspend_breakpoint_at(remote_code_ptr addr) {
auto it = breakpoints.find(addr);
if (it != breakpoints.end()) {
Task* t = first_running_task();
if (t) {
t->write_bytes_helper(addr.to_data_ptr<uint8_t>(),
bkpt_instruction_length(arch()), it->second.overwritten_data);
}
}
}
void AddressSpace::restore_breakpoint_at(remote_code_ptr addr) {
auto it = breakpoints.find(addr);
if (it != breakpoints.end()) {
Task* t = first_running_task();
if (t) {
t->write_bytes_helper(addr.to_data_ptr<uint8_t>(),
bkpt_instruction_length(arch()),
breakpoint_insn(arch()));
}
}
}
int AddressSpace::access_bits_of(WatchType type) {
switch (type) {
case WATCH_EXEC:
return EXEC_BIT;
case WATCH_WRITE:
return WRITE_BIT;
case WATCH_READWRITE:
return READ_BIT | WRITE_BIT;
default:
FATAL() << "Unknown watchpoint type " << type;
return 0; // not reached
}
}
/**
* We do not allow a watchpoint to watch the last byte of memory addressable
* by rr. This avoids constructing a MemoryRange that wraps around.
* For 64-bit builds this is no problem because addresses at the top of memory
* are in kernel space. For 32-bit builds it seems impossible to map the last
* page of memory in Linux so we should be OK there too.
* Note that zero-length watchpoints are OK. configure_watch_registers just
* ignores them.
*/
static MemoryRange range_for_watchpoint(remote_ptr<void> addr,
size_t num_bytes) {
uintptr_t p = addr.as_int();
uintptr_t max_len = UINTPTR_MAX - p;
return MemoryRange(addr, min<uintptr_t>(num_bytes, max_len));
}
void AddressSpace::remove_watchpoint(remote_ptr<void> addr, size_t num_bytes,
WatchType type) {
auto it = watchpoints.find(range_for_watchpoint(addr, num_bytes));
if (it != watchpoints.end() &&
0 == it->second.unwatch(access_bits_of(type))) {
watchpoints.erase(it);
}
allocate_watchpoints();
}
bool AddressSpace::add_watchpoint(remote_ptr<void> addr, size_t num_bytes,
WatchType type) {
MemoryRange key = range_for_watchpoint(addr, num_bytes);
auto it = watchpoints.find(key);
if (it == watchpoints.end()) {
auto it_and_is_new =
watchpoints.insert(make_pair(key, Watchpoint(num_bytes)));
DEBUG_ASSERT(it_and_is_new.second);
it = it_and_is_new.first;
update_watchpoint_value(it->first, it->second);
}
it->second.watch(access_bits_of(type));
return allocate_watchpoints();
}
void AddressSpace::save_watchpoints() {
saved_watchpoints.push_back(watchpoints);
}
bool AddressSpace::restore_watchpoints() {
DEBUG_ASSERT(!saved_watchpoints.empty());
watchpoints = saved_watchpoints[saved_watchpoints.size() - 1];
saved_watchpoints.pop_back();
return allocate_watchpoints();
}
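/**
* Re-read the watched bytes from tracee memory. Returns true if the bytes
* (or their readability) changed since the last snapshot.
*/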
bool AddressSpace::update_watchpoint_value(const MemoryRange& range,
Watchpoint& watchpoint) {
Task* t = first_running_task();
if (!t) {
return false;
}
bool valid = true;
vector<uint8_t> value_bytes = watchpoint.value_bytes;
for (size_t i = 0; i < value_bytes.size(); ++i) {
value_bytes[i] = 0xFF;
}
remote_ptr<void> addr = range.start();
size_t num_bytes = range.size();
while (num_bytes > 0) {
ssize_t bytes_read = t->read_bytes_fallible(
addr, num_bytes, value_bytes.data() + (addr - range.start()));
if (bytes_read <= 0) {
valid = false;
// advance to next page and try to read more. We want to know
// when the valid part of a partially invalid watchpoint changes.
bytes_read =
min<size_t>(num_bytes, (floor_page_size(addr) + page_size()) - addr);
}
addr += bytes_read;
num_bytes -= bytes_read;
}
bool changed = valid != watchpoint.valid ||
memcmp(value_bytes.data(), watchpoint.value_bytes.data(),
value_bytes.size()) != 0;
watchpoint.valid = valid;
watchpoint.value_bytes = value_bytes;
return changed;
}
void AddressSpace::update_watchpoint_values(remote_ptr<void> start,
remote_ptr<void> end) {
MemoryRange r(start, end);
for (auto& it : watchpoints) {
if (it.first.intersects(r) &&
update_watchpoint_value(it.first, it.second)) {
it.second.changed = true;
// We do nothing to track kernel reads of read-write watchpoints...
}
}
}
static int DR_WATCHPOINT(int n) { return 1 << n; }
static bool watchpoint_triggered(uintptr_t debug_status,
const vector<int8_t>& regs) {
for (auto reg : regs) {
if (debug_status & DR_WATCHPOINT(reg)) {
return true;
}
}
return false;
}
bool AddressSpace::notify_watchpoint_fired(uintptr_t debug_status,
remote_ptr<void> hit_addr,
remote_code_ptr address_of_singlestep_start) {
bool triggered = false;
for (auto& it : watchpoints) {
// On Skylake/4.14.13-300.fc27.x86_64 at least, we have observed a
// situation where singlestepping through the instruction before a hardware
// execution watchpoint causes singlestep completion *and* also reports the
// hardware execution watchpoint being triggered. The latter is incorrect.
// This could be a HW issue or a kernel issue. Work around it by ignoring
// triggered watchpoints that aren't on the instruction we just tried to
// execute.
bool write_triggered = (it.second.watched_bits() & WRITE_BIT) &&
update_watchpoint_value(it.first, it.second);
// Depending on the architecture the hardware may indicate hit watchpoints
// either by number, or by the address that triggered the watchpoint hit
// - support either.
bool read_triggered = false;
bool exec_triggered = false;
bool watchpoint_in_range = false;
if (is_x86ish(arch())) {
read_triggered = (it.second.watched_bits() & READ_BIT) &&
watchpoint_triggered(debug_status,
it.second.debug_regs_for_exec_read);
exec_triggered = (it.second.watched_bits() & EXEC_BIT) &&
(address_of_singlestep_start.is_null() ||
it.first.start() == address_of_singlestep_start.to_data_ptr<void>()) &&
watchpoint_triggered(debug_status,
it.second.debug_regs_for_exec_read);
} else {
// The reported address may not match our watchpoint exactly.
// The ARM manual says:
// The address recorded is within an address range of the size defined by the
// DCZID_EL0.BS field. The start of the range is aligned to the size defined
// by the DCZID_EL0.BS field and its end is not greater than the address that
// triggered the watchpoint.
// So we construct a range spanning the whole block, then test that the range
// intersects a watchpoint range *and* that hit_addr is not past the first byte
// of the watched region.
auto block_size = dczid_el0_block_size();
auto slop = hit_addr.as_int() % block_size;
auto hit_range = MemoryRange(hit_addr - slop, block_size);
watchpoint_in_range = it.first.intersects(hit_range) &&
it.first.start() >= hit_addr;
}
if (write_triggered || read_triggered || exec_triggered || watchpoint_in_range) {
it.second.changed = true;
triggered = true;
}
}
return triggered;
}
void AddressSpace::notify_written(remote_ptr<void> addr, size_t num_bytes,
uint32_t flags) {
if (!(flags & Task::IS_BREAKPOINT_RELATED)) {
update_watchpoint_values(addr, addr + num_bytes);
}
session()->accumulate_bytes_written(num_bytes);
}
void AddressSpace::remove_all_watchpoints() {
watchpoints.clear();
allocate_watchpoints();
}
void AddressSpace::unmap(Task* t, remote_ptr<void> addr, ssize_t num_bytes) {
LOG(debug) << "munmap(" << addr << ", " << num_bytes << ")";
num_bytes = ceil_page_size(num_bytes);
if (!num_bytes) {
return;
}
remove_range(dont_fork, MemoryRange(addr, num_bytes));
remove_range(wipe_on_fork, MemoryRange(addr, num_bytes));
return unmap_internal(t, addr, num_bytes);
}
void AddressSpace::unmap_internal(Task*, remote_ptr<void> addr,
ssize_t num_bytes) {
LOG(debug) << "munmap(" << addr << ", " << num_bytes << ")";
auto unmapper = [this](Mapping m, MemoryRange rem) {
LOG(debug) << " unmapping (" << rem << ") ...";
remove_from_map(m.map);
LOG(debug) << " erased (" << m.map << ") ...";
// If the first segment we unmap underflows the unmap
// region, remap the underflow region.
auto monitored = m.monitored_shared_memory;
if (m.map.start() < rem.start()) {
Mapping underflow(m.map.subrange(m.map.start(), rem.start()),
m.recorded_map.subrange(m.map.start(), rem.start()),
m.emu_file, clone_stat(m.mapped_file_stat),
m.local_addr, std::move(monitored));
underflow.flags = m.flags;
add_to_map(underflow);
}
// If the last segment we unmap overflows the unmap
// region, remap the overflow region.
if (rem.end() < m.map.end()) {
Mapping overflow(
m.map.subrange(rem.end(), m.map.end()),
m.recorded_map.subrange(rem.end(), m.map.end()), m.emu_file,
clone_stat(m.mapped_file_stat),
m.local_addr ? m.local_addr + (rem.end() - m.map.start()) : 0,
m.monitored_shared_memory
? m.monitored_shared_memory->subrange(rem.end() - m.map.start(),
m.map.end() - rem.end())
: nullptr);
overflow.flags = m.flags;
add_to_map(overflow);
}
if (m.local_addr) {
auto addr = m.local_addr + (rem.start() - m.map.start());
auto size = std::min(rem.size(), m.map.size() - (rem.start() - m.map.start()));
int ret = munmap(addr, size);
if (ret < 0) {
FATAL() << "Can't munmap";
}
}
};
for_each_in_range(addr, num_bytes, unmapper);
update_watchpoint_values(addr, addr + num_bytes);
}
void AddressSpace::advise(Task*, remote_ptr<void> addr, ssize_t num_bytes,
int advice) {
LOG(debug) << "madvise(" << addr << ", " << num_bytes << ", " << advice
<< ")";
num_bytes = ceil_page_size(num_bytes);
switch (advice) {
case MADV_DONTFORK:
add_range(dont_fork, MemoryRange(addr, num_bytes));
break;
case MADV_DOFORK:
remove_range(dont_fork, MemoryRange(addr, num_bytes));
break;
case MADV_WIPEONFORK:
add_range(wipe_on_fork, MemoryRange(addr, num_bytes));
break;
case MADV_KEEPONFORK:
remove_range(wipe_on_fork, MemoryRange(addr, num_bytes));
break;
default:
break;
}
}
void AddressSpace::did_fork_into(Task* t) {
// MADV_WIPEONFORK is inherited across fork and cleared on exec.
// We'll copy it here, then do the `dont_fork` unmappings, and then
// whatever survives in the new AddressSpace's wipe_on_fork gets wiped.
t->vm()->wipe_on_fork = wipe_on_fork;
for (auto& range : dont_fork) {
// During recording we execute MADV_DONTFORK so the forked child will
// have had its dontfork areas unmapped by the kernel already
if (!t->session().is_recording()) {
AutoRemoteSyscalls remote(t);
remote.infallible_syscall(syscall_number_for_munmap(remote.arch()),
range.start(), range.size());
}
t->vm()->unmap(t, range.start(), range.size());
}
// Any ranges that were dropped were unmapped (and thus removed from
// wipe_on_fork), so now we can record anything that's left.
for (auto& range : t->vm()->wipe_on_fork) {
if (t->session().is_recording()) {
// Record that these mappings were wiped.
RecordTask* rt = static_cast<RecordTask*>(t);
rt->record_remote(range);
}
}
}
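/**
* Strip a trailing " (deleted)" suffix, which the kernel appends in
* /proc/<pid>/maps when the mapped file has been unlinked.
*/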
static string strip_deleted(const string& s) {
static const char deleted[] = " (deleted)";
ssize_t find_deleted = s.size() - (sizeof(deleted) - 1);
if (s.find(deleted) == size_t(find_deleted)) {
return s.substr(0, find_deleted);
}
return s;
}
string KernelMapping::fsname_strip_deleted() const {
return strip_deleted(fsname_);
}
enum HandleHeap { TREAT_HEAP_AS_ANONYMOUS, RESPECT_HEAP };
static bool normalized_file_names_equal(const KernelMapping& km1,
const KernelMapping& km2,
HandleHeap handle_heap) {
if (km1.is_stack() || km2.is_stack()) {
// The kernel seems to use "[stack:<tid>]" for any mapping area containing
// thread |tid|'s stack pointer. When the thread exits, the next read of
// the maps doesn't treat the area as stack at all. We don't want to track
// thread exits, so if one of the mappings is a stack, skip the name
// comparison. Device and inode numbers will still be checked.
return true;
}
if (handle_heap == TREAT_HEAP_AS_ANONYMOUS &&
(km1.is_heap() || km2.is_heap())) {
// The kernel's heuristics for treating an anonymous mapping as "[heap]"
// are obscure. Just skip the name check. Device and inode numbers will
// still be checked.
return true;
}
// We don't track when a file gets deleted, so it's possible for the kernel
// to have " (deleted)" when we don't.
return strip_deleted(km1.fsname()) == strip_deleted(km2.fsname());
}
/**
* Return true iff |left| and |right| are located adjacently in memory
* with the same metadata, and map adjacent locations of the same
* underlying (real) device.
*/
static bool is_adjacent_mapping(const KernelMapping& mleft,
const KernelMapping& mright,
HandleHeap handle_heap,
int32_t flags_to_check = 0xFFFFFFFF) {
if (mleft.end() != mright.start()) {
return false;
}
if (((mleft.flags() ^ mright.flags()) & flags_to_check) ||
mleft.prot() != mright.prot()) {
return false;
}
if (!normalized_file_names_equal(mleft, mright, handle_heap)) {
return false;
}
if (mleft.device() != mright.device() || mleft.inode() != mright.inode()) {
return false;
}
if (mleft.is_real_device() &&
mleft.file_offset_bytes() + off64_t(mleft.size()) !=
mright.file_offset_bytes()) {
return false;
}
return true;
}
/**
* If |*left_m| and |right_m| are adjacent (see
* |is_adjacent_mapping()|), write a merged segment descriptor to
* |*left_m| and return true. Otherwise return false.
*/
static bool try_merge_adjacent(KernelMapping* left_m,
const KernelMapping& right_m) {
if (is_adjacent_mapping(*left_m, right_m, TREAT_HEAP_AS_ANONYMOUS,
KernelMapping::checkable_flags_mask)) {
*left_m = KernelMapping(left_m->start(), right_m.end(), left_m->fsname(),
left_m->device(), left_m->inode(), right_m.prot(),
right_m.flags(), left_m->file_offset_bytes());
return true;
}
return false;
}
static dev_t normalized_device_number(const KernelMapping& m) {
if (m.fsname().c_str()[0] != '/') {
return m.device();
}
// btrfs files can report the wrong device number in /proc/<pid>/maps, so
// restrict ourselves to checking whether the device number is != 0
if (m.device() != KernelMapping::NO_DEVICE) {
return (dev_t)-1;
}
return m.device();
}
static void assert_segments_match(Task* t, const KernelMapping& input_m,
const KernelMapping& km) {
KernelMapping m = input_m;
string err;
if (m.start() != km.start()) {
err = "starts differ";
} else if (m.end() != km.end()) {
err = "ends differ";
} else if (m.prot() != km.prot()) {
err = "prots differ";
} else if ((m.flags() ^ km.flags()) & KernelMapping::checkable_flags_mask) {
err = "flags differ";
} else if (!normalized_file_names_equal(m, km, TREAT_HEAP_AS_ANONYMOUS) &&
!(km.is_heap() && m.fsname() == "") &&
!(m.is_heap() && km.fsname() == "") && !km.is_vdso()) {
// Due to emulated exec, the kernel may identify any of our anonymous maps
// as [heap] (or not).
// Kernels before 3.16 have a bug where any mapping at the original VDSO
// address is marked [vdso] even if the VDSO was unmapped and replaced by
// something else, so if the kernel reports [vdso] it may be spurious and
// we skip this check. See kernel commit
// a62c34bd2a8a3f159945becd57401e478818d51c.
err = "filenames differ";
} else if (normalized_device_number(m) != normalized_device_number(km)) {
err = "devices_differ";
} else if (m.inode() != km.inode()) {
err = "inodes differ";
}
if (err.size()) {
cerr << "cached mmap:" << endl;
t->vm()->dump();
cerr << "/proc/" << t->tid << "/maps:" << endl;
AddressSpace::print_process_maps(t);
ASSERT(t, false) << "\nCached mapping " << m << " should be " << km << "; "
<< err;
}
}
void AddressSpace::ensure_replay_matches_single_recorded_mapping(Task* t, MemoryRange range) {
// The only case where we eagerly coalesced during recording but not replay should
// be where we mapped private memory beyond-end-of-file.
// Don't do an actual coalescing check here; we rely on the caller to tell us
// the range to coalesce.
ASSERT(t, range.start() == floor_page_size(range.start()));
ASSERT(t, range.end() == ceil_page_size(range.end()));
auto fixer = [this, t, range](Mapping mapping, MemoryRange) {
if (mapping.map == range) {
// Existing single mapping covers entire range; nothing to do.
return;
}
// These should be null during replay
ASSERT(t, !mapping.mapped_file_stat);
// These should not be in use for a beyond-end-of-file mapping
ASSERT(t, !mapping.local_addr);
// The mapping should be private
ASSERT(t, mapping.map.flags() & MAP_PRIVATE);
ASSERT(t, !mapping.emu_file);
ASSERT(t, !mapping.monitored_shared_memory);
// Flagged mappings shouldn't be coalescable ever
ASSERT(t, !mapping.flags);
if (!(mapping.map.flags() & MAP_ANONYMOUS)) {
// Direct-mapped piece. Turn it into an anonymous mapping.
vector<uint8_t> buffer;
buffer.resize(mapping.map.size());
t->read_bytes_helper(mapping.map.start(), buffer.size(), buffer.data());
{
AutoRemoteSyscalls remote(t);
remote.infallible_mmap_syscall_if_alive(mapping.map.start(), buffer.size(),
mapping.map.prot(), mapping.map.flags() | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
}
t->write_bytes_helper(mapping.map.start(), buffer.size(), buffer.data());
// We replace the entire mapping even if part of it falls outside the desired range.
// That's OK: this replacement preserves behaviour and is simpler, even if a
// bit less efficient in weird cases.
mem.erase(mapping.map);
KernelMapping anonymous_km(mapping.map.start(), mapping.map.end(),
string(), KernelMapping::NO_DEVICE, KernelMapping::NO_INODE,
mapping.map.prot(), mapping.map.flags() | MAP_ANONYMOUS);
Mapping new_mapping(anonymous_km, mapping.recorded_map);
mem[new_mapping.map] = new_mapping;
}
};
for_each_in_range(range.start(), range.size(), fixer);
coalesce_around(t, mem.find(range));
}
KernelMapping AddressSpace::vdso() const {
DEBUG_ASSERT(!vdso_start_addr.is_null());
return mapping_of(vdso_start_addr).map;
}
/**
* Iterate over /proc/maps segments for a task and verify that the
* task's cached mapping matches the kernel's (given a lenient fuzz
* factor).
*/
void AddressSpace::verify(Task* t) const {
ASSERT(t, task_set().end() != task_set().find(t));
if (thread_group_in_exec(t)) {
return;
}
LOG(debug) << "Verifying address space for task " << t->tid;
MemoryMap::const_iterator mem_it = mem.begin();
KernelMapIterator kernel_it(t);
if (kernel_it.at_end()) {
LOG(debug) << "Task " << t->tid << " exited unexpectedly, ignoring";
return;
}
while (!kernel_it.at_end() && mem_it != mem.end()) {
KernelMapping km = kernel_it.current();
++kernel_it;
while (!kernel_it.at_end()) {
KernelMapping next_km = kernel_it.current();
if (!try_merge_adjacent(&km, next_km)) {
break;
}
++kernel_it;
}
KernelMapping vm = mem_it->second.map;
++mem_it;
while (mem_it != mem.end() && try_merge_adjacent(&vm, mem_it->second.map)) {
++mem_it;
}
assert_segments_match(t, vm, km);
}
ASSERT(t, kernel_it.at_end() && mem_it == mem.end());
}
// Just a place that rr's AutoSyscall functionality can use as a syscall
// instruction in rr's own address space, before we have exec'd any tracee.
extern "C" {
// Mark this as hidden, otherwise we might get the address of the GOT entry,
// which could cause problems.
extern char rr_syscall_addr __attribute__ ((visibility ("hidden")));
}
static void __attribute__((noinline, used)) fake_syscall() {
__asm__ __volatile__(".global rr_syscall_addr\n\t");
#ifdef __i386__
__asm__ __volatile__("rr_syscall_addr: int $0x80\n\t"
"nop\n\t"
"nop\n\t"
"nop\n\t");
#elif defined(__x86_64__)
__asm__ __volatile__("rr_syscall_addr: syscall\n\t"
"nop\n\t"
"nop\n\t"
"nop\n\t");
#elif defined(__aarch64__)
__asm__ __volatile__("rr_syscall_addr: svc #0\n\t"
"nop\n\t"
"nop\n\t"
"nop\n\t");
#endif
}
AddressSpace::AddressSpace(Task* t, const string& exe, uint32_t exec_count)
: exe(exe),
leader_tid_(t->rec_tid),
leader_serial(t->tuid().serial()),
exec_count(exec_count),
session_(&t->session()),
monkeypatch_state(t->session().is_recording() ? new Monkeypatcher()
: nullptr),
syscallbuf_enabled_(false),
do_breakpoint_fault_addr_(nullptr),
stopping_breakpoint_table_(nullptr),
stopping_breakpoint_table_entry_size_(0),
first_run_event_(0) {
// TODO: this is a workaround of
// https://github.com/rr-debugger/rr/issues/1113 .
if (session_->done_initial_exec()) {
populate_address_space(t);
DEBUG_ASSERT(!vdso_start_addr.is_null());
} else {
// Setup traced_syscall_ip_ now because we need to do AutoRemoteSyscalls
// (for open_mem_fd) before the first exec. We rely on the fact that we
// haven't execed yet, so the address space layout is the same.
traced_syscall_ip_ = remote_code_ptr((uintptr_t)&rr_syscall_addr);
}
}
// Does not copy the task set; the new AddressSpace will be for new tasks.
AddressSpace::AddressSpace(Session* session, const AddressSpace& o,
pid_t leader_tid, uint32_t leader_serial,
uint32_t exec_count)
: exe(o.exe),
leader_tid_(leader_tid),
leader_serial(leader_serial),
exec_count(exec_count),
brk_start(o.brk_start),
brk_end(o.brk_end),
mem(o.mem),
shm_sizes(o.shm_sizes),
monitored_mem(o.monitored_mem),
dont_fork(o.dont_fork),
wipe_on_fork(o.wipe_on_fork),
session_(session),
vdso_start_addr(o.vdso_start_addr),
monkeypatch_state(o.monkeypatch_state
? new Monkeypatcher(*o.monkeypatch_state)
: nullptr),
traced_syscall_ip_(o.traced_syscall_ip_),
privileged_traced_syscall_ip_(o.privileged_traced_syscall_ip_),
syscallbuf_enabled_(o.syscallbuf_enabled_),
do_breakpoint_fault_addr_(o.do_breakpoint_fault_addr_),
stopping_breakpoint_table_(o.stopping_breakpoint_table_),
stopping_breakpoint_table_entry_size_(o.stopping_breakpoint_table_entry_size_),
saved_auxv_(o.saved_auxv_),
saved_interpreter_base_(o.saved_interpreter_base_),
saved_ld_path_(o.saved_ld_path_),
last_free_memory(o.last_free_memory),
first_run_event_(0) {
for (auto& m : mem) {
// The original address space continues to have exclusive ownership of
// all local mappings.
m.second.local_addr = nullptr;
}
for (auto& it : o.breakpoints) {
breakpoints.insert(make_pair(it.first, it.second));
}
for (auto& it : o.watchpoints) {
watchpoints.insert(make_pair(it.first, it.second));
}
if (session != o.session()) {
// Cloning into a new session means we're checkpointing.
first_run_event_ = o.first_run_event_;
}
// cloned tasks will automatically get cloned debug registers and
// cloned address-space memory, so we don't need to do any more work here.
}
bool AddressSpace::post_vm_clone(Task* t) {
if (has_mapping(preload_thread_locals_start()) &&
(mapping_flags_of(preload_thread_locals_start()) &
AddressSpace::Mapping::IS_THREAD_LOCALS) == 0) {
// The tracee already has a mapping at this address that doesn't belong to
// us. Don't touch it.
return false;
}
// Otherwise, the preload_thread_locals mapping is nonexistent or ours.
// Recreate it.
AutoRemoteSyscalls remote(t);
t->session().create_shared_mmap(remote, PRELOAD_THREAD_LOCALS_SIZE,
preload_thread_locals_start(),
"preload_thread_locals");
mapping_flags_of(preload_thread_locals_start()) |=
AddressSpace::Mapping::IS_THREAD_LOCALS;
return true;
}
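/**
* If |range| starts on a |bytes| boundary and is at least |bytes| long,
* push a |bytes|-sized chunk onto |result| and advance |range| past it.
*/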
static bool try_split_unaligned_range(MemoryRange& range, size_t bytes,
vector<MemoryRange>& result) {
if ((range.start().as_int() & (bytes - 1)) || range.size() < bytes) {
return false;
}
result.push_back(MemoryRange(range.start(), bytes));
range = MemoryRange(range.start() + bytes, range.end());
return true;
}
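/**
* Split |range| into naturally-aligned chunks of 8 (on 64-bit), 4, 2 and 1
* bytes, suitable for hardware watchpoint (debug) registers.
*/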
static vector<MemoryRange> split_range(const MemoryRange& range) {
vector<MemoryRange> result;
MemoryRange r = range;
while (r.size() > 0) {
if ((sizeof(void*) < 8 || !try_split_unaligned_range(r, 8, result)) &&
!try_split_unaligned_range(r, 4, result) &&
!try_split_unaligned_range(r, 2, result)) {
bool ret = try_split_unaligned_range(r, 1, result);
DEBUG_ASSERT(ret);
}
}
return result;
}
static void configure_watch_registers(vector<WatchConfig>& regs,
const MemoryRange& range, WatchType type,
vector<int8_t>* assigned_regs,
AddressSpace::WatchpointAlignment alignment) {
if (alignment == AddressSpace::UNALIGNED) {
regs.push_back(WatchConfig(range.start(), range.size(), type));
return;
}
// Zero-sized WatchConfigs return no ranges here, so are ignored.
auto split_ranges = split_range(range);
if (type == WATCH_WRITE && range.size() > 1) {
// We can suppress spurious write-watchpoint triggerings by checking
// whether memory values have changed. So we can sometimes conserve
// debug registers by upgrading an unaligned range to an aligned range
// of a larger size.
uintptr_t align;
if (range.size() <= 2) {
align = 2;
} else if (range.size() <= 4 || sizeof(void*) <= 4) {
align = 4;
} else {
align = 8;
}
remote_ptr<void> aligned_start(range.start().as_int() & ~(align - 1));
remote_ptr<void> aligned_end((range.end().as_int() + (align - 1)) &
~(align - 1));
auto split = split_range(MemoryRange(aligned_start, aligned_end));
// If the aligned range doesn't reduce register usage, use the original
// split to avoid spurious triggerings
if (split.size() < split_ranges.size()) {
split_ranges = split;
}
}
for (auto& r : split_ranges) {
if (assigned_regs) {
assigned_regs->push_back(regs.size());
}
regs.push_back(WatchConfig(r.start(), r.size(), type));
}
}
vector<WatchConfig> AddressSpace::get_watchpoints_internal(
WatchpointFilter filter,
WatchpointAlignment alignment,
UpdateWatchpointRegisterAssignments update_watchpoint_register_assignments) {
vector<WatchConfig> result;
for (auto& kv : watchpoints) {
if (filter == CHANGED_WATCHPOINTS) {
if (!kv.second.changed) {
continue;
}
kv.second.changed = false;
}
vector<int8_t>* assigned_regs = nullptr;
if (update_watchpoint_register_assignments == UPDATE_WATCHPOINT_REGISTER_ASSIGNMENTS) {
kv.second.debug_regs_for_exec_read.clear();
assigned_regs = &kv.second.debug_regs_for_exec_read;
}
const MemoryRange& r = kv.first;
int watching = kv.second.watched_bits();
if (EXEC_BIT & watching) {
configure_watch_registers(result, r, WATCH_EXEC, assigned_regs, alignment);
}
if (READ_BIT & watching) {
configure_watch_registers(result, r, WATCH_READWRITE, assigned_regs, alignment);
} else if (WRITE_BIT & watching) {
configure_watch_registers(result, r, WATCH_WRITE, nullptr, alignment);
}
}
return result;
}
bool AddressSpace::has_any_watchpoint_changes() {
for (auto& kv : watchpoints) {
if (kv.second.changed) {
return true;
}
}
return false;
}
bool AddressSpace::has_exec_watchpoint_fired(remote_code_ptr addr) {
for (auto& kv : watchpoints) {
if (kv.second.changed && kv.second.exec_count > 0 &&
kv.first.start() == addr.to_data_ptr<void>()) {
return true;
}
}
return false;
}
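/**
* Try to program the current watchpoint set into hardware debug registers
* via one task of this address space. Returns true on success; on failure
* the recorded register assignments are cleared and false is returned.
*/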
bool AddressSpace::allocate_watchpoints() {
vector<WatchConfig> regs = get_watchpoints_internal(ALL_WATCHPOINTS, ALIGNED,
UPDATE_WATCHPOINT_REGISTER_ASSIGNMENTS);
if (task_set().empty()) {
// We can't validate the watchpoint set in this case
FATAL() << "No tasks???";
}
if ((*task_set().begin())->set_debug_regs(regs)) {
return true;
}
for (auto& kv : watchpoints) {
kv.second.debug_regs_for_exec_read.clear();
}
return false;
}
static inline void assert_coalescable(Task* t,
const AddressSpace::Mapping& lower,
const AddressSpace::Mapping& higher) {
ASSERT(t, lower.emu_file == higher.emu_file);
ASSERT(t, lower.flags == higher.flags);
ASSERT(t,
(lower.local_addr == 0 && higher.local_addr == 0) ||
lower.local_addr + lower.map.size() == higher.local_addr);
ASSERT(t, !lower.monitored_shared_memory && !higher.monitored_shared_memory);
}
static bool is_coalescable(const AddressSpace::Mapping& mleft,
const AddressSpace::Mapping& mright) {
if (!is_adjacent_mapping(mleft.map, mright.map, RESPECT_HEAP) ||
!is_adjacent_mapping(mleft.recorded_map, mright.recorded_map,
RESPECT_HEAP)) {
return false;
}
return mleft.flags == mright.flags;
}
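/**
* Merge the mapping at |it| with any adjacent mappings (on either side) that
* are coalescable with it, replacing them with a single Mapping entry.
*/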
void AddressSpace::coalesce_around(Task* t, MemoryMap::iterator it) {
auto first_kv = it;
while (mem.begin() != first_kv) {
auto next = first_kv;
--first_kv;
if (!is_coalescable(first_kv->second, next->second)) {
first_kv = next;
break;
}
assert_coalescable(t, first_kv->second, next->second);
}
auto last_kv = it;
while (true) {
auto prev = last_kv;
++last_kv;
if (mem.end() == last_kv ||
!is_coalescable(prev->second, last_kv->second)) {
last_kv = prev;
break;
}
assert_coalescable(t, prev->second, last_kv->second);
}
ASSERT(t, last_kv != mem.end());
if (first_kv == last_kv) {
LOG(debug) << " no mappings to coalesce";
return;
}
Mapping new_m(first_kv->second.map.extend(last_kv->first.end()),
first_kv->second.recorded_map.extend(last_kv->first.end()),
first_kv->second.emu_file,
clone_stat(first_kv->second.mapped_file_stat),
first_kv->second.local_addr);
new_m.flags = first_kv->second.flags;
LOG(debug) << " coalescing " << new_m.map;
// monitored-memory currently isn't coalescable so we don't need to
// adjust monitored_mem
mem.erase(first_kv, ++last_kv);
auto ins = mem.insert(MemoryMap::value_type(new_m.map, new_m));
DEBUG_ASSERT(ins.second); // key didn't already exist
}
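/**
* Restore the instruction bytes that the breakpoint at |it| overwrote, then
* remove it from the breakpoint map. If there is no running task to write
* through, the entry is left in place.
*/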
void AddressSpace::destroy_breakpoint(BreakpointMap::const_iterator it) {
if (task_set().empty()) {
return;
}
Task* t = first_running_task();
if (!t) {
return;
}
auto ptr = it->first.to_data_ptr<uint8_t>();
auto data = it->second.overwritten_data;
if (bkpt_instruction_length(arch()) == 1) {
LOG(debug) << "Writing back " << HEX(data[0]) << " at " << ptr;
} else {
LOG(debug) << "Writing back " << bkpt_instruction_length(arch()) << " bytes at " << ptr;
}
t->write_bytes_helper(ptr, bkpt_instruction_length(arch()),
data, nullptr, Task::IS_BREAKPOINT_RELATED);
breakpoints.erase(it);
}
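/**
* Called when [addr, addr + len) has been written in the tracee: for any
* breakpoint that overlaps the written range, re-read the underlying bytes
* into |overwritten_data| and re-plant the breakpoint instruction so the
* write doesn't wipe it out.
*/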
void AddressSpace::maybe_update_breakpoints(Task* t, remote_ptr<uint8_t> addr,
size_t len) {
for (auto& it : breakpoints) {
remote_ptr<uint8_t> bp_addr = it.first.to_data_ptr<uint8_t>();
if (addr <= bp_addr && bp_addr < addr + len - 1) {
// This breakpoint was overwritten. Note the new data and reset the
// breakpoint.
bool ok = true;
t->read_bytes_helper(bp_addr, bkpt_instruction_length(arch()),
&it.second.overwritten_data, &ok);
ASSERT(t, ok);
t->write_bytes_helper(bp_addr, bkpt_instruction_length(arch()),
breakpoint_insn(arch()));
}
}
}
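/**
* Invoke |f| on each mapping overlapping the page-rounded region covering
* [addr, addr + num_bytes). |f| receives the mapping and the range still to
* be processed, and is allowed to erase mappings. With ITERATE_CONTIGUOUS,
* iteration stops at the first gap in the mappings.
*/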
void AddressSpace::for_each_in_range(
remote_ptr<void> addr, ssize_t num_bytes,
function<void(Mapping m, MemoryRange rem)> f, int how) {
remote_ptr<void> region_start = floor_page_size(addr);
remote_ptr<void> last_unmapped_end = region_start;
remote_ptr<void> region_end = ceil_page_size(addr + num_bytes);
while (last_unmapped_end < region_end) {
// Invariant: |rem| is always exactly the region of
// memory remaining to be examined for pages to be
// unmapped.
MemoryRange rem(last_unmapped_end, region_end);
// The next page to iterate may not be contiguous with
// the last one seen.
auto it = mem.lower_bound(rem);
if (mem.end() == it) {
LOG(debug) << " not found, done.";
return;
}
// Don't make a reference here. |f| is allowed to erase Mappings.
MemoryRange range = it->first;
if (rem.end() <= range.start()) {
LOG(debug) << " mapping at " << range.start() << " out of range, done.";
return;
}
if (ITERATE_CONTIGUOUS == how &&
!(range.start() < region_start || rem.start() == range.start())) {
LOG(debug) << " discontiguous mapping at " << range.start() << ", done.";
return;
}
f(it->second, rem);
// Maintain the loop invariant.
last_unmapped_end = range.end();
}
}
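/**
* Insert mapping |m| into our map, register any monitored shared memory,
* coalesce it with compatible neighbors, and refresh saved watchpoint
* values for the newly mapped region.
*/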
void AddressSpace::map_and_coalesce(
Task* t, const KernelMapping& m, const KernelMapping& recorded_map,
EmuFile::shr_ptr emu_file, unique_ptr<struct stat> mapped_file_stat,
void* local_addr, shared_ptr<MonitoredSharedMemory> monitored) {
LOG(debug) << " mapping " << m;
if (monitored) {
monitored_mem.insert(m.start());
}
auto ins = mem.insert(MemoryMap::value_type(
m, Mapping(m, recorded_map, emu_file, std::move(mapped_file_stat),
local_addr, std::move(monitored))));
coalesce_around(t, ins.first);
update_watchpoint_values(m.start(), m.end());
}
static bool could_be_stack(const KernelMapping& km) {
// On 4.1.6-200.fc22.x86_64 we observe that during exec of the rr_exec_stub
// during replay, when the process switches from 32-bit to 64-bit, the 64-bit
// registers seem truncated to 32 bits during the initial PTRACE_GETREGS so
// our sp looks wrong and /proc/<pid>/maps doesn't identify the region as
// stack.
// On stub execs there should only be one read-writable memory area anyway.
return km.prot() == (PROT_READ | PROT_WRITE) && km.fsname() == "" &&
km.device() == KernelMapping::NO_DEVICE &&
km.inode() == KernelMapping::NO_INODE;
}
static dev_t check_device(const KernelMapping& km) {
if (km.fsname().c_str()[0] != '/') {
return km.device();
}
// btrfs files can return the wrong device number in /proc/<pid>/maps
struct stat st;
int ret = stat(km.fsname().c_str(), &st);
if (ret < 0) {
return km.device();
}
return st.st_dev;
}
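/**
* Build this address space's mapping table from /proc/<tid>/maps. A first
* pass checks whether the kernel reports a proper [stack] mapping; a second
* pass maps every region, marking the identified (or heuristically guessed)
* stack MAP_GROWSDOWN and extending it down by one guard page where the
* kernel hides that page from /proc/<tid>/maps.
*/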
void AddressSpace::populate_address_space(Task* t) {
bool found_proper_stack = false;
for (KernelMapIterator it(t); !it.at_end(); ++it) {
auto& km = it.current();
if (km.is_stack()) {
found_proper_stack = true;
}
}
// If we're being recorded by rr, we'll see the outer rr's rr_page and
// preload_thread_locals. In post_exec() we'll remap those with our
// own mappings. That's OK because a) the rr_page contents are the same
// anyway and immutable and b) the preload_thread_locals page is only
// used by the preload library, and the preload library only knows about
// the inner rr. I.e. as far as the outer rr is concerned, the tracee is
// not doing syscall buffering.
int found_stacks = 0;
for (KernelMapIterator it(t); !it.at_end(); ++it) {
auto& km = it.current();
int flags = km.flags();
remote_ptr<void> start = km.start();
bool is_stack = found_proper_stack ? km.is_stack() : could_be_stack(km);
if (is_stack) {
++found_stacks;
flags |= MAP_GROWSDOWN;
if (uses_invisible_guard_page()) {
// MAP_GROWSDOWN segments really occupy one additional page before
// the start address shown by /proc/<pid>/maps --- unless that page
// is already occupied by another mapping.
if (!has_mapping(start - page_size())) {
start -= page_size();
}
}
}
map(t, start, km.end() - start, km.prot(), flags, km.file_offset_bytes(),
km.fsname(), check_device(km), km.inode(), nullptr);
}
ASSERT(t, found_stacks == 1);
}
static int addr_bits(SupportedArch arch) {
switch (arch) {
default:
DEBUG_ASSERT(0 && "Unknown architecture");
RR_FALLTHROUGH;
case x86:
return 32;
// Current x86-64 systems have only 48 bits of virtual address space,
// and only the bottom half (47 bits) is usable by user space.
case x86_64:
return 47;
// AArch64 has a 48-bit address space, with user space and the kernel each
// getting their own 48 bits' worth of address space at opposite ends of
// the full 64-bit address space.
case aarch64:
return 48;
}
}
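/**
* For MAP_GROWSDOWN mappings, extend the range downwards so that at least
* chaos_mode_min_stack_size() is treated as reserved, leaving room for the
* stack to grow.
*/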
static MemoryRange adjust_range_for_stack_growth(const KernelMapping& km) {
remote_ptr<void> start = km.start();
if (km.flags() & MAP_GROWSDOWN) {
start = min(start, km.end() - AddressSpace::chaos_mode_min_stack_size());
}
return MemoryRange(start, km.end());
}
static MemoryRange overlaps_excluded_range(const RecordSession& session, MemoryRange range) {
for (const auto& r : session.excluded_ranges()) {
if (r.intersects(range)) {
return r;
}
}
return MemoryRange();
}
static bool is_all_memory_excluded(const RecordSession& session) {
for (const auto& r : session.excluded_ranges()) {
if (r == MemoryRange::all()) {
return true;
}
}
return false;
}
// Choose a 4TB range to exclude from random mappings. This makes room for
// advanced trace analysis tools that require a large address range in tracees
// that is never mapped.
static MemoryRange choose_global_exclusion_range(const RecordSession* session) {
if (session && is_all_memory_excluded(*session)) {
return MemoryRange(nullptr, 0);
}
if (session && session->fixed_global_exclusion_range().size()) {
// For TSAN we have a hardcoded range stored in the session.
return session->fixed_global_exclusion_range();
}
const uint64_t range_size = uint64_t(4)*1024*1024*1024*1024;
while (true) {
int bits = addr_bits(x86_64);
uint64_t r = ((uint64_t)(uint32_t)random() << 32) | (uint32_t)random();
uint64_t r_addr = r & ((uint64_t(1) << bits) - 1);
r_addr = min(r_addr, (uint64_t(1) << bits) - range_size);
remote_ptr<void> addr = floor_page_size(remote_ptr<void>(r_addr));
MemoryRange ret(addr, (uintptr_t)range_size);
if (!session || !overlaps_excluded_range(*session, ret).size()) {
return ret;
}
}
}
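/**
* The global exclusion range is chosen once per rr process (note the static
* local), so every address space sees the same range.
*/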
MemoryRange AddressSpace::get_global_exclusion_range(const RecordSession* session) {
static MemoryRange global_exclusion_range = choose_global_exclusion_range(session);
return global_exclusion_range;
}
static remote_ptr<void> usable_address_space_end(Task* t) {
return remote_ptr<void>((uint64_t(1) << addr_bits(t->arch())) - page_size());
}
static const remote_ptr<void> addr_space_start(0x40000);
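/**
* In chaos mode, pick a pseudo-random free region of |len| bytes. Starting
* from |hint| (sometimes deliberately ignored), a random address, or the
* start of a random existing mapping, we scan in one direction and then the
* other, skipping existing mappings (with room reserved for stack growth),
* the rr page and preload library pages, the global exclusion range and any
* session-specific excluded ranges. Returns null if nothing suitable is
* found.
*/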
remote_ptr<void> AddressSpace::chaos_mode_find_free_memory(RecordTask* t,
size_t len, remote_ptr<void> hint) {
if (is_all_memory_excluded(t->session())) {
return nullptr;
}
MemoryRange global_exclusion_range = get_global_exclusion_range(&t->session());
// NB: The area around RR_PAGE_ADDR is probably not free anyway, but if it
// somehow is, don't hand it out again.
static MemoryRange rrpage_so_range = MemoryRange(
    RR_PAGE_ADDR - PRELOAD_LIBRARY_PAGE_SIZE,
    RR_PAGE_ADDR + PRELOAD_LIBRARY_PAGE_SIZE);
// Ignore the hint half the time.
if (hint && (random() & 1)) {
hint = nullptr;
}
remote_ptr<void> start = hint;
if (!start) {
// Half the time, try to allocate at a completely random address. The other
// half of the time, we'll try to allocate immediately before or after a
// randomly chosen existing mapping.
if (random() % 2) {
uint64_t r = ((uint64_t)(uint32_t)random() << 32) | (uint32_t)random();
start = floor_page_size(remote_ptr<void>(r & ((uint64_t(1) << addr_bits(t->arch())) - 1)));
} else {
ASSERT(t, !mem.empty());
int map_index = random() % mem.size();
int map_count = 0;
for (const auto& m : maps()) {
if (map_count == map_index) {
start = m.map.start();
break;
}
++map_count;
}
}
}
// Reserve 3 pages at the end of userspace in case Monkeypatcher wants
// to allocate something there.
uint64_t reserve_area_for_monkeypatching = 3 * page_size();
remote_ptr<void> addr_space_end = usable_address_space_end(t) - reserve_area_for_monkeypatching;
// Clamp start so that we're in the usable address space.
start = max(start, addr_space_start);
start = min(start, addr_space_end - len);
// Search the address space in one direction all the way to the end,
// then in the other direction.
int direction = (random() % 2) ? 1 : -1;
remote_ptr<void> addr;
for (int iteration = 0; iteration < 2; ++iteration) {
// Invariant: [addr, addr+len) is always in the usable address space
// [addr_space_start, addr_space_end).
addr = start;
while (true) {
// Look for any reserved address space that overlaps [addr, addr+len]
// and store any overlapping range here. If multiple reserved areas
// overlap, we just pick one arbitrarily.
MemoryRange overlapping_range;
Maps m = maps_containing_or_after(addr);
if (m.begin() != m.end()) {
MemoryRange range = adjust_range_for_stack_growth(m.begin()->map);
if (range.start() < addr + len) {
overlapping_range = range;
}
}
if (!overlapping_range.size()) {
MemoryRange r(addr, ceil_page_size(len));
if (r.intersects(rrpage_so_range)) {
overlapping_range = rrpage_so_range;
} else if (r.intersects(global_exclusion_range)) {
overlapping_range = global_exclusion_range;
} else if (!t->session().excluded_ranges().empty()) {
ASSERT(t, word_size(t->arch()) >= 8)
<< "Chaos mode with ASAN/TSAN not supported in 32-bit processes";
MemoryRange excluded = overlaps_excluded_range(t->session(), r);
if (excluded.size()) {
overlapping_range = excluded;
}
}
}
if (!overlapping_range.size()) {
// No overlap and the range fits into our address space. Stop.
return addr;
}
if (direction == -1) {
// Try moving backwards to allocate just before the start of
// the overlapping range.
if (overlapping_range.start() < addr_space_start + len) {
break;
}
addr = overlapping_range.start() - len;
} else {
// Try moving forwards to allocate just after the end of
// the overlapping range.
if (overlapping_range.end() + len > addr_space_end) {
break;
}
addr = overlapping_range.end();
}
}
direction = -direction;
}
return nullptr;
}
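/**
* Return the start of a gap of at least |required_space| bytes after an
* existing mapping, searching from |after| (or from |last_free_memory| when
* that is further along and the policy allows), wrapping around to the start
* of the address space once. Returns null if no gap is found.
*/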
remote_ptr<void> AddressSpace::find_free_memory(Task* t,
size_t required_space,
remote_ptr<void> after,
FindFreeMemoryPolicy policy) {
if (after < last_free_memory &&
policy == FindFreeMemoryPolicy::USE_LAST_FREE_HINT) {
// Search for free memory starting at the last place we finished
// our search. This is more efficient than starting at the beginning
// every time.
after = last_free_memory;
}
remote_ptr<void> addr_space_end = usable_address_space_end(t);
ASSERT(t, required_space < UINT64_MAX - addr_space_end.as_int());
bool started_from_beginning = after.is_null();
while (true) {
auto maps = maps_starting_at(after);
auto current = maps.begin();
while (current != maps.end()) {
auto next = current;
++next;
remote_ptr<void> end_of_free_space;
if (next == maps.end()) {
end_of_free_space = addr_space_end;
} else {
end_of_free_space = min(addr_space_end, next->map.start());
}
if (current->map.end() + required_space <= end_of_free_space) {
return current->map.end();
}
current = next;
}
if (started_from_beginning) {
return nullptr;
}
started_from_beginning = true;
after = addr_space_start;
}
}
void AddressSpace::add_stap_semaphore_range(Task* task, MemoryRange range) {
ASSERT(task, range.start() != range.end())
<< "Unexpected zero-length SystemTap semaphore range: " << range;
ASSERT(task, (range.size() & 1) == 0)
<< "Invalid SystemTap semaphore range at "
<< range
<< ": size is not a multiple of the size of a STap semaphore!";
auto ptr = range.start().cast<uint16_t>(),
end = range.end().cast<uint16_t>();
for (; ptr < end; ++ptr) {
stap_semaphores.insert(ptr);
}
}
void AddressSpace::remove_stap_semaphore_range(Task* task, MemoryRange range) {
ASSERT(task, range.start() != range.end())
<< "Unexpected zero-length SystemTap semaphore range: " << range;
ASSERT(task, (range.size() & 1) == 0)
<< "Invalid SystemTap semaphore range at "
<< range
<< ": size is not a multiple of the size of a STap semaphore!";
auto ptr = range.start().cast<uint16_t>(),
end = range.end().cast<uint16_t>();
for (; ptr < end; ++ptr) {
stap_semaphores.erase(ptr);
}
}
bool AddressSpace::is_stap_semaphore(remote_ptr<uint16_t> addr) {
return stap_semaphores.find(addr) != stap_semaphores.end();
}
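/**
* Called when the fd tables of tasks in this address space may have changed.
* During recording, recompute whether all tasks still share a single fd
* table and, if the cached value in the preload globals differs, update it
* in the tracee and record the new value.
*/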
void AddressSpace::fd_tables_changed() {
if (!session()->is_recording()) {
// All modifications are made (and recorded) during recording, so there is
// nothing to do during replay.
return;
}
if (!syscallbuf_enabled()) {
return;
}
DEBUG_ASSERT(task_set().size() != 0);
uint8_t fdt_uniform = true;
RecordTask* rt = static_cast<RecordTask*>(first_running_task());
if (!rt) {
return;
}
auto fdt = rt->fd_table();
for (auto* t : task_set()) {
if (t->fd_table() != fdt) {
fdt_uniform = false;
}
}
auto addr = REMOTE_PTR_FIELD(rt->preload_globals, fdt_uniform);
bool ok = true;
if (rt->read_mem(addr, &ok) != fdt_uniform) {
if (!ok) {
return;
}
rt->write_mem(addr, fdt_uniform);
rt->record_local(addr, sizeof(fdt_uniform), &fdt_uniform);
}
}
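/**
* Return true if every byte of |range| is covered by mappings that all have
* MAP_PRIVATE set; false if any part is unmapped or not private.
*/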
bool AddressSpace::range_is_private_mapping(const MemoryRange& range) const {
MemoryRange r = range;
while (r.size() > 0) {
if (!has_mapping(r.start())) {
return false;
}
const AddressSpace::Mapping& m = mapping_of(r.start());
if (!(m.map.flags() & MAP_PRIVATE)) {
return false;
}
if (m.map.end() >= r.end()) {
return true;
}
r = MemoryRange(m.map.end(), r.end());
}
return true;
}
} // namespace rr