| /* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ |
| |
| #include "Session.h" |
| |
| #include <linux/limits.h> |
| #include <linux/unistd.h> |
| #include <syscall.h> |
| #include <sys/wait.h> |
| |
| #include <algorithm> |
| #include <limits> |
| |
| #include "rr/rr.h" |
| |
| #include "AutoRemoteSyscalls.h" |
| #include "EmuFs.h" |
| #include "Flags.h" |
| #include "PerfCounters.h" |
| #include "RecordSession.h" |
| #include "RecordTask.h" |
| #include "Task.h" |
| #include "ThreadGroup.h" |
| #include "core.h" |
| #include "kernel_metadata.h" |
| #include "log.h" |
| #include "util.h" |
| #include "preload/preload_interface.h" |
| |
| using namespace std; |
| |
| namespace rr { |
| |
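// Deferred state for a session clone. copy_state_to() forks only one
// "clone leader" per address space and parks everything else here;
// finish_initializing() later clones the remaining tasks and restores their
// captured memory and register state on first use of the session.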
| struct Session::CloneCompletion { |
| struct AddressSpaceClone { |
| Task* clone_leader; |
| Task::CapturedState clone_leader_state; |
| vector<Task::CapturedState> member_states; |
| vector<pair<remote_ptr<void>, vector<uint8_t>>> captured_memory; |
| }; |
| vector<AddressSpaceClone> address_spaces; |
| Task::ClonedFdTables cloned_fd_tables; |
| }; |
| |
| Session::Session() |
| : tracee_socket(make_shared<ScopedFd>()), |
| tracee_socket_receiver(make_shared<ScopedFd>()), |
| tracee_socket_fd_number(0), |
| next_task_serial_(1), |
| rrcall_base_(RR_CALL_BASE), |
| syscallbuf_fds_disabled_size_(SYSCALLBUF_FDS_DISABLED_SIZE), |
| syscall_seccomp_ordering_(PTRACE_SYSCALL_BEFORE_SECCOMP_UNKNOWN), |
| ticks_semantics_(PerfCounters::default_ticks_semantics()), |
| done_initial_exec_(false), |
| visible_execution_(true) { |
| LOG(debug) << "Session " << this << " created"; |
| } |
| |
| Session::~Session() { |
| kill_all_tasks(); |
| LOG(debug) << "Session " << this << " destroyed"; |
| |
| for (auto tg : thread_group_map_) { |
| tg.second->forget_session(); |
| } |
| } |
| |
| Session::Session(const Session& other) { |
| statistics_ = other.statistics_; |
| next_task_serial_ = other.next_task_serial_; |
| done_initial_exec_ = other.done_initial_exec_; |
| rrcall_base_ = other.rrcall_base_; |
| syscallbuf_fds_disabled_size_ = other.syscallbuf_fds_disabled_size_; |
| visible_execution_ = other.visible_execution_; |
| tracee_socket = other.tracee_socket; |
| tracee_socket_receiver = other.tracee_socket_receiver; |
| tracee_socket_fd_number = other.tracee_socket_fd_number; |
| ticks_semantics_ = other.ticks_semantics_; |
| original_affinity_ = other.original_affinity_; |
| } |
| |
| void Session::on_create(ThreadGroup* tg) { thread_group_map_[tg->tguid()] = tg; } |
| void Session::on_destroy(ThreadGroup* tg) { |
| thread_group_map_.erase(tg->tguid()); |
| } |
| |
| void Session::post_exec() { |
| /* We just saw a successful exec(), so from now on we know |
| * that the address space layout for the replay tasks will |
| * (should!) be the same as for the recorded tasks. So we can |
| * start validating registers at events. */ |
| assert_fully_initialized(); |
| if (done_initial_exec_) { |
| return; |
| } |
| done_initial_exec_ = true; |
| DEBUG_ASSERT(tasks().size() == 1); |
| Task* t = tasks().begin()->second; |
| t->flush_inconsistent_state(); |
| spawned_task_error_fd_.close(); |
| } |
| |
| AddressSpace::shr_ptr Session::create_vm(Task* t, const std::string& exe, |
| uint32_t exec_count) { |
| assert_fully_initialized(); |
| AddressSpace::shr_ptr as(new AddressSpace(t, exe, exec_count)); |
| as->insert_task(t); |
| vm_map[as->uid()] = as.get(); |
| return as; |
| } |
| |
| AddressSpace::shr_ptr Session::clone(Task* t, AddressSpace::shr_ptr vm) { |
| assert_fully_initialized(); |
  // If vm already belongs to our session, this is a fork; otherwise it's
  // a session clone.
| AddressSpace::shr_ptr as; |
| if (this == vm->session()) { |
| as = AddressSpace::shr_ptr( |
| new AddressSpace(this, *vm, t->rec_tid, t->tuid().serial(), 0)); |
| } else { |
| as = AddressSpace::shr_ptr(new AddressSpace(this, *vm, vm->uid().tid(), |
| vm->uid().serial(), |
| vm->uid().exec_count())); |
| } |
| vm_map[as->uid()] = as.get(); |
| return as; |
| } |
| |
| ThreadGroup::shr_ptr Session::create_initial_tg(Task* t) { |
| ThreadGroup::shr_ptr tg( |
| new ThreadGroup(this, nullptr, t->rec_tid, t->rec_tid, |
| t->tuid().serial())); |
| tg->insert_task(t); |
| return tg; |
| } |
| |
| ThreadGroup::shr_ptr Session::clone(Task* t, ThreadGroup::shr_ptr tg) { |
| assert_fully_initialized(); |
  // If tg already belongs to our session, this is a fork creating a new
  // thread group; otherwise it's a session clone of an existing thread group.
| if (this == tg->session()) { |
| return ThreadGroup::shr_ptr( |
| new ThreadGroup(this, tg.get(), t->rec_tid, |
| t->own_namespace_tid(), t->tuid().serial())); |
| } |
| ThreadGroup* parent = |
| tg->parent() ? find_thread_group(tg->parent()->tguid()) : nullptr; |
| return ThreadGroup::shr_ptr( |
| new ThreadGroup(this, parent, tg->tgid, |
| t->own_namespace_tid(), tg->tguid().serial())); |
| } |
| |
| Task* Session::new_task(pid_t tid, pid_t rec_tid, uint32_t serial, |
| SupportedArch a, const std::string&) { |
| return new Task(*this, tid, rec_tid, serial, a); |
| } |
| |
| vector<AddressSpace*> Session::vms() const { |
| vector<AddressSpace*> result; |
| for (auto& vm : vm_map) { |
| result.push_back(vm.second); |
| } |
| return result; |
| } |
| |
| Task* Session::clone(Task* p, int flags, remote_ptr<void> stack, |
| remote_ptr<void> tls, remote_ptr<int> cleartid_addr, |
| pid_t new_tid, pid_t new_rec_tid) { |
| assert_fully_initialized(); |
| Task* c = p->clone(Task::TRACEE_CLONE, flags, stack, tls, cleartid_addr, |
| new_tid, new_rec_tid, next_task_serial()); |
| on_create(c); |
| return c; |
| } |
| |
| Task* Session::find_task(pid_t rec_tid) const { |
| finish_initializing(); |
| auto it = tasks().find(rec_tid); |
| return tasks().end() != it ? it->second : nullptr; |
| } |
| |
| Task* Session::find_task(const TaskUid& tuid) const { |
| Task* t = find_task(tuid.tid()); |
| return t && t->tuid() == tuid ? t : nullptr; |
| } |
| |
| ThreadGroup* Session::find_thread_group(const ThreadGroupUid& tguid) const { |
| finish_initializing(); |
| auto it = thread_group_map_.find(tguid); |
| if (thread_group_map_.end() == it) { |
| return nullptr; |
| } |
| return it->second; |
| } |
| |
| ThreadGroup* Session::find_thread_group(pid_t pid) const { |
| finish_initializing(); |
| for (auto& tg : thread_group_map_) { |
| if (tg.first.tid() == pid) { |
| return tg.second; |
| } |
| } |
| return nullptr; |
| } |
| |
| AddressSpace* Session::find_address_space(const AddressSpaceUid& vmuid) const { |
| finish_initializing(); |
| auto it = vm_map.find(vmuid); |
| if (vm_map.end() == it) { |
| return nullptr; |
| } |
| return it->second; |
| } |
| |
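// Note that deleting a Task invokes on_destroy(), which erases its entry
// from task_map, so the drain loop below terminates.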
| void Session::kill_all_tasks() { |
| LOG(debug) << "Killing all tasks ..."; |
| for (int pass = 0; pass <= 1; ++pass) { |
    /* We delete tasks in two passes. First, we kill
     * every non-thread-group-leader, then we kill every group leader.
     * Linux expects thread-group leaders to survive until the last
     * member of the thread group has exited, so we accommodate that.
     */
| for (auto& v : task_map) { |
| Task* t = v.second; |
| bool is_group_leader = t->tid == t->real_tgid(); |
| if (pass == 0 ? is_group_leader : !is_group_leader) { |
| continue; |
| } |
| t->kill(); |
| } |
| } |
| while (!task_map.empty()) { |
| Task* t = task_map.rbegin()->second; |
| delete t; |
| } |
  DEBUG_ASSERT(task_map.empty());
| } |
| |
| void Session::on_destroy(AddressSpace* vm) { |
| DEBUG_ASSERT(vm->task_set().size() == 0); |
| DEBUG_ASSERT(vm_map.count(vm->uid()) == 1); |
| vm_map.erase(vm->uid()); |
| } |
| |
| void Session::on_destroy(Task* t) { |
| DEBUG_ASSERT(task_map.count(t->rec_tid) == 1); |
| task_map.erase(t->rec_tid); |
| } |
| |
| void Session::on_create(Task* t) { task_map[t->rec_tid] = t; } |
| |
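// Create the pipe used to report errors from a spawned tracee and return
// its write end; the read end is retained in spawned_task_error_fd_ and
// drained by read_spawned_task_error(). A sketch of the intended flow
// (hypothetical caller):
//   ScopedFd write_end = session.create_spawn_task_error_pipe();
//   // ... hand write_end to the spawned child; if spawning fails, the
//   // child writes a message that read_spawned_task_error() returns.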
| ScopedFd Session::create_spawn_task_error_pipe() { |
| int fds[2]; |
  if (0 != pipe2(fds, O_CLOEXEC)) {
    FATAL() << "Failed to create pipe";
  }
| spawned_task_error_fd_ = ScopedFd(fds[0]); |
| return ScopedFd(fds[1]); |
| } |
| |
| string Session::read_spawned_task_error() const { |
  char buf[1024] = "";
  // Read at most sizeof(buf) - 1 bytes so the NUL store below stays in
  // bounds even when the pipe fills the buffer.
  ssize_t len = read(spawned_task_error_fd_, buf, sizeof(buf) - 1);
| if (len <= 0) { |
| return string(); |
| } |
| buf[len] = 0; |
| return string(buf, len); |
| } |
| |
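// Translate the stop of task t into a BreakStatus for the debugger:
// a hit breakpoint, watchpoint changes, a completed single-step, and/or a
// pending signal.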
| BreakStatus Session::diagnose_debugger_trap(Task* t, RunCommand run_command) { |
| assert_fully_initialized(); |
| BreakStatus break_status; |
| break_status.task_context = TaskContext(t); |
| |
| int stop_sig = t->stop_sig(); |
| if (!stop_sig) { |
| // This can happen if we were INCOMPLETE because we're close to |
| // the ticks_target. |
| return break_status; |
| } |
| |
| if (SIGTRAP != stop_sig) { |
| BreakpointType pending_bp = t->vm()->get_breakpoint_type_at_addr(t->ip()); |
| if (BKPT_USER == pending_bp) { |
| // A signal was raised /just/ before a trap |
| // instruction for a SW breakpoint. This is |
| // observed when debuggers write trap |
| // instructions into no-exec memory, for |
| // example the stack. |
| // |
| // We report the breakpoint before any signal |
| // that might have been raised in order to let |
| // the debugger do something at the breakpoint |
| // insn; possibly clearing the breakpoint and |
| // changing the $ip. Otherwise, we expect the |
| // debugger to clear the breakpoint and resume |
| // execution, which should raise the original |
| // signal again. |
| LOG(debug) << "hit debugger breakpoint BEFORE ip " << t->ip() << " for " |
| << t->get_siginfo(); |
| break_status.breakpoint_hit = true; |
    } else if (stop_sig != PerfCounters::TIME_SLICE_SIGNAL) {
| break_status.signal = |
| unique_ptr<siginfo_t>(new siginfo_t(t->get_siginfo())); |
| LOG(debug) << "Got signal " << *break_status.signal << " (expected sig " |
| << stop_sig << ")"; |
| break_status.signal->si_signo = stop_sig; |
| } |
| } else { |
| TrapReasons trap_reasons = t->compute_trap_reasons(); |
| |
| // Conceal any internal singlestepping |
| if (trap_reasons.singlestep && is_singlestep(run_command)) { |
| LOG(debug) << " finished debugger stepi"; |
| break_status.singlestep_complete = true; |
| } |
| |
| if (trap_reasons.watchpoint) { |
| check_for_watchpoint_changes(t, break_status); |
| } |
| |
| if (trap_reasons.breakpoint) { |
| BreakpointType retired_bp = |
| t->vm()->get_breakpoint_type_for_retired_insn(t->ip()); |
| if (BKPT_USER == retired_bp) { |
| // SW breakpoint: $ip is just past the |
| // breakpoint instruction. Move $ip back |
| // right before it. |
| t->move_ip_before_breakpoint(); |
| break_status.breakpoint_hit = true; |
| LOG(debug) << "hit debugger breakpoint at ip " << t->ip(); |
| } |
| } |
| } |
| |
| return break_status; |
| } |
| |
| void Session::check_for_watchpoint_changes(Task* t, BreakStatus& break_status) { |
| assert_fully_initialized(); |
| break_status.watchpoints_hit = t->vm()->consume_watchpoint_changes(); |
| } |
| |
| void Session::assert_fully_initialized() const { |
| DEBUG_ASSERT(!clone_completion && "Session not fully initialized"); |
| } |
| |
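// Materialize a deferred session clone (see CloneCompletion above):
// recreate each leader's syscallbuf mappings, write back captured memory,
// clone the remaining tasks of every address space, and restore their
// captured states. A no-op once clone_completion has been consumed.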
| void Session::finish_initializing() const { |
| if (!clone_completion) { |
| return; |
| } |
| |
| Session* self = const_cast<Session*>(this); |
| for (auto& asleader : clone_completion->address_spaces) { |
| { |
| AutoRemoteSyscalls remote(asleader.clone_leader); |
| for (const auto& m : asleader.clone_leader->vm()->maps()) { |
| // Creating this mapping was delayed in capture_state for performance |
| if (m.flags & AddressSpace::Mapping::IS_SYSCALLBUF) { |
| self->recreate_shared_mmap(remote, m); |
| } |
| } |
| for (auto& mem : asleader.captured_memory) { |
| asleader.clone_leader->write_bytes_helper(mem.first, mem.second.size(), |
| mem.second.data()); |
| } |
| for (auto& asmember : asleader.member_states) { |
| auto it = thread_group_map_.find(asmember.tguid); |
| ThreadGroup::shr_ptr tg(it == thread_group_map_.end() ? nullptr : |
| it->second->shared_from_this()); |
| if (!tg) { |
| tg = std::make_shared<ThreadGroup> |
| (self, nullptr, asmember.tguid.tid(), asmember.tguid.tid(), asmember.tguid.serial()); |
| } |
| Task* t_clone = Task::os_clone_into( |
| asmember, remote, clone_completion->cloned_fd_tables, tg); |
| self->on_create(t_clone); |
| t_clone->copy_state(asmember); |
| } |
| } |
| asleader.clone_leader->copy_state(asleader.clone_leader_state); |
| } |
| |
| self->clone_completion = nullptr; |
| } |
| |
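// Remap one of the clone leader's shared mappings so that it is backed by
// dest_emu_fs instead of emu_fs, cloning the EmuFile into dest_emu_fs if it
// doesn't have one for this mapping yet.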
| static void remap_shared_mmap(AutoRemoteSyscalls& remote, EmuFs& emu_fs, |
| EmuFs& dest_emu_fs, |
| const AddressSpace::Mapping& m_in_mem) { |
| AddressSpace::Mapping m = m_in_mem; |
| |
| LOG(debug) << " remapping shared region at " << m.map.start() << "-" |
| << m.map.end(); |
| remote.infallible_syscall(syscall_number_for_munmap(remote.arch()), |
| m.map.start(), m.map.size()); |
| |
| EmuFile::shr_ptr emu_file; |
| if (dest_emu_fs.has_file_for(m.recorded_map)) { |
| emu_file = dest_emu_fs.at(m.recorded_map); |
| } else { |
| emu_file = dest_emu_fs.clone_file(emu_fs.at(m.recorded_map)); |
| } |
| |
| // TODO: this duplicates some code in replay_syscall.cc, but |
| // it's somewhat nontrivial to factor that code out. |
| int remote_fd = remote.infallible_send_fd_if_alive(emu_file->fd()); |
| if (remote_fd < 0) { |
| if (remote.task()->vm()->task_set().size() > remote.task()->thread_group()->task_set().size()) { |
| // XXX not sure how to handle the case where the tracee died after |
| // we unmapped the area |
| FATAL() << "Unexpected task death leaving this address space in a bad state"; |
| } |
| return; |
| } |
| struct stat real_file = remote.task()->stat_fd(remote_fd); |
| string real_file_name = remote.task()->file_name_of_fd(remote_fd); |
| // XXX this condition is x86/x64-specific, I imagine. |
| // The remapped segment *must* be remapped at the same address, |
| // or else many things will go haywire. |
| auto ret = remote.infallible_mmap_syscall_if_alive(m.map.start(), m.map.size(), m.map.prot(), |
| (m.map.flags() & ~MAP_ANONYMOUS) | MAP_FIXED, |
| remote_fd, |
| m.map.file_offset_bytes()); |
| if (!ret) { |
| if (remote.task()->vm()->task_set().size() > remote.task()->thread_group()->task_set().size()) { |
| // XXX not sure how to handle the case where the tracee died after |
| // we unmapped the area |
| FATAL() << "Unexpected task death leaving this address space in a bad state"; |
| } |
| return; |
| } |
| |
| // We update the AddressSpace mapping too, since that tracks the real file |
| // name and we need to update that. |
| remote.task()->vm()->map( |
| remote.task(), m.map.start(), m.map.size(), m.map.prot(), m.map.flags(), |
| m.map.file_offset_bytes(), real_file_name, real_file.st_dev, |
| real_file.st_ino, nullptr, &m.recorded_map, emu_file); |
| |
| remote.infallible_close_syscall_if_alive(remote_fd); |
| } |
| |
| /*static*/ const char* Session::rr_mapping_prefix() { return "/rr-shared-"; } |
| |
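// Create a file in tmp_dir() (immediately unlinked) and map it MAP_SHARED
// into both rr and the tracee. Returns a default-constructed KernelMapping
// if the tracee died before the mapping could be established.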
| KernelMapping Session::create_shared_mmap( |
| AutoRemoteSyscalls& remote, size_t size, remote_ptr<void> required_child_addr, |
| const char* name, int tracee_prot, int tracee_flags, |
| MonitoredSharedMemory::shr_ptr monitored) { |
| Task* t = remote.task(); |
| static int nonce = 0; |
| // Create the segment we'll share with the tracee. |
| char path[PATH_MAX]; |
| snprintf(path, sizeof(path) - 1, "%s%s%s-%d-%d", tmp_dir(), |
| rr_mapping_prefix(), name, t->real_tgid(), nonce++); |
| |
  ScopedFd shmem_fd(path, O_CREAT | O_EXCL | O_RDWR, 0600);
| ASSERT(t, shmem_fd.is_open()); |
| /* Remove the fs name so that we don't have to worry about |
| * cleaning up this segment in error conditions. */ |
| unlink(path); |
| |
  // Size the segment before mapping it, so no access through the local
  // mapping can touch pages beyond the end of the backing file.
  resize_shmem_segment(shmem_fd, size);

  void* map_addr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
                        MAP_SHARED, shmem_fd, 0);
  if (map_addr == MAP_FAILED) {
    FATAL() << "Failed to mmap shmem region";
  }
| |
| remote_ptr<void> child_map_addr = required_child_addr; |
| if (child_map_addr.is_null()) { |
| if (t->session().is_recording() && |
| static_cast<RecordTask*>(t)->enable_chaos_memory_allocations()) { |
| child_map_addr = t->vm()->chaos_mode_find_free_memory(static_cast<RecordTask*>(t), |
| size, nullptr); |
| } else { |
| child_map_addr = t->vm()->find_free_memory(t, size, RR_PAGE_ADDR, |
| AddressSpace::FindFreeMemoryPolicy::USE_LAST_FREE_HINT); |
| if (!child_map_addr) { |
| FATAL() << "Can't find free memory for shared mmap"; |
| } |
| } |
| } |
| |
| struct stat st; |
| ASSERT(t, 0 == ::fstat(shmem_fd, &st)); |
| int flags = MAP_SHARED; |
| if (!required_child_addr.is_null()) { |
| flags |= MAP_FIXED; |
| } |
| |
| int child_shmem_fd = remote.infallible_send_fd_if_alive(shmem_fd); |
| if (child_shmem_fd < 0) { |
| return KernelMapping(); |
| } |
| LOG(debug) << "created shmem segment " << path; |
| |
  // Map the segment into the tracee's address space (it is already mapped
  // into ours, above).
| remote_ptr<void> addr = remote.infallible_mmap_syscall_if_alive( |
| child_map_addr, size, tracee_prot, flags | MAP_FIXED, child_shmem_fd, 0); |
| if (!addr) { |
| // tracee unexpectedly died. |
| // We leak the fd; cleaning it up is probably impossible/unnecessary. |
| return KernelMapping(); |
| } |
| |
| // Note the mapping after we successfully created it in the child. |
| // If the child mapping fails for some reason (e.g. SIGKILL) we still |
| // want our cache to be correct (and not contain the mapping). |
| KernelMapping km = t->vm()->map( |
| t, child_map_addr, size, tracee_prot, flags | tracee_flags, 0, |
| path, st.st_dev, st.st_ino, nullptr, nullptr, nullptr, map_addr, |
| std::move(monitored)); |
| |
| remote.infallible_close_syscall_if_alive(child_shmem_fd); |
| return km; |
| } |
| |
| static char* extract_name(char* name_buffer, size_t buffer_size) { |
| // Recover the name that was originally chosen by finding the part of the |
| // name between rr_mapping_prefix and the -%d-%d at the end. |
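  // For example, a path like "/tmp/rr-shared-syscallbuf-1234-0"
  // (hypothetical tgid/nonce values) yields "syscallbuf".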
| char* path_start = strstr(name_buffer, Session::rr_mapping_prefix()); |
| DEBUG_ASSERT(path_start && |
| "Passed something to create_shared_mmap that" |
| " wasn't a mapping shared between rr and the tracee?"); |
| size_t prefix_len = path_start - name_buffer; |
| buffer_size -= prefix_len; |
| name_buffer += prefix_len; |
| |
| char* name_end = name_buffer + strnlen(name_buffer, buffer_size); |
| char* name_start = name_buffer + strlen(Session::rr_mapping_prefix()); |
| int hyphens_seen = 0; |
| while (name_end > name_start) { |
| --name_end; |
| if (*name_end == '-') { |
| ++hyphens_seen; |
| } else if (*name_end == '/') { |
| DEBUG_ASSERT(false && |
| "Passed something to create_shared_mmap that" |
| " wasn't a mapping shared between rr and the tracee?"); |
| } |
| if (hyphens_seen == 2) { |
| break; |
| } |
| } |
| DEBUG_ASSERT(hyphens_seen == 2); |
| *name_end = '\0'; |
| return name_start; |
| } |
| |
| const AddressSpace::Mapping Session::recreate_shared_mmap( |
| AutoRemoteSyscalls& remote, const AddressSpace::Mapping& m, |
| PreserveContents preserve, MonitoredSharedMemory::shr_ptr monitored) { |
| char name[PATH_MAX]; |
| strncpy(name, m.map.fsname().c_str(), sizeof(name) - 1); |
| name[sizeof(name) - 1] = 0; |
| uint32_t flags = m.flags; |
| size_t size = m.map.size(); |
| void* preserved_data = preserve == PRESERVE_CONTENTS ? m.local_addr : nullptr; |
| if (preserved_data) { |
| remote.task()->vm()->detach_local_mapping(m.map.start()); |
| } |
| remote_ptr<void> new_addr = |
| create_shared_mmap(remote, m.map.size(), m.map.start(), |
| extract_name(name, sizeof(name)), m.map.prot(), 0, |
| std::move(monitored)) |
| .start(); |
| AddressSpace::Mapping new_map; |
| if (new_addr) { |
| // m may be invalid now |
| remote.task()->vm()->mapping_flags_of(new_addr) = flags; |
| new_map = remote.task()->vm()->mapping_of(new_addr); |
| if (preserved_data) { |
| memcpy(new_map.local_addr, preserved_data, size); |
| munmap(preserved_data, size); |
| } |
| } |
| return new_map; |
| } |
| |
| AddressSpace::Mapping Session::steal_mapping( |
| AutoRemoteSyscalls& remote, const AddressSpace::Mapping& m, |
| MonitoredSharedMemory::shr_ptr monitored) { |
  // We include the full path of the original mapping in the name of the
  // shared mapping, replacing slashes with dashes.
| char name[PATH_MAX - 40]; |
| strncpy(name, m.map.fsname().c_str(), sizeof(name)-1); |
| name[sizeof(name) - 1] = '\0'; |
| for (char* ptr = name; *ptr != '\0'; ++ptr) { |
| if (*ptr == '/') { |
| *ptr = '-'; |
| } |
| } |
| |
| // Now create the new mapping in its place |
| remote_ptr<void> start = m.map.start(); |
| size_t sz = m.map.size(); |
| const AddressSpace::Mapping& new_m = remote.task()->vm()->mapping_of( |
| create_shared_mmap(remote, sz, start, name, m.map.prot(), |
| m.map.flags() & (MAP_GROWSDOWN | MAP_STACK), |
| std::move(monitored)) |
| .start()); |
| return new_m; |
| } |
| |
// Replace a MAP_PRIVATE segment with one that is shared between rr and the
// tracee.
| void Session::make_private_shared(AutoRemoteSyscalls& remote, |
| const AddressSpace::Mapping m) { |
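  // Note that 'm' is deliberately taken by value: the AddressSpace's own
  // Mapping objects are invalidated by the remap/steal below, so we need a
  // private copy to keep using.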
| if (!(m.map.flags() & MAP_PRIVATE)) { |
| return; |
| } |
| // Find a place to map the current segment to temporarily |
| remote_ptr<void> start = m.map.start(); |
| size_t sz = m.map.size(); |
| remote_ptr<void> free_mem = remote.task()->vm()->find_free_memory(remote.task(), sz); |
| remote.infallible_syscall(syscall_number_for_mremap(remote.arch()), start, sz, |
| sz, MREMAP_MAYMOVE | MREMAP_FIXED, free_mem); |
| remote.task()->vm()->remap(remote.task(), start, sz, free_mem, sz, |
| MREMAP_MAYMOVE | MREMAP_FIXED); |
| |
  // AutoRemoteSyscalls may have gotten unlucky and picked the old stack
  // segment as its scratch space; re-evaluate that choice.
| AutoRemoteSyscalls remote2(remote.task()); |
| |
| AddressSpace::Mapping new_m = steal_mapping(remote2, m); |
| |
| if (!new_m.local_addr) { |
| return; |
| } |
  // And copy over the contents. Since we can't just call memcpy in the
  // inferior, copy directly from the remote private mapping into the local
  // view of the shared mapping. We use the fallible read method because the
  // mapping may be larger than its backing file, in which case the read
  // comes up short.
| remote2.task()->read_bytes_fallible(free_mem, sz, new_m.local_addr); |
| |
| // Finally unmap the original segment |
| remote2.infallible_syscall(syscall_number_for_munmap(remote.arch()), free_mem, |
| sz); |
| remote.task()->vm()->unmap(remote.task(), free_mem, sz); |
| } |
| |
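// Capture only the meaningful prefix of a syscallbuf mapping: the header
// plus num_rec_bytes of record data. If the tracee holds the syscallbuf
// lock, an incomplete record may follow num_rec_bytes, so capture the whole
// buffer in that (uncommon) case.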
| static vector<uint8_t> capture_syscallbuf(const AddressSpace::Mapping& m, |
| Task* clone_leader) { |
| remote_ptr<uint8_t> start = m.map.start().cast<uint8_t>(); |
| auto syscallbuf_hdr = start.cast<struct syscallbuf_hdr>(); |
| size_t data_size; |
| if (clone_leader->read_mem(REMOTE_PTR_FIELD(syscallbuf_hdr, locked))) { |
| // There may be an incomplete syscall record after num_rec_bytes that |
| // we need to capture here. We don't know how big that record is, |
| // so just record the entire buffer. This should not be common. |
| data_size = m.map.size(); |
| } else { |
| data_size = clone_leader->read_mem( |
| REMOTE_PTR_FIELD(syscallbuf_hdr, num_rec_bytes)) + |
| sizeof(struct syscallbuf_hdr); |
| } |
| return clone_leader->read_mem(start, data_size); |
| } |
| |
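// Return the cloned fd table corresponding to task_to_clone's table,
// creating the clone on first use. Keying by the original table's address
// ensures tasks that shared an fd table in the source session share the
// clone in the destination too.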
| static FdTable::shr_ptr& get_or_clone_fd_table( |
| Task::ClonedFdTables& existing_clones, Task* task_to_clone) { |
| auto original_fd_table = task_to_clone->fd_table(); |
| FdTable::shr_ptr& existing_clone = |
| existing_clones[uintptr_t(original_fd_table.get())]; |
| if (!existing_clone) { |
| existing_clone = original_fd_table->clone(); |
| } |
| return existing_clone; |
| } |
| |
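// Set up 'dest' as a deferred clone of this session: fork a clone leader
// for each address space, remap its EmuFs-backed shared mappings into
// dest_emu_fs, and capture the state of every other task so that
// finish_initializing() can clone them lazily later.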
| void Session::copy_state_to(Session& dest, EmuFs& emu_fs, EmuFs& dest_emu_fs) { |
| assert_fully_initialized(); |
| DEBUG_ASSERT(!dest.clone_completion); |
| |
| auto completion = unique_ptr<CloneCompletion>(new CloneCompletion()); |
| auto& cloned_fd_tables = completion->cloned_fd_tables; |
| |
| for (auto vm : vm_map) { |
| // Pick an arbitrary task to be group leader. The actual group leader |
| // might have died already. |
| Task* group_leader = *vm.second->task_set().begin(); |
| LOG(debug) << " forking tg " << group_leader->tgid() |
| << " (real: " << group_leader->real_tgid() << ")"; |
| |
| completion->address_spaces.push_back(CloneCompletion::AddressSpaceClone()); |
| auto& group = completion->address_spaces.back(); |
| |
| group.clone_leader = group_leader->os_fork_into( |
| &dest, get_or_clone_fd_table(cloned_fd_tables, group_leader)); |
| dest.on_create(group.clone_leader); |
| LOG(debug) << " forked new group leader " << group.clone_leader->tid; |
| |
| { |
| AutoRemoteSyscalls remote(group.clone_leader); |
| vector<AddressSpace::Mapping> shared_maps_to_clone; |
| for (const auto& m : group.clone_leader->vm()->maps()) { |
| // Special case the syscallbuf as a performance optimization. The amount |
| // of data we need to capture is usually significantly smaller than the |
| // size of the mapping, so allocating the whole mapping here would be |
| // wasteful. |
| if (m.flags & AddressSpace::Mapping::IS_SYSCALLBUF) { |
| group.captured_memory.push_back(make_pair( |
| m.map.start(), capture_syscallbuf(m, group.clone_leader))); |
| } else if (m.local_addr != nullptr) { |
| ASSERT(group.clone_leader, |
| m.map.start() == AddressSpace::preload_thread_locals_start()); |
| } else if ((m.recorded_map.flags() & MAP_SHARED) && |
| emu_fs.has_file_for(m.recorded_map)) { |
| shared_maps_to_clone.push_back(m); |
| } |
| } |
      // Do this in a separate loop to avoid iterator invalidation issues
| for (const auto& m : shared_maps_to_clone) { |
| remap_shared_mmap(remote, emu_fs, dest_emu_fs, m); |
| } |
| |
| for (auto t : vm.second->task_set()) { |
| if (group_leader == t) { |
| continue; |
| } |
| LOG(debug) << " cloning " << t->rec_tid; |
| |
| get_or_clone_fd_table(cloned_fd_tables, t); |
| group.member_states.push_back(t->capture_state()); |
| } |
| } |
| |
| group.clone_leader_state = group_leader->capture_state(); |
| } |
| dest.clone_completion = std::move(completion); |
| |
| DEBUG_ASSERT(dest.vms().size() > 0); |
| } |
| |
| bool Session::has_cpuid_faulting() { |
| return !Flags::get().disable_cpuid_faulting && cpuid_faulting_works(); |
| } |
| |
| int Session::cpu_binding() const { |
| return const_cast<Session*>(this)->trace_stream()->bound_to_cpu(); |
| } |
| |
| // Returns true if we succeeded, false if we failed because the |
| // requested CPU does not exist/is not available. |
| static bool set_cpu_affinity(int cpu) { |
| DEBUG_ASSERT(cpu >= 0); |
| |
| cpu_set_t mask; |
| CPU_ZERO(&mask); |
| CPU_SET(cpu, &mask); |
| if (0 > sched_setaffinity(0, sizeof(mask), &mask)) { |
| if (errno == EINVAL) { |
| return false; |
| } |
| FATAL() << "Couldn't bind to CPU " << cpu; |
| } |
| return true; |
| } |
| |
| void Session::do_bind_cpu() { |
| sched_getaffinity(0, sizeof(original_affinity_), &original_affinity_); |
| |
| int cpu_index = this->cpu_binding(); |
| if (cpu_index >= 0) { |
| // Set CPU affinity now, after we've created any helper threads |
| // (so they aren't affected), but before we create any |
| // tracees (so they are all affected). |
| // Note that we're binding rr itself to the same CPU as the |
| // tracees, since this seems to help performance. |
| if (!set_cpu_affinity(cpu_index)) { |
| if (has_cpuid_faulting() && !is_recording()) { |
| cpu_index = choose_cpu(BIND_CPU, cpu_lock); |
| if (!set_cpu_affinity(cpu_index)) { |
| FATAL() << "Can't bind to requested CPU " << cpu_index |
| << " even after we re-selected it"; |
| } |
| LOG(warn) << "Bound to CPU " << cpu_index |
| << "instead of selected " << trace_stream()->bound_to_cpu() |
| << "because the latter is not available;\n" |
| << "Hoping tracee doesn't use LSL instruction!"; |
| trace_stream()->set_bound_cpu(cpu_index); |
| } else { |
| FATAL() << "Can't bind to requested CPU " << cpu_index |
| << ", and CPUID faulting not available"; |
| } |
| } else if (!is_recording()) { |
| // Make sure to mark this CPU as in use in the cpu_lock. |
| (void)choose_cpu((BindCPU)cpu_index, cpu_lock); |
| } |
| } |
| } |
| |
| } // namespace rr |