Update rr prebuilts to build 10827144. am: 72266da4be am: 74924e4573 am: 77e9faec15
Original change: https://android-review.googlesource.com/c/platform/tools/rr_prebuilt/+/2755597
Change-Id: Iea5da26ecc3ea6953c7b68f1c5f8918715da5795
Signed-off-by: Automerger Merge Worker <[email protected]>
diff --git a/rr/android/x86_64/bin/rr b/rr/android/x86_64/bin/rr
new file mode 100755
index 0000000..3128823
--- /dev/null
+++ b/rr/android/x86_64/bin/rr
Binary files differ
diff --git a/rr/android/x86_64/bin/rr-collect-symbols.py b/rr/android/x86_64/bin/rr-collect-symbols.py
new file mode 100755
index 0000000..79c72a7
--- /dev/null
+++ b/rr/android/x86_64/bin/rr-collect-symbols.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+
+import errno
+import glob
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from urllib.request import urlretrieve
+from urllib.error import HTTPError, ContentTooShortError
+
+# Usage: rr-collect-symbols.py <trace-dir> [<url> | <path>]
+#
+# Given a <url>, downloads the zip/.tar.zst file at <url>, uncompresses it,
+# runs "gunzip" on any .gz files, and for any ELF files found whose build-ids
+# match the build-id of an ELF file in the trace, moves it into the trace.
+#
+# Given a <path>, which must contain a .build-id directory with the usual
+# structure (e.g. as Ubuntu and Fedora create under /usr/lib/debug), searches
+# the directory tree for any ELF files whose build-ids match the build-id of
+# an ELF file in the trace and copies them into the trace. <path> defaults to
+# "/usr/lib/debug", which will grab any available system debuginfo files
+# in Ubuntu and Fedora at least.
+#
+# This script assumes that the trace-dir has been packed via `rr pack` so all
+# relevant files actually appear in the trace-dir.
+# It also assumes rr is on the PATH.
+#
+# The debuginfo files are placed in the trace under a "debug" subdirectory,
+# in a ".build-id" subdirectory with the usual structure.
+#
+# If a debuginfo file contains a .gnu_debugaltlink section then we also
+# attempt to find the referenced file and copy it into the trace with the
+# same file name as the .debug file, but with a .sup suffix.
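+#
+# Illustrative example (the trace directory and build-id below are made up,
+# shown only to clarify the layout this script produces):
+#
+#   rr pack ~/.local/share/rr/app-0
+#   rr-collect-symbols.py ~/.local/share/rr/app-0 /usr/lib/debug
+#
+# would leave matching debuginfo in the trace as e.g.
+#
+#   ~/.local/share/rr/app-0/debug/.build-id/ab/cdef0123456789abcdef0123456789abcdef01.debug
+#   ~/.local/share/rr/app-0/debug/.build-id/ab/cdef0123456789abcdef0123456789abcdef01.sup
+#
+# (the .sup file appearing only when the .debug file had a .gnu_debugaltlink).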
+
+if len(sys.argv) < 2:
+ print("Usage: rr-collect-symbols.py <trace-dir> [<url> | <path>]", file=sys.stderr)
+ sys.exit(1)
+trace_dir = sys.argv[1]
+
+if len(sys.argv) < 3:
+ source = "/usr/lib/debug"
+else:
+ source = sys.argv[2]
+
+rr_buildid = subprocess.Popen(["rr", "buildid"],
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE)
+
+def build_id_for(file):
+ global rr_buildid
+ rr_buildid.stdin.write(("%s\n"%file).encode('utf-8'))
+ try:
+ rr_buildid.stdin.flush()
+ except BrokenPipeError:
+ print("Can't write to rr, termination code %s"%rr_buildid.returncode, file=sys.stderr)
+ sys.exit(2)
+ return rr_buildid.stdout.readline().rstrip().decode('utf-8')
+
+altref_regex = re.compile(rb"^\s+\[\s*0\]\s+(.*)")
+
+def find_altref(file):
+ proc = subprocess.Popen(["readelf", "-p", ".gnu_debugaltlink", file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
+ try:
+ for line in proc.stdout:
+ m = altref_regex.match(line)
+ if m:
+ return m.group(1).rstrip()
+ finally:
+ proc.wait()
+ return None
+
+def find_altref_for_trace_file(trace_file, altref):
+ proc = subprocess.Popen(["rr", "filename", trace_file], stdout=subprocess.PIPE)
+ try:
+ for line in proc.stdout:
+ file = line.rstrip()
+ altref_file = os.path.join(os.path.dirname(file), altref)
+ if os.path.isfile(altref_file):
+ return altref_file
+ finally:
+ proc.wait()
+ return None
+
+def mkdir_p(path):
+ try:
+ os.makedirs(path)
+ except OSError as exc:
+ if exc.errno == errno.EEXIST and os.path.isdir(path):
+ pass
+ else:
+ raise
+
+# 'dst' must be a complete file name, not a directory.
+def copy_file(src, dst):
+ try:
+ # Remove the destination file in case it's a hard link
+ # or owned by someone else.
+ os.remove(dst)
+ except:
+ pass
+ shutil.copy(src, dst)
+
+# 'dst' must be a complete file name, not a directory
+def create_link(src, dst):
+ try:
+ # Remove the destination file in case it's wrong.
+ os.remove(dst)
+ except:
+ pass
+ os.symlink(src, dst)
+
+def collect_trace_build_ids():
+ ret = {}
+ for file in glob.iglob("%s/mmap_*"%trace_dir):
+ build_id = build_id_for(file)
+ if build_id:
+ ret[build_id] = True
+ altref = find_altref(file)
+ if altref:
+ altref_file = find_altref_for_trace_file(file, altref)
+ if not altref_file:
+ print("WARNING: Can't find alt file %s for %s"%(altref, file))
+ continue
+ dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2])
+ mkdir_p(dir)
+ copy_file(altref_file, "%s/%s.sup"%(dir, build_id[2:]))
+ return ret
+
+trace_build_ids = collect_trace_build_ids()
+
+def collect_archive(url):
+ is_tar_zst = url.endswith(".tar.zst")
+ tmp_dir = tempfile.mkdtemp(dir=trace_dir)
+ if is_tar_zst:
+ tmp_file_name = "%s/archive.tar.zst"%tmp_dir
+ else:
+        # Assume it's a ZIP
+ tmp_file_name = "%s/archive.zip"%tmp_dir
+ try:
+ (file, headers) = urlretrieve(url, tmp_file_name)
+ except (HTTPError, ContentTooShortError) as exc:
+ print("Failed to load archive %s: %s"%(url, exc), file=sys.stderr)
+ sys.exit(2)
+ if is_tar_zst:
+ subprocess.check_call(["tar", "-C", tmp_dir, "-I", "zstd", "-xvf", file])
+ else:
+ subprocess.check_call(["unzip", "-d", tmp_dir, file])
+ os.remove(file)
+
+ for root, dirs, files in os.walk(tmp_dir):
+ for name in files:
+ file = os.path.join(root, name)
+ if file.endswith(".gz"):
+ subprocess.check_call(["gunzip", file])
+ file = file[:-3]
+ build_id = build_id_for(file)
+ if build_id and build_id in trace_build_ids:
+ dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2])
+ mkdir_p(dir)
+ dst = "%s/%s.debug"%(dir, build_id[2:])
+ os.rename(file, dst)
+ else:
+ os.remove(file)
+
+ shutil.rmtree(tmp_dir)
+
+def collect_filesystem(path):
+ for root, dirs, files in os.walk(path):
+ for name in files:
+ file = os.path.join(root, name)
+ if not os.path.islink(file):
+ build_id = build_id_for(file)
+ if build_id and build_id in trace_build_ids:
+ dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2])
+ mkdir_p(dir)
+ copy_file(file, "%s/%s.debug"%(dir, build_id[2:]))
+ altref = find_altref(file)
+ if altref:
+ altref = altref.decode('utf-8')
+ altref_file = os.path.join(os.path.dirname(file), altref)
+ copy_file(altref_file, "%s/%s.sup"%(dir, build_id[2:]))
+ if altref.startswith("../../../.dwz/"):
+ mkdir_p("%s/.dwz"%trace_dir)
+ src = "../debug/.build-id/%s/%s.sup"%(build_id[:2], build_id[2:])
+ create_link(src, "%s/.dwz/%s"%(trace_dir, altref[14:]))
+ elif altref.startswith("../../.dwz/"):
+ mkdir_p("%s/debug/.dwz"%trace_dir)
+ src = "../.build-id/%s/%s.sup"%(build_id[:2], build_id[2:])
+ create_link(src, "%s/debug/.dwz/%s"%(trace_dir, altref[11:]))
+ elif altref.startswith("../.dwz/"):
+ mkdir_p("%s/debug/.build-id/.dwz"%trace_dir)
+ src = "../%s/%s.sup"%(build_id[:2], build_id[2:])
+ create_link(src, "%s/debug/.build-id/.dwz/%s"%(trace_dir, altref[8:]))
+
+if re.search("^[^:/]+:", source):
+ collect_archive(source)
+else:
+ collect_filesystem(source)
+
+rr_buildid.terminate()
diff --git a/rr/android/x86_64/bin/rr_exec_stub b/rr/android/x86_64/bin/rr_exec_stub
new file mode 100755
index 0000000..5136526
--- /dev/null
+++ b/rr/android/x86_64/bin/rr_exec_stub
Binary files differ
diff --git a/rr/android/x86_64/bin/signal-rr-recording.sh b/rr/android/x86_64/bin/signal-rr-recording.sh
new file mode 100755
index 0000000..18a4cfd
--- /dev/null
+++ b/rr/android/x86_64/bin/signal-rr-recording.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+signal=$1
+if [[ "$signal" == "" ]]; then
+ echo "Usage: $0 <signal>" >&2
+ echo "Sends <signal> to all processes being recorded by rr" >&2
+ exit 1
+fi
+
+function signal_descendants { pid=$1
+ for child in `ps -o pid= --ppid $pid`; do
+ echo Sending $signal to $child
+ kill -s $signal $child
+ signal_descendants $child
+ done
+}
+
+for rr_pid in `pidof rr` ; do
+ if cat /proc/$rr_pid/cmdline | tr '\0' '\n' | head -n2 | tail -n1 | grep -qz '\(^record$\)\|/' ; then
+ signal_descendants $rr_pid
+ fi
+done
diff --git a/rr/android/x86_64/lib/rr/librrpage.so b/rr/android/x86_64/lib/rr/librrpage.so
new file mode 100644
index 0000000..1290a2d
--- /dev/null
+++ b/rr/android/x86_64/lib/rr/librrpage.so
Binary files differ
diff --git a/rr/android/x86_64/lib/rr/librrpreload.so b/rr/android/x86_64/lib/rr/librrpreload.so
new file mode 100644
index 0000000..f3e3b1b
--- /dev/null
+++ b/rr/android/x86_64/lib/rr/librrpreload.so
Binary files differ
diff --git a/rr/android/x86_64/share/bash-completion/completions/rr b/rr/android/x86_64/share/bash-completion/completions/rr
new file mode 100755
index 0000000..7325165
--- /dev/null
+++ b/rr/android/x86_64/share/bash-completion/completions/rr
@@ -0,0 +1,29 @@
+# vi:syntax=sh
+#
+# completion script for rr commands (to be sourced)
+
+_rr_subcmd_completion() {
+ local cmd=$1
+ local short_opts=$(rr help $cmd | sed -n 's/\s*-\([a-zA-Z]\),.*/-\1/p')
+ local long_opts=$(rr help $cmd | sed -n 's/.*--\([^= ]*\).*/--\1/p')
+ echo "$short_opts" "$long_opts"
+}
+
+_rr_completion() {
+ COMPREPLY=()
+ local rr_commands="$(rr --list-commands | cut -s -d ' ' -f 3)"
+
+ # completion for rr
+ if [ $COMP_CWORD -eq 1 ]; then
+ COMPREPLY=( $( compgen -W "$rr_commands" -- "${COMP_WORDS[1]}" ) )
+ return
+ fi
+
+ # completion for rr <command>'s options
+ local cmd="$(echo "${COMP_WORDS[1]}" | tr -d '[:space:]')"
+
+ if [ "$(echo $rr_commands | grep -w "$cmd")" ] ; then
+ COMPREPLY=( $( compgen -W "$(_rr_subcmd_completion "$cmd")" -- "${COMP_WORDS[COMP_CWORD]}" ) )
+ fi
+}
+complete -F _rr_completion rr
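+
+# Rough usage sketch (assumes rr is on PATH and this file has been sourced,
+# e.g. via ". .../share/bash-completion/completions/rr"): typing "rr rec<TAB>"
+# completes the subcommand from "rr --list-commands", and "rr record --<TAB>"
+# offers the long options scraped from "rr help record" by
+# _rr_subcmd_completion above.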
diff --git a/rr/android/x86_64/share/rr/32bit-avx.xml b/rr/android/x86_64/share/rr/32bit-avx.xml
new file mode 100644
index 0000000..6eb44fe
--- /dev/null
+++ b/rr/android/x86_64/share/rr/32bit-avx.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.avx">
+ <reg name="ymm0h" bitsize="128" type="uint128"/>
+ <reg name="ymm1h" bitsize="128" type="uint128"/>
+ <reg name="ymm2h" bitsize="128" type="uint128"/>
+ <reg name="ymm3h" bitsize="128" type="uint128"/>
+ <reg name="ymm4h" bitsize="128" type="uint128"/>
+ <reg name="ymm5h" bitsize="128" type="uint128"/>
+ <reg name="ymm6h" bitsize="128" type="uint128"/>
+ <reg name="ymm7h" bitsize="128" type="uint128"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-core.xml b/rr/android/x86_64/share/rr/32bit-core.xml
new file mode 100644
index 0000000..48c5890
--- /dev/null
+++ b/rr/android/x86_64/share/rr/32bit-core.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.core">
+ <flags id="i386_eflags" size="4">
+ <field name="CF" start="0" end="0"/>
+ <field name="" start="1" end="1"/>
+ <field name="PF" start="2" end="2"/>
+ <field name="AF" start="4" end="4"/>
+ <field name="ZF" start="6" end="6"/>
+ <field name="SF" start="7" end="7"/>
+ <field name="TF" start="8" end="8"/>
+ <field name="IF" start="9" end="9"/>
+ <field name="DF" start="10" end="10"/>
+ <field name="OF" start="11" end="11"/>
+ <field name="NT" start="14" end="14"/>
+ <field name="RF" start="16" end="16"/>
+ <field name="VM" start="17" end="17"/>
+ <field name="AC" start="18" end="18"/>
+ <field name="VIF" start="19" end="19"/>
+ <field name="VIP" start="20" end="20"/>
+ <field name="ID" start="21" end="21"/>
+ </flags>
+
+ <reg name="eax" bitsize="32" type="int32"/>
+ <reg name="ecx" bitsize="32" type="int32"/>
+ <reg name="edx" bitsize="32" type="int32"/>
+ <reg name="ebx" bitsize="32" type="int32"/>
+ <reg name="esp" bitsize="32" type="data_ptr"/>
+ <reg name="ebp" bitsize="32" type="data_ptr"/>
+ <reg name="esi" bitsize="32" type="int32"/>
+ <reg name="edi" bitsize="32" type="int32"/>
+
+ <reg name="eip" bitsize="32" type="code_ptr"/>
+ <reg name="eflags" bitsize="32" type="i386_eflags"/>
+ <reg name="cs" bitsize="32" type="int32"/>
+ <reg name="ss" bitsize="32" type="int32"/>
+ <reg name="ds" bitsize="32" type="int32"/>
+ <reg name="es" bitsize="32" type="int32"/>
+ <reg name="fs" bitsize="32" type="int32"/>
+ <reg name="gs" bitsize="32" type="int32"/>
+
+ <reg name="st0" bitsize="80" type="i387_ext"/>
+ <reg name="st1" bitsize="80" type="i387_ext"/>
+ <reg name="st2" bitsize="80" type="i387_ext"/>
+ <reg name="st3" bitsize="80" type="i387_ext"/>
+ <reg name="st4" bitsize="80" type="i387_ext"/>
+ <reg name="st5" bitsize="80" type="i387_ext"/>
+ <reg name="st6" bitsize="80" type="i387_ext"/>
+ <reg name="st7" bitsize="80" type="i387_ext"/>
+
+ <reg name="fctrl" bitsize="32" type="int" group="float"/>
+ <reg name="fstat" bitsize="32" type="int" group="float"/>
+ <reg name="ftag" bitsize="32" type="int" group="float"/>
+ <reg name="fiseg" bitsize="32" type="int" group="float"/>
+ <reg name="fioff" bitsize="32" type="int" group="float"/>
+ <reg name="foseg" bitsize="32" type="int" group="float"/>
+ <reg name="fooff" bitsize="32" type="int" group="float"/>
+ <reg name="fop" bitsize="32" type="int" group="float"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-linux.xml b/rr/android/x86_64/share/rr/32bit-linux.xml
new file mode 100644
index 0000000..7139db8
--- /dev/null
+++ b/rr/android/x86_64/share/rr/32bit-linux.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.linux">
+ <reg name="orig_eax" bitsize="32" type="int" regnum="41"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-pkeys.xml b/rr/android/x86_64/share/rr/32bit-pkeys.xml
new file mode 100644
index 0000000..6f6723c
--- /dev/null
+++ b/rr/android/x86_64/share/rr/32bit-pkeys.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2016-2021 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.pkeys">
+
+ <reg name="pkru" bitsize="32" type="uint32"/>
+
+</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-sse.xml b/rr/android/x86_64/share/rr/32bit-sse.xml
new file mode 100644
index 0000000..03b6421
--- /dev/null
+++ b/rr/android/x86_64/share/rr/32bit-sse.xml
@@ -0,0 +1,52 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.sse">
+ <vector id="v4f" type="ieee_single" count="4"/>
+ <vector id="v2d" type="ieee_double" count="2"/>
+ <vector id="v16i8" type="int8" count="16"/>
+ <vector id="v8i16" type="int16" count="8"/>
+ <vector id="v4i32" type="int32" count="4"/>
+ <vector id="v2i64" type="int64" count="2"/>
+ <union id="vec128">
+ <field name="v4_float" type="v4f"/>
+ <field name="v2_double" type="v2d"/>
+ <field name="v16_int8" type="v16i8"/>
+ <field name="v8_int16" type="v8i16"/>
+ <field name="v4_int32" type="v4i32"/>
+ <field name="v2_int64" type="v2i64"/>
+ <field name="uint128" type="uint128"/>
+ </union>
+ <flags id="i386_mxcsr" size="4">
+ <field name="IE" start="0" end="0"/>
+ <field name="DE" start="1" end="1"/>
+ <field name="ZE" start="2" end="2"/>
+ <field name="OE" start="3" end="3"/>
+ <field name="UE" start="4" end="4"/>
+ <field name="PE" start="5" end="5"/>
+ <field name="DAZ" start="6" end="6"/>
+ <field name="IM" start="7" end="7"/>
+ <field name="DM" start="8" end="8"/>
+ <field name="ZM" start="9" end="9"/>
+ <field name="OM" start="10" end="10"/>
+ <field name="UM" start="11" end="11"/>
+ <field name="PM" start="12" end="12"/>
+ <field name="FZ" start="15" end="15"/>
+ </flags>
+
+ <reg name="xmm0" bitsize="128" type="vec128" regnum="32"/>
+ <reg name="xmm1" bitsize="128" type="vec128"/>
+ <reg name="xmm2" bitsize="128" type="vec128"/>
+ <reg name="xmm3" bitsize="128" type="vec128"/>
+ <reg name="xmm4" bitsize="128" type="vec128"/>
+ <reg name="xmm5" bitsize="128" type="vec128"/>
+ <reg name="xmm6" bitsize="128" type="vec128"/>
+ <reg name="xmm7" bitsize="128" type="vec128"/>
+
+ <reg name="mxcsr" bitsize="32" type="i386_mxcsr" group="vector"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-avx.xml b/rr/android/x86_64/share/rr/64bit-avx.xml
new file mode 100644
index 0000000..5dfe45e
--- /dev/null
+++ b/rr/android/x86_64/share/rr/64bit-avx.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.avx">
+ <reg name="ymm0h" bitsize="128" type="uint128"/>
+ <reg name="ymm1h" bitsize="128" type="uint128"/>
+ <reg name="ymm2h" bitsize="128" type="uint128"/>
+ <reg name="ymm3h" bitsize="128" type="uint128"/>
+ <reg name="ymm4h" bitsize="128" type="uint128"/>
+ <reg name="ymm5h" bitsize="128" type="uint128"/>
+ <reg name="ymm6h" bitsize="128" type="uint128"/>
+ <reg name="ymm7h" bitsize="128" type="uint128"/>
+ <reg name="ymm8h" bitsize="128" type="uint128"/>
+ <reg name="ymm9h" bitsize="128" type="uint128"/>
+ <reg name="ymm10h" bitsize="128" type="uint128"/>
+ <reg name="ymm11h" bitsize="128" type="uint128"/>
+ <reg name="ymm12h" bitsize="128" type="uint128"/>
+ <reg name="ymm13h" bitsize="128" type="uint128"/>
+ <reg name="ymm14h" bitsize="128" type="uint128"/>
+ <reg name="ymm15h" bitsize="128" type="uint128"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-core.xml b/rr/android/x86_64/share/rr/64bit-core.xml
new file mode 100644
index 0000000..7cd0673
--- /dev/null
+++ b/rr/android/x86_64/share/rr/64bit-core.xml
@@ -0,0 +1,73 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.core">
+ <flags id="i386_eflags" size="4">
+ <field name="CF" start="0" end="0"/>
+ <field name="" start="1" end="1"/>
+ <field name="PF" start="2" end="2"/>
+ <field name="AF" start="4" end="4"/>
+ <field name="ZF" start="6" end="6"/>
+ <field name="SF" start="7" end="7"/>
+ <field name="TF" start="8" end="8"/>
+ <field name="IF" start="9" end="9"/>
+ <field name="DF" start="10" end="10"/>
+ <field name="OF" start="11" end="11"/>
+ <field name="NT" start="14" end="14"/>
+ <field name="RF" start="16" end="16"/>
+ <field name="VM" start="17" end="17"/>
+ <field name="AC" start="18" end="18"/>
+ <field name="VIF" start="19" end="19"/>
+ <field name="VIP" start="20" end="20"/>
+ <field name="ID" start="21" end="21"/>
+ </flags>
+
+ <reg name="rax" bitsize="64" type="int64"/>
+ <reg name="rbx" bitsize="64" type="int64"/>
+ <reg name="rcx" bitsize="64" type="int64"/>
+ <reg name="rdx" bitsize="64" type="int64"/>
+ <reg name="rsi" bitsize="64" type="int64"/>
+ <reg name="rdi" bitsize="64" type="int64"/>
+ <reg name="rbp" bitsize="64" type="data_ptr"/>
+ <reg name="rsp" bitsize="64" type="data_ptr"/>
+ <reg name="r8" bitsize="64" type="int64"/>
+ <reg name="r9" bitsize="64" type="int64"/>
+ <reg name="r10" bitsize="64" type="int64"/>
+ <reg name="r11" bitsize="64" type="int64"/>
+ <reg name="r12" bitsize="64" type="int64"/>
+ <reg name="r13" bitsize="64" type="int64"/>
+ <reg name="r14" bitsize="64" type="int64"/>
+ <reg name="r15" bitsize="64" type="int64"/>
+
+ <reg name="rip" bitsize="64" type="code_ptr"/>
+ <reg name="eflags" bitsize="32" type="i386_eflags"/>
+ <reg name="cs" bitsize="32" type="int32"/>
+ <reg name="ss" bitsize="32" type="int32"/>
+ <reg name="ds" bitsize="32" type="int32"/>
+ <reg name="es" bitsize="32" type="int32"/>
+ <reg name="fs" bitsize="32" type="int32"/>
+ <reg name="gs" bitsize="32" type="int32"/>
+
+ <reg name="st0" bitsize="80" type="i387_ext"/>
+ <reg name="st1" bitsize="80" type="i387_ext"/>
+ <reg name="st2" bitsize="80" type="i387_ext"/>
+ <reg name="st3" bitsize="80" type="i387_ext"/>
+ <reg name="st4" bitsize="80" type="i387_ext"/>
+ <reg name="st5" bitsize="80" type="i387_ext"/>
+ <reg name="st6" bitsize="80" type="i387_ext"/>
+ <reg name="st7" bitsize="80" type="i387_ext"/>
+
+ <reg name="fctrl" bitsize="32" type="int" group="float"/>
+ <reg name="fstat" bitsize="32" type="int" group="float"/>
+ <reg name="ftag" bitsize="32" type="int" group="float"/>
+ <reg name="fiseg" bitsize="32" type="int" group="float"/>
+ <reg name="fioff" bitsize="32" type="int" group="float"/>
+ <reg name="foseg" bitsize="32" type="int" group="float"/>
+ <reg name="fooff" bitsize="32" type="int" group="float"/>
+ <reg name="fop" bitsize="32" type="int" group="float"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-linux.xml b/rr/android/x86_64/share/rr/64bit-linux.xml
new file mode 100644
index 0000000..b4229d0
--- /dev/null
+++ b/rr/android/x86_64/share/rr/64bit-linux.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.linux">
+ <reg name="orig_rax" bitsize="64" type="int" regnum="57"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-pkeys.xml b/rr/android/x86_64/share/rr/64bit-pkeys.xml
new file mode 100644
index 0000000..6f6723c
--- /dev/null
+++ b/rr/android/x86_64/share/rr/64bit-pkeys.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2016-2021 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.pkeys">
+
+ <reg name="pkru" bitsize="32" type="uint32"/>
+
+</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-seg.xml b/rr/android/x86_64/share/rr/64bit-seg.xml
new file mode 100644
index 0000000..1fa6c9e
--- /dev/null
+++ b/rr/android/x86_64/share/rr/64bit-seg.xml
@@ -0,0 +1,5 @@
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.segments">
+ <reg name="fs_base" bitsize="64" type="data_ptr"/>
+ <reg name="gs_base" bitsize="64" type="data_ptr"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-sse.xml b/rr/android/x86_64/share/rr/64bit-sse.xml
new file mode 100644
index 0000000..eec4b79
--- /dev/null
+++ b/rr/android/x86_64/share/rr/64bit-sse.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.i386.sse">
+ <vector id="v4f" type="ieee_single" count="4"/>
+ <vector id="v2d" type="ieee_double" count="2"/>
+ <vector id="v16i8" type="int8" count="16"/>
+ <vector id="v8i16" type="int16" count="8"/>
+ <vector id="v4i32" type="int32" count="4"/>
+ <vector id="v2i64" type="int64" count="2"/>
+ <union id="vec128">
+ <field name="v4_float" type="v4f"/>
+ <field name="v2_double" type="v2d"/>
+ <field name="v16_int8" type="v16i8"/>
+ <field name="v8_int16" type="v8i16"/>
+ <field name="v4_int32" type="v4i32"/>
+ <field name="v2_int64" type="v2i64"/>
+ <field name="uint128" type="uint128"/>
+ </union>
+ <flags id="i386_mxcsr" size="4">
+ <field name="IE" start="0" end="0"/>
+ <field name="DE" start="1" end="1"/>
+ <field name="ZE" start="2" end="2"/>
+ <field name="OE" start="3" end="3"/>
+ <field name="UE" start="4" end="4"/>
+ <field name="PE" start="5" end="5"/>
+ <field name="DAZ" start="6" end="6"/>
+ <field name="IM" start="7" end="7"/>
+ <field name="DM" start="8" end="8"/>
+ <field name="ZM" start="9" end="9"/>
+ <field name="OM" start="10" end="10"/>
+ <field name="UM" start="11" end="11"/>
+ <field name="PM" start="12" end="12"/>
+ <field name="FZ" start="15" end="15"/>
+ </flags>
+
+ <reg name="xmm0" bitsize="128" type="vec128" regnum="40"/>
+ <reg name="xmm1" bitsize="128" type="vec128"/>
+ <reg name="xmm2" bitsize="128" type="vec128"/>
+ <reg name="xmm3" bitsize="128" type="vec128"/>
+ <reg name="xmm4" bitsize="128" type="vec128"/>
+ <reg name="xmm5" bitsize="128" type="vec128"/>
+ <reg name="xmm6" bitsize="128" type="vec128"/>
+ <reg name="xmm7" bitsize="128" type="vec128"/>
+ <reg name="xmm8" bitsize="128" type="vec128"/>
+ <reg name="xmm9" bitsize="128" type="vec128"/>
+ <reg name="xmm10" bitsize="128" type="vec128"/>
+ <reg name="xmm11" bitsize="128" type="vec128"/>
+ <reg name="xmm12" bitsize="128" type="vec128"/>
+ <reg name="xmm13" bitsize="128" type="vec128"/>
+ <reg name="xmm14" bitsize="128" type="vec128"/>
+ <reg name="xmm15" bitsize="128" type="vec128"/>
+
+ <reg name="mxcsr" bitsize="32" type="i386_mxcsr" group="vector"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/aarch64-core.xml b/rr/android/x86_64/share/rr/aarch64-core.xml
new file mode 100644
index 0000000..ee6a3a6
--- /dev/null
+++ b/rr/android/x86_64/share/rr/aarch64-core.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2009-2020 Free Software Foundation, Inc.
+ Contributed by ARM Ltd.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.aarch64.core">
+ <reg name="x0" bitsize="64"/>
+ <reg name="x1" bitsize="64"/>
+ <reg name="x2" bitsize="64"/>
+ <reg name="x3" bitsize="64"/>
+ <reg name="x4" bitsize="64"/>
+ <reg name="x5" bitsize="64"/>
+ <reg name="x6" bitsize="64"/>
+ <reg name="x7" bitsize="64"/>
+ <reg name="x8" bitsize="64"/>
+ <reg name="x9" bitsize="64"/>
+ <reg name="x10" bitsize="64"/>
+ <reg name="x11" bitsize="64"/>
+ <reg name="x12" bitsize="64"/>
+ <reg name="x13" bitsize="64"/>
+ <reg name="x14" bitsize="64"/>
+ <reg name="x15" bitsize="64"/>
+ <reg name="x16" bitsize="64"/>
+ <reg name="x17" bitsize="64"/>
+ <reg name="x18" bitsize="64"/>
+ <reg name="x19" bitsize="64"/>
+ <reg name="x20" bitsize="64"/>
+ <reg name="x21" bitsize="64"/>
+ <reg name="x22" bitsize="64"/>
+ <reg name="x23" bitsize="64"/>
+ <reg name="x24" bitsize="64"/>
+ <reg name="x25" bitsize="64"/>
+ <reg name="x26" bitsize="64"/>
+ <reg name="x27" bitsize="64"/>
+ <reg name="x28" bitsize="64"/>
+ <reg name="x29" bitsize="64"/>
+ <reg name="x30" bitsize="64"/>
+ <reg name="sp" bitsize="64" type="data_ptr"/>
+
+ <reg name="pc" bitsize="64" type="code_ptr"/>
+
+ <flags id="cpsr_flags" size="4">
+ <!-- Stack Pointer. -->
+ <field name="SP" start="0" end="0"/>
+
+ <!-- Exception Level. -->
+ <field name="EL" start="2" end="3"/>
+ <!-- Execution state. -->
+ <field name="nRW" start="4" end="4"/>
+
+ <!-- FIQ interrupt mask. -->
+ <field name="F" start="6" end="6"/>
+ <!-- IRQ interrupt mask. -->
+ <field name="I" start="7" end="7"/>
+ <!-- SError interrupt mask. -->
+ <field name="A" start="8" end="8"/>
+ <!-- Debug exception mask. -->
+ <field name="D" start="9" end="9"/>
+
+ <!-- ARMv8.0-A: Speculative Store Bypass. -->
+ <field name="SSBS" start="12" end="12"/>
+
+ <!-- Illegal Execution state. -->
+ <field name="IL" start="20" end="20"/>
+ <!-- Software Step. -->
+ <field name="SS" start="21" end="21"/>
+ <!-- ARMv8.1-A: Privileged Access Never. -->
+ <field name="PAN" start="22" end="22"/>
+ <!-- ARMv8.2-A: User Access Override. -->
+ <field name="UAO" start="23" end="23"/>
+ <!-- ARMv8.4-A: Data Independent Timing. -->
+ <field name="DIT" start="24" end="24"/>
+ <!-- ARMv8.5-A: Tag Check Override. -->
+ <field name="TCO" start="25" end="25"/>
+
+ <!-- Overflow Condition flag. -->
+ <field name="V" start="28" end="28"/>
+ <!-- Carry Condition flag. -->
+ <field name="C" start="29" end="29"/>
+ <!-- Zero Condition flag. -->
+ <field name="Z" start="30" end="30"/>
+ <!-- Negative Condition flag. -->
+ <field name="N" start="31" end="31"/>
+ </flags>
+ <reg name="cpsr" bitsize="32" type="cpsr_flags"/>
+
+</feature>
diff --git a/rr/android/x86_64/share/rr/aarch64-fpu.xml b/rr/android/x86_64/share/rr/aarch64-fpu.xml
new file mode 100644
index 0000000..eae763c
--- /dev/null
+++ b/rr/android/x86_64/share/rr/aarch64-fpu.xml
@@ -0,0 +1,88 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2009-2020 Free Software Foundation, Inc.
+ Contributed by ARM Ltd.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.aarch64.fpu">
+ <vector id="v2d" type="ieee_double" count="2"/>
+ <vector id="v2u" type="uint64" count="2"/>
+ <vector id="v2i" type="int64" count="2"/>
+ <vector id="v4f" type="ieee_single" count="4"/>
+ <vector id="v4u" type="uint32" count="4"/>
+ <vector id="v4i" type="int32" count="4"/>
+ <vector id="v8f" type="ieee_half" count="8"/>
+ <vector id="v8u" type="uint16" count="8"/>
+ <vector id="v8i" type="int16" count="8"/>
+ <vector id="v16u" type="uint8" count="16"/>
+ <vector id="v16i" type="int8" count="16"/>
+ <vector id="v1u" type="uint128" count="1"/>
+ <vector id="v1i" type="int128" count="1"/>
+ <union id="vnd">
+ <field name="f" type="v2d"/>
+ <field name="u" type="v2u"/>
+ <field name="s" type="v2i"/>
+ </union>
+ <union id="vns">
+ <field name="f" type="v4f"/>
+ <field name="u" type="v4u"/>
+ <field name="s" type="v4i"/>
+ </union>
+ <union id="vnh">
+ <field name="f" type="v8f"/>
+ <field name="u" type="v8u"/>
+ <field name="s" type="v8i"/>
+ </union>
+ <union id="vnb">
+ <field name="u" type="v16u"/>
+ <field name="s" type="v16i"/>
+ </union>
+ <union id="vnq">
+ <field name="u" type="v1u"/>
+ <field name="s" type="v1i"/>
+ </union>
+ <union id="aarch64v">
+ <field name="d" type="vnd"/>
+ <field name="s" type="vns"/>
+ <field name="h" type="vnh"/>
+ <field name="b" type="vnb"/>
+ <field name="q" type="vnq"/>
+ </union>
+ <reg name="v0" bitsize="128" type="aarch64v" regnum="34"/>
+ <reg name="v1" bitsize="128" type="aarch64v" />
+ <reg name="v2" bitsize="128" type="aarch64v" />
+ <reg name="v3" bitsize="128" type="aarch64v" />
+ <reg name="v4" bitsize="128" type="aarch64v" />
+ <reg name="v5" bitsize="128" type="aarch64v" />
+ <reg name="v6" bitsize="128" type="aarch64v" />
+ <reg name="v7" bitsize="128" type="aarch64v" />
+ <reg name="v8" bitsize="128" type="aarch64v" />
+ <reg name="v9" bitsize="128" type="aarch64v" />
+ <reg name="v10" bitsize="128" type="aarch64v"/>
+ <reg name="v11" bitsize="128" type="aarch64v"/>
+ <reg name="v12" bitsize="128" type="aarch64v"/>
+ <reg name="v13" bitsize="128" type="aarch64v"/>
+ <reg name="v14" bitsize="128" type="aarch64v"/>
+ <reg name="v15" bitsize="128" type="aarch64v"/>
+ <reg name="v16" bitsize="128" type="aarch64v"/>
+ <reg name="v17" bitsize="128" type="aarch64v"/>
+ <reg name="v18" bitsize="128" type="aarch64v"/>
+ <reg name="v19" bitsize="128" type="aarch64v"/>
+ <reg name="v20" bitsize="128" type="aarch64v"/>
+ <reg name="v21" bitsize="128" type="aarch64v"/>
+ <reg name="v22" bitsize="128" type="aarch64v"/>
+ <reg name="v23" bitsize="128" type="aarch64v"/>
+ <reg name="v24" bitsize="128" type="aarch64v"/>
+ <reg name="v25" bitsize="128" type="aarch64v"/>
+ <reg name="v26" bitsize="128" type="aarch64v"/>
+ <reg name="v27" bitsize="128" type="aarch64v"/>
+ <reg name="v28" bitsize="128" type="aarch64v"/>
+ <reg name="v29" bitsize="128" type="aarch64v"/>
+ <reg name="v30" bitsize="128" type="aarch64v"/>
+ <reg name="v31" bitsize="128" type="aarch64v"/>
+ <reg name="fpsr" bitsize="32"/>
+ <reg name="fpcr" bitsize="32"/>
+</feature>
diff --git a/rr/android/x86_64/share/rr/aarch64-pauth.xml b/rr/android/x86_64/share/rr/aarch64-pauth.xml
new file mode 100644
index 0000000..2ce14b4
--- /dev/null
+++ b/rr/android/x86_64/share/rr/aarch64-pauth.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2018-2020 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!DOCTYPE feature SYSTEM "gdb-target.dtd">
+<feature name="org.gnu.gdb.aarch64.pauth">
+ <reg name="pauth_dmask" bitsize="64"/>
+ <reg name="pauth_cmask" bitsize="64"/>
+</feature>
+
diff --git a/rr/android/x86_64/share/rr/amd64-avx-linux.xml b/rr/android/x86_64/share/rr/amd64-avx-linux.xml
new file mode 100644
index 0000000..d2dc3bc
--- /dev/null
+++ b/rr/android/x86_64/share/rr/amd64-avx-linux.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!-- AMD64 with AVX - Includes Linux-only special "register". -->
+
+<!DOCTYPE target SYSTEM "gdb-target.dtd">
+<target>
+ <architecture>i386:x86-64</architecture>
+ <osabi>GNU/Linux</osabi>
+ <xi:include href="64bit-core.xml"/>
+ <xi:include href="64bit-sse.xml"/>
+ <xi:include href="64bit-linux.xml"/>
+ <xi:include href="64bit-seg.xml"/>
+ <xi:include href="64bit-avx.xml"/>
+</target>
diff --git a/rr/android/x86_64/share/rr/amd64-linux.xml b/rr/android/x86_64/share/rr/amd64-linux.xml
new file mode 100644
index 0000000..aad02a3
--- /dev/null
+++ b/rr/android/x86_64/share/rr/amd64-linux.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!-- AMD64 - Includes Linux-only special "register". -->
+
+<!DOCTYPE target SYSTEM "gdb-target.dtd">
+<target>
+ <architecture>i386:x86-64</architecture>
+ <osabi>GNU/Linux</osabi>
+ <xi:include href="64bit-core.xml"/>
+ <xi:include href="64bit-sse.xml"/>
+ <xi:include href="64bit-linux.xml"/>
+ <xi:include href="64bit-seg.xml"/>
+ <xi:include href="64bit-pkeys.xml"/>
+</target>
diff --git a/rr/android/x86_64/share/rr/amd64-pkeys-linux.xml b/rr/android/x86_64/share/rr/amd64-pkeys-linux.xml
new file mode 100644
index 0000000..1fa5bde
--- /dev/null
+++ b/rr/android/x86_64/share/rr/amd64-pkeys-linux.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!-- AMD64 with AVX and PKEYS - Includes Linux-only special "register". -->
+
+<!DOCTYPE target SYSTEM "gdb-target.dtd">
+<target>
+ <architecture>i386:x86-64</architecture>
+ <osabi>GNU/Linux</osabi>
+ <xi:include href="64bit-core.xml"/>
+ <xi:include href="64bit-sse.xml"/>
+ <xi:include href="64bit-linux.xml"/>
+ <xi:include href="64bit-seg.xml"/>
+ <xi:include href="64bit-avx.xml"/>
+ <xi:include href="64bit-pkeys.xml"/>
+</target>
diff --git a/rr/android/x86_64/share/rr/i386-avx-linux.xml b/rr/android/x86_64/share/rr/i386-avx-linux.xml
new file mode 100644
index 0000000..c957fab
--- /dev/null
+++ b/rr/android/x86_64/share/rr/i386-avx-linux.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!-- I386 with AVX - Includes Linux-only special "register". -->
+
+<!DOCTYPE target SYSTEM "gdb-target.dtd">
+<target>
+ <architecture>i386</architecture>
+ <osabi>GNU/Linux</osabi>
+ <xi:include href="32bit-core.xml"/>
+ <xi:include href="32bit-sse.xml"/>
+ <xi:include href="32bit-linux.xml"/>
+ <xi:include href="32bit-avx.xml"/>
+</target>
diff --git a/rr/android/x86_64/share/rr/i386-linux.xml b/rr/android/x86_64/share/rr/i386-linux.xml
new file mode 100644
index 0000000..625984e
--- /dev/null
+++ b/rr/android/x86_64/share/rr/i386-linux.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!-- I386 with SSE - Includes Linux-only special "register". -->
+
+<!DOCTYPE target SYSTEM "gdb-target.dtd">
+<target>
+ <architecture>i386</architecture>
+ <osabi>GNU/Linux</osabi>
+ <xi:include href="32bit-core.xml"/>
+ <xi:include href="32bit-linux.xml"/>
+ <xi:include href="32bit-sse.xml"/>
+ <xi:include href="32bit-pkeys.xml"/>
+</target>
diff --git a/rr/android/x86_64/share/rr/i386-pkeys-linux.xml b/rr/android/x86_64/share/rr/i386-pkeys-linux.xml
new file mode 100644
index 0000000..47f7b2f
--- /dev/null
+++ b/rr/android/x86_64/share/rr/i386-pkeys-linux.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc.
+
+ Copying and distribution of this file, with or without modification,
+ are permitted in any medium without royalty provided the copyright
+ notice and this notice are preserved. -->
+
+<!-- I386 with AVX and PKEYS - Includes Linux-only special "register". -->
+
+<!DOCTYPE target SYSTEM "gdb-target.dtd">
+<target>
+ <architecture>i386</architecture>
+ <osabi>GNU/Linux</osabi>
+ <xi:include href="32bit-core.xml"/>
+ <xi:include href="32bit-sse.xml"/>
+ <xi:include href="32bit-linux.xml"/>
+ <xi:include href="32bit-avx.xml"/>
+ <xi:include href="32bit-pkeys.xml"/>
+</target>
diff --git a/rr/android/x86_64/share/rr/src/preload/overrides.c b/rr/android/x86_64/share/rr/src/preload/overrides.c
new file mode 100644
index 0000000..2f572b3
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/overrides.c
@@ -0,0 +1,334 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#define RR_IMPLEMENT_PRELOAD
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+
+#include <dlfcn.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "preload_interface.h"
+#include "syscallbuf.h"
+
+#define PTHREAD_MUTEX_PRIO_INHERIT_NP 32
+
+#define DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE 1
+#ifdef __GLIBC_PREREQ
+#if __GLIBC_PREREQ(2, 34)
+#undef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+#endif
+#endif
+
+#ifndef __BIONIC__
+
+// Use an old version of dlsym so this code still works when built against glibc > 2.34
+// but loaded into a process linking a pre-2.34 glibc.
+#ifdef __x86_64__
+__asm__(".symver dlsym,dlsym@GLIBC_2.2.5");
+#elif defined(__i386__)
+__asm__(".symver dlsym,dlsym@GLIBC_2.0");
+#endif
+
+static int (*real_pthread_mutex_init)(void* mutex, const void* attr);
+static int (*real_pthread_mutex_lock)(void* mutex);
+static int (*real_pthread_mutex_trylock)(void* mutex);
+static int (*real_pthread_mutex_timedlock)(void* mutex,
+ const struct timespec* abstime);
+static int (*real_pthread_mutexattr_setprotocol)(void* attr, int protocol);
+
+static void __attribute__((constructor)) init_override(void) {
+ real_pthread_mutex_init = dlsym(RTLD_NEXT, "pthread_mutex_init");
+ real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
+ real_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock");
+ real_pthread_mutex_timedlock = dlsym(RTLD_NEXT, "pthread_mutex_timedlock");
+ real_pthread_mutexattr_setprotocol = dlsym(RTLD_NEXT, "pthread_mutexattr_setprotocol");
+}
+
+static void fix_mutex_kind(pthread_mutex_t* mutex) {
+ /* Disable priority inheritance. */
+ mutex->__data.__kind &= ~PTHREAD_MUTEX_PRIO_INHERIT_NP;
+}
+
+#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+/*
+ * We need to be able to call directly into __pthread_mutex_lock and
+ * __pthread_mutex_trylock because setting up our indirect function pointers
+ * calls dlsym which itself can call pthread_mutex_lock (e.g. via application
+ * code overriding malloc/calloc to use a pthreads-based implementation).
+ * So before our pointers are set up, call these.
+ *
+ * If we're building against glibc 2.34 *but* we get run against a binary
+ * linking with glibc < 2.34 *and* the application overrides malloc to use
+ * pthreads-based synchronization then this won't work and we lose. Let's
+ * hope this doesn't happen.
+ */
+extern int __pthread_mutex_init(pthread_mutex_t* mutex,
+ const pthread_mutexattr_t* attr);
+extern int __pthread_mutex_lock(pthread_mutex_t* mutex);
+extern int __pthread_mutex_trylock(pthread_mutex_t* mutex);
+#endif
+
+int pthread_mutex_init(pthread_mutex_t* mutex,
+ const pthread_mutexattr_t* attr) {
+ int ret;
+ pthread_mutexattr_t realattr;
+
+ if (attr) {
+ /* We wish to enforce the use of plain (no PI) mutex to avoid
+ * needing to handle PI futex() operations.
+ * We also wish to ensure that pthread_mutexattr_getprotocol()
+ * still returns the requested protocol.
+ * So we copy the attribute and force PTHREAD_PRIO_NONE.
+ */
+ memcpy(&realattr, attr, sizeof(realattr));
+ // We assume dlsym doesn't call pthread_mutex_init with attributes.
+ // We avoid calling pthread_mutexattr_setprotocol (and any other pthread functions)
+ // directly because that won't work when we're built against glibc 2.34 but loaded
+ // into a process using glibc < 2.34. (pthread functions got a symbol version bump
+ // in 2.34.)
+ //
+ // But note that we can't use dlsym in cases where we would want to use the double
+ // underscore methods (i.e. glibc < 2.34). There is no double underscore version of
+ // pthread_mutexattr_setprotocol, so we call it directly.
+ if (!real_pthread_mutexattr_setprotocol) {
+#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+ ret = pthread_mutexattr_setprotocol(&realattr, PTHREAD_PRIO_NONE);
+ goto setprotocol;
+#else
+ real_pthread_mutexattr_setprotocol = dlsym(RTLD_NEXT, "pthread_mutexattr_setprotocol");
+#endif
+ }
+ ret = real_pthread_mutexattr_setprotocol(&realattr, PTHREAD_PRIO_NONE);
+#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+setprotocol:
+#endif
+ if (ret) {
+ return ret;
+ }
+ attr = &realattr;
+ }
+ if (!real_pthread_mutex_init) {
+#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+ return __pthread_mutex_init(mutex, attr);
+#else
+ real_pthread_mutex_init = dlsym(RTLD_NEXT, "pthread_mutex_init");
+#endif
+ }
+ return real_pthread_mutex_init(mutex, attr);
+}
+
+/* Prevent use of lock elision; Haswell's TSX/RTM features used by
+ lock elision increment the rbc perf counter for instructions which
+ are later rolled back if the transaction fails. */
+int pthread_mutex_lock(pthread_mutex_t* mutex) {
+ fix_mutex_kind(mutex);
+ if (!real_pthread_mutex_lock) {
+#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+ return __pthread_mutex_lock(mutex);
+#else
+ real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
+#endif
+ }
+ return real_pthread_mutex_lock(mutex);
+}
+
+int pthread_mutex_timedlock(pthread_mutex_t* mutex,
+ const struct timespec* abstime) {
+ fix_mutex_kind(mutex);
+ /* No __pthread_mutex_timedlock stub exists, so we have to use the
+ * indirect call no matter what.
+ */
+ if (!real_pthread_mutex_timedlock) {
+ real_pthread_mutex_timedlock = dlsym(RTLD_NEXT, "pthread_mutex_timedlock");
+ }
+ return real_pthread_mutex_timedlock(mutex, abstime);
+}
+
+int pthread_mutex_trylock(pthread_mutex_t* mutex) {
+ fix_mutex_kind(mutex);
+ if (!real_pthread_mutex_trylock) {
+#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE
+ return __pthread_mutex_trylock(mutex);
+#else
+ real_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock");
+#endif
+ }
+ return real_pthread_mutex_trylock(mutex);
+}
+
+#endif
+
+typedef void* Dlopen(const char* filename, int flags);
+
+void* dlopen(const char* filename, int flags) {
+ // Give up our timeslice now. This gives us a full timeslice to
+ // execute the dlopen(), reducing the chance we'll hit
+ // https://sourceware.org/bugzilla/show_bug.cgi?id=19329.
+ Dlopen* f_ptr = (Dlopen*)dlsym(RTLD_NEXT, "dlopen");
+ sched_yield();
+ return f_ptr(filename, flags);
+}
+
+/** Disable XShm since rr doesn't work with it */
+int XShmQueryExtension(__attribute__((unused)) void* dpy) { return 0; }
+
+/** Make sure XShmCreateImage returns null in case an application doesn't do
+ extension checks first. */
+void* XShmCreateImage(__attribute__((unused)) register void* dpy,
+ __attribute__((unused)) register void* visual,
+ __attribute__((unused)) unsigned int depth,
+ __attribute__((unused)) int format,
+ __attribute__((unused)) char* data,
+ __attribute__((unused)) void* shminfo,
+ __attribute__((unused)) unsigned int width,
+ __attribute__((unused)) unsigned int height) {
+ return 0;
+}
+
+RR_HIDDEN char impose_syscall_delay;
+RR_HIDDEN char impose_spurious_desched;
+
+/**
+ * This is for testing purposes only.
+ */
+void delayed_syscall(struct syscall_info* info) {
+ impose_syscall_delay = 1;
+ /* Make sure 'result' is used so it's not optimized out! */
+ syscall(info->no, info->args[0], info->args[1], info->args[2], info->args[3],
+ info->args[4], info->args[5]);
+ impose_syscall_delay = 0;
+}
+
+/**
+ * This is for testing purposes only.
+ * Note that this must be defined outside of the syscallbuf code.
+ * Otherwise, the signal recording code may expect exit from this function
+ * to trigger the syscallbuf exit breakpoint.
+ */
+void* syscallbuf_ptr(void) {
+ return ((struct preload_thread_locals*)PRELOAD_THREAD_LOCALS_ADDR)->buffer;
+}
+
+/**
+ * This is for testing purposes only.
+ */
+void spurious_desched_syscall(struct syscall_info* info) {
+ impose_spurious_desched = 1;
+ /* Make sure 'result' is used so it's not optimized out! */
+ syscall(info->no, info->args[0], info->args[1], info->args[2], info->args[3],
+ info->args[4], info->args[5]);
+ impose_spurious_desched = 0;
+}
+
+/**
+ * clang's LeakSanitizer has regular threads call sched_yield() in a loop while
+ * a helper thread ptrace-attaches to them. If we let sched_yield() enter the
+ * syscallbuf, the helper thread sees that the regular thread SP register
+ * is pointing to the syscallbuf alt-stack, outside the stack region it
+ * expects, which causes it to freak out.
+ * So, override sched_yield() to perform the syscall in a way that can't
+ * be syscall-buffered.
+ */
+int sched_yield(void) {
+#ifdef __i386__
+  // We have no syscall hook for `int $0x80` followed by `inc %ecx`
+ int trash;
+ asm volatile ("int $0x80; inc %0" : "=c"(trash) : "a"(SYS_sched_yield));
+#elif defined(__x86_64__)
+ // We have no syscall hook for `syscall` followed by `inc %ecx`
+ int trash;
+ asm volatile ("syscall; inc %0" : "=c"(trash) : "a"(SYS_sched_yield));
+#elif defined(__aarch64__)
+ register long x8 __asm__("x8") = SYS_sched_yield;
+  // We explicitly blacklisted the syscall that follows `mov x8, 0xdc`
+ // to avoid patching clone. Abuse that to prevent this from being patched.
+ __asm__ __volatile__("b 1f\n\t"
+ "mov x8, 0xdc\n"
+ "1:\n\t"
+ "svc 0\n"
+ :: "r"(x8) : "x0", "x30"); // x30 = lr
+#else
+#error "Unknown architecture"
+#endif
+ return 0;
+}
+
+#ifndef __aarch64__
+
+/**
+ * glibc geteuid() can be compiled to instructions ending in "syscall; ret"
+ * which sometimes can't be hooked. So override it here with something that
+ * can be hooked.
+ * This is not an issue on aarch64 since we only need to patch a single instruction.
+ */
+uid_t geteuid(void) {
+#ifdef __i386__
+ return syscall(SYS_geteuid32);
+#else
+ return syscall(SYS_geteuid);
+#endif
+}
+
+static void libstdcpp_not_found(void) {
+ const char msg[] = "[rr] Interposition for libstdc++ called but symbol lookups into libstdc++ failed.\n"
+ "Was libstdc++ loaded with RTLD_LOCAL? Try recording with `-v LD_PRELOAD=libstdc++.so.6`.\n"
+ "About to crash! ";
+ syscall(SYS_write, STDERR_FILENO, msg, sizeof(msg));
+}
+
+/**
+ * libstdc++3 uses RDRAND. Bypass that with this incredible hack.
+ */
+void _ZNSt13random_device7_M_initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE(
+ void* this, __attribute__((unused)) void* token) {
+ static void (*assign_string)(void *, char*) = NULL;
+ static void (*random_init)(void *, void*) = NULL;
+ if (!assign_string) {
+ assign_string = (void (*)(void *, char*))dlsym(RTLD_NEXT,
+ "_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE6assignEPKc");
+ if (!assign_string) {
+ libstdcpp_not_found();
+ }
+ }
+ assign_string(token, "/dev/urandom");
+ if (!random_init) {
+ random_init = (void (*)(void *, void*))dlsym(RTLD_NEXT, __func__);
+ if (!random_init) {
+ libstdcpp_not_found();
+ }
+ }
+ random_init(this, token);
+}
+
+/**
+ * gcc 4.8.4 in Ubuntu 14.04-32
+ */
+void _ZNSt13random_device7_M_initERKSs(void* this,
+ __attribute__((unused)) void* token) {
+ static void (*assign_string)(void *, char*) = NULL;
+ static void (*random_init)(void *, void*) = NULL;
+ if (!assign_string) {
+ assign_string = (void (*)(void *, char*))dlsym(RTLD_NEXT,
+ "_ZNSs6assignEPKc");
+ if (!assign_string) {
+ libstdcpp_not_found();
+ }
+ }
+ assign_string(token, "/dev/urandom");
+ if (!random_init) {
+ random_init = (void (*)(void *, void*))dlsym(RTLD_NEXT, __func__);
+ if (!random_init) {
+ libstdcpp_not_found();
+ }
+ }
+ random_init(this, token);
+}
+
+#endif
diff --git a/rr/android/x86_64/share/rr/src/preload/preload_interface.h b/rr/android/x86_64/share/rr/src/preload/preload_interface.h
new file mode 100644
index 0000000..5266498
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/preload_interface.h
@@ -0,0 +1,750 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#ifndef RR_PRELOAD_INTERFACE_H_
+#define RR_PRELOAD_INTERFACE_H_
+
+/* Bump this whenever the interface between syscallbuf and rr changes in a way
+ * that would require changes to replay. So be very careful making changes to
+ * this file! Many changes would require a bump in this value, and support
+ * code in rr to handle old protocol versions. And when we bump it we'll need
+ * to figure out a way to test the old protocol versions.
+ * To be clear, changes that only affect recording and not replay, such as
+ * changes to the layout of syscall_patch_hook, do not need to bump this.
+ * Note also that SYSCALLBUF_PROTOCOL_VERSION is stored in the trace header, so
+ * replay always has access to the SYSCALLBUF_PROTOCOL_VERSION used during
+ * recording, even before the preload library is ever loaded.
+ *
+ * Version 0: initial rr 5.0.0 release
+ */
+#define SYSCALLBUF_PROTOCOL_VERSION 0
+
+#if defined(RR_IMPLEMENT_PRELOAD) || defined(RR_IMPLEMENT_AUDIT)
+/* Avoid using <string.h> library functions */
+static inline int streq(const char* s1, const char* s2) {
+ while (1) {
+ if (*s1 != *s2) {
+ return 0;
+ }
+ if (!*s1) {
+ return 1;
+ }
+ ++s1;
+ ++s2;
+ }
+ return 1;
+}
+static inline size_t rrstrlen(const char* s) {
+ size_t ret = 0;
+ while (*s) {
+ ++s;
+ ++ret;
+ }
+ return ret;
+}
+#else
+#include <string.h>
+static inline int streq(const char* s1, const char* s2) {
+ return !strcmp(s1, s2);
+}
+static inline size_t rrstrlen(const char* s) { return strlen(s); }
+#include "../remote_ptr.h"
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+static inline int strprefix(const char* s1, const char* s2) {
+ while (1) {
+ if (!*s1) {
+ return 1;
+ }
+ if (*s1 != *s2) {
+ return 0;
+ }
+ ++s1;
+ ++s2;
+ }
+ return 1;
+}
+
+static inline const char* extract_file_name(const char* s) {
+ const char* ret = s;
+ while (*s) {
+ if (*s == '/') {
+ ret = s + 1;
+ }
+ ++s;
+ }
+ return ret;
+}
+
+/* This header file is included by preload.c and various rr .cc files. It
+ * defines the interface between the preload library and rr. preload.c
+ * #defines RR_IMPLEMENT_PRELOAD to let us handle situations where rr and
+ * preload.c need to see slightly different definitions of the same constructs.
+ *
+ * preload.c compiles this as C code. All rr modules compile this as C++ code.
+ * We do not use 'extern "C"' because we don't actually link between C and C++
+ * and 'extern "C"' is not compatible with our use of templates below.
+ */
+
+#define SYSCALLBUF_LIB_FILENAME_BASE "librrpreload"
+#define SYSCALLBUF_LIB_FILENAME SYSCALLBUF_LIB_FILENAME_BASE ".so"
+#define SYSCALLBUF_LIB_FILENAME_PADDED SYSCALLBUF_LIB_FILENAME_BASE ".so:::"
+#define SYSCALLBUF_LIB_FILENAME_32 SYSCALLBUF_LIB_FILENAME_BASE "_32.so"
+
+#define RTLDAUDIT_LIB_FILENAME_BASE "librraudit"
+#define RTLDAUDIT_LIB_FILENAME RTLDAUDIT_LIB_FILENAME_BASE ".so"
+#define RTLDAUDIT_LIB_FILENAME_PADDED RTLDAUDIT_LIB_FILENAME_BASE ".so:::"
+#define RTLDAUDIT_LIB_FILENAME_32 RTLDAUDIT_LIB_FILENAME_BASE "_32.so"
+
+#define RRPAGE_LIB_FILENAME_BASE "librrpage"
+#define RRPAGE_LIB_FILENAME RRPAGE_LIB_FILENAME_BASE ".so"
+#define RRPAGE_LIB_FILENAME_32 RRPAGE_LIB_FILENAME_BASE "_32.so"
+
+/* Set this env var to enable syscall buffering. */
+#define SYSCALLBUF_ENABLED_ENV_VAR "_RR_USE_SYSCALLBUF"
+
+/* Size of table mapping fd numbers to syscallbuf-disabled flag. */
+#define SYSCALLBUF_FDS_DISABLED_SIZE 16384
+
+#define MPROTECT_RECORD_COUNT 1000
+
+#if defined(__x86_64__) || defined(__i386__)
+#define RR_PAGE_SYSCALL_STUB_SIZE 3
+#define RR_PAGE_SYSCALL_INSTRUCTION_END 2
+#elif defined(__aarch64__)
+#define RR_PAGE_SYSCALL_STUB_SIZE 8
+#define RR_PAGE_SYSCALL_INSTRUCTION_END 4
+#else
+#error "Must be defined for this architecture"
+#endif
+
+/* Must match generate_rr_page.py */
+#define RR_PAGE_ADDR 0x70000000
+#ifdef __aarch64__
+#define PRELOAD_LIBRARY_PAGE_SIZE 65536
+#else
+#define PRELOAD_LIBRARY_PAGE_SIZE 4096
+#endif
+#define RR_PAGE_SYSCALL_ADDR(index) \
+ ((void*)(RR_PAGE_ADDR + RR_PAGE_SYSCALL_STUB_SIZE * (index)))
+#define RR_PAGE_SYSCALL_TRACED RR_PAGE_SYSCALL_ADDR(0)
+#define RR_PAGE_SYSCALL_PRIVILEGED_TRACED RR_PAGE_SYSCALL_ADDR(1)
+#define RR_PAGE_SYSCALL_UNTRACED RR_PAGE_SYSCALL_ADDR(2)
+#define RR_PAGE_SYSCALL_UNTRACED_REPLAY_ONLY RR_PAGE_SYSCALL_ADDR(3)
+#define RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY RR_PAGE_SYSCALL_ADDR(4)
+#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED RR_PAGE_SYSCALL_ADDR(5)
+#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_REPLAY_ONLY RR_PAGE_SYSCALL_ADDR(6)
+#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY \
+ RR_PAGE_SYSCALL_ADDR(7)
+#define RR_PAGE_SYSCALL_UNTRACED_REPLAY_ASSIST RR_PAGE_SYSCALL_ADDR(8)
+#define RR_PAGE_IN_REPLAY_FLAG (RR_PAGE_ADDR + RR_PAGE_SYSCALL_STUB_SIZE * 9)
+#define RR_PAGE_BREAKPOINT_VALUE (RR_PAGE_IN_REPLAY_FLAG + 4)
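+
+/* For example, on x86/x86-64 (RR_PAGE_SYSCALL_STUB_SIZE == 3) the entries
+ * above land at 0x70000000, 0x70000003, ..., with RR_PAGE_SYSCALL_ADDR(8) at
+ * 0x70000018, so RR_PAGE_IN_REPLAY_FLAG is 0x7000001b and
+ * RR_PAGE_BREAKPOINT_VALUE is 0x7000001f. (Worked arithmetic from the macros
+ * above; the layout itself must match generate_rr_page.py.) */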
+
+/* Not ABI stable - in record page only */
+#define RR_PAGE_FF_BYTES RR_PAGE_BREAKPOINT_VALUE
+
+/* PRELOAD_THREAD_LOCALS_ADDR should not change.
+ * Tools depend on this address. */
+#define PRELOAD_THREAD_LOCALS_ADDR (RR_PAGE_ADDR + PRELOAD_LIBRARY_PAGE_SIZE)
+#ifdef __aarch64__
+#define PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE (1024 + 8 * 2)
+#else
+#define PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE 0
+#endif
+#define PRELOAD_THREAD_LOCALS_SIZE (144 + PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE)
+
+#include "rrcalls.h"
+
+/* Define macros that let us compile a struct definition either "natively"
+ * (when included by preload.c) or as a template over Arch for use by rr.
+ */
+#if defined(RR_IMPLEMENT_PRELOAD) || defined(RR_IMPLEMENT_AUDIT)
+#define TEMPLATE_ARCH
+#define PTR(T) T*
+#define PTR_ARCH(T) T*
+#define EMBED_STRUCT(T) struct T
+#define VOLATILE volatile
+#define SIGNED_LONG long
+#define UNSIGNED_LONG unsigned long
+#else
+#define TEMPLATE_ARCH template <typename Arch>
+#define PTR(T) typename Arch::template ptr<T>
+#define PTR_ARCH(T) typename Arch::template ptr<T<Arch>>
+#define EMBED_STRUCT(T) T<Arch>
+#define VOLATILE
+#define SIGNED_LONG typename Arch::signed_long
+#define UNSIGNED_LONG typename Arch::unsigned_long
+#endif
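+/* For example, a field declared as `PTR(uint8_t) buffer;` compiles to
+ * `uint8_t* buffer;` when preload.c includes this header, but to
+ * `typename Arch::template ptr<uint8_t> buffer;` when rr instantiates the
+ * containing struct as a template over Arch. */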
+
+#define PATCH_IS_MULTIPLE_INSTRUCTIONS (1 << 0)
+/* The syscall instruction is the last instruction in the patched area
+ * (rather than the first), which requires special handling.
+ */
+#define PATCH_SYSCALL_INSTRUCTION_IS_LAST (1 << 1)
+/* All instructions in the patch are nops and their execution is thus not
+ * observable. This may allow more aggressive handling of interfering branches.
+ */
+#define PATCH_IS_NOP_INSTRUCTIONS (1 << 2)
+
+
+/**
+ * To support syscall buffering, we replace syscall instructions with a "call"
+ * instruction that calls a hook in the preload library to handle the syscall.
+ * Since the call instruction takes more space than the syscall instruction,
+ * the patch replaces one or more instructions after the syscall instruction as
+ * well; those instructions are folded into the tail of the hook function
+ * and we have multiple hook functions, each one corresponding to an
+ * instruction that follows a syscall instruction.
+ * Each instance of this struct describes an instruction that can follow a
+ * syscall and a hook function to patch with.
+ *
+ * This is not (and must not ever be) used during replay so we can change it
+ * without bumping SYSCALLBUF_PROTOCOL_VERSION.
+ */
+struct syscall_patch_hook {
+ uint8_t flags;
+ uint8_t patch_region_length;
+ /* Avoid any padding or anything that would make the layout arch-specific. */
+ uint8_t patch_region_bytes[14];
+ uint64_t hook_address;
+};
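+
+/* Note: with the layout above the struct is 24 bytes on all supported
+ * architectures: flags at offset 0, patch_region_length at 1,
+ * patch_region_bytes at 2..15 and hook_address at 16, with no implicit
+ * padding. */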
+
+/**
+ * We buffer mprotect syscalls. Their effects need to be noted so we can
+ * update AddressSpace's cache of memory layout, which stores prot bits. So,
+ * the preload code builds a list of mprotect_records corresponding to the
+ * mprotect syscalls that have been buffered. This list is read by rr whenever
+ * we flush the syscallbuf, and its effects performed. The actual mprotect
+ * syscalls are performed during recording and replay.
+ *
+ * We simplify things by making this arch-independent.
+ */
+struct mprotect_record {
+ uint64_t start;
+ uint64_t size;
+ int32_t prot;
+ int32_t padding;
+};
+
+/**
+ * Must be arch-independent.
+ * Variables used to communicate between preload and rr.
+ * We package these up into a single struct to simplify the preload/rr
+ * interface.
+ * You can add to the end of this struct without breaking trace compatibility,
+ * but don't move existing fields. Do not write to it during replay except for
+ * the 'in_replay' field. Be careful reading fields during replay as noted
+ * below, since they don't all exist in all trace versions.
+ */
+struct preload_globals {
+ /* RESERVED in current versions of rr.
+ *
+ * QUIRK: With UsesGlobalsInReplayQuirk:
+ * 0 during recording, 1 during replay. Set by rr.
+ * This MUST NOT be used in conditional branches. It should only be used
+ * as the condition for conditional moves so that control flow during replay
+ * does not diverge from control flow during recording.
+ * We also have to be careful that values different between record and replay
+ * don't accidentally leak into other memory locations or registers.
+ * USE WITH CAUTION.
+ */
+ unsigned char reserved_legacy_in_replay;
+ /* 0 during recording and replay, 1 during diversion. Set by rr.
+ */
+ unsigned char in_diversion;
+ /* 1 if chaos mode is enabled. DO NOT READ from rr during replay, because
+ this field is not initialized in old traces. */
+ unsigned char in_chaos;
+ /* The signal to use for desched events */
+ unsigned char desched_sig;
+ /* RESERVED */
+ int reserved;
+ /**
+ * Set by rr.
+ * For each fd, indicate a class that is valid for all fds with the given
+ * number in all tasks that share this address space. For fds >=
+   * SYSCALLBUF_FDS_DISABLED_SIZE - 1, the class is given by
+   * syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE - 1]. See
+   * enum syscallbuf_fd_classes below.
+   */
+ VOLATILE char syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE];
+
+ /* WARNING! SYSCALLBUF_FDS_DISABLED_SIZE can change, so
+ access to the following fields during replay is dangerous. Use
+ PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED or something
+ like it! */
+  /* mprotect records. Set by preload. Use
+ PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED to access. */
+ struct mprotect_record mprotect_records[MPROTECT_RECORD_COUNT];
+ /* Random seed that can be used for various purposes. DO NOT READ from rr
+ during replay, because this field does not exist in old traces. */
+ uint64_t random_seed;
+ /* RESERVED in current versions of rr.
+ *
+ * QUIRK: With UsesGlobalsInReplayQuirk:
+ * Indicates the value (in 8-byte increments) at which to raise a SIGSEGV
+ * trap once reached. NOTE: This remains constant during record, and is
+ * used only during replay. The same restrictions as in_replay above apply.
+ *
+ * Use PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED to access during
+ * replay. */
+ uint64_t reserved_legacy_breakpoint_value;
+ /* Indicates whether or not all tasks in this address space have the same
+ fd table. Set by rr during record (modifications are recorded).
+ Read by the syscallbuf. Not read during replay. */
+ unsigned char fdt_uniform;
+ /* The CPU we're bound to, if any; -1 if not bound. Not read during replay. */
+ int32_t cpu_binding;
+};
+
+/**
+ * Represents syscall params. Makes it simpler to pass them around,
+ * and avoids pushing/popping all the data for calls.
+ */
+TEMPLATE_ARCH
+struct syscall_info {
+ SIGNED_LONG no;
+ SIGNED_LONG args[6];
+};
+
+TEMPLATE_ARCH
+struct robust_list_info {
+ PTR(void) head;
+ uint32_t len;
+};
+
+TEMPLATE_ARCH
+struct rseq_info {
+ PTR(void) rseq;
+ uint32_t len;
+ uint32_t sig;
+};
+
+/**
+ * Can be architecture dependent. The rr process does not manipulate
+ * these except to save and restore the values on task switches so that
+ * the values are always effectively local to the current task. rr also
+ * sets the |syscallbuf_stub_alt_stack| field.
+ * We use this instead of regular libc TLS because sometimes buggy application
+ * code breaks libc TLS for some tasks. With this approach we can be sure
+ * thread-locals are usable for any task in any state.
+ */
+TEMPLATE_ARCH
+struct preload_thread_locals {
+ /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
+ * rr depends on.
+ * Offset of this field is hardcoded in syscall_hook.S and
+ * assembly_templates.py.
+ * Pointer to alt-stack used by syscallbuf stubs (allocated at the end of
+   * the scratch buffer).
+ */
+ PTR(void) syscallbuf_stub_alt_stack;
+ /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
+ * tools can depend on.
+ * Where syscall result will be (or during replay, has been) saved.
+ */
+ PTR(int64_t) pending_untraced_syscall_result;
+ /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
+ * rr depends on.
+ * Scratch space used by stub code.
+ */
+ PTR(void) stub_scratch_1;
+ /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
+ * rr depends on.
+ */
+ int32_t alt_stack_nesting_level;
+ /* Syscall hook saved flags (bottom 16 bits only) */
+ int32_t saved_flags;
+ /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
+ * rr depends on. It contains the parameters to the patched syscall, or
+ * zero if we're not processing a buffered syscall. Do not depend on this
+ * existing during replay, some traces with SYSCALLBUF_PROTOCOL_VERSION 0
+ * don't have it.
+ */
+ PTR_ARCH(const struct syscall_info) original_syscall_parameters;
+
+ /* Nonzero when thread-local state like the syscallbuf has been
+ * initialized. */
+ int32_t thread_inited;
+ /* The offset of this field MUST NOT CHANGE, it is part of the ABI tools
+ * depend on. When buffering is enabled, points at the thread's mapped buffer
+ * segment. At the start of the segment is an object of type |struct
+ * syscallbuf_hdr|, so |buffer| is also a pointer to the buffer
+ * header. */
+ PTR(uint8_t) buffer;
+ UNSIGNED_LONG buffer_size;
+ /* This is used to support the buffering of "may-block" system calls.
+ * The problem that needs to be addressed can be introduced with a
+ * simple example; assume that we're buffering the "read" and "write"
+ * syscalls.
+ *
+ * o (Tasks W and R set up a synchronous-IO pipe open between them; W
+ * "owns" the write end of the pipe; R owns the read end; the pipe
+ * buffer is full)
+ * o Task W invokes the write syscall on the pipe
+ * o Since write is a buffered syscall, the seccomp filter traps W
+ * directly to the kernel; there's no trace event for W delivered
+ * to rr.
+ * o The pipe is full, so W is descheduled by the kernel because W
+ * can't make progress.
+ * o rr thinks W is still running and doesn't schedule R.
+ *
+ * At this point, progress in the recorded application can only be
+ * made by scheduling R, but no one tells rr to do that. Oops!
+ *
+   * Thus enter the "desched counter". It's a perf_event for the "sw context
+   * switches" event (which, more precisely, is "sw deschedule"; it
+ * counts schedule-out, not schedule-in). We program the counter to
+ * deliver a signal to this task when there's new counter data
+ * available. And we set up the "sample period", how many descheds
+ * are triggered before the signal is delivered, to be "1". This
+ * means that when the counter is armed, the next desched (i.e., the
+ * next time the desched counter is bumped up) of this task will
+ * deliver the signal to it. And signal delivery always generates a
+ * ptrace trap, so rr can deduce that this task was descheduled and
+ * schedule another.
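+   *
+   * (Roughly, and only as an illustration - the real setup lives in rr's
+   * recorder and preload code: the counter is a perf_event with
+   * type = PERF_TYPE_SOFTWARE, config = PERF_COUNT_SW_CONTEXT_SWITCHES and
+   * sample_period = 1, with its fd configured to deliver desched_sig to this
+   * task when the counter overflows.)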
+ *
+ * The description above is sort of an idealized view; there are
+ * numerous implementation details that are documented in
+ * handle_signal.c, where they're dealt with. */
+ int32_t desched_counter_fd;
+ int32_t cloned_file_data_fd;
+ SIGNED_LONG cloned_file_data_offset;
+ PTR(void) scratch_buf;
+ UNSIGNED_LONG usable_scratch_size;
+
+ PTR(struct msghdr) notify_control_msg;
+
+ /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI
+ * rr depends on, on ARM.
+ */
+ uint8_t stub_scratch_2[PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE];
+
+  /** When the len is non-zero, there has been a buffered set_robust_list
+ * that must be accounted for. Set by preload code only, read by rr
+ * only during recording.
+ */
+ EMBED_STRUCT(robust_list_info) robust_list;
+
+ /** True when either a buffered rseq or unbuffered rseq has been called
+ * for this thread. Set by rr for buffered rseq and preload for unbuffered
+ * rseq. */
+ int32_t rseq_called;
+
+ /** When the len is non-zero, there has been a buffered rseq
+ * that must be accounted for. Set by preload code only, read by rr
+ * only during recording.
+ */
+ EMBED_STRUCT(rseq_info) rseq;
+};
+#if defined(__aarch64__) && (defined(RR_IMPLEMENT_PRELOAD) || \
+ defined(RR_IMPLEMENT_AUDIT))
+// On aarch64, the stub_scratch_2 offset is hardcoded in the syscallbuf code
+_Static_assert(offsetof(struct preload_thread_locals, stub_scratch_2) == 8 * 13,
+ "stub_scratch_2 offset mismatch");
+#endif
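+
+/* For reference: on 64-bit the fixed-offset fields above sit at
+ * syscallbuf_stub_alt_stack=0, pending_untraced_syscall_result=8,
+ * stub_scratch_1=16, alt_stack_nesting_level=24 and saved_flags=28, matching
+ * the `.set` constants in syscall_hook.S (and 0/4/8/12/16 on 32-bit). */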
+
+// The set of classes that can be assigned to each fd in syscallbuf_fd_class.
+enum syscallbuf_fd_classes {
+ // fd is invalid, all syscalls will error (syscallbuf internal use only)
+ FD_CLASS_INVALID = -1,
+ // The fd is allowed to be completely untraced. No notification to the
+ // syscall buf is required.
+ FD_CLASS_UNTRACED = 0x0,
+ // This is the most conservative option. All operations on this fd are
+ // always traced. If there is a conflict between other options, this one
+ // should be chosen.
+ FD_CLASS_TRACED = 0x1,
+  // This fd either refers to a /proc/<pid>/mem or is untraced (if this
+  // address space is shared with another fd table)
+ FD_CLASS_PROC_MEM = 0x2,
+};
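+
+/* Illustrative only (a hypothetical helper, not part of the interface): look
+ * up the class for an fd per the syscallbuf_fd_class comment above. */
+inline static char syscallbuf_fd_class_for(struct preload_globals* g, int fd) {
+  if (fd < 0 || fd >= SYSCALLBUF_FDS_DISABLED_SIZE - 1) {
+    /* All fds at or beyond the last slot share that slot's class. */
+    fd = SYSCALLBUF_FDS_DISABLED_SIZE - 1;
+  }
+  return g->syscallbuf_fd_class[fd];
+}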
+
+#define CURRENT_INIT_PRELOAD_PARAMS_VERSION 2
+
+/**
+ * Packs up the parameters passed to |SYS_rrcall_init_preload|.
+ * We use this struct because it's a little cleaner.
+ * When evolving this struct, add new fields at the end and don't
+ * depend on them during replay.
+ */
+TEMPLATE_ARCH
+struct rrcall_init_preload_params {
+ /* All "In" params. */
+ /* The syscallbuf lib's idea of whether buffering is enabled.
+ * We let the syscallbuf code decide in order to more simply
+ * replay the same decision that was recorded. */
+ int syscallbuf_enabled;
+ int syscall_patch_hook_count;
+ PTR(struct syscall_patch_hook) syscall_patch_hooks;
+ PTR(void) unused;
+ PTR(void) syscallbuf_code_start;
+ PTR(void) syscallbuf_code_end;
+ PTR(void) get_pc_thunks_start;
+ PTR(void) get_pc_thunks_end;
+ PTR(void) syscallbuf_final_exit_instruction;
+ PTR(struct preload_globals) globals;
+ union {
+ struct {
+ /* Address of the first entry of the breakpoint table.
+       * After processing a syscallbuf record (and unlocking the syscallbuf),
+ * we call a function in this table corresponding to the record processed.
+ * rr can set a breakpoint in this table to break on the completion of a
+ * particular syscallbuf record.
+ * This method of setting the breakpoint is deprecated. Instead, use the
+ * interface below. It is retained for compatibility */
+ PTR(void) breakpoint_table;
+ int breakpoint_table_entry_size;
+ };
+ struct {
+ PTR(void) breakpoint_instr_addr;
+      // Set to -1 to indicate non-legacy mode
+ int breakpoint_mode_sentinel;
+ };
+ };
+ PTR(void) syscallbuf_syscall_hook;
+};
+
+/**
+ * Packs up the inout parameters passed to |SYS_rrcall_init_buffers|.
+ * We use this struct because there are too many params to pass
+ * through registers on at least x86. (It's also a little cleaner.)
+ */
+TEMPLATE_ARCH
+struct rrcall_init_buffers_params {
+ /* The fd we're using to track desched events. */
+ int desched_counter_fd;
+ /* "Out" params. */
+ int cloned_file_data_fd;
+ /* Returned pointer to and size of the shared syscallbuf
+ * segment. */
+ PTR(void) syscallbuf_ptr;
+ /* Returned pointer to rr's syscall scratch buffer */
+ PTR(void) scratch_buf;
+ uint32_t syscallbuf_size;
+ uint32_t usable_scratch_size;
+};
+
+/**
+ * The syscall buffer comprises an array of these variable-length
+ * records, along with the header below.
+ */
+struct syscallbuf_record {
+ /* Return value from the syscall. This can be a memory
+ * address, so must be as big as a memory address can be.
+ * We use 64 bits rather than make syscallbuf_record Arch-specific as that
+ * gets cumbersome.
+ */
+ int64_t ret;
+ /* Syscall number.
+ *
+ * NB: the x86 linux ABI has 350 syscalls as of 3.9.6 and
+ * x86-64 defines 313, so this is a pretty safe storage
+ * allocation. It would be an earth-shattering event if the
+ * syscall surface were doubled in a short period of time, and
+ * even then we would have a comfortable cushion. Still,
+ *
+ * TODO: static_assert this can hold largest syscall num */
+ uint16_t syscallno;
+ /* Did the tracee arm/disarm the desched notification for this
+ * syscall? */
+ uint8_t desched : 1;
+  /* Does this record require an assist during replay? */
+ uint8_t replay_assist : 1;
+ uint8_t _flags_padding : 6;
+ uint8_t _padding;
+ /* Size of entire record in bytes: this struct plus extra
+ * recorded data stored inline after the last field, not
+ * including padding.
+ *
+ * TODO: static_assert this can repr >= buffer size */
+ uint32_t size;
+ /* Extra recorded outparam data starts here. */
+ uint8_t extra_data[0];
+};
+
+/**
+ * This struct summarizes the state of the syscall buffer. It happens
+ * to be located at the start of the buffer.
+ */
+struct syscallbuf_hdr {
+ /* The number of valid syscallbuf_record bytes in the buffer,
+ * not counting this header.
+ * Make this volatile so that memory writes aren't reordered around
+ * updates to this field. */
+ volatile uint32_t num_rec_bytes;
+ /* Number of mprotect calls since last syscallbuf flush. The last record in
+ * the list may not have been applied yet.
+ */
+ volatile uint32_t mprotect_record_count;
+ /* Number of records whose syscalls have definitely completed.
+ * May be one less than mprotect_record_count.
+ */
+ volatile uint32_t mprotect_record_count_completed;
+ /* True if the current syscall should not be committed to the
+ * buffer, for whatever reason; likely interrupted by
+ * desched. Set by rr. */
+ volatile uint8_t abort_commit;
+ /* True if, next time we exit the syscall buffer hook, libpreload should
+ * execute SYS_rrcall_notify_syscall_hook_exit to give rr the opportunity to
+ * deliver a signal and/or reset the syscallbuf. */
+ volatile uint8_t notify_on_syscall_hook_exit;
+ /* This tracks whether the buffer is currently in use for a
+ * system call or otherwise unavailable. This is helpful when
+ * a signal handler runs during a wrapped system call; we don't want
+ * it to use the buffer for its system calls. The different reasons why the
+ * buffer could be locked, use different bits of this field and the buffer
+ * may be used only if all are clear. See enum syscallbuf_locked_why for
+ * used bits.
+ */
+ volatile uint8_t locked;
+ /* Nonzero when rr needs to worry about the desched signal.
+ * When it's zero, the desched signal can safely be
+ * discarded. */
+ volatile uint8_t desched_signal_may_be_relevant;
+ /* A copy of the tasks's signal mask. Updated by preload when a buffered
+ * rt_sigprocmask executes.
+ */
+ volatile uint64_t blocked_sigs;
+ /* Incremented by preload every time a buffered rt_sigprocmask executes.
+ * Cleared during syscallbuf reset.
+ */
+ volatile uint32_t blocked_sigs_generation;
+ /* Nonzero when preload is in the process of calling an untraced
+ * sigprocmask; the real sigprocmask may or may not match blocked_sigs.
+ */
+ volatile uint8_t in_sigprocmask_critical_section;
+ /* Nonzero when the syscall was aborted during preparation without doing
+ * anything. This is set when a user seccomp filter forces a SIGSYS. */
+ volatile uint8_t failed_during_preparation;
+
+ struct syscallbuf_record recs[0];
+} __attribute__((__packed__));
+/* TODO: static_assert(sizeof(uint32_t) ==
+ * sizeof(struct syscallbuf_hdr)) */
+
+/**
+ * Each bit of syscallbuf_hdr->locked indicates a reason why the syscallbuf
+ * is locked. These are all the bits that are currently defined.
+ */
+enum syscallbuf_locked_why {
+ /* Used by the tracee, during interruptible syscalls to avoid recursion */
+ SYSCALLBUF_LOCKED_TRACEE = 0x1,
+ /* Used by the tracer to prevent syscall buffering when necessary to preserve
+ semantics (e.g. for ptracees whose syscalls are being observed) */
+ SYSCALLBUF_LOCKED_TRACER = 0x2
+};
+
+/**
+ * Return a pointer to what may be the next syscall record.
+ *
+ * THIS POINTER IS NOT GUARANTEED TO BE VALID!!! Caveat emptor.
+ */
+inline static struct syscallbuf_record* next_record(
+ struct syscallbuf_hdr* hdr) {
+ uintptr_t next = (uintptr_t)hdr->recs + hdr->num_rec_bytes;
+ return (struct syscallbuf_record*)next;
+}
+
+/**
+ * Return the amount of space that a record of |length| will occupy in
+ * the buffer if committed, including padding.
+ */
+inline static long stored_record_size(size_t length) {
+ /* Round up to a whole number of 64-bit words. */
+ return (length + 7) & ~7;
+}
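+
+/* Illustrative only (a hypothetical helper; rr's real consumers live
+ * elsewhere): walk the committed records using the two helpers above. */
+inline static void for_each_syscallbuf_record(
+    struct syscallbuf_hdr* hdr, void (*visit)(struct syscallbuf_record*)) {
+  /* num_rec_bytes does not count the header itself. */
+  uint8_t* p = (uint8_t*)hdr->recs;
+  uint8_t* end = p + hdr->num_rec_bytes;
+  while (p < end) {
+    struct syscallbuf_record* rec = (struct syscallbuf_record*)p;
+    /* rec->syscallno, rec->ret and rec->extra_data are valid here. */
+    visit(rec);
+    p += stored_record_size(rec->size);
+  }
+}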
+
+/**
+ * Return nonzero if an attempted open() of |filename| should be
+ * blocked.
+ *
+ * The background of this hack is that rr doesn't support DRI/DRM
+ * currently, so we use the blunt stick of refusing to open this
+ * interface file as a way of disabling it entirely. (In addition to
+ * tickling xorg.conf, which doesn't entirely do the trick.) It's
+ * known how to fix this particular problem, so let's not let this hack grow
+ * too much by piling on.
+ */
+inline static int is_blacklisted_filename(const char* filename) {
+ const char* f;
+ if (strprefix("/dev/dri/", filename) || streq("/dev/nvidiactl", filename) ||
+ streq("/usr/share/alsa/alsa.conf", filename) ||
+ streq("/dev/nvidia-uvm", filename)) {
+ return 1;
+ }
+ f = extract_file_name(filename);
+ return strprefix("rr-test-blacklist-file_name", f) ||
+ strprefix("pulse-shm-", f);
+}
+
+inline static int is_blacklisted_memfd(const char* name) {
+ return streq("pulseaudio", name);
+}
+
+inline static int is_blacklisted_socket(const char* filename) {
+ /* Blacklist the nscd socket because glibc communicates with the daemon over
+ * shared memory rr can't handle.
+ */
+ return streq("/var/run/nscd/socket", filename);
+}
+
+inline static int is_gcrypt_deny_file(const char* filename) {
+ return streq("/etc/gcrypt/hwf.deny", filename);
+}
+
+inline static int is_terminal(const char* filename) {
+ return strprefix("/dev/tty", filename) || strprefix("/dev/pts", filename);
+}
+
+inline static int is_proc_mem_file(const char* filename) {
+ if (!strprefix("/proc/", filename)) {
+ return 0;
+ }
+ return streq(filename + rrstrlen(filename) - 4, "/mem");
+}
+
+inline static int is_proc_fd_dir(const char* filename) {
+ if (!strprefix("/proc/", filename)) {
+ return 0;
+ }
+
+ int len = rrstrlen(filename);
+ const char* fd_bit = filename + len;
+ if (*fd_bit == '/') {
+ fd_bit--;
+ }
+
+ return strprefix("/fd", fd_bit - 3);
+}
+
+inline static int is_sys_cpu_online_file(const char* filename) {
+ return streq("/sys/devices/system/cpu/online", filename);
+}
+
+inline static int is_proc_stat_file(const char* filename) {
+ return streq("/proc/stat", filename);
+}
+
+inline static int is_rr_page_lib(const char* filename) {
+ return streq(extract_file_name(filename), RRPAGE_LIB_FILENAME) ||
+ streq(extract_file_name(filename), RRPAGE_LIB_FILENAME_32);
+}
+
+/**
+ * Returns nonzero if an attempted open() of |filename| can be syscall-buffered.
+ * When this returns zero, the open must be forwarded to the rr process.
+ * |filename| must be absolute.
+ * This is imperfect because it doesn't handle hard links and files (re)mounted
+ * in different places.
+ */
+inline static int allow_buffered_open(const char* filename) {
+ return filename &&
+ !is_blacklisted_filename(filename) && !is_gcrypt_deny_file(filename) &&
+ !is_terminal(filename) && !is_proc_mem_file(filename) &&
+ !is_proc_fd_dir(filename) && !is_sys_cpu_online_file(filename) &&
+ !is_proc_stat_file(filename) && !is_rr_page_lib(filename);
+}
+
+#endif /* RR_PRELOAD_INTERFACE_H_ */
diff --git a/rr/android/x86_64/share/rr/src/preload/raw_syscall.S b/rr/android/x86_64/share/rr/src/preload/raw_syscall.S
new file mode 100644
index 0000000..4c7b6a3
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/raw_syscall.S
@@ -0,0 +1,176 @@
+#if defined(__i386__)
+ .text
+ .globl _raw_syscall
+ .hidden _raw_syscall
+ .type _raw_syscall, @function
+_raw_syscall: /* syscallno = 4(%esp) */
+ .cfi_startproc
+ pushl %ebx /* syscallno = 8(%esp) */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %ebx, 0
+ pushl %esi /* syscallno = 12(%esp) */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %esi, 0
+ pushl %edi /* syscallno = 16(%esp) */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %edi, 0
+ pushl %ebp /* syscallno = 20(%esp) */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %ebp, 0
+
+ movl 20(%esp), %eax /* %eax = syscallno */
+ movl 24(%esp), %ebx /* %ebx = a0 */
+ movl 28(%esp), %ecx /* %ecx = a1 */
+ movl 32(%esp), %edx /* %edx = a2 */
+ movl 36(%esp), %esi /* %esi = a3 */
+ movl 40(%esp), %edi /* %edi = a4 */
+ movl 44(%esp), %ebp /* %ebp = a5 */
+
+ pushl 56(%esp)
+ .cfi_adjust_cfa_offset 4
+ pushl 56(%esp)
+ .cfi_adjust_cfa_offset 4
+
+ call *56(%esp)
+
+ addl $8,%esp
+ .cfi_adjust_cfa_offset -8
+ popl %ebp
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %ebp
+ popl %edi
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %edi
+ popl %esi
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %esi
+ popl %ebx
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %ebx
+ ret
+ .cfi_endproc
+ .size _raw_syscall, . - _raw_syscall
+
+#elif defined(__x86_64__)
+ .text
+ .globl _raw_syscall
+ .hidden _raw_syscall
+ .type _raw_syscall, @function
+_raw_syscall:
+ .cfi_startproc
+ /* Incoming args are in %rdi, %rsi, %rdx, %rcx, %r8, %r9, and 8(%rsp).
+ Syscall arguments are %rdi, %rsi, %rdx, %r10, %r8, %r9. */
+ movq %rdi, %rax /* syscall number */
+ movq %rsi, %rdi /* first syscall arg */
+ movq %rdx, %rsi /* second syscall arg */
+ movq %rcx, %rdx /* third syscall arg */
+ movq %r8, %r10 /* fourth syscall arg */
+ movq %r9, %r8 /* fifth syscall arg */
+ movq 8(%rsp), %r9 /* sixth syscall arg */
+
+ pushq 32(%rsp)
+ .cfi_adjust_cfa_offset 8
+ pushq 32(%rsp)
+ .cfi_adjust_cfa_offset 8
+
+ /* During a system call the kernel makes some user-space-visible
+ register changes:
+ a) on entry, %r11 is set to %rflags
+ b) %rcx is sometimes set to -1 (perhaps because of something rr does)
+ c) on entry or exit, some flags are sometimes changed
+ Also, during replay we may perform single-stepping which can set
+ TF (trace flag). We need to hide this.
+
+ fixup_syscall_registers is responsible for fixing up registers
+ to hide these effects when we get a ptrace trap from system calls
+ in the kernel: it clears TF from %r11, forces %rcx to -1, and sets
+ flags to fixed values (ZF+PF+IF+reserved, same as for "xor reg,reg").
+ Task::canonicalize_and_set_regs is responsible for fixing up registers
+ when we emulate a system call that was traced during recording (by
+ running to a breakpoint at that system call). It does the above
+ effects after setting %r11 to %rflags.
+
+ For untraced system calls there is no trap to rr during recording or
+ replay, so we must handle these issues here. We do not need
+ untraced system calls to behave exactly the same as traced
+ system calls, since whether a given system call was traced or not is
+ the same whether recording or replaying, but it's a good idea to
+ make them as similar as possible. We do need register values
+ to be perfectly consistent at every instruction in every replay
+ whether or not singlestepping is used (because a ReplayTimeline::mark
+ might be created at any point). During replay, untraced syscall
+ instructions are replaced with "xor %eax,%eax".
+
+ The following code is harmless for traced syscalls (and needs to be,
+ because traced syscalls go through here too).
+ */
+
+ /* Set %r11 and %rcx to the values we expect them to have after the
+ system call.
+ Set flags to ZF+PF+IF+reserved (0x246) first. This simplifies
+ everything.
+ This all has to be independent of TF being set at any point during
+ replay! But the way we're doing it here, it's trivial.
+ */
+ xor %ecx,%ecx
+ /* At this point, flags are 0x246 + possibly TF. */
+ movq $0x246,%r11
+ movq $-1,%rcx
+
+ callq *32(%rsp)
+
+ /* At this point, during recording we don't trust the kernel to have
+ restored flags correctly. It probably doesn't matter, but fix it
+ anyway. */
+ xor %ecx,%ecx
+ /* At this point, the high 32 bits of %rcx are unknown. Fix that by
+ setting to -1 to match traced syscalls. */
+ movq $-1,%rcx
+ /* At this point, %r11 is always 0x246 during replay and during
+ recording (because TF is never set during recording). Nothing to
+ fix in %r11. */
+
+ addq $16,%rsp
+ .cfi_adjust_cfa_offset -16
+ ret
+ .cfi_endproc
+ .size _raw_syscall, . - _raw_syscall
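+
+/* For reference, the C declaration this implements is approximately
+     long _raw_syscall(int syscallno, long a0, long a1, long a2, long a3,
+                       long a4, long a5, void* syscall_instruction,
+                       long stack_param_1, long stack_param_2);
+   (a sketch inferred from the register shuffling above; the authoritative
+   declaration lives in the preload C sources). */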
+
+#elif defined(__aarch64__)
+ .text
+ .globl _raw_syscall
+ .hidden _raw_syscall
+ .type _raw_syscall, @function
+_raw_syscall:
+ .cfi_startproc
+ // The two stack arguments need to be at sp + 8 and sp + 16
+ // but they are currently at sp and sp + 8.
+ // Since sp needs to be 16-byte aligned, we need to load and push them again.
+ str x30, [sp, -32]!
+ .cfi_def_cfa_offset 32
+ .cfi_offset x30, -32
+ ldp x8, x30, [sp, 32]
+ stp x8, x30, [sp, 8]
+ mov x8,x0
+ mov x0,x1
+ mov x1,x2
+ mov x2,x3
+ mov x3,x4
+ mov x4,x5
+ mov x5,x6
+ blr x7
+ ldr x30, [sp], 32
+ .cfi_def_cfa_offset 0
+ .cfi_restore x30
+ ret
+ .cfi_endproc
+ .size _raw_syscall, . - _raw_syscall
+#else
+#error unknown CPU architecture
+#endif /* __i386__/__x86_64__ */
+ .global _syscallbuf_code_end
+ .hidden _syscallbuf_code_end
+_syscallbuf_code_end:
+
+ .section .note.GNU-stack,"",@progbits
+ .previous
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_page.S b/rr/android/x86_64/share/rr/src/preload/rr_page.S
new file mode 100644
index 0000000..e0d253e
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/rr_page.S
@@ -0,0 +1,100 @@
+// # Layout of the librrpage.so file
+//
+// The `rr page` is a special page mapped in low memory (at RR_PAGE_ADDR) that
+// contains syscall instructions at known ip values. These values must be fixed
+// for all processes in a given rr session, since rr cannot adjust the seccomp
+// filter that makes use of these values once it has been set. `librrpage.so`
+// contains this page, and rr will map it in place at process start and inform
+// the process about it by passing it as the address of the vdso. This way
+// the tracee's unwinders, as well as GDB, will load the librrpage.so symbols and
+// unwind info and function correctly if execution is stopped in these locations.
+//
+// The `librrpage.so` file is made up of five pages:
+// 1: The ELF header, dynamic symbol/string table, and eh_frame sections
+// 2: The ELF section, symbol string tables (moved here in a post-processing step)
+// 3: A fake vdso that rr will ask the kernel to treat as the real vdso
+// 4: The rr page to be used during recording
+// 5: The rr page to be used during replay
+//
+// During record, rr will map the first four pages of librrpage.so only.
+// During replay, rr will replace the record page by the replay page.
+// Note, however, that we only have one copy of the eh_frame and symbol
+// information - we expect all offsets and unwind instructions to match between
+// the record and replay versions (anything else would likely result in
+// divergences anyway).
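+//
+// Concretely, for x86-64 (4 KiB pages) the rr_page.ld linker script in this
+// change places .sh_placeholder at 0x6fffe000, the fake vdso at 0x6ffff000,
+// the record page at 0x70000000 and the replay page at 0x70001000.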
+
+#ifdef __i386__
+#define CALL \
+ int $0x80; \
+ ret
+#define NOCALL \
+ xor %eax, %eax; \
+ ret
+#define TRAP \
+ nop; int $3; \
+ ret
+#define PAGE_ALIGN \
+ .align 0x1000
+#define PRELOAD_LIBRARY_PAGE_SIZE 0x1000
+#elif defined(__x86_64__)
+#define CALL \
+ syscall; \
+ ret
+#define NOCALL \
+ xor %eax, %eax; \
+ ret
+#define TRAP \
+ nop; int $3; \
+ ret
+#define PAGE_ALIGN \
+ .align 0x1000
+#define PRELOAD_LIBRARY_PAGE_SIZE 0x1000
+#elif defined(__aarch64__)
+#define CALL \
+ svc #0; \
+ ret
+#define NOCALL \
+ movz x0, #0; \
+ ret
+#define TRAP \
+ brk #0; \
+ ret
+#define PAGE_ALIGN \
+ .align 16
+#define PRELOAD_LIBRARY_PAGE_SIZE 0x10000
+#endif
+
+.section .sh_placeholder, "a"
+PAGE_ALIGN
+.fill PRELOAD_LIBRARY_PAGE_SIZE, 1, 0xff
+
+.section .vdso.text, "a", @progbits
+PAGE_ALIGN
+
+#include "rr_vdso.S"
+
+.section .record.text, "a", @progbits
+PAGE_ALIGN
+
+.global rr_page_start
+rr_page_start:
+
+#define LABEL(name) #name:;
+#define STARTPROC(name) #name:; .cfi_startproc
+#define STARTPROC_GLOBAL(name) .global #name; #name:; .cfi_startproc
+#define CFI_ENDPROC .cfi_endproc
+#include "rr_page_instructions.S"
+
+.section .replay.text, "", @progbits
+PAGE_ALIGN
+replay_page:
+// No CFI instructions or symbols for the replay page - we'll implicitly share
+// those of the record copy
+#undef LABEL
+#undef STARTPROC
+#undef CFI_ENDPROC
+#define LABEL(name)
+#define STARTPROC(name)
+#define CFI_ENDPROC
+#define IS_REPLAY 1
+#include "rr_page_instructions.S"
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_page.ld b/rr/android/x86_64/share/rr/src/preload/rr_page.ld
new file mode 100644
index 0000000..df30100
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/rr_page.ld
@@ -0,0 +1,58 @@
+PHDRS
+{
+ header PT_LOAD FILEHDR PHDRS;
+ text PT_LOAD ;
+ dynamic PT_DYNAMIC ;
+ note PT_NOTE ;
+ eh_frame 0x6474e550 ;
+ replay PT_NULL;
+}
+SECTIONS
+{
+ . = 0x70000000 - 3 * 4096 + SIZEOF_HEADERS;
+ .eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) } :header :eh_frame
+ .eh_frame : { KEEP (*(.eh_frame)) *(.eh_frame.*) } :header :eh_frame
+ .note.gnu.build-id : { *(.note.gnu.build-id) } :header :note
+ .note.gnu.property : { *(.note.gnu.property) } :header :note
+ .hash : { *(.hash) } :header
+ .gnu.hash : { *(.gnu.hash) } :header
+ .dynsym : { *(.dynsym) } :header
+ .dynstr : { *(.dynstr) } :header
+ .dynamic : { *(.dynamic) } :header :dynamic
+ .gnu.version : { *(.gnu.version) } :header
+ .gnu.version_d : { *(.gnu.version_d) } :header
+ .gnu.version_r : { *(.gnu.version_r) } :header
+ .got : { *(.got) } :header
+ .got.plt : { *(.got.plt) } :header
+ . = 0x70000000 - 2 * 4096;
+ /* This space in .sh_placeholder is reserved for the section table
+ symtab/strtab, which ordinarily go after the text sections,
+    but we need to have them before the rr page.
+ We move it there in a post-processing step, since linker
+ scripts can't specify these locations for legacy reasons */
+ .sh_placeholder : { *(.sh_placeholder) } :header
+ . = 0x70000000 - 4096;
+ .vdso.text : { *(.vdso.text) } :text
+ . = 0x70000000;
+ .record.text : { *(.record.text) } :text
+ . = 0x70000000 + 4096;
+ .replay.text : { *(.replay.text) } :replay
+ /DISCARD/ : { *(.debug_* ) }
+}
+
+VERSION {
+ LINUX_2.6 {
+ global:
+ gettimeofday;
+ clock_gettime;
+ __vdso_gettimeofday;
+ __vdso_clock_getres;
+ __vdso_time;
+ __vdso_clock_gettime;
+ __vdso_getcpu;
+ __kernel_clock_getres;
+ __kernel_rt_sigreturn;
+ __kernel_gettimeofday;
+ __kernel_clock_gettime;
+ };
+}
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_page_instructions.S b/rr/android/x86_64/share/rr/src/preload/rr_page_instructions.S
new file mode 100644
index 0000000..a679187
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/rr_page_instructions.S
@@ -0,0 +1,61 @@
+// See rr_page.S
+
+#ifdef IS_REPLAY
+#define REPLAY_ONLY_CALL CALL
+#define RECORD_ONLY_CALL NOCALL
+#else
+#define REPLAY_ONLY_CALL NOCALL
+#define RECORD_ONLY_CALL CALL
+#endif
+
+STARTPROC(syscall_traced)
+ CALL
+ CFI_ENDPROC
+STARTPROC(syscall_priv_traced)
+ CALL
+ CFI_ENDPROC
+STARTPROC(syscall_untraced)
+ CALL
+ CFI_ENDPROC
+STARTPROC(syscall_untraced_replay_only)
+ REPLAY_ONLY_CALL
+ CFI_ENDPROC
+STARTPROC(syscall_untraced_record_only)
+ RECORD_ONLY_CALL
+ CFI_ENDPROC
+STARTPROC(syscall_priv_untraced)
+ CALL
+ CFI_ENDPROC
+STARTPROC(syscall_priv_untraced_replay_only)
+ REPLAY_ONLY_CALL
+ CFI_ENDPROC
+STARTPROC(syscall_priv_untraced_record_only)
+ RECORD_ONLY_CALL
+ CFI_ENDPROC
+STARTPROC(syscall_untraced_replay_assist)
+#ifdef IS_REPLAY
+ TRAP
+#else
+ CALL
+#endif
+ CFI_ENDPROC
+
+LABEL(in_replay_flag)
+#ifdef IS_REPLAY
+ .byte 0x01
+#else
+ .byte 0x00
+#endif
+.byte 0x00, 0x00, 0x00
+
+// During replay, we put the breakpoint_value here. During record this remains
+// as -1, giving us 8 ff bytes at a well known address during record. These are used
+// during exit.
+LABEL(breakpoint_value)
+LABEL(ff_bytes)
+.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+// ABI stability ends here.
+
+#undef REPLAY_ONLY_CALL
+#undef RECORD_ONLY_CALL
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_vdso.S b/rr/android/x86_64/share/rr/src/preload/rr_vdso.S
new file mode 100644
index 0000000..faa1799
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/rr_vdso.S
@@ -0,0 +1,138 @@
+#ifdef __aarch64__
+#define STARTPROC_GLOBAL(name) .globl #name; .type #name, @function; \
+ #name:; .cfi_startproc
+#else
+#define STARTPROC_GLOBAL(name) .global #name; .type #name, @function; \
+ #name:; .cfi_startproc
+#endif
+#define CFI_ENDPROC .cfi_endproc
+
+// Older libs don't use the __vdso symbols, but try to look for the syscall
+// names directly. Follow the kernel vdso and make them weak aliases
+#define WEAK_ALIAS(sym, target) .weak sym; .set sym, target
+
+#if defined(__x86_64__)
+
+#define SYSCALL(which) \
+ movq $which, %rax; \
+ syscall; \
+ nop; \
+ nop; \
+ nop; \
+ retq
+
+STARTPROC_GLOBAL(__vdso_clock_getres)
+SYSCALL(229)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_getcpu)
+SYSCALL(309)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_time)
+SYSCALL(201)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_clock_gettime)
+SYSCALL(228)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_gettimeofday)
+SYSCALL(96)
+CFI_ENDPROC
+
+WEAK_ALIAS(clock_getres, __vdso_clock_getres)
+WEAK_ALIAS(getcpu, __vdso_getcpu)
+WEAK_ALIAS(time, __vdso_time)
+WEAK_ALIAS(clock_gettime, __vdso_clock_gettime)
+WEAK_ALIAS(gettimeofday,__vdso_gettimeofday)
+
+.symver gettimeofday,gettimeofday@LINUX_2.6
+.symver clock_gettime,clock_gettime@LINUX_2.6
+.symver __vdso_gettimeofday,__vdso_gettimeofday@LINUX_2.6
+.symver __vdso_clock_getres,__vdso_clock_getres@LINUX_2.6
+.symver __vdso_time,__vdso_time@LINUX_2.6
+.symver __vdso_clock_gettime,__vdso_clock_gettime@LINUX_2.6
+.symver __vdso_getcpu,__vdso_getcpu@LINUX_2.6
+
+#elif defined(__i386__)
+
+// __vdso functions use the C calling convention, so
+// we have to set up the syscall parameters here.
+// No x86-32 __vdso functions take more than two parameters.
+#define SYSCALL(which) \
+ push %ebx; \
+ .cfi_adjust_cfa_offset 4; \
+ .cfi_rel_offset %ebx, 0; \
+ mov 8(%esp),%ebx; \
+ mov 12(%esp),%ecx; \
+ mov $which, %eax; \
+ int $0x80; \
+ nop; \
+ nop; \
+ nop; \
+ pop %ebx; \
+ .cfi_adjust_cfa_offset -4; \
+ .cfi_restore %ebx; \
+ ret
+
+// N.B.: We depend on this being the first symbol in the vdso page.
+STARTPROC_GLOBAL(__kernel_vsyscall)
+int $0x80
+nop
+nop
+nop
+ret
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_clock_getres)
+SYSCALL(266)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_time)
+SYSCALL(13)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_clock_gettime)
+SYSCALL(265)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_clock_gettime64)
+SYSCALL(403)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__vdso_gettimeofday)
+SYSCALL(78)
+CFI_ENDPROC
+
+WEAK_ALIAS(clock_getres, __vdso_clock_getres)
+WEAK_ALIAS(time, __vdso_time)
+WEAK_ALIAS(clock_gettime, __vdso_clock_gettime)
+WEAK_ALIAS(clock_gettime64, __vdso_clock_gettime64)
+WEAK_ALIAS(gettimeofday,__vdso_gettimeofday)
+
+.symver __vdso_gettimeofday,__vdso_gettimeofday@LINUX_2.6
+.symver __vdso_clock_getres,__vdso_clock_getres@LINUX_2.6
+.symver __vdso_time,__vdso_time@LINUX_2.6
+.symver __vdso_clock_gettime,__vdso_clock_gettime@LINUX_2.6
+.symver __vdso_getcpu,__vdso_getcpu@LINUX_2.6
+
+#elif defined(__aarch64__)
+
+#define SYSCALL(which) \
+ mov x8, which; \
+ svc 0; \
+ ret
+
+STARTPROC_GLOBAL(__kernel_clock_getres)
+SYSCALL(114)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__kernel_rt_sigreturn)
+SYSCALL(139)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__kernel_gettimeofday)
+SYSCALL(169)
+CFI_ENDPROC
+STARTPROC_GLOBAL(__kernel_clock_gettime)
+SYSCALL(113)
+CFI_ENDPROC
+
+#else
+
+#error "VDSO Hooks not defined for this platform"
+
+#endif
+
+#undef STARTPROC_GLOBAL
+#undef CFI_ENDPROC
diff --git a/rr/android/x86_64/share/rr/src/preload/rrcalls.h b/rr/android/x86_64/share/rr/src/preload/rrcalls.h
new file mode 100644
index 0000000..b448495
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/rrcalls.h
@@ -0,0 +1,103 @@
+/* "Magic" (rr-implemented) syscalls that we use to initialize the
+ * syscallbuf.
+ *
+ * NB: magic syscalls must be positive, because with at least linux
+ * 3.8.0 / eglibc 2.17, rr only gets a trap for the *entry* of invalid
+ * syscalls, not the exit. rr can't handle that yet. */
+/* TODO: static_assert(LAST_SYSCALL < SYS_rrcall_init_buffers) */
+
+#define RR_CALL_BASE 1000
+
+/**
+ * The preload library calls SYS_rrcall_init_preload during its
+ * initialization.
+ */
+#define SYS_rrcall_init_preload RR_CALL_BASE
+/**
+ * The preload library calls SYS_rrcall_init_buffers in each thread that
+ * gets created (including the initial main thread).
+ */
+#define SYS_rrcall_init_buffers (RR_CALL_BASE + 1)
+/**
+ * The preload library calls SYS_rrcall_notify_syscall_hook_exit when
+ * unlocking the syscallbuf and notify_after_syscall_hook_exit has been set.
+ * The word at 4/8(sp) is returned in the syscall result and the word at
+ * 8/16(sp) is stored in original_syscallno.
+ */
+#define SYS_rrcall_notify_syscall_hook_exit (RR_CALL_BASE + 2)
+/**
+ * When the preload library detects that control data has been received in a
+ * syscallbuf'ed recvmsg, it calls this syscall with a pointer to the
+ * 'struct msg' returned.
+ */
+#define SYS_rrcall_notify_control_msg (RR_CALL_BASE + 3)
+/**
+ * When rr replay has restored the auxv vectors for a new process (completing
+ * emulation of exec), it calls this syscall. It takes one parameter, the tid
+ * of the task that it has restored auxv vectors for.
+ */
+#define SYS_rrcall_reload_auxv (RR_CALL_BASE + 4)
+/**
+ * When rr replay has flushed a syscallbuf 'mprotect' record, notify any outer
+ * rr of that flush. The first parameter is the tid of the task, the second
+ * parameter is the address, the third parameter is the length, and the
+ * fourth parameter is the prot.
+ */
+#define SYS_rrcall_mprotect_record (RR_CALL_BASE + 5)
+/**
+ * The audit library calls SYS_rrcall_notify_stap_semaphore_added once a batch
+ * of SystemTap semaphores have been incremented. The first parameter is the
+ * beginning of an address interval containing semaphores (inclusive) and the
+ * second parameter is the end of the address interval (exclusive).
+ *
+ * In practice a particular probe may be listed in an object's notes more than
+ * once, so be prepared to handle overlapping or redundant intervals.
+ */
+#define SYS_rrcall_notify_stap_semaphore_added (RR_CALL_BASE + 6)
+/**
+ * The audit library calls SYS_rrcall_notify_stap_semaphore_removed once a
+ * batch of previously-incremented SystemTap semaphores have been decremented.
+ * The first parameter is the beginning of an address interval containing
+ * semaphores (inclusive) and the second parameter is the end of the address
+ * interval (exclusive).
+ *
+ * In practice a particular probe may be listed in an object's notes more than
+ * once, so be prepared to handle overlapping or redundant intervals.
+ */
+#define SYS_rrcall_notify_stap_semaphore_removed (RR_CALL_BASE + 7)
+/**
+ * This syscall can be used by the application being recorded to check for the
+ * presence of the rr recorder. It is used e.g. to enable nested recording of
+ * rr itself. Use of this syscall should be limited to situations where it is
+ * absolutely necessary to avoid deviation of behavior depending on the
+ * presence or absence of rr.
+ */
+#define SYS_rrcall_check_presence (RR_CALL_BASE + 8)
+/**
+ * Requests that rr detach from this process and re-create outside of its
+ * process tree, such that it may run without seccomp.
+ */
+#define SYS_rrcall_detach_teleport (RR_CALL_BASE + 9)
+/**
+ * Requests that rr reset the time slice signal to the
+ * requested period. Used for testing interaction corner
+ * cases between the time slice signal and other rr behavior.
+ */
+#define SYS_rrcall_arm_time_slice (RR_CALL_BASE + 10)
+/**
+ * Use as
+ *
+ * int rr_freeze_tid(pid_t tid, int freeze) {
+ * return syscall(SYS_rrcall_freeze_tid, tid, freeze, 0, 0, 0, 0); }
+ *
+ * With `freeze=1`, requests that rr's Scheduler not schedule task `tid` again
+ * until unfrozen using `rr_freeze_tid(tid, 0)`. Note that kernel scheduling
+ * behavior is unaffected. Used for testing Scheduler-sensitive scenarios.
+ */
+#define SYS_rrcall_freeze_tid (RR_CALL_BASE + 11)
+/**
+ * Requests a simulated (buffered) RDTSC.
+ * The RDTSC value is returned as a 64-bit value stored in the
+ * memory location given by the first argument. RAX returns 0.
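+ *
+ * Illustrative use only (assumes the libc syscall() wrapper):
+ *   uint64_t tsc = 0;
+ *   syscall(SYS_rrcall_rdtsc, &tsc);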
+ */
+#define SYS_rrcall_rdtsc (RR_CALL_BASE + 12)
\ No newline at end of file
diff --git a/rr/android/x86_64/share/rr/src/preload/syscall_hook.S b/rr/android/x86_64/share/rr/src/preload/syscall_hook.S
new file mode 100644
index 0000000..45b4d98
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/syscall_hook.S
@@ -0,0 +1,1047 @@
+#if defined(__aarch64__)
+ .set preload_thread_locals,0x70010000
+#else
+ .set preload_thread_locals,0x70001000
+#endif
+
+ .global _syscallbuf_code_start
+ .hidden _syscallbuf_code_start
+
+ .global _syscallbuf_final_exit_instruction
+ .hidden _syscallbuf_final_exit_instruction
+ .type _syscallbuf_final_exit_instruction, @function
+
+#define DW_OP_CONST4U(val) \
+ 0x0c, /* DW_OP_const4u */ \
+ /* Individually place bytes */ \
+ (val) & 0xFF, \
+ ((val) & (0xFF << 0x8)) >> 0x8, \
+ ((val) & (0xFF << 0x10)) >> 0x10, \
+ ((val) & (0xFF << 0x18)) >> 0x18
+
+#define DW_OP_CONST8U(val) \
+ 0x0e, /* DW_OP_const8u */ \
+ /* Individually place bytes */ \
+ (val) & 0xFF, \
+ ((val) & (0xFF << 0x8)) >> 0x8, \
+ ((val) & (0xFF << 0x10)) >> 0x10, \
+ ((val) & (0xFF << 0x18)) >> 0x18, \
+ ((val) & (0xFF << 0x20)) >> 0x20, \
+ ((val) & (0xFF << 0x28)) >> 0x28, \
+ ((val) & (0xFF << 0x30)) >> 0x30, \
+ ((val) & (0xFF << 0x38)) >> 0x38
+
+#define REG_AT_ADDR32(reg, addr) \
+ .cfi_escape 0x10, /* DW_CFA_expression */ \
+ reg, \
+ 0x05, /* 5 byte expression follows */ \
+ DW_OP_CONST4U(addr)
+#define REG_AT_ADDR64(reg, addr) \
+ .cfi_escape 0x10, /* DW_CFA_expression */ \
+ reg, \
+ 0x09, /* 9 byte expression follows */ \
+ DW_OP_CONST8U(addr)
+
+// 10 bytes of LEB128 are enough to encode a 64-bit integer, and we shouldn't
+// really need anything longer than that.
+#define COUNT_LEB128(lebs...) \
+ _COUNT_LEB128(lebs, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
+#define _COUNT_LEB128(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
+
+#define REG_AT_REG_OFFSET(reg, base, lebs...) \
+ .cfi_escape 0x10, /* DW_CFA_expression */ \
+ reg, \
+ (COUNT_LEB128(lebs) + 1), /* 1 byte + LEB128 bytes */ \
+ (0x70 + base), /* DW_OP_breg0 + base */ \
+ lebs
+
+#if defined(__i386__)
+.text
+.set syscallbuf_stub_alt_stack, preload_thread_locals
+.set stub_scratch_1, preload_thread_locals + 8
+.set alt_stack_nesting_level, preload_thread_locals + 12
+.set saved_flags, preload_thread_locals + 16
+
+.p2align 4
+
+_syscallbuf_code_start:
+/* Insert a NOP here so we have no symbol clashes. Otherwise
+ in some configurations (gdb 7.7.1, Ubuntu 14.04) gdb sometimes gets confused.
+ */
+ nop
+
+
+_syscallbuf_final_exit_instruction:
+ jmp *(stub_scratch_1)
+
+_syscall_hook_trampoline:
+ .cfi_startproc
+ /* Build a |struct syscall_info| by pushing all the syscall
+ * args and the number onto the stack. */
+ /* struct syscall_info info; */
+ pushl %ebp /* info.args[5] = $ebp; */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %ebp, 0
+ pushl %edi /* info.args[4] = $edi; */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %edi, 0
+ pushl %esi /* info.args[3] = $esi; */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %esi, 0
+ pushl %edx /* info.args[2] = $edx; */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %edx, 0
+ pushl %ecx /* info.args[1] = $ecx; */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %ecx, 0
+ pushl %ebx /* info.args[0] = $ebx; */
+ .cfi_adjust_cfa_offset 4
+ .cfi_rel_offset %ebx, 0
+ pushl %eax /* info.no = $eax; */
+ .cfi_adjust_cfa_offset 4
+
+ /* $esp points at &info. Push that pointer on the stack as
+ * our arg for vsyscall_hook().
+ * Use %ebp as our temporary CFA register here. Don't use %ebx or
+ * any other GP register, since x86-64 gdb 7.7 (at least) treats all GP
+ * regs other than %esp/%ebp as *signed* and sign-extends their values.
+ * Having some CFA values sign-extended and others not breaks gdb
+ * stack walking.
+ */
+ movl %esp, %ebp
+ .cfi_def_cfa_register %ebp
+
+ /* Align stack to 16 bytes */
+ and $0xfffffff0,%esp
+
+ /* Save XMM registers */
+ sub $0x80,%esp
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,0x10(%esp)
+ movdqa %xmm2,0x20(%esp)
+ movdqa %xmm3,0x30(%esp)
+ movdqa %xmm4,0x40(%esp)
+ movdqa %xmm5,0x50(%esp)
+ movdqa %xmm6,0x60(%esp)
+ movdqa %xmm7,0x70(%esp)
+
+ sub $12,%esp
+ pushl %ebp
+
+ call syscall_hook
+ /* $eax = vsyscall_hook(&info); */
+
+ movdqa 0x10(%esp),%xmm0
+ movdqa 0x20(%esp),%xmm1
+ movdqa 0x30(%esp),%xmm2
+ movdqa 0x40(%esp),%xmm3
+ movdqa 0x50(%esp),%xmm4
+ movdqa 0x60(%esp),%xmm5
+ movdqa 0x70(%esp),%xmm6
+ movdqa 0x80(%esp),%xmm7
+
+ mov $saved_flags, %esp
+ popfw
+ /* From here on, non-application flag changes are not allowed */
+
+ /* Restore ESP */
+ mov %ebp, %esp
+ .cfi_def_cfa_register %esp
+
+ /* $eax is now the syscall return value. Erase |info.no| from the
+ * stack so that we can restore the other registers we saved. */
+ lea 4(%esp),%esp
+ .cfi_adjust_cfa_offset -4
+
+ /* Contract of __kernel_vsyscall() and real syscalls is that even
+ * callee-save registers aren't touched, so we restore everything
+ * here. */
+ popl %ebx
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %ebx
+ popl %ecx
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %ecx
+ popl %edx
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %edx
+ popl %esi
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %esi
+ popl %edi
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %edi
+ mov (alt_stack_nesting_level),%ebp
+ lea -1(%ebp),%ebp
+ mov %ebp,(alt_stack_nesting_level)
+ popl %ebp
+ .cfi_adjust_cfa_offset -4
+ .cfi_restore %ebp
+
+ ret
+ .cfi_endproc
+ .size _syscall_hook_trampoline, .-_syscall_hook_trampoline
+
+#define SYSCALLHOOK_START(name) \
+ .global name; \
+ .hidden name; \
+ .type name, @function; \
+name: \
+ .cfi_startproc; \
+ .cfi_def_cfa_offset 0; \
+ .cfi_offset %eip, 0; \
+ .cfi_offset %esp, 4
+
+#define SYSCALLHOOK_END(name) \
+ pop (stub_scratch_1); \
+ .cfi_adjust_cfa_offset -4; \
+ pop %esp; \
+ .cfi_same_value %esp; \
+ REG_AT_ADDR32(0x08 /* %eip */, stub_scratch_1); \
+ jmp _syscallbuf_final_exit_instruction; \
+ .cfi_endproc; \
+ .size name, .-name
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_3d_01_f0_ff_ff)
+ call _syscall_hook_trampoline
+ cmpl $0xfffff001,%eax
+SYSCALLHOOK_END(_syscall_hook_trampoline_3d_01_f0_ff_ff)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_90_90_90)
+ call _syscall_hook_trampoline
+SYSCALLHOOK_END(_syscall_hook_trampoline_90_90_90)
+
+/* Declare gcc get_pc thunks here so they're in a known region of code */
+
+ .global _get_pc_thunks_start
+ .hidden _get_pc_thunks_start
+_get_pc_thunks_start:
+
+#define THUNK(name, reg) \
+ .section .text.__x86.get_pc_thunk.name,"axG",@progbits,__x86.get_pc_thunk.name,comdat; \
+ .global __x86.get_pc_thunk.name; \
+ .hidden __x86.get_pc_thunk.name; \
+ .type __x86.get_pc_thunk.name, @function; \
+__x86.get_pc_thunk.name: \
+ .cfi_startproc; \
+ movl (%esp), %reg; \
+ ret; \
+ .cfi_endproc
+
+THUNK(ax, eax)
+THUNK(bx, ebx)
+THUNK(cx, ecx)
+THUNK(dx, edx)
+THUNK(si, esi)
+THUNK(di, edi)
+THUNK(bp, ebp)
+
+ .global _get_pc_thunks_end
+ .hidden _get_pc_thunks_end
+_get_pc_thunks_end:
+
+#elif defined(__x86_64__)
+ .text
+
+ .set stub_scratch_1, preload_thread_locals + 16
+ .set alt_stack_nesting_level, preload_thread_locals + 24
+ .set saved_flags, preload_thread_locals + 28
+
+ .p2align 4
+_syscallbuf_code_start:
+
+_syscall_hook_trampoline:
+ .cfi_startproc
+ /* Save RBX because we need a callee-saves register */
+ pushq %rbx
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rbx, 0
+
+ /* Build a |struct syscall_info| on the stack by pushing the arguments
+ and syscall number. */
+ pushq %r9
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %r9, 0
+ pushq %r8
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %r8, 0
+ pushq %r10
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %r10, 0
+ pushq %rdx
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rdx, 0
+ pushq %rsi
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rsi, 0
+ pushq %rdi
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rdi, 0
+ pushq %rax
+ .cfi_adjust_cfa_offset 8
+ .cfi_rel_offset %rax, 0
+
+ /* Align stack */
+ mov %rsp,%rbx
+ .cfi_def_cfa_register %rbx
+ and $0xfffffffffffffff0,%rsp
+
+ /* Save XMM registers */
+ sub $0x80,%rsp
+ movdqa %xmm0,(%rsp)
+ movdqa %xmm1,0x10(%rsp)
+ movdqa %xmm2,0x20(%rsp)
+ movdqa %xmm3,0x30(%rsp)
+ movdqa %xmm4,0x40(%rsp)
+ movdqa %xmm5,0x50(%rsp)
+ movdqa %xmm6,0x60(%rsp)
+ movdqa %xmm7,0x70(%rsp)
+
+ /* Save registers that aren't callee-saves preserved by syscall_hook,
+ and that we aren't already restoring from the syscall args */
+ push %rcx
+ push %r11
+ /* stack is 16-byte aligned again for entry to C */
+
+ /* Call our hook. */
+ mov %rbx,%rdi
+ callq syscall_hook
+
+ pop %r11
+ pop %rcx
+
+ /* Restore XMM registers */
+ movdqa (%rsp),%xmm0
+ movdqa 0x10(%rsp),%xmm1
+ movdqa 0x20(%rsp),%xmm2
+ movdqa 0x30(%rsp),%xmm3
+ movdqa 0x40(%rsp),%xmm4
+ movdqa 0x50(%rsp),%xmm5
+ movdqa 0x60(%rsp),%xmm6
+ movdqa 0x70(%rsp),%xmm7
+
+ mov $saved_flags, %rsp
+ popfw
+ /* From here on, non-application flag changes are not allowed */
+
+ mov %rbx,%rsp
+ .cfi_def_cfa_register %rsp
+
+  /* On entry, we pushed %rax, the syscall number. But we don't
+     want to |pop %rax|, as that would overwrite our return value. Skip over it. */
+ pop %rdi
+ .cfi_adjust_cfa_offset -8
+
+ /* We don't really *need* to restore these, since the kernel could have
+ trashed them all anyway. But it seems reasonable to do so. */
+ pop %rdi
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %rdi
+ pop %rsi
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %rsi
+ pop %rdx
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %rdx
+ pop %r10
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %r10
+ pop %r8
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %r8
+ mov (alt_stack_nesting_level),%r9d
+ lea -1(%r9),%r9
+ mov %r9d,(alt_stack_nesting_level)
+ pop %r9
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %r9
+
+ pop %rbx
+ .cfi_adjust_cfa_offset -8
+ .cfi_restore %rbx
+
+ /* ...and we're done. */
+ ret
+ .cfi_endproc
+ .size _syscall_hook_trampoline, . - _syscall_hook_trampoline
+
+_syscallbuf_final_exit_instruction:
+ jmp *(stub_scratch_1)
+
+/**
+ * Ok, bear with me here. When gdb sees our stack switch, it gets suspicious and if
+ * we're unlucky may decide that our unwind info is broken and abort the unwind. However,
+ * it decides to allow the unwind to proceed anyway if we happen to be in a function called
+ * __morestack (because that's what gcc calls its stack switching mechanism). Now,
+ * GDB does the stack switching comparison based on the CFA. What we thus need to do is keep the
+ * CFA pointing to the old stack until we get to a function named __morestack. We set the CFA for every
+ * syscallhook to what it will be at the end of the function (which, well, is an ok definition
+ * of the CFA). Then, we insert a __morestack function (still with the old CFA) that just jumps
+ * through to the trampoline. This way, we can force gdb's stack switch detection to think the
+ * stack switch happens between the hook and the common trampoline code (and add a __morestack
+ * local symbol to the trampoline code to avoid GDB messing with our stack trace).
+ */
+#define CFA_AT_RSP_OFFSET(offset) \
+.cfi_escape 0x0f, /* DW_CFA_def_cfa_expression */\
+ 0x03, /* 3 bytes follow */\
+ 0x77, offset, /* DW_OP_breg7, offset */\
+ 0x06; /* DW_OP_deref */
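+/* i.e. CFA = *(%rsp + offset): recover the pre-switch stack pointer that the
+   stub saved on the scratch stack (see the SYSCALLHOOK_START comment below). */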
+
+#define RSP_IS_CFA \
+.cfi_escape 0x16, /* DW_CFA_val_expression */\
+ 0x7, /* %rsp */\
+ 0; /* 0 bytes follow */
+
+#define RSP_IS_CFA_PLUS_OFFSET(offset) \
+.cfi_escape 0x16, /* DW_CFA_val_expression */\
+ 0x7, /* %rsp */\
+ 2, /* 2 bytes follow */\
+ 0x23, /* DW_OP_plus_uconst */\
+ offset;
+
+#define RSP_IS_RSP_PLUS_OFFSET(offset) \
+.cfi_escape 0x16, /* DW_CFA_val_expression */\
+ 0x07, /* %rsp */\
+ 0x02, /* 2 bytes follow */\
+ 0x77, offset; /* DW_OP_breg7, offset */
+
+#define RIP_IS_DEREF_RSP(offset) REG_AT_REG_OFFSET(0x10 /* %rip */, 7, offset)
+
+/**
+ * On syscallhook entry, the stack has been switched to the end of per-task
+ * scratch space, then the old RSP and the return address have been pushed.
+ */
+#define SYSCALLHOOK_START(name) \
+ .global name; \
+ .hidden name; \
+ .type name, @function; \
+name: \
+ .cfi_startproc; \
+ CFA_AT_RSP_OFFSET(8) \
+ RSP_IS_CFA \
+ RIP_IS_DEREF_RSP(0)
+
+#define SYSCALLHOOK_END(name) \
+ pop (stub_scratch_1); \
+ CFA_AT_RSP_OFFSET(0) \
+ REG_AT_ADDR32(0x10 /* %rip */, stub_scratch_1); \
+ pop %rsp; \
+ .cfi_def_cfa %rsp, 0; \
+ jmp _syscallbuf_final_exit_instruction; \
+ .cfi_endproc; \
+ .size name, .-name
+
+/* See note above on what __morestack is for */
+.global __morestack
+.hidden __morestack
+.type __morestack, @function
+__morestack:
+.cfi_startproc
+CFA_AT_RSP_OFFSET(16)
+RSP_IS_RSP_PLUS_OFFSET(8)
+RIP_IS_DEREF_RSP(0)
+callq _syscall_hook_trampoline
+/* GDB likes to override valid CFI with its own heuristics if the current
+ instruction is a retq. This becomes a problem here, because GDB will set
+ a breakpoint at the next instruction after the callq when continuing out of
+ `_syscall_hook_trampoline`. This `nop` makes said instruction not a retq,
+ thus preventing that GDB heuristic from kicking in and letting GDB realize
+ that it did in fact manage to step out of the `_syscall_hook_trampoline`
+ frame. */
+nop
+retq
+.cfi_endproc
+.size __morestack, .-__morestack
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_3d_01_f0_ff_ff)
+ callq __morestack
+ cmpq $0xfffffffffffff001,%rax
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_3d_01_f0_ff_ff)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_3d_00_f0_ff_ff)
+ callq __morestack
+ cmpq $0xfffffffffffff000,%rax
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_3d_00_f0_ff_ff)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_3d_00_f0_ff_ff)
+ callq __morestack
+ cmpl $0xfffff000,%eax
+SYSCALLHOOK_END(_syscall_hook_trampoline_3d_00_f0_ff_ff)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_45_f8)
+ callq __morestack
+ mov %rax,-8(%rbp)
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_45_f8)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_c3)
+ callq __morestack
+ mov %rax,%rbx
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_c3)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_8b_3c_24)
+ callq __morestack
+ /* The original instruction after the syscall is movq (%rsp),%rdi. */
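+ /* Here 8(%rsp) holds the original (pre-switch) %rsp saved by the stub, so
+    we load it and then dereference it to emulate that instruction against
+    the original stack. */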
+ movq 8(%rsp),%rdi
+ movq (%rdi),%rdi
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_8b_3c_24)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_5a_5e_c3)
+ .cfi_offset %rip, 16
+ RSP_IS_CFA_PLUS_OFFSET(24)
+ callq __morestack
+ /* The original instructions after the syscall are
+ pop %rdx; pop %rsi; retq. */
+ /* We're not returning to the dynamically generated stub, so
+ we need to fix the stack pointer ourselves. */
+ pop %rdx
+ CFA_AT_RSP_OFFSET(0)
+ pop %rsp
+ .cfi_def_cfa %rsp, 0;
+ pop %rdx
+ .cfi_adjust_cfa_offset -8
+ pop %rsi
+ .cfi_adjust_cfa_offset -8
+ pop (stub_scratch_1)
+ .cfi_adjust_cfa_offset -8
+ jmp _syscallbuf_final_exit_instruction
+
+ .cfi_endproc
+ .size _syscall_hook_trampoline_5a_5e_c3, .-_syscall_hook_trampoline_5a_5e_c3
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_89_c2_f7_da)
+ call __morestack
+ mov %eax,%edx
+ neg %edx
+SYSCALLHOOK_END(_syscall_hook_trampoline_89_c2_f7_da)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_90_90_90)
+ call __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_90_90_90)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_ba_01_00_00_00)
+ call __morestack
+ mov $1,%edx
+SYSCALLHOOK_END(_syscall_hook_trampoline_ba_01_00_00_00)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_89_c1_31_d2)
+ call __morestack
+ mov %eax,%ecx
+ xor %edx,%edx
+SYSCALLHOOK_END(_syscall_hook_trampoline_89_c1_31_d2)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_c3_nop)
+ .cfi_offset %rip, 16
+ RSP_IS_CFA_PLUS_OFFSET(24)
+ callq __morestack
+ /* The original instructions after the syscall are
+ retq; nopl 0x0(%rax,%rax,1) */
+ /* We're not returning to the dynamically generated stub, so
+ we need to fix the stack pointer ourselves. */
+ pop %rdx
+ CFA_AT_RSP_OFFSET(0)
+ pop %rsp
+ .cfi_def_cfa %rsp, 0;
+ pop (stub_scratch_1)
+ .cfi_adjust_cfa_offset -8
+ jmp _syscallbuf_final_exit_instruction
+
+ .cfi_endproc
+ .size _syscall_hook_trampoline_c3_nop, .-_syscall_hook_trampoline_c3_nop
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_40_80_f6_81)
+ xor $0x81, %sil
+ call __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_40_80_f6_81)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_8d_b3_f0_08_00_00)
+ lea 0x8f0(%rbx),%rsi
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_8d_b3_f0_08_00_00)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_49_89_ca)
+ mov %rcx, %r10
+ call __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_49_89_ca)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_c1)
+ callq __morestack
+ mov %rax, %rcx
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_c1)
+
+#define MOV_RDX_VARIANTS \
+ MOV_RDX_TO_REG(48, d0) \
+ MOV_RDX_TO_REG(48, d1) \
+ MOV_RDX_TO_REG(48, d2) \
+ MOV_RDX_TO_REG(48, d3) \
+ MOV_RDX_TO_REG(48, d4) \
+ MOV_RDX_TO_REG(48, d5) \
+ MOV_RDX_TO_REG(48, d6) \
+ MOV_RDX_TO_REG(48, d7) \
+ MOV_RDX_TO_REG(49, d0) \
+ MOV_RDX_TO_REG(49, d1) \
+ MOV_RDX_TO_REG(49, d2) \
+ MOV_RDX_TO_REG(49, d3) \
+ MOV_RDX_TO_REG(49, d4) \
+ MOV_RDX_TO_REG(49, d5) \
+ MOV_RDX_TO_REG(49, d6) \
+ MOV_RDX_TO_REG(49, d7)
+
+#define MOV_RDX_TO_REG(rex, op) \
+SYSCALLHOOK_START(_syscall_hook_trampoline_##rex##_89_##op); \
+ callq __morestack; \
+ .byte 0x##rex, 0x89, 0x##op; \
+SYSCALLHOOK_END(_syscall_hook_trampoline_##rex##_89_##op);
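+/* The .byte sequence above re-emits the displaced 'mov %rdx,<reg>'
+   instruction: REX prefix 0x48 or 0x49, opcode 0x89, and a ModRM byte in the
+   0xd0-0xd7 range selecting the destination register. */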
+
+ MOV_RDX_VARIANTS
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_c1_e2_20)
+ callq __morestack
+ shl $32, %rdx
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_c1_e2_20)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_49_8b_44_24_28)
+ callq __morestack
+ mov 0x28(%r12),%rax
+SYSCALLHOOK_END(_syscall_hook_trampoline_49_8b_44_24_28)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_4c_89_f7)
+ mov %r14, %rdi
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_4c_89_f7)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_4c_89_ff)
+ mov %r15, %rdi
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_4c_89_ff)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff)
+ mov $0xffffffffffffffff,%r9
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_b8_0e_00_00_00)
+ mov $0x0e,%eax
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_b8_0e_00_00_00)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_b8_11_01_00_00)
+ mov $0x111,%eax
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_b8_11_01_00_00)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_b8_ca_00_00_00)
+ mov $0xca,%eax
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_b8_ca_00_00_00)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_be_18_00_00_00)
+ mov $0x18,%esi
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_be_18_00_00_00)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_e5)
+ /* Previous RSP is stored on the stack above our return address */
+ mov 8(%rsp),%rbp
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_e5)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_fb)
+ mov %rdi,%rbx
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_fb)
+
+SYSCALLHOOK_START(_syscall_hook_trampoline_nops)
+ callq __morestack
+SYSCALLHOOK_END(_syscall_hook_trampoline_nops)
+
+#elif defined(__aarch64__)
+ .text
+
+ .set syscallbuf_stub_alt_stack, preload_thread_locals
+ .set stub_scratch_1, preload_thread_locals + 16
+ .set alt_stack_nesting_level, preload_thread_locals + 24
+ .set stub_scratch_2, preload_thread_locals + 8 * 13
+
+// Store a pair of x registers to the stack at the given offset from sp.
+// Assumes that the CFA register is sp.
+#define STPX_STACK(r1, r2, offset) \
+ stp x##r1, x##r2, [sp, offset]; \
+ .cfi_rel_offset x##r1, offset; \
+ .cfi_rel_offset x##r2, offset + 8
+
+#define LDPX_STACK(r1, r2, offset) \
+ ldp x##r1, x##r2, [sp, offset]; \
+ .cfi_same_value x##r1; \
+ .cfi_same_value x##r2
+
+// Store a pair of q registers to the stack at the given offset from sp.
+// Assumes that the CFA register is sp.
+#define STPQ_STACK(r1, r2, offset) \
+ stp q##r1, q##r2, [sp, offset]; \
+ .cfi_rel_offset q##r1, offset; \
+ .cfi_rel_offset q##r2, offset + 16
+
+#define LDPQ_STACK(r1, r2, offset) \
+ ldp q##r1, q##r2, [sp, offset]; \
+ .cfi_same_value q##r1; \
+ .cfi_same_value q##r2
+
+// Mark all temporary registers as same_value except x8 and x15
+#define SAME_VALUE_X_NO8_NO15 \
+ .cfi_same_value x0; \
+ .cfi_same_value x1; \
+ .cfi_same_value x2; \
+ .cfi_same_value x3; \
+ .cfi_same_value x4; \
+ .cfi_same_value x5; \
+ .cfi_same_value x6; \
+ .cfi_same_value x7; \
+ .cfi_same_value x9; \
+ .cfi_same_value x10; \
+ .cfi_same_value x11; \
+ .cfi_same_value x12; \
+ .cfi_same_value x13; \
+ .cfi_same_value x14; \
+ .cfi_same_value x16; \
+ .cfi_same_value x17; \
+ .cfi_same_value x18
+
+#define SAME_VALUE_X \
+ SAME_VALUE_X_NO8_NO15; \
+ .cfi_same_value x8; \
+ .cfi_same_value x15
+
+#define SAME_VALUE_ALL_Q \
+ .cfi_same_value q0; \
+ .cfi_same_value q1; \
+ .cfi_same_value q2; \
+ .cfi_same_value q3; \
+ .cfi_same_value q4; \
+ .cfi_same_value q5; \
+ .cfi_same_value q6; \
+ .cfi_same_value q7; \
+ .cfi_same_value q8; \
+ .cfi_same_value q9; \
+ .cfi_same_value q10; \
+ .cfi_same_value q11; \
+ .cfi_same_value q12; \
+ .cfi_same_value q13; \
+ .cfi_same_value q14; \
+ .cfi_same_value q15; \
+ .cfi_same_value q16; \
+ .cfi_same_value q17; \
+ .cfi_same_value q18; \
+ .cfi_same_value q19; \
+ .cfi_same_value q20; \
+ .cfi_same_value q21; \
+ .cfi_same_value q22; \
+ .cfi_same_value q23; \
+ .cfi_same_value q24; \
+ .cfi_same_value q25; \
+ .cfi_same_value q26; \
+ .cfi_same_value q27; \
+ .cfi_same_value q28; \
+ .cfi_same_value q29; \
+ .cfi_same_value q30; \
+ .cfi_same_value q31
+
+ .p2align 4
+_syscallbuf_code_start:
+
+_syscall_hook_trampoline:
+ // stack frame:
+ // 208-688: q2 - q31
+ // 128-200: x10 - x18
+ // 112-128: x7, x9
+ // 104-112: x6
+ // 48-104: syscall_info
+ // 32-48: x29, x30
+ // 0-32: q0, q1
+ .cfi_startproc
+ // GAS correctly puts these in the CIE as long as they
+ // appear right after .cfi_startproc
+ SAME_VALUE_X
+ SAME_VALUE_ALL_Q
+ // Store the vector registers at the bottom so that we can take advantage of
+ // the larger pre-offset that can be encoded in the instruction
+ // to adjust the stack pointer.
+ stp q0, q1, [sp, -688]!
+ .cfi_def_cfa_offset 688
+ .cfi_rel_offset q0, 0
+ .cfi_rel_offset q1, 0 + 16
+ STPX_STACK(29, 30, 32)
+ /* Build a |struct syscall_info| on the stack by pushing the arguments
+ and syscall number. */
+ STPX_STACK(8, 0, 48)
+ add x0, sp, 48 // x0 saved, store new argument for syscall_hook in x0.
+ STPX_STACK(1, 2, 64)
+ STPX_STACK(3, 4, 80)
+ STPX_STACK(5, 6, 96)
+ STPX_STACK(7, 9, 112)
+ STPX_STACK(10, 11, 128)
+ STPX_STACK(12, 13, 144)
+ STPX_STACK(14, 15, 160)
+ STPX_STACK(16, 17, 176)
+ str x18, [sp, 192]
+ .cfi_rel_offset x18, 192
+ STPQ_STACK(2, 3, 208)
+ STPQ_STACK(4, 5, 240)
+ STPQ_STACK(6, 7, 272)
+ // Function calls only maintain the bottom half of v8-v15,
+ // whereas syscalls maintain all the v registers,
+ // so we actually need to save and restore v8-v15 as well.
+ // (In principle we could save only the upper half, but
+ // that's too much effort, especially for the unwind info.)
+ STPQ_STACK(8, 9, 304)
+ STPQ_STACK(10, 11, 336)
+ STPQ_STACK(12, 13, 368)
+ STPQ_STACK(14, 15, 400)
+ STPQ_STACK(16, 17, 432)
+ STPQ_STACK(18, 19, 464)
+ STPQ_STACK(20, 21, 496)
+ STPQ_STACK(22, 23, 528)
+ STPQ_STACK(24, 25, 560)
+ STPQ_STACK(26, 27, 592)
+ STPQ_STACK(28, 29, 624)
+ STPQ_STACK(30, 31, 656)
+
+ bl syscall_hook
+
+ movz x29, #:abs_g1:alt_stack_nesting_level // assume 32bit address
+ movk x29, #:abs_g0_nc:alt_stack_nesting_level
+ ldr w30, [x29]
+ sub w30, w30, 1
+ str w30, [x29]
+
+ ldp x29, x30, [sp, 32]
+ .cfi_same_value x29
+ // x30 should not use same_value since its value is changed
+ // by the function call instruction
+ .cfi_restore x30
+ ldr x8, [sp, 48]
+ .cfi_same_value x8
+ LDPX_STACK(1, 2, 64)
+ LDPX_STACK(3, 4, 80)
+ LDPX_STACK(5, 6, 96)
+ LDPX_STACK(7, 9, 112)
+ LDPX_STACK(10, 11, 128)
+ LDPX_STACK(14, 15, 160)
+ LDPX_STACK(16, 17, 176)
+ ldr x18, [sp, 192]
+ .cfi_same_value x18
+
+ LDPQ_STACK(2, 3, 208)
+ LDPQ_STACK(4, 5, 240)
+ LDPQ_STACK(6, 7, 272)
+ LDPQ_STACK(8, 9, 304)
+ LDPQ_STACK(10, 11, 336)
+ LDPQ_STACK(12, 13, 368)
+ LDPQ_STACK(14, 15, 400)
+ LDPQ_STACK(16, 17, 432)
+ LDPQ_STACK(18, 19, 464)
+ LDPQ_STACK(20, 21, 496)
+ LDPQ_STACK(22, 23, 528)
+ LDPQ_STACK(24, 25, 560)
+ LDPQ_STACK(26, 27, 592)
+ LDPQ_STACK(28, 29, 624)
+ LDPQ_STACK(30, 31, 656)
+
+ ldp q0, q1, [sp], 688
+ .cfi_same_value q0
+ .cfi_same_value q1
+ .cfi_def_cfa_offset 0
+ ret
+ .cfi_endproc
+ .size _syscall_hook_trampoline, .-_syscall_hook_trampoline
+
+/**
+ * On syscallhook entry, we are still on the old stack,
+ * with x30 (lr) pointing right after the blr instruction that got us here.
+ * The old values of x15 and x30 are saved at an offset from x8, which holds
+ * the syscall number biased so that the stores land in the stub_scratch_2 area.
+ */
+ .globl _syscall_hook_trampoline_raw
+ .hidden _syscall_hook_trampoline_raw
+ .type _syscall_hook_trampoline_raw, @function
+_syscall_hook_trampoline_raw:
+ .cfi_startproc
+ // GAS correctly puts these in the CIE as long as they
+ // appear right after .cfi_startproc
+ .cfi_return_column 32 // pc
+ SAME_VALUE_X_NO8_NO15
+ SAME_VALUE_ALL_Q
+ // We define CFA as the value of the stack pointer when we enter this function
+ // as specified in aadwarf64.
+ // Since we aren't using the caller's stack, none of the registers
+ // we save will be stored relative to the CFA...
+ .cfi_def_cfa sp, 0
+ REG_AT_REG_OFFSET(0x20 /* pc */, 30, 16)
+ REG_AT_REG_OFFSET(0x0f /* x15 */, 8,
+ (stub_scratch_2 - preload_thread_locals) | 0x80, 0)
+ REG_AT_REG_OFFSET(0x1e /* x30 */, 8,
+ (stub_scratch_2 - preload_thread_locals + 8) | 0x80, 0)
+ // x8 = x8 - preload_thread_locals
+ // The last byte of the signed LEB128 number contains the top 4 bits
+ // from the 32bit negative number (obtained using the shifted 0xF mask)
+ // and 3 bits of leading ones above it (the `or`ing of the `0x70`).
+ // The top bit of the byte is 0 signaling the end of the LEB128 encoding.
+ .cfi_escape 0x16, /* DW_CFA_val_expression */ \
+ 0x08, /* x8 */ \
+ 0x06, /* length 6 */ \
+ 0x78, /* DW_OP_breg8 */ \
+ ((-preload_thread_locals) & 0x7F) | 0x80, \
+ ((-preload_thread_locals) & (0x7F << 7)) >> 7 | 0x80, \
+ ((-preload_thread_locals) & (0x7F << 14)) >> 14 | 0x80, \
+ ((-preload_thread_locals) & (0x7F << 21)) >> 21 | 0x80, \
+ ((-preload_thread_locals) & ( 0xF << 28)) >> 28 | 0x70
+ // Old gcc versions don't want to encode bti
+ // unless we specify armv8.5-a, even though it is in the nop space.
+ .inst 0xd503245f // bti c
+ mov x15, preload_thread_locals
+ // Stash away x30 so that we have two registers to use again.
+ // We can't use stub_scratch_2 since we might overwrite the data there.
+ str x30, [x15, stub_scratch_1 - preload_thread_locals]
+ .cfi_escape 0x10, /* DW_CFA_expression */ \
+ 0x20, /* pc */ \
+ 0x08, /* length 8 */ \
+ DW_OP_CONST4U(stub_scratch_1), \
+ 0x06, /* DW_OP_deref */ \
+ 0x23, /* DW_OP_plus_uconst */ \
+ 16
+ // Move the register stash region from
+ // `x8 + stub_scratch_2 - preload_thread_locals`
+ // (i.e. `stub_scratch_2 + original_x8`) to the start of `stub_scratch_2`
+ // Do it in the forward order since we know x8 >= stub_scratch_2
+ ldr x30, [x8, stub_scratch_2 - preload_thread_locals]
+ str x30, [x15, stub_scratch_2 - preload_thread_locals]
+ ldr x30, [x8, stub_scratch_2 - preload_thread_locals + 8]
+ str x30, [x15, stub_scratch_2 - preload_thread_locals + 8]
+ // Restore x8
+ movk x8, 0, lsl 16
+ .cfi_same_value x8
+ REG_AT_ADDR32(0x0f /* x15 */, stub_scratch_2)
+ REG_AT_ADDR32(0x1e /* x30 */, stub_scratch_2 + 8)
+
+ cmp x8, 0xdc // SYS_clone
+ .cfi_remember_state
+ b.eq .Lfallback_rawsyscall
+
+ ldr w30, [x15, alt_stack_nesting_level - preload_thread_locals]
+ cmp w30, 0
+ add w30, w30, 1
+ str w30, [x15, alt_stack_nesting_level - preload_thread_locals]
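+ // The cmp above is done before the increment so that the b.ne below tests
+ // the pre-increment nesting level: zero means switch to the alt stack,
+ // nonzero means we are already on it and just nest below the current sp.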
+
+ b.ne .Lnest_syscall_hook_trampoline_raw
+ ldr x30, [x15, syscallbuf_stub_alt_stack - preload_thread_locals]
+ sub x30, x30, 48
+ b .Lstackset_syscall_hook_trampoline_raw
+.Lnest_syscall_hook_trampoline_raw:
+ sub x30, sp, 48
+.Lstackset_syscall_hook_trampoline_raw:
+ // Now x30 points to the new stack with 48 bytes of space allocated
+
+ // Move sp into a normal register. Otherwise we can't store it
+ mov x15, sp
+ // Save sp to new stack.
+ str x15, [x30, 16]
+ mov sp, x30
+ REG_AT_REG_OFFSET(0x1f /* sp */, 31, 16)
+ .cfi_escape 0x0f, /* DW_CFA_def_cfa_expression */ \
+ 0x03, /* 3 bytes follow */ \
+ 0x8f, /* DW_OP_breg31 */ \
+ 16, \
+ 0x06 /* DW_OP_deref */
+ // sp is switched, x15 and x30 are free to use
+ // [stub_scratch_1] holds the stub address
+
+ // Now we need to construct the stack frame, with everything
+ // in the scratch area copied over so that we can nest again.
+ mov x15, preload_thread_locals
+ // load runtime stub address
+ ldr x30, [x15, stub_scratch_1 - preload_thread_locals]
+ // save stub return address
+ str x30, [sp]
+ // load syscall return address
+ ldr x30, [x30, 16]
+ str x30, [sp, 8]
+ ldr x30, [x15, stub_scratch_2 - preload_thread_locals]
+ str x30, [sp, 24]
+ ldr x30, [x15, stub_scratch_2 - preload_thread_locals + 8]
+ str x30, [sp, 32]
+
+ // stackframe layout
+ // 32: original x30
+ // 24: original x15
+ // 16: original sp
+ // 8: return address to syscall
+ // 0: return address to stub
+ REG_AT_REG_OFFSET(0x20 /* pc */, 31, 8)
+ REG_AT_REG_OFFSET(0x0f /* x15 */, 31, 24)
+ REG_AT_REG_OFFSET(0x1e /* x30 */, 31, 32)
+
+ bl _syscall_hook_trampoline
+
+/**
+ * _syscall_hook_trampoline restores all the registers to their previous values
+ * (except the register holding the syscall return value), so we just need to
+ * restore the registers we've overwritten during the stack switch,
+ * i.e. x15, x30 and sp.
+ * x15 and x30 will be restored when we get back to the stub,
+ * so we don't need to restore them here, but we do need to copy their values
+ * to stub_scratch_2 again so that the stub can restore them
+ * (since without a valid stack that is still the only memory
+ * we can use to restore things).
+ * We also need to store the return address to stub_scratch_1,
+ * since that helps rr set breakpoints.
+ */
+
+ movz x15, #:abs_g1:stub_scratch_2 // assume 32bit address
+ movk x15, #:abs_g0_nc:stub_scratch_2
+ ldr x30, [sp, 24] // x15
+ str x30, [x15]
+ ldr x30, [sp, 32] // x30
+ str x30, [x15, 8]
+ REG_AT_ADDR32(0x0f /* x15 */, stub_scratch_2)
+ REG_AT_ADDR32(0x1e /* x30 */, stub_scratch_2 + 8)
+ ldr x30, [sp, 8] // syscall return address
+ // tell rr breakpoint handling where we are going
+ str x30, [x15, stub_scratch_1 - stub_scratch_2]
+ REG_AT_ADDR32(0x20 /* pc */, stub_scratch_1)
+ ldr x30, [sp] // stub return address
+ ldr x15, [sp, 16] // sp
+ mov sp, x15
+ .cfi_restore sp
+ .cfi_def_cfa sp, 0
+ movz x15, #:abs_g1:stub_scratch_2 // assume 32bit address
+ movk x15, #:abs_g0_nc:stub_scratch_2
+_syscallbuf_final_exit_instruction:
+ ret
+
+.Lfallback_rawsyscall:
+ .cfi_restore_state
+ // Must not touch sp in this branch.
+ // Use x15 to remember the return address since we are only copying
+ // the first two elements of stub_scratch_2 for the child.
+ ldr x15, [x15, stub_scratch_1 - preload_thread_locals]
+ REG_AT_REG_OFFSET(0x20 /* pc */, 15, 16)
+ mov x30, 0x70000000 // RR_PAGE_SYSCALL_TRACED
+ blr x30
+ // stub_scratch_2 content is maintained by rr
+ // we need to put the syscall return address in stub_scratch_1
+ movz x30, #:abs_g1:stub_scratch_2 // assume 32bit address
+ movk x30, #:abs_g0_nc:stub_scratch_2
+ str x15, [x30, 16] // stash away stub address
+ ldr x15, [x15, 16] // syscall return address
+ .cfi_register 32, x15
+ str x15, [x30, stub_scratch_1 - stub_scratch_2]
+ REG_AT_ADDR32(0x20 /* pc */, stub_scratch_1)
+ mov x15, x30
+ ldr x30, [x15, 16]
+ b _syscallbuf_final_exit_instruction
+
+ .cfi_endproc
+ .size _syscall_hook_trampoline_raw, .-_syscall_hook_trampoline_raw
+
+#endif /* __aarch64__ */
+
+ .section .note.GNU-stack,"",@progbits
diff --git a/rr/android/x86_64/share/rr/src/preload/syscallbuf.c b/rr/android/x86_64/share/rr/src/preload/syscallbuf.c
new file mode 100644
index 0000000..c201ba7
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/syscallbuf.c
@@ -0,0 +1,4327 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#define RR_IMPLEMENT_PRELOAD
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+#include "syscallbuf.h"
+
+/**
+ * Buffer syscalls, so that rr can process the entire buffer with one
+ * trap instead of a trap per call.
+ *
+ * This file is compiled into a dso that's PRELOADed in recorded
+ * applications. The dso replaces libc syscall wrappers with our own
+ * implementation that saves nondeterministic outparams in a fixed-size
+ * buffer. When the buffer is full or the recorded application
+ * invokes an un-buffered syscall or receives a signal, we trap to rr
+ * and it records the state of the buffer.
+ *
+ * During replay, rr simply refills the buffer with the recorded data
+ * when it reaches the "flush-buffer" events that were recorded. Then
+ * rr emulates each buffered syscall, and the code here restores the
+ * client data from the refilled buffer.
+ *
+ * The crux of the implementation here is to selectively ptrace-trap
+ * syscalls. The normal (un-buffered) syscalls generate a ptrace
+ * trap, and the buffered syscalls trap directly to the kernel. This
+ * is implemented with a seccomp-bpf which examines the syscall and
+ * is implemented with a seccomp-bpf filter which examines the syscall and
+ *
+ * Because this code runs in the tracee's address space and overrides
+ * system calls, the code is rather delicate. The following rules
+ * must be followed:
+ *
+ * o No rr headers (other than seccomp-bpf.h and rr.h) may be included
+ * o All syscalls invoked by this code must be called directly, not
+ * through libc wrappers (which this file may itself indirectly override)
+ *
+ * The wrapper functions are named sys_xxxx. Each wrapper normally makes one
+ * untraced syscall or one traced syscall of the same type, but there are
+ * exceptions. For example sys_read can make a number of untraced syscalls
+ * instead of a single untraced syscall. A critical rule is that any traced
+ * or MAY_BLOCK untraced syscall *must* be the last syscall performed by the
+ * wrapper.
+ */
+
+#include <dlfcn.h>
+#include <limits.h>
+#include <unistd.h>
+#include <asm/errno.h>
+#include <asm/ioctls.h>
+#include <asm/poll.h>
+#include <asm/signal.h>
+#include <asm/siginfo.h>
+#include <asm/stat.h>
+#include <asm/statfs.h>
+#include <linux/eventpoll.h>
+#include <linux/futex.h>
+#include <linux/fcntl.h>
+#include <linux/if_packet.h>
+#include <linux/ioctl.h>
+#include <linux/mman.h>
+#include <linux/net.h>
+#include <linux/netlink.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/quota.h>
+#include <linux/resource.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/stat.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+#include <linux/un.h>
+#include <linux/utsname.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <syscall.h>
+#include <sysexits.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "preload_interface.h"
+#include "rr/rr.h"
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+#ifndef BTRFS_IOCTL_MAGIC
+#define BTRFS_IOCTL_MAGIC 0x94
+#endif
+#ifndef BTRFS_IOC_CLONE_RANGE
+struct btrfs_ioctl_clone_range_args {
+ int64_t src_fd;
+ uint64_t src_offset;
+ uint64_t src_length;
+ uint64_t dest_offset;
+};
+#define BTRFS_IOC_CLONE_RANGE \
+ _IOW(BTRFS_IOCTL_MAGIC, 13, struct btrfs_ioctl_clone_range_args)
+#endif
+#ifndef MADV_FREE
+#define MADV_FREE 8
+#endif
+
+#ifndef GRND_NONBLOCK
+#define GRND_NONBLOCK 1
+#endif
+
+struct rr_rseq {
+ uint32_t cpu_id_start;
+ uint32_t cpu_id;
+ uint64_t rseq_cs;
+ uint32_t flags;
+} __attribute__((aligned(32)));
+
+/* NB: don't include any other local headers here. */
+
+#ifdef memcpy
+#undef memcpy
+#endif
+#define memcpy you_must_use_local_memcpy
+
+static long _traced_init_syscall(int syscallno, long a0, long a1, long a2,
+ long a3, long a4, long a5)
+{
+ return syscall(syscallno, a0, a1, a2, a3, a4, a5);
+}
+
+#ifdef syscall
+#undef syscall
+#endif
+#define syscall you_must_use_traced_syscall
+
+static inline unsigned char *rr_page_replay_flag_addr(void) {
+ return (unsigned char *)RR_PAGE_IN_REPLAY_FLAG;
+}
+
+/**
+ * Declaring this to avoid issues with the declaration of f_owner_ex
+ * across distros. See https://github.com/rr-debugger/rr/issues/2693 */
+struct rr_f_owner_ex {
+ int type;
+ int pid;
+};
+
+#ifndef __ARCH_FLOCK64_PAD
+#define __ARCH_FLOCK64_PAD
+#endif
+struct rr_flock64 {
+ short l_type;
+ short l_whence;
+ __kernel_loff_t l_start;
+ __kernel_loff_t l_len;
+ __kernel_pid_t l_pid;
+ __ARCH_FLOCK64_PAD
+};
+
+// The alignment of this struct is incorrect, but as long as it's not
+// used inside other structures, defining it this way makes the code below
+// easier.
+typedef uint64_t kernel_sigset_t;
+
+/* Nonzero when syscall buffering is enabled. */
+static int buffer_enabled;
+/* Nonzero after process-global state has been initialized. */
+static int process_inited;
+
+RR_HIDDEN struct preload_globals globals;
+
+static struct preload_thread_locals* const thread_locals =
+ (struct preload_thread_locals*)PRELOAD_THREAD_LOCALS_ADDR;
+
+/**
+ * Return a pointer to the buffer header, which happens to occupy the
+ * initial bytes in the mapped region.
+ */
+static struct syscallbuf_hdr* buffer_hdr(void) {
+ return (struct syscallbuf_hdr*)thread_locals->buffer;
+}
+
+/**
+ * Return a pointer to the byte just after the last valid syscall record in
+ * the buffer.
+ */
+static uint8_t* buffer_last(void) {
+ return (uint8_t*)next_record(buffer_hdr());
+}
+
+/**
+ * Return a pointer to the byte just after the very end of the mapped
+ * region.
+ */
+static uint8_t* buffer_end(void) {
+ return thread_locals->buffer + thread_locals->buffer_size;
+}
+
+/**
+ * Same as libc memcpy(), but usable within syscallbuf transaction
+ * critical sections.
+ */
+static void local_memcpy(void* dest, const void* source, int n) {
+#if defined(__i386__) || defined(__x86_64__)
+ /* On modern x86-ish CPUs rep movsb is fast, usually able to move
+ * 64 bytes at a time.
+ */
+ __asm__ __volatile__("rep movsb\n\t"
+ : "+S"(source), "+D"(dest), "+c"(n)
+ :
+ : "cc", "memory");
+#elif defined(__aarch64__)
+ long c1;
+ long c2;
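+ /* Copy 16 bytes per iteration with ldp/stp, then use tbz on bits 3, 2, 1
+    and 0 of the remaining count to copy an 8-, 4-, 2- and 1-byte tail. */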
+ __asm__ __volatile__("subs %4, %2, 16\n\t"
+ "b.lt 2f\n\t"
+ "1:\n\t"
+ "mov %2, %4\n\t"
+ "ldp %3, %4, [%1], #16\n\t"
+ "stp %3, %4, [%0], #16\n\t"
+ "subs %4, %2, #16\n\t"
+ "b.ge 1b\n"
+ "2:\n\t"
+ "tbz %2, 3, 3f\n\t"
+ "ldr %3, [%1], #8\n\t"
+ "str %3, [%0], #8\n\t"
+ "3:\n\t"
+ "tbz %2, 2, 3f\n\t"
+ "ldr %w3, [%1], #4\n\t"
+ "str %w3, [%0], #4\n\t"
+ "3:\n\t"
+ "tbz %2, 1, 3f\n\t"
+ "ldrh %w3, [%1], #2\n\t"
+ "strh %w3, [%0], #2\n\t"
+ "3:\n\t"
+ "tbz %2, 0, 3f\n\t"
+ "ldrb %w3, [%1]\n\t"
+ "strb %w3, [%0]\n\t"
+ "3:\n\t"
+ : "+r"(dest), "+r"(source), "+r"(n), "=&r"(c1), "=&r"(c2)
+ :
+ : "cc", "memory");
+#else
+#error Unknown architecture
+#endif
+}
+
+/**
+ * Same as libc memset(), but usable within syscallbuf transaction
+ * critical sections.
+ */
+static void local_memset(void* dest, uint8_t c, int n) {
+#if defined(__i386__) || defined(__x86_64__)
+ /* On modern x86-ish CPUs rep stosb is fast, usually able to move
+ * 64 bytes at a time.
+ */
+ __asm__ __volatile__("rep stosb\n\t"
+ : "+a"(c), "+D"(dest), "+c"(n)
+ :
+ : "cc", "memory");
+#elif defined(__aarch64__)
+ double v1;
+ long n2;
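+ /* Broadcast the byte into a vector register and store 32 bytes per
+    iteration, then finish with a byte-at-a-time loop for the tail. */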
+ __asm__ __volatile__("subs %4, %2, 32\n\t"
+ "b.lt 2f\n\t"
+ "dup %3.16b, %w0\n"
+ "1:\n\t"
+ "mov %2, %4\n\t"
+ "stp %q3, %q3, [%1], #32\n\t"
+ "subs %4, %2, #32\n\t"
+ "b.ge 1b\n"
+ "2:\n\t"
+ "cbz %2, 4f\n"
+ "3:\n\t"
+ "strb %w0, [%1], #1\n\t"
+ "subs %2, %2, #1\n\t"
+ "b.ne 3b\n"
+ "4:\n\t"
+ : "+r"(c), "+r"(dest), "+r"(n), "=x"(v1), "=r"(n2)
+ :
+ : "cc", "memory");
+#else
+#error Unknown architecture
+#endif
+}
+
+/**
+ * Xorshift* RNG
+ */
+static int64_t local_random(void) {
+ uint64_t x = globals.random_seed;
+ x ^= x >> 12;
+ x ^= x << 25;
+ x ^= x >> 27;
+ globals.random_seed = x;
+ return x * 0x2545F4914F6CDD1D;
+}
+
+/* The following are wrappers for the syscalls invoked by this library
+ * itself. These syscalls will generate ptrace traps.
+ * stack_param_1 and stack_param_2 are pushed onto the stack just before
+ * the syscall, for SYS_rrcall_notify_syscall_hook_exit which takes stack
+ * parameters as well as register parameters.
+ * syscall_instruction is the actual syscall invocation instruction
+ * (a function which we call with the registers set up appropriately).
+ */
+
+extern RR_HIDDEN long _raw_syscall(int syscallno, long a0, long a1, long a2,
+ long a3, long a4, long a5,
+ void* syscall_instruction,
+ long stack_param_1, long stack_param_2);
+
+static int privileged_traced_syscall(int syscallno, long a0, long a1, long a2,
+ long a3, long a4, long a5) {
+ return _raw_syscall(syscallno, a0, a1, a2, a3, a4, a5,
+ RR_PAGE_SYSCALL_PRIVILEGED_TRACED, 0, 0);
+}
+#define privileged_traced_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ privileged_traced_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \
+ (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5)
+#define privileged_traced_syscall5(no, a0, a1, a2, a3, a4) \
+ privileged_traced_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define privileged_traced_syscall4(no, a0, a1, a2, a3) \
+ privileged_traced_syscall5(no, a0, a1, a2, a3, 0)
+#define privileged_traced_syscall3(no, a0, a1, a2) \
+ privileged_traced_syscall4(no, a0, a1, a2, 0)
+#define privileged_traced_syscall2(no, a0, a1) \
+ privileged_traced_syscall3(no, a0, a1, 0)
+#define privileged_traced_syscall1(no, a0) privileged_traced_syscall2(no, a0, 0)
+#define privileged_traced_syscall0(no) privileged_traced_syscall1(no, 0)
+
+/**
+ * Make a raw traced syscall using the params in |call|.
+ */
+static long traced_raw_syscall(struct syscall_info* call) {
+ if (call->no == SYS_rrcall_rdtsc) {
+ // Handle this specially because the rrcall writes to a memory out-param
+ // and we need to actually modify the outgoing AX/DX registers instead.
+ uint32_t tsc[2];
+ privileged_traced_syscall1(SYS_rrcall_rdtsc, tsc);
+ // Overwrite RDX (syscall arg 3) with our TSC value.
+ call->args[2] = tsc[1];
+ return tsc[0];
+ }
+ /* FIXME: pass |call| to avoid pushing these on the stack
+ * again. */
+ return _raw_syscall(call->no, call->args[0], call->args[1], call->args[2],
+ call->args[3], call->args[4], call->args[5],
+ RR_PAGE_SYSCALL_TRACED, 0, 0);
+}
+
+/**
+ * Make a raw traced syscall using the params in |call|, privileged.
+ */
+static long privileged_traced_raw_syscall(const struct syscall_info* call) {
+ /* FIXME: pass |call| to avoid pushing these on the stack
+ * again. */
+ return _raw_syscall(call->no, call->args[0], call->args[1], call->args[2],
+ call->args[3], call->args[4], call->args[5],
+ RR_PAGE_SYSCALL_PRIVILEGED_TRACED, 0, 0);
+}
+
+#if defined(SYS_fcntl64)
+#define RR_FCNTL_SYSCALL SYS_fcntl64
+#else
+#define RR_FCNTL_SYSCALL SYS_fcntl
+#endif
+
+static int privileged_traced_fcntl(int fd, int cmd, ...) {
+ va_list ap;
+ void* arg;
+
+ va_start(ap, cmd);
+ arg = va_arg(ap, void*);
+ va_end(ap);
+
+ return privileged_traced_syscall3(RR_FCNTL_SYSCALL, fd, cmd, arg);
+}
+
+static pid_t privileged_traced_getpid(void) {
+ return privileged_traced_syscall0(SYS_getpid);
+}
+
+static pid_t privileged_traced_gettid(void) {
+ return privileged_traced_syscall0(SYS_gettid);
+}
+
+static int privileged_traced_perf_event_open(struct perf_event_attr* attr,
+ pid_t pid, int cpu, int group_fd,
+ unsigned long flags) {
+ return privileged_traced_syscall5(SYS_perf_event_open, attr, pid, cpu,
+ group_fd, flags);
+}
+
+static __attribute__((noreturn)) void privileged_traced_raise(int sig) {
+ privileged_traced_syscall2(SYS_kill, privileged_traced_getpid(), sig);
+ __builtin_unreachable();
+}
+
+static ssize_t privileged_traced_write(int fd, const void* buf, size_t count) {
+ return privileged_traced_syscall3(SYS_write, fd, buf, count);
+}
+
+static void logmsg(const char* msg) {
+ privileged_traced_write(STDERR_FILENO, msg, rrstrlen(msg));
+}
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+#ifndef NDEBUG
+#define assert(cond) \
+ do { \
+ if (!(cond)) { \
+ logmsg(__FILE__ ":" STR(__LINE__) ": Assertion `" #cond "' failed.\n"); \
+ privileged_traced_raise(SIGABRT); \
+ } \
+ } while (0)
+#else
+#define assert(cond) \
+ do { \
+ __attribute__((unused)) size_t s = sizeof(cond); \
+ } while (0)
+#endif
+
+#define fatal(msg) \
+ do { \
+ logmsg(__FILE__ ":" STR(__LINE__) ": Fatal error: " msg "\n"); \
+ privileged_traced_raise(SIGABRT); \
+ } while (0)
+
+/**
+ * Unlike |traced_syscall()|, this helper is implicitly "raw" (returns
+ * the direct kernel return value), because the syscall hooks have to
+ * save that raw return value.
+ * This is only called from syscall wrappers that are doing a proper
+ * buffered syscall.
+ */
+static long untraced_syscall_full(int syscallno, long a0, long a1, long a2,
+ long a3, long a4, long a5,
+ void* syscall_instruction,
+ long stack_param_1, long stack_param_2) {
+ struct syscallbuf_record* rec = (struct syscallbuf_record*)buffer_last();
+ /* Ensure tools analyzing the replay can find the pending syscall result */
+ thread_locals->pending_untraced_syscall_result = &rec->ret;
+ long ret = _raw_syscall(syscallno, a0, a1, a2, a3, a4, a5,
+ syscall_instruction, stack_param_1, stack_param_2);
+/* During replay, return the result that's already in the buffer, instead
+ of what our "syscall" returned. */
+#if defined(__i386__) || defined(__x86_64__)
+ /* On entry, during recording %eax/%rax are whatever the kernel returned
+ * but during replay they may be invalid (e.g. 0). During replay, reload
+ * %eax/%rax from |rec->ret|. At the end of this sequence all registers
+ * will match between recording and replay. We clobber the temporary
+ * in_replay register, and the condition codes, to ensure this.
+ * This all assumes the compiler doesn't create unnecessary temporaries
+ * holding values like |ret|. Inspection of generated code shows it doesn't.
+ */
+ unsigned char tmp_in_replay = *rr_page_replay_flag_addr();
+ __asm__("test %1,%1\n\t"
+ "cmovne %2,%0\n\t"
+ "xor %1,%1\n\t"
+ : "+a"(ret), "+c"(tmp_in_replay)
+ : "m"(rec->ret)
+ : "cc");
+#elif defined(__aarch64__)
+ unsigned char *globals_in_replay = rr_page_replay_flag_addr();
+ long *rec_ret = &rec->ret;
+ __asm__("ldrb %w1, [%1]\n\t" // tmp_in_replay = *rr_page_replay_flag_addr()
+ "ldr %2, [%2]\n\t" // tmp = rec->ret
+ "cmp %w1, #0\n\t"
+ "csel %0, %0, %2, eq\n\t" // ret = tmp_in_replay ? tmp : ret
+ "subs %1, xzr, xzr\n\t" // clear tmp_in_replay and flag
+ "mov %2, xzr\n\t" // clear tmp
+ : "+r"(ret), "+r"(globals_in_replay), "+r"(rec_ret)
+ :
+ : "cc");
+#else
+#error Unknown architecture
+#endif
+ return ret;
+}
+#define untraced_syscall_base(no, a0, a1, a2, a3, a4, a5, inst) \
+ untraced_syscall_full(no, a0, a1, a2, a3, a4, a5, inst, 0, 0)
+#define untraced_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ untraced_syscall_base(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \
+ (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \
+ RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY)
+#define untraced_syscall5(no, a0, a1, a2, a3, a4) \
+ untraced_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define untraced_syscall4(no, a0, a1, a2, a3) \
+ untraced_syscall5(no, a0, a1, a2, a3, 0)
+#define untraced_syscall3(no, a0, a1, a2) untraced_syscall4(no, a0, a1, a2, 0)
+#define untraced_syscall2(no, a0, a1) untraced_syscall3(no, a0, a1, 0)
+#define untraced_syscall1(no, a0) untraced_syscall2(no, a0, 0)
+#define untraced_syscall0(no) untraced_syscall1(no, 0)
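+/* For example, untraced_syscall3(no, a0, a1, a2) expands (with uintptr_t
+ * casts on the arguments) to untraced_syscall_full(no, a0, a1, a2, 0, 0, 0,
+ * RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY, 0, 0): an untraced, buffered
+ * syscall issued through the recording-only untraced entry point in the
+ * rr page. (Illustrative expansion only.) */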
+
+#define untraced_replayed_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ untraced_syscall_base(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \
+ (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \
+ RR_PAGE_SYSCALL_UNTRACED)
+#define untraced_replayed_syscall5(no, a0, a1, a2, a3, a4) \
+ untraced_replayed_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define untraced_replayed_syscall4(no, a0, a1, a2, a3) \
+ untraced_replayed_syscall5(no, a0, a1, a2, a3, 0)
+#define untraced_replayed_syscall3(no, a0, a1, a2) \
+ untraced_replayed_syscall4(no, a0, a1, a2, 0)
+#define untraced_replayed_syscall2(no, a0, a1) \
+ untraced_replayed_syscall3(no, a0, a1, 0)
+#define untraced_replayed_syscall1(no, a0) untraced_replayed_syscall2(no, a0, 0)
+#define untraced_replayed_syscall0(no) untraced_replayed_syscall1(no, 0)
+
+static long __attribute__((unused))
+untraced_replay_assist_syscall_base(int syscallno, long a0, long a1, long a2,
+ long a3, long a4, long a5,
+ void* syscall_instruction) {
+ struct syscallbuf_record* rec = (struct syscallbuf_record*)buffer_last();
+ rec->replay_assist = 1;
+ return untraced_syscall_base(syscallno, a0, a1, a2, a3, a4, a5, syscall_instruction);
+}
+
+#define untraced_replay_assist_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ untraced_replay_assist_syscall_base( \
+ no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \
+ (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \
+ RR_PAGE_SYSCALL_UNTRACED_REPLAY_ASSIST)
+#define untraced_replay_assist_syscall5(no, a0, a1, a2, a3, a4) \
+ untraced_replay_assist_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define untraced_replay_assist_syscall4(no, a0, a1, a2, a3) \
+ untraced_replay_assist_syscall5(no, a0, a1, a2, a3, 0)
+#define untraced_replay_assist_syscall3(no, a0, a1, a2) \
+ untraced_replay_assist_syscall4(no, a0, a1, a2, 0)
+#define untraced_replay_assist_syscall2(no, a0, a1) \
+ untraced_replay_assist_syscall3(no, a0, a1, 0)
+#define untraced_replay_assist_syscall1(no, a0) \
+ untraced_replay_assist_syscall2(no, a0, 0)
+#define untraced_replay_assist_syscall0(no) \
+ untraced_replay_assist_syscall1(no, 0)
+
+// "Privileged" syscalls are not affected by the application's own seccomp
+// filters.
+#define privileged_untraced_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ untraced_syscall_base(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \
+ (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \
+ RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY)
+#define privileged_untraced_syscall5(no, a0, a1, a2, a3, a4) \
+ privileged_untraced_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define privileged_untraced_syscall4(no, a0, a1, a2, a3) \
+ privileged_untraced_syscall5(no, a0, a1, a2, a3, 0)
+#define privileged_untraced_syscall3(no, a0, a1, a2) \
+ privileged_untraced_syscall4(no, a0, a1, a2, 0)
+#define privileged_untraced_syscall2(no, a0, a1) \
+ privileged_untraced_syscall3(no, a0, a1, 0)
+#define privileged_untraced_syscall1(no, a0) \
+ privileged_untraced_syscall2(no, a0, 0)
+#define privileged_untraced_syscall0(no) privileged_untraced_syscall1(no, 0)
+
+// "Unrecorded" syscalls are performed during recording only and are "raw";
+// they are not associated with syscallbuf records.
+#define privileged_unrecorded_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ _raw_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \
+ (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \
+ RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY, 0, 0)
+#define privileged_unrecorded_syscall5(no, a0, a1, a2, a3, a4) \
+ privileged_unrecorded_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define privileged_unrecorded_syscall4(no, a0, a1, a2, a3) \
+ privileged_unrecorded_syscall5(no, a0, a1, a2, a3, 0)
+#define privileged_unrecorded_syscall3(no, a0, a1, a2) \
+ privileged_unrecorded_syscall4(no, a0, a1, a2, 0)
+#define privileged_unrecorded_syscall2(no, a0, a1) \
+ privileged_unrecorded_syscall3(no, a0, a1, 0)
+#define privileged_unrecorded_syscall1(no, a0) \
+ privileged_unrecorded_syscall2(no, a0, 0)
+#define privileged_unrecorded_syscall0(no) privileged_unrecorded_syscall1(no, 0)
+
+#define replay_only_syscall6(no, a0, a1, a2, a3, a4, a5) \
+ _raw_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, (uintptr_t)a3, \
+ (uintptr_t)a4, (uintptr_t)a5, \
+ RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_REPLAY_ONLY, 0, 0)
+#define replay_only_syscall5(no, a0, a1, a2, a3, a4) \
+ replay_only_syscall6(no, a0, a1, a2, a3, a4, 0)
+#define replay_only_syscall4(no, a0, a1, a2, a3) \
+ replay_only_syscall5(no, a0, a1, a2, a3, 0)
+#define replay_only_syscall3(no, a0, a1, a2) \
+ replay_only_syscall4(no, a0, a1, a2, 0)
+#define replay_only_syscall2(no, a0, a1) replay_only_syscall3(no, a0, a1, 0)
+#define replay_only_syscall1(no, a0) replay_only_syscall2(no, a0, 0)
+#define replay_only_syscall0(no) replay_only_syscall1(no, 0)
+
+static int privileged_untraced_close(int fd) {
+ return privileged_unrecorded_syscall1(SYS_close, fd);
+}
+
+static int privileged_untraced_fcntl(int fd, int cmd, ...) {
+ va_list ap;
+ void* arg;
+
+ va_start(ap, cmd);
+ arg = va_arg(ap, void*);
+ va_end(ap);
+
+ return privileged_unrecorded_syscall3(RR_FCNTL_SYSCALL, fd, cmd, arg);
+}
+
+/**
+ * Do what's necessary to set up buffers for the caller.
+ * |untraced_syscall_ip| lets rr know where our untraced syscalls will
+ * originate from. |addr| is the address of the control socket the
+ * child expects to connect to. |msg| is a pre-prepared IPC that can
+ * be used to share fds; |fdptr| is a pointer to the control-message
+ * data buffer where the fd number being shared will be stored.
+ * |args_vec| provides the tracer with preallocated space to make
+ * socketcall syscalls.
+ *
+ * Return a pointer to the syscallbuf (with an initialized header
+ * including the available size), if syscallbuf is enabled.
+ *
+ * This is a "magic" syscall implemented by rr.
+ */
+static void rrcall_init_buffers(struct rrcall_init_buffers_params* args) {
+ privileged_traced_syscall1(SYS_rrcall_init_buffers, args);
+}
+
+/**
+ * Return a counter that generates a signal targeted at this task
+ * every time the task is descheduled |nr_descheds| times.
+ */
+static int open_desched_event_counter(size_t nr_descheds, pid_t tid) {
+ struct perf_event_attr attr;
+ int tmp_fd, fd;
+ struct rr_f_owner_ex own;
+
+ local_memset(&attr, 0, sizeof(attr));
+ attr.size = sizeof(attr);
+ attr.type = PERF_TYPE_SOFTWARE;
+ attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
+ attr.disabled = 1;
+ attr.sample_period = nr_descheds;
+
+ tmp_fd = privileged_traced_perf_event_open(&attr, 0 /*self*/, -1 /*any cpu*/,
+ -1, 0);
+ if (0 > tmp_fd) {
+ fatal("Failed to perf_event_open");
+ }
+ fd = privileged_traced_fcntl(tmp_fd, F_DUPFD_CLOEXEC,
+ RR_DESCHED_EVENT_FLOOR_FD);
+ if (fd > 0) {
+ if (privileged_untraced_close(tmp_fd)) {
+ fatal("Failed to close tmp_fd");
+ }
+ } else {
+ // We may be unable to find an fd above the RR_DESCHED_EVENT_FLOOR_FD (e.g.
+ // because of a low ulimit). In that case, just use the tmp_fd we already
+ // have.
+ fd = tmp_fd;
+ }
+ if (privileged_untraced_fcntl(fd, F_SETFL, FASYNC)) {
+ fatal("Failed to fcntl(FASYNC) the desched counter");
+ }
+ own.type = F_OWNER_TID;
+ own.pid = tid;
+ if (privileged_untraced_fcntl(fd, F_SETOWN_EX, &own)) {
+ fatal("Failed to fcntl(SETOWN_EX) the desched counter to this");
+ }
+ if (privileged_untraced_fcntl(fd, F_SETSIG, globals.desched_sig)) {
+ fatal("Failed to fcntl(SETSIG) the desched counter");
+ }
+
+ return fd;
+}
+
+/**
+ * Initialize thread-local buffering state, if enabled and not already
+ * initialized.
+ */
+static void init_thread(void) {
+ struct rrcall_init_buffers_params args;
+
+ assert(process_inited);
+ if (thread_locals->thread_inited) {
+ return;
+ }
+ thread_locals->thread_inited = 1;
+
+ /* Do not do any syscall buffering in a DiversionSession! */
+ if (!buffer_enabled || globals.in_diversion) {
+ return;
+ }
+
+ /* NB: we want this setup emulated during replay. */
+ thread_locals->desched_counter_fd =
+ open_desched_event_counter(1, privileged_traced_gettid());
+
+ args.desched_counter_fd = thread_locals->desched_counter_fd;
+
+ /* Trap to rr: let the magic begin!
+ *
+ * If the desched signal is currently blocked, then the tracer
+ * will clear our TCB guard and we won't be able to buffer
+ * syscalls. But the tracee will set the guard when (or if)
+ * the signal is unblocked. */
+ rrcall_init_buffers(&args);
+
+ thread_locals->cloned_file_data_fd = args.cloned_file_data_fd;
+ /* rr initializes the buffer header. */
+ thread_locals->buffer = args.syscallbuf_ptr;
+ thread_locals->buffer_size = args.syscallbuf_size;
+ thread_locals->scratch_buf = args.scratch_buf;
+ thread_locals->usable_scratch_size = args.usable_scratch_size;
+}
+
+// We don't include libc headers, since they conflict with the Linux headers,
+// so we declare this prototype manually.
+extern const char* getenv(const char*);
+
+// getauxval is from glibc 2.16 (2012) - don't assume it exists.
+unsigned long getauxval(unsigned long type) __attribute__((weak));
+#ifndef AT_SYSINFO_EHDR
+#define AT_SYSINFO_EHDR 33
+#endif
+
+extern RR_HIDDEN long syscall_hook(struct syscall_info* call);
+
+/**
+ * Initialize process-global buffering state, if enabled.
+ * NOTE: constructors go into a special section by default so this won't
+ * be counted as syscall-buffering code!
+ */
+static void __attribute__((constructor)) init_process(void) {
+ struct rrcall_init_preload_params params;
+
+ extern char _syscallbuf_final_exit_instruction;
+ extern char _syscallbuf_code_start;
+ extern char _syscallbuf_code_end;
+ extern char do_breakpoint_fault_addr;
+
+#if defined(__i386__)
+ extern RR_HIDDEN void __morestack(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_3d_01_f0_ff_ff(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_90_90_90(void);
+ struct syscall_patch_hook syscall_patch_hooks[] = {
+ /* pthread_cond_broadcast has 'int 80' followed by
+ * cmp $-4095,%eax (in glibc-2.18-16.fc20.i686) */
+ { 0,
+ 5,
+ { 0x3d, 0x01, 0xf0, 0xff, 0xff },
+ (uintptr_t)_syscall_hook_trampoline_3d_01_f0_ff_ff },
+ /* Our vdso syscall patch has 'int 80' followed by nop; nop; nop */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 3,
+ { 0x90, 0x90, 0x90 },
+ (uintptr_t)_syscall_hook_trampoline_90_90_90 }
+ };
+ extern char _get_pc_thunks_start;
+ extern char _get_pc_thunks_end;
+#elif defined(__x86_64__)
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_01_f0_ff_ff(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_00_f0_ff_ff(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_3d_00_f0_ff_ff(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_8b_3c_24(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_89_45_f8(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c3(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_5a_5e_c3(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_89_c2_f7_da(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_90_90_90(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_ba_01_00_00_00(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_89_c1_31_d2(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_c3_nop(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_40_80_f6_81(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_49_89_ca(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c1(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_c1_e2_20(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_49_8b_44_24_28(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_f7(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_ff(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_b8_0e_00_00_00(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_b8_11_01_00_00(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_b8_ca_00_00_00(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_be_18_00_00_00(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_89_e5(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_89_fb(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_48_8d_b3_f0_08_00_00(void);
+ extern RR_HIDDEN void _syscall_hook_trampoline_nops(void);
+
+#define MOV_RDX_VARIANTS \
+ MOV_RDX_TO_REG(48, d0) \
+ MOV_RDX_TO_REG(48, d1) \
+ MOV_RDX_TO_REG(48, d2) \
+ MOV_RDX_TO_REG(48, d3) \
+ MOV_RDX_TO_REG(48, d4) \
+ MOV_RDX_TO_REG(48, d5) \
+ MOV_RDX_TO_REG(48, d6) \
+ MOV_RDX_TO_REG(48, d7) \
+ MOV_RDX_TO_REG(49, d0) \
+ MOV_RDX_TO_REG(49, d1) \
+ MOV_RDX_TO_REG(49, d2) \
+ MOV_RDX_TO_REG(49, d3) \
+ MOV_RDX_TO_REG(49, d4) \
+ MOV_RDX_TO_REG(49, d5) \
+ MOV_RDX_TO_REG(49, d6) \
+ MOV_RDX_TO_REG(49, d7)
+
+#define MOV_RDX_TO_REG(rex, op) \
+ extern RR_HIDDEN void _syscall_hook_trampoline_##rex##_89_##op(void);
+ MOV_RDX_VARIANTS
+
+ struct syscall_patch_hook syscall_patch_hooks[] = {
+ /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed
+ * by
+ * cmp $-4095,%rax (in glibc-2.18-16.fc20.x86_64) */
+ { 0,
+ 6,
+ { 0x48, 0x3d, 0x01, 0xf0, 0xff, 0xff },
+ (uintptr_t)_syscall_hook_trampoline_48_3d_01_f0_ff_ff },
+ /* Many glibc syscall wrappers (e.g. __libc_recv) have 'syscall'
+ * followed by
+ * cmp $-4096,%rax (in glibc-2.18-16.fc20.x86_64) */
+ { 0,
+ 6,
+ { 0x48, 0x3d, 0x00, 0xf0, 0xff, 0xff },
+ (uintptr_t)_syscall_hook_trampoline_48_3d_00_f0_ff_ff },
+ /* glibc-2.35-20.fc36.x86_64 start_thread has 'syscall'
+ * followed by 'cmp $-4096,%eax' */
+ { 0,
+ 5,
+ { 0x3d, 0x00, 0xf0, 0xff, 0xff },
+ (uintptr_t)_syscall_hook_trampoline_3d_00_f0_ff_ff },
+ /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed
+ * by
+ * mov (%rsp),%rdi (in glibc-2.18-16.fc20.x86_64) */
+ { 0,
+ 4,
+ { 0x48, 0x8b, 0x3c, 0x24 },
+ (uintptr_t)_syscall_hook_trampoline_48_8b_3c_24 },
+ /* Some syscall wrappers have 'syscall' followed
+ * by
+ * mov %rax,-8(%rbp) */
+ { 0,
+ 4,
+ { 0x48, 0x89, 0x45, 0xf8 },
+ (uintptr_t)_syscall_hook_trampoline_48_89_45_f8 },
+ /* Some syscall wrappers (e.g. read) have 'syscall' followed
+ * by
+ * mov %rax,%rbx */
+ { 0,
+ 3,
+ { 0x48, 0x89, 0xc3 },
+ (uintptr_t)_syscall_hook_trampoline_48_89_c3 },
+ /* Some RDTSC instructions are followed by 'mov %rax,%rcx'. */
+ { 0,
+ 3,
+ { 0x48, 0x89, 0xc1 },
+ (uintptr_t)_syscall_hook_trampoline_48_89_c1 },
+ /* __lll_unlock_wake has 'syscall' followed by
+ * pop %rdx; pop %rsi; ret */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 3,
+ { 0x5a, 0x5e, 0xc3 },
+ (uintptr_t)_syscall_hook_trampoline_5a_5e_c3 },
+ /* posix_fadvise64 has 'syscall' followed by
+ * mov %eax,%edx; neg %edx (in glibc-2.22-11.fc23.x86_64) */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 4,
+ { 0x89, 0xc2, 0xf7, 0xda },
+ (uintptr_t)_syscall_hook_trampoline_89_c2_f7_da },
+ /* Our VDSO vsyscall patches have 'syscall' followed by "nop; nop;
+ nop" */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 3,
+ { 0x90, 0x90, 0x90 },
+ (uintptr_t)_syscall_hook_trampoline_90_90_90 },
+ /* glibc-2.22-17.fc23.x86_64 has 'syscall' followed by 'mov $1,%rdx'
+ * in
+ * pthread_barrier_wait.
+ */
+ { 0,
+ 5,
+ { 0xba, 0x01, 0x00, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_ba_01_00_00_00 },
+ /* pthread_sigmask has 'syscall' followed by 'mov %eax,%ecx; xor
+ %edx,%edx' */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 4,
+ { 0x89, 0xc1, 0x31, 0xd2 },
+ (uintptr_t)_syscall_hook_trampoline_89_c1_31_d2 },
+ /* getpid has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1)' */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 9,
+ { 0xc3, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_c3_nop },
+ /* liblsan internal_close has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1)' */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 6,
+ { 0xc3, 0x0f, 0x1f, 0x44, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_c3_nop },
+ /* glibc-2.29-15.fc30.x86_64 getpid has 'syscall' followed by 'retq; nopl 0x0(%rax)' */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 5,
+ { 0xc3, 0x0f, 0x1f, 0x40, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_c3_nop },
+ /* liblsan internal_open has 'syscall' followed by 'retq; nopl (%rax)' */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 4,
+ { 0xc3, 0x0f, 0x1f, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_c3_nop },
+ /* liblsan internal_dup2 has 'syscall' followed by 'retq; xchg %ax,%ax' */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 3,
+ { 0xc3, 0x66, 0x90 },
+ (uintptr_t)_syscall_hook_trampoline_c3_nop },
+ /* Go runtime has 'syscall' followed by 'retq; int3; int3 */
+ { PATCH_IS_MULTIPLE_INSTRUCTIONS,
+ 3,
+ { 0xc3, 0xcc, 0xcc },
+ (uintptr_t)_syscall_hook_trampoline_c3_nop },
+ /* glibc-2.31 on Ubuntu 20.04 has 'xor $0x81, %sil' followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 4,
+ { 0x40, 0x80, 0xf6, 0x81 },
+ (uintptr_t)_syscall_hook_trampoline_40_80_f6_81 },
+ /* DynamoRIO has 'mov r10, rcx' followed by 'syscall' */
+ {
+ PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 3,
+ { 0x49, 0x89, 0xca },
+ (uintptr_t)_syscall_hook_trampoline_49_89_ca },
+ /* Some applications have RDTSC followed by 'mov %rdx,any-reg' */
+#undef MOV_RDX_TO_REG
+#define MOV_RDX_TO_REG(rex, op) \
+ { \
+ 0, \
+ 3, \
+ { 0x##rex, 0x89, 0x##op }, \
+ (uintptr_t)_syscall_hook_trampoline_##rex##_89_##op },
+ MOV_RDX_VARIANTS
+ /* Some application has RDTSC followed by 'shl $32,%rdx' */
+ {
+ 0,
+ 4,
+ { 0x48, 0xc1, 0xe2, 0x20 },
+ (uintptr_t)_syscall_hook_trampoline_48_c1_e2_20 },
+ /* glibc-2.35-20.fc36.x86_64 __pthread_create_2_1 has
+ 'syscall' followed by 'mov 0x28(%r12),%rax' */
+ {
+ 0,
+ 5,
+ { 0x49, 0x8b, 0x44, 0x24, 0x28 },
+ (uintptr_t)_syscall_hook_trampoline_49_8b_44_24_28 },
+ /* glibc-2.35-20.fc36.x86_64 thread_start has
+ 'lea 0x8f0(%rbx),%rsi' followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 7,
+ { 0x48, 0x8d, 0xb3, 0xf0, 0x08, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_48_8d_b3_f0_08_00_00 },
+ /* Some application has 'mov %r14,%rdi' followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 3,
+ { 0x4c, 0x89, 0xf7 },
+ (uintptr_t)_syscall_hook_trampoline_4c_89_f7 },
+ /* Some application has 'mov %r15,%rdi' followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 3,
+ { 0x4c, 0x89, 0xff },
+ (uintptr_t)_syscall_hook_trampoline_4c_89_ff },
+ /* Some application has 'mov $0xffffffffffffffff,%r9' followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 7,
+ { 0x49, 0xc7, 0xc1, 0xff, 0xff, 0xff, 0xff },
+ (uintptr_t)_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff },
+ /* glibc-2.35-20.fc36.x86_64 __pthread_create_2_1 has
+ 'mov $0xe,%eax' (sigprocmask) followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 5,
+ { 0xb8, 0x0e, 0x00, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_b8_0e_00_00_00 },
+ /* glibc-2.35-20.fc36.x86_64 thread_start has
+ 'mov $0x111,%eax' (set_robust_list) followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 5,
+ { 0xb8, 0x11, 0x01, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_b8_11_01_00_00 },
+ /* Some application has 'mov $0xca,%eax' (futex) followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 5,
+ { 0xb8, 0xca, 0x00, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_b8_ca_00_00_00 },
+ /* Some application has 'mov $0x18,%esi' (sizeof(robust_list)) followed by 'syscall' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 5,
+ { 0xbe, 0x18, 0x00, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_be_18_00_00_00 },
+ /* Some application has 'mov %rsp,%rbp' followed by 'rdtsc' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 3,
+ { 0x48, 0x89, 0xe5 },
+ (uintptr_t)_syscall_hook_trampoline_48_89_e5 },
+ /* Some application has 'mov %rdi,%rbx' followed by 'rdtsc' */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST,
+ 3,
+ { 0x48, 0x89, 0xfb },
+ (uintptr_t)_syscall_hook_trampoline_48_89_fb },
+ /* Support explicit 5 byte nop (`nopl 0(%ax, %ax, 1)`) before 'rdtsc' or syscall (may ignore interfering branches) */
+ { PATCH_SYSCALL_INSTRUCTION_IS_LAST |
+ PATCH_IS_NOP_INSTRUCTIONS,
+ 5,
+ { 0x0f, 0x1f, 0x44, 0x00, 0x00 },
+ (uintptr_t)_syscall_hook_trampoline_nops }
+ };
+#elif defined(__aarch64__)
+ extern RR_HIDDEN void _syscall_hook_trampoline_raw(void);
+ struct syscall_patch_hook syscall_patch_hooks[] = {
+ { 0, 4, { 0x01, 0, 0, 0xd4 }, (uintptr_t)_syscall_hook_trampoline_raw }
+ };
+#endif
+
+ assert(sizeof(struct preload_thread_locals) <= PRELOAD_THREAD_LOCALS_SIZE);
+
+ if (process_inited) {
+ return;
+ }
+
+ // Check if the rr page is mapped. We avoid a syscall if it looks like
+ // rr has placed librrpage as the vdso.
+ // Use 1 as the size since the Linux implementation of msync rounds it up to
+ // the page size.
+ if ((!getauxval || (getauxval(AT_SYSINFO_EHDR) != RR_PAGE_ADDR - 3*PRELOAD_LIBRARY_PAGE_SIZE)) &&
+ msync((void*)RR_PAGE_ADDR, 1, MS_ASYNC) != 0) {
+ // The RR page is not mapped - this process is not rr traced.
+ buffer_enabled = 0;
+ return;
+ }
+
+ buffer_enabled = !!getenv(SYSCALLBUF_ENABLED_ENV_VAR);
+
+ if (!buffer_enabled) {
+ // Don't risk executing the rrcall syscall below. If there is an external
+ // seccomp filter that doesn't like unknown syscalls, we risk breaking the
+ // recording.
+ return;
+ }
+
+ params.syscallbuf_enabled = buffer_enabled;
+
+#ifdef __i386__
+ params.get_pc_thunks_start = &_get_pc_thunks_start;
+ params.get_pc_thunks_end = &_get_pc_thunks_end;
+#else
+ params.get_pc_thunks_start = NULL;
+ params.get_pc_thunks_end = NULL;
+#endif
+ params.syscallbuf_code_start = &_syscallbuf_code_start;
+ params.syscallbuf_code_end = &_syscallbuf_code_end;
+ params.syscallbuf_final_exit_instruction =
+ &_syscallbuf_final_exit_instruction;
+ params.syscall_patch_hook_count =
+ sizeof(syscall_patch_hooks) / sizeof(syscall_patch_hooks[0]);
+ params.syscall_patch_hooks = syscall_patch_hooks;
+ params.globals = &globals;
+
+ globals.fdt_uniform = 1;
+ params.breakpoint_instr_addr = &do_breakpoint_fault_addr;
+ params.breakpoint_mode_sentinel = -1;
+ params.syscallbuf_syscall_hook = (void*)syscall_hook;
+
+ // We must not make any call into the syscall buffer in the init function
+ // in case a signal is delivered to us during initialization.
+ // This means that we must not call `_raw_syscall`.
+ int err = _traced_init_syscall(SYS_rrcall_init_preload, (long)¶ms,
+ 0, 0, 0, 0, 0);
+ if (err != 0) {
+ // Check if the rr tracer is present by looking for the thread local page
+ // (mapped just after the rr page). If it is not present, we were
+ // preloaded without rr listening, which is allowed (e.g. after detach).
+ // Otherwise give an intelligent error message indicating that our connection
+ // to rr is broken.
+ // Use 1 as the size since the Linux implementation of msync rounds it up to
+ // the page size.
+ if (msync((void*)RR_PAGE_ADDR + PRELOAD_LIBRARY_PAGE_SIZE, 1, MS_ASYNC) == 0) {
+ fatal("Failed to communicate with rr tracer.\n"
+ "Perhaps a restrictive seccomp filter is in effect (e.g. docker?)?\n"
+ "Adjust the seccomp filter to allow syscalls above 1000, disable it,\n"
+ "or try using `rr record -n` (slow).");
+ } else {
+ buffer_enabled = 0;
+ return;
+ }
+ }
+
+ process_inited = 1;
+}
+
+/**
+ * syscall hooks start here.
+ *
+ * !!! NBB !!!: from here on, all code that executes within the
+ * critical sections of transactions *MUST KEEP $ip IN THE SYSCALLBUF
+ * CODE*. That means no calls into libc, even for innocent-looking
+ * functions like |memcpy()|.
+ *
+ * How syscall hooks operate:
+ *
+ * 1. The rr tracer monkey-patches __kernel_vsyscall() to jump to
+ * _syscall_hook_trampoline() above.
+ * 2. When a call is made to __kernel_vsyscall(), it jumps to
+ * _syscall_hook_trampoline(), where the syscall params are
+ * packaged up into a call to syscall_hook() below.
+ * 3. syscall_hook() dispatches to a syscall processor function.
+ * 4. The syscall processor prepares a new record in the buffer. See
+ * struct syscallbuf_record for record fields. If the buffer runs
+ * out of space, the processor function aborts and makes a traced
+ * syscall, trapping to rr. rr then flushes the buffer. Records
+ * are directly saved to trace, and a buffer-flush event is
+ * recorded without execution info because it's a synthetic event.
+ * 5. Then, the syscall processor redirects all potential output
+ * for the syscall to the record (and corrects the overall size of
+ * the record while it does so).
+ * 6. The syscall is invoked through an asm helper that does *not*
+ *    ptrace-trap to rr.
+ * 7. The syscall output, written to the buffer, is copied to the
+ *    original pointers provided by the user. Note that this step
+ *    saves us from injecting the data on replay: we only need to
+ *    push the data into the buffer, and the wrapper code copies it
+ *    to the user address for us.
+ * 8. The return value and overall size are saved to the record.
+ */
+
+/**
+ * Call this and save the result at the start of every system call we
+ * want to buffer. The result is a pointer into the record space. You
+ * can add to this pointer to allocate space in the trace record.
+ * However, do not read or write through this pointer until
+ * start_commit_syscall() has been called. And you *must* call
+ * start_commit_syscall() after this is called, otherwise buffering
+ * state will be inconsistent between syscalls.
+ *
+ * See |sys_clock_gettime()| for a simple example of how this helper
+ * should be used to buffer outparam data.
+ */
+static void* prep_syscall(void) {
+ /* We don't need to worry about a race between testing
+ * |locked| and setting it here. rr recording is responsible
+ * for ensuring signals are not delivered during
+ * syscall_buffer prologue and epilogue code.
+ *
+ * XXX except for synchronous signals generated in the syscall
+ * buffer code, while reading/writing user pointers */
+ buffer_hdr()->locked |= SYSCALLBUF_LOCKED_TRACEE;
+ /* "Allocate" space for a new syscall record, not including
+ * syscall outparam data. */
+ return buffer_last() + sizeof(struct syscallbuf_record);
+}
+
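+/**
+ * Map an fd to its syscall-buffering class. All fds at or beyond
+ * SYSCALLBUF_FDS_DISABLED_SIZE - 1 share the last slot of the class table.
+ */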
+static enum syscallbuf_fd_classes fd_class(int fd) {
+ if (fd < 0) {
+ return FD_CLASS_INVALID;
+ }
+ if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE - 1) {
+ fd = SYSCALLBUF_FDS_DISABLED_SIZE - 1;
+ }
+ return globals.syscallbuf_fd_class[fd];
+}
+
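+/**
+ * An fd may be handled by the syscallbuf only if it is untraced or invalid.
+ * Negative fds pass through so the kernel can report an error or give them
+ * special treatment (e.g. AT_FDCWD).
+ */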
+static int is_bufferable_fd(int fd) {
+ switch (fd_class(fd)) {
+ case FD_CLASS_INVALID:
+ case FD_CLASS_UNTRACED:
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+/**
+ * Like prep_syscall, but preps a syscall to operate on a particular fd. If
+ * syscallbuf is disabled for this fd, returns NULL (in which case
+ * start_commit_syscall will abort cleanly and a traced syscall will be used).
+ * Allow negative fds to pass through; they'll either trigger an error or
+ * receive special treatment by the kernel (e.g. AT_FDCWD).
+ */
+static void* prep_syscall_for_fd(int fd) {
+ if (!is_bufferable_fd(fd)) {
+ return NULL;
+ }
+ return prep_syscall();
+}
+
+static void arm_desched_event(void) {
+ /* Don't trace the ioctl; doing so would trigger a flushing
+ * ptrace trap, which is exactly what this code is trying to
+ * avoid! :) Although we don't allocate extra space for these
+ * ioctl's, we do record that we called them; the replayer
+ * knows how to skip over them. */
+ if ((int)privileged_unrecorded_syscall3(SYS_ioctl,
+ thread_locals->desched_counter_fd,
+ PERF_EVENT_IOC_ENABLE, 0)) {
+ fatal("Failed to ENABLE counter");
+ }
+}
+
+static void disarm_desched_event(void) {
+ /* See above. */
+ if ((int)privileged_unrecorded_syscall3(SYS_ioctl,
+ thread_locals->desched_counter_fd,
+ PERF_EVENT_IOC_DISABLE, 0)) {
+ fatal("Failed to DISABLE counter");
+ }
+}
+
+/**
+ * Return 1 if it's ok to proceed with buffering this system call.
+ * Return 0 if we should trace the system call.
+ * This must be checked before proceeding with the buffered system call.
+ */
+/* (Negative numbers so as to not be valid syscall numbers, in case
+ * the |int| arguments below are passed in the wrong order.) */
+enum { MAY_BLOCK = -1, WONT_BLOCK = -2 };
+
+static int fd_write_blocks(int fd) {
+ if (!globals.fdt_uniform) {
+ // If we're not uniform, it is possible for this fd to be untraced in one
+ // of the other tasks that share this fd table. Always assume it could block.
+ return MAY_BLOCK;
+ }
+ switch (fd_class(fd)) {
+ case FD_CLASS_UNTRACED:
+ case FD_CLASS_TRACED:
+ return MAY_BLOCK;
+ case FD_CLASS_INVALID:
+ case FD_CLASS_PROC_MEM:
+ return WONT_BLOCK;
+ }
+ fatal("Unknown or corrupted fd class");
+}
+
+static int start_commit_buffered_syscall(int syscallno, void* record_end,
+ int blockness) {
+ void* record_start;
+ void* stored_end;
+ struct syscallbuf_record* rec;
+
+ if (!thread_locals->buffer) {
+ return 0;
+ }
+ record_start = buffer_last();
+ stored_end = record_start + stored_record_size(record_end - record_start);
+ rec = record_start;
+
+ if (stored_end < record_start + sizeof(struct syscallbuf_record)) {
+ /* Either a catastrophic buffer overflow or
+ * we failed to lock the buffer. Just bail out. */
+ return 0;
+ }
+ if (stored_end > (void*)buffer_end() - sizeof(struct syscallbuf_record)) {
+ /* Buffer overflow.
+ * Unlock the buffer and then execute the system call
+ * with a trap to rr. Note that we reserve enough
+ * space in the buffer for the next prep_syscall(). */
+ buffer_hdr()->locked &= ~SYSCALLBUF_LOCKED_TRACEE;
+ return 0;
+ }
+ /* Store this breadcrumb so that the tracer can find out what
+ * syscall we're executing if our registers are in a weird
+ * state. If we end up aborting this syscall, no worry, this
+ * will just be overwritten later.
+ *
+ * NBB: this *MUST* be set before the desched event is
+ * armed. */
+ rec->syscallno = syscallno;
+ rec->desched = MAY_BLOCK == blockness;
+ rec->size = record_end - record_start;
+
+ if (rec->desched) {
+ pid_t pid = 0;
+ pid_t tid = 0;
+ uid_t uid = 0;
+ if (impose_spurious_desched) {
+ pid = privileged_unrecorded_syscall0(SYS_getpid);
+ tid = privileged_unrecorded_syscall0(SYS_gettid);
+ uid = privileged_unrecorded_syscall0(SYS_getuid);
+ }
+
+ /* NB: the ordering of the next two statements is
+ * important.
+ *
+ * We set this flag to notify rr that it should pay
+ * attention to desched signals pending for this task.
+ * We have to set it *before* we arm the notification
+ * because we can't set the flag atomically with
+ * arming the event (too bad there's no ioctl() for
+ * querying the event enabled-ness state). That's
+ * important because if the notification is armed,
+ * then rr must be confident that when it disarms the
+ * event, the tracee is at an execution point that
+ * *must not* need the desched event.
+ *
+ * If we were to set the flag non-atomically after the
+ * event was armed, then if a desched signal was
+ * delivered right at the instruction that set the
+ * flag, rr wouldn't know that it needed to advance
+ * the tracee to the untraced syscall entry point.
+ * (And if rr didn't do /that/, then the syscall might
+ * block without rr knowing it, and the recording
+ * session would deadlock.) */
+ buffer_hdr()->desched_signal_may_be_relevant = 1;
+ arm_desched_event();
+ if (impose_spurious_desched) {
+ siginfo_t si;
+ si.si_code = POLL_IN;
+ si.si_fd = thread_locals->desched_counter_fd;
+ si.si_pid = pid;
+ si.si_uid = uid;
+ privileged_unrecorded_syscall4(SYS_rt_tgsigqueueinfo, pid, tid,
+ globals.desched_sig,
+ &si);
+ }
+ }
+ return 1;
+}
+
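+/**
+ * Execute a conditional branch to the very next instruction. rr counts
+ * retired conditional branches as "ticks", so this guarantees the tick
+ * counter advances by at least one at this point.
+ */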
+static void force_tick(void) {
+#if defined(__i386__) || defined(__x86_64__)
+ __asm__ __volatile__("je 1f\n\t"
+ "1:");
+#elif defined(__aarch64__)
+ __asm__ __volatile__("cbz xzr, 1f\n"
+ "1:");
+#else
+#error Unknown architecture
+#endif
+}
+
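+/**
+ * If |value| matches the breakpoint value rr stored at RR_PAGE_BREAKPOINT_VALUE,
+ * the conditional move/select below picks the bad pointer and the load at
+ * do_breakpoint_fault_addr faults; rr recognizes that fault specially.
+ * Otherwise the load goes through the safe pointer and is harmless.
+ */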
+static void __attribute__((noinline)) do_breakpoint(size_t value)
+{
+ char *unsafe_value = ((char*)-1)-0xf;
+ char **safe_value = &unsafe_value;
+ uint64_t *breakpoint_value_addr = (uint64_t*)RR_PAGE_BREAKPOINT_VALUE;
+#if defined(__i386__) || defined(__x86_64__)
+ __asm__ __volatile__(
+ "mov (%1),%1\n\t"
+ "cmp %0,%1\n\t"
+ "cmove %3,%2\n\t"
+ // This will segfault if `value` matches
+ // the `breakpoint_value` set by rr. We
+ // detect this segfault and treat it
+ // specially.
+ "do_breakpoint_fault_addr:\n\t"
+ ".global do_breakpoint_fault_addr\n\t"
+ "mov (%2),%2\n\t"
+ "xor %1,%1\n\t"
+ "xor %2,%2\n\t"
+ "xor %3,%3\n\t"
+ : "+a"(value), "+D"(breakpoint_value_addr),
+ "+S"(safe_value), "+c"(unsafe_value)
+ :
+ : "cc", "memory");
+#elif defined(__aarch64__)
+ __asm__ __volatile__("ldr %1, [%1]\n\t"
+ "cmp %0, %1\n\t"
+ "csel %0, %3, %2, eq\n\t"
+ "do_breakpoint_fault_addr:\n\t"
+ ".global do_breakpoint_fault_addr\n\t"
+ "ldr %0, [%0]\n\t"
+ "subs %0, xzr, xzr\n\t"
+ "mov %1, xzr\n\t"
+ : "+r"(value), "+r"(breakpoint_value_addr),
+ "+r"(safe_value), "+r"(unsafe_value)
+ :
+ : "cc", "memory");
+#else
+#error Unknown architecture
+#endif
+}
+
+/**
+ * Commit the record for a buffered system call. record_end can be
+ * adjusted downward from what was passed to
+ * start_commit_buffered_syscall, if not all of the initially
+ * requested space is needed. The result of this function should be
+ * returned directly by the kernel syscall hook.
+ */
+static long commit_raw_syscall(int syscallno, void* record_end, long ret) {
+ void* record_start = buffer_last();
+ struct syscallbuf_record* rec = record_start;
+ struct syscallbuf_hdr* hdr = buffer_hdr();
+ int call_breakpoint = 0;
+
+ assert(record_end >= record_start);
+ rec->size = record_end - record_start;
+
+ assert(hdr->locked);
+
+ /* NB: the ordering of this statement with the
+ * |disarm_desched_event()| call below is important.
+ *
+ * We clear this flag to notify rr that the may-block syscall
+ * has finished, so there's no danger of blocking anymore.
+ * (And thus the desched signal is no longer relevant.) We
+ * have to clear this *before* disarming the event, because if
+ * rr sees the flag set, it has to PTRACE_SYSCALL this task to
+ * ensure it reaches an execution point where the desched
+ * signal is no longer relevant. We have to use the ioctl()
+ * that disarms the event as a safe "backstop" that can be hit
+ * by the PTRACE_SYSCALL.
+ *
+ * If we were to clear the flag *after* disarming the event,
+ * and the signal arrived at the instruction that cleared the
+ * flag, and rr issued the PTRACE_SYSCALL, then this tracee
+ * could fly off to any unknown execution point, including an
+ * iloop. So the recording session could livelock. */
+ hdr->desched_signal_may_be_relevant = 0;
+
+ if (rec->syscallno != syscallno) {
+ fatal("Record syscall number mismatch");
+ }
+
+ if (hdr->abort_commit) {
+ /* We were descheduled in the middle of a may-block
+ * syscall, and it was recorded as a normal entry/exit
+ * pair. So don't record the syscall in the buffer or
+ * replay will go haywire. */
+ hdr->abort_commit = 0;
+ hdr->failed_during_preparation = 0;
+ /* Clear the return value that rr puts there during replay */
+ rec->ret = 0;
+ } else {
+ rec->ret = ret;
+ // Finish 'rec' first before updating num_rec_bytes, since
+ // rr might read the record anytime after this update.
+ hdr->num_rec_bytes += stored_record_size(rec->size);
+ call_breakpoint = 1;
+ }
+
+ if (rec->desched) {
+ disarm_desched_event();
+ }
+ /* NBB: for may-block syscalls that are descheduled, the
+ * tracer uses the previous ioctl() as a stable point to reset
+ * the record counter. Therefore nothing from here on in the
+ * current txn must touch the record counter (at least, must
+ * not assume it's unchanged). */
+
+ buffer_hdr()->locked &= ~SYSCALLBUF_LOCKED_TRACEE;
+
+ if (call_breakpoint) {
+ /* Call the breakpoint function corresponding to the record we just
+ * committed. This function just returns, but during replay it gives rr
+ * a chance to set a breakpoint for when a specific syscallbuf record
+ * has been processed.
+ */
+ do_breakpoint(hdr->num_rec_bytes/8);
+    /* Force a tick now.
+     * During replay, if an async event (SIGKILL) happens between committing the syscall
+     * above and this forced tick, we can detect that because the number of ticks
+     * recorded for the SIGKILL will be less than or equal to the number of ticks reported
+     * when the replay hits do_breakpoint.
+     */
+ force_tick();
+ }
+
+ return ret;
+}
+
+/**
+ * |ret_size| is the result of a syscall indicating how much data was returned
+ * in scratch buffer |buf2|; this function copies that data to |buf| and returns
+ * a pointer to the end of it. If there is no scratch buffer (|buf2| is NULL)
+ * just returns |ptr|.
+ */
+static void* copy_output_buffer(long ret_size, void* ptr, void* buf,
+ void* buf2) {
+ if (!buf2) {
+ return ptr;
+ }
+ if (ret_size <= 0 || buffer_hdr()->failed_during_preparation) {
+ return buf2;
+ }
+ local_memcpy(buf, buf2, ret_size);
+ return buf2 + ret_size;
+}
+
+/**
+ * Copy an input parameter to the syscallbuf where the kernel needs to
+ * read and write it. During replay, we do a no-op self-copy in the buffer
+ * so that the buffered data is not lost.
+ * This code is written in assembler to ensure that the registers that receive
+ * values differing between record and replay (%0, rsi/esi, and flags)
+ * are reset to values that are the same between record and replay immediately
+ * afterward. This guards against diverging register values leaking into
+ * later code.
+ * Use local_memcpy or plain assignment instead if the kernel is not going to
+ * overwrite the values.
+ */
+static void memcpy_input_parameter(void* buf, void* src, int size) {
+#if defined(__i386__) || defined(__x86_64__)
+ unsigned char tmp_in_replay = *rr_page_replay_flag_addr();
+ __asm__ __volatile__("test %0,%0\n\t"
+ "cmovne %1,%2\n\t"
+ "rep movsb\n\t"
+ "xor %0,%0\n\t"
+ "xor %2,%2\n\t"
+ : "+a"(tmp_in_replay), "+D"(buf), "+S"(src), "+c"(size)
+ :
+ : "cc", "memory");
+#elif defined(__aarch64__)
+ long c1;
+ long c2;
+ unsigned char *globals_in_replay = rr_page_replay_flag_addr();
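+  /* Copy 16 bytes per iteration with ldp/stp, then handle any remaining
+   * 8/4/2/1-byte tail; finally scrub the temporaries and flags so they are
+   * identical between record and replay. */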
+ __asm__ __volatile__("ldrb %w3, [%5]\n\t"
+ "cmp %3, #0\n\t" // eq -> record
+ "csel %1, %1, %0, eq\n\t"
+ "subs %4, %2, 16\n\t"
+ "b.lt 2f\n\t"
+ "1:\n\t"
+ "mov %2, %4\n\t"
+ "ldp %3, %4, [%1], #16\n\t"
+ "stp %3, %4, [%0], #16\n\t"
+ "subs %4, %2, #16\n\t"
+ "b.ge 1b\n"
+ "2:\n\t"
+ "tbz %2, 3, 3f\n\t"
+ "ldr %3, [%1], #8\n\t"
+ "str %3, [%0], #8\n\t"
+ "3:\n\t"
+ "tbz %2, 2, 3f\n\t"
+ "ldr %w3, [%1], #4\n\t"
+ "str %w3, [%0], #4\n\t"
+ "3:\n\t"
+ "tbz %2, 1, 3f\n\t"
+ "ldrh %w3, [%1], #2\n\t"
+ "strh %w3, [%0], #2\n\t"
+ "3:\n\t"
+ "tbz %2, 0, 3f\n\t"
+ "ldrb %w3, [%1]\n\t"
+ "strb %w3, [%0]\n\t"
+ "3:\n\t"
+ "subs %3, xzr, xzr\n\t"
+ "mov %4, xzr\n\t"
+ "mov %1, xzr\n\t"
+ : "+r"(buf), "+r"(src),
+ "+r"(size), "=&r"(c1), "=&r"(c2), "+r"(globals_in_replay)
+ :
+ : "cc", "memory");
+#else
+#error Unknown architecture
+#endif
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+/**
+ * Perform an RDTSC, writing the output to 'buf', but only if we're in recording mode.
+ * Otherwise 'buf' is unchanged.
+ */
+static void rdtsc_recording_only(uint32_t buf[2]) {
+ unsigned char tmp_in_replay = *rr_page_replay_flag_addr();
+ __asm__ __volatile__("test %%eax,%%eax\n\t"
+ "jne 1f\n\t"
+ "rdtsc\n\t"
+ "mov %%eax,(%1)\n\t"
+ "mov %%edx,4(%1)\n\t"
+ "1:\n\t"
+ "xor %%eax,%%eax\n\t"
+ "xor %%edx,%%edx\n\t"
+ : "+a"(tmp_in_replay)
+ : "S"(buf)
+ : "cc", "memory", "rdx");
+}
+#endif
+
+/**
+ * During recording, we copy *real to *buf.
+ * During replay, we copy *buf to *real.
+ * Behaves like memcpy_input_parameter in terms of hiding differences between
+ * recording and replay.
+ */
+static void copy_futex_int(uint32_t* buf, uint32_t* real) {
+#if defined(__i386__) || defined(__x86_64__)
+ uint32_t tmp_in_replay = *rr_page_replay_flag_addr();
+ __asm__ __volatile__("test %0,%0\n\t"
+ "mov %2,%0\n\t"
+ "cmovne %1,%0\n\t"
+ "mov %0,%1\n\t"
+ "mov %0,%2\n\t"
+ /* This instruction is just to clear flags */
+ "xor %0,%0\n\t"
+ : "+a"(tmp_in_replay)
+ : "m"(*buf), "m"(*real)
+ : "cc", "memory");
+#elif defined(__aarch64__)
+ unsigned char *globals_in_replay = rr_page_replay_flag_addr();
+ __asm__ __volatile__("ldrb %w2, [%2]\n\t"
+ "cmp %w2, #0\n\t" // eq -> record
+ "csel %2, %1, %0, eq\n\t"
+ "ldr %w2, [%2]\n\t"
+ "csel %0, %0, %1, eq\n\t"
+ "str %w2, [%0]\n\t"
+ "subs %0, xzr, xzr\n\t"
+ "mov %2, xzr\n\t"
+ : "+r"(buf), "+r"(real), "+r"(globals_in_replay)
+ :
+ : "cc", "memory");
+#else
+#error Unknown architecture
+#endif
+}
+
+static int trace_chaos_mode_syscalls = 0;
+static int buffer_chaos_mode_syscalls = 0;
+
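+/**
+ * In chaos mode, randomly interleave runs of traced and buffered syscalls.
+ * Forcing some syscalls to be traced presumably gives rr more opportunities
+ * to make randomized scheduling decisions. Returns 1 if the current syscall
+ * should be traced.
+ */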
+static int force_traced_syscall_for_chaos_mode(void) {
+ if (!globals.in_chaos) {
+ return 0;
+ }
+ while (1) {
+ if (buffer_chaos_mode_syscalls) {
+ --buffer_chaos_mode_syscalls;
+ return 0;
+ }
+ if (trace_chaos_mode_syscalls) {
+ --trace_chaos_mode_syscalls;
+ return 1;
+ }
+ /* force a run of up to 50 syscalls to be traced */
+ trace_chaos_mode_syscalls = (local_random() % 50) + 1;
+ buffer_chaos_mode_syscalls = (trace_chaos_mode_syscalls - 5) * 10;
+ if (buffer_chaos_mode_syscalls < 0) {
+ buffer_chaos_mode_syscalls = 0;
+ }
+ }
+}
+
+/* Keep syscalls in alphabetical order, please. */
+
+/**
+ * Call this for syscalls that have no memory effects, don't block, and
+ * aren't fd-related.
+ */
+static long sys_generic_nonblocking(struct syscall_info* call) {
+ void* ptr = prep_syscall();
+ long ret;
+
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall6(call->no, call->args[0], call->args[1], call->args[2],
+ call->args[3], call->args[4], call->args[5]);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+/**
+ * Call this for syscalls that have no memory effects, don't block, and
+ * have an fd as their first parameter.
+ */
+static long sys_generic_nonblocking_fd(struct syscall_info* call) {
+ int fd = call->args[0];
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall6(call->no, fd, call->args[1], call->args[2],
+ call->args[3], call->args[4], call->args[5]);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+/**
+ * Call this for syscalls that have no memory effects, don't block, and
+ * have an fd as their first parameter, and should run privileged.
+ */
+static long privileged_sys_generic_nonblocking_fd(const struct syscall_info* call) {
+ int fd = call->args[0];
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return privileged_traced_raw_syscall(call);
+ }
+ ret = privileged_untraced_syscall6(call->no, fd, call->args[1], call->args[2],
+ call->args[3], call->args[4], call->args[5]);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+static long sys_clock_gettime(struct syscall_info* call) {
+ const int syscallno = SYS_clock_gettime;
+ __kernel_clockid_t clk_id = (__kernel_clockid_t)call->args[0];
+ struct timespec* tp = (struct timespec*)call->args[1];
+
+ void* ptr = prep_syscall();
+ struct timespec* tp2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (tp) {
+ tp2 = ptr;
+ ptr += sizeof(*tp2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall2(syscallno, clk_id, tp2);
+ if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ /* This is small and won't get optimized to a memcpy call outside
+ our library. */
+ *tp = *tp2;
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+#ifdef SYS_clock_gettime64
+
+static long sys_clock_gettime64(struct syscall_info* call) {
+ const int syscallno = SYS_clock_gettime64;
+ __kernel_clockid_t clk_id = (__kernel_clockid_t)call->args[0];
+ struct __kernel_timespec* tp = (struct __kernel_timespec*)call->args[1];
+
+ void* ptr = prep_syscall();
+ struct __kernel_timespec* tp2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (tp) {
+ tp2 = ptr;
+ ptr += sizeof(*tp2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall2(syscallno, clk_id, tp2);
+ if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ /* This is small and won't get optimized to a memcpy call outside
+ our library. */
+ *tp = *tp2;
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#if defined(SYS_creat)
+static long sys_open(struct syscall_info* call);
+static long sys_creat(struct syscall_info* call) {
+ const char* pathname = (const char*)call->args[0];
+ __kernel_mode_t mode = call->args[1];
+ /* Thus sayeth the man page:
+ *
+ * creat() is equivalent to open() with flags equal to
+ * O_CREAT|O_WRONLY|O_TRUNC. */
+ struct syscall_info open_call =
+ { SYS_open, { (long)pathname, O_CREAT | O_TRUNC | O_WRONLY, mode } };
+ return sys_open(&open_call);
+}
+#endif
+
+static int sys_fcntl64_no_outparams(struct syscall_info* call) {
+ const int syscallno = RR_FCNTL_SYSCALL;
+ int fd = call->args[0];
+ int cmd = call->args[1];
+ long arg = call->args[2];
+
+ /* None of the no-outparam fcntl's are known to be
+ * may-block. */
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall3(syscallno, fd, cmd, arg);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static int sys_fcntl64_own_ex(struct syscall_info* call) {
+ const int syscallno = RR_FCNTL_SYSCALL;
+ int fd = call->args[0];
+ int cmd = call->args[1];
+ struct rr_f_owner_ex* owner = (struct rr_f_owner_ex*)call->args[2];
+
+ /* The OWN_EX fcntl's aren't may-block. */
+ void* ptr = prep_syscall_for_fd(fd);
+ struct rr_f_owner_ex* owner2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (owner) {
+ owner2 = ptr;
+ ptr += sizeof(*owner2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ if (owner2) {
+ memcpy_input_parameter(owner2, owner, sizeof(*owner2));
+ }
+ ret = untraced_syscall3(syscallno, fd, cmd, owner2);
+ if (owner2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(owner, owner2, sizeof(*owner));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static int sys_fcntl64_setlk64(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Releasing a lock could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = RR_FCNTL_SYSCALL;
+ int fd = call->args[0];
+ int cmd = call->args[1];
+ struct rr_flock64* lock = (struct rr_flock64*)call->args[2];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ struct rr_flock64* lock2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (lock) {
+ lock2 = ptr;
+ ptr += sizeof(*lock2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ if (lock2) {
+ memcpy_input_parameter(lock2, lock, sizeof(*lock2));
+ }
+ ret = untraced_syscall3(syscallno, fd, cmd, lock2);
+ if (lock2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(lock, lock2, sizeof(*lock));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
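+/* F_SETLKW may block until the lock becomes available, so arm the desched
+ * event (MAY_BLOCK). The kernel only reads the flock structure for SETLKW,
+ * so the caller's pointer is passed through rather than copied into the
+ * buffer. */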
+static int sys_fcntl64_setlkw64(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Releasing a lock could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = RR_FCNTL_SYSCALL;
+ int fd = call->args[0];
+ int cmd = call->args[1];
+ struct rr_flock64* lock = (struct rr_flock64*)call->args[2];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall3(syscallno, fd, cmd, lock);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+#if defined(SYS_fcntl64)
+/* 32-bit system */
+static long sys_fcntl64(struct syscall_info* call)
+#else
+/* 64-bit system */
+static long sys_fcntl(struct syscall_info* call)
+#endif
+{
+ switch (call->args[1]) {
+ case F_SETFL:
+ if (call->args[2] == O_DIRECT) {
+ /* This needs to go to rr so we can disable syscall buffering
+ on this fd. */
+ return traced_raw_syscall(call);
+ }
+ /* Falls through. */
+ case F_DUPFD:
+ case F_GETFD:
+ case F_GETFL:
+ case F_GETOWN:
+ case F_SETFD:
+ case F_SETOWN:
+ case F_SETSIG:
+ return sys_fcntl64_no_outparams(call);
+
+ case F_GETOWN_EX:
+ case F_SETOWN_EX:
+ return sys_fcntl64_own_ex(call);
+
+#ifndef F_SETLK64
+#define F_SETLK64 13
+#endif
+ case F_SETLK64:
+#if !defined(SYS_fcntl64)
+ /* Also uses 64-bit flock format */
+ case F_SETLK:
+#endif
+ return sys_fcntl64_setlk64(call);
+
+#ifndef F_SETLKW64
+#define F_SETLKW64 14
+#endif
+ case F_SETLKW64:
+#if !defined(SYS_fcntl64)
+ /* Also uses 64-bit flock format */
+ case F_SETLKW:
+#endif
+ return sys_fcntl64_setlkw64(call);
+
+ default:
+ return traced_raw_syscall(call);
+ }
+}
+
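+/**
+ * Clamp a successful syscall result to the scratch-buffer length |len| so
+ * copy_output_buffer never copies more than was reserved; errors map to 0.
+ */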
+static long ret_buf_len(long ret, size_t len) {
+ if (ret < 0) {
+ return 0;
+ }
+ if (len > LONG_MAX) {
+ return ret;
+ }
+ return ret < (long)len ? ret : (long)len;
+}
+
+static long sys_flistxattr(struct syscall_info* call) {
+ const int syscallno = SYS_flistxattr;
+ int fd = (int)call->args[0];
+ char* buf = (char*)call->args[1];
+ size_t size = call->args[2];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ void* buf2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (buf && size > 0) {
+ buf2 = ptr;
+ ptr += size;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, fd, buf2, size);
+ ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_safe_nonblocking_ioctl(struct syscall_info* call) {
+ const int syscallno = SYS_ioctl;
+ int fd = call->args[0];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall3(syscallno, fd, call->args[1], call->args[2]);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_ioctl_fionread(struct syscall_info* call) {
+ const int syscallno = SYS_ioctl;
+ int fd = call->args[0];
+ int* value = (int*)call->args[2];
+ void* buf = NULL;
+
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ if (value) {
+ buf = ptr;
+ ptr += sizeof(*value);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall3(syscallno, fd, FIONREAD, buf);
+ if (buf && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(value, buf, sizeof(*value));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_ioctl(struct syscall_info* call) {
+ switch (call->args[1]) {
+ case BTRFS_IOC_CLONE_RANGE:
+ case FIOCLEX:
+ case FIONCLEX:
+ return sys_safe_nonblocking_ioctl(call);
+ case FIONREAD:
+ return sys_ioctl_fionread(call);
+ default:
+ return traced_raw_syscall(call);
+ }
+}
+
+static long sys_futex(struct syscall_info* call) {
+ enum {
+ FUTEX_USES_UADDR2 = 1 << 0,
+ };
+
+ /* This can make wakeups a lot more expensive. We assume
+ that wakeups are only used when some thread is actually waiting,
+ in which case we're at most doubling the overhead of the combined
+ wait + wakeup. */
+ if (globals.in_chaos) {
+ return traced_raw_syscall(call);
+ }
+
+ int op = call->args[1];
+ int flags = 0;
+ switch (FUTEX_CMD_MASK & op) {
+ case FUTEX_WAKE_BITSET:
+ case FUTEX_WAKE:
+ break;
+ case FUTEX_REQUEUE:
+ case FUTEX_CMP_REQUEUE:
+ case FUTEX_WAKE_OP:
+ flags |= FUTEX_USES_UADDR2;
+ break;
+
+ /* It turns out not to be worth buffering the FUTEX_WAIT*
+ * calls. When a WAIT call is made, we know almost for sure
+ * that the tracee is going to be desched'd (otherwise the
+ * userspace CAS would have succeeded). This is unlike
+   * read/write, e.g., where the vast majority of calls aren't
+ * desched'd and the overhead is worth it. So all that
+ * buffering WAIT does is add the overhead of arming/disarming
+ * desched (which is a measurable perf loss).
+ *
+ * NB: don't ever try to buffer FUTEX_LOCK_PI; it requires
+ * special processing in the tracer process (in addition to
+ * not being worth doing for perf reasons). */
+ default:
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_futex;
+ uint32_t* uaddr = (uint32_t*)call->args[0];
+ uint32_t val = call->args[2];
+ const struct timespec* timeout = (const struct timespec*)call->args[3];
+ uint32_t* uaddr2 = (uint32_t*)call->args[4];
+ uint32_t val3 = call->args[5];
+
+ void* ptr = prep_syscall();
+ uint32_t* saved_uaddr;
+ uint32_t* saved_uaddr2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ /* We have to record the value of the futex at kernel exit,
+ * but we can't substitute a scratch pointer for the uaddrs:
+ * the futex identity is the memory cell. There are schemes
+ * that would allow us to use scratch futexes, but they get
+ * complicated quickly. */
+ saved_uaddr = ptr;
+ ptr += sizeof(*saved_uaddr);
+ if (FUTEX_USES_UADDR2 & flags) {
+ saved_uaddr2 = ptr;
+ ptr += sizeof(*saved_uaddr2);
+ }
+ /* See above; it's not worth buffering may-block futex
+ * calls. */
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall6(syscallno, uaddr, op, val, timeout, uaddr2, val3);
+ /* During recording, save the real outparams to the buffer.
+ * During replay, save the values from the buffer to the real outparams.
+ *
+ * The *ONLY* reason it's correct for us to read the outparams
+ * carelessly is that rr protects this syscallbuf
+   * transaction as a critical section. */
+ copy_futex_int(saved_uaddr, uaddr);
+ if (saved_uaddr2) {
+ copy_futex_int(saved_uaddr2, uaddr2);
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
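+/* getrandom may block while the entropy pool initializes unless GRND_NONBLOCK
+ * is set, hence the choice of blockness below. */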
+static long sys_getrandom(struct syscall_info* call) {
+ void* buf = (void*)call->args[0];
+ size_t buf_len = (size_t)call->args[1];
+ unsigned int flags = (unsigned int)call->args[2];
+ const int syscallno = SYS_getrandom;
+
+ void* ptr = prep_syscall();
+ void* buf2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (buf && buf_len > 0) {
+ buf2 = ptr;
+ ptr += buf_len;
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, (flags & GRND_NONBLOCK) ? WONT_BLOCK : MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(call->no, buf2, buf_len, flags);
+ ptr = copy_output_buffer(ret, ptr, buf, buf2);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+static long sys_generic_getdents(struct syscall_info* call) {
+ int fd = (int)call->args[0];
+ void* buf = (void*)call->args[1];
+ unsigned int count = (unsigned int)call->args[2];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ void* buf2 = NULL;
+ long ret;
+
+ if (buf && count > 0) {
+ buf2 = ptr;
+ ptr += count;
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(call->no, fd, buf2, count);
+ ptr = copy_output_buffer(ret, ptr, buf, buf2);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+#if defined(SYS_getdents)
+static long sys_getdents(struct syscall_info* call) {
+ return sys_generic_getdents(call);
+}
+#endif
+
+static long sys_getdents64(struct syscall_info* call) {
+ return sys_generic_getdents(call);
+}
+
+static long sys_gettimeofday(struct syscall_info* call) {
+ const int syscallno = SYS_gettimeofday;
+ struct timeval* tp = (struct timeval*)call->args[0];
+ struct timezone* tzp = (struct timezone*)call->args[1];
+
+ /* XXX it seems odd that clock_gettime() is spec'd to be
+ * async-signal-safe while gettimeofday() isn't, but that's
+ * what the docs say! */
+ void* ptr = prep_syscall();
+ struct timeval* tp2 = NULL;
+ struct timezone* tzp2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (tp) {
+ tp2 = ptr;
+ ptr += sizeof(*tp2);
+ }
+ if (tzp) {
+ tzp2 = ptr;
+ ptr += sizeof(*tzp2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall2(syscallno, tp2, tzp2);
+ if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ if (tp) {
+ /* This is small and won't get optimized to a memcpy call outside
+ our library. */
+ *tp = *tp2;
+ }
+ if (tzp) {
+ /* This is small and won't get optimized to a memcpy call outside
+ our library. */
+ *tzp = *tzp2;
+ }
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_generic_getxattr(struct syscall_info* call) {
+ const char* path = (const char*)call->args[0];
+ const char* name = (const char*)call->args[1];
+ void* value = (void*)call->args[2];
+ size_t size = call->args[3];
+
+ void* ptr = prep_syscall();
+ void* value2 = NULL;
+ long ret;
+
+ if (value && size > 0) {
+ value2 = ptr;
+ ptr += size;
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall4(call->no, path, name, value2, size);
+ ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+static long sys_getxattr(struct syscall_info* call) {
+ return sys_generic_getxattr(call);
+}
+
+static long sys_lgetxattr(struct syscall_info* call) {
+ return sys_generic_getxattr(call);
+}
+
+static long sys_fgetxattr(struct syscall_info* call) {
+ int fd = (int)call->args[0];
+ const char* name = (const char*)call->args[1];
+ void* value = (void*)call->args[2];
+ size_t size = call->args[3];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ void* value2 = NULL;
+ long ret;
+
+ if (value && size > 0) {
+ value2 = ptr;
+ ptr += size;
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall4(call->no, fd, name, value2, size);
+ ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+static long sys_generic_listxattr(struct syscall_info* call) {
+ char* path = (char*)call->args[0];
+ char* buf = (char*)call->args[1];
+ size_t size = call->args[2];
+
+ void* ptr = prep_syscall();
+ void* buf2 = NULL;
+ long ret;
+
+ if (buf && size > 0) {
+ buf2 = ptr;
+ ptr += size;
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(call->no, path, buf2, size);
+ ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2);
+ return commit_raw_syscall(call->no, ptr, ret);
+}
+
+static long sys_listxattr(struct syscall_info* call) {
+ return sys_generic_listxattr(call);
+}
+
+static long sys_llistxattr(struct syscall_info* call) {
+ return sys_generic_listxattr(call);
+}
+
+#if defined(SYS__llseek)
+static long sys__llseek(struct syscall_info* call) {
+ const int syscallno = SYS__llseek;
+ int fd = call->args[0];
+ unsigned long offset_high = call->args[1];
+ unsigned long offset_low = call->args[2];
+ __kernel_loff_t* result = (__kernel_loff_t*)call->args[3];
+ unsigned int whence = call->args[4];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ __kernel_loff_t* result2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (result) {
+ result2 = ptr;
+ ptr += sizeof(*result2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ if (result2) {
+ memcpy_input_parameter(result2, result, sizeof(*result2));
+ }
+ ret = untraced_syscall5(syscallno, fd, offset_high, offset_low, result2,
+ whence);
+ if (result2) {
+ *result = *result2;
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+static long sys_madvise(struct syscall_info* call) {
+ const int syscallno = SYS_madvise;
+ void* addr = (void*)call->args[0];
+ size_t length = call->args[1];
+ int advice = call->args[2];
+
+ void* ptr;
+ long ret;
+
+ switch (advice) {
+ // Whitelist advice values that we know are OK to pass through to the
+ // kernel directly.
+ case MADV_NORMAL:
+ case MADV_RANDOM:
+ case MADV_SEQUENTIAL:
+ case MADV_WILLNEED:
+ case MADV_DONTNEED:
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ case MADV_DONTDUMP:
+ case MADV_DODUMP:
+ break;
+ case MADV_FREE:
+ // See record_syscall. We disallow MADV_FREE because it creates
+ // nondeterminism.
+ advice = -1;
+ break;
+ default:
+ return traced_raw_syscall(call);
+ }
+
+ ptr = prep_syscall();
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ /* Ensure this syscall happens during replay. In particular MADV_DONTNEED
+ * must be executed.
+ */
+ ret = untraced_replayed_syscall3(syscallno, addr, length, advice);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
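+/**
+ * Record the protection change in globals.mprotect_records so rr can keep
+ * track of memory permissions even though this mprotect does not trap to the
+ * tracer. Fall back to a traced syscall if the record table is full or an
+ * unusual protection bit is requested.
+ */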
+static long sys_mprotect(struct syscall_info* call) {
+ const int syscallno = SYS_mprotect;
+ void* addr = (void*)call->args[0];
+ size_t length = call->args[1];
+ int prot = call->args[2];
+ struct mprotect_record* mrec;
+
+ void* ptr;
+ long ret;
+
+ if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) || !buffer_hdr() ||
+ buffer_hdr()->mprotect_record_count >= MPROTECT_RECORD_COUNT) {
+ return traced_raw_syscall(call);
+ }
+
+ ptr = prep_syscall();
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ mrec = &globals.mprotect_records[buffer_hdr()->mprotect_record_count++];
+ mrec->start = (uint64_t)(uintptr_t)addr;
+ mrec->size = length;
+ mrec->prot = prot;
+ ret = untraced_replayed_syscall3(syscallno, addr, length, prot);
+ if (ret < 0 && ret != -ENOMEM) {
+ /* indicate that nothing was mprotected */
+ mrec->size = 0;
+ }
+ buffer_hdr()->mprotect_record_count_completed++;
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static int supported_open(const char* file_name, int flags) {
+ if (is_gcrypt_deny_file(file_name)) {
+ /* This needs to be a traced syscall. We want to return an
+ open file even if the file doesn't exist and the untraced syscall
+ returns ENOENT. */
+ return 0;
+ }
+ if (flags & O_DIRECT) {
+ /* O_DIRECT needs to go to rr so we can blacklist the file for
+ syscall buffering. */
+ return 0;
+ }
+ /* Writeable opens need to go to rr to be checked in case
+ they could write to a mapped file.
+ But if they're O_EXCL | O_CREAT, a new file must be created
+ so that will be fine. */
+ return !(flags & (O_RDWR | O_WRONLY)) ||
+ (flags & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
+}
+
+static long sys_readlinkat(struct syscall_info* call, int privileged);
+
+struct check_open_state {
+ uint8_t did_abort;
+ uint8_t did_fail_during_preparation;
+};
+
+static int check_file_open_ok(struct syscall_info* call, int ret, struct check_open_state state) {
+ /* If we failed during preparation then a SIGSYS or similar prevented the syscall
+ from doing anything, so there is nothing for us to do here and we shouldn't
+ try to interpret the "syscall result". */
+ if (state.did_fail_during_preparation || ret < 0) {
+ return ret;
+ }
+ char buf[100];
+ sprintf(buf, "/proc/self/fd/%d", ret);
+ char link[PATH_MAX];
+ long link_ret;
+ if (state.did_abort) {
+ /* Don't add any new syscallbuf records, that won't work. */
+ link_ret = privileged_traced_syscall4(SYS_readlinkat, -1, (long)buf, (long)link, sizeof(link));
+ } else {
+ struct syscall_info readlink_call =
+ { SYS_readlinkat, { -1, (long)buf, (long)link, sizeof(link), 0, 0 } };
+ link_ret = sys_readlinkat(&readlink_call, 1);
+ }
+ if (link_ret >= 0 && link_ret < (ssize_t)sizeof(link)) {
+ link[link_ret] = 0;
+ if (allow_buffered_open(link)) {
+ return ret;
+ }
+ }
+ /* Clean up by closing the file descriptor we should not have opened and
+ opening it again, traced this time.
+ Use a privileged traced syscall for the close to ensure it
+ can't fail due to lack of privilege.
+ We expect this to return an error.
+ We could try an untraced close syscall here, falling back to traced
+ syscall, but that's a bit more complicated and we're already on
+ the slow (and hopefully rare) path. */
+ privileged_traced_syscall1(SYS_close, ret);
+ return traced_raw_syscall(call);
+}
+
+static struct check_open_state capture_check_open_state(void) {
+ struct check_open_state ret;
+ ret.did_abort = buffer_hdr()->abort_commit;
+ ret.did_fail_during_preparation = buffer_hdr()->failed_during_preparation;
+ return ret;
+}
+
+#if defined(SYS_open)
+static long sys_open(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Opening a FIFO could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_open;
+ const char* pathname = (const char*)call->args[0];
+ int flags = call->args[1];
+ __kernel_mode_t mode = call->args[2];
+ void* ptr;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!supported_open(pathname, flags)) {
+ return traced_raw_syscall(call);
+ }
+
+ ptr = prep_syscall();
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, pathname, flags, mode);
+ struct check_open_state state = capture_check_open_state();
+ ret = commit_raw_syscall(syscallno, ptr, ret);
+ return check_file_open_ok(call, ret, state);
+}
+#endif
+
+static long sys_openat(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Opening a FIFO could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_openat;
+ int dirfd = call->args[0];
+ const char* pathname = (const char*)call->args[1];
+ int flags = call->args[2];
+ __kernel_mode_t mode = call->args[3];
+ void* ptr;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!supported_open(pathname, flags)) {
+ return traced_raw_syscall(call);
+ }
+
+ ptr = prep_syscall();
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall4(syscallno, dirfd, pathname, flags, mode);
+ struct check_open_state state = capture_check_open_state();
+ ret = commit_raw_syscall(syscallno, ptr, ret);
+ return check_file_open_ok(call, ret, state);
+}
+
+#if defined(SYS_poll) || defined(SYS_ppoll)
+/**
+ * Make this function external so desched_ticks.py can set a breakpoint on it.
+ * Make it visibility-"protected" so that our local definition binds to it
+ * directly and doesn't go through a PLT thunk (which would mean temporarily
+ * leaving syscallbuf code).
+ */
+__attribute__((visibility("protected"))) void __before_poll_syscall_breakpoint(
+ void) {}
+#endif
+
+#if defined(SYS_poll)
+static long sys_poll(struct syscall_info* call) {
+ const int syscallno = SYS_poll;
+ struct pollfd* fds = (struct pollfd*)call->args[0];
+ unsigned int nfds = call->args[1];
+ int timeout = call->args[2];
+
+ void* ptr = prep_syscall();
+ struct pollfd* fds2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (fds && nfds > 0) {
+ fds2 = ptr;
+ ptr += nfds * sizeof(*fds2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ if (fds2) {
+ memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2));
+ }
+
+ __before_poll_syscall_breakpoint();
+
+ /* Try a no-timeout version of the syscall first. If this doesn't return
+ anything, and we should have blocked, we'll try again with a traced syscall
+ which will be the one that blocks. This usually avoids the
+ need to trigger desched logic, which adds overhead, especially the
+ rrcall_notify_syscall_hook_exit that gets triggered. */
+ ret = untraced_syscall3(syscallno, fds2, nfds, 0);
+
+ if (fds2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ /* NB: even when poll returns 0 indicating no pending
+     * fds, it still sets each .revents outparam to 0.
+ * (Reasonably.) So we always need to copy on return
+ * value >= 0.
+ * It's important that we not copy when there's an error.
+ * The syscallbuf commit might have been aborted, which means
+ * during replay fds2 might be non-recorded data, so we'd be
+ * incorrectly trashing 'fds'. */
+ local_memcpy(fds, fds2, nfds * sizeof(*fds));
+ }
+ commit_raw_syscall(syscallno, ptr, ret);
+
+ if (ret != 0 || timeout == 0) {
+ return ret;
+ }
+ /* The syscall didn't return anything, and we should have blocked.
+ Just perform a raw syscall now since we're almost certain to block. */
+ return traced_raw_syscall(call);
+}
+#endif
+
+#if defined(SYS_ppoll)
+static long sys_ppoll(struct syscall_info* call) {
+ const int syscallno = SYS_ppoll;
+ struct pollfd* fds = (struct pollfd*)call->args[0];
+ unsigned int nfds = call->args[1];
+ const struct timespec *tmo_p = (struct timespec*)call->args[2];
+ const kernel_sigset_t *sigmask = (const kernel_sigset_t*)call->args[3];
+ size_t sigmask_size = call->args[4];
+
+ if (sigmask) {
+ // See ppoll_deliver. ppoll calls that temporarily change the
+ // sigmask are hard to handle; we may get a signal that we can't
+ // deliver later because it's blocked by the application.
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall();
+ struct pollfd* fds2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (fds && nfds > 0) {
+ fds2 = ptr;
+ ptr += nfds * sizeof(*fds2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ if (fds2) {
+ memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2));
+ }
+
+ __before_poll_syscall_breakpoint();
+
+ /* Try a no-timeout version of the syscall first. If this doesn't return
+ anything, and we should have blocked, we'll try again with a traced syscall
+ which will be the one that blocks. This usually avoids the
+ need to trigger desched logic, which adds overhead, especially the
+ rrcall_notify_syscall_hook_exit that gets triggered. */
+ const struct timespec tmo0 = {0, 0};
+ ret = untraced_syscall5(syscallno, fds2, nfds, &tmo0, sigmask, sigmask_size);
+
+ if (fds2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ /* NB: even when poll returns 0 indicating no pending
+     * fds, it still sets each .revents outparam to 0.
+ * (Reasonably.) So we always need to copy on return
+ * value >= 0.
+ * It's important that we not copy when there's an error.
+ * The syscallbuf commit might have been aborted, which means
+ * during replay fds2 might be non-recorded data, so we'd be
+ * incorrectly trashing 'fds'. */
+ local_memcpy(fds, fds2, nfds * sizeof(*fds));
+ }
+ commit_raw_syscall(syscallno, ptr, ret);
+
+ if (ret != 0 || (tmo_p && tmo_p->tv_sec == 0 && tmo_p->tv_nsec == 0)) {
+ return ret;
+ }
+ /* The syscall didn't return anything, and we should have blocked.
+ Just perform a raw syscall now since we're almost certain to block. */
+ return traced_raw_syscall(call);
+}
+#endif
+
+static long sys_epoll_wait(struct syscall_info* call) {
+ int epfd = call->args[0];
+ struct epoll_event* events = (struct epoll_event*)call->args[1];
+ int max_events = call->args[2];
+ int timeout = call->args[3];
+
+ void* ptr;
+ struct epoll_event* events2 = NULL;
+ long ret;
+
+ ptr = prep_syscall();
+
+ assert(SYS_epoll_pwait == call->no
+#if defined(SYS_epoll_wait)
+ || SYS_epoll_wait == call->no
+#endif
+ );
+
+ if (events && max_events > 0) {
+ events2 = ptr;
+ ptr += max_events * sizeof(*events2);
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ /* Try a no-timeout version of the syscall first. If this doesn't return
+ anything, and we should have blocked, we'll try again with a traced syscall
+ which will be the one that blocks. This usually avoids the
+ need to trigger desched logic, which adds overhead, especially the
+ rrcall_notify_syscall_hook_exit that gets triggered.
+ N.B.: SYS_epoll_wait only has four arguments, but we don't care
+ if the last two arguments are garbage */
+ ret = untraced_syscall6(call->no, epfd, events2, max_events, 0,
+ call->args[4] /*sigmask*/, call->args[5] /*sizeof(*sigmask)*/);
+
+ ptr = copy_output_buffer(ret * sizeof(*events2), ptr, events, events2);
+ ret = commit_raw_syscall(call->no, ptr, ret);
+ if (timeout == 0 || (ret != EINTR && ret != 0)) {
+ /* If we got some real results, or a non-EINTR error, we can just
+ return it directly.
+ If we got no results and the timeout was 0, we can just return 0.
+ If we got EINTR and the timeout was 0, a signal must have
+ interrupted the syscall (not sure if this can happen...). If the signal
+ needs to be handled, we'll handle it as we exit the syscallbuf.
+ Returning EINTR is fine because that's what the syscall would have
+ returned had it run traced. (We didn't enable the desched signal
+ so no extra signals could have affected our untraced syscall that
+ could not have been delivered to a traced syscall.) */
+ return ret;
+ }
+ /* Some timeout was requested and either we got no results or we got
+ EINTR.
+ In the former case we just have to wait, so we do a traced syscall.
+ In the latter case, the syscall must have been interrupted by a
+ signal (which rr will have handled or stashed, and won't deliver until
+ we exit syscallbuf code or do a traced syscall). The kernel doesn't
+ automatically restart the syscall because of a longstanding bug (as of
+ 4.17 anyway). Doing a traced syscall will allow a stashed signal to be
+ processed (if necessary) and allow things to proceed normally after that.
+ Note that if rr decides to deliver a signal to the tracee, that will
+ itself interrupt the syscall and cause it to return EINTR just as
+ would happen without rr.
+ */
+ return traced_raw_syscall(call);
+}
+
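+/* Assumed to be layout-compatible with the 64-bit timespec that epoll_pwait2
+ * takes; only used to pass a zero timeout below. */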
+struct timespec64 {
+ uint64_t tv_sec;
+ uint64_t tv_nsec;
+};
+
+#ifdef SYS_epoll_pwait2
+static long sys_epoll_pwait2(struct syscall_info* call) {
+ int epfd = call->args[0];
+ struct epoll_event* events = (struct epoll_event*)call->args[1];
+ int max_events = call->args[2];
+ struct timespec64* timeout = (struct timespec64*)call->args[3];
+
+ void* ptr;
+ struct epoll_event* events2 = NULL;
+ long ret;
+
+ ptr = prep_syscall();
+
+ assert(SYS_epoll_pwait2 == call->no);
+
+ if (events && max_events > 0) {
+ events2 = ptr;
+ ptr += max_events * sizeof(*events2);
+ }
+ if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ /* Try a no-timeout version of the syscall first. If this doesn't return
+ anything, and we should have blocked, we'll try again with a traced syscall
+ which will be the one that blocks. This usually avoids the
+ need to trigger desched logic, which adds overhead, especially the
+ rrcall_notify_syscall_hook_exit that gets triggered. */
+ struct timespec64 no_timeout = { 0, 0 };
+ ret = untraced_syscall6(call->no, epfd, events2, max_events, &no_timeout,
+ call->args[4] /*sigmask*/, call->args[5] /*sizeof(*sigmask)*/);
+
+ ptr = copy_output_buffer(ret * sizeof(*events2), ptr, events, events2);
+ ret = commit_raw_syscall(call->no, ptr, ret);
+ if ((timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0) ||
+ (ret != EINTR && ret != 0)) {
+ /* If we got some real results, or a non-EINTR error, we can just
+ return it directly.
+ If we got no results and the timeout was 0, we can just return 0.
+ If we got EINTR and the timeout was 0, a signal must have
+ interrupted the syscall (not sure if this can happen...). If the signal
+ needs to be handled, we'll handle it as we exit the syscallbuf.
+ Returning EINTR is fine because that's what the syscall would have
+ returned had it run traced. (We didn't enable the desched signal
+ so no extra signals could have affected our untraced syscall that
+ could not have been delivered to a traced syscall.) */
+ return ret;
+ }
+ /* Some timeout was requested and either we got no results or we got
+ EINTR.
+ In the former case we just have to wait, so we do a traced syscall.
+ In the latter case, the syscall must have been interrupted by a
+ signal (which rr will have handled or stashed, and won't deliver until
+ we exit syscallbuf code or do a traced syscall). The kernel doesn't
+ automatically restart the syscall because of a longstanding bug (as of
+ 4.17 anyway). Doing a traced syscall will allow a stashed signal to be
+ processed (if necessary) and allow things to proceed normally after that.
+ Note that if rr decides to deliver a signal to the tracee, that will
+ itself interrupt the syscall and cause it to return EINTR just as
+ would happen without rr.
+ */
+ return traced_raw_syscall(call);
+}
+#endif
+
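+/* Reads of at least this many bytes (64KB) may use the clone-range fast path
+ * in sys_read below. */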
+#define CLONE_SIZE_THRESHOLD 0x10000
+
+static long sys_read(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Reading from a pipe could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_read;
+ int fd = call->args[0];
+ void* buf = (void*)call->args[1];
+ size_t count = call->args[2];
+
+ void* ptr;
+ void* buf2 = NULL;
+ long ret;
+
+ /* Try cloning data using CLONE_RANGE ioctl.
+ * XXX switch to FIOCLONERANGE when that's more widely available. It's the
+ * same ioctl number so it won't affect rr per se but it'd be cleaner code.
+ * 64-bit only for now, since lseek and pread64 need special handling for
+ * 32-bit.
+ * Basically we break down the read into three syscalls lseek, clone and
+ * read-from-clone, each of which is individually syscall-buffered.
+ * Crucially, the read-from-clone syscall does NOT store data in the syscall
+ * buffer; instead, we perform the syscall during replay, assuming that
+ * cloned_file_data_fd is open to the same file during replay.
+ * Reads that hit EOF are rejected by the CLONE_RANGE ioctl so we take the
+ * slow path. That's OK.
+ * There is a possible race here: between cloning the data and reading from
+ * |fd|, |fd|'s data may be overwritten, in which case the data read during
+ * replay will not match the data read during recording, causing divergence.
+ * I don't see any performant way to avoid this race; I tried reading from
+ * the cloned data instead of |fd|, but that is very slow because readahead
+ * doesn't work. (The cloned data file always ends at the current offset so
+ * there is nothing to readahead.) However, if an application triggers this
+ * race, it's almost certainly a bad bug because Linux can return any
+ * interleaving of old+new data for the read even without rr.
+ */
+ if (buf && count >= CLONE_SIZE_THRESHOLD &&
+ thread_locals->cloned_file_data_fd >= 0 && is_bufferable_fd(fd) &&
+ sizeof(void*) == 8 && !(count & 4095)) {
+ struct syscall_info lseek_call = { SYS_lseek,
+ { fd, 0, SEEK_CUR, 0, 0, 0 } };
+ off_t lseek_ret = privileged_sys_generic_nonblocking_fd(&lseek_call);
+ if (lseek_ret >= 0 && !(lseek_ret & 4095)) {
+ struct btrfs_ioctl_clone_range_args ioctl_args;
+ int ioctl_ret;
+ void* ioctl_ptr = prep_syscall();
+ ioctl_args.src_fd = fd;
+ ioctl_args.src_offset = lseek_ret;
+ ioctl_args.src_length = count;
+ ioctl_args.dest_offset = thread_locals->cloned_file_data_offset;
+
+ /* Don't call sys_ioctl here; cloned_file_data_fd has syscall buffering
+ * disabled for it so rr can reject attempts to close/dup to it. But
+ * we want to allow syscall buffering of this ioctl on it.
+ */
+ if (!start_commit_buffered_syscall(SYS_ioctl, ioctl_ptr, WONT_BLOCK)) {
+ struct syscall_info ioctl_call = { SYS_ioctl,
+ { thread_locals->cloned_file_data_fd,
+ BTRFS_IOC_CLONE_RANGE,
+ (long)&ioctl_args, 0, 0, 0 } };
+ ioctl_ret = privileged_traced_raw_syscall(&ioctl_call);
+ } else {
+ ioctl_ret =
+ privileged_untraced_syscall3(SYS_ioctl, thread_locals->cloned_file_data_fd,
+ BTRFS_IOC_CLONE_RANGE, &ioctl_args);
+ ioctl_ret = commit_raw_syscall(SYS_ioctl, ioctl_ptr, ioctl_ret);
+ }
+
+ if (ioctl_ret >= 0) {
+ struct syscall_info read_call = { SYS_read,
+ { fd, (long)buf, count, 0, 0, 0 } };
+ thread_locals->cloned_file_data_offset += count;
+
+ replay_only_syscall3(SYS_dup3, thread_locals->cloned_file_data_fd, fd, 0);
+
+ ptr = prep_syscall();
+ if (count > thread_locals->usable_scratch_size) {
+ if (!start_commit_buffered_syscall(SYS_read, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(&read_call);
+ }
+ ret = untraced_replayed_syscall3(SYS_read, fd, buf, count);
+ } else {
+ if (!start_commit_buffered_syscall(SYS_read, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(&read_call);
+ }
+ ret = untraced_replayed_syscall3(SYS_read, fd,
+ thread_locals->scratch_buf, count);
+ copy_output_buffer(ret, NULL, buf, thread_locals->scratch_buf);
+ }
+ // Do this now before we finish processing the syscallbuf record.
+ // This means the syscall will be executed in
+ // ReplaySession::flush_syscallbuf instead of
+ // ReplaySession::enter_syscall or something similar.
+ replay_only_syscall1(SYS_close, fd);
+ ret = commit_raw_syscall(SYS_read, ptr, ret);
+ return ret;
+ }
+ }
+ }
+
+ ptr = prep_syscall_for_fd(fd);
+
+ assert(syscallno == call->no);
+
+ if (buf && count > 0) {
+ buf2 = ptr;
+ ptr += count;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, fd, buf2, count);
+ ptr = copy_output_buffer(ret, ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+/* On x86-32, pread/pwrite take the offset in two registers. We don't bother
+ * handling that.
+ */
+#if !defined(__i386__)
+static long sys_pread64(struct syscall_info* call) {
+ const int syscallno = SYS_pread64;
+ int fd = call->args[0];
+ void* buf = (void*)call->args[1];
+ size_t count = call->args[2];
+ off_t offset = call->args[3];
+
+ void* ptr;
+ void* buf2 = NULL;
+ long ret;
+
+ ptr = prep_syscall_for_fd(fd);
+
+ assert(syscallno == call->no);
+
+ if (buf && count > 0) {
+ buf2 = ptr;
+ ptr += count;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall4(syscallno, fd, buf2, count, offset);
+ ptr = copy_output_buffer(ret, ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#if defined(SYS_readlink)
+static long sys_readlink(struct syscall_info* call) {
+ const int syscallno = SYS_readlink;
+ const char* path = (const char*)call->args[0];
+ char* buf = (char*)call->args[1];
+ int bufsiz = call->args[2];
+
+ void* ptr = prep_syscall();
+ char* buf2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (buf && bufsiz > 0) {
+ buf2 = ptr;
+ ptr += bufsiz;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, path, buf2, bufsiz);
+ ptr = copy_output_buffer(ret, ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+static long sys_readlinkat(struct syscall_info* call, int privileged) {
+ const int syscallno = SYS_readlinkat;
+ int dirfd = call->args[0];
+ const char* path = (const char*)call->args[1];
+ char* buf = (char*)call->args[2];
+ int bufsiz = call->args[3];
+
+ void* ptr = prep_syscall();
+ char* buf2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (buf && bufsiz > 0) {
+ buf2 = ptr;
+ ptr += bufsiz;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ if (privileged) {
+ return privileged_traced_raw_syscall(call);
+ }
+ return traced_raw_syscall(call);
+ }
+
+ if (privileged) {
+ ret = privileged_untraced_syscall4(syscallno, dirfd, path, buf2, bufsiz);
+ } else {
+ ret = untraced_syscall4(syscallno, dirfd, path, buf2, bufsiz);
+ }
+ ptr = copy_output_buffer(ret, ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+#if defined(SYS_socketcall)
+static long sys_socketcall_recv(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Reading from a socket could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_socketcall;
+ long* args = (long*)call->args[1];
+ int sockfd = args[0];
+ void* buf = (void*)args[1];
+ size_t len = args[2];
+ unsigned int flags = args[3];
+ unsigned long new_args[4];
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ void* buf2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (buf && len > 0) {
+ buf2 = ptr;
+ ptr += len;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ new_args[0] = sockfd;
+ new_args[1] = (unsigned long)buf2;
+ new_args[2] = len;
+ new_args[3] = flags;
+ ret = untraced_syscall2(SYS_socketcall, SYS_RECV, new_args);
+ /* Account for MSG_TRUNC */
+ ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_socketcall(struct syscall_info* call) {
+ switch (call->args[0]) {
+ case SYS_RECV:
+ return sys_socketcall_recv(call);
+ default:
+ return traced_raw_syscall(call);
+ }
+}
+#endif
+
+#ifdef SYS_recvfrom
+static long sys_recvfrom(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Reading from a socket could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_recvfrom;
+ int sockfd = call->args[0];
+ void* buf = (void*)call->args[1];
+ size_t len = call->args[2];
+ int flags = call->args[3];
+ /* struct sockaddr isn't useful here since some sockaddrs are bigger than
+ * it. To avoid making false assumptions, treat the sockaddr parameter
+ * as an untyped buffer.
+ */
+ void* src_addr = (void*)call->args[4];
+ socklen_t* addrlen = (socklen_t*)call->args[5];
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ void* buf2 = NULL;
+ struct sockaddr* src_addr2 = NULL;
+ socklen_t* addrlen2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+ /* If addrlen is NULL then src_addr must also be null */
+ assert(addrlen || !src_addr);
+
+ if (src_addr) {
+ src_addr2 = ptr;
+ ptr += *addrlen;
+ }
+ if (addrlen) {
+ addrlen2 = ptr;
+ ptr += sizeof(*addrlen);
+ }
+ if (buf && len > 0) {
+ buf2 = ptr;
+ ptr += len;
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ if (addrlen) {
+ memcpy_input_parameter(addrlen2, addrlen, sizeof(*addrlen2));
+ }
+ ret = untraced_syscall6(syscallno, sockfd, buf2, len, flags, src_addr2,
+ addrlen2);
+
+ if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ if (src_addr2) {
+ socklen_t actual_size = *addrlen2;
+ if (actual_size > *addrlen) {
+ actual_size = *addrlen;
+ }
+ local_memcpy(src_addr, src_addr2, actual_size);
+ }
+ if (addrlen2) {
+ *addrlen = *addrlen2;
+ }
+ }
+ ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#ifdef SYS_recvmsg
+
+/* These macros are from musl Copyright © 2005-2020 Rich Felker, et al. (MIT LICENSE) */
+#define __CMSG_LEN(cmsg) (((cmsg)->cmsg_len + sizeof(long) - 1) & ~(long)(sizeof(long) - 1))
+#define __CMSG_NEXT(cmsg) ((unsigned char *)(cmsg) + __CMSG_LEN(cmsg))
+#define __MHDR_END(mhdr) ((unsigned char *)(mhdr)->msg_control + (mhdr)->msg_controllen)
+
+#define CMSG_DATA(cmsg) ((unsigned char *) (((struct cmsghdr *)(cmsg)) + 1))
+#define CMSG_NXTHDR(mhdr, cmsg) ((cmsg)->cmsg_len < sizeof (struct cmsghdr) || \
+ (__CMSG_LEN(cmsg) + sizeof(struct cmsghdr) >= (unsigned long)(__MHDR_END(mhdr) - (unsigned char *)(cmsg))) \
+ ? 0 : (struct cmsghdr *)__CMSG_NEXT(cmsg))
+#define CMSG_FIRSTHDR(mhdr) ((size_t) (mhdr)->msg_controllen >= sizeof (struct cmsghdr) ? (struct cmsghdr *) (mhdr)->msg_control : (struct cmsghdr *) 0)
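+/* For example (illustrative values, LP64 where sizeof(long) == 8): an
+ * SCM_RIGHTS message carrying a single int has cmsg_len == 20, which
+ * __CMSG_LEN rounds up to 24, so CMSG_NXTHDR advances 24 bytes to the next
+ * aligned header, or yields 0 when too little space remains before
+ * __MHDR_END(mhdr) for another complete header. */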
+
+struct cmsghdr {
+ __kernel_size_t cmsg_len;
+ int cmsg_level;
+ int cmsg_type;
+};
+
+struct msghdr /* struct user_msghdr in the kernel */ {
+ void* msg_name;
+ int msg_namelen;
+ struct iovec* msg_iov;
+ __kernel_size_t msg_iovlen;
+ void* msg_control;
+ __kernel_size_t msg_controllen;
+ unsigned int msg_flags;
+};
+
+#define SCM_RIGHTS 0x01
+#define SOL_PACKET 263
+
+static int msg_received_file_descriptors(struct msghdr* msg) {
+ struct cmsghdr* cmh;
+ for (cmh = CMSG_FIRSTHDR(msg); cmh; cmh = CMSG_NXTHDR(msg, cmh)) {
+ if (cmh->cmsg_level == SOL_SOCKET && cmh->cmsg_type == SCM_RIGHTS) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static long sys_recvmsg(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Reading from a socket could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_recvmsg;
+ int sockfd = call->args[0];
+ struct msghdr* msg = (struct msghdr*)call->args[1];
+ int flags = call->args[2];
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ long ret;
+ struct msghdr* msg2;
+ void* ptr_base = ptr;
+ void* ptr_overwritten_end;
+ void* ptr_bytes_start;
+ void* ptr_end;
+ size_t i;
+
+ assert(syscallno == call->no);
+
+ /* Compute final buffer size up front, before writing syscall inputs to the
+ * buffer. Thus if we decide not to buffer this syscall, we bail out
+ * before trying to write to a buffer that won't be recorded and may be
+ * invalid (e.g. overflow).
+ */
+ ptr += sizeof(struct msghdr) + sizeof(struct iovec) * msg->msg_iovlen;
+ if (msg->msg_name) {
+ ptr += msg->msg_namelen;
+ }
+ if (msg->msg_control) {
+ ptr += msg->msg_controllen;
+ }
+ for (i = 0; i < msg->msg_iovlen; ++i) {
+ ptr += msg->msg_iov[i].iov_len;
+ }
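+ /* For example (illustrative numbers only): two 512-byte iovecs, a 16-byte
+ * msg_name and 64 bytes of msg_control reserve
+ * sizeof(struct msghdr) + 2 * sizeof(struct iovec) + 16 + 64 + 1024 bytes. */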
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ /**
+ * The kernel only writes to the struct msghdr and the iov buffers. We must
+ * not overwrite that data (except using memcpy_input_parameter) during
+ * replay. For the rest of the data, the values we write here during replay
+ * are guaranteed to match what was recorded in the buffer.
+ * We can't rely on the values we wrote here during recording also being
+ * here during replay since the syscall might have been aborted and our
+ * written data not recorded.
+ */
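+ /* Scratch layout built below (a sketch of the code that follows):
+ * [msghdr copy][iovec array][msg_name][msg_control][iov data buffers ...]
+ * where msg_name and msg_control appear only if the caller supplied them. */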
+ msg2 = ptr = ptr_base;
+ memcpy_input_parameter(msg2, msg, sizeof(*msg));
+ ptr += sizeof(struct msghdr);
+ msg2->msg_iov = ptr;
+ ptr += sizeof(struct iovec) * msg->msg_iovlen;
+ ptr_overwritten_end = ptr;
+ if (msg->msg_name) {
+ msg2->msg_name = ptr;
+ ptr += msg->msg_namelen;
+ }
+ if (msg->msg_control) {
+ msg2->msg_control = ptr;
+ ptr += msg->msg_controllen;
+ }
+ ptr_bytes_start = ptr;
+ for (i = 0; i < msg->msg_iovlen; ++i) {
+ msg2->msg_iov[i].iov_base = ptr;
+ ptr += msg->msg_iov[i].iov_len;
+ msg2->msg_iov[i].iov_len = msg->msg_iov[i].iov_len;
+ }
+
+ ret = untraced_syscall3(syscallno, sockfd, msg2, flags);
+
+ if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ size_t bytes = ret;
+ size_t i;
+ if (msg->msg_name) {
+ local_memcpy(msg->msg_name, msg2->msg_name, msg2->msg_namelen);
+ }
+ msg->msg_namelen = msg2->msg_namelen;
+ if (msg->msg_control) {
+ local_memcpy(msg->msg_control, msg2->msg_control, msg2->msg_controllen);
+ }
+ msg->msg_controllen = msg2->msg_controllen;
+ ptr_end = ptr_bytes_start + bytes;
+ for (i = 0; i < msg->msg_iovlen; ++i) {
+ long copy_bytes =
+ bytes < msg->msg_iov[i].iov_len ? bytes : msg->msg_iov[i].iov_len;
+ local_memcpy(msg->msg_iov[i].iov_base, msg2->msg_iov[i].iov_base,
+ copy_bytes);
+ bytes -= copy_bytes;
+ }
+ msg->msg_flags = msg2->msg_flags;
+
+ if (msg_received_file_descriptors(msg)) {
+ /* When we reach a safe point, notify rr that the control message with
+ * file descriptors was received.
+ */
+ thread_locals->notify_control_msg = msg;
+ }
+ } else {
+ /* Allocate at least enough record space to cover the data we overwrote above.
+ * We don't want to start the next record overlapping that data, since then
+ * we'll corrupt it during replay.
+ */
+ ptr_end = ptr_overwritten_end;
+ }
+ return commit_raw_syscall(syscallno, ptr_end, ret);
+}
+#endif
+
+#ifdef SYS_sendmsg
+static long sys_sendmsg(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Sending to a socket could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_sendmsg;
+ int sockfd = call->args[0];
+ struct msghdr* msg = (struct msghdr*)call->args[1];
+ int flags = call->args[2];
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, sockfd, msg, flags);
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#ifdef SYS_sendto
+static long sys_sendto(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Sending to a socket could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_sendto;
+ int sockfd = call->args[0];
+ void* buf = (void*)call->args[1];
+ size_t len = call->args[2];
+ int flags = call->args[3];
+ const struct sockaddr* dest_addr = (const struct sockaddr*)call->args[4];
+ socklen_t addrlen = call->args[5];
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret =
+ untraced_syscall6(syscallno, sockfd, buf, len, flags, dest_addr, addrlen);
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#ifdef SYS_setsockopt
+static long sys_setsockopt(struct syscall_info* call) {
+ const int syscallno = SYS_setsockopt;
+ int sockfd = call->args[0];
+ int level = call->args[1];
+ int optname = call->args[2];
+ void* optval = (void*)call->args[3];
+ socklen_t optlen = (socklen_t)call->args[4];
+
+ if (level == SOL_PACKET &&
+ (optname == PACKET_RX_RING || optname == PACKET_TX_RING)) {
+ // Let rr intercept this (and probably disable it)
+ return traced_raw_syscall(call);
+ }
+ if (level == SOL_NETLINK &&
+ (optname == NETLINK_RX_RING || optname == NETLINK_TX_RING)) {
+ // Let rr intercept this (and probably disable it)
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall5(syscallno, sockfd, level, optname, optval, optlen);
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#ifdef SYS_getsockopt
+static long sys_getsockopt(struct syscall_info* call) {
+ const int syscallno = SYS_getsockopt;
+ int sockfd = call->args[0];
+ int level = call->args[1];
+ int optname = call->args[2];
+ void* optval = (void*)call->args[3];
+ socklen_t* optlen = (socklen_t*)call->args[4];
+ socklen_t* optlen2;
+ void* optval2;
+
+ if (!optlen || !optval) {
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ long ret;
+
+ optlen2 = ptr;
+ ptr += sizeof(*optlen2);
+ optval2 = ptr;
+ ptr += *optlen;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ memcpy_input_parameter(optlen2, optlen, sizeof(*optlen2));
+ // Some variants of getsockopt do use the initial content of *optval
+ // (e.g. SOL_IP + IPT_SO_GET_INFO) so we need to copy it.
+ memcpy_input_parameter(optval2, optval, *optlen);
+
+ // We may need to manually restart this syscall due to a kernel bug that
+ // returns EFAULT when the syscall is interrupted by a signal; in that case
+ // we have no normal way to recover the actual arg1 on aarch64.
+ // Pass arg1 in the stack argument so that the tracer can use it.
+ ret = untraced_syscall_full(syscallno, sockfd, level, optname,
+ (long)optval2, (long)optlen2, 0,
+ RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY, sockfd, 0);
+
+ if (ret >= 0) {
+ socklen_t val_len = *optlen < *optlen2 ? *optlen : *optlen2;
+ local_memcpy(optval, optval2, val_len);
+ local_memcpy(optlen, optlen2, sizeof(*optlen));
+ }
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#ifdef SYS_getsockname
+static long sys_getsockname(struct syscall_info* call) {
+ const int syscallno = SYS_getsockname;
+ int sockfd = call->args[0];
+ struct sockaddr* addr = (struct sockaddr*)call->args[1];
+ socklen_t* addrlen = (socklen_t*)call->args[2];
+ socklen_t* addrlen2;
+ struct sockaddr* addr2 = NULL;
+
+ void* ptr = prep_syscall_for_fd(sockfd);
+ long ret;
+
+ addrlen2 = ptr;
+ ptr += sizeof(*addrlen2);
+ if (addr) {
+ addr2 = ptr;
+ ptr += *addrlen;
+ }
+
+ assert(syscallno == call->no);
+
+ if (addrlen) {
+ memcpy_input_parameter(addrlen2, addrlen, sizeof(*addrlen2));
+ }
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, sockfd, addr2, addrlen2);
+
+ if (ret >= 0) {
+ if (addr) {
+ socklen_t addr_len = *addrlen < *addrlen2 ? *addrlen : *addrlen2;
+ local_memcpy(addr, addr2, addr_len);
+ }
+ local_memcpy(addrlen, addrlen2, sizeof(*addrlen));
+ }
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#ifdef SYS_socketpair
+typedef int two_ints[2];
+static long sys_socketpair(struct syscall_info* call) {
+ const int syscallno = SYS_socketpair;
+ int domain = call->args[0];
+ int type = call->args[1];
+ int protocol = call->args[2];
+ two_ints* sv = (two_ints*)call->args[3];
+
+ void* ptr = prep_syscall();
+ two_ints* sv2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ sv2 = ptr;
+ ptr += sizeof(*sv2);
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall4(syscallno, domain, type, protocol, sv2);
+ if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(sv, sv2, sizeof(*sv));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+static long sys_uname(struct syscall_info* call) {
+ const int syscallno = SYS_uname;
+ void* buf = (void*)call->args[0];
+
+ void* ptr = prep_syscall();
+ void* buf2;
+ long ret;
+ size_t bufsize = sizeof(struct new_utsname);
+
+ assert(syscallno == call->no);
+
+ buf2 = ptr;
+ ptr += bufsize;
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall1(syscallno, buf2);
+ if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(buf, buf2, bufsize);
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+
+#if defined(SYS_time)
+static long sys_time(struct syscall_info* call) {
+ const int syscallno = SYS_time;
+ __kernel_time_t* tp = (__kernel_time_t*)call->args[0];
+
+ void* ptr = prep_syscall();
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall1(syscallno, NULL);
+ if (tp) {
+ /* No error is possible here. */
+ *tp = ret;
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+#if defined(__i386__)
+typedef struct stat64 stat64_t;
+#else
+typedef struct stat stat64_t;
+#endif
+static long sys_xstat64(struct syscall_info* call) {
+ const int syscallno = call->no;
+ /* NB: this arg may be a string or an fd, but for the purposes
+ * of this generic helper we don't care. */
+ long what = call->args[0];
+ stat64_t* buf = (stat64_t*)call->args[1];
+
+ /* Like open(), not arming the desched event because it's not
+ * needed for correctness, and there are no data to suggest
+ * whether it's a good idea perf-wise. */
+ void* ptr = prep_syscall();
+ stat64_t* buf2 = NULL;
+ long ret;
+
+ if (buf) {
+ buf2 = ptr;
+ ptr += sizeof(*buf2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall2(syscallno, what, buf2);
+ if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(buf, buf2, sizeof(*buf));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+#ifdef SYS_statx
+/* Like sys_xstat64, but with different arguments */
+static long sys_statx(struct syscall_info* call) {
+ const int syscallno = call->no;
+ struct statx* buf = (struct statx*)call->args[4];
+
+ void* ptr = prep_syscall();
+ struct statx* buf2 = NULL;
+ long ret;
+
+ if (buf) {
+ buf2 = ptr;
+ ptr += sizeof(*buf2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall5(syscallno,
+ call->args[0], call->args[1], call->args[2], call->args[3],
+ buf2);
+ if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(buf, buf2, sizeof(*buf));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+static long sys_fstatat(struct syscall_info* call) {
+ const int syscallno = call->no;
+ stat64_t* buf = (stat64_t*)call->args[2];
+
+ /* Like stat(), not arming the desched event because it's not
+ * needed for correctness, and there are no data to suggest
+ * whether it's a good idea perf-wise. */
+ void* ptr = prep_syscall();
+ stat64_t* buf2 = NULL;
+ long ret;
+
+ if (buf) {
+ buf2 = ptr;
+ ptr += sizeof(*buf2);
+ }
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall4(syscallno,
+ call->args[0], call->args[1], buf2, call->args[3]);
+ if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(buf, buf2, sizeof(*buf));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_quotactl(struct syscall_info* call) {
+ const int syscallno = call->no;
+ int cmd = call->args[0];
+ const char* special = (const char*)call->args[1];
+ int id = call->args[2];
+ void* addr = (void*)call->args[3];
+
+ if ((cmd >> SUBCMDSHIFT) != Q_GETQUOTA) {
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall();
+ struct if_dqblk* buf2 = NULL;
+ long ret;
+
+ if (addr) {
+ buf2 = ptr;
+ ptr += sizeof(*buf2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall4(syscallno, cmd, special, id, buf2);
+ if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(addr, buf2, sizeof(*buf2));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_statfs(struct syscall_info* call) {
+ const int syscallno = call->no;
+ /* NB: this arg may be a string or an fd, but for the purposes
+ * of this generic helper we don't care. */
+ long what = call->args[0];
+ struct statfs* buf = (struct statfs*)call->args[1];
+
+ /* Like open(), not arming the desched event because it's not
+ * needed for correctness, and there are no data to suggest
+ * whether it's a good idea perf-wise. */
+ void* ptr = prep_syscall();
+ struct statfs* buf2 = NULL;
+ long ret;
+
+ if (buf) {
+ buf2 = ptr;
+ ptr += sizeof(*buf2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall2(syscallno, what, buf2);
+ if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(buf, buf2, sizeof(*buf));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_write(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Writing to a pipe or FIFO could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ const int syscallno = SYS_write;
+ int fd = call->args[0];
+ const void* buf = (const void*)call->args[1];
+ size_t count = call->args[2];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, fd, buf, count);
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+/* On x86-32, pread/pwrite take the offset in two registers. We don't bother
+ * handling that.
+ */
+#if !defined(__i386__)
+static long sys_pwrite64(struct syscall_info* call) {
+ const int syscallno = SYS_pwrite64;
+ int fd = call->args[0];
+ const void* buf = (const void*)call->args[1];
+ size_t count = call->args[2];
+ off_t offset = call->args[3];
+
+ enum syscallbuf_fd_classes cls = fd_class(fd);
+ if (cls == FD_CLASS_TRACED) {
+ return traced_raw_syscall(call);
+ }
+ void* ptr = prep_syscall();
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
+ return traced_raw_syscall(call);
+ }
+
+ long ret;
+ if (cls == FD_CLASS_PROC_MEM) {
+ ret = untraced_replay_assist_syscall4(syscallno, fd, buf, count, offset);
+ } else {
+ ret = untraced_syscall4(syscallno, fd, buf, count, offset);
+ }
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+#endif
+
+static long sys_writev(struct syscall_info* call) {
+ if (force_traced_syscall_for_chaos_mode()) {
+ /* Writing to a pipe or FIFO could unblock a higher priority task */
+ return traced_raw_syscall(call);
+ }
+
+ int syscallno = SYS_writev;
+ int fd = call->args[0];
+ const struct iovec* iov = (const struct iovec*)call->args[1];
+ unsigned long iovcnt = call->args[2];
+
+ void* ptr = prep_syscall_for_fd(fd);
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall3(syscallno, fd, iov, iovcnt);
+
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_prctl(struct syscall_info* call) {
+ int syscallno = SYS_prctl;
+ long option = call->args[0];
+ unsigned long arg2 = call->args[1];
+ unsigned long arg3 = call->args[2];
+ unsigned long arg4 = call->args[3];
+ unsigned long arg5 = call->args[4];
+
+ if (option != PR_SET_NAME) {
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall();
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_replay_assist_syscall5(syscallno, option, arg2, arg3, arg4, arg5);
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_set_robust_list(struct syscall_info* call) {
+ int syscallno = SYS_set_robust_list;
+ void* head = (void*)call->args[0];
+ size_t len = call->args[1];
+ long ret;
+
+ assert(syscallno == call->no);
+
+ /* Avoid len values we don't support via our buffering mechanism */
+ if (len == 0 || len >= UINT32_MAX) {
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall();
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall2(syscallno, head, len);
+ if (!ret) {
+ thread_locals->robust_list.head = head;
+ thread_locals->robust_list.len = len;
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+#if defined(SYS_rseq)
+static long sys_rseq(struct syscall_info* call) {
+ int syscallno = SYS_rseq;
+ struct rr_rseq* rseq = (struct rr_rseq*)call->args[0];
+ size_t rseq_len = call->args[1];
+ int flags = call->args[2];
+ uint32_t sig = call->args[3];
+
+ assert(syscallno == call->no);
+
+ if (flags || ((uintptr_t)rseq & 31) || rseq_len != sizeof(*rseq) ||
+ thread_locals->rseq_called || globals.cpu_binding < 0) {
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall();
+ /* Allow buffering only for the simplest case: setting up the
+ initial rseq, all parameters OK and CPU binding in place. */
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ /* We don't actually need to make a syscall since rr is
+ going to emulate everything. */
+ rseq->cpu_id_start = rseq->cpu_id = globals.cpu_binding;
+ thread_locals->rseq_called = 1;
+ thread_locals->rseq.rseq = rseq;
+ thread_locals->rseq.len = rseq_len;
+ thread_locals->rseq.sig = sig;
+ /* We do need to commit a syscallbuf record to ensure that flushing
+ happens with associated processing. */
+ return commit_raw_syscall(syscallno, ptr, 0);
+}
+#endif
+
+static long sys_ptrace(struct syscall_info* call) {
+ int syscallno = SYS_ptrace;
+ long request = call->args[0];
+ pid_t pid = call->args[1];
+ void* addr = (void*)call->args[2];
+ void* data = (void*)call->args[3];
+
+ if (request != PTRACE_PEEKDATA || !data) {
+ return traced_raw_syscall(call);
+ }
+
+ /* We try to emulate PTRACE_PEEKDATA using process_vm_readv. That might not
+ * work for permissions reasons; if it fails for any reason, we retry with
+ * a traced syscall.
+ * This does mean that if a process issues a PTRACE_PEEKDATA while not
+ * actually ptracing the target, it might succeed under rr whereas normally
+ * it would have failed. That's hard to avoid and unlikely to be a real
+ * problem in practice (typically it would fail on some other ptrace call like
+ * PTRACE_GETREGS before or after the PEEKDATA).
+ */
+ void* ptr = prep_syscall();
+ long ret;
+ void* data2;
+
+ assert(syscallno == call->no);
+ syscallno = SYS_process_vm_readv;
+
+ data2 = ptr;
+ ptr += sizeof(long);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ struct iovec local_iov = { data2, sizeof(long) };
+ struct iovec remote_iov = { addr, sizeof(long) };
+ ret = untraced_syscall6(syscallno, pid, &local_iov, 1, &remote_iov, 1, 0);
+ if (ret > 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(data, data2, ret);
+ }
+ commit_raw_syscall(syscallno, ptr, ret);
+
+ if (ret != sizeof(long)) {
+ return traced_raw_syscall(call);
+ }
+ return ret;
+}
+
+static long sys_getrusage(struct syscall_info* call) {
+ const int syscallno = SYS_getrusage;
+ int who = (int)call->args[0];
+ struct rusage* buf = (struct rusage*)call->args[1];
+ void* ptr = prep_syscall();
+ long ret;
+ struct rusage* buf2 = NULL;
+
+ assert(syscallno == call->no);
+
+ if (buf) {
+ buf2 = ptr;
+ ptr += sizeof(struct rusage);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ ret = untraced_syscall2(syscallno, who, buf2);
+ if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ local_memcpy(buf, buf2, sizeof(*buf));
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_rt_sigprocmask(struct syscall_info* call) {
+ const int syscallno = SYS_rt_sigprocmask;
+ long ret;
+ kernel_sigset_t modified_set;
+ void* oldset2;
+ struct syscallbuf_hdr* hdr;
+
+ if (call->args[3] != sizeof(kernel_sigset_t)) {
+ // Unusual sigset size. Bail.
+ return traced_raw_syscall(call);
+ }
+
+ void* ptr = prep_syscall();
+
+ int how = (int)call->args[0];
+ const kernel_sigset_t* set = (const kernel_sigset_t*)call->args[1];
+ kernel_sigset_t* oldset = (kernel_sigset_t*)call->args[2];
+
+ oldset2 = ptr;
+ ptr += sizeof(kernel_sigset_t);
+
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ if (set && (how == SIG_BLOCK || how == SIG_SETMASK)) {
+ local_memcpy(&modified_set, set, sizeof(kernel_sigset_t));
+ // SIGSTKFLT (PerfCounters::TIME_SLICE_SIGNAL) and the desched signal
+ // (SIGPWR / SYSCALLBUF_DESCHED_SIGNAL by default) are used by rr.
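+ // (For example, with those defaults on x86: SIGSTKFLT == 16 and
+ // SIGPWR == 30, so bits 15 and 29 of the mask are cleared below.)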
+ modified_set &=
+ ~(((uint64_t)1) << (SIGSTKFLT - 1)) &
+ ~(((uint64_t)1) << (globals.desched_sig - 1));
+ set = &modified_set;
+ }
+
+ hdr = buffer_hdr();
+ hdr->in_sigprocmask_critical_section = 1;
+
+ ret =
+ untraced_syscall4(syscallno, how, set, oldset2, sizeof(kernel_sigset_t));
+ if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ if (oldset) {
+ local_memcpy(oldset, oldset2, sizeof(kernel_sigset_t));
+ }
+ if (set) {
+ kernel_sigset_t previous_set;
+ local_memcpy(&previous_set, oldset2, sizeof(kernel_sigset_t));
+ switch (how) {
+ case SIG_UNBLOCK:
+ previous_set &= ~*set;
+ break;
+ case SIG_BLOCK:
+ previous_set |= *set;
+ break;
+ case SIG_SETMASK:
+ previous_set = *set;
+ break;
+ }
+ hdr->blocked_sigs = previous_set;
+ // We must update the generation last to ensure that an update is not
+ // lost.
+ ++hdr->blocked_sigs_generation;
+ }
+ }
+ hdr->in_sigprocmask_critical_section = 0;
+
+ commit_raw_syscall(syscallno, ptr, ret);
+
+ if (ret == -EAGAIN) {
+ // The rr supervisor emulated EAGAIN because there was a pending signal.
+ // Retry using a traced syscall so the pending signal(s) can be delivered.
+ return traced_raw_syscall(call);
+ }
+ return ret;
+}
+
+static long sys_sigaltstack(struct syscall_info* call) {
+ const int syscallno = SYS_sigaltstack;
+ stack_t* ss = (void*)call->args[0];
+ stack_t* old_ss = (void*)call->args[1];
+
+ void* ptr = prep_syscall();
+ stack_t* old_ss2 = NULL;
+ long ret;
+
+ assert(syscallno == call->no);
+
+ if (old_ss) {
+ old_ss2 = ptr;
+ ptr += sizeof(*old_ss2);
+ }
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+ ret = untraced_syscall2(syscallno, ss, old_ss2);
+ if (old_ss && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
+ /* This is small and won't get optimized to a memcpy call outside
+ our library. */
+ *old_ss = *old_ss2;
+ }
+ return commit_raw_syscall(syscallno, ptr, ret);
+}
+
+static long sys_rrcall_rdtsc(struct syscall_info* call) {
+#if defined(__i386__) || defined(__x86_64__)
+ const int syscallno = SYS_rrcall_rdtsc;
+ uint32_t tsc[2];
+ void* ptr = prep_syscall();
+ void* buf = ptr;
+ ptr += 8;
+ if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
+ return traced_raw_syscall(call);
+ }
+
+ // Do an RDTSC without context-switching to rr. This is still a lot slower
+ // than a plain RDTSC. Maybe we could do something better with RDPMC...
+ privileged_unrecorded_syscall5(SYS_prctl, PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);
+ rdtsc_recording_only(buf);
+ privileged_unrecorded_syscall5(SYS_prctl, PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
+
+ local_memcpy(tsc, buf, sizeof(tsc));
+ // Overwrite RDX (syscall arg 3) with the high half of the TSC; the low
+ // half is returned as the syscall result, mirroring RDTSC's EDX:EAX output.
+ call->args[2] = tsc[1];
+ return commit_raw_syscall(syscallno, ptr, tsc[0]);
+#else
+ (void)call;
+ fatal("RDTSC not supported in this architecture");
+ return 0;
+#endif
+}
+
+static long syscall_hook_internal(struct syscall_info* call) {
+ switch (call->no) {
+#define CASE(syscallname) \
+ case SYS_##syscallname: \
+ return sys_##syscallname(call)
+#define CASE_GENERIC_NONBLOCKING(syscallname) \
+ case SYS_##syscallname: \
+ return sys_generic_nonblocking(call)
+#define CASE_GENERIC_NONBLOCKING_FD(syscallname) \
+ case SYS_##syscallname: \
+ return sys_generic_nonblocking_fd(call)
+ CASE(rrcall_rdtsc);
+#if defined(SYS_access)
+ CASE_GENERIC_NONBLOCKING(access);
+#endif
+ CASE(clock_gettime);
+#if defined(SYS_clock_gettime64)
+ CASE(clock_gettime64);
+#endif
+ CASE_GENERIC_NONBLOCKING_FD(close);
+#if defined(SYS_creat)
+ CASE(creat);
+#endif
+ CASE_GENERIC_NONBLOCKING_FD(dup);
+#if defined(SYS_epoll_wait)
+case SYS_epoll_wait:
+#endif
+case SYS_epoll_pwait:
+ return sys_epoll_wait(call);
+#if defined(SYS_epoll_pwait2)
+ CASE(epoll_pwait2);
+#endif
+ CASE_GENERIC_NONBLOCKING_FD(fadvise64);
+ CASE_GENERIC_NONBLOCKING(fchmod);
+#if defined(SYS_fcntl64)
+ CASE(fcntl64);
+#else
+ CASE(fcntl);
+#endif
+ CASE(fgetxattr);
+ CASE(flistxattr);
+ CASE_GENERIC_NONBLOCKING_FD(fsetxattr);
+ CASE_GENERIC_NONBLOCKING_FD(ftruncate);
+ CASE(futex);
+#if defined(SYS_getdents)
+ CASE(getdents);
+#endif
+ CASE(getdents64);
+ CASE_GENERIC_NONBLOCKING(getegid);
+ CASE_GENERIC_NONBLOCKING(geteuid);
+ CASE_GENERIC_NONBLOCKING(getgid);
+ CASE_GENERIC_NONBLOCKING(getpid);
+ CASE_GENERIC_NONBLOCKING(getppid);
+ CASE(getrandom);
+ CASE(getrusage);
+ CASE_GENERIC_NONBLOCKING(gettid);
+ CASE(gettimeofday);
+ CASE_GENERIC_NONBLOCKING(getuid);
+ CASE(getxattr);
+ CASE(ioctl);
+#if defined(SYS_lchown)
+ CASE_GENERIC_NONBLOCKING(lchown);
+#endif
+ CASE(lgetxattr);
+ CASE(listxattr);
+ CASE(llistxattr);
+#if defined(SYS__llseek)
+ CASE(_llseek);
+#endif
+ CASE_GENERIC_NONBLOCKING_FD(lseek);
+ CASE(madvise);
+#if defined(SYS_mkdir)
+ CASE_GENERIC_NONBLOCKING(mkdir);
+#endif
+#if defined(SYS_mknod)
+ CASE_GENERIC_NONBLOCKING(mknod);
+#endif
+ CASE(mprotect);
+#if defined(SYS_open)
+ CASE(open);
+#endif
+ CASE(openat);
+#if defined(SYS_poll)
+ CASE(poll);
+#endif
+#if defined(SYS_ppoll)
+ CASE(ppoll);
+#endif
+ CASE(prctl);
+#if !defined(__i386__)
+ CASE(pread64);
+ CASE(pwrite64);
+#endif
+ CASE(ptrace);
+ CASE(quotactl);
+ CASE(read);
+#if defined(SYS_readlink)
+ CASE(readlink);
+#endif
+ case SYS_readlinkat:
+ return sys_readlinkat(call, 0);
+#if defined(SYS_recvfrom)
+ CASE(recvfrom);
+#endif
+#if defined(SYS_recvmsg)
+ CASE(recvmsg);
+#endif
+#if defined(SYS_rseq)
+ CASE(rseq);
+#endif
+#if defined(SYS_rmdir)
+ CASE_GENERIC_NONBLOCKING(rmdir);
+#endif
+ CASE(rt_sigprocmask);
+#if defined(SYS_sendmsg)
+ CASE(sendmsg);
+#endif
+#if defined(SYS_sendto)
+ CASE(sendto);
+#endif
+ CASE(set_robust_list);
+#if defined(SYS_setsockopt)
+ CASE(setsockopt);
+#endif
+#if defined(SYS_getsockopt)
+ CASE(getsockopt);
+#endif
+#if defined(SYS_getsockname)
+ CASE(getsockname);
+#endif
+ CASE_GENERIC_NONBLOCKING(setxattr);
+ CASE(sigaltstack);
+#if defined(SYS_socketcall)
+ CASE(socketcall);
+#endif
+#if defined(SYS_socketpair)
+ CASE(socketpair);
+#endif
+#if defined(SYS_symlink)
+ CASE_GENERIC_NONBLOCKING(symlink);
+#endif
+#if defined(SYS_time)
+ CASE(time);
+#endif
+ CASE_GENERIC_NONBLOCKING(truncate);
+ CASE(uname);
+#if defined(SYS_unlink)
+ CASE_GENERIC_NONBLOCKING(unlink);
+#endif
+ CASE_GENERIC_NONBLOCKING(unlinkat);
+ CASE_GENERIC_NONBLOCKING_FD(utimensat);
+ CASE(write);
+ CASE(writev);
+#if defined(SYS_fstat64)
+ case SYS_fstat64:
+#elif defined(SYS_fstat)
+ case SYS_fstat:
+#endif
+#if defined(SYS_lstat64)
+ case SYS_lstat64:
+#elif defined(SYS_lstat)
+ case SYS_lstat:
+#endif
+#if defined(SYS_stat64)
+ case SYS_stat64:
+#elif defined(SYS_stat)
+ case SYS_stat:
+#endif
+ return sys_xstat64(call);
+#if defined(SYS_statx)
+ case SYS_statx:
+ return sys_statx(call);
+#endif
+ case SYS_statfs:
+ case SYS_fstatfs:
+ return sys_statfs(call);
+#if defined(SYS_newfstatat)
+ case SYS_newfstatat:
+#elif defined(SYS_fstatat64)
+ case SYS_fstatat64:
+#endif
+ return sys_fstatat(call);
+#undef CASE
+#undef CASE_GENERIC_NONBLOCKING
+#undef CASE_GENERIC_NONBLOCKING_FD
+ default:
+ return traced_raw_syscall(call);
+ }
+}
+
+/* Delay for testing purposes */
+static void do_delay(void) {
+ int i;
+ int result = 0;
+ for (i = 0; i < 10000000; ++i) {
+ result += i * i;
+ }
+ // Make sure result is used so this doesn't get optimized away
+ impose_syscall_delay = result | 1;
+}
+
+/* Explicitly declare this as hidden so we can call it from
+ * _syscall_hook_trampoline without doing all sorts of special PIC handling.
+ */
+RR_HIDDEN long syscall_hook(struct syscall_info* call) {
+ // Initialize thread-local state if this is the first syscall for this
+ // thread.
+ init_thread();
+
+ if (!thread_locals->buffer || buffer_hdr()->locked) {
+ /* We may be reentering via a signal handler. Bail. */
+ return traced_raw_syscall(call);
+ }
+
+ thread_locals->original_syscall_parameters = call;
+
+ if (impose_syscall_delay) {
+ do_delay();
+ }
+
+ long result = syscall_hook_internal(call);
+ if (buffer_hdr() && buffer_hdr()->notify_on_syscall_hook_exit) {
+ // Sometimes a signal is delivered to interrupt an untraced syscall in
+ // a non-restartable way (e.g. seccomp SIGSYS). Those signals must be
+ // handled outside any syscallbuf transactions. We defer them until
+ // this SYS_rrcall_notify_syscall_hook_exit, which is triggered by rr
+ // setting notify_on_syscall_hook_exit. The parameters to the
+ // SYS_rrcall_notify_syscall_hook_exit are magical and fully control
+ // the syscall parameters and result seen by the signal handler.
+ //
+ // SYS_rrcall_notify_syscall_hook_exit will clear
+ // notify_on_syscall_hook_exit. Clearing it ourselves is tricky to get
+ // right without races.
+ //
+ // During recording, this flag is set when the recorder needs to delay
+ // delivery of a signal until we've stopped using the syscallbuf.
+ // During replay, this flag is set when the next event is entering a
+ // SYS_rrcall_notify_syscall_hook_exit.
+ //
+ // The correctness argument is as follows:
+ // Correctness requires that a) replay's setting of the flag happens before
+ // we read the flag in the call to syscall_hook that triggered the
+ // SYS_rrcall_notify_syscall_hook_exit and b) replay's setting of the flag
+ // must happen after we read the flag in the previous execution of
+ // syscall_hook.
+ // Condition a) holds as long as no events are recorded between the
+ // checking of the flag above and the execution of this syscall. This
+ // should be the case; no synchronous signals or syscalls are
+ // triggerable, all async signals other than SYSCALLBUF_DESCHED_SIGNAL
+ // are delayed, and SYSCALLBUF_DESCHED_SIGNAL shouldn't fire since we've
+ // disarmed the desched fd at this point. SYSCALLBUF_FLUSH events may be
+ // emitted when we process the SYS_rrcall_notify_syscall_hook_exit event,
+ // but replay of those events ends at the last flushed syscall, before
+ // we exit syscall_hook_internal.
+ // Condition b) failing would mean no new events were generated between
+ // testing the flag in the previous syscall_hook and the execution of this
+ // SYS_rrcall_notify_syscall_hook_exit. However, every invocation of
+ // syscall_hook_internal generates either a traced syscall or a syscallbuf
+ // record that would be flushed by SYSCALLBUF_FLUSH, so that can't
+ // happen.
+ result = _raw_syscall(SYS_rrcall_notify_syscall_hook_exit, call->args[0],
+ call->args[1], call->args[2], call->args[3],
+ call->args[4], call->args[5],
+ RR_PAGE_SYSCALL_PRIVILEGED_TRACED, result, call->no);
+ }
+ // Do work that can only be safely done after syscallbuf can be flushed
+ if (thread_locals->notify_control_msg) {
+ privileged_traced_syscall1(SYS_rrcall_notify_control_msg,
+ thread_locals->notify_control_msg);
+ thread_locals->notify_control_msg = NULL;
+ }
+ thread_locals->original_syscall_parameters = NULL;
+ return result;
+}
diff --git a/rr/android/x86_64/share/rr/src/preload/syscallbuf.h b/rr/android/x86_64/share/rr/src/preload/syscallbuf.h
new file mode 100644
index 0000000..84e87d3
--- /dev/null
+++ b/rr/android/x86_64/share/rr/src/preload/syscallbuf.h
@@ -0,0 +1,15 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#ifndef RR_SYSCALLBUF_H_
+#define RR_SYSCALLBUF_H_
+
+struct timespec;
+
+#define RR_HIDDEN __attribute__((visibility("hidden")))
+
+RR_HIDDEN extern struct preload_globals globals;
+
+RR_HIDDEN extern char impose_syscall_delay;
+RR_HIDDEN extern char impose_spurious_desched;
+
+#endif /* RR_SYSCALLBUF_H_ */