//! A Linux mechanism for handling page faults in user space.
//!
//! The main way to interact with this library is to create a `Uffd` object with a `UffdBuilder`,
//! then use the methods of `Uffd` from a worker thread.
//!
//! See [`userfaultfd(2)`](http://man7.org/linux/man-pages/man2/userfaultfd.2.html) and
//! [`ioctl_userfaultfd(2)`](http://man7.org/linux/man-pages/man2/ioctl_userfaultfd.2.html) for more
//! details.
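//!
//! # Examples
//!
//! A minimal sketch of the usual setup, assuming `region` and `len` describe a page-aligned
//! area obtained from `mmap` (error handling and the fault-servicing loop are elided):
//!
//! ```rust
//! # use userfaultfd::{Result, Uffd, UffdBuilder};
//! # use libc::c_void;
//! fn setup(region: *mut c_void, len: usize) -> Result<Uffd> {
//!     let uffd = UffdBuilder::new()
//!         .close_on_exec(true)
//!         .non_blocking(false)
//!         .create()?;
//!     // Register the range for missing page faults; a worker thread can then service
//!     // events with `read_event`/`read_events`, `copy`, and `zeropage`.
//!     uffd.register(region, len)?;
//!     Ok(uffd)
//! }
//! ```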
mod builder;
mod error;
mod event;
mod raw;
pub use crate::builder::{FeatureFlags, UffdBuilder};
pub use crate::error::{Error, Result};
pub use crate::event::{Event, FaultKind, ReadWrite};
use bitflags::bitflags;
use libc::{self, c_void};
use nix::errno::Errno;
use nix::unistd::read;
use std::mem;
use std::os::fd::{AsFd, BorrowedFd};
use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd, RawFd};
/// Represents an opaque buffer where userfaultfd events are stored.
///
/// This is used in conjunction with [`Uffd::read_events`].
pub struct EventBuffer(Vec<raw::uffd_msg>);
impl EventBuffer {
    /// Creates a new buffer with room for `size` events.
///
/// [`Uffd::read_events`] will read up to this many events at a time.
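    ///
    /// # Examples
    ///
    /// A small example of creating a buffer (the capacity of 16 is arbitrary):
    ///
    /// ```rust
    /// # use userfaultfd::EventBuffer;
    /// // Room for up to 16 events per call to `Uffd::read_events`.
    /// let _buf = EventBuffer::new(16);
    /// ```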
pub fn new(size: usize) -> Self {
Self(vec![unsafe { mem::zeroed() }; size])
}
}
/// The userfaultfd object.
///
/// The userspace representation of the object is a file descriptor, so this type implements
/// `AsFd`, `AsRawFd`, `FromRawFd`, and `IntoRawFd`. These methods should be used with caution,
/// but can be essential for using functions like `poll` on a worker thread.
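///
/// # Examples
///
/// A sketch of waiting for the descriptor to become readable with `poll(2)` before reading an
/// event (this assumes the `libc` crate is available to the caller; a real worker loop would
/// also check for errors and `POLLERR`/`POLLHUP`):
///
/// ```rust
/// # use userfaultfd::Uffd;
/// use std::os::unix::io::AsRawFd;
///
/// fn wait_readable(uffd: &Uffd, timeout_ms: i32) -> bool {
///     let mut pollfd = libc::pollfd {
///         fd: uffd.as_raw_fd(),
///         events: libc::POLLIN,
///         revents: 0,
///     };
///     // poll(2) returns the number of ready descriptors, 0 on timeout, -1 on error.
///     let n = unsafe { libc::poll(&mut pollfd, 1, timeout_ms) };
///     n == 1 && (pollfd.revents & libc::POLLIN) != 0
/// }
/// ```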
#[derive(Debug)]
pub struct Uffd {
fd: RawFd,
}
impl Drop for Uffd {
fn drop(&mut self) {
unsafe { libc::close(self.fd) };
}
}
impl AsFd for Uffd {
fn as_fd(&self) -> BorrowedFd<'_> {
unsafe { BorrowedFd::borrow_raw(self.as_raw_fd()) }
}
}
impl AsRawFd for Uffd {
fn as_raw_fd(&self) -> RawFd {
self.fd
}
}
impl IntoRawFd for Uffd {
fn into_raw_fd(self) -> RawFd {
self.fd
}
}
impl FromRawFd for Uffd {
unsafe fn from_raw_fd(fd: RawFd) -> Self {
Uffd { fd }
}
}
bitflags! {
/// The registration mode used when registering an address range with `Uffd`.
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct RegisterMode: u64 {
/// Registers the range for missing page faults.
const MISSING = raw::UFFDIO_REGISTER_MODE_MISSING;
/// Registers the range for write faults.
#[cfg(feature = "linux5_7")]
const WRITE_PROTECT = raw::UFFDIO_REGISTER_MODE_WP;
}
}
impl Uffd {
    /// Register a memory address range with the userfaultfd object, and return the `IoctlFlags`
/// that are available for the selected range.
///
/// This method only registers the given range for missing page faults.
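    ///
    /// # Examples
    ///
    /// A minimal sketch, assuming `region` points to a page-aligned mapping of `len` bytes:
    ///
    /// ```rust
    /// # use userfaultfd::{IoctlFlags, Result, Uffd};
    /// # use libc::c_void;
    /// fn register_region(uffd: &Uffd, region: *mut c_void, len: usize) -> Result<()> {
    ///     let ioctls = uffd.register(region, len)?;
    ///     // The returned flags describe which resolution ioctls the range supports,
    ///     // e.g. whether `copy` or `zeropage` may be used on it.
    ///     let _supports_zeropage = ioctls.contains(IoctlFlags::ZEROPAGE);
    ///     Ok(())
    /// }
    /// ```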
pub fn register(&self, start: *mut c_void, len: usize) -> Result<IoctlFlags> {
self.register_with_mode(start, len, RegisterMode::MISSING)
}
/// Register a memory address range with the userfaultfd object for the given mode and
    /// return the `IoctlFlags` that are available for the selected range.
pub fn register_with_mode(
&self,
start: *mut c_void,
len: usize,
mode: RegisterMode,
) -> Result<IoctlFlags> {
let mut register = raw::uffdio_register {
range: raw::uffdio_range {
start: start as u64,
len: len as u64,
},
mode: mode.bits(),
ioctls: 0,
};
unsafe {
raw::register(self.as_raw_fd(), &mut register as *mut raw::uffdio_register)?;
}
IoctlFlags::from_bits(register.ioctls).ok_or(Error::UnrecognizedIoctls(register.ioctls))
}
/// Unregister a memory address range from the userfaultfd object.
pub fn unregister(&self, start: *mut c_void, len: usize) -> Result<()> {
let mut range = raw::uffdio_range {
start: start as u64,
len: len as u64,
};
unsafe {
raw::unregister(self.as_raw_fd(), &mut range as *mut raw::uffdio_range)?;
}
Ok(())
}
    /// Atomically copy a contiguous memory chunk into the userfaultfd-registered range, and return
/// the number of bytes that were successfully copied.
///
/// If `wake` is `true`, wake up the thread waiting for page fault resolution on the memory
/// range.
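    ///
    /// # Examples
    ///
    /// A sketch of resolving a missing-page fault by copying a prepared source page into the
    /// faulting address (`src_page` and `fault_addr` are assumed to be valid, page-aligned
    /// pointers supplied by the caller):
    ///
    /// ```rust
    /// # use userfaultfd::{Result, Uffd};
    /// # use libc::c_void;
    /// fn resolve_with_copy(
    ///     uffd: &Uffd,
    ///     src_page: *const c_void,
    ///     fault_addr: *mut c_void,
    ///     page_size: usize,
    /// ) -> Result<usize> {
    ///     // Copy one page into the registered range and wake the faulting thread.
    ///     unsafe { uffd.copy(src_page, fault_addr, page_size, true) }
    /// }
    /// ```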
pub unsafe fn copy(
&self,
src: *const c_void,
dst: *mut c_void,
len: usize,
wake: bool,
) -> Result<usize> {
let mut copy = raw::uffdio_copy {
src: src as u64,
dst: dst as u64,
len: len as u64,
mode: if wake {
0
} else {
raw::UFFDIO_COPY_MODE_DONTWAKE
},
copy: 0,
};
let _ =
raw::copy(self.as_raw_fd(), &mut copy as *mut raw::uffdio_copy).map_err(|errno| {
match errno {
Errno::EAGAIN => Error::PartiallyCopied(copy.copy as usize),
_ => Error::CopyFailed(errno),
}
})?;
if copy.copy < 0 {
// shouldn't ever get here, as errno should be caught above
Err(Error::CopyFailed(Errno::from_i32(-copy.copy as i32)))
} else {
Ok(copy.copy as usize)
}
}
/// Zero out a memory address range registered with userfaultfd, and return the number of bytes
/// that were successfully zeroed.
///
/// If `wake` is `true`, wake up the thread waiting for page fault resolution on the memory
/// address range.
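    ///
    /// # Examples
    ///
    /// A sketch of resolving a missing-page fault with zeroed memory (`fault_addr` is assumed
    /// to be a page-aligned address inside a registered range):
    ///
    /// ```rust
    /// # use userfaultfd::{Result, Uffd};
    /// # use libc::c_void;
    /// fn resolve_with_zeropage(
    ///     uffd: &Uffd,
    ///     fault_addr: *mut c_void,
    ///     page_size: usize,
    /// ) -> Result<usize> {
    ///     // Zero one page and wake the faulting thread.
    ///     unsafe { uffd.zeropage(fault_addr, page_size, true) }
    /// }
    /// ```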
pub unsafe fn zeropage(&self, start: *mut c_void, len: usize, wake: bool) -> Result<usize> {
let mut zeropage = raw::uffdio_zeropage {
range: raw::uffdio_range {
start: start as u64,
len: len as u64,
},
mode: if wake {
0
} else {
raw::UFFDIO_ZEROPAGE_MODE_DONTWAKE
},
zeropage: 0,
};
        let _ = raw::zeropage(self.as_raw_fd(), &mut zeropage as *mut raw::uffdio_zeropage)
.map_err(Error::ZeropageFailed)?;
if zeropage.zeropage < 0 {
// shouldn't ever get here, as errno should be caught above
Err(Error::ZeropageFailed(Errno::from_i32(
-zeropage.zeropage as i32,
)))
} else {
Ok(zeropage.zeropage as usize)
}
}
/// Wake up the thread waiting for page fault resolution on the specified memory address range.
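    ///
    /// # Examples
    ///
    /// A sketch of populating a page without waking the faulting thread and then waking it
    /// separately (`src` and `addr` are assumed to be valid, page-aligned pointers supplied by
    /// the caller):
    ///
    /// ```rust
    /// # use userfaultfd::{Result, Uffd};
    /// # use libc::c_void;
    /// fn populate_then_wake(
    ///     uffd: &Uffd,
    ///     src: *const c_void,
    ///     addr: *mut c_void,
    ///     page_size: usize,
    /// ) -> Result<()> {
    ///     // Populate the page but defer the wake-up (`wake == false`)...
    ///     unsafe { uffd.copy(src, addr, page_size, false)? };
    ///     // ...then wake the faulting thread explicitly once everything is in place.
    ///     uffd.wake(addr, page_size)
    /// }
    /// ```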
pub fn wake(&self, start: *mut c_void, len: usize) -> Result<()> {
let mut range = raw::uffdio_range {
start: start as u64,
len: len as u64,
};
unsafe {
raw::wake(self.as_raw_fd(), &mut range as *mut raw::uffdio_range)?;
}
Ok(())
}
/// Makes a range write-protected.
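    ///
    /// # Examples
    ///
    /// A sketch of write-protecting an already-populated page (`addr` is assumed to be
    /// page-aligned and inside a range registered with `RegisterMode::WRITE_PROTECT`):
    ///
    /// ```rust
    /// # use userfaultfd::{Result, Uffd};
    /// # use libc::c_void;
    /// fn protect_page(uffd: &Uffd, addr: *mut c_void, page_size: usize) -> Result<()> {
    ///     uffd.write_protect(addr, page_size)
    /// }
    /// ```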
#[cfg(feature = "linux5_7")]
pub fn write_protect(&self, start: *mut c_void, len: usize) -> Result<()> {
let mut ioctl = raw::uffdio_writeprotect {
range: raw::uffdio_range {
start: start as u64,
len: len as u64,
},
mode: raw::UFFDIO_WRITEPROTECT_MODE_WP,
};
unsafe {
raw::write_protect(
self.as_raw_fd(),
&mut ioctl as *mut raw::uffdio_writeprotect,
)?;
}
Ok(())
}
/// Removes the write-protection for a range.
///
/// If `wake` is `true`, wake up the thread waiting for page fault resolution on the memory
/// address range.
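    ///
    /// # Examples
    ///
    /// A sketch of resolving a write-protect fault by lifting the protection and waking the
    /// faulting thread (`addr` is assumed to be page-aligned and inside a registered range):
    ///
    /// ```rust
    /// # use userfaultfd::{Result, Uffd};
    /// # use libc::c_void;
    /// fn unprotect_and_wake(uffd: &Uffd, addr: *mut c_void, page_size: usize) -> Result<()> {
    ///     uffd.remove_write_protection(addr, page_size, true)
    /// }
    /// ```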
#[cfg(feature = "linux5_7")]
pub fn remove_write_protection(
&self,
start: *mut c_void,
len: usize,
wake: bool,
) -> Result<()> {
let mut ioctl = raw::uffdio_writeprotect {
range: raw::uffdio_range {
start: start as u64,
len: len as u64,
},
mode: if wake {
0
} else {
raw::UFFDIO_WRITEPROTECT_MODE_DONTWAKE
},
};
unsafe {
raw::write_protect(
self.as_raw_fd(),
&mut ioctl as *mut raw::uffdio_writeprotect,
)?;
}
Ok(())
}
/// Read an `Event` from the userfaultfd object.
///
/// If the `Uffd` object was created with `non_blocking` set to `false`, this will block until
    /// an event is successfully read (returning `Some(event)`), or an error is returned.
///
/// If `non_blocking` was `true`, this will immediately return `None` if no event is ready to
/// read.
///
/// Note that while this method doesn't require a mutable reference to the `Uffd` object, it
/// does consume bytes (thread-safely) from the underlying file descriptor.
///
/// # Examples
///
/// ```rust
/// # use userfaultfd::{Uffd, Result};
/// fn read_event(uffd: &Uffd) -> Result<()> {
/// // Read a single event
/// match uffd.read_event()? {
/// Some(e) => {
/// // Do something with the event
/// },
/// None => {
/// // This was a non-blocking read and the descriptor was not ready for read
/// },
/// }
/// Ok(())
/// }
/// ```
pub fn read_event(&self) -> Result<Option<Event>> {
let mut buf = [unsafe { std::mem::zeroed() }; 1];
let mut iter = self.read(&mut buf)?;
let event = iter.next().transpose()?;
assert!(iter.next().is_none());
Ok(event)
}
/// Read multiple events from the userfaultfd object using the given event buffer.
///
/// If the `Uffd` object was created with `non_blocking` set to `false`, this will block until
/// an event is successfully read or an error is returned.
///
/// If `non_blocking` was `true`, this will immediately return an empty iterator if the file
/// descriptor is not ready for reading.
///
/// # Examples
///
/// ```rust
/// # use userfaultfd::{Uffd, EventBuffer};
/// fn read_events(uffd: &Uffd) -> userfaultfd::Result<()> {
/// // Read up to 100 events at a time
/// let mut buf = EventBuffer::new(100);
/// for event in uffd.read_events(&mut buf)? {
/// let event = event?;
/// // Do something with the event...
/// }
/// Ok(())
/// }
/// ```
pub fn read_events<'a>(
&self,
buf: &'a mut EventBuffer,
) -> Result<impl Iterator<Item = Result<Event>> + 'a> {
self.read(&mut buf.0)
}
fn read<'a>(
&self,
msgs: &'a mut [raw::uffd_msg],
) -> Result<impl Iterator<Item = Result<Event>> + 'a> {
const MSG_SIZE: usize = std::mem::size_of::<raw::uffd_msg>();
let buf = unsafe {
std::slice::from_raw_parts_mut(msgs.as_mut_ptr() as _, msgs.len() * MSG_SIZE)
};
let count = match read(self.as_raw_fd(), buf) {
Err(e) if e == Errno::EAGAIN => 0,
Err(e) => return Err(Error::SystemError(e)),
Ok(0) => return Err(Error::ReadEof),
Ok(bytes_read) => {
let remainder = bytes_read % MSG_SIZE;
if remainder != 0 {
return Err(Error::IncompleteMsg {
read: remainder,
expected: MSG_SIZE,
});
}
bytes_read / MSG_SIZE
}
};
Ok(msgs.iter().take(count).map(|msg| Event::from_uffd_msg(msg)))
}
}
bitflags! {
/// Used with `UffdBuilder` and `Uffd::register()` to determine which operations are available.
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IoctlFlags: u64 {
const REGISTER = 1 << raw::_UFFDIO_REGISTER;
const UNREGISTER = 1 << raw::_UFFDIO_UNREGISTER;
const WAKE = 1 << raw::_UFFDIO_WAKE;
const COPY = 1 << raw::_UFFDIO_COPY;
const ZEROPAGE = 1 << raw::_UFFDIO_ZEROPAGE;
#[cfg(feature = "linux5_7")]
const WRITE_PROTECT = 1 << raw::_UFFDIO_WRITEPROTECT;
const API = 1 << raw::_UFFDIO_API;
}
}
#[cfg(test)]
mod test {
use super::*;
use std::ptr;
use std::thread;
#[test]
fn test_read_event() -> Result<()> {
const PAGE_SIZE: usize = 4096;
unsafe {
let uffd = UffdBuilder::new().close_on_exec(true).create()?;
let mapping = libc::mmap(
ptr::null_mut(),
PAGE_SIZE,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_PRIVATE | libc::MAP_ANON,
-1,
0,
);
            assert_ne!(mapping, libc::MAP_FAILED);
uffd.register(mapping, PAGE_SIZE)?;
let ptr = mapping as usize;
let thread = thread::spawn(move || {
let ptr = ptr as *mut u8;
*ptr = 1;
});
match uffd.read_event()? {
Some(Event::Pagefault {
rw: ReadWrite::Write,
addr,
..
}) => {
assert_eq!(addr, mapping);
uffd.zeropage(addr, PAGE_SIZE, true)?;
}
_ => panic!("unexpected event"),
}
thread.join().expect("failed to join thread");
uffd.unregister(mapping, PAGE_SIZE)?;
assert_eq!(libc::munmap(mapping, PAGE_SIZE), 0);
}
Ok(())
}
#[test]
fn test_nonblocking_read_event() -> Result<()> {
const PAGE_SIZE: usize = 4096;
unsafe {
let uffd = UffdBuilder::new()
.close_on_exec(true)
.non_blocking(true)
.create()?;
let mapping = libc::mmap(
ptr::null_mut(),
PAGE_SIZE,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_PRIVATE | libc::MAP_ANON,
-1,
0,
);
            assert_ne!(mapping, libc::MAP_FAILED);
uffd.register(mapping, PAGE_SIZE)?;
assert!(uffd.read_event()?.is_none());
let ptr = mapping as usize;
let thread = thread::spawn(move || {
let ptr = ptr as *mut u8;
*ptr = 1;
});
loop {
match uffd.read_event()? {
Some(Event::Pagefault {
rw: ReadWrite::Write,
addr,
..
}) => {
assert_eq!(addr, mapping);
uffd.zeropage(addr, PAGE_SIZE, true)?;
break;
}
Some(_) => panic!("unexpected event"),
None => thread::sleep(std::time::Duration::from_millis(50)),
}
}
thread.join().expect("failed to join thread");
uffd.unregister(mapping, PAGE_SIZE)?;
assert_eq!(libc::munmap(mapping, PAGE_SIZE), 0);
}
Ok(())
}
#[test]
fn test_read_events() -> Result<()> {
unsafe {
const MAX_THREADS: usize = 5;
const PAGE_SIZE: usize = 4096;
const MEM_SIZE: usize = PAGE_SIZE * MAX_THREADS;
let uffd = UffdBuilder::new().close_on_exec(true).create()?;
let mapping = libc::mmap(
ptr::null_mut(),
MEM_SIZE,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_PRIVATE | libc::MAP_ANON,
-1,
0,
);
            assert_ne!(mapping, libc::MAP_FAILED);
uffd.register(mapping, MEM_SIZE)?;
            // Because accessing the memory suspends each thread on a page fault event,
            // the spawned threads have no way to signal that the faulting accesses the
            // test is waiting on have actually happened.
//
// Therefore, this is inherently racy. The best we can do is simply sleep-wait for
// all threads to have signaled that the operation is *about to be performed*.
let mut seen = [false; MAX_THREADS];
let mut threads = Vec::new();
for i in 0..MAX_THREADS {
let seen = &mut seen[i] as *mut _ as usize;
let ptr = (mapping as *mut u8).add(PAGE_SIZE * i) as usize;
threads.push(thread::spawn(move || {
let seen = seen as *mut bool;
let ptr = ptr as *mut u8;
*seen = true;
*ptr = 1;
}));
}
loop {
// Sleep even if all threads have "signaled", just in case any
// thread is preempted prior to faulting the memory access.
// Still, there's no guarantee that the call to `read_events` below will
// read all the events at once, but this should be "good enough".
let done = seen.iter().all(|b| *b);
thread::sleep(std::time::Duration::from_millis(50));
if done {
break;
}
}
// Read all the events at once
let mut buf = EventBuffer::new(MAX_THREADS);
let mut iter = uffd.read_events(&mut buf)?;
let mut seen = [false; MAX_THREADS];
for _ in 0..MAX_THREADS {
match iter
.next()
.transpose()?
.expect("failed to read all events; potential race condition was hit")
{
Event::Pagefault {
rw: ReadWrite::Write,
addr,
..
} => {
let index = (addr as usize - mapping as usize) / PAGE_SIZE;
assert_eq!(seen[index], false);
seen[index] = true;
uffd.zeropage(addr, PAGE_SIZE, true)?;
}
_ => panic!("unexpected event"),
}
}
assert!(seen.iter().all(|b| *b));
for thread in threads {
thread.join().expect("failed to join thread");
}
uffd.unregister(mapping, MEM_SIZE)?;
assert_eq!(libc::munmap(mapping, MEM_SIZE), 0);
}
Ok(())
}
#[cfg(feature = "linux5_7")]
#[test]
fn test_write_protect() -> Result<()> {
const PAGE_SIZE: usize = 4096;
unsafe {
let uffd = UffdBuilder::new()
.require_features(FeatureFlags::PAGEFAULT_FLAG_WP)
.close_on_exec(true)
.create()?;
let mapping = libc::mmap(
ptr::null_mut(),
PAGE_SIZE,
libc::PROT_READ | libc::PROT_WRITE,
libc::MAP_PRIVATE | libc::MAP_ANON,
-1,
0,
);
            assert_ne!(mapping, libc::MAP_FAILED);
// This test uses both missing and write-protect modes for a reason.
// The `uffdio_writeprotect` ioctl can only be used on a range *after*
// the missing fault is handled, it seems. This means we either need to
// read/write the page *before* we protect it or handle the missing
// page fault by changing the protection level *after* we zero the page.
assert!(uffd
.register_with_mode(
mapping,
PAGE_SIZE,
RegisterMode::MISSING | RegisterMode::WRITE_PROTECT
)?
.contains(IoctlFlags::WRITE_PROTECT));
let ptr = mapping as usize;
let thread = thread::spawn(move || {
let ptr = ptr as *mut u8;
*ptr = 1;
*ptr = 2;
});
loop {
match uffd.read_event()? {
Some(Event::Pagefault {
kind,
rw: ReadWrite::Write,
addr,
..
}) => match kind {
FaultKind::WriteProtected => {
assert_eq!(addr, mapping);
assert_eq!(*(addr as *const u8), 0);
// Remove the protection and wake the page
uffd.remove_write_protection(mapping, PAGE_SIZE, true)?;
break;
}
FaultKind::Missing => {
assert_eq!(addr, mapping);
uffd.zeropage(mapping, PAGE_SIZE, false)?;
// Technically, we already know it was a write that triggered
// the missing page fault, so there's little point in immediately
// write-protecting the page to cause another fault; in the real
// world, a missing fault with `rw` being `ReadWrite::Write` would
// be enough to mark the page as "dirty". For this test, however,
// we do it this way to ensure a write-protected fault is read.
assert_eq!(*(addr as *const u8), 0);
uffd.write_protect(mapping, PAGE_SIZE)?;
uffd.wake(mapping, PAGE_SIZE)?;
}
},
_ => panic!("unexpected event"),
}
}
thread.join().expect("failed to join thread");
assert_eq!(*(mapping as *const u8), 2);
uffd.unregister(mapping, PAGE_SIZE)?;
assert_eq!(libc::munmap(mapping, PAGE_SIZE), 0);
}
Ok(())
}
}