blob: f67de77a14cc085a0331acfb324ed77117b71209 [file] [log] [blame]
use std::{
io::{Read, Write},
path::{Path, PathBuf},
process::{Command, Stdio},
};
use bstr::{BStr, ByteSlice};
use gix_filter::{
driver::apply::{Delay, MaybeDelayed},
pipeline::convert::{ToGitOutcome, ToWorktreeOutcome},
};
use gix_object::tree::EntryKind;
use crate::blob::{Driver, Pipeline, ResourceKind};
/// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree.
#[derive(Clone, Debug, Default)]
pub struct WorktreeRoots {
/// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located.
pub old_root: Option<PathBuf>,
/// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located.
pub new_root: Option<PathBuf>,
}
impl WorktreeRoots {
/// Return the root path for the given `kind`
pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> {
match kind {
ResourceKind::OldOrSource => self.old_root.as_deref(),
ResourceKind::NewOrDestination => self.new_root.as_deref(),
}
}
}
/// Data as part of an [Outcome].
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
pub enum Data {
/// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`].
Buffer,
/// The size that the binary blob had at the given revision, without having applied filters, as it's either
/// considered binary or above the big-file threshold.
///
/// In this state, the binary file cannot be diffed.
Binary {
/// The size of the object prior to performing any filtering or as it was found on disk.
///
/// Note that technically, the size isn't always representative of the same 'state' of the
/// content, as once it can be the size of the blob in git, and once it's the size of file
/// in the worktree.
size: u64,
},
}
/// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)]
pub struct Outcome {
/// If available, an index into the `drivers` field to access more diff-related information of the driver for items
/// at the given path, as previously determined by git-attributes.
///
/// Note that drivers are queried even if there is no object available.
pub driver_index: Option<usize>,
/// The data itself, suitable for diffing, and if the object or worktree item is present at all.
pub data: Option<Data>,
}
/// Options for use in a [`Pipeline`].
#[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)]
pub struct Options {
/// The amount of bytes that an object has to reach before being treated as binary.
/// These objects will not be queried, nor will their data be processed in any way.
/// If `0`, no file is ever considered binary due to their size.
///
/// Note that for files stored in `git`, what counts is their stored, decompressed size,
/// thus `git-lfs` files would typically not be considered binary unless one explicitly sets
/// them
pub large_file_threshold_bytes: u64,
/// Capabilities of the file system which affect how we read worktree files.
pub fs: gix_fs::Capabilities,
}
/// The specific way to convert a resource.
#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum Mode {
/// Always prepare the version of the resource as it would be in the work-tree, and
/// apply binary-to-text filters if present.
///
/// This is typically free for resources in the worktree, and will apply filters to resources in the
/// object database.
#[default]
ToWorktreeAndBinaryToText,
/// Prepare the version of the resource as it would be in the work-tree if
/// binary-to-text filters are present (and apply them), or use the version in `git` otherwise.
ToGitUnlessBinaryToTextIsPresent,
/// Always prepare resources as they are stored in `git`.
///
/// This is usually fastest, even though resources in the worktree needed to be converted files.
ToGit,
}
impl Mode {
fn to_worktree(self) -> bool {
matches!(
self,
Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText
)
}
fn to_git(self) -> bool {
matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit)
}
}
///
#[allow(clippy::empty_docs)]
pub mod convert_to_diffable {
use std::collections::TryReserveError;
use bstr::BString;
use gix_object::tree::EntryKind;
/// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()).
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")]
InvalidEntryKind { rela_path: BString, actual: EntryKind },
#[error("Entry at '{rela_path}' could not be read as symbolic link")]
ReadLink { rela_path: BString, source: std::io::Error },
#[error("Entry at '{rela_path}' could not be opened for reading or read from")]
OpenOrRead { rela_path: BString, source: std::io::Error },
#[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")]
StreamCopy { rela_path: BString, source: std::io::Error },
#[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")]
RunTextConvFilter {
rela_path: BString,
cmd: String,
source: std::io::Error,
},
#[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")]
CreateTempfile { rela_path: BString, source: std::io::Error },
#[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")]
TextConvFilterFailed {
rela_path: BString,
cmd: String,
stderr: BString,
},
#[error(transparent)]
FindObject(#[from] gix_object::find::existing_object::Error),
#[error(transparent)]
ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error),
#[error(transparent)]
ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error),
#[error("Memory allocation failed")]
OutOfMemory(#[from] TryReserveError),
}
}
/// Lifecycle
impl Pipeline {
/// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise
/// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths.
/// `options` are used to further configure the way we act..
pub fn new(
roots: WorktreeRoots,
worktree_filter: gix_filter::Pipeline,
mut drivers: Vec<super::Driver>,
options: Options,
) -> Self {
drivers.sort_by(|a, b| a.name.cmp(&b.name));
Pipeline {
roots,
worktree_filter,
drivers,
options,
attrs: {
let mut out = gix_filter::attributes::search::Outcome::default();
out.initialize_with_selection(&Default::default(), Some("diff"));
out
},
path: Default::default(),
}
}
}
/// Access
impl Pipeline {
/// Return all drivers that this instance was initialized with.
pub fn drivers(&self) -> &[super::Driver] {
&self.drivers
}
}
/// Conversion
impl Pipeline {
/// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`.
/// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`]
/// contains information on how to use `out`, or if it's filled at all.
///
/// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is
/// a resource in the object database, i.e. has no worktree root available.
///
/// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case
/// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`.
///
/// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode.
///
/// Use `convert` to control what kind of the resource will be produced.
///
/// ### About Tempfiles
///
/// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set,
/// a temporary file will be created to serve as input for the converter program, containing the worktree-data that
/// exactly as it would be present in the worktree if checked out.
///
/// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with
/// a signal handler. If they leak, they would remain in the system's `$TMP` directory.
#[allow(clippy::too_many_arguments)]
pub fn convert_to_diffable(
&mut self,
id: &gix_hash::oid,
mode: EntryKind,
rela_path: &BStr,
kind: ResourceKind,
attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome),
objects: &dyn gix_object::FindObjectOrHeader,
convert: Mode,
out: &mut Vec<u8>,
) -> Result<Outcome, convert_to_diffable::Error> {
let is_symlink = match mode {
EntryKind::Link if self.options.fs.symlink => true,
EntryKind::Blob | EntryKind::BlobExecutable => false,
_ => {
return Err(convert_to_diffable::Error::InvalidEntryKind {
rela_path: rela_path.to_owned(),
actual: mode,
})
}
};
out.clear();
attributes(rela_path, &mut self.attrs);
let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'");
let driver_index = attr
.assignment
.state
.as_bstr()
.and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok());
let driver = driver_index.map(|idx| &self.drivers[idx]);
let mut is_binary = if let Some(driver) = driver {
driver
.is_binary
.map(|is_binary| is_binary && driver.binary_to_text_command.is_none())
} else {
attr.assignment.state.is_unset().then_some(true)
};
match self.roots.by_kind(kind) {
Some(root) => {
self.path.clear();
self.path.push(root);
self.path.push(gix_path::from_bstr(rela_path));
let data = if is_symlink {
let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| {
convert_to_diffable::Error::ReadLink {
rela_path: rela_path.to_owned(),
source: err,
}
})?;
target.map(|target| {
out.extend_from_slice(gix_path::into_bstr(target).as_ref());
Data::Buffer
})
} else {
let need_size_only = is_binary == Some(true);
let size_in_bytes = (need_size_only
|| (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0))
.then(|| {
none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| {
convert_to_diffable::Error::OpenOrRead {
rela_path: rela_path.to_owned(),
source: err,
}
})
})
.transpose()?;
match size_in_bytes {
Some(None) => None, // missing as identified by the size check
Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => {
Some(Data::Binary { size })
}
_ => {
match driver
.filter(|_| convert.to_worktree())
.and_then(|d| d.prepare_binary_to_text_cmd(&self.path))
{
Some(cmd) => {
// Avoid letting the driver program fail if it doesn't exist.
if self.options.large_file_threshold_bytes == 0
&& none_if_missing(std::fs::symlink_metadata(&self.path))
.map_err(|err| convert_to_diffable::Error::OpenOrRead {
rela_path: rela_path.to_owned(),
source: err,
})?
.is_none()
{
None
} else {
run_cmd(rela_path, cmd, out)?;
Some(Data::Buffer)
}
}
None => {
let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| {
convert_to_diffable::Error::OpenOrRead {
rela_path: rela_path.to_owned(),
source: err,
}
})?;
match file {
Some(mut file) => {
if convert.to_git() {
let res = self.worktree_filter.convert_to_git(
file,
gix_path::from_bstr(rela_path).as_ref(),
attributes,
&mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())),
)?;
match res {
ToGitOutcome::Unchanged(mut file) => {
file.read_to_end(out).map_err(|err| {
convert_to_diffable::Error::OpenOrRead {
rela_path: rela_path.to_owned(),
source: err,
}
})?;
}
ToGitOutcome::Process(mut stream) => {
stream.read_to_end(out).map_err(|err| {
convert_to_diffable::Error::OpenOrRead {
rela_path: rela_path.to_owned(),
source: err,
}
})?;
}
ToGitOutcome::Buffer(buf) => {
out.clear();
out.try_reserve(buf.len())?;
out.extend_from_slice(buf);
}
}
} else {
file.read_to_end(out).map_err(|err| {
convert_to_diffable::Error::OpenOrRead {
rela_path: rela_path.to_owned(),
source: err,
}
})?;
}
Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) {
let size = out.len() as u64;
out.clear();
Data::Binary { size }
} else {
Data::Buffer
})
}
None => None,
}
}
}
}
}
};
Ok(Outcome { driver_index, data })
}
None => {
let data = if id.is_null() {
None
} else {
let header = objects
.try_header(id)
.map_err(gix_object::find::existing_object::Error::Find)?
.ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
if is_binary.is_none()
&& self.options.large_file_threshold_bytes > 0
&& header.size > self.options.large_file_threshold_bytes
{
is_binary = Some(true);
};
let data = if is_binary == Some(true) {
Data::Binary { size: header.size }
} else {
objects
.try_find(id, out)
.map_err(gix_object::find::existing_object::Error::Find)?
.ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?;
if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable)
&& convert == Mode::ToWorktreeAndBinaryToText
|| (convert == Mode::ToGitUnlessBinaryToTextIsPresent
&& driver.map_or(false, |d| d.binary_to_text_command.is_some()))
{
let res =
self.worktree_filter
.convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?;
let cmd_and_file = driver
.and_then(|d| {
d.binary_to_text_command.is_some().then(|| {
gix_tempfile::new(
std::env::temp_dir(),
gix_tempfile::ContainingDirectory::Exists,
gix_tempfile::AutoRemove::Tempfile,
)
.and_then(|mut tmp_file| {
self.path.clear();
tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?;
Ok(tmp_file)
})
.map(|tmp_file| {
(
d.prepare_binary_to_text_cmd(&self.path)
.expect("always get cmd if command is set"),
tmp_file,
)
})
})
})
.transpose()
.map_err(|err| convert_to_diffable::Error::CreateTempfile {
source: err,
rela_path: rela_path.to_owned(),
})?;
match cmd_and_file {
Some((cmd, mut tmp_file)) => {
match res {
ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => {
tmp_file.write_all(buf)
}
ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
std::io::copy(&mut stream, &mut tmp_file).map(|_| ())
}
ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
unreachable!("we prohibit this")
}
}
.map_err(|err| {
convert_to_diffable::Error::CreateTempfile {
source: err,
rela_path: rela_path.to_owned(),
}
})?;
out.clear();
run_cmd(rela_path, cmd, out)?;
}
None => {
match res {
ToWorktreeOutcome::Unchanged(_) => {}
ToWorktreeOutcome::Buffer(src) => {
out.clear();
out.try_reserve(src.len())?;
out.extend_from_slice(src);
}
ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => {
std::io::copy(&mut stream, out).map_err(|err| {
convert_to_diffable::Error::StreamCopy {
rela_path: rela_path.to_owned(),
source: err,
}
})?;
}
ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => {
unreachable!("we prohibit this")
}
};
}
}
}
if driver.map_or(true, |d| d.binary_to_text_command.is_none())
&& is_binary.unwrap_or_else(|| is_binary_buf(out))
{
let size = out.len() as u64;
out.clear();
Data::Binary { size }
} else {
Data::Buffer
}
};
Some(data)
};
Ok(Outcome { driver_index, data })
}
}
}
}
fn is_binary_buf(buf: &[u8]) -> bool {
let buf = &buf[..buf.len().min(8000)];
buf.contains(&0)
}
fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> {
match res {
Ok(data) => Ok(Some(data)),
Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
Err(err) => Err(err),
}
}
fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> {
gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command");
let mut res = cmd
.output()
.map_err(|err| convert_to_diffable::Error::RunTextConvFilter {
rela_path: rela_path.to_owned(),
cmd: format!("{cmd:?}"),
source: err,
})?;
if !res.status.success() {
return Err(convert_to_diffable::Error::TextConvFilterFailed {
rela_path: rela_path.to_owned(),
cmd: format!("{cmd:?}"),
stderr: res.stderr.into(),
});
}
out.append(&mut res.stdout);
Ok(())
}
impl Driver {
/// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`.
pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> {
let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref();
let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned())
.with_shell()
.stdin(Stdio::null())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.arg(path)
.into();
Some(cmd)
}
}