| use std::{ |
| io::{Read, Write}, |
| path::{Path, PathBuf}, |
| process::{Command, Stdio}, |
| }; |
| |
| use bstr::{BStr, ByteSlice}; |
| use gix_filter::{ |
| driver::apply::{Delay, MaybeDelayed}, |
| pipeline::convert::{ToGitOutcome, ToWorktreeOutcome}, |
| }; |
| use gix_object::tree::EntryKind; |
| |
| use crate::blob::{Driver, Pipeline, ResourceKind}; |
| |
| /// A way to access roots for different kinds of resources that are possibly located and accessible in a worktree. |
| #[derive(Clone, Debug, Default)] |
| pub struct WorktreeRoots { |
| /// A place where the source of a rewrite, rename or copy, or generally the previous version of resources, are located. |
| pub old_root: Option<PathBuf>, |
| /// A place where the destination of a rewrite, rename or copy, or generally the new version of resources, are located. |
| pub new_root: Option<PathBuf>, |
| } |
| |
| impl WorktreeRoots { |
| /// Return the root path for the given `kind` |
| pub fn by_kind(&self, kind: ResourceKind) -> Option<&Path> { |
| match kind { |
| ResourceKind::OldOrSource => self.old_root.as_deref(), |
| ResourceKind::NewOrDestination => self.new_root.as_deref(), |
| } |
| } |
| } |
| |
| /// Data as part of an [Outcome]. |
| #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] |
| pub enum Data { |
| /// The data to use for diffing was written into the buffer that was passed during the call to [`Pipeline::convert_to_diffable()`]. |
| Buffer, |
| /// The size that the binary blob had at the given revision, without having applied filters, as it's either |
| /// considered binary or above the big-file threshold. |
| /// |
| /// In this state, the binary file cannot be diffed. |
| Binary { |
| /// The size of the object prior to performing any filtering or as it was found on disk. |
| /// |
| /// Note that technically, the size isn't always representative of the same 'state' of the |
| /// content, as once it can be the size of the blob in git, and once it's the size of file |
| /// in the worktree. |
| size: u64, |
| }, |
| } |
| |
| /// The outcome returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). |
| #[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Debug)] |
| pub struct Outcome { |
| /// If available, an index into the `drivers` field to access more diff-related information of the driver for items |
| /// at the given path, as previously determined by git-attributes. |
| /// |
| /// Note that drivers are queried even if there is no object available. |
| pub driver_index: Option<usize>, |
| /// The data itself, suitable for diffing, and if the object or worktree item is present at all. |
| pub data: Option<Data>, |
| } |
| |
| /// Options for use in a [`Pipeline`]. |
| #[derive(Default, Clone, Copy, PartialEq, Eq, Debug, Hash, Ord, PartialOrd)] |
| pub struct Options { |
| /// The amount of bytes that an object has to reach before being treated as binary. |
| /// These objects will not be queried, nor will their data be processed in any way. |
| /// If `0`, no file is ever considered binary due to their size. |
| /// |
| /// Note that for files stored in `git`, what counts is their stored, decompressed size, |
| /// thus `git-lfs` files would typically not be considered binary unless one explicitly sets |
| /// them |
| pub large_file_threshold_bytes: u64, |
| /// Capabilities of the file system which affect how we read worktree files. |
| pub fs: gix_fs::Capabilities, |
| } |
| |
| /// The specific way to convert a resource. |
| #[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] |
| pub enum Mode { |
| /// Always prepare the version of the resource as it would be in the work-tree, and |
| /// apply binary-to-text filters if present. |
| /// |
| /// This is typically free for resources in the worktree, and will apply filters to resources in the |
| /// object database. |
| #[default] |
| ToWorktreeAndBinaryToText, |
| /// Prepare the version of the resource as it would be in the work-tree if |
| /// binary-to-text filters are present (and apply them), or use the version in `git` otherwise. |
| ToGitUnlessBinaryToTextIsPresent, |
| /// Always prepare resources as they are stored in `git`. |
| /// |
| /// This is usually fastest, even though resources in the worktree needed to be converted files. |
| ToGit, |
| } |
| |
| impl Mode { |
| fn to_worktree(self) -> bool { |
| matches!( |
| self, |
| Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToWorktreeAndBinaryToText |
| ) |
| } |
| |
| fn to_git(self) -> bool { |
| matches!(self, Mode::ToGitUnlessBinaryToTextIsPresent | Mode::ToGit) |
| } |
| } |
| |
| /// |
| #[allow(clippy::empty_docs)] |
| pub mod convert_to_diffable { |
| use std::collections::TryReserveError; |
| |
| use bstr::BString; |
| use gix_object::tree::EntryKind; |
| |
| /// The error returned by [Pipeline::convert_to_diffable()](super::Pipeline::convert_to_diffable()). |
| #[derive(Debug, thiserror::Error)] |
| #[allow(missing_docs)] |
| pub enum Error { |
| #[error("Entry at '{rela_path}' must be regular file or symlink, but was {actual:?}")] |
| InvalidEntryKind { rela_path: BString, actual: EntryKind }, |
| #[error("Entry at '{rela_path}' could not be read as symbolic link")] |
| ReadLink { rela_path: BString, source: std::io::Error }, |
| #[error("Entry at '{rela_path}' could not be opened for reading or read from")] |
| OpenOrRead { rela_path: BString, source: std::io::Error }, |
| #[error("Entry at '{rela_path}' could not be copied from a filter process to a memory buffer")] |
| StreamCopy { rela_path: BString, source: std::io::Error }, |
| #[error("Failed to run '{cmd}' for binary-to-text conversion of entry at {rela_path}")] |
| RunTextConvFilter { |
| rela_path: BString, |
| cmd: String, |
| source: std::io::Error, |
| }, |
| #[error("Tempfile for binary-to-text conversion for entry at {rela_path} could not be created")] |
| CreateTempfile { rela_path: BString, source: std::io::Error }, |
| #[error("Binary-to-text conversion '{cmd}' for entry at {rela_path} failed with: {stderr}")] |
| TextConvFilterFailed { |
| rela_path: BString, |
| cmd: String, |
| stderr: BString, |
| }, |
| #[error(transparent)] |
| FindObject(#[from] gix_object::find::existing_object::Error), |
| #[error(transparent)] |
| ConvertToWorktree(#[from] gix_filter::pipeline::convert::to_worktree::Error), |
| #[error(transparent)] |
| ConvertToGit(#[from] gix_filter::pipeline::convert::to_git::Error), |
| #[error("Memory allocation failed")] |
| OutOfMemory(#[from] TryReserveError), |
| } |
| } |
| |
| /// Lifecycle |
| impl Pipeline { |
| /// Create a new instance of a pipeline which produces blobs suitable for diffing. `roots` allow to read worktree files directly, otherwise |
| /// `worktree_filter` is used to transform object database data directly. `drivers` further configure individual paths. |
| /// `options` are used to further configure the way we act.. |
| pub fn new( |
| roots: WorktreeRoots, |
| worktree_filter: gix_filter::Pipeline, |
| mut drivers: Vec<super::Driver>, |
| options: Options, |
| ) -> Self { |
| drivers.sort_by(|a, b| a.name.cmp(&b.name)); |
| Pipeline { |
| roots, |
| worktree_filter, |
| drivers, |
| options, |
| attrs: { |
| let mut out = gix_filter::attributes::search::Outcome::default(); |
| out.initialize_with_selection(&Default::default(), Some("diff")); |
| out |
| }, |
| path: Default::default(), |
| } |
| } |
| } |
| |
| /// Access |
| impl Pipeline { |
| /// Return all drivers that this instance was initialized with. |
| pub fn drivers(&self) -> &[super::Driver] { |
| &self.drivers |
| } |
| } |
| |
| /// Conversion |
| impl Pipeline { |
| /// Convert the object at `id`, `mode`, `rela_path` and `kind`, providing access to `attributes` and `objects`. |
| /// The resulting diff-able data is written into `out`, assuming it's not too large. The returned [`Outcome`] |
| /// contains information on how to use `out`, or if it's filled at all. |
| /// |
| /// `attributes` must be returning the attributes at `rela_path`, and `objects` must be usable if `kind` is |
| /// a resource in the object database, i.e. has no worktree root available. |
| /// |
| /// If `id` [is null](gix_hash::ObjectId::is_null()) or the file in question doesn't exist in the worktree in case |
| /// [a root](WorktreeRoots) is present, then `out` will be left cleared and [Outcome::data] will be `None`. |
| /// |
| /// Note that `mode` is trusted, and we will not re-validate that the entry in the worktree actually is of that mode. |
| /// |
| /// Use `convert` to control what kind of the resource will be produced. |
| /// |
| /// ### About Tempfiles |
| /// |
| /// When querying from the object database and a binary and a [binary-to-text](Driver::binary_to_text_command) is set, |
| /// a temporary file will be created to serve as input for the converter program, containing the worktree-data that |
| /// exactly as it would be present in the worktree if checked out. |
| /// |
| /// As these files are ultimately named tempfiles, they will be leaked unless the [gix_tempfile] is configured with |
| /// a signal handler. If they leak, they would remain in the system's `$TMP` directory. |
| #[allow(clippy::too_many_arguments)] |
| pub fn convert_to_diffable( |
| &mut self, |
| id: &gix_hash::oid, |
| mode: EntryKind, |
| rela_path: &BStr, |
| kind: ResourceKind, |
| attributes: &mut dyn FnMut(&BStr, &mut gix_filter::attributes::search::Outcome), |
| objects: &dyn gix_object::FindObjectOrHeader, |
| convert: Mode, |
| out: &mut Vec<u8>, |
| ) -> Result<Outcome, convert_to_diffable::Error> { |
| let is_symlink = match mode { |
| EntryKind::Link if self.options.fs.symlink => true, |
| EntryKind::Blob | EntryKind::BlobExecutable => false, |
| _ => { |
| return Err(convert_to_diffable::Error::InvalidEntryKind { |
| rela_path: rela_path.to_owned(), |
| actual: mode, |
| }) |
| } |
| }; |
| |
| out.clear(); |
| attributes(rela_path, &mut self.attrs); |
| let attr = self.attrs.iter_selected().next().expect("pre-initialized with 'diff'"); |
| let driver_index = attr |
| .assignment |
| .state |
| .as_bstr() |
| .and_then(|name| self.drivers.binary_search_by(|d| d.name.as_bstr().cmp(name)).ok()); |
| let driver = driver_index.map(|idx| &self.drivers[idx]); |
| let mut is_binary = if let Some(driver) = driver { |
| driver |
| .is_binary |
| .map(|is_binary| is_binary && driver.binary_to_text_command.is_none()) |
| } else { |
| attr.assignment.state.is_unset().then_some(true) |
| }; |
| match self.roots.by_kind(kind) { |
| Some(root) => { |
| self.path.clear(); |
| self.path.push(root); |
| self.path.push(gix_path::from_bstr(rela_path)); |
| let data = if is_symlink { |
| let target = none_if_missing(std::fs::read_link(&self.path)).map_err(|err| { |
| convert_to_diffable::Error::ReadLink { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| })?; |
| target.map(|target| { |
| out.extend_from_slice(gix_path::into_bstr(target).as_ref()); |
| Data::Buffer |
| }) |
| } else { |
| let need_size_only = is_binary == Some(true); |
| let size_in_bytes = (need_size_only |
| || (is_binary != Some(false) && self.options.large_file_threshold_bytes > 0)) |
| .then(|| { |
| none_if_missing(self.path.metadata().map(|md| md.len())).map_err(|err| { |
| convert_to_diffable::Error::OpenOrRead { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| }) |
| }) |
| .transpose()?; |
| match size_in_bytes { |
| Some(None) => None, // missing as identified by the size check |
| Some(Some(size)) if size > self.options.large_file_threshold_bytes || need_size_only => { |
| Some(Data::Binary { size }) |
| } |
| _ => { |
| match driver |
| .filter(|_| convert.to_worktree()) |
| .and_then(|d| d.prepare_binary_to_text_cmd(&self.path)) |
| { |
| Some(cmd) => { |
| // Avoid letting the driver program fail if it doesn't exist. |
| if self.options.large_file_threshold_bytes == 0 |
| && none_if_missing(std::fs::symlink_metadata(&self.path)) |
| .map_err(|err| convert_to_diffable::Error::OpenOrRead { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| })? |
| .is_none() |
| { |
| None |
| } else { |
| run_cmd(rela_path, cmd, out)?; |
| Some(Data::Buffer) |
| } |
| } |
| None => { |
| let file = none_if_missing(std::fs::File::open(&self.path)).map_err(|err| { |
| convert_to_diffable::Error::OpenOrRead { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| })?; |
| |
| match file { |
| Some(mut file) => { |
| if convert.to_git() { |
| let res = self.worktree_filter.convert_to_git( |
| file, |
| gix_path::from_bstr(rela_path).as_ref(), |
| attributes, |
| &mut |buf| objects.try_find(id, buf).map(|obj| obj.map(|_| ())), |
| )?; |
| |
| match res { |
| ToGitOutcome::Unchanged(mut file) => { |
| file.read_to_end(out).map_err(|err| { |
| convert_to_diffable::Error::OpenOrRead { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| })?; |
| } |
| ToGitOutcome::Process(mut stream) => { |
| stream.read_to_end(out).map_err(|err| { |
| convert_to_diffable::Error::OpenOrRead { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| })?; |
| } |
| ToGitOutcome::Buffer(buf) => { |
| out.clear(); |
| out.try_reserve(buf.len())?; |
| out.extend_from_slice(buf); |
| } |
| } |
| } else { |
| file.read_to_end(out).map_err(|err| { |
| convert_to_diffable::Error::OpenOrRead { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| })?; |
| } |
| |
| Some(if is_binary.unwrap_or_else(|| is_binary_buf(out)) { |
| let size = out.len() as u64; |
| out.clear(); |
| Data::Binary { size } |
| } else { |
| Data::Buffer |
| }) |
| } |
| None => None, |
| } |
| } |
| } |
| } |
| } |
| }; |
| Ok(Outcome { driver_index, data }) |
| } |
| None => { |
| let data = if id.is_null() { |
| None |
| } else { |
| let header = objects |
| .try_header(id) |
| .map_err(gix_object::find::existing_object::Error::Find)? |
| .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; |
| if is_binary.is_none() |
| && self.options.large_file_threshold_bytes > 0 |
| && header.size > self.options.large_file_threshold_bytes |
| { |
| is_binary = Some(true); |
| }; |
| let data = if is_binary == Some(true) { |
| Data::Binary { size: header.size } |
| } else { |
| objects |
| .try_find(id, out) |
| .map_err(gix_object::find::existing_object::Error::Find)? |
| .ok_or_else(|| gix_object::find::existing_object::Error::NotFound { oid: id.to_owned() })?; |
| if matches!(mode, EntryKind::Blob | EntryKind::BlobExecutable) |
| && convert == Mode::ToWorktreeAndBinaryToText |
| || (convert == Mode::ToGitUnlessBinaryToTextIsPresent |
| && driver.map_or(false, |d| d.binary_to_text_command.is_some())) |
| { |
| let res = |
| self.worktree_filter |
| .convert_to_worktree(out, rela_path, attributes, Delay::Forbid)?; |
| |
| let cmd_and_file = driver |
| .and_then(|d| { |
| d.binary_to_text_command.is_some().then(|| { |
| gix_tempfile::new( |
| std::env::temp_dir(), |
| gix_tempfile::ContainingDirectory::Exists, |
| gix_tempfile::AutoRemove::Tempfile, |
| ) |
| .and_then(|mut tmp_file| { |
| self.path.clear(); |
| tmp_file.with_mut(|tmp| self.path.push(tmp.path()))?; |
| Ok(tmp_file) |
| }) |
| .map(|tmp_file| { |
| ( |
| d.prepare_binary_to_text_cmd(&self.path) |
| .expect("always get cmd if command is set"), |
| tmp_file, |
| ) |
| }) |
| }) |
| }) |
| .transpose() |
| .map_err(|err| convert_to_diffable::Error::CreateTempfile { |
| source: err, |
| rela_path: rela_path.to_owned(), |
| })?; |
| match cmd_and_file { |
| Some((cmd, mut tmp_file)) => { |
| match res { |
| ToWorktreeOutcome::Unchanged(buf) | ToWorktreeOutcome::Buffer(buf) => { |
| tmp_file.write_all(buf) |
| } |
| ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { |
| std::io::copy(&mut stream, &mut tmp_file).map(|_| ()) |
| } |
| ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { |
| unreachable!("we prohibit this") |
| } |
| } |
| .map_err(|err| { |
| convert_to_diffable::Error::CreateTempfile { |
| source: err, |
| rela_path: rela_path.to_owned(), |
| } |
| })?; |
| out.clear(); |
| run_cmd(rela_path, cmd, out)?; |
| } |
| None => { |
| match res { |
| ToWorktreeOutcome::Unchanged(_) => {} |
| ToWorktreeOutcome::Buffer(src) => { |
| out.clear(); |
| out.try_reserve(src.len())?; |
| out.extend_from_slice(src); |
| } |
| ToWorktreeOutcome::Process(MaybeDelayed::Immediate(mut stream)) => { |
| std::io::copy(&mut stream, out).map_err(|err| { |
| convert_to_diffable::Error::StreamCopy { |
| rela_path: rela_path.to_owned(), |
| source: err, |
| } |
| })?; |
| } |
| ToWorktreeOutcome::Process(MaybeDelayed::Delayed(_)) => { |
| unreachable!("we prohibit this") |
| } |
| }; |
| } |
| } |
| } |
| |
| if driver.map_or(true, |d| d.binary_to_text_command.is_none()) |
| && is_binary.unwrap_or_else(|| is_binary_buf(out)) |
| { |
| let size = out.len() as u64; |
| out.clear(); |
| Data::Binary { size } |
| } else { |
| Data::Buffer |
| } |
| }; |
| Some(data) |
| }; |
| Ok(Outcome { driver_index, data }) |
| } |
| } |
| } |
| } |
| |
| fn is_binary_buf(buf: &[u8]) -> bool { |
| let buf = &buf[..buf.len().min(8000)]; |
| buf.contains(&0) |
| } |
| |
| fn none_if_missing<T>(res: std::io::Result<T>) -> std::io::Result<Option<T>> { |
| match res { |
| Ok(data) => Ok(Some(data)), |
| Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None), |
| Err(err) => Err(err), |
| } |
| } |
| |
| fn run_cmd(rela_path: &BStr, mut cmd: Command, out: &mut Vec<u8>) -> Result<(), convert_to_diffable::Error> { |
| gix_trace::debug!(cmd = ?cmd, "Running binary-to-text command"); |
| let mut res = cmd |
| .output() |
| .map_err(|err| convert_to_diffable::Error::RunTextConvFilter { |
| rela_path: rela_path.to_owned(), |
| cmd: format!("{cmd:?}"), |
| source: err, |
| })?; |
| if !res.status.success() { |
| return Err(convert_to_diffable::Error::TextConvFilterFailed { |
| rela_path: rela_path.to_owned(), |
| cmd: format!("{cmd:?}"), |
| stderr: res.stderr.into(), |
| }); |
| } |
| out.append(&mut res.stdout); |
| Ok(()) |
| } |
| |
| impl Driver { |
| /// Produce an invocable command pre-configured to produce the filtered output on stdout after reading `path`. |
| pub fn prepare_binary_to_text_cmd(&self, path: &Path) -> Option<std::process::Command> { |
| let command: &BStr = self.binary_to_text_command.as_ref()?.as_ref(); |
| let cmd = gix_command::prepare(gix_path::from_bstr(command).into_owned()) |
| .with_shell() |
| .stdin(Stdio::null()) |
| .stdout(Stdio::piped()) |
| .stderr(Stdio::piped()) |
| .arg(path) |
| .into(); |
| Some(cmd) |
| } |
| } |