blob: c20b750cacae16466e8be535aa8b7567266908de [file] [log] [blame]
//! Imara-diff is a solid (imara in swahili) diff library for rust.
//! Solid refers to the fact that imara-diff provides very good runtime performance even
//! in pathologic cases so that your application never appears to freeze while waiting on a diff.
//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git
//! that are known to yield fast runtime and performance.
//!
//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and
//! not just lists and strings and even allows reusing large parts of the computation when
//! comparing the same file to multiple different files.
//!
//! Imara-diff provides two diff algorithms:
//!
//! * The linear-space variant of the well known [**myer** algorithm](http://www.xmailserver.org/diff2.pdf)
//! * The **Histogram** algorithm which variant of the patience diff algorithm.
//!
//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological
//! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git.
//! The Histogram algorithm was originally ported from git but has been heavily optimized.
//! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**.
//!
//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code.
//! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate:
#![cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_comparison.svg.base64"), "\"></img>"))]
//!
//! # Api Overview
//!
//! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building
//! a human-redable diff similar to the output of `git diff` or `diff -u`.
//! This makes building a tool similar to gnu diff easy:
//!
//! ```
//! use imara_diff::intern::InternedInput;
//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
//!
//! let before = r#"fn foo() -> Bar {
//! let mut foo = 2;
//! foo *= 50;
//! println!("hello world")
//! }"#;
//!
//! let after = r#"// lorem ipsum
//! fn foo() -> Bar {
//! let mut foo = 2;
//! foo *= 50;
//! println!("hello world");
//! println!("{foo}");
//! }
//! // foo
//! "#;
//!
//! let input = InternedInput::new(before, after);
//! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input));
//! assert_eq!(
//! diff,
//! r#"@@ -1,5 +1,8 @@
//! +// lorem ipsum
//! fn foo() -> Bar {
//! let mut foo = 2;
//! foo *= 50;
//! - println!("hello world")
//! + println!("hello world");
//! + println!("{foo}");
//! }
//! +// foo
//! "#
//! );
//! ```
//!
//! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink).
//! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added:
//!
//! ```
//! use std::ops::Range;
//!
//! use imara_diff::intern::InternedInput;
//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
//!
//! let before = r#"fn foo() -> Bar {
//! let mut foo = 2;
//! foo *= 50;
//! println!("hello world")
//! }"#;
//!
//! let after = r#"// lorem ipsum
//! fn foo() -> Bar {
//! let mut foo = 2;
//! foo *= 50;
//! println!("hello world");
//! println!("{foo}");
//! }
//! // foo
//! "#;
//!
//! let mut insertions = Vec::new();
//! let mut removals = Vec::new();
//! let mut replacements = Vec::new();
//!
//! let input = InternedInput::new(before, after);
//! let sink = |before: Range<u32>, after: Range<u32>| {
//! let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize]
//! .iter()
//! .map(|&line| input.interner[line])
//! .collect();
//! let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize]
//! .iter()
//! .map(|&line| input.interner[line])
//! .collect();
//! if hunk_after.is_empty() {
//! removals.push(hunk_before)
//! } else if hunk_before.is_empty() {
//! insertions.push(hunk_after)
//! } else {
//! replacements.push((hunk_before, hunk_after))
//! }
//! };
//! let diff = diff(Algorithm::Histogram, &input, sink);
//! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]);
//! assert!(removals.is_empty());
//! assert_eq!(
//! &replacements,
//! &[(
//! vec![" println!(\"hello world\")"],
//! vec![" println!(\"hello world\");", " println!(\"{foo}\");"]
//! )]
//! );
//! ```
//!
//! For `&str` and `&[u8]` imara-diff will compute a line diff by default.
//! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait.
//! For example the imara-diff provides an alternative tokenziser for line-diffs that includes the line terminator in the line:
//!
//! ```
//! use imara_diff::intern::InternedInput;
//! use imara_diff::sink::Counter;
//! use imara_diff::sources::lines_with_terminator;
//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
//!
//! let before = "foo";
//! let after = "foo\n";
//!
//! let input = InternedInput::new(before, after);
//! let changes = diff(Algorithm::Histogram, &input, Counter::default());
//! assert_eq!(changes.insertions, 0);
//! assert_eq!(changes.removals, 0);
//!
//! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after));
//! let changes = diff(Algorithm::Histogram, &input, Counter::default());
//! assert_eq!(changes.insertions, 1);
//! assert_eq!(changes.removals, 1);
//! ```
use std::hash::Hash;
#[cfg(feature = "unified_diff")]
pub use unified_diff::UnifiedDiffBuilder;
use crate::intern::{InternedInput, Token, TokenSource};
pub use crate::sink::Sink;
mod histogram;
pub mod intern;
mod myers;
pub mod sink;
pub mod sources;
#[cfg(feature = "unified_diff")]
mod unified_diff;
mod util;
#[cfg(test)]
mod tests;
/// `imara-diff` supports multiple different algorithms
/// for computing an edit sequence.
/// These algorithms have different performance and all produce different output.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum Algorithm {
/// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html)
/// that uses a histogram to find the least common LCS.
/// Just like the `patience` diff algorithm, this algorithm usually produces
/// more human readable output then myers algorithm.
/// However compared to the `patience` diff algorithm (which is slower then myers algorithm),
/// the Histogram algorithm performs much better.
///
/// The implementation here was originally ported from `git` but has been significantly
/// modified to improve performance.
/// As a result it consistently **performs better then myers algorithm** (5%-100%) over
/// a wide variety of test data. For example a benchmark of diffing linux kernel commits is shown below:
#[cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_speedup.svg.base64"), "\"></img>"))]
///
/// For pathological subsequences that only contain highly repeating tokens (64+ occurrences)
/// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior.
///
/// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing
/// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences
/// produced by the histogram diff are often longer then those produced by Myers algorithm.
///
/// The heuristic used by the histogram diff does not work well for inputs with small (often repeated)
/// tokens. For example **character diffs do not work well** as most (english) text is madeup of
/// a fairly small set of characters. The `Histogram` algorithm will automatically these cases and
/// fallback to Myers algorithm. However this detection has a nontrivial overhead, so
/// if its known upfront that the sort of tokens is very small `Myers` algorithm should
/// be used instead.
Histogram,
/// An implementation of the linear space variant of
/// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf).
/// The algorithm is enhanced with preprocessing that removes
/// tokens that don't occur in the other file at all.
/// Furthermore two heuristics to the middle snake search are implemented
/// that ensure reasonable runtime (mostly linear time complexity) even for large files.
///
/// Due to the divide and conquer nature of the algorithm
/// the edit sequenced produced are still fairly small even when the middle snake
/// search is aborted by a heuristic.
/// However, the produced edit sequences are not guaranteed to be fully minimal.
/// If that property is vital to you, use the `MyersMinimal` algorithm instead.
///
/// The implementation (including the preprocessing) are mostly
/// ported from `git` and `gnu-diff` where Myers algorithm is used
/// as the default diff algorithm.
/// Therefore the used heuristics have been heavily battle tested and
/// are known to behave well over a large variety of inputs
Myers,
/// Same as `Myers` but the early abort heuristics are disabled to guarantee
/// a minimal edit sequence.
/// This can mean significant slowdown in pathological cases.
MyersMinimal,
}
impl Algorithm {
#[cfg(test)]
const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers];
}
impl Default for Algorithm {
fn default() -> Self {
Algorithm::Histogram
}
}
/// Computes an edit-script that transforms `input.before` into `input.after` using
/// the specified `algorithm`
/// The edit-script is passed to `sink.process_change` while it is produced.
pub fn diff<S: Sink, T: Eq + Hash>(
algorithm: Algorithm,
input: &InternedInput<T>,
sink: S,
) -> S::Out {
diff_with_tokens(algorithm, &input.before, &input.after, input.interner.num_tokens(), sink)
}
/// Computes an edit-script that transforms `before` into `after` using
/// the specified `algorithm`
/// The edit-script is passed to `sink.process_change` while it is produced.
pub fn diff_with_tokens<S: Sink>(
algorithm: Algorithm,
before: &[Token],
after: &[Token],
num_tokens: u32,
sink: S,
) -> S::Out {
assert!(before.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX);
assert!(after.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX);
match algorithm {
Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink),
Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false),
Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true),
}
}