vendor/imara-diff/src/lib.rs - toolchain/rustc - Git at Google

 //! Imara-diff is a solid (imara in swahili) diff library for rust.
 //! Solid refers to the fact that imara-diff provides very good runtime performance even
 //! in pathologic cases so that your application never appears to freeze while waiting on a diff.
 //! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git
 //! that are known to yield fast runtime and performance.
 //!
 //! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and
 //! not just lists and strings and even allows reusing large parts of the computation when
 //! comparing the same file to multiple different files.
 //!
 //! Imara-diff provides two diff algorithms:
 //!
 //! * The linear-space variant of the well known [**myer** algorithm](http://www.xmailserver.org/diff2.pdf)
 //! * The **Histogram** algorithm which variant of the patience diff algorithm.
 //!
 //! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological
 //! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git.
 //! The Histogram algorithm was originally ported from git but has been heavily optimized.
 //! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**.
 //!
 //! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code.
 //! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate:
 #![cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_comparison.svg.base64"), "\"></img>"))]
 //!
 //! # Api Overview
 //!
 //! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building
 //! a human-redable diff similar to the output of `git diff` or `diff -u`.
 //! This makes building a tool similar to gnu diff easy:
 //!
 //! ```
 //! use imara_diff::intern::InternedInput;
 //! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
 //!
 //! let before = r#"fn foo() -> Bar {
 //!     let mut foo = 2;
 //!     foo *= 50;
 //!     println!("hello world")
 //! }"#;
 //!
 //! let after = r#"// lorem ipsum
 //! fn foo() -> Bar {
 //!     let mut foo = 2;
 //!     foo *= 50;
 //!     println!("hello world");
 //!     println!("{foo}");
 //! }
 //! // foo
 //! "#;
 //!
 //! let input = InternedInput::new(before, after);
 //! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input));
 //! assert_eq!(
 //!     diff,
 //!     r#"@@ -1,5 +1,8 @@
 //! +// lorem ipsum
 //!  fn foo() -> Bar {
 //!      let mut foo = 2;
 //!      foo *= 50;
 //! -    println!("hello world")
 //! +    println!("hello world");
 //! +    println!("{foo}");
 //!  }
 //! +// foo
 //! "#
 //! );
 //! ```
 //!
 //! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink).
 //! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added:
 //!
 //! ```
 //! use std::ops::Range;
 //!
 //! use imara_diff::intern::InternedInput;
 //! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
 //!
 //! let before = r#"fn foo() -> Bar {
 //!     let mut foo = 2;
 //!     foo *= 50;
 //!     println!("hello world")
 //! }"#;
 //!
 //! let after = r#"// lorem ipsum
 //! fn foo() -> Bar {
 //!     let mut foo = 2;
 //!     foo *= 50;
 //!     println!("hello world");
 //!     println!("{foo}");
 //! }
 //! // foo
 //! "#;
 //!
 //! let mut insertions = Vec::new();
 //! let mut removals = Vec::new();
 //! let mut replacements = Vec::new();
 //!
 //! let input = InternedInput::new(before, after);
 //! let sink = |before: Range<u32>, after: Range<u32>| {
 //!     let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize]
 //!         .iter()
 //!         .map(|&line| input.interner[line])
 //!         .collect();
 //!     let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize]
 //!         .iter()
 //!         .map(|&line| input.interner[line])
 //!         .collect();
 //!     if hunk_after.is_empty() {
 //!         removals.push(hunk_before)
 //!     } else if hunk_before.is_empty() {
 //!         insertions.push(hunk_after)
 //!     } else {
 //!         replacements.push((hunk_before, hunk_after))
 //!     }
 //! };
 //! let diff = diff(Algorithm::Histogram, &input, sink);
 //! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]);
 //! assert!(removals.is_empty());
 //! assert_eq!(
 //!     &replacements,
 //!     &[(
 //!         vec!["    println!(\"hello world\")"],
 //!         vec!["    println!(\"hello world\");", "    println!(\"{foo}\");"]
 //!     )]
 //! );
 //! ```
 //!
 //! For `&str` and `&[u8]` imara-diff will compute a line diff by default.
 //! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait.
 //! For example the imara-diff provides an alternative tokenziser for line-diffs that includes the line terminator in the line:
 //!
 //! ```
 //! use imara_diff::intern::InternedInput;
 //! use imara_diff::sink::Counter;
 //! use imara_diff::sources::lines_with_terminator;
 //! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
 //!
 //! let before = "foo";
 //! let after = "foo\n";
 //!
 //! let input = InternedInput::new(before, after);
 //! let changes = diff(Algorithm::Histogram, &input, Counter::default());
 //! assert_eq!(changes.insertions, 0);
 //! assert_eq!(changes.removals, 0);
 //!
 //! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after));
 //! let changes = diff(Algorithm::Histogram, &input, Counter::default());
 //! assert_eq!(changes.insertions, 1);
 //! assert_eq!(changes.removals, 1);
 //! ```

 use std::hash::Hash;

 #[cfg(feature = "unified_diff")]
 pub use unified_diff::UnifiedDiffBuilder;

 use crate::intern::{InternedInput, Token, TokenSource};
 pub use crate::sink::Sink;
 mod histogram;
 pub mod intern;
 mod myers;
 pub mod sink;
 pub mod sources;
 #[cfg(feature = "unified_diff")]
 mod unified_diff;
 mod util;

 #[cfg(test)]
 mod tests;

 /// `imara-diff` supports multiple different algorithms
 /// for computing an edit sequence.
 /// These algorithms have different performance and all produce different output.
 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub enum Algorithm {
     /// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html)
     /// that uses a histogram to find the least common LCS.
     /// Just like the `patience` diff algorithm, this algorithm usually produces
     /// more human readable output then myers algorithm.
     /// However compared to the `patience` diff algorithm (which is slower then myers algorithm),
     /// the Histogram algorithm performs much better.
     ///
     /// The implementation here was originally ported from `git` but has been significantly
     /// modified to improve performance.
     /// As a result it consistently **performs better then myers algorithm** (5%-100%) over
     /// a wide variety of test data. For example a benchmark of diffing linux kernel commits is shown below:
     #[cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_speedup.svg.base64"), "\"></img>"))]
     ///
     /// For pathological subsequences that only contain highly repeating tokens (64+ occurrences)
     /// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior.
     ///
     /// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing
     /// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences
     /// produced by the histogram diff are often longer then those produced by Myers algorithm.
     ///
     /// The heuristic used by the histogram diff does not work well for inputs with small (often repeated)
     /// tokens. For example **character diffs do not work well** as most (english) text is madeup of
     /// a fairly small set of characters. The `Histogram` algorithm will automatically these cases and
     /// fallback to Myers algorithm. However this detection has a nontrivial overhead, so
     /// if its known upfront that the sort of tokens is very small `Myers` algorithm should
     /// be used instead.
     Histogram,
     /// An implementation of the linear space variant of
     /// [Myers  `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf).
     /// The algorithm is enhanced with preprocessing that removes
     /// tokens that don't occur in the other file at all.
     /// Furthermore two heuristics to the middle snake search are implemented
     /// that ensure reasonable runtime (mostly linear time complexity) even for large files.
     ///
     /// Due to the divide and conquer nature of the algorithm
     /// the edit sequenced produced are still fairly small even when the middle snake
     /// search is aborted by a heuristic.
     /// However, the produced edit sequences are not guaranteed to be fully minimal.
     /// If that property is vital to you, use the `MyersMinimal` algorithm instead.
     ///
     /// The implementation (including the preprocessing) are mostly
     /// ported from `git` and `gnu-diff` where Myers algorithm is used
     /// as the default diff algorithm.
     /// Therefore the used heuristics have been heavily battle tested and
     /// are known to behave well over a large variety of inputs
     Myers,
     /// Same as `Myers` but the early abort heuristics are disabled to guarantee
     /// a minimal edit sequence.
     /// This can mean significant slowdown in pathological cases.
     MyersMinimal,
 }

 impl Algorithm {
     #[cfg(test)]
     const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers];
 }

 impl Default for Algorithm {
     fn default() -> Self {
         Algorithm::Histogram
     }
 }

 /// Computes an edit-script that transforms `input.before` into `input.after` using
 /// the specified `algorithm`
 /// The edit-script is passed to `sink.process_change` while it is produced.
 pub fn diff<S: Sink, T: Eq + Hash>(
     algorithm: Algorithm,
     input: &InternedInput<T>,
     sink: S,
 ) -> S::Out {
     diff_with_tokens(algorithm, &input.before, &input.after, input.interner.num_tokens(), sink)
 }

 /// Computes an edit-script that transforms `before` into `after` using
 /// the specified `algorithm`
 /// The edit-script is passed to `sink.process_change` while it is produced.
 pub fn diff_with_tokens<S: Sink>(
     algorithm: Algorithm,
     before: &[Token],
     after: &[Token],
     num_tokens: u32,
     sink: S,
 ) -> S::Out {
     assert!(before.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX);
     assert!(after.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX);
     match algorithm {
         Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink),
         Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false),
         Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true),
     }
 }
	//! Imara-diff is a solid (imara in swahili) diff library for rust.
	//! Solid refers to the fact that imara-diff provides very good runtime performance even
	//! in pathologic cases so that your application never appears to freeze while waiting on a diff.
	//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git
	//! that are known to yield fast runtime and performance.
	//!
	//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and
	//! not just lists and strings and even allows reusing large parts of the computation when
	//! comparing the same file to multiple different files.
	//!
	//! Imara-diff provides two diff algorithms:
	//!
	//! * The linear-space variant of the well known [myer algorithm](http://www.xmailserver.org/diff2.pdf)
	//! * The Histogram algorithm which variant of the patience diff algorithm.
	//!
	//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological
	//! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git.
	//! The Histogram algorithm was originally ported from git but has been heavily optimized.
	//! The Histogram algorithm outperforms Myers diff by 10% - 100% across a wide variety of workloads.
	//!
	//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code.
	//! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate:
	#![cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_comparison.svg.base64"), "\"></img>"))]
	//!
	//! # Api Overview
	//!
	//! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building
	//! a human-redable diff similar to the output of `git diff` or `diff -u`.
	//! This makes building a tool similar to gnu diff easy:
	//!
	//! ```
	//! use imara_diff::intern::InternedInput;
	//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
	//!
	//! let before = r#"fn foo() -> Bar {
	//! let mut foo = 2;
	//! foo *= 50;
	//! println!("hello world")
	//! }"#;
	//!
	//! let after = r#"// lorem ipsum
	//! fn foo() -> Bar {
	//! let mut foo = 2;
	//! foo *= 50;
	//! println!("hello world");
	//! println!("{foo}");
	//! }
	//! // foo
	//! "#;
	//!
	//! let input = InternedInput::new(before, after);
	//! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input));
	//! assert_eq!(
	//! diff,
	//! r#"@@ -1,5 +1,8 @@
	//! +// lorem ipsum
	//! fn foo() -> Bar {
	//! let mut foo = 2;
	//! foo *= 50;
	//! - println!("hello world")
	//! + println!("hello world");
	//! + println!("{foo}");
	//! }
	//! +// foo
	//! "#
	//! );
	//! ```
	//!
	//! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink).
	//! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added:
	//!
	//! ```
	//! use std::ops::Range;
	//!
	//! use imara_diff::intern::InternedInput;
	//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
	//!
	//! let before = r#"fn foo() -> Bar {
	//! let mut foo = 2;
	//! foo *= 50;
	//! println!("hello world")
	//! }"#;
	//!
	//! let after = r#"// lorem ipsum
	//! fn foo() -> Bar {
	//! let mut foo = 2;
	//! foo *= 50;
	//! println!("hello world");
	//! println!("{foo}");
	//! }
	//! // foo
	//! "#;
	//!
	//! let mut insertions = Vec::new();
	//! let mut removals = Vec::new();
	//! let mut replacements = Vec::new();
	//!
	//! let input = InternedInput::new(before, after);
	//! let sink = \|before: Range<u32>, after: Range<u32>\| {
	//! let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize]
	//! .iter()
	//! .map(\|&line\| input.interner[line])
	//! .collect();
	//! let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize]
	//! .iter()
	//! .map(\|&line\| input.interner[line])
	//! .collect();
	//! if hunk_after.is_empty() {
	//! removals.push(hunk_before)
	//! } else if hunk_before.is_empty() {
	//! insertions.push(hunk_after)
	//! } else {
	//! replacements.push((hunk_before, hunk_after))
	//! }
	//! };
	//! let diff = diff(Algorithm::Histogram, &input, sink);
	//! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]);
	//! assert!(removals.is_empty());
	//! assert_eq!(
	//! &replacements,
	//! &[(
	//! vec![" println!(\"hello world\")"],
	//! vec![" println!(\"hello world\");", " println!(\"{foo}\");"]
	//! )]
	//! );
	//! ```
	//!
	//! For `&str` and `&[u8]` imara-diff will compute a line diff by default.
	//! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait.
	//! For example the imara-diff provides an alternative tokenziser for line-diffs that includes the line terminator in the line:
	//!
	//! ```
	//! use imara_diff::intern::InternedInput;
	//! use imara_diff::sink::Counter;
	//! use imara_diff::sources::lines_with_terminator;
	//! use imara_diff::{diff, Algorithm, UnifiedDiffBuilder};
	//!
	//! let before = "foo";
	//! let after = "foo\n";
	//!
	//! let input = InternedInput::new(before, after);
	//! let changes = diff(Algorithm::Histogram, &input, Counter::default());
	//! assert_eq!(changes.insertions, 0);
	//! assert_eq!(changes.removals, 0);
	//!
	//! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after));
	//! let changes = diff(Algorithm::Histogram, &input, Counter::default());
	//! assert_eq!(changes.insertions, 1);
	//! assert_eq!(changes.removals, 1);
	//! ```

	use std::hash::Hash;

	#[cfg(feature = "unified_diff")]
	pub use unified_diff::UnifiedDiffBuilder;

	use crate::intern::{InternedInput, Token, TokenSource};
	pub use crate::sink::Sink;
	mod histogram;
	pub mod intern;
	mod myers;
	pub mod sink;
	pub mod sources;
	#[cfg(feature = "unified_diff")]
	mod unified_diff;
	mod util;

	#[cfg(test)]
	mod tests;

	/// `imara-diff` supports multiple different algorithms
	/// for computing an edit sequence.
	/// These algorithms have different performance and all produce different output.
	#[derive(Debug, PartialEq, Eq, Clone, Copy)]
	pub enum Algorithm {
	/// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html)
	/// that uses a histogram to find the least common LCS.
	/// Just like the `patience` diff algorithm, this algorithm usually produces
	/// more human readable output then myers algorithm.
	/// However compared to the `patience` diff algorithm (which is slower then myers algorithm),
	/// the Histogram algorithm performs much better.
	///
	/// The implementation here was originally ported from `git` but has been significantly
	/// modified to improve performance.
	/// As a result it consistently performs better then myers algorithm (5%-100%) over
	/// a wide variety of test data. For example a benchmark of diffing linux kernel commits is shown below:
	#[cfg_attr(doc, doc=concat!("<img width=\"600\" class=\"figure\" src=\"data:image/svg+xml;base64,", include_str!("../plots/linux_speedup.svg.base64"), "\"></img>"))]
	///
	/// For pathological subsequences that only contain highly repeating tokens (64+ occurrences)
	/// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior.
	///
	/// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing
	/// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences
	/// produced by the histogram diff are often longer then those produced by Myers algorithm.
	///
	/// The heuristic used by the histogram diff does not work well for inputs with small (often repeated)
	/// tokens. For example character diffs do not work well as most (english) text is madeup of
	/// a fairly small set of characters. The `Histogram` algorithm will automatically these cases and
	/// fallback to Myers algorithm. However this detection has a nontrivial overhead, so
	/// if its known upfront that the sort of tokens is very small `Myers` algorithm should
	/// be used instead.
	Histogram,
	/// An implementation of the linear space variant of
	/// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf).
	/// The algorithm is enhanced with preprocessing that removes
	/// tokens that don't occur in the other file at all.
	/// Furthermore two heuristics to the middle snake search are implemented
	/// that ensure reasonable runtime (mostly linear time complexity) even for large files.
	///
	/// Due to the divide and conquer nature of the algorithm
	/// the edit sequenced produced are still fairly small even when the middle snake
	/// search is aborted by a heuristic.
	/// However, the produced edit sequences are not guaranteed to be fully minimal.
	/// If that property is vital to you, use the `MyersMinimal` algorithm instead.
	///
	/// The implementation (including the preprocessing) are mostly
	/// ported from `git` and `gnu-diff` where Myers algorithm is used
	/// as the default diff algorithm.
	/// Therefore the used heuristics have been heavily battle tested and
	/// are known to behave well over a large variety of inputs
	Myers,
	/// Same as `Myers` but the early abort heuristics are disabled to guarantee
	/// a minimal edit sequence.
	/// This can mean significant slowdown in pathological cases.
	MyersMinimal,
	}

	impl Algorithm {
	#[cfg(test)]
	const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers];
	}

	impl Default for Algorithm {
	fn default() -> Self {
	Algorithm::Histogram
	}
	}

	/// Computes an edit-script that transforms `input.before` into `input.after` using
	/// the specified `algorithm`
	/// The edit-script is passed to `sink.process_change` while it is produced.
	pub fn diff<S: Sink, T: Eq + Hash>(
	algorithm: Algorithm,
	input: &InternedInput<T>,
	sink: S,
	) -> S::Out {
	diff_with_tokens(algorithm, &input.before, &input.after, input.interner.num_tokens(), sink)
	}

	/// Computes an edit-script that transforms `before` into `after` using
	/// the specified `algorithm`
	/// The edit-script is passed to `sink.process_change` while it is produced.
	pub fn diff_with_tokens<S: Sink>(
	algorithm: Algorithm,
	before: &[Token],
	after: &[Token],
	num_tokens: u32,
	sink: S,
	) -> S::Out {
	assert!(before.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX);
	assert!(after.len() < i32::MAX as usize, "imara-diff only supports up to {} tokens", i32::MAX);
	match algorithm {
	Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink),
	Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false),
	Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true),
	}
	}