use std::ptr::NonNull;
use crate::intern::Token;
use crate::myers::middle_snake::{MiddleSnakeSearch, SearchResult};
use crate::myers::preprocess::PreprocessedFile;
use crate::myers::slice::FileSlice;
use crate::util::sqrt;
use crate::Sink;
mod middle_snake;
mod preprocess;
mod slice;
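/// Working memory for the Myers diff: a single manually managed allocation (freed in
/// `Drop`) that holds the furthest-reaching positions of the forward and backward
/// diagonal searches, plus the cost limit at which the search is cut short.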
pub struct Myers {
kvec: NonNull<[i32]>,
kforward: NonNull<i32>,
kbackward: NonNull<i32>,
max_cost: u32,
}
pub fn diff<S: Sink>(
before: &[Token],
after: &[Token],
_num_tokens: u32,
mut sink: S,
minimal: bool,
) -> S::Out {
// preprocess the files by removing tokens that do not occur in the other file at all
// this process remaps the token indices and therefore requires us to track changed tokens
// in a separate bool array (`is_changed`) instead of reporting them to the sink directly
// PERF use a bitset?
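// (Illustrative: a token occurring in only one of the two files can never be part of a
// match, so it is stripped before running Myers and only surfaces again, marked as
// changed, when the sink is driven below.)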
let (mut before, mut after) = preprocess::preprocess(before, after);
// Perform the actual diff
Myers::new(before.tokens.len(), after.tokens.len()).run(
FileSlice::new(&mut before),
FileSlice::new(&mut after),
minimal,
);
process_changes_with_sink(&before, &after, &mut sink);
sink.finish()
}
const HEUR_MIN_COST: u32 = 256;
const MAX_COST_MIN: u32 = 256;
impl Drop for Myers {
fn drop(&mut self) {
unsafe { drop(Box::from_raw(self.kvec.as_ptr())) }
}
}
impl Myers {
fn new(len1: usize, len2: usize) -> Self {
let ndiags = len1 + len2 + 3;
let kvec = Box::leak(vec![0; 2 * ndiags + 2].into_boxed_slice());
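// both k-vectors live in this single allocation: forward diagonals are indexed
// relative to `len2 + 1`, backward diagonals relative to `ndiags + len2 + 1`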
let kforward = NonNull::from(&mut kvec[len2 + 1]);
let kbackward = NonNull::from(&mut kvec[ndiags + len2 + 1]);
Self { kvec: kvec.into(), kforward, kbackward, max_cost: sqrt(ndiags).max(MAX_COST_MIN) }
}
fn run<'f>(&mut self, mut file1: FileSlice<'f>, mut file2: FileSlice<'f>, mut need_min: bool) {
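// divide and conquer: trim tokens common to both slices, split the remainder at the
// middle snake found by `split`, recurse on the range before the split point and
// keep looping on the range after it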
loop {
file1.strip_common(&mut file2);
if file1.is_empty() {
file2.mark_changed();
return;
} else if file2.is_empty() {
file1.mark_changed();
return;
}
let split = self.split(&file1, &file2, need_min);
self.run(
file1.borrow().slice(..split.token_idx1 as u32),
file2.borrow().slice(..split.token_idx2 as u32),
split.minimized_lo,
);
file1 = file1.slice(split.token_idx1 as u32..);
file2 = file2.slice(split.token_idx2 as u32..);
need_min = split.minimized_hi
}
}
/// See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
/// Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
/// the forward diagonal starting from (off1, off2) and the backward diagonal
/// starting from (lim1, lim2). If the K values on the same diagonal crosses
/// returns the furthest point of reach. We might encounter expensive edge cases
/// using this algorithm, so a little bit of heuristic is needed to cut the
/// search and to return a suboptimal point.
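/// (Sketch: on every diagonal k = i1 - i2 the forward search records its furthest
/// reaching position in `file1` while the backward search does the same from the
/// opposite corner; the first overlap of the two on a common diagonal is the
/// "middle snake" at which `run` splits the problem in two.)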
fn split(&mut self, file1: &FileSlice, file2: &FileSlice, need_min: bool) -> Split {
let mut forward_search =
unsafe { MiddleSnakeSearch::<false>::new(self.kforward, file1, file2) };
let mut backwards_search =
unsafe { MiddleSnakeSearch::<true>::new(self.kbackward, file1, file2) };
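// the forward and backward searches can only meet on the same diagonal when the
// difference of the two lengths is odd, which decides below which direction checks
// for the overlap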
let is_odd = (file1.len() - file2.len()) & 1 != 0;
let mut ec = 0;
while ec <= self.max_cost {
let mut found_snake = false;
forward_search.next_d();
if is_odd {
if let Some(res) = forward_search.run(file1, file2, |k, token_idx1| {
backwards_search.contains(k)
&& backwards_search.x_pos_at_diagonal(k) <= token_idx1
}) {
match res {
SearchResult::Snake => found_snake = true,
SearchResult::Found { token_idx1, token_idx2 } => {
return Split {
token_idx1,
token_idx2,
minimized_lo: true,
minimized_hi: true,
};
}
}
}
} else {
found_snake |= forward_search.run(file1, file2, |_, _| false).is_some()
};
backwards_search.next_d();
if !is_odd {
if let Some(res) = backwards_search.run(file1, file2, |k, token_idx1| {
forward_search.contains(k) && token_idx1 <= forward_search.x_pos_at_diagonal(k)
}) {
match res {
SearchResult::Snake => found_snake = true,
SearchResult::Found { token_idx1, token_idx2 } => {
return Split {
token_idx1,
token_idx2,
minimized_lo: true,
minimized_hi: true,
};
}
}
}
} else {
found_snake |= backwards_search.run(file1, file2, |_, _| false).is_some()
};
if need_min {
continue;
}
// If the edit cost is above the heuristic trigger and if
// we got a good snake, we sample current diagonals to see
// if some of them have reached an "interesting" path. Our
// measure is a function of the distance from the diagonal
// corner (i1 + i2) penalized with the distance from the
// mid diagonal itself. If this value is above the current
// edit cost times a magic factor (XDL_K_HEUR) we consider
// it interesting.
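// if the heuristic cuts the search here, `minimized_lo`/`minimized_hi` are handed
// back to `run` as the `need_min` flag for the lower and upper half respectively,
// i.e. they control whether the heuristic cut is allowed again in that half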
if found_snake && ec > HEUR_MIN_COST {
if let Some((token_idx1, token_idx2)) = forward_search.found_snake(ec, file1, file2)
{
return Split {
token_idx1,
token_idx2,
minimized_lo: true,
minimized_hi: false,
};
}
if let Some((token_idx1, token_idx2)) =
backwards_search.found_snake(ec, file1, file2)
{
return Split {
token_idx1,
token_idx2,
minimized_lo: false,
minimized_hi: true,
};
}
}
ec += 1;
}
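// max_cost was exceeded without the searches meeting: give up on an optimal split
// and fall back to the furthest reaching position of whichever direction got closer
// to the opposite corner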
let (distance_forward, token_idx1_forward) = forward_search.best_position(file1, file2);
let (distance_backwards, token_idx1_backwards) =
backwards_search.best_position(file1, file2);
if distance_forward > file1.len() as isize + file2.len() as isize - distance_backwards {
Split {
token_idx1: token_idx1_forward,
token_idx2: (distance_forward - token_idx1_forward as isize) as i32,
minimized_lo: true,
minimized_hi: false,
}
} else {
Split {
token_idx1: token_idx1_backwards,
token_idx2: (distance_backwards - token_idx1_backwards as isize) as i32,
minimized_lo: false,
minimized_hi: true,
}
}
}
}
#[derive(Debug)]
struct Split {
token_idx1: i32,
token_idx2: i32,
minimized_lo: bool,
minimized_hi: bool,
}
/// The mapping performed during preprocessing makes it impossible to call the
/// `sink` directly during the diff itself. Instead `is_changed` is set to true for
/// all tokens that are changed.
/// Below, these arrays are used to call the sink function.
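/// (Illustrative, assuming both offsets are 0: for `before.is_changed = [false, true, true, false]`
/// and `after.is_changed = [false, true, false]` the sink receives the single call
/// `process_change(1..3, 1..2)`, i.e. tokens 1..3 of `before` were replaced by token 1 of `after`.)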
fn process_changes_with_sink(
before: &PreprocessedFile,
after: &PreprocessedFile,
sink: &mut impl Sink,
) {
let before_end = before.is_changed.len() as u32 + before.offset;
let after_end = after.is_changed.len() as u32 + after.offset;
let mut before = before
.is_changed
.iter()
.enumerate()
.map(|(i, removed)| (i as u32 + before.offset, *removed));
let mut after = after
.is_changed
.iter()
.enumerate()
.map(|(i, inserted)| (i as u32 + after.offset, *inserted));
let mut next1 = before.next();
let mut next2 = after.next();
while let (Some((before_pos, removed)), Some((after_pos, inserted))) = (next1, next2) {
if !(removed | inserted) {
next1 = before.next();
next2 = after.next();
continue;
}
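// at least one side changed here: extend a hunk over each changed side until the
// next unchanged token (or the end of that file) and report it to the sink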
let mut hunk_before = before_pos..before_pos;
let mut hunk_after = after_pos..after_pos;
if removed {
let end = before.find(|(_, changed)| !changed);
next1 = end.map(|(end, _)| (end, false));
hunk_before.end = end.map_or(before_end, |(end, _)| end);
};
if inserted {
let end = after.find(|(_, changed)| !changed);
next2 = end.map(|(end, _)| (end, false));
hunk_after.end = end.map_or(after_end, |(end, _)| end);
}
sink.process_change(hunk_before, hunk_after);
}
if let Some((before_pos, _)) = next1 {
sink.process_change(before_pos..before_end, after_end..after_end);
} else if let Some((after_pos, _)) = next2 {
sink.process_change(before_end..before_end, after_pos..after_end);
}
}