android/vendor/imara-diff-0.1.5/src/myers/preprocess.rs - toolchain/cargo-deny - Git at Google

 use crate::intern::Token;
 use crate::myers::sqrt;
 use crate::util::{strip_common_postfix, strip_common_prefix};

 pub fn preprocess(
     mut file1: &[Token],
     mut file2: &[Token],
 ) -> (PreprocessedFile, PreprocessedFile) {
     let common_prefix = strip_common_prefix(&mut file1, &mut file2);
     strip_common_postfix(&mut file1, &mut file2);
     let (hdiff1, hdiff2) = token_occurrences(file1, file2);
     let file1 = PreprocessedFile::new(common_prefix, &hdiff1, file1);
     let file2 = PreprocessedFile::new(common_prefix, &hdiff2, file2);
     (file1, file2)
 }

 /// computes how
 fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec<Occurances>, Vec<Occurances>) {
     const MAX_EQLIMIT: u32 = 1024;

     // compute the limit after which tokens are treated as `Occurances::COMMON`
     let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT);
     let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT);

     // first collect how often each token occurs in a file
     let mut occurances1 = Vec::new();
     for token in file1 {
         let bucket = token.0 as usize;
         if bucket >= occurances1.len() {
             occurances1.resize(bucket + 1, 0u32);
         }
         occurances1[bucket] += 1;
     }

     // do the same thing for
     let mut occurances2 = Vec::new();
     let token_occurances2: Vec<_> = file2
         .iter()
         .map(|token| {
             let bucket = token.0 as usize;
             if bucket >= occurances2.len() {
                 occurances2.resize(bucket + 1, 0);
             }
             occurances2[bucket] += 1;
             let occurances1 = *occurances1.get(bucket).unwrap_or(&0);
             Occurances::from_occurances(occurances1, eqlimit2)
         })
         .collect();

     let token_occurances1: Vec<_> = file1
         .iter()
         .map(|token| {
             let bucket = token.0 as usize;
             let occurances2 = *occurances2.get(bucket).unwrap_or(&0);
             Occurances::from_occurances(occurances2, eqlimit1)
         })
         .collect();

     (token_occurances1, token_occurances2)
 }

 #[derive(Clone, Copy, Debug)]
 enum Occurances {
     /// Token does not occur in this file
     None,
     /// Token occurs at least once
     Some,
     /// Token occurs very frequently (exact number depends on file size).
     /// Such a tokens are usually empty lines or braces and are often not meaningful to a diff
     Common,
 }

 impl Occurances {
     pub fn from_occurances(occurances: u32, eqlimit: u32) -> Occurances {
         if occurances == 0 {
             Occurances::None
         } else if occurances >= eqlimit {
             Occurances::Common
         } else {
             Occurances::Some
         }
     }
 }

 #[derive(Debug)]
 pub struct PreprocessedFile {
     pub offset: u32,
     pub is_changed: Vec<bool>,
     pub indices: Vec<u32>,
     pub tokens: Vec<Token>,
 }

 impl PreprocessedFile {
     fn new(offset: u32, token_diff: &[Occurances], tokens: &[Token]) -> PreprocessedFile {
         let mut changed = vec![false; tokens.len()];
         let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut changed);
         PreprocessedFile { offset, is_changed: changed, indices, tokens }
     }
 }

 fn prune_unmatched_tokens(
     file: &[Token],
     token_status: &[Occurances],
     changed: &mut [bool],
 ) -> (Vec<Token>, Vec<u32>) {
     assert_eq!(token_status.len(), file.len());
     file.iter()
         .zip(token_status)
         .enumerate()
         .filter_map(|(i, (&token, &status))| {
             let prune = match status {
                 Occurances::None => true,
                 Occurances::Some => false,
                 Occurances::Common => should_prune_common_line(token_status, i),
             };
             if prune {
                 changed[i] = true;
                 None
             } else {
                 Some((token, i as u32))
             }
         })
         .unzip()
 }

 // TODO do not unnecessarily rescan lines
 fn should_prune_common_line(token_status: &[Occurances], pos: usize) -> bool {
     const WINDOW_SIZE: usize = 100;

     let mut unmatched_before = 0;
     let mut common_before = 0;

     let start = if pos > WINDOW_SIZE { WINDOW_SIZE } else { 0 };
     for status in token_status[start..pos].iter().rev() {
         match status {
             Occurances::None => {
                 unmatched_before += 1;
             }
             Occurances::Common => {
                 common_before += 1;
             }
             Occurances::Some => break,
         }
     }

     if unmatched_before == 0 {
         return false;
     }

     let end = token_status.len().min(pos + WINDOW_SIZE);
     let mut unmatched_after = 0;
     let mut common_after = 0;
     for status in token_status[pos..end].iter() {
         match status {
             Occurances::None => {
                 unmatched_after += 1;
             }
             Occurances::Common => {
                 common_after += 1;
             }
             Occurances::Some => break,
         }
     }

     if unmatched_after == 0 {
         return false;
     }

     let common = common_before + common_after;
     let unmatched = unmatched_before + unmatched_after;

     unmatched > 3 * common
 }
	use crate::intern::Token;
	use crate::myers::sqrt;
	use crate::util::{strip_common_postfix, strip_common_prefix};

	pub fn preprocess(
	mut file1: &[Token],
	mut file2: &[Token],
	) -> (PreprocessedFile, PreprocessedFile) {
	let common_prefix = strip_common_prefix(&mut file1, &mut file2);
	strip_common_postfix(&mut file1, &mut file2);
	let (hdiff1, hdiff2) = token_occurrences(file1, file2);
	let file1 = PreprocessedFile::new(common_prefix, &hdiff1, file1);
	let file2 = PreprocessedFile::new(common_prefix, &hdiff2, file2);
	(file1, file2)
	}

	/// computes how
	fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec<Occurances>, Vec<Occurances>) {
	const MAX_EQLIMIT: u32 = 1024;

	// compute the limit after which tokens are treated as `Occurances::COMMON`
	let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT);
	let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT);

	// first collect how often each token occurs in a file
	let mut occurances1 = Vec::new();
	for token in file1 {
	let bucket = token.0 as usize;
	if bucket >= occurances1.len() {
	occurances1.resize(bucket + 1, 0u32);
	}
	occurances1[bucket] += 1;
	}

	// do the same thing for
	let mut occurances2 = Vec::new();
	let token_occurances2: Vec<_> = file2
	.iter()
	.map(\|token\| {
	let bucket = token.0 as usize;
	if bucket >= occurances2.len() {
	occurances2.resize(bucket + 1, 0);
	}
	occurances2[bucket] += 1;
	let occurances1 = *occurances1.get(bucket).unwrap_or(&0);
	Occurances::from_occurances(occurances1, eqlimit2)
	})
	.collect();

	let token_occurances1: Vec<_> = file1
	.iter()
	.map(\|token\| {
	let bucket = token.0 as usize;
	let occurances2 = *occurances2.get(bucket).unwrap_or(&0);
	Occurances::from_occurances(occurances2, eqlimit1)
	})
	.collect();

	(token_occurances1, token_occurances2)
	}

	#[derive(Clone, Copy, Debug)]
	enum Occurances {
	/// Token does not occur in this file
	None,
	/// Token occurs at least once
	Some,
	/// Token occurs very frequently (exact number depends on file size).
	/// Such a tokens are usually empty lines or braces and are often not meaningful to a diff
	Common,
	}

	impl Occurances {
	pub fn from_occurances(occurances: u32, eqlimit: u32) -> Occurances {
	if occurances == 0 {
	Occurances::None
	} else if occurances >= eqlimit {
	Occurances::Common
	} else {
	Occurances::Some
	}
	}
	}

	#[derive(Debug)]
	pub struct PreprocessedFile {
	pub offset: u32,
	pub is_changed: Vec<bool>,
	pub indices: Vec<u32>,
	pub tokens: Vec<Token>,
	}

	impl PreprocessedFile {
	fn new(offset: u32, token_diff: &[Occurances], tokens: &[Token]) -> PreprocessedFile {
	let mut changed = vec![false; tokens.len()];
	let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut changed);
	PreprocessedFile { offset, is_changed: changed, indices, tokens }
	}
	}

	fn prune_unmatched_tokens(
	file: &[Token],
	token_status: &[Occurances],
	changed: &mut [bool],
	) -> (Vec<Token>, Vec<u32>) {
	assert_eq!(token_status.len(), file.len());
	file.iter()
	.zip(token_status)
	.enumerate()
	.filter_map(\|(i, (&token, &status))\| {
	let prune = match status {
	Occurances::None => true,
	Occurances::Some => false,
	Occurances::Common => should_prune_common_line(token_status, i),
	};
	if prune {
	changed[i] = true;
	None
	} else {
	Some((token, i as u32))
	}
	})
	.unzip()
	}

	// TODO do not unnecessarily rescan lines
	fn should_prune_common_line(token_status: &[Occurances], pos: usize) -> bool {
	const WINDOW_SIZE: usize = 100;

	let mut unmatched_before = 0;
	let mut common_before = 0;

	let start = if pos > WINDOW_SIZE { WINDOW_SIZE } else { 0 };
	for status in token_status[start..pos].iter().rev() {
	match status {
	Occurances::None => {
	unmatched_before += 1;
	}
	Occurances::Common => {
	common_before += 1;
	}
	Occurances::Some => break,
	}
	}

	if unmatched_before == 0 {
	return false;
	}

	let end = token_status.len().min(pos + WINDOW_SIZE);
	let mut unmatched_after = 0;
	let mut common_after = 0;
	for status in token_status[pos..end].iter() {
	match status {
	Occurances::None => {
	unmatched_after += 1;
	}
	Occurances::Common => {
	common_after += 1;
	}
	Occurances::Some => break,
	}
	}

	if unmatched_after == 0 {
	return false;
	}

	let common = common_before + common_after;
	let unmatched = unmatched_before + unmatched_after;

	unmatched > 3 * common
	}