vendor/gsgdt/src/diff/match_graph.rs - toolchain/rustc

blob: a75545ef7c63cc1fb13d0e83a3b156f5b82a371b [file] [log] [blame]

	use crate::diff::*;
	use crate::levenshtein::distance;
	use std::collections::{BTreeMap, HashSet};

	pub type Mapping<'a> = BTreeMap<&'a str, &'a str>;

	// TODO: Is it better to return a list of matches or a hashmap
	// It might be better to do former because we may need to distinguise
	// b/w full and partial match
	#[derive(Debug, PartialEq)]
	pub struct Matching<'a> {
	pub from: &'a str,
	pub to: &'a str,
	}

	impl<'a> Matching<'a> {
	pub fn new(from: &'a str, to: &'a str) -> Matching<'a> {
	Matching { from, to }
	}
	}

	#[derive(Debug, PartialEq)]
	pub enum Match<'a> {
	Full(Matching<'a>),
	Partial(Matching<'a>),
	}
	//
	// impl<'a> Match<'a> {
	// fn is_full(&self) -> bool {
	// match self {
	// Match::Full(_) => true,
	// _ => false
	// }
	// }
	// }

	/// Matches both graphs and returns the mapping of nodes from g1 to g2
	pub fn match_graphs<'a>(d1: &'a DiffGraph<'_>, d2: &'a DiffGraph<'_>) -> Vec<Match<'a>> {
	let mut mapping: BTreeMap<&str, &str> = get_initial_mapping(d1, d2);

	// TODO: This mapping may have duplicate mappings, remove them

	let mut matches: Vec<Match> = mapping
	.iter()
	.map(\|(from, to)\| Match::Full(Matching { from, to }))
	.collect();

	// we use rev adjacency list because we are going to compare the parents
	let rev_adj_list1 = d1.graph.rev_adj_list();
	let rev_adj_list2 = d2.graph.rev_adj_list();

	let mut matched_labels_in_2: HashSet<&str> = mapping.iter().map(\|(_, v)\| *v).collect();

	let mut prev_mapping = mapping.clone();
	loop {
	let mut new_mapping: Mapping = BTreeMap::new();
	for (k, v) in prev_mapping.iter() {
	let parents_in_1 = rev_adj_list1.get(k).unwrap();
	let parents_in_2 = rev_adj_list2.get(v).unwrap();

	for n in parents_in_1.iter() {
	// don't bother selecting if the node in 1 is already matched
	// as we use a stricter selection for the initial matching
	if mapping.contains_key(n) {
	continue;
	}
	if let Some(lab) = select(d1, d2, n, &parents_in_2, false) {
	// if the matched label is already matched to some node in 1, skip
	if !matched_labels_in_2.contains(lab) {
	matched_labels_in_2.insert(lab);
	new_mapping.insert(n, lab);
	}
	}
	}
	}
	// println!("{:?}", new_mapping);
	// merge
	let new_matches = get_new_matches(&mapping, &new_mapping);
	if new_matches.len() == 0 {
	break;
	}
	for mch in new_matches {
	mapping.insert(mch.from, mch.to);
	matches.push(Match::Partial(mch));
	}
	prev_mapping = new_mapping;
	}

	matches
	}

	fn get_initial_mapping<'a>(d1: &'a DiffGraph<'_>, d2: &'a DiffGraph<'_>) -> Mapping<'a> {
	let mut mapping: BTreeMap<&str, &str> = BTreeMap::new();
	let mut reverse_mapping: BTreeMap<&str, &str> = BTreeMap::new();
	let g2_labels: Vec<&str> = d2.graph.nodes.iter().map(\|n\| n.label.as_str()).collect();

	// TODO: this can be functional
	for node in d1.graph.nodes.iter() {
	if let Some(matched_lab) = select(d1, d2, &node.label, &g2_labels, true) {
	if let Some(lab_in_1) = reverse_mapping.get(matched_lab) {
	// matched_lab was already matched
	// select the one with the lowest cost
	// this is done so that no duplicate matching will occur
	let dist_already = dist_bw_nodes(d1, d2, lab_in_1, matched_lab);
	let dist_new = dist_bw_nodes(d1, d2, &node.label, matched_lab);
	if dist_new > dist_already {
	continue;
	}
	mapping.remove(lab_in_1);
	}
	mapping.insert(&node.label, matched_lab);
	reverse_mapping.insert(matched_lab, &node.label);
	}
	}

	mapping
	}

	fn dist_bw_nodes(d1: &DiffGraph<'_>, d2: &DiffGraph<'_>, n1: &str, n2: &str) -> Option<usize> {
	let node1 = d1.graph.get_node_by_label(n1).unwrap();
	let node2 = d2.graph.get_node_by_label(n2).unwrap();

	let tup1 = (
	d1.dist_start[n1] as i64,
	d1.dist_end[n1] as i64,
	node1.stmts.len() as i64,
	);
	let tup2 = (
	d2.dist_start[n2] as i64,
	d2.dist_end[n2] as i64,
	node2.stmts.len() as i64,
	);

	let s1 = node1.stmts.join("");
	let s2 = node2.stmts.join("");
	let dist = distance(&s1, &s2);

	Some(
	((tup1.0 - tup2.0).pow(2) + (tup1.1 - tup2.1).pow(2) + (tup1.2 - tup2.2).pow(2)) as usize
	+ dist,
	)
	}

	/// Selects the most suitable match for n from L
	fn select<'a>(
	d1: &'a DiffGraph<'_>,
	d2: &'a DiffGraph<'_>,
	n: &'a str,
	list_of_labs: &[&'a str],
	use_text_dist_filter: bool,
	) -> Option<&'a str> {
	let node = d1.graph.get_node_by_label(n).unwrap();
	let node_stmt_len = node.stmts.len();
	let node_stmts = node.stmts.join("");
	list_of_labs
	.iter()
	.filter(\|lab\| {
	let other_node = d2.graph.get_node_by_label(lab).unwrap();
	// filter out nodes that may differ by more than 2 lines
	(other_node.stmts.len() as i64 - node.stmts.len() as i64).abs() <= 2
	})
	.filter(\|lab\| {
	if !use_text_dist_filter {
	return true;
	}
	let other_node_stmts = d2.graph.get_node_by_label(lab).unwrap().stmts.join("");
	// allow bigger basic blocks have more edits
	// 2 here is arbitary
	distance(&node_stmts, &other_node_stmts) < 2 * node_stmt_len
	})
	.min_by_key(\|x\| dist_bw_nodes(d1, d2, n, x))
	.map(\|x\| *x)
	}

	fn get_new_matches<'a>(mapping: &Mapping<'a>, new_mapping: &Mapping<'a>) -> Vec<Matching<'a>> {
	let mut changed = Vec::new();
	for (k, v) in new_mapping.iter() {
	if !mapping.contains_key(k) {
	changed.push(Matching { from: k, to: v })
	}
	}

	changed
	}