blob: b8d00df8af0efe68398dd153ce4f183e9af206a9 [file] [log] [blame]
//! Overlap coefficient
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};
/// [Overlap similarity] is the size of the intersection divided by the smaller of the size of the two sets.
///
/// The type has no fields, so it carries no configuration; `Overlap::default()`
/// yields a ready-to-use instance. The computation itself lives in the
/// `Algorithm<f64>` implementation for this type.
///
/// [Overlap similarity]: https://en.wikipedia.org/wiki/Overlap_coefficient
#[derive(Default)]
pub struct Overlap {}
impl Algorithm<f64> for Overlap {
    /// Computes the overlap coefficient of two element sequences.
    ///
    /// Both iterators are collected into [`Counter`]s. The similarity is the
    /// counters' intersection count divided by the smaller of the two total
    /// counts.
    ///
    /// Special cases:
    /// - both inputs empty: similarity is `1.0`;
    /// - exactly one input empty: similarity is `0.0` (this also keeps the
    ///   division below from ever seeing a zero denominator).
    ///
    /// The returned [`Result`] is marked as a similarity (`is_distance: false`)
    /// with a maximum of `1.0`, and records the total count of each input in
    /// `len1` / `len2`.
    fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
    where
        C: Iterator<Item = E>,
        E: Eq + core::hash::Hash,
    {
        let c1 = Counter::from_iter(s1);
        let c2 = Counter::from_iter(s2);
        // Totals are computed once and reused for the case analysis, the
        // denominator, and the reported lengths.
        let n1 = c1.count();
        let n2 = c2.count();
        let res = match (n1, n2) {
            // Two empty inputs are considered identical.
            (0, 0) => 1.,
            // Nothing can be shared with an empty input.
            (_, 0) | (0, _) => 0.,
            (_, _) => {
                let ic = c1.intersect_count(&c2);
                ic as f64 / n1.min(n2) as f64
            }
        };
        Result {
            abs: res,
            is_distance: false,
            max: 1.,
            len1: n1,
            len2: n2,
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::str::overlap;
    use assert2::assert;
    use rstest::rstest;

    /// Largest absolute difference at which two floats still count as equal.
    const TOLERANCE: f64 = 1E-5;

    /// Returns `true` when `a` and `b` differ by less than [`TOLERANCE`].
    fn is_close(a: f64, b: f64) -> bool {
        let delta = (a - b).abs();
        delta < TOLERANCE
    }

    /// Checks the string-level `overlap` helper against known values.
    #[rstest]
    // empty-input handling
    #[case("", "", 1.)]
    #[case("nelson", "", 0.)]
    #[case("", "neilsen", 0.)]
    // parity with textdistance
    #[case("test", "text", 3. / 4.)]
    #[case("testme", "textthis", 4. / 6.)]
    #[case("nelson", "neilsen", 5. / 6.)]
    fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) {
        let actual = overlap(s1, s2);
        let ok = is_close(actual, exp);
        assert!(ok, "overlap({}, {}) is {}, not {}", s1, s2, actual, exp);
    }
}