blob: b52cc3a53ccac9e6e0efd7b88a9b63c44d4a7802 [file] [log] [blame]
//! Jaccard index
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};
/// [Jaccard similarity] is a ratio of intersection to union of two sets.
///
/// The metric works with the set of input elements, so strings "abc" and "bca"
/// are the same if compared by letters.
///
/// The metric is always normalized on the interval from 0.0 to 1.0.
///
/// [Jaccard similarity]: https://en.wikipedia.org/wiki/Jaccard_index
#[derive(Default)]
pub struct Jaccard {}
impl Algorithm<f64> for Jaccard {
fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
where
C: Iterator<Item = E>,
E: Eq + core::hash::Hash,
{
let c1 = Counter::from_iter(s1);
let c2 = Counter::from_iter(s2);
let uc = c1.union_count(&c2);
let res = if uc == 0 {
1.
} else {
let ic = c1.intersect_count(&c2);
ic as f64 / uc as f64
};
Result {
abs: res,
is_distance: false,
max: 1.,
len1: c1.count(),
len2: c2.count(),
}
}
}
#[cfg(test)]
mod tests {
use crate::str::jaccard;
use assert2::assert;
use rstest::rstest;
fn is_close(a: f64, b: f64) -> bool {
(a - b).abs() < 1E-5
}
#[rstest]
#[case("", "", 1.)]
#[case("nelson", "", 0.)]
#[case("", "neilsen", 0.)]
#[case("abc", "abc", 1.)]
#[case("abc", "bac", 1.)]
// parity with textdistance
#[case("nelson", "neilsen", 5. / 8.)]
#[case("test", "text", 3. / 5.)]
#[case("decide", "resize", 3. / 9.)]
fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) {
let act = jaccard(s1, s2);
let ok = is_close(act, exp);
assert!(ok, "jaccard({}, {}) is {}, not {}", s1, s2, act, exp);
}
}