//! Roberts similarity.
#![cfg(feature = "std")]
use crate::counter::Counter;
use crate::{Algorithm, Result};
/// [Roberts similarity].
///
/// Both sequences are compared as bags (multisets) of their elements. With
/// `x_i` and `y_i` being the number of occurrences of element `i` in the first
/// and in the second sequence, the similarity is:
///
/// ```text
/// sum((x_i + y_i) * min(x_i, y_i) / max(x_i, y_i)) / sum(x_i + y_i)
/// ```
///
/// where both sums run over every element found in either sequence. Elements
/// present in only one of the sequences add nothing to the numerator.
/// Two empty sequences are treated as identical (similarity of 1.0).
///
/// The metric is always normalized on the interval from 0.0 to 1.0.
///
/// [Roberts similarity]: https://github.com/chrislit/abydos/blob/master/abydos/distance/_roberts.py
#[derive(Default)]
pub struct Roberts {}
impl Algorithm<f64> for Roberts {
    /// Computes the Roberts similarity of two element sequences.
    ///
    /// Each sequence is counted into a bag of elements. Every element of the
    /// merged bag that occurs in both inputs contributes
    /// `(x + y) * min(x, y) / max(x, y)` to the numerator, and every element
    /// contributes `x + y` to the denominator, where `x` and `y` are its
    /// occurrence counts in the first and second sequence.
    ///
    /// The returned `Result` is a similarity (`is_distance` is `false`) with a
    /// maximum of 1.0; `len1` and `len2` hold the total element counts of the
    /// inputs. When both inputs are empty the similarity is 1.0.
    fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
    where
        C: Iterator<Item = E>,
        E: Eq + core::hash::Hash,
    {
        let left = Counter::from_iter(s1);
        let right = Counter::from_iter(s2);
        let len1 = left.count();
        let len2 = right.count();

        let abs = if len1 != 0 || len2 != 0 {
            // At least one input is non-empty, so the denominator below is positive.
            let union = left.merge(&right);
            let mut weighted: f64 = 0.;
            let mut total: usize = 0;
            for item in union.keys() {
                let a = *left.get(item).unwrap_or(&0);
                let b = *right.get(item).unwrap_or(&0);
                // Elements missing from one side have a zero minimum and add nothing.
                if a != 0 && b != 0 {
                    let (low, high) = if a <= b { (a, b) } else { (b, a) };
                    weighted += ((a + b) * low) as f64 / high as f64;
                }
                total += a + b;
            }
            weighted / total as f64
        } else {
            // Two empty sequences are considered identical.
            1.0
        };

        Result {
            abs,
            is_distance: false,
            max: 1.,
            len1,
            len2,
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::str::roberts;
    use assert2::assert;
    use rstest::rstest;

    /// Largest absolute difference below which two floats count as equal.
    const TOLERANCE: f64 = 1E-5;

    /// Reports whether `left` and `right` differ by less than `TOLERANCE`.
    fn approx_eq(left: f64, right: f64) -> bool {
        (left - right).abs() < TOLERANCE
    }

    #[rstest]
    // Trivial inputs: identical or empty strings.
    #[case("", "", 1.)]
    #[case("a", "a", 1.)]
    #[case("", "a", 0.)]
    #[case("a", "", 0.)]
    // Parity with abydos.
    // abydos tokenizes text into bi-grams with word separators by default for
    // Roberts, and its own tests cover exactly that setup. textdistance, however,
    // works on a bag of chars by default and never adds word separators. The
    // expected values below therefore do not come from the abydos tests; they are
    // the results of running the inputs through `Roberts(qval=1).sim(a, b)`.
    #[case("cat", "hat", 0.6666666666666666)]
    #[case("Niall", "Neil", 0.6111111111111112)]
    #[case("aluminum", "Catalan", 0.3555555555555555)]
    #[case("ATCG", "TAGC", 1.0)]
    #[case("Nigel", "Niall", 0.55)]
    #[case("Niall", "Nigel", 0.55)]
    #[case("Colin", "Coiln", 1.0)]
    #[case("Coiln", "Colin", 1.0)]
    #[case("ATCAACGAGT", "AACGATTAG", 0.9210526315789473)]
    fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) {
        let actual = roberts(s1, s2);
        let within_tolerance = approx_eq(actual, exp);
        assert!(
            within_tolerance,
            "roberts({}, {}) is {}, not {}",
            s1,
            s2,
            actual,
            exp
        );
    }
}