blob: 3f93a08e13c00aece4553c427fbe03e5c7cf44ce [file] [log] [blame]
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
use std::{collections::HashMap, fmt};
use serde::{Deserialize, Serialize};
use crate::{
ngram::NgramSet,
preproc::{apply_aggressive, apply_normalizers},
};
/// The type of a license entry (typically in a `Store`).
///
/// Serialized in lowercase (e.g. `"original"`) via the serde `rename_all`
/// attribute below; changing variants or attributes changes the stored format.
#[derive(Clone, Copy, PartialEq, Debug, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum LicenseType {
    /// The canonical text of the license.
    Original,
    /// A license header. There may be more than one in a `Store`.
    Header,
    /// An alternate form of a license. This is intended to be used for
    /// alternate _formats_ of a license, not for variants where the text has
    /// different meaning. Not currently used in askalono's SPDX dataset.
    Alternate,
}
impl fmt::Display for LicenseType {
    /// Render a short human-readable label for this license type.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            LicenseType::Original => "original text",
            LicenseType::Header => "license header",
            LicenseType::Alternate => "alternate text",
        };
        f.write_str(label)
    }
}
/// A structure representing compiled text/matching data.
///
/// This is the key structure used to compare two texts against one another. It
/// handles pre-processing the text to n-grams, scoring, and optimizing the
/// result to try to identify specific details about a match.
///
/// # Examples
///
/// Basic scoring of two texts:
///
/// ```
/// use askalono::TextData;
///
/// let license = TextData::from("My First License");
/// let sample = TextData::from("copyright 20xx me irl\n\n // my first license");
/// assert_eq!(sample.match_score(&license), 1.0);
/// ```
///
/// The above example is a perfect match, as identifiable copyright statements
/// are stripped out during pre-processing.
///
/// Building on that, TextData is able to tell you _where_ in the text a
/// license is located:
///
/// ```
/// # use std::error::Error;
/// # use askalono::TextData;
/// # fn main() -> Result<(), Box<dyn Error>> {
/// # let license = TextData::from("My First License");
/// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ...");
/// let (optimized, score) = sample.optimize_bounds(&license);
/// assert_eq!((1, 2), optimized.lines_view());
/// assert!(score > 0.99f32, "license within text matches");
/// # Ok(())
/// # }
/// ```
#[derive(Serialize, Deserialize, Clone, Debug)]
pub struct TextData {
    // Bigram set built from the processed text; the basis of all scoring.
    match_data: NgramSet,
    // Active (start, end) line range; start inclusive, end exclusive.
    lines_view: (usize, usize),
    // Normalized source lines; None once `without_text` has been called.
    lines_normalized: Option<Vec<String>>,
    // Aggressively-processed text; None once `without_text` has been called.
    text_processed: Option<String>,
}
const TEXTDATA_TEXT_ERROR: &str = "TextData does not have original text";
impl TextData {
    /// Create a new TextData structure from a string.
    ///
    /// The given text will be normalized, then smashed down into n-grams for
    /// matching. By default, the normalized text is stored inside the
    /// structure for future diagnostics. This is necessary for optimizing a
    /// match and for diffing against other texts. If you don't want this extra
    /// data, you can call `without_text` to throw it out. Generally, as a user
    /// of this library you want to keep the text data, but askalono will throw
    /// it away in its own `Store` as it's not needed.
    pub fn new(text: &str) -> TextData {
        let normalized = apply_normalizers(text);
        let normalized_joined = normalized.join("\n");
        let processed = apply_aggressive(&normalized_joined);
        // Bigrams (n = 2) are the unit of comparison throughout.
        let match_data = NgramSet::from_str(&processed, 2);

        TextData {
            match_data,
            lines_view: (0, normalized.len()),
            lines_normalized: Some(normalized),
            text_processed: Some(processed),
        }
    }

    /// Consume this `TextData`, returning one without normalized/processed
    /// text stored.
    ///
    /// Unless you know you don't want the text, you probably don't want to use
    /// this. Other methods on `TextData` require that text is present.
    pub fn without_text(self) -> Self {
        TextData {
            match_data: self.match_data,
            // With no stored lines there is nothing to view.
            lines_view: (0, 0),
            lines_normalized: None,
            text_processed: None,
        }
    }

    /// Get the bounds of the active line view.
    ///
    /// This represents the "active" region of lines that matches are generated
    /// from. The bounds are a 0-indexed `(start, end)` tuple, with inclusive
    /// start and exclusive end indices. See `optimize_bounds`.
    ///
    /// This is largely for informational purposes; other methods in
    /// `TextView`, such as `lines` and `match_score`, will already account for
    /// the line range. However, it's useful to call it after running
    /// `optimize_bounds` to discover where the input text was located.
    pub fn lines_view(&self) -> (usize, usize) {
        self.lines_view
    }

    /// Clone this `TextView`, creating a copy with the given view.
    ///
    /// This will re-generate match data for the given view. It's used in
    /// `optimize_bounds` to shrink/expand the view of the text to discover
    /// bounds.
    ///
    /// Other methods on `TextView` respect this boundary, so it's not needed
    /// outside this struct.
    ///
    /// # Panics
    ///
    /// Panics if the text was discarded with `without_text`, or if
    /// `start..end` is out of range of the normalized lines.
    pub fn with_view(&self, start: usize, end: usize) -> Self {
        let view = &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)[start..end];
        let view_joined = view.join("\n");
        let processed = apply_aggressive(&view_joined);
        TextData {
            match_data: NgramSet::from_str(&processed, 2),
            lines_view: (start, end),
            lines_normalized: self.lines_normalized.clone(),
            text_processed: Some(processed),
        }
    }

    /// "Erase" the current lines in view and restore the view to its original
    /// bounds.
    ///
    /// For example, consider a file with two licenses in it. One was identified
    /// (and located) with `optimize_bounds`. Now you want to find the other:
    /// white-out the matched lines, and re-run the overall search to find a
    /// new high score.
    pub fn white_out(&self) -> Self {
        // note that we're not using the view here...
        let lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR);

        // ...because it's used here to exclude lines: anything inside the
        // current view becomes an empty line, everything else is kept.
        let new_normalized: Vec<String> = lines
            .iter()
            .enumerate()
            .map(|(i, line)| {
                if i >= self.lines_view.0 && i < self.lines_view.1 {
                    "".to_string()
                } else {
                    line.clone()
                }
            })
            .collect();

        let processed = apply_aggressive(&new_normalized.join("\n"));

        TextData {
            match_data: NgramSet::from_str(&processed, 2),
            lines_view: (0, new_normalized.len()),
            lines_normalized: Some(new_normalized),
            text_processed: Some(processed),
        }
    }

    /// Get a slice of the normalized lines in this `TextData`, restricted to
    /// the active view.
    ///
    /// # Panics
    ///
    /// Panics if the text was discarded with `without_text`.
    pub fn lines(&self) -> &[String] {
        &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)
            [self.lines_view.0..self.lines_view.1]
    }

    #[doc(hidden)]
    pub fn text_processed(&self) -> Option<&str> {
        // `as_deref` borrows the stored String as &str; equivalent to the
        // `.as_ref().map(String::as_ref)` pattern clippy flags
        // (option_as_ref_deref), but clearer.
        self.text_processed.as_deref()
    }

    /// Compare this `TextData` with another, returning a similarity score.
    ///
    /// This is what's used during analysis to rank licenses.
    pub fn match_score(&self, other: &TextData) -> f32 {
        // Sørensen–Dice coefficient over the bigram sets, in [0.0, 1.0].
        self.match_data.dice(&other.match_data)
    }

    #[cfg(feature = "spdx")]
    pub(crate) fn eq_data(&self, other: &Self) -> bool {
        self.match_data.eq(&other.match_data)
    }

    /// Attempt to optimize a known match to locate possible line ranges.
    ///
    /// Returns a new `TextData` struct and a score. The returned struct is a
    /// clone of `self`, with its view set to the best match against `other`.
    ///
    /// This will respect any views set on the TextData (an optimized result
    /// won't go outside the original view).
    ///
    /// Note that this won't be 100% optimal if there are blank lines
    /// surrounding the actual match, since successive blank lines in a range
    /// will likely have the same score.
    ///
    /// You should check the value of `lines_view` on the returned struct to
    /// find the line ranges.
    ///
    /// # Panics
    ///
    /// Panics if the text was discarded with `without_text`.
    pub fn optimize_bounds(&self, other: &TextData) -> (Self, f32) {
        assert!(self.lines_normalized.is_some(), "{}", TEXTDATA_TEXT_ERROR);
        let view = self.lines_view;

        // optimize the ending bounds of the text match
        let (end_optimized, _) = self.search_optimize(
            &|end| self.with_view(view.0, end).match_score(other),
            &|end| self.with_view(view.0, end),
        );
        let new_end = end_optimized.lines_view.1;

        // then optimize the starting bounds, holding the new end fixed
        let (optimized, score) = end_optimized.search_optimize(
            &|start| end_optimized.with_view(start, new_end).match_score(other),
            &|start| end_optimized.with_view(start, new_end),
        );
        (optimized, score)
    }

    /// Ternary-search over one bound of the view, maximizing `score`.
    ///
    /// `score` maps a candidate bound index to a match score; `value` builds
    /// the `TextData` for the winning index. Returns that struct and its score.
    fn search_optimize(
        &self,
        score: &dyn Fn(usize) -> f32,
        value: &dyn Fn(usize) -> Self,
    ) -> (Self, f32) {
        // cache score checks, since they're kinda expensive
        let mut memo: HashMap<usize, f32> = HashMap::new();
        let mut check_score =
            |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) };

        // Recursive ternary search: split [left, right] at 1/3 and 2/3 and
        // discard the third with the lower score. NOTE(review): this assumes
        // the score is unimodal over the range; plateaus of equal scores
        // (e.g. trailing blank lines) may settle on any index within them.
        fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) {
            if right - left <= 3 {
                // find the index of the highest score in the remaining items
                return (left..=right)
                    .map(|x| (x, score(x)))
                    .fold((0usize, 0f32), |acc, x| if x.1 >= acc.1 { x } else { acc });
            }

            let low = (left * 2 + right) / 3;
            let high = (left + right * 2) / 3;
            let score_low = score(low);
            let score_high = score(high);

            if score_low > score_high {
                search(score, left, high - 1)
            } else {
                search(score, low + 1, right)
            }
        }

        let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1);
        (value(optimal.0), optimal.1)
    }
}
impl<'a> From<&'a str> for TextData {
fn from(text: &'a str) -> Self {
Self::new(text)
}
}
// Fix: the original `impl<'a> From<String>` declared a lifetime parameter
// that is not constrained by the trait, self type, or predicates, which
// rustc rejects (E0207). No lifetime is needed for an owned String.
impl From<String> for TextData {
    /// Build a `TextData` from owned text; see `TextData::new`.
    fn from(text: String) -> Self {
        Self::new(&text)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // psst:
    // cargo test -- --nocapture

    // End-to-end check of optimize_bounds: the license must be located at the
    // correct line range even as content is appended/prepended to the sample.
    #[test]
    fn optimize_bounds() {
        let license_text = "this is a license text\nor it pretends to be one\nit's just a test";
        let sample_text = "this is a license text\nor it pretends to be one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too";

        let license = TextData::from(license_text).without_text();
        let sample = TextData::from(sample_text);

        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        assert_eq!((0, 3), optimized.lines_view);

        // add more to the string, try again (avoid int trunc screwups)
        let sample_text = format!("{}\none more line", sample_text);
        let sample = TextData::from(sample_text.as_str());
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        assert_eq!((0, 3), optimized.lines_view);

        // add to the beginning too
        let sample_text = format!("some content\nat\n\nthe beginning\n{}", sample_text);
        let sample = TextData::from(sample_text.as_str());
        let (optimized, _) = sample.optimize_bounds(&license);
        println!("{:?}", optimized.lines_view);
        println!("{:?}", optimized.lines_normalized);
        // end bounds at 7 and 8 have the same score, since they're empty lines (not
        // counted). askalono is not smart enough to trim this as close as it
        // can.
        assert!(
            (4, 7) == optimized.lines_view || (4, 8) == optimized.lines_view,
            "bounds are (4, 7) or (4, 8)"
        );
    }

    // if a view is set on the text data, optimize_bounds must not find text
    // outside of that range
    #[test]
    fn optimize_doesnt_grow_view() {
        let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8";
        let license_text = "aaa aaa aaa aaa aaa";
        let sample = TextData::from(sample_text);
        let license = TextData::from(license_text).without_text();

        // sanity: the optimized bounds should be at (3, 7)
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);

        // this should still work
        let sample = sample.with_view(3, 7);
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);

        // but if we shrink the view further, it shouldn't be outside that range
        let sample = sample.with_view(4, 6);
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((4, 6), optimized.lines_view);

        // restoring the view should still be OK too
        let sample = sample.with_view(0, 9);
        let (optimized, _) = sample.optimize_bounds(&license);
        assert_eq!((3, 7), optimized.lines_view);
    }

    // ensure we don't choke on small TextData matches; dice() must be
    // symmetric regardless of operand order
    #[test]
    fn match_small() {
        let a = TextData::from("a b");
        let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");
        let x = a.match_score(&b);
        let y = b.match_score(&a);
        assert_eq!(x, y);
    }

    // don't choke on empty TextData either
    #[test]
    fn match_empty() {
        let a = TextData::from("");
        let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg");
        let x = a.match_score(&b);
        let y = b.match_score(&a);
        assert_eq!(x, y);
    }

    // with_view restricts processing to a line range; white_out then erases
    // that range and restores the full bounds
    #[test]
    fn view_and_white_out() {
        let a = TextData::from("aaa\nbbb\nccc\nddd");
        assert_eq!(Some("aaa bbb ccc ddd"), a.text_processed());

        let b = a.with_view(1, 3);
        assert_eq!(2, b.lines().len());
        assert_eq!(Some("bbb ccc"), b.text_processed());

        let c = b.white_out();
        assert_eq!(Some("aaa ddd"), c.text_processed());
    }
}