| // Copyright (C) 2024 The Android Open Source Project |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #[cfg(feature = "fuzzy_content_match")] |
| use itertools::Itertools; |
| use spdx::{LicenseReq, Licensee}; |
| use std::sync::LazyLock; |
| #[cfg(feature = "fuzzy_content_match")] |
| use textdistance::str::ratcliff_obershelp; |
| |
/// Normalizes text so that license bodies can be compared regardless of case, punctuation and
/// whitespace layout.
///
/// The input is lowercased, then every run of characters that are neither alphanumeric nor `.`
/// is collapsed into a single space. Leading and trailing separators are dropped entirely.
fn strip_punctuation(text: &str) -> String {
    let lowered = text.to_lowercase();
    // Periods are kept as part of a word; every other non-alphanumeric character (whitespace
    // included) only separates words.
    let is_separator = |ch: char| !(ch.is_alphanumeric() || ch == '.');
    lowered
        .split(is_separator)
        .filter(|word| !word.is_empty())
        .collect::<Vec<_>>()
        .join(" ")
}
| |
| pub(crate) fn classify_license_file_contents(contents: &str) -> Vec<LicenseReq> { |
| let contents = strip_punctuation(contents); |
| |
| // Exact match |
| let mut matches = Vec::new(); |
| for (req, required_text) in LICENSE_CONTENT_CLASSIFICATION.iter() { |
| if contents.contains(required_text) { |
| matches.push(req.clone()); |
| } |
| } |
| if !matches.is_empty() { |
| return matches; |
| } |
| |
| // Fuzzy match. This is expensive, so start with licenses that are closest in length to the file, |
| // and only return a single match at most. |
| #[cfg(feature = "fuzzy_content_match")] |
| for (req, required_text) in LICENSE_CONTENT_CLASSIFICATION.iter().sorted_by(|a, b| { |
| let mut ra = a.1.len() as f32 / contents.len() as f32; |
| let mut rb = b.1.len() as f32 / contents.len() as f32; |
| if ra > 1.0 { |
| ra = 1.0 / ra; |
| } |
| if rb > 1.0 { |
| rb = 1.0 / rb; |
| } |
| rb.partial_cmp(&ra).unwrap() |
| }) { |
| let similarity = ratcliff_obershelp(contents.as_str(), required_text); |
| if similarity > 0.95 { |
| matches.push(req.clone()); |
| break; |
| } |
| } |
| |
| matches |
| } |
| |
| static LICENSE_CONTENT_CLASSIFICATION: LazyLock<Vec<(LicenseReq, String)>> = LazyLock::new(|| { |
| vec![ |
| ("MIT", include_str!("licenses/MIT.txt")), |
| ("Apache-2.0", include_str!("licenses/Apache-2.0.txt")), |
| ("ISC", include_str!("licenses/ISC.txt")), |
| ("MPL-2.0", include_str!("licenses/MPL-2.0.txt")), |
| ("BSD-2-Clause", include_str!("licenses/BSD-2-Clause.txt")), |
| ("BSD-3-Clause", include_str!("licenses/BSD-3-Clause.txt")), |
| ("Unicode-3.0", include_str!("licenses/Unicode-3.0.txt")), |
| ("Unlicense", include_str!("licenses/Unlicense.txt")), |
| ("Zlib", include_str!("licenses/Zlib.txt")), |
| ("OpenSSL", include_str!("licenses/OpenSSL.txt")), |
| ("NCSA", include_str!("licenses/NCSA.txt")), |
| ] |
| .into_iter() |
| .map(|(req, tokens)| { |
| let tokens = strip_punctuation(tokens); |
| assert!(!tokens.is_empty()); |
| (Licensee::parse(req).unwrap().into_req(), tokens) |
| }) |
| .collect() |
| }); |
| |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `LicenseReq` for an SPDX license identifier.
    fn req(spdx_id: &str) -> LicenseReq {
        Licensee::parse(spdx_id).unwrap().into_req()
    }

    /// Checks each normalization rule of `strip_punctuation` on a minimal input.
    #[test]
    fn test_strip_punctuation() {
        // (input, expected output, what the case demonstrates)
        let cases = [
            ("FOO BAR", "foo bar", "Converted to lowercase"),
            ("foo, bar", "foo bar", "Punctuation removed"),
            ("foo. bar", "foo. bar", "Periods preserved"),
            (" foo bar ", "foo bar", "Leading and trailing whitespace stripped"),
            (" foo\n\n\n\nbar ", "foo bar", "Multiple whitespace replaced with single space"),
        ];
        for (input, expected, reason) in cases {
            assert_eq!(strip_punctuation(input), expected, "{reason}");
        }
    }

    /// Unknown text yields no match; a real MIT license file is found by exact match.
    #[test]
    fn test_classify() {
        let unknown = classify_license_file_contents("foo");
        assert!(unknown.is_empty());

        let mit = classify_license_file_contents(include_str!(
            "testdata/LICENSE-MIT-aarch64-paging.txt"
        ));
        assert_eq!(mit, vec![req("MIT")]);
    }

    /// A BSD-3-Clause file from the test data is expected to be recognized when fuzzy matching
    /// is enabled.
    #[cfg(feature = "fuzzy_content_match")]
    #[test]
    fn test_classify_fuzzy() {
        let classified =
            classify_license_file_contents(include_str!("testdata/BSD-3-Clause-bindgen.txt"));
        assert_eq!(classified, vec![req("BSD-3-Clause")]);
    }

    /// A file holding two licenses back to back reports both, in classification-table order.
    #[test]
    fn concatenated_licenses() {
        let combined = format!(
            "{}\n\n{}",
            include_str!("licenses/Apache-2.0.txt"),
            include_str!("licenses/MIT.txt")
        );
        assert_eq!(
            classify_license_file_contents(combined.as_str()),
            vec![req("MIT"), req("Apache-2.0")]
        );
    }
}