// blob: 41ff1af5b4ca9c67a5cbf19dcd2c465d929dd756
use crate::{
error::{ParseError, Reason},
ExceptionId, LicenseId,
};
/// Parsing configuration for SPDX expressions
///
/// Every option is an opt-in relaxation of the SPDX syntax. The derived
/// `Default` leaves all of them disabled.
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ParseMode {
    /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
    /// the SPDX spec, but enabling this option allows them to be lowercased
    pub allow_lower_case_operators: bool,
    /// Allows the use of `/` as a synonym for the `OR` operator.
    ///
    /// This also allows for not having whitespace between the `/` and the terms
    /// on either side
    pub allow_slash_as_or_operator: bool,
    /// Allows some invalid/imprecise identifiers as synonyms for an actual
    /// license identifier.
    ///
    /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
    /// of the current synonyms. Note that this list is not comprehensive but
    /// can be expanded upon when invalid identifiers are found in the wild.
    pub allow_imprecise_license_names: bool,
    /// The various GPL licenses diverge from every other license in the SPDX
    /// license list: "or later" is expressed with an `-or-later` suffix on the
    /// base license (eg. `GPL-3.0-or-later`) rather than with a postfix `+`
    /// (eg. `GPL-3.0+`).
    ///
    /// This option just allows GPL licenses to be treated similarly to all of
    /// the other SPDX licenses.
    pub allow_postfix_plus_on_gpl: bool,
}
impl ParseMode {
/// Strict, specification compliant SPDX parsing.
///
/// 1. Only license identifiers in the SPDX license list, or
/// Document/LicenseRef, are allowed. The license identifiers are also
/// case-sensitive.
/// 1. `WITH`, `AND`, and `OR` are the only valid operators
pub const STRICT: Self = Self {
allow_lower_case_operators: false,
allow_slash_as_or_operator: false,
allow_imprecise_license_names: false,
allow_postfix_plus_on_gpl: false,
};
/// Allow non-conforming syntax for crates-io compatibility
///
/// 1. Additional, invalid, identifiers are accepted and mapped to a correct
/// SPDX license identifier.
/// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the
/// list of additionally accepted identifiers and the license they
/// correspond to.
/// 1. `/` can by used as a synonym for `OR`, and doesn't need to be
/// separated by whitespace from the terms it combines
pub const LAX: Self = Self {
allow_lower_case_operators: true,
allow_slash_as_or_operator: true,
allow_imprecise_license_names: true,
allow_postfix_plus_on_gpl: true,
};
}
/// One lexical element of an SPDX license expression
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Token<'a> {
    /// A license that was recognized as an SPDX license id
    Spdx(LicenseId),
    /// A `LicenseRef-` prefixed id, which may itself be qualified by a
    /// leading `DocumentRef-`
    LicenseRef {
        /// The text following `DocumentRef-`, when one was given
        doc_ref: Option<&'a str>,
        /// The text following `LicenseRef-`
        lic_ref: &'a str,
    },
    /// An exception that was recognized as an SPDX exception id
    Exception(ExceptionId),
    /// The postfix `+`, meaning "or later", applied to an SPDX license id
    Plus,
    /// The `(` that opens a group
    OpenParen,
    /// The `)` that closes a group
    CloseParen,
    /// The `WITH` operator
    With,
    /// The `AND` operator
    And,
    /// The `OR` operator
    Or,
}
impl std::fmt::Display for Token<'_> {
    /// Displays the token exactly as its `Debug` implementation would,
    /// honoring whatever flags the formatter carries.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        <Self as std::fmt::Debug>::fmt(self, formatter)
    }
}
impl<'a> Token<'a> {
fn len(&self) -> usize {
match self {
Token::Spdx(id) => id.name.len(),
Token::Exception(e) => e.name.len(),
Token::With => 4,
Token::And => 3,
Token::Or => 2,
Token::Plus | Token::OpenParen | Token::CloseParen => 1,
Token::LicenseRef { doc_ref, lic_ref } => {
doc_ref.map_or(0, |d| {
// +1 is for the `:`
"DocumentRef-".len() + d.len() + 1
}) + "LicenseRef-".len()
+ lic_ref.len()
}
}
}
}
/// Iterates over an SPDX license expression, producing either the next
/// token or a `ParseError`.
///
/// Prefer to use `Expression::parse` or `Licensee::parse` rather
/// than directly using the lexer
pub struct Lexer<'a> {
    /// The complete expression, copied into any `ParseError` that is produced
    original: &'a str,
    /// The tail of the expression that has not been lexed yet
    inner: &'a str,
    /// Byte offset of `inner` within `original`
    offset: usize,
    /// The syntax relaxations applied while lexing
    mode: ParseMode,
}
impl<'a> Lexer<'a> {
    /// Creates a Lexer over a license expression, using [`ParseMode::STRICT`]
    #[must_use]
    pub fn new(text: &'a str) -> Self {
        Self::new_mode(text, ParseMode::STRICT)
    }

    /// Creates a Lexer over a license expression, lexed according to `mode`
    ///
    /// With [`ParseMode::LAX`] it allows non-conforming syntax
    /// used in crates-io crates.
    #[must_use]
    pub fn new_mode(text: &'a str, mode: ParseMode) -> Self {
        Self {
            inner: text,
            original: text,
            offset: 0,
            mode,
        }
    }

    /// Whether `c` may appear in a license or document ref: `[-a-zA-Z0-9.]`
    #[inline]
    fn is_ref_char(c: &char) -> bool {
        c.is_ascii_alphanumeric() || *c == '-' || *c == '.'
    }

    /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+`
    fn find_text_token(text: &'a str) -> Option<&'a str> {
        let is_token_char = |c: char| Self::is_ref_char(&c) || c == ':';
        // `find` reports a byte index, so the slice below is always on a
        // character boundary even if the token is followed by non-ASCII text
        let end = text
            .find(|c: char| !is_token_char(c))
            .unwrap_or(text.len());
        (end > 0).then(|| &text[..end])
    }

    /// Extract the text after `prefix` that is made up of valid ref characters
    ///
    /// Returns `None` if `text` does not start with `prefix`, or if the prefix
    /// is not followed by at least one ref character, as an empty ref is not
    /// a valid identifier.
    fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> {
        let value = text.strip_prefix(prefix)?;
        let end = value
            .find(|c: char| !Self::is_ref_char(&c))
            .unwrap_or(value.len());
        (end > 0).then(|| &value[..end])
    }

    /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)`
    #[inline]
    fn find_license_ref(text: &'a str) -> Option<&'a str> {
        Self::find_ref("LicenseRef-", text)
    }

    /// Return a document ref and license ref if found,
    /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)`
    fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> {
        const DOC_PREFIX: &str = "DocumentRef-";

        let (doc, lic) = text.split_once(':')?;
        // The document ref has to run right up to the `:` separator
        let doc_ref = Self::find_ref(DOC_PREFIX, doc)
            .filter(|doc_ref| DOC_PREFIX.len() + doc_ref.len() == doc.len())?;
        let lic_ref = Self::find_license_ref(lic)?;
        Some((doc_ref, lic_ref))
    }
}
/// A wrapper around a particular token that includes the span of the characters
/// in the original string, for diagnostic purposes
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexerToken<'a> {
    /// The token that was lexed
    pub token: Token<'a>,
    /// The byte range of the token's characters in the original license
    /// expression
    pub span: std::ops::Range<usize>,
}
impl<'a> Iterator for Lexer<'a> {
type Item = Result<LexerToken<'a>, ParseError>;
fn next(&mut self) -> Option<Self::Item> {
#[allow(clippy::unnecessary_wraps)]
fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> {
let len = token.len();
Some(Ok((token, len)))
}
// Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
let non_whitespace_index = match self.inner.find(|c: char| !c.is_whitespace()) {
Some(idx) => idx,
None => self.inner.len(),
};
self.inner = &self.inner[non_whitespace_index..];
self.offset += non_whitespace_index;
match self.inner.chars().next() {
None => None,
// From SPDX 2.1 spec
// There MUST NOT be whitespace between a license-id and any following "+".
Some('+') => {
if non_whitespace_index == 0 {
ok_token(Token::Plus)
} else {
Some(Err(ParseError {
original: self.original.to_owned(),
span: self.offset - non_whitespace_index..self.offset,
reason: Reason::SeparatedPlus,
}))
}
}
Some('(') => ok_token(Token::OpenParen),
Some(')') => ok_token(Token::CloseParen),
Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))),
Some(_) => match Lexer::find_text_token(self.inner) {
None => Some(Err(ParseError {
original: self.original.to_owned(),
span: self.offset..self.offset + self.inner.len(),
reason: Reason::InvalidCharacters,
})),
Some(m) => {
if m == "WITH" {
ok_token(Token::With)
} else if m == "AND" {
ok_token(Token::And)
} else if m == "OR" {
ok_token(Token::Or)
} else if self.mode.allow_lower_case_operators && m == "and" {
ok_token(Token::And)
} else if self.mode.allow_lower_case_operators && m == "or" {
ok_token(Token::Or)
} else if self.mode.allow_lower_case_operators && m == "with" {
ok_token(Token::With)
} else if let Some(lic_id) = crate::license_id(m) {
ok_token(Token::Spdx(lic_id))
} else if let Some(exc_id) = crate::exception_id(m) {
ok_token(Token::Exception(exc_id))
} else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m)
{
ok_token(Token::LicenseRef {
doc_ref: Some(doc_ref),
lic_ref,
})
} else if let Some(lic_ref) = Lexer::find_license_ref(m) {
ok_token(Token::LicenseRef {
doc_ref: None,
lic_ref,
})
} else if let Some((lic_id, token_len)) =
if self.mode.allow_imprecise_license_names {
crate::imprecise_license_id(self.inner)
} else {
None
}
{
Some(Ok((Token::Spdx(lic_id), token_len)))
} else {
Some(Err(ParseError {
original: self.original.to_owned(),
span: self.offset..self.offset + m.len(),
reason: Reason::UnknownTerm,
}))
}
}
},
}
.map(|res| {
res.map(|(tok, len)| {
let start = self.offset;
self.inner = &self.inner[len..];
self.offset += len;
LexerToken {
token: tok,
span: start..self.offset,
}
})
})
}
}