vendor/spdx-0.10.6/src/lexer.rs - toolchain/rustc - Git at Google

 use crate::{
     error::{ParseError, Reason},
     ExceptionId, LicenseId,
 };

 /// Parsing configuration for SPDX expressions
 ///
 /// Every option is `false` when built via `Default`, which is the same set
 /// of values as [`ParseMode::STRICT`].
 #[derive(Default, Copy, Clone)]
 pub struct ParseMode {
     /// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
     /// the SPDX spec, but enabling this option also accepts the all-lowercase
     /// spellings `and`, `or`, and `with`
     pub allow_lower_case_operators: bool,
     /// Allows the use of `/` as a synonym for the `OR` operator.
     ///
     /// This also allows for not having whitespace between the `/` and the terms
     /// on either side
     pub allow_slash_as_or_operator: bool,
     /// Allows some invalid/imprecise identifiers as synonyms for an actual
     /// license identifier.
     ///
     /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
     /// of the current synonyms. Note that this list is not comprehensive but
     /// can be expanded upon when invalid identifiers are found in the wild.
     pub allow_imprecise_license_names: bool,
     /// The various GPL licenses diverge from every other license in the SPDX
     /// license list by having an `-or-later` variant that is used as a suffix
     /// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
     /// `GPL-3.0+`.
     ///
     /// This option just allows GPL licenses to be treated similarly to all of
     /// the other SPDX licenses.
     // NOTE(review): the lexer in this file never reads this flag; presumably
     // it is consulted by the expression parser - confirm.
     pub allow_postfix_plus_on_gpl: bool,
 }

 impl ParseMode {
     /// Strict, specification compliant SPDX parsing.
     ///
     /// Every option of [`ParseMode`] is disabled, so:
     ///
     /// 1. Only license identifiers in the SPDX license list, or
     /// Document/LicenseRef, are allowed. The license identifiers are also
     /// case-sensitive.
     /// 2. `WITH`, `AND`, and `OR` are the only valid operators
     pub const STRICT: Self = Self {
         allow_lower_case_operators: false,
         allow_slash_as_or_operator: false,
         allow_imprecise_license_names: false,
         allow_postfix_plus_on_gpl: false,
     };

     /// Allow non-conforming syntax for crates-io compatibility
     ///
     /// Every option of [`ParseMode`] is enabled, so:
     ///
     /// 1. Additional, invalid, identifiers are accepted and mapped to a correct
     /// SPDX license identifier.
     /// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the
     /// list of additionally accepted identifiers and the license they
     /// correspond to.
     /// 2. `/` can be used as a synonym for `OR`, and doesn't need to be
     /// separated by whitespace from the terms it combines
     /// 3. The lowercase operators `and`, `or`, and `with` are accepted
     /// 4. `allow_postfix_plus_on_gpl` is enabled, see the documentation of
     /// that field
     pub const LAX: Self = Self {
         allow_lower_case_operators: true,
         allow_slash_as_or_operator: true,
         allow_imprecise_license_names: true,
         allow_postfix_plus_on_gpl: true,
     };
 }

 /// A single token in an SPDX license expression
 ///
 /// The lifetime `'a` is that of the expression text being lexed; only
 /// [`Token::LicenseRef`] borrows from it.
 #[derive(Clone, Debug, PartialEq, Eq)]
 pub enum Token<'a> {
     /// A recognized SPDX license id
     Spdx(LicenseId),
     /// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
     LicenseRef {
         /// The text following the `DocumentRef-` prefix, if one was present
         doc_ref: Option<&'a str>,
         /// The text following the `LicenseRef-` prefix
         lic_ref: &'a str,
     },
     /// A recognized SPDX exception id
     Exception(ExceptionId),
     /// A postfix `+` indicating "or later" for a particular SPDX license id
     Plus,
     /// A `(` for starting a group
     OpenParen,
     /// A `)` for ending a group
     CloseParen,
     /// A `WITH` operator
     With,
     /// An `AND` operator
     And,
     /// An `OR` operator
     Or,
 }

 impl<'a> std::fmt::Display for Token<'a> {
     /// Writes the token using its `Debug` representation, forwarding the
     /// formatter (and therefore any flags set on it) untouched
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         <Self as std::fmt::Debug>::fmt(self, f)
     }
 }

 impl<'a> Token<'a> {
     fn len(&self) -> usize {
         match self {
             Token::Spdx(id) => id.name.len(),
             Token::Exception(e) => e.name.len(),
             Token::With => 4,
             Token::And => 3,
             Token::Or => 2,
             Token::Plus | Token::OpenParen | Token::CloseParen => 1,
             Token::LicenseRef { doc_ref, lic_ref } => {
                 doc_ref.map_or(0, |d| {
                     // +1 is for the `:`
                     "DocumentRef-".len() + d.len() + 1
                 }) + "LicenseRef-".len()
                     + lic_ref.len()
             }
         }
     }
 }

 /// Allows iteration through an SPDX license expression, yielding
 /// a token or a `ParseError`.
 ///
 /// Prefer to use `Expression::parse` or `Licensee::parse` rather
 /// than directly using the lexer
 pub struct Lexer<'a> {
     /// The portion of the expression that has not been lexed yet
     inner: &'a str,
     /// The complete expression, copied into any `ParseError` that is produced
     original: &'a str,
     /// Byte offset of `inner` within `original`
     offset: usize,
     /// The options controlling which non-conforming syntax is accepted
     mode: ParseMode,
 }

 impl<'a> Lexer<'a> {
     /// Creates a Lexer over a license expression, using [`ParseMode::STRICT`]
     #[must_use]
     pub fn new(text: &'a str) -> Self {
         Self::new_mode(text, ParseMode::STRICT)
     }

     /// Creates a Lexer over a license expression, lexed according to `mode`
     ///
     /// With [`ParseMode::LAX`] it allows non-conforming syntax
     /// used in crates-io crates.
     #[must_use]
     pub fn new_mode(text: &'a str, mode: ParseMode) -> Self {
         Self {
             inner: text,
             original: text,
             offset: 0,
             mode,
         }
     }

     /// Returns `true` for the characters allowed in a license or document
     /// ref - equivalent to the regex class `[-a-zA-Z0-9.]`
     #[inline]
     fn is_ref_char(c: &char) -> bool {
         c.is_ascii_alphanumeric() || *c == '-' || *c == '.'
     }

     /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+`
     fn find_text_token(text: &'a str) -> Option<&'a str> {
         // Byte index of the first character that can't be part of a token
         let end = text
             .find(|c: char| !(Self::is_ref_char(&c) || c == ':'))
             .unwrap_or(text.len());

         if end > 0 {
             Some(&text[..end])
         } else {
             None
         }
     }

     /// Extract the text after `prefix` if made up of valid ref characters
     ///
     /// Returns `None` if `text` doesn't start with `prefix`, or if the prefix
     /// isn't followed by at least one ref character, as the SPDX grammar
     /// requires an id string to be non-empty.
     fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> {
         let value = text.strip_prefix(prefix)?;
         // Byte index of the first character that can't be part of a ref
         let end = value
             .find(|c: char| !Self::is_ref_char(&c))
             .unwrap_or(value.len());

         if end > 0 {
             Some(&value[..end])
         } else {
             None
         }
     }

     /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)`
     #[inline]
     fn find_license_ref(text: &'a str) -> Option<&'a str> {
         Self::find_ref("LicenseRef-", text)
     }

     /// Return a document ref and license ref if found,
     /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)`
     fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> {
         // The first `:` separates the document ref from the license ref
         let (doc, lic) = text.split_once(':')?;
         let doc_ref = Self::find_ref("DocumentRef-", doc)?;
         let lic_ref = Self::find_license_ref(lic)?;
         Some((doc_ref, lic_ref))
     }
 }

 /// A wrapper around a particular token that includes the span of the characters
 /// in the original string, for diagnostic purposes
 #[derive(Debug)]
 pub struct LexerToken<'a> {
     /// The token that was lexed
     pub token: Token<'a>,
     /// The range of the token characters in the original license expression,
     /// expressed as byte offsets
     pub span: std::ops::Range<usize>,
 }

 impl<'a> Iterator for Lexer<'a> {
     type Item = Result<LexerToken<'a>, ParseError>;

     fn next(&mut self) -> Option<Self::Item> {
         #[allow(clippy::unnecessary_wraps)]
         fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> {
             let len = token.len();
             Some(Ok((token, len)))
         }

         // Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
         let non_whitespace_index = match self.inner.find(|c: char| !c.is_whitespace()) {
             Some(idx) => idx,
             None => self.inner.len(),
         };
         self.inner = &self.inner[non_whitespace_index..];
         self.offset += non_whitespace_index;

         match self.inner.chars().next() {
             None => None,
             // From SPDX 2.1 spec
             // There MUST NOT be whitespace between a license-id and any following "+".
             Some('+') => {
                 if non_whitespace_index == 0 {
                     ok_token(Token::Plus)
                 } else {
                     Some(Err(ParseError {
                         original: self.original.to_owned(),
                         span: self.offset - non_whitespace_index..self.offset,
                         reason: Reason::SeparatedPlus,
                     }))
                 }
             }
             Some('(') => ok_token(Token::OpenParen),
             Some(')') => ok_token(Token::CloseParen),
             Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))),
             Some(_) => match Lexer::find_text_token(self.inner) {
                 None => Some(Err(ParseError {
                     original: self.original.to_owned(),
                     span: self.offset..self.offset + self.inner.len(),
                     reason: Reason::InvalidCharacters,
                 })),
                 Some(m) => {
                     if m == "WITH" {
                         ok_token(Token::With)
                     } else if m == "AND" {
                         ok_token(Token::And)
                     } else if m == "OR" {
                         ok_token(Token::Or)
                     } else if self.mode.allow_lower_case_operators && m == "and" {
                         ok_token(Token::And)
                     } else if self.mode.allow_lower_case_operators && m == "or" {
                         ok_token(Token::Or)
                     } else if self.mode.allow_lower_case_operators && m == "with" {
                         ok_token(Token::With)
                     } else if let Some(lic_id) = crate::license_id(m) {
                         ok_token(Token::Spdx(lic_id))
                     } else if let Some(exc_id) = crate::exception_id(m) {
                         ok_token(Token::Exception(exc_id))
                     } else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m)
                     {
                         ok_token(Token::LicenseRef {
                             doc_ref: Some(doc_ref),
                             lic_ref,
                         })
                     } else if let Some(lic_ref) = Lexer::find_license_ref(m) {
                         ok_token(Token::LicenseRef {
                             doc_ref: None,
                             lic_ref,
                         })
                     } else if let Some((lic_id, token_len)) =
                         if self.mode.allow_imprecise_license_names {
                             crate::imprecise_license_id(self.inner)
                         } else {
                             None
                         }
                     {
                         Some(Ok((Token::Spdx(lic_id), token_len)))
                     } else {
                         Some(Err(ParseError {
                             original: self.original.to_owned(),
                             span: self.offset..self.offset + m.len(),
                             reason: Reason::UnknownTerm,
                         }))
                     }
                 }
             },
         }
         .map(|res| {
             res.map(|(tok, len)| {
                 let start = self.offset;
                 self.inner = &self.inner[len..];
                 self.offset += len;

                 LexerToken {
                     token: tok,
                     span: start..self.offset,
                 }
             })
         })
     }
 }
	use crate::{
	error::{ParseError, Reason},
	ExceptionId, LicenseId,
	};

	/// Parsing configuration for SPDX expressions
	///
	/// Every option is `false` when built via `Default`, which is the same set
	/// of values as [`ParseMode::STRICT`].
	#[derive(Default, Copy, Clone)]
	pub struct ParseMode {
	/// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
	/// the SPDX spec, but enabling this option also accepts the all-lowercase
	/// spellings `and`, `or`, and `with`
	pub allow_lower_case_operators: bool,
	/// Allows the use of `/` as a synonym for the `OR` operator.
	///
	/// This also allows for not having whitespace between the `/` and the terms
	/// on either side
	pub allow_slash_as_or_operator: bool,
	/// Allows some invalid/imprecise identifiers as synonyms for an actual
	/// license identifier.
	///
	/// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
	/// of the current synonyms. Note that this list is not comprehensive but
	/// can be expanded upon when invalid identifiers are found in the wild.
	pub allow_imprecise_license_names: bool,
	/// The various GPL licenses diverge from every other license in the SPDX
	/// license list by having an `-or-later` variant that is used as a suffix
	/// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
	/// `GPL-3.0+`.
	///
	/// This option just allows GPL licenses to be treated similarly to all of
	/// the other SPDX licenses.
	// NOTE(review): the lexer in this file never reads this flag; presumably
	// it is consulted by the expression parser - confirm.
	pub allow_postfix_plus_on_gpl: bool,
	}

	impl ParseMode {
	/// Strict, specification compliant SPDX parsing.
	///
	/// Every option of [`ParseMode`] is disabled, so:
	///
	/// 1. Only license identifiers in the SPDX license list, or
	/// Document/LicenseRef, are allowed. The license identifiers are also
	/// case-sensitive.
	/// 2. `WITH`, `AND`, and `OR` are the only valid operators
	pub const STRICT: Self = Self {
	allow_lower_case_operators: false,
	allow_slash_as_or_operator: false,
	allow_imprecise_license_names: false,
	allow_postfix_plus_on_gpl: false,
	};

	/// Allow non-conforming syntax for crates-io compatibility
	///
	/// Every option of [`ParseMode`] is enabled, so:
	///
	/// 1. Additional, invalid, identifiers are accepted and mapped to a correct
	/// SPDX license identifier.
	/// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for the
	/// list of additionally accepted identifiers and the license they
	/// correspond to.
	/// 2. `/` can be used as a synonym for `OR`, and doesn't need to be
	/// separated by whitespace from the terms it combines
	/// 3. The lowercase operators `and`, `or`, and `with` are accepted
	/// 4. `allow_postfix_plus_on_gpl` is enabled, see the documentation of
	/// that field
	pub const LAX: Self = Self {
	allow_lower_case_operators: true,
	allow_slash_as_or_operator: true,
	allow_imprecise_license_names: true,
	allow_postfix_plus_on_gpl: true,
	};
	}

	/// A single token in an SPDX license expression
	///
	/// The lifetime `'a` is that of the expression text being lexed; only
	/// [`Token::LicenseRef`] borrows from it.
	#[derive(Clone, Debug, PartialEq, Eq)]
	pub enum Token<'a> {
	/// A recognized SPDX license id
	Spdx(LicenseId),
	/// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
	LicenseRef {
	/// The text following the `DocumentRef-` prefix, if one was present
	doc_ref: Option<&'a str>,
	/// The text following the `LicenseRef-` prefix
	lic_ref: &'a str,
	},
	/// A recognized SPDX exception id
	Exception(ExceptionId),
	/// A postfix `+` indicating "or later" for a particular SPDX license id
	Plus,
	/// A `(` for starting a group
	OpenParen,
	/// A `)` for ending a group
	CloseParen,
	/// A `WITH` operator
	With,
	/// An `AND` operator
	And,
	/// An `OR` operator
	Or,
	}

	impl<'a> std::fmt::Display for Token<'a> {
	    /// Writes the token using its `Debug` representation, forwarding the
	    /// formatter (and therefore any flags set on it) untouched
	    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
	        <Self as std::fmt::Debug>::fmt(self, f)
	    }
	}

	impl<'a> Token<'a> {
	    /// The number of bytes the canonical spelling of this token occupies
	    ///
	    /// Operators report the length of their uppercase keyword, and
	    /// identifiers report the length of their name.
	    fn len(&self) -> usize {
	        match self {
	            Token::Spdx(id) => id.name.len(),
	            Token::Exception(e) => e.name.len(),
	            Token::With => 4,
	            Token::And => 3,
	            Token::Or => 2,
	            Token::Plus | Token::OpenParen | Token::CloseParen => 1,
	            Token::LicenseRef { doc_ref, lic_ref } => {
	                doc_ref.map_or(0, |d| {
	                    // +1 is for the `:`
	                    "DocumentRef-".len() + d.len() + 1
	                }) + "LicenseRef-".len()
	                    + lic_ref.len()
	            }
	        }
	    }
	}

	/// Allows iteration through an SPDX license expression, yielding
	/// a token or a `ParseError`.
	///
	/// Prefer to use `Expression::parse` or `Licensee::parse` rather
	/// than directly using the lexer
	pub struct Lexer<'a> {
	/// The portion of the expression that has not been lexed yet
	inner: &'a str,
	/// The complete expression, copied into any `ParseError` that is produced
	original: &'a str,
	/// Byte offset of `inner` within `original`
	offset: usize,
	/// The options controlling which non-conforming syntax is accepted
	mode: ParseMode,
	}

	impl<'a> Lexer<'a> {
	    /// Creates a Lexer over a license expression, using [`ParseMode::STRICT`]
	    #[must_use]
	    pub fn new(text: &'a str) -> Self {
	        Self::new_mode(text, ParseMode::STRICT)
	    }

	    /// Creates a Lexer over a license expression, lexed according to `mode`
	    ///
	    /// With [`ParseMode::LAX`] it allows non-conforming syntax
	    /// used in crates-io crates.
	    #[must_use]
	    pub fn new_mode(text: &'a str, mode: ParseMode) -> Self {
	        Self {
	            inner: text,
	            original: text,
	            offset: 0,
	            mode,
	        }
	    }

	    /// Returns `true` for the characters allowed in a license or document
	    /// ref - equivalent to the regex class `[-a-zA-Z0-9.]`
	    #[inline]
	    fn is_ref_char(c: &char) -> bool {
	        c.is_ascii_alphanumeric() || *c == '-' || *c == '.'
	    }

	    /// Return a matching text token if found - equivalent to the regex `^[-a-zA-Z0-9.:]+`
	    fn find_text_token(text: &'a str) -> Option<&'a str> {
	        // Byte index of the first character that can't be part of a token
	        let end = text
	            .find(|c: char| !(Self::is_ref_char(&c) || c == ':'))
	            .unwrap_or(text.len());

	        if end > 0 {
	            Some(&text[..end])
	        } else {
	            None
	        }
	    }

	    /// Extract the text after `prefix` if made up of valid ref characters
	    ///
	    /// Returns `None` if `text` doesn't start with `prefix`, or if the prefix
	    /// isn't followed by at least one ref character, as the SPDX grammar
	    /// requires an id string to be non-empty.
	    fn find_ref(prefix: &str, text: &'a str) -> Option<&'a str> {
	        let value = text.strip_prefix(prefix)?;
	        // Byte index of the first character that can't be part of a ref
	        let end = value
	            .find(|c: char| !Self::is_ref_char(&c))
	            .unwrap_or(value.len());

	        if end > 0 {
	            Some(&value[..end])
	        } else {
	            None
	        }
	    }

	    /// Return a license ref if found - equivalent to the regex `^LicenseRef-([-a-zA-Z0-9.]+)`
	    #[inline]
	    fn find_license_ref(text: &'a str) -> Option<&'a str> {
	        Self::find_ref("LicenseRef-", text)
	    }

	    /// Return a document ref and license ref if found,
	    /// equivalent to the regex `^DocumentRef-([-a-zA-Z0-9.]+):LicenseRef-([-a-zA-Z0-9.]+)`
	    fn find_document_and_license_ref(text: &'a str) -> Option<(&'a str, &'a str)> {
	        // The first `:` separates the document ref from the license ref
	        let (doc, lic) = text.split_once(':')?;
	        let doc_ref = Self::find_ref("DocumentRef-", doc)?;
	        let lic_ref = Self::find_license_ref(lic)?;
	        Some((doc_ref, lic_ref))
	    }
	}

	/// A wrapper around a particular token that includes the span of the characters
	/// in the original string, for diagnostic purposes
	#[derive(Debug)]
	pub struct LexerToken<'a> {
	/// The token that was lexed
	pub token: Token<'a>,
	/// The range of the token characters in the original license expression,
	/// expressed as byte offsets
	pub span: std::ops::Range<usize>,
	}

	impl<'a> Iterator for Lexer<'a> {
	type Item = Result<LexerToken<'a>, ParseError>;

	fn next(&mut self) -> Option<Self::Item> {
	#[allow(clippy::unnecessary_wraps)]
	fn ok_token(token: Token<'_>) -> Option<Result<(Token<'_>, usize), ParseError>> {
	let len = token.len();
	Some(Ok((token, len)))
	}

	// Jump over any whitespace, updating `self.inner` and `self.offset` appropriately
	let non_whitespace_index = match self.inner.find(\|c: char\| !c.is_whitespace()) {
	Some(idx) => idx,
	None => self.inner.len(),
	};
	self.inner = &self.inner[non_whitespace_index..];
	self.offset += non_whitespace_index;

	match self.inner.chars().next() {
	None => None,
	// From SPDX 2.1 spec
	// There MUST NOT be whitespace between a license-id and any following "+".
	Some('+') => {
	if non_whitespace_index == 0 {
	ok_token(Token::Plus)
	} else {
	Some(Err(ParseError {
	original: self.original.to_owned(),
	span: self.offset - non_whitespace_index..self.offset,
	reason: Reason::SeparatedPlus,
	}))
	}
	}
	Some('(') => ok_token(Token::OpenParen),
	Some(')') => ok_token(Token::CloseParen),
	Some('/') if self.mode.allow_slash_as_or_operator => Some(Ok((Token::Or, 1))),
	Some(_) => match Lexer::find_text_token(self.inner) {
	None => Some(Err(ParseError {
	original: self.original.to_owned(),
	span: self.offset..self.offset + self.inner.len(),
	reason: Reason::InvalidCharacters,
	})),
	Some(m) => {
	if m == "WITH" {
	ok_token(Token::With)
	} else if m == "AND" {
	ok_token(Token::And)
	} else if m == "OR" {
	ok_token(Token::Or)
	} else if self.mode.allow_lower_case_operators && m == "and" {
	ok_token(Token::And)
	} else if self.mode.allow_lower_case_operators && m == "or" {
	ok_token(Token::Or)
	} else if self.mode.allow_lower_case_operators && m == "with" {
	ok_token(Token::With)
	} else if let Some(lic_id) = crate::license_id(m) {
	ok_token(Token::Spdx(lic_id))
	} else if let Some(exc_id) = crate::exception_id(m) {
	ok_token(Token::Exception(exc_id))
	} else if let Some((doc_ref, lic_ref)) = Lexer::find_document_and_license_ref(m)
	{
	ok_token(Token::LicenseRef {
	doc_ref: Some(doc_ref),
	lic_ref,
	})
	} else if let Some(lic_ref) = Lexer::find_license_ref(m) {
	ok_token(Token::LicenseRef {
	doc_ref: None,
	lic_ref,
	})
	} else if let Some((lic_id, token_len)) =
	if self.mode.allow_imprecise_license_names {
	crate::imprecise_license_id(self.inner)
	} else {
	None
	}
	{
	Some(Ok((Token::Spdx(lic_id), token_len)))
	} else {
	Some(Err(ParseError {
	original: self.original.to_owned(),
	span: self.offset..self.offset + m.len(),
	reason: Reason::UnknownTerm,
	}))
	}
	}
	},
	}
	.map(\|res\| {
	res.map(\|(tok, len)\| {
	let start = self.offset;
	self.inner = &self.inner[len..];
	self.offset += len;

	LexerToken {
	token: tok,
	span: start..self.offset,
	}
	})
	})
	}
	}