# Source: vendor/regex-1.10.6/testdata/regex-lite.toml - toolchain/rustc - Git at Google

 # These tests are specifically written to test the regex-lite crate. While it
 # largely has the same semantics as the regex crate, there are some differences
 # around Unicode support and UTF-8.
 #
 # To be clear, regex-lite supports far fewer patterns because of its lack of
 # Unicode support, nested character classes and character class set operations.
 # What we're talking about here are the patterns that both crates support but
 # where the semantics might differ.

 # regex-lite uses ASCII definitions for Perl character classes, so `\d` is
 # expected to match ASCII digits only. The haystack is a single non-ASCII
 # character (it appears to be U+1815 MONGOLIAN DIGIT FIVE, a Unicode decimal
 # digit -- confirm), and the expected match list is empty even though
 # `unicode = true` is set.
 [[test]]
 name = "perl-class-decimal"
 regex = '\d'
 haystack = '᠕'
 matches = []
 unicode = true

 # regex-lite uses ASCII definitions for Perl character classes, so `\s` is
 # expected to match ASCII whitespace only. The haystack is written as a basic
 # (double-quoted) string so that TOML expands the `\u2000` escape into the
 # single codepoint U+2000, which is not ASCII; hence no match is expected
 # even though `unicode = true` is set.
 [[test]]
 name = "perl-class-space"
 regex = '\s'
 haystack = "\u2000"
 matches = []
 unicode = true

 # regex-lite uses ASCII definitions for Perl character classes, so `\w` is
 # expected to match ASCII word characters only. The haystack is the single
 # non-ASCII letter 'δ', so the expected match list is empty even though
 # `unicode = true` is set.
 [[test]]
 name = "perl-class-word"
 regex = '\w'
 haystack = 'δ'
 matches = []
 unicode = true

 # regex-lite uses the ASCII definition of word for word boundary assertions.
 # The haystack 'δ' contains no ASCII word character, so there is no position
 # where `\b` can hold and the expected match list is empty.
 [[test]]
 name = "word-boundary"
 regex = '\b'
 haystack = 'δ'
 matches = []
 unicode = true

 # regex-lite uses the ASCII definition of word for negated word boundary
 # assertions. But note that it should still not split codepoints!
 #
 # 'δ' is not an ASCII word character, so `\B` holds on both sides of it. The
 # expected empty matches are at offsets 0 and 2 only: 'δ' occupies two bytes
 # in UTF-8, and offset 1 would fall inside that encoding.
 [[test]]
 name = "word-boundary-negated"
 regex = '\B'
 haystack = 'δ'
 matches = [[0, 0], [2, 2]]
 unicode = true

 # While we're here, the empty regex---which matches at every
 # position---shouldn't split a codepoint either.
 #
 # The haystack is one codepoint that occupies four bytes in UTF-8, so the
 # only expected empty matches are at offsets 0 and 4; offsets 1 through 3
 # fall inside the codepoint's encoding.
 [[test]]
 name = "empty-no-split-codepoint"
 regex = ''
 haystack = '💩'
 matches = [[0, 0], [4, 4]]
 unicode = true

 # A dot always matches a full codepoint. Here `unicode = false` is set, yet
 # the single expected match still spans the whole four-byte UTF-8 encoding of
 # the haystack (offsets 0 to 4) rather than one byte.
 [[test]]
 name = "dot-always-matches-codepoint"
 regex = '.'
 haystack = '💩'
 matches = [[0, 4]]
 unicode = false

 # A negated character class also always matches a full codepoint. With
 # `unicode = false` set, `[^a]` is still expected to consume the entire
 # four-byte UTF-8 encoding of the haystack (offsets 0 to 4) as one match.
 [[test]]
 name = "negated-class-always-matches-codepoint"
 regex = '[^a]'
 haystack = '💩'
 matches = [[0, 4]]
 unicode = false

 # regex-lite only supports ASCII-aware case insensitive matching. The
 # haystack 'ſ' is a non-ASCII character (it appears to be U+017F LATIN SMALL
 # LETTER LONG S, which Unicode case folding equates with 's' -- confirm), so
 # with `case-insensitive = true` the ASCII pattern 's' is still expected not
 # to match it.
 [[test]]
 name = "case-insensitive-is-ascii-only"
 regex = 's'
 haystack = 'ſ'
 matches = []
 unicode = true
 case-insensitive = true

 # Negated word boundaries shouldn't split a codepoint, but they will match
 # between invalid UTF-8.
 #
 # This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
 # regex-lite. This can't happen in the main API because &str can't contain
 # invalid UTF-8.
 # [[test]]
 # name = "word-boundary-invalid-utf8"
 # regex = '\B'
 # haystack = '\xFF\xFF\xFF\xFF'
 # unescape = true
 # matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
 # unicode = true
 # utf8 = false
	# These tests are specifically written to test the regex-lite crate. While it
	# largely has the same semantics as the regex crate, there are some differences
	# around Unicode support and UTF-8.
	#
	# To be clear, regex-lite supports far fewer patterns because of its lack of
	# Unicode support, nested character classes and character class set operations.
	# What we're talking about here are the patterns that both crates support but
	# where the semantics might differ.

	# regex-lite uses ASCII definitions for Perl character classes, so `\d` is
	# expected to match ASCII digits only. The haystack is a single non-ASCII
	# character (it appears to be U+1815 MONGOLIAN DIGIT FIVE -- confirm), and
	# the expected match list is empty even though `unicode = true` is set.
	#
	# NOTE(review): a test named "perl-class-decimal" is also defined earlier in
	# this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "perl-class-decimal"
	regex = '\d'
	haystack = '᠕'
	matches = []
	unicode = true

	# regex-lite uses ASCII definitions for Perl character classes, so `\s` is
	# expected to match ASCII whitespace only. The basic (double-quoted) string
	# lets TOML expand `\u2000` into the single non-ASCII codepoint U+2000, so
	# no match is expected even though `unicode = true` is set.
	#
	# NOTE(review): a test named "perl-class-space" is also defined earlier in
	# this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "perl-class-space"
	regex = '\s'
	haystack = "\u2000"
	matches = []
	unicode = true

	# regex-lite uses ASCII definitions for Perl character classes, so `\w` is
	# expected to match ASCII word characters only. The haystack is the single
	# non-ASCII letter 'δ', so the expected match list is empty even though
	# `unicode = true` is set.
	#
	# NOTE(review): a test named "perl-class-word" is also defined earlier in
	# this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "perl-class-word"
	regex = '\w'
	haystack = 'δ'
	matches = []
	unicode = true

	# regex-lite uses the ASCII definition of word for word boundary assertions.
	# The haystack 'δ' contains no ASCII word character, so there is no position
	# where `\b` can hold and the expected match list is empty.
	#
	# NOTE(review): a test named "word-boundary" is also defined earlier in this
	# file -- verify whether this repeated definition is intended.
	[[test]]
	name = "word-boundary"
	regex = '\b'
	haystack = 'δ'
	matches = []
	unicode = true

	# regex-lite uses the ASCII definition of word for negated word boundary
	# assertions. But note that it should still not split codepoints!
	#
	# 'δ' is not an ASCII word character, so `\B` holds on both sides of it. The
	# expected empty matches are at offsets 0 and 2 only: 'δ' occupies two bytes
	# in UTF-8, and offset 1 would fall inside that encoding.
	#
	# NOTE(review): a test named "word-boundary-negated" is also defined earlier
	# in this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "word-boundary-negated"
	regex = '\B'
	haystack = 'δ'
	matches = [[0, 0], [2, 2]]
	unicode = true

	# While we're here, the empty regex---which matches at every
	# position---shouldn't split a codepoint either.
	#
	# The haystack is one codepoint that occupies four bytes in UTF-8, so the
	# only expected empty matches are at offsets 0 and 4; offsets 1 through 3
	# fall inside the codepoint's encoding.
	#
	# NOTE(review): a test named "empty-no-split-codepoint" is also defined
	# earlier in this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "empty-no-split-codepoint"
	regex = ''
	haystack = '💩'
	matches = [[0, 0], [4, 4]]
	unicode = true

	# A dot always matches a full codepoint. Here `unicode = false` is set, yet
	# the single expected match still spans the whole four-byte UTF-8 encoding of
	# the haystack (offsets 0 to 4) rather than one byte.
	#
	# NOTE(review): a test named "dot-always-matches-codepoint" is also defined
	# earlier in this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "dot-always-matches-codepoint"
	regex = '.'
	haystack = '💩'
	matches = [[0, 4]]
	unicode = false

	# A negated character class also always matches a full codepoint. With
	# `unicode = false` set, `[^a]` is still expected to consume the entire
	# four-byte UTF-8 encoding of the haystack (offsets 0 to 4) as one match.
	#
	# NOTE(review): a test named "negated-class-always-matches-codepoint" is also
	# defined earlier in this file -- verify whether this repeated definition is
	# intended.
	[[test]]
	name = "negated-class-always-matches-codepoint"
	regex = '[^a]'
	haystack = '💩'
	matches = [[0, 4]]
	unicode = false

	# regex-lite only supports ASCII-aware case insensitive matching. The
	# haystack 'ſ' is a non-ASCII character (it appears to be U+017F LATIN SMALL
	# LETTER LONG S, which Unicode case folding equates with 's' -- confirm), so
	# with `case-insensitive = true` the ASCII pattern 's' is still expected not
	# to match it.
	#
	# NOTE(review): a test named "case-insensitive-is-ascii-only" is also defined
	# earlier in this file -- verify whether this repeated definition is intended.
	[[test]]
	name = "case-insensitive-is-ascii-only"
	regex = 's'
	haystack = 'ſ'
	matches = []
	unicode = true
	case-insensitive = true

	# Negated word boundaries shouldn't split a codepoint, but they will match
	# between invalid UTF-8.
	#
	# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
	# regex-lite. This can't happen in the main API because &str can't contain
	# invalid UTF-8.
	# [[test]]
	# name = "word-boundary-invalid-utf8"
	# regex = '\B'
	# haystack = '\xFF\xFF\xFF\xFF'
	# unescape = true
	# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
	# unicode = true
	# utf8 = false