| [[test]] |
| name = "invalid-utf8-literal1" |
| regex = '\xFF' |
| haystack = '\xFF' |
| matches = [[0, 1]] |
| unicode = false |
| utf8 = false |
| unescape = true |
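
# A minimal sketch of what this test expects, assuming the regex crate's
# regex::bytes::Regex (the harness may drive other engines too). The inline
# (?-u) flag plays the role of 'unicode = false' above:
#
#     use regex::bytes::Regex;
#
#     // With Unicode mode disabled, \xFF denotes the raw byte 0xFF rather
#     // than the codepoint U+00FF.
#     let re = Regex::new(r"(?-u)\xFF").unwrap();
#     let m = re.find(b"\xFF").unwrap();
#     assert_eq!((m.start(), m.end()), (0, 1));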
| |
| |
| [[test]] |
| name = "mixed" |
| regex = '(?:.+)(?-u)(?:.+)' |
| haystack = '\xCE\x93\xCE\x94\xFF' |
| matches = [[0, 5]] |
| utf8 = false |
| unescape = true |
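
# A sketch of the mixed-mode expectation, assuming regex::bytes::Regex: the
# Unicode-mode '.+' consumes the valid UTF-8 prefix (Γ and Δ, bytes 0..4),
# and the byte-mode '(?-u:.+)' consumes the trailing \xFF byte:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"(?:.+)(?-u)(?:.+)").unwrap();
#     let m = re.find(b"\xCE\x93\xCE\x94\xFF").unwrap();
#     assert_eq!((m.start(), m.end()), (0, 5));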
| |
| |
| [[test]] |
| name = "case1" |
| regex = "a" |
| haystack = "A" |
| matches = [[0, 1]] |
| case-insensitive = true |
| unicode = false |
| |
| [[test]] |
| name = "case2" |
| regex = "[a-z]+" |
| haystack = "AaAaA" |
| matches = [[0, 5]] |
| case-insensitive = true |
| unicode = false |
| |
| [[test]] |
| name = "case3" |
| regex = "[a-z]+" |
| haystack = "aA\u212AaA" |
| matches = [[0, 7]] |
| case-insensitive = true |
| |
| [[test]] |
| name = "case4" |
| regex = "[a-z]+" |
| haystack = "aA\u212AaA" |
| matches = [[0, 2], [5, 7]] |
| case-insensitive = true |
| unicode = false |
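
# case3 and case4 hinge on simple case folding: with Unicode enabled, U+212A
# (KELVIN SIGN, 3 bytes in UTF-8) folds to 'k' and joins the match, while
# ASCII-only folding splits the match around it. A minimal sketch, assuming
# the regex crate's top-level API:
#
#     use regex::Regex;
#
#     let hay = "aA\u{212A}aA";
#
#     // Unicode case folding: one match covering all 7 bytes.
#     let re = Regex::new(r"(?i)[a-z]+").unwrap();
#     let ms: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..7]);
#
#     // ASCII-only case folding: U+212A no longer folds to 'k'.
#     let re = Regex::new(r"(?i-u)[a-z]+").unwrap();
#     let ms: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..2, 5..7]);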
| |
| |
| [[test]] |
| name = "negate1" |
| regex = "[^a]" |
| haystack = "δ" |
| matches = [[0, 2]] |
| |
| [[test]] |
| name = "negate2" |
| regex = "[^a]" |
| haystack = "δ" |
| matches = [[0, 1], [1, 2]] |
| unicode = false |
| utf8 = false |
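
# A sketch of the negate pair, assuming regex::bytes::Regex: a Unicode-mode
# negated class matches the whole codepoint δ (2 bytes), while the
# byte-oriented version matches each of its bytes separately:
#
#     use regex::bytes::Regex;
#
#     // Unicode mode: [^a] is "any codepoint except 'a'".
#     let re = Regex::new(r"[^a]").unwrap();
#     let ms: Vec<_> =
#         re.find_iter("δ".as_bytes()).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..2]);
#
#     // Byte mode: [^a] is "any byte except 'a'".
#     let re = Regex::new(r"(?-u)[^a]").unwrap();
#     let ms: Vec<_> =
#         re.find_iter("δ".as_bytes()).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..1, 1..2]);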
| |
| |
| [[test]] |
| name = "dotstar-prefix1" |
| regex = "a" |
| haystack = '\xFFa' |
| matches = [[1, 2]] |
| unicode = false |
| utf8 = false |
| unescape = true |
| |
| [[test]] |
| name = "dotstar-prefix2" |
| regex = "a" |
| haystack = '\xFFa' |
| matches = [[1, 2]] |
| utf8 = false |
| unescape = true |
| |
| |
| [[test]] |
| name = "null-bytes1" |
| regex = '[^\x00]+\x00' |
| haystack = 'foo\x00' |
| matches = [[0, 4]] |
| unicode = false |
| utf8 = false |
| unescape = true |
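
# This mirrors a common use case: scanning NUL-terminated strings in binary
# data. A minimal sketch, assuming regex::bytes::Regex:
#
#     use regex::bytes::Regex;
#
#     // Match a run of non-NUL bytes followed by the NUL terminator.
#     let re = Regex::new(r"(?-u)[^\x00]+\x00").unwrap();
#     let m = re.find(b"foo\x00").unwrap();
#     assert_eq!(m.range(), 0..4);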
| |
| |
| [[test]] |
| name = "word-ascii" |
| regex = '\w+' |
| haystack = "aδ" |
| matches = [[0, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "word-unicode" |
| regex = '\w+' |
| haystack = "aδ" |
| matches = [[0, 3]] |
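
# A sketch of the \w pair above, assuming the regex crate (the \d and \s
# pairs below follow the same Unicode-vs-ASCII pattern):
#
#     use regex::Regex;
#
#     // ASCII \w stops before δ; Unicode \w includes it (δ is 2 bytes).
#     let ascii = Regex::new(r"(?-u)\w+").unwrap();
#     assert_eq!(ascii.find("aδ").map(|m| m.range()), Some(0..1));
#
#     let unicode = Regex::new(r"\w+").unwrap();
#     assert_eq!(unicode.find("aδ").map(|m| m.range()), Some(0..3));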
| |
| [[test]] |
| name = "decimal-ascii" |
| regex = '\d+' |
| haystack = "1२३9" |
| matches = [[0, 1], [7, 8]] |
| unicode = false |
| |
| [[test]] |
| name = "decimal-unicode" |
| regex = '\d+' |
| haystack = "1२३9" |
| matches = [[0, 8]] |
| |
| [[test]] |
| name = "space-ascii" |
| regex = '\s+' |
| haystack = " \u1680" |
| matches = [[0, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "space-unicode" |
| regex = '\s+' |
| haystack = " \u1680" |
| matches = [[0, 4]] |
| |
| |
| [[test]] |
| # See: https://github.com/rust-lang/regex/issues/484 |
| name = "iter1-bytes" |
| regex = '' |
| haystack = "☃" |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3]] |
| utf8 = false |
| |
| [[test]] |
| # See: https://github.com/rust-lang/regex/issues/484 |
| name = "iter1-utf8" |
| regex = '' |
| haystack = "☃" |
| matches = [[0, 0], [3, 3]] |
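
# A sketch of the iter1 pair, assuming the regex crate: with UTF-8 match
# semantics, empty matches may not split the encoded codepoint ☃ (3 bytes),
# while the byte-oriented iterator reports every byte offset:
#
#     use regex::Regex;
#     use regex::bytes::Regex as BytesRegex;
#
#     let re = Regex::new("").unwrap();
#     let ms: Vec<_> = re.find_iter("☃").map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..0, 3..3]);
#
#     let re = BytesRegex::new("").unwrap();
#     let ms: Vec<_> =
#         re.find_iter("☃".as_bytes()).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..0, 1..1, 2..2, 3..3]);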
| |
| [[test]] |
| # See: https://github.com/rust-lang/regex/issues/484 |
| # Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8. |
| name = "iter2-bytes" |
| regex = '' |
| haystack = 'b\xFFr' |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3]] |
| unescape = true |
| utf8 = false |
| |
| |
# These tests check that unanchored prefixes can munch through invalid UTF-8
# even when utf8 is enabled.
| # |
| # This test actually reflects an interesting simplification in how the Thompson |
| # NFA is constructed. It used to be that the NFA could be built with an |
| # unanchored prefix that either matched any byte or _only_ matched valid UTF-8. |
# But the latter turns out to be precarious when it comes to prefilters: if
# you search a haystack that contains invalid UTF-8 with an unanchored prefix
# that requires valid UTF-8, then prefilters are no longer a valid
# optimization, because you actually have to check that everything is valid
# UTF-8.
| # |
| # Originally, I had thought that we needed a valid UTF-8 unanchored prefix in |
| # order to guarantee that we only match at valid UTF-8 boundaries. But this |
| # isn't actually true! There are really only two things to consider here: |
| # |
# 1) Will a regex match split an encoded codepoint? No, because by
# construction, we ensure that a MATCH state can only be reached by following
# valid UTF-8 (assuming all of the UTF-8 modes are enabled).
| # |
| # 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no, |
| # assuming all of the UTF-8 modes are enabled. |
| [[test]] |
| name = "unanchored-invalid-utf8-match-100" |
| regex = '[a-z]' |
| haystack = '\xFFa\xFF' |
| matches = [[1, 2]] |
| unescape = true |
| utf8 = false |
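
# A minimal sketch of this expectation, assuming regex::bytes::Regex: the
# implicit unanchored prefix skips over the invalid \xFF bytes, while the
# reported match itself spans only valid UTF-8:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"[a-z]").unwrap();
#     let m = re.find(b"\xFFa\xFF").unwrap();
#     assert_eq!(m.range(), 1..2);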
| |
# This test shows that we can still prevent a match from occurring by
# inserting our own unanchored prefix that only matches valid UTF-8. Thus, if
# the behavior of never munching through invalid UTF-8 is needed, it can be
# achieved this way.
| [[test]] |
| name = "unanchored-invalid-utf8-nomatch" |
| regex = '^(?s:.)*?[a-z]' |
| haystack = '\xFFa\xFF' |
| matches = [] |
| unescape = true |
| utf8 = false |
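
# A sketch of the opposite behavior, assuming regex::bytes::Regex: the
# hand-written prefix '^(?s:.)*?' only matches valid UTF-8, so the \xFF
# bytes block any match:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"^(?s:.)*?[a-z]").unwrap();
#     assert!(re.find(b"\xFFa\xFF").is_none());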
| |
| # This is a tricky test that makes sure we don't accidentally do a kind of |
| # unanchored search when we've requested that a regex engine not report |
| # empty matches that split a codepoint. This test caught a regression during |
| # development where the code for skipping over bad empty matches would do so |
| # even if the search should have been anchored. This is ultimately what led to |
| # making 'anchored' an 'Input' option, so that it was always clear what kind |
| # of search was being performed. (Before that, whether a search was anchored |
| # or not was a config knob on the regex engine.) This did wind up making DFAs |
| # a little more complex to configure (with their 'StartKind' knob), but it |
| # generally smoothed out everything else. |
| # |
| # Great example of a test whose failure motivated a sweeping API refactoring. |
| [[test]] |
| name = "anchored-iter-empty-utf8" |
| regex = '' |
| haystack = 'a☃z' |
| matches = [[0, 0], [1, 1]] |
| unescape = false |
| utf8 = true |
| anchored = true |
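
# One plausible way to express this, assuming regex-automata's meta::Regex
# (which exposes 'anchored' as an option on its Input, per the note above):
#
#     use regex_automata::{meta::Regex, Anchored, Input};
#
#     let re = Regex::new("").unwrap();
#     let input = Input::new("a☃z").anchored(Anchored::Yes);
#     let ms: Vec<_> = re.find_iter(input).map(|m| m.range()).collect();
#     // After the empty match at 1, the only anchored candidate (offset 2)
#     // would split the ☃ codepoint, so iteration stops.
#     assert_eq!(ms, vec![0..0, 1..1]);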