# vendor/regex-1.10.6/testdata/bytes.toml - toolchain/rustc - Git at Google

 # These are tests specifically crafted for regexes that can match arbitrary
 # bytes. In some cases, we also test the Unicode variant as well, just because
 # it's good sense to do so. But also, these tests aren't really about Unicode,
 # but whether matches are only reported at valid UTF-8 boundaries. For most
 # tests in this entire collection, utf8 = true. But for these tests, we use
 # utf8 = false.

 # ASCII `\b`: neither byte of "δ" is an ASCII word byte, so there is no word
 # boundary after the space and nothing matches.
 [[test]]
 name = "word-boundary-ascii"
 unicode = false
 utf8 = false
 regex = ' \b'
 haystack = " δ"
 matches = []

 # Unicode `\b`: "δ" is a word character, so the boundary after the space
 # exists and the space itself (0..1) is matched.
 [[test]]
 name = "word-boundary-unicode"
 unicode = true
 utf8 = false
 regex = ' \b'
 haystack = " δ"
 matches = [
   [0, 1],
 ]

 # ASCII `\B`: the mirror image of "word-boundary-ascii" -- no boundary after
 # the space, so the negated assertion succeeds.
 [[test]]
 name = "word-boundary-ascii-not"
 unicode = false
 utf8 = false
 regex = ' \B'
 haystack = " δ"
 matches = [
   [0, 1],
 ]

 # Unicode `\B`: the mirror image of "word-boundary-unicode" -- a boundary
 # exists after the space, so the negated assertion fails.
 [[test]]
 name = "word-boundary-unicode-not"
 unicode = true
 utf8 = false
 regex = ' \B'
 haystack = " δ"
 matches = []

 # ASCII `\w` only accepts the leading "a"; the two bytes of "δ" are excluded.
 [[test]]
 name = "perl-word-ascii"
 unicode = false
 utf8 = false
 regex = '\w+'
 haystack = "aδ"
 matches = [
   [0, 1],
 ]

 # Unicode `\w` also accepts "δ" (two bytes), so the whole haystack matches.
 [[test]]
 name = "perl-word-unicode"
 unicode = true
 utf8 = false
 regex = '\w+'
 haystack = "aδ"
 matches = [
   [0, 3],
 ]

 # ASCII `\d` matches only "1" and "9"; the two Devanagari digits in between
 # (three bytes each) are skipped.
 [[test]]
 name = "perl-decimal-ascii"
 unicode = false
 utf8 = false
 regex = '\d+'
 haystack = "1२३9"
 matches = [
   [0, 1],
   [7, 8],
 ]

 # Unicode `\d` treats all four characters as digits: one match over 8 bytes.
 [[test]]
 name = "perl-decimal-unicode"
 unicode = true
 utf8 = false
 regex = '\d+'
 haystack = "1२३9"
 matches = [
   [0, 8],
 ]

 # ASCII `\s` matches the plain space only; U+1680 (three bytes) is excluded.
 [[test]]
 name = "perl-whitespace-ascii"
 unicode = false
 utf8 = false
 regex = '\s+'
 haystack = " \u1680"
 matches = [
   [0, 1],
 ]

 # Unicode `\s` also accepts U+1680, so the match covers all 4 bytes.
 [[test]]
 name = "perl-whitespace-unicode"
 unicode = true
 utf8 = false
 regex = '\s+'
 haystack = " \u1680"
 matches = [
   [0, 4],
 ]

 # Group 1 is a Unicode-aware `(.+)`: it consumes the two codepoints encoded as
 # \xCE\x93 and \xCE\x94 (bytes 0..4) but cannot consume the 5th byte, \xFF,
 # because that byte is not valid UTF-8. Group 2 comes after `(?-u)`, so its
 # `(.+)` is byte based and picks up the remaining byte (4..5).
 [[test]]
 name = "mixed-dot"
 unescape = true
 unicode = true
 utf8 = false
 regex = '(.+)(?-u)(.+)'
 haystack = '\xCE\x93\xCE\x94\xFF'
 matches = [[[0, 5], [0, 4], [4, 5]]]

 # Case-insensitive `a` matches "A" with Unicode mode disabled...
 [[test]]
 name = "case-one-ascii"
 case-insensitive = true
 unicode = false
 utf8 = false
 regex = 'a'
 haystack = "A"
 matches = [
   [0, 1],
 ]

 # ...and gives the same result with Unicode mode enabled.
 [[test]]
 name = "case-one-unicode"
 case-insensitive = true
 unicode = true
 utf8 = false
 regex = 'a'
 haystack = "A"
 matches = [
   [0, 1],
 ]

 # Pure ASCII haystack: case-insensitive `[a-z]+` covers every letter.
 [[test]]
 name = "case-class-simple-ascii"
 case-insensitive = true
 unicode = false
 utf8 = false
 regex = '[a-z]+'
 haystack = "AaAaA"
 matches = [
   [0, 5],
 ]

 # With Unicode disabled, U+212A (three bytes at 2..5) is not matched by the
 # case-insensitive class, which splits the haystack into two matches.
 [[test]]
 name = "case-class-ascii"
 case-insensitive = true
 unicode = false
 utf8 = false
 regex = '[a-z]+'
 haystack = "aA\u212AaA"
 matches = [
   [0, 2],
   [5, 7],
 ]

 # With Unicode enabled, U+212A is matched by the case-insensitive class, so
 # the whole 7-byte haystack is a single match.
 [[test]]
 name = "case-class-unicode"
 case-insensitive = true
 unicode = true
 utf8 = false
 regex = '[a-z]+'
 haystack = "aA\u212AaA"
 matches = [
   [0, 7],
 ]

 # Byte mode: `[^a]` matches one byte at a time, so the two bytes of "δ" are
 # reported as two separate matches.
 [[test]]
 name = "negate-ascii"
 unicode = false
 utf8 = false
 regex = '[^a]'
 haystack = "δ"
 matches = [
   [0, 1],
   [1, 2],
 ]

 # Unicode mode: `[^a]` matches a whole codepoint, so "δ" is a single match.
 [[test]]
 name = "negate-unicode"
 unicode = true
 utf8 = false
 regex = '[^a]'
 haystack = "δ"
 matches = [
   [0, 2],
 ]

 # With utf8 = true these would not match: the implicit '.*?' prefix is Unicode
 # aware and refuses to match through invalid UTF-8 bytes such as the leading
 # \xFF. With utf8 = false the "a" at 1..2 is found.
 [[test]]
 name = "dotstar-prefix-ascii"
 unescape = true
 unicode = false
 utf8 = false
 regex = 'a'
 haystack = '\xFFa'
 matches = [
   [1, 2],
 ]

 # Same haystack and expectation as above, with Unicode mode enabled.
 [[test]]
 name = "dotstar-prefix-unicode"
 unescape = true
 unicode = true
 utf8 = false
 regex = 'a'
 haystack = '\xFFa'
 matches = [
   [1, 2],
 ]

 # The named group `cstr` captures the run of non-NUL bytes "foo" (0..3); the
 # trailing NUL completes the overall match (0..4).
 [[test]]
 name = "null-bytes"
 unescape = true
 unicode = false
 utf8 = false
 regex = '(?P<cstr>[^\x00]+)\x00'
 haystack = 'foo\x00'
 matches = [[[0, 4], [0, 3]]]

 # The three tests below share one haystack that is not valid UTF-8 and is
 # 22 bytes long once unescaped. Each expects a single empty match at an anchor.

 # The optional \xCC matches nothing, then `^` holds at offset 0.
 [[test]]
 name = "invalid-utf8-anchor-100"
 unescape = true
 unicode = false
 utf8 = false
 regex = '\xCC?^'
 haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
 matches = [
   [0, 0],
 ]

 # Three-way alternation. The haystack neither starts with \xf7 nor contains
 # "4", so only the final `$` branch matches: empty, at the end (offset 22).
 [[test]]
 name = "invalid-utf8-anchor-200"
 unescape = true
 unicode = false
 utf8 = false
 regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
 haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
 matches = [
   [22, 22],
 ]

 # Two-way alternation. The bare `^` branch matches empty at offset 0; the
 # second branch never matches because "ddp" does not occur in the haystack.
 [[test]]
 name = "invalid-utf8-anchor-300"
 unescape = true
 unicode = false
 utf8 = false
 regex = '^|ddp\xff\xffdddddlQd@\x80'
 haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
 matches = [
   [0, 0],
 ]

 # "x" sits between non-ASCII bytes, which are not ASCII word bytes. Both sides
 # of "x" are therefore word boundaries and `\Bx\B` cannot match.
 [[test]]
 name = "word-boundary-ascii-100"
 unicode = false
 utf8 = false
 regex = '\Bx\B'
 haystack = "áxβ"
 matches = []

 # U+7EF5E encodes to four bytes, none of them ASCII word bytes. `\B` holds at
 # offsets 2 through 5 (between and after those bytes), but not at 0 or 1,
 # which are adjacent to the word byte "0".
 [[test]]
 name = "word-boundary-ascii-200"
 unicode = false
 utf8 = false
 regex = '\B'
 haystack = "0\U0007EF5E"
 matches = [
   [2, 2],
   [3, 3],
   [4, 4],
   [5, 5],
 ]
	# These are tests specifically crafted for regexes that can match arbitrary
	# bytes. In some cases, we also test the Unicode variant as well, just because
	# it's good sense to do so. But also, these tests aren't really about Unicode,
	# but whether matches are only reported at valid UTF-8 boundaries. For most
	# tests in this entire collection, utf8 = true. But for these tests, we use
	# utf8 = false.

	# ASCII `\b`: neither byte of "δ" is an ASCII word byte, so there is no word
	# boundary after the space and nothing matches.
	[[test]]
	name = "word-boundary-ascii"
	unicode = false
	utf8 = false
	regex = ' \b'
	haystack = " δ"
	matches = []

	# Unicode `\b`: "δ" is a word character, so the boundary after the space
	# exists and the space itself (0..1) is matched.
	[[test]]
	name = "word-boundary-unicode"
	unicode = true
	utf8 = false
	regex = ' \b'
	haystack = " δ"
	matches = [
		[0, 1],
	]

	# ASCII `\B`: the mirror image of "word-boundary-ascii" -- no boundary after
	# the space, so the negated assertion succeeds.
	[[test]]
	name = "word-boundary-ascii-not"
	unicode = false
	utf8 = false
	regex = ' \B'
	haystack = " δ"
	matches = [
		[0, 1],
	]

	# Unicode `\B`: the mirror image of "word-boundary-unicode" -- a boundary
	# exists after the space, so the negated assertion fails.
	[[test]]
	name = "word-boundary-unicode-not"
	unicode = true
	utf8 = false
	regex = ' \B'
	haystack = " δ"
	matches = []

	# ASCII `\w` only accepts the leading "a"; the two bytes of "δ" are excluded.
	[[test]]
	name = "perl-word-ascii"
	unicode = false
	utf8 = false
	regex = '\w+'
	haystack = "aδ"
	matches = [
		[0, 1],
	]

	# Unicode `\w` also accepts "δ" (two bytes), so the whole haystack matches.
	[[test]]
	name = "perl-word-unicode"
	unicode = true
	utf8 = false
	regex = '\w+'
	haystack = "aδ"
	matches = [
		[0, 3],
	]

	# ASCII `\d` matches only "1" and "9"; the two Devanagari digits in between
	# (three bytes each) are skipped.
	[[test]]
	name = "perl-decimal-ascii"
	unicode = false
	utf8 = false
	regex = '\d+'
	haystack = "1२३9"
	matches = [
		[0, 1],
		[7, 8],
	]

	# Unicode `\d` treats all four characters as digits: one match over 8 bytes.
	[[test]]
	name = "perl-decimal-unicode"
	unicode = true
	utf8 = false
	regex = '\d+'
	haystack = "1२३9"
	matches = [
		[0, 8],
	]

	# ASCII `\s` matches the plain space only; U+1680 (three bytes) is excluded.
	[[test]]
	name = "perl-whitespace-ascii"
	unicode = false
	utf8 = false
	regex = '\s+'
	haystack = " \u1680"
	matches = [
		[0, 1],
	]

	# Unicode `\s` also accepts U+1680, so the match covers all 4 bytes.
	[[test]]
	name = "perl-whitespace-unicode"
	unicode = true
	utf8 = false
	regex = '\s+'
	haystack = " \u1680"
	matches = [
		[0, 4],
	]

	# Group 1 is a Unicode-aware `(.+)`: it consumes the two codepoints encoded as
	# \xCE\x93 and \xCE\x94 (bytes 0..4) but cannot consume the 5th byte, \xFF,
	# because that byte is not valid UTF-8. Group 2 comes after `(?-u)`, so its
	# `(.+)` is byte based and picks up the remaining byte (4..5).
	[[test]]
	name = "mixed-dot"
	unescape = true
	unicode = true
	utf8 = false
	regex = '(.+)(?-u)(.+)'
	haystack = '\xCE\x93\xCE\x94\xFF'
	matches = [[[0, 5], [0, 4], [4, 5]]]

	# Case-insensitive `a` matches "A" with Unicode mode disabled...
	[[test]]
	name = "case-one-ascii"
	case-insensitive = true
	unicode = false
	utf8 = false
	regex = 'a'
	haystack = "A"
	matches = [
		[0, 1],
	]

	# ...and gives the same result with Unicode mode enabled.
	[[test]]
	name = "case-one-unicode"
	case-insensitive = true
	unicode = true
	utf8 = false
	regex = 'a'
	haystack = "A"
	matches = [
		[0, 1],
	]

	# Pure ASCII haystack: case-insensitive `[a-z]+` covers every letter.
	[[test]]
	name = "case-class-simple-ascii"
	case-insensitive = true
	unicode = false
	utf8 = false
	regex = '[a-z]+'
	haystack = "AaAaA"
	matches = [
		[0, 5],
	]

	# With Unicode disabled, U+212A (three bytes at 2..5) is not matched by the
	# case-insensitive class, which splits the haystack into two matches.
	[[test]]
	name = "case-class-ascii"
	case-insensitive = true
	unicode = false
	utf8 = false
	regex = '[a-z]+'
	haystack = "aA\u212AaA"
	matches = [
		[0, 2],
		[5, 7],
	]

	# With Unicode enabled, U+212A is matched by the case-insensitive class, so
	# the whole 7-byte haystack is a single match.
	[[test]]
	name = "case-class-unicode"
	case-insensitive = true
	unicode = true
	utf8 = false
	regex = '[a-z]+'
	haystack = "aA\u212AaA"
	matches = [
		[0, 7],
	]

	# Byte mode: `[^a]` matches one byte at a time, so the two bytes of "δ" are
	# reported as two separate matches.
	[[test]]
	name = "negate-ascii"
	unicode = false
	utf8 = false
	regex = '[^a]'
	haystack = "δ"
	matches = [
		[0, 1],
		[1, 2],
	]

	# Unicode mode: `[^a]` matches a whole codepoint, so "δ" is a single match.
	[[test]]
	name = "negate-unicode"
	unicode = true
	utf8 = false
	regex = '[^a]'
	haystack = "δ"
	matches = [
		[0, 2],
	]

	# With utf8 = true these would not match: the implicit '.*?' prefix is Unicode
	# aware and refuses to match through invalid UTF-8 bytes such as the leading
	# \xFF. With utf8 = false the "a" at 1..2 is found.
	[[test]]
	name = "dotstar-prefix-ascii"
	unescape = true
	unicode = false
	utf8 = false
	regex = 'a'
	haystack = '\xFFa'
	matches = [
		[1, 2],
	]

	# Same haystack and expectation as above, with Unicode mode enabled.
	[[test]]
	name = "dotstar-prefix-unicode"
	unescape = true
	unicode = true
	utf8 = false
	regex = 'a'
	haystack = '\xFFa'
	matches = [
		[1, 2],
	]

	# The named group `cstr` captures the run of non-NUL bytes "foo" (0..3); the
	# trailing NUL completes the overall match (0..4).
	[[test]]
	name = "null-bytes"
	unescape = true
	unicode = false
	utf8 = false
	regex = '(?P<cstr>[^\x00]+)\x00'
	haystack = 'foo\x00'
	matches = [[[0, 4], [0, 3]]]

	# The haystack is not valid UTF-8. The optional \xCC matches nothing, then
	# `^` holds at offset 0, giving a single empty match.
	[[test]]
	name = "invalid-utf8-anchor-100"
	unescape = true
	unicode = false
	utf8 = false
	regex = '\xCC?^'
	haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
	matches = [
		[0, 0],
	]

	# Three-way alternation over a haystack that is not valid UTF-8 (22 bytes
	# once unescaped). The haystack neither starts with \xf7 nor contains "4",
	# so only the final `$` branch matches: empty, at the end (offset 22).
	#
	# The `|` separators must be unescaped: written as `\|` they become literal
	# pipe characters, the pattern collapses into a single branch anchored on
	# `^\xf7`, and the expected match below can never be produced.
	[[test]]
	name = "invalid-utf8-anchor-200"
	regex = '^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$'
	haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
	matches = [[22, 22]]
	unescape = true
	unicode = false
	utf8 = false

	# Two-way alternation over a haystack that is not valid UTF-8. The bare `^`
	# branch matches empty at offset 0; the second branch never matches because
	# "ddp" does not occur in the haystack.
	#
	# The `|` separator must be unescaped: written as `\|` the pattern demands a
	# literal pipe at the start of the haystack (which begins with \x8d), so the
	# expected match below can never be produced.
	[[test]]
	name = "invalid-utf8-anchor-300"
	regex = '^|ddp\xff\xffdddddlQd@\x80'
	haystack = '\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4'
	matches = [[0, 0]]
	unescape = true
	unicode = false
	utf8 = false

	# "x" sits between non-ASCII bytes, which are not ASCII word bytes. Both sides
	# of "x" are therefore word boundaries and `\Bx\B` cannot match.
	[[test]]
	name = "word-boundary-ascii-100"
	unicode = false
	utf8 = false
	regex = '\Bx\B'
	haystack = "áxβ"
	matches = []

	# U+7EF5E encodes to four bytes, none of them ASCII word bytes. `\B` holds at
	# offsets 2 through 5 (between and after those bytes), but not at 0 or 1,
	# which are adjacent to the word byte "0".
	[[test]]
	name = "word-boundary-ascii-200"
	unicode = false
	utf8 = false
	regex = '\B'
	haystack = "0\U0007EF5E"
	matches = [
		[2, 2],
		[3, 3],
		[4, 4],
		[5, 5],
	]