| [[test]] |
| name = "invalid-utf8-literal1" |
| regex = '\xFF' |
| haystack = '\xFF' |
| matches = [[0, 1]] |
| unicode = false |
| utf8 = false |
| unescape = true |
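
# A minimal sketch of what this test expects, assuming the regex crate's
# regex::bytes::Regex (the harness may drive other engines too). The inline
# (?-u) flag plays the role of 'unicode = false' above:
#
#     use regex::bytes::Regex;
#
#     // With Unicode mode disabled, \xFF denotes the raw byte 0xFF rather
#     // than the codepoint U+00FF.
#     let re = Regex::new(r"(?-u)\xFF").unwrap();
#     let m = re.find(b"\xFF").unwrap();
#     assert_eq!((m.start(), m.end()), (0, 1));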
| |
| |
| [[test]] |
| name = "mixed" |
| regex = '(?:.+)(?-u)(?:.+)' |
| haystack = '\xCE\x93\xCE\x94\xFF' |
| matches = [[0, 5]] |
| utf8 = false |
| unescape = true |
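
# A sketch of the mixed-mode expectation, assuming regex::bytes::Regex: the
# Unicode-mode '.+' consumes the valid UTF-8 prefix (Γ and Δ, bytes 0..4),
# and the byte-mode '(?-u:.+)' consumes the trailing \xFF byte:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"(?:.+)(?-u)(?:.+)").unwrap();
#     let m = re.find(b"\xCE\x93\xCE\x94\xFF").unwrap();
#     assert_eq!((m.start(), m.end()), (0, 5));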
| |
| |
| [[test]] |
| name = "case1" |
| regex = "a" |
| haystack = "A" |
| matches = [[0, 1]] |
| case-insensitive = true |
| unicode = false |
| |
| [[test]] |
| name = "case2" |
| regex = "[a-z]+" |
| haystack = "AaAaA" |
| matches = [[0, 5]] |
| case-insensitive = true |
| unicode = false |
| |
| [[test]] |
| name = "case3" |
| regex = "[a-z]+" |
| haystack = "aA\u212AaA" |
| matches = [[0, 7]] |
| case-insensitive = true |
| |
| [[test]] |
| name = "case4" |
| regex = "[a-z]+" |
| haystack = "aA\u212AaA" |
| matches = [[0, 2], [5, 7]] |
| case-insensitive = true |
| unicode = false |
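
# case3 and case4 hinge on simple case folding: with Unicode enabled, U+212A
# (KELVIN SIGN, 3 bytes in UTF-8) folds to 'k' and joins the match, while
# ASCII-only folding splits the match around it. A minimal sketch, assuming
# the regex crate's top-level API:
#
#     use regex::Regex;
#
#     let hay = "aA\u{212A}aA";
#
#     // Unicode case folding: one match covering all 7 bytes.
#     let re = Regex::new(r"(?i)[a-z]+").unwrap();
#     let ms: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..7]);
#
#     // ASCII-only case folding: U+212A no longer folds to 'k'.
#     let re = Regex::new(r"(?i-u)[a-z]+").unwrap();
#     let ms: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..2, 5..7]);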
| |
| |
| [[test]] |
| name = "negate1" |
| regex = "[^a]" |
| haystack = "δ" |
| matches = [[0, 2]] |
| |
| [[test]] |
| name = "negate2" |
| regex = "[^a]" |
| haystack = "δ" |
| matches = [[0, 1], [1, 2]] |
| unicode = false |
| utf8 = false |
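
# A sketch of the negate pair, assuming regex::bytes::Regex: a Unicode-mode
# negated class matches the whole codepoint δ (2 bytes), while the
# byte-oriented version matches each of its bytes separately:
#
#     use regex::bytes::Regex;
#
#     // Unicode mode: [^a] is "any codepoint except 'a'".
#     let re = Regex::new(r"[^a]").unwrap();
#     let ms: Vec<_> =
#         re.find_iter("δ".as_bytes()).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..2]);
#
#     // Byte mode: [^a] is "any byte except 'a'".
#     let re = Regex::new(r"(?-u)[^a]").unwrap();
#     let ms: Vec<_> =
#         re.find_iter("δ".as_bytes()).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..1, 1..2]);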
| |
| |
| [[test]] |
| name = "dotstar-prefix1" |
| regex = "a" |
| haystack = '\xFFa' |
| matches = [[1, 2]] |
| unicode = false |
| utf8 = false |
| unescape = true |
| |
| [[test]] |
| name = "dotstar-prefix2" |
| regex = "a" |
| haystack = '\xFFa' |
| matches = [[1, 2]] |
| utf8 = false |
| unescape = true |
| |
| |
| [[test]] |
| name = "null-bytes1" |
| regex = '[^\x00]+\x00' |
| haystack = 'foo\x00' |
| matches = [[0, 4]] |
| unicode = false |
| utf8 = false |
| unescape = true |
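
# This mirrors a common use case: scanning NUL-terminated strings in binary
# data. A minimal sketch, assuming regex::bytes::Regex:
#
#     use regex::bytes::Regex;
#
#     // Match a run of non-NUL bytes followed by the NUL terminator.
#     let re = Regex::new(r"(?-u)[^\x00]+\x00").unwrap();
#     let m = re.find(b"foo\x00").unwrap();
#     assert_eq!(m.range(), 0..4);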
| |
| |
| [[test]] |
| name = "word-ascii" |
| regex = '\w+' |
| haystack = "aδ" |
| matches = [[0, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "word-unicode" |
| regex = '\w+' |
| haystack = "aδ" |
| matches = [[0, 3]] |
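
# A sketch of the \w pair above, assuming the regex crate (the \d and \s
# pairs below follow the same Unicode-vs-ASCII pattern):
#
#     use regex::Regex;
#
#     // ASCII \w stops before δ; Unicode \w includes it (δ is 2 bytes).
#     let ascii = Regex::new(r"(?-u)\w+").unwrap();
#     assert_eq!(ascii.find("aδ").map(|m| m.range()), Some(0..1));
#
#     let unicode = Regex::new(r"\w+").unwrap();
#     assert_eq!(unicode.find("aδ").map(|m| m.range()), Some(0..3));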
| |
| [[test]] |
| name = "decimal-ascii" |
| regex = '\d+' |
| haystack = "1२३9" |
| matches = [[0, 1], [7, 8]] |
| unicode = false |
| |
| [[test]] |
| name = "decimal-unicode" |
| regex = '\d+' |
| haystack = "1२३9" |
| matches = [[0, 8]] |
| |
| [[test]] |
| name = "space-ascii" |
| regex = '\s+' |
| haystack = " \u1680" |
| matches = [[0, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "space-unicode" |
| regex = '\s+' |
| haystack = " \u1680" |
| matches = [[0, 4]] |
| |
| |
| [[test]] |
| # See: https://github.com/rust-lang/regex/issues/484 |
| name = "iter1-bytes" |
| regex = '' |
| haystack = "☃" |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3]] |
| utf8 = false |
| |
| [[test]] |
| # See: https://github.com/rust-lang/regex/issues/484 |
| name = "iter1-utf8" |
| regex = '' |
| haystack = "☃" |
| matches = [[0, 0], [3, 3]] |
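
# A sketch of the iter1 pair, assuming the regex crate: with UTF-8 match
# semantics, empty matches may not split the encoded codepoint ☃ (3 bytes),
# while the byte-oriented iterator reports every byte offset:
#
#     use regex::Regex;
#     use regex::bytes::Regex as BytesRegex;
#
#     let re = Regex::new("").unwrap();
#     let ms: Vec<_> = re.find_iter("☃").map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..0, 3..3]);
#
#     let re = BytesRegex::new("").unwrap();
#     let ms: Vec<_> =
#         re.find_iter("☃".as_bytes()).map(|m| m.range()).collect();
#     assert_eq!(ms, vec![0..0, 1..1, 2..2, 3..3]);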
| |
| [[test]] |
| # See: https://github.com/rust-lang/regex/issues/484 |
| # Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8. |
| name = "iter2-bytes" |
| regex = '' |
| haystack = 'b\xFFr' |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3]] |
| unescape = true |
| utf8 = false |
| |
| |
# These tests check that unanchored prefixes can munch through invalid UTF-8
# even when utf8 is enabled.
| # |
| # This test actually reflects an interesting simplification in how the Thompson |
| # NFA is constructed. It used to be that the NFA could be built with an |
| # unanchored prefix that either matched any byte or _only_ matched valid UTF-8. |
# But the latter turns out to be precarious when it comes to prefilters: if
# you search a haystack that contains invalid UTF-8 with an unanchored prefix
# that requires valid UTF-8, then prefilters are no longer a valid
# optimization, because you actually have to check that everything is valid
# UTF-8.
| # |
| # Originally, I had thought that we needed a valid UTF-8 unanchored prefix in |
| # order to guarantee that we only match at valid UTF-8 boundaries. But this |
| # isn't actually true! There are really only two things to consider here: |
| # |
# 1) Will a regex match split an encoded codepoint? No, because by
# construction, we ensure that a MATCH state can only be reached by following
# valid UTF-8 (assuming all of the UTF-8 modes are enabled).
| # |
| # 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no, |
| # assuming all of the UTF-8 modes are enabled. |
| [[test]] |
| name = "unanchored-invalid-utf8-match-100" |
| regex = '[a-z]' |
| haystack = '\xFFa\xFF' |
| matches = [[1, 2]] |
| unescape = true |
| utf8 = false |
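
# A minimal sketch of this expectation, assuming regex::bytes::Regex: the
# implicit unanchored prefix skips over the invalid \xFF bytes, while the
# reported match itself spans only valid UTF-8:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"[a-z]").unwrap();
#     let m = re.find(b"\xFFa\xFF").unwrap();
#     assert_eq!(m.range(), 1..2);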
| |
# This test shows that we can still prevent a match from occurring by
# inserting our own unanchored prefix that only matches valid UTF-8. Thus, if
# the behavior of never munching through invalid UTF-8 is needed, it can be
# achieved this way.
| [[test]] |
| name = "unanchored-invalid-utf8-nomatch" |
| regex = '^(?s:.)*?[a-z]' |
| haystack = '\xFFa\xFF' |
| matches = [] |
| unescape = true |
| utf8 = false |
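
# A sketch of the opposite behavior, assuming regex::bytes::Regex: the
# hand-written prefix '^(?s:.)*?' only matches valid UTF-8, so the \xFF
# bytes block any match:
#
#     use regex::bytes::Regex;
#
#     let re = Regex::new(r"^(?s:.)*?[a-z]").unwrap();
#     assert!(re.find(b"\xFFa\xFF").is_none());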
| |
| # This is a tricky test that makes sure we don't accidentally do a kind of |
| # unanchored search when we've requested that a regex engine not report |
| # empty matches that split a codepoint. This test caught a regression during |
| # development where the code for skipping over bad empty matches would do so |
| # even if the search should have been anchored. This is ultimately what led to |
| # making 'anchored' an 'Input' option, so that it was always clear what kind |
| # of search was being performed. (Before that, whether a search was anchored |
| # or not was a config knob on the regex engine.) This did wind up making DFAs |
| # a little more complex to configure (with their 'StartKind' knob), but it |
| # generally smoothed out everything else. |
| # |
| # Great example of a test whose failure motivated a sweeping API refactoring. |
| [[test]] |
| name = "anchored-iter-empty-utf8" |
| regex = '' |
| haystack = 'a☃z' |
| matches = [[0, 0], [1, 1]] |
| unescape = false |
| utf8 = true |
| anchored = true |
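
# One plausible way to express this, assuming regex-automata's meta::Regex
# (which exposes 'anchored' as an option on its Input, per the note above):
#
#     use regex_automata::{meta::Regex, Anchored, Input};
#
#     let re = Regex::new("").unwrap();
#     let input = Input::new("a☃z").anchored(Anchored::Yes);
#     let ms: Vec<_> = re.find_iter(input).map(|m| m.range()).collect();
#     // After the empty match at 1, the only anchored candidate (offset 2)
#     // would split the ☃ codepoint, so iteration stops.
#     assert_eq!(ms, vec![0..0, 1..1]);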