| # These test the UTF-8 modes exposed by regex-automata. Namely, when utf8 is |
| # true, then we promise that the haystack is valid UTF-8. (Otherwise behavior |
| # is unspecified.) This also corresponds to building the regex engine with the |
| # following two guarantees: |
| # |
| # 1) For any non-empty match reported, its span is guaranteed to correspond to |
| # valid UTF-8. |
| # 2) All empty or zero-width matches reported must never split a UTF-8 |
| # encoded codepoint. If the haystack has invalid UTF-8, then this results in |
| # unspecified behavior. |
| # |
| # Property (2) in particular is what we focus our testing on, since (1) is |
| # generally guaranteed by regex-syntax's AST-to-HIR translator and is well |
| # tested there. |
| # The thing with (2) is that it can't be described in the HIR, so the regex |
| # engines have to handle that case. Thus, we test it here. |
| # |
| # Note that it is possible to build a regex that has property (1) but not |
| # (2), and vice versa. This is done by building the HIR with 'utf8=true' but |
| # building the Thompson NFA with 'utf8=false'. We don't test that here because |
| # the harness doesn't expose a way to enable or disable UTF-8 mode with that |
| # granularity. Instead, those combinations are lightly tested via doc examples. |
| # That's not to say that (1) without (2) is uncommon. Indeed, ripgrep uses it |
| # because it cannot guarantee that its haystack is valid UTF-8. |
| |
| # This tests that an empty regex doesn't split a codepoint. |
| [[test]] |
| name = "empty-utf8yes" |
| regex = '' |
| haystack = '☃' |
| matches = [[0, 0], [3, 3]] |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-overlapping" |
| regex = '' |
| haystack = '☃' |
| matches = [[0, 0], [3, 3]] |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # This tests that an empty regex DOES split a codepoint when utf8=false. |
| [[test]] |
| name = "empty-utf8no" |
| regex = '' |
| haystack = '☃' |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3]] |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8no-overlapping" |
| regex = '' |
| haystack = '☃' |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3]] |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # This tests that an empty regex doesn't split a codepoint, even if we give |
| # it bounds entirely within the codepoint. |
| # |
| # This is one of the trickier cases and is what motivated the current UTF-8 |
| # mode design. In particular, at one point, this test failed the 'is_match' |
| # variant of the test but not 'find'. This is because the 'is_match' code path |
| # is specifically optimized for "was a match found" rather than "where is the |
| # match." In the former case, you don't really care about the empty-vs-non-empty |
| # matches, and thus, the codepoint splitting filtering logic wasn't getting |
| # applied. (In multiple ways across multiple regex engines.) In this way, you |
| # can wind up with a situation where 'is_match' says "yes," but 'find' says, |
| # "I didn't find anything." Which is... not great. |
| # |
| # I could have decided to say that providing boundaries that themselves split |
| # a codepoint would have unspecified behavior. But I couldn't quite convince |
| # myself that such boundaries were the only way to get an inconsistency between |
| # 'is_match' and 'find'. |
| # |
| # Note that I also tried to come up with a test like this that fails without |
| # using `bounds`. Specifically, a test where 'is_match' and 'find' disagree. |
| # But I couldn't do it, and I'm tempted to conclude it is impossible. The |
| # fundamental problem is that you need to simultaneously produce an empty match |
| # that splits a codepoint while *not* matching before or after the codepoint. |
| [[test]] |
| name = "empty-utf8yes-bounds" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 3] |
| matches = [] |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-bounds-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 3] |
| matches = [] |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # This tests that an empty regex splits a codepoint when the bounds are |
| # entirely within the codepoint. |
| [[test]] |
| name = "empty-utf8no-bounds" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 3] |
| matches = [[1, 1], [2, 2], [3, 3]] |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8no-bounds-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 3] |
| matches = [[1, 1], [2, 2], [3, 3]] |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # In this test, we anchor the search. Since the start position is also a UTF-8 |
| # boundary, we get a match. |
| [[test]] |
| name = "empty-utf8yes-anchored" |
| regex = '' |
| haystack = '𝛃' |
| matches = [[0, 0]] |
| anchored = true |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-anchored-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| matches = [[0, 0]] |
| anchored = true |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # Same as above, except with UTF-8 mode disabled. The result is almost the |
| # same, except that since this is an anchored search and we always look for |
| # all matches, the test harness keeps reporting matches until none are found. |
| # Because the search is anchored, matches are reported only so long as they |
| # are directly adjacent. With UTF-8 mode enabled (as in the test above), the |
| # next anchored search after the match at [0, 0] fails, so iteration stops |
| # there and never reaches the last match at [4, 4]. With UTF-8 mode disabled, |
| # every offset yields an adjacent empty match, so all five are reported. |
| [[test]] |
| name = "empty-utf8no-anchored" |
| regex = '' |
| haystack = '𝛃' |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| # |
| # Note that overlapping anchored searches are a little weird, and it's not |
| # totally clear what their semantics ought to be. For now, we just test the |
| # current behavior of our test shim that implements overlapping search. (This |
| # is one of the reasons why we don't really expose regex-level overlapping |
| # searches.) |
| [[test]] |
| name = "empty-utf8no-anchored-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| matches = [[0, 0]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # In this test, we anchor the search, but also set bounds. The bounds start the |
| # search in the middle of a codepoint, so there should never be a match. |
| [[test]] |
| name = "empty-utf8yes-anchored-bounds" |
| regex = '' |
| haystack = '𝛃' |
| matches = [] |
| bounds = [1, 3] |
| anchored = true |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-anchored-bounds-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| matches = [] |
| bounds = [1, 3] |
| anchored = true |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # Same as above, except with UTF-8 mode disabled. Without UTF-8 mode enabled, |
| # matching within a codepoint is allowed. And remember, as in the anchored test |
| # above with UTF-8 mode disabled, iteration will report all adjacent matches. |
| # The matches at [0, 0] and [4, 4] are not included because of the bounds of |
| # the search. |
| [[test]] |
| name = "empty-utf8no-anchored-bounds" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 3] |
| matches = [[1, 1], [2, 2], [3, 3]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| # |
| # Note that overlapping anchored searches are a little weird, and it's not |
| # totally clear what their semantics ought to be. For now, we just test the |
| # current behavior of our test shim that implements overlapping search. (This |
| # is one of the reasons why we don't really expose regex-level overlapping |
| # searches.) |
| [[test]] |
| name = "empty-utf8no-anchored-bounds-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 3] |
| matches = [[1, 1]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # This tests that we find the match at the end of the string when the bounds |
| # exclude the first match. |
| [[test]] |
| name = "empty-utf8yes-startbound" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [[4, 4]] |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-startbound-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [[4, 4]] |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # Same as above, except since UTF-8 mode is disabled, we also find the matches |
| # in between that split the codepoint. |
| [[test]] |
| name = "empty-utf8no-startbound" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [[1, 1], [2, 2], [3, 3], [4, 4]] |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8no-startbound-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [[1, 1], [2, 2], [3, 3], [4, 4]] |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # This tests that we don't find any matches in an anchored search, even when |
| # the bounds include a match (at the end). |
| [[test]] |
| name = "empty-utf8yes-anchored-startbound" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [] |
| anchored = true |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-anchored-startbound-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [] |
| anchored = true |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # Same as above, except since UTF-8 mode is disabled, we also find the matches |
| # in between that split the codepoint. Even though this is an anchored search, |
| # since the matches are adjacent, we find all of them. |
| [[test]] |
| name = "empty-utf8no-anchored-startbound" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [[1, 1], [2, 2], [3, 3], [4, 4]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| # |
| # Note that overlapping anchored searches are a little weird, and it's not |
| # totally clear what their semantics ought to be. For now, we just test the |
| # current behavior of our test shim that implements overlapping search. (This |
| # is one of the reasons why we don't really expose regex-level overlapping |
| # searches.) |
| [[test]] |
| name = "empty-utf8no-anchored-startbound-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [1, 4] |
| matches = [[1, 1]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # This tests that we find the match at the end of the haystack in UTF-8 mode |
| # when our bounds only include the empty string at the end of the haystack. |
| [[test]] |
| name = "empty-utf8yes-anchored-endbound" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [4, 4] |
| matches = [[4, 4]] |
| anchored = true |
| unicode = true |
| utf8 = true |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8yes-anchored-endbound-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [4, 4] |
| matches = [[4, 4]] |
| anchored = true |
| unicode = true |
| utf8 = true |
| match-kind = "all" |
| search-kind = "overlapping" |
| |
| # Same as above, but with UTF-8 mode disabled. Results remain the same since |
| # the only possible match does not split a codepoint. |
| [[test]] |
| name = "empty-utf8no-anchored-endbound" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [4, 4] |
| matches = [[4, 4]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| |
| # Tests the overlapping case of the above. |
| [[test]] |
| name = "empty-utf8no-anchored-endbound-overlapping" |
| regex = '' |
| haystack = '𝛃' |
| bounds = [4, 4] |
| matches = [[4, 4]] |
| anchored = true |
| unicode = true |
| utf8 = false |
| match-kind = "all" |
| search-kind = "overlapping" |