| # These tests are specifically written to test the regex-lite crate. While it |
| # largely has the same semantics as the regex crate, there are some differences |
| # around Unicode support and UTF-8. |
| # |
| # To be clear, regex-lite supports far fewer patterns because of its lack of |
| # Unicode support, nested character classes and character class set operations. |
| # What we're talking about here are the patterns that both crates support but |
| # where the semantics might differ. |
| |
| # regex-lite uses ASCII definitions for Perl character classes, so '\d' is |
| # equivalent to [0-9]. The haystack is a single non-ASCII decimal digit |
| # (looks like U+1815 MONGOLIAN DIGIT FIVE; confirm), so no match is expected |
| # even though this test sets 'unicode = true'. |
| [[test]] |
| name = "perl-class-decimal" |
| regex = '\d' |
| haystack = '᠕' |
| matches = [] |
| unicode = true |
| |
| # regex-lite uses ASCII definitions for Perl character classes, so '\s' only |
| # matches ASCII whitespace. U+2000 (EN QUAD) is whitespace in Unicode but is |
| # not ASCII, so no match is expected. |
| # |
| # The haystack is a basic (double-quoted) TOML string so that the "\u2000" |
| # escape is decoded; a literal (single-quoted) string performs no escaping. |
| [[test]] |
| name = "perl-class-space" |
| regex = '\s' |
| haystack = "\u2000" |
| matches = [] |
| unicode = true |
| |
| # regex-lite uses ASCII definitions for Perl character classes, so '\w' is |
| # equivalent to [0-9A-Za-z_]. The haystack is U+03B4 (GREEK SMALL LETTER |
| # DELTA): a letter, but not an ASCII word character, so no match is expected. |
| [[test]] |
| name = "perl-class-word" |
| regex = '\w' |
| haystack = 'δ' |
| matches = [] |
| unicode = true |
| |
| # regex-lite uses the ASCII definition of word for word boundary assertions. |
| # The haystack's only character, U+03B4 (GREEK SMALL LETTER DELTA), is not an |
| # ASCII word character, so there is no word/non-word transition anywhere in |
| # the haystack and '\b' is expected to match nowhere. |
| [[test]] |
| name = "word-boundary" |
| regex = '\b' |
| haystack = 'δ' |
| matches = [] |
| unicode = true |
| |
| # regex-lite uses the ASCII definition of word for negated word boundary |
| # assertions. But note that it should still not split codepoints! |
| # |
| # The haystack is one codepoint (U+03B4) that is two bytes long in UTF-8. It |
| # is not an ASCII word character, so '\B' holds at both ends of the haystack |
| # (offsets 0 and 2). Offset 1 falls inside the codepoint and must not be |
| # reported as a match. |
| [[test]] |
| name = "word-boundary-negated" |
| regex = '\B' |
| haystack = 'δ' |
| matches = [[0, 0], [2, 2]] |
| unicode = true |
| |
| # While we're here, the empty regex---which matches at every |
| # position---shouldn't split a codepoint either. |
| # |
| # The haystack is a single codepoint that is four bytes long in UTF-8, so the |
| # only expected (empty) matches are at offsets 0 and 4. Offsets 1, 2 and 3 |
| # fall inside the codepoint and must be skipped. |
| [[test]] |
| name = "empty-no-split-codepoint" |
| regex = '' |
| haystack = '💩' |
| matches = [[0, 0], [4, 4]] |
| unicode = true |
| |
| # A dot always matches a full codepoint. The haystack is one codepoint that |
| # is four bytes long in UTF-8, and the single expected match spans all four |
| # bytes ([0, 4]) rather than one byte, even though this test sets |
| # 'unicode = false'. |
| [[test]] |
| name = "dot-always-matches-codepoint" |
| regex = '.' |
| haystack = '💩' |
| matches = [[0, 4]] |
| unicode = false |
| |
| # A negated character class also always matches a full codepoint. '[^a]' |
| # must consume the whole four-byte codepoint in the haystack as one match |
| # ([0, 4]) rather than matching individual bytes, even though this test sets |
| # 'unicode = false'. |
| [[test]] |
| name = "negated-class-always-matches-codepoint" |
| regex = '[^a]' |
| haystack = '💩' |
| matches = [[0, 4]] |
| unicode = false |
| |
| # regex-lite only supports ASCII-aware case insensitive matching. The |
| # haystack is U+017F (LATIN SMALL LETTER LONG S), which Unicode simple case |
| # folding treats as equivalent to 's'. Since only ASCII letters are folded |
| # here, a case insensitive 's' is expected not to match it. |
| [[test]] |
| name = "case-insensitive-is-ascii-only" |
| regex = 's' |
| haystack = 'ſ' |
| matches = [] |
| unicode = true |
| case-insensitive = true |
| |
| # Negated word boundaries shouldn't split a codepoint, but they will match |
| # between invalid UTF-8. |
| # |
| # This test is only valid for a 'bytes' API, but that doesn't (yet) exist in |
| # regex-lite. This can't happen in the main API because &str can't contain |
| # invalid UTF-8. |
| # [[test]] |
| # name = "word-boundary-invalid-utf8" |
| # regex = '\B' |
| # haystack = '\xFF\xFF\xFF\xFF' |
| # unescape = true |
| # matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]] |
| # unicode = true |
| # utf8 = false |