| # Some of these are cribbed from RE2's test suite. |
| |
| # These test \b. Below are tests for \B. |
| [[test]] |
| name = "wb1" |
| regex = '\b' |
| haystack = "" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb2" |
| regex = '\b' |
| haystack = "a" |
| matches = [[0, 0], [1, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "wb3" |
| regex = '\b' |
| haystack = "ab" |
| matches = [[0, 0], [2, 2]] |
| unicode = false |
| |
| [[test]] |
| name = "wb4" |
| regex = '^\b' |
| haystack = "ab" |
| matches = [[0, 0]] |
| unicode = false |
| |
| [[test]] |
| name = "wb5" |
| regex = '\b$' |
| haystack = "ab" |
| matches = [[2, 2]] |
| unicode = false |
| |
| [[test]] |
| name = "wb6" |
| regex = '^\b$' |
| haystack = "ab" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb7" |
| regex = '\bbar\b' |
| haystack = "nobar bar foo bar" |
| matches = [[6, 9], [14, 17]] |
| unicode = false |
| |
| [[test]] |
| name = "wb8" |
| regex = 'a\b' |
| haystack = "faoa x" |
| matches = [[3, 4]] |
| unicode = false |
| |
| [[test]] |
| name = "wb9" |
| regex = '\bbar' |
| haystack = "bar x" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb10" |
| regex = '\bbar' |
| haystack = "foo\nbar x" |
| matches = [[4, 7]] |
| unicode = false |
| |
| [[test]] |
| name = "wb11" |
| regex = 'bar\b' |
| haystack = "foobar" |
| matches = [[3, 6]] |
| unicode = false |
| |
| [[test]] |
| name = "wb12" |
| regex = 'bar\b' |
| haystack = "foobar\nxxx" |
| matches = [[3, 6]] |
| unicode = false |
| |
| [[test]] |
| name = "wb13" |
| regex = '(?:foo|bar|[A-Z])\b' |
| haystack = "foo" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb14" |
| regex = '(?:foo|bar|[A-Z])\b' |
| haystack = "foo\n" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb15" |
| regex = '\b(?:foo|bar|[A-Z])' |
| haystack = "foo" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb16" |
| regex = '\b(?:foo|bar|[A-Z])\b' |
| haystack = "X" |
| matches = [[0, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "wb17" |
| regex = '\b(?:foo|bar|[A-Z])\b' |
| haystack = "XY" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb18" |
| regex = '\b(?:foo|bar|[A-Z])\b' |
| haystack = "bar" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb19" |
| regex = '\b(?:foo|bar|[A-Z])\b' |
| haystack = "foo" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb20" |
| regex = '\b(?:foo|bar|[A-Z])\b' |
| haystack = "foo\n" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb21" |
| regex = '\b(?:foo|bar|[A-Z])\b' |
| haystack = "ffoo bbar N x" |
| matches = [[10, 11]] |
| unicode = false |
| |
| [[test]] |
| name = "wb22" |
| regex = '\b(?:fo|foo)\b' |
| haystack = "fo" |
| matches = [[0, 2]] |
| unicode = false |
| |
| [[test]] |
| name = "wb23" |
| regex = '\b(?:fo|foo)\b' |
| haystack = "foo" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb24" |
| regex = '\b\b' |
| haystack = "" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb25" |
| regex = '\b\b' |
| haystack = "a" |
| matches = [[0, 0], [1, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "wb26" |
| regex = '\b$' |
| haystack = "" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb27" |
| regex = '\b$' |
| haystack = "x" |
| matches = [[1, 1]] |
| unicode = false |
| |
| [[test]] |
| name = "wb28" |
| regex = '\b$' |
| haystack = "y x" |
| matches = [[3, 3]] |
| unicode = false |
| |
| # Unlike the surrounding tests, this one does not set `unicode = false`; only |
| # the \b assertion is switched to ASCII via the inline (?-u:...) group, while |
| # '.' keeps the default mode. |
| # NOTE(review): presumably this is because '.' with Unicode disabled could |
| # match invalid UTF-8 while UTF-8 mode is still enabled here -- confirm. |
| [[test]] |
| name = "wb29" |
| regex = '(?-u:\b).$' |
| haystack = "x" |
| matches = [[0, 1]] |
| |
| [[test]] |
| name = "wb30" |
| regex = '^\b(?:fo|foo)\b' |
| haystack = "fo" |
| matches = [[0, 2]] |
| unicode = false |
| |
| [[test]] |
| name = "wb31" |
| regex = '^\b(?:fo|foo)\b' |
| haystack = "foo" |
| matches = [[0, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "wb32" |
| regex = '^\b$' |
| haystack = "" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb33" |
| regex = '^\b$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb34" |
| regex = '^(?-u:\b).$' |
| haystack = "x" |
| matches = [[0, 1]] |
| |
| [[test]] |
| name = "wb35" |
| regex = '^(?-u:\b).(?-u:\b)$' |
| haystack = "x" |
| matches = [[0, 1]] |
| |
| [[test]] |
| name = "wb36" |
| regex = '^^^^^\b$$$$$' |
| haystack = "" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb37" |
| regex = '^^^^^(?-u:\b).$$$$$' |
| haystack = "x" |
| matches = [[0, 1]] |
| |
| [[test]] |
| name = "wb38" |
| regex = '^^^^^\b$$$$$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| |
| [[test]] |
| name = "wb39" |
| regex = '^^^^^(?-u:\b\b\b).(?-u:\b\b\b)$$$$$' |
| haystack = "x" |
| matches = [[0, 1]] |
| |
| [[test]] |
| name = "wb40" |
| regex = '(?-u:\b).+(?-u:\b)' |
| haystack = "$$abc$$" |
| matches = [[2, 5]] |
| |
| [[test]] |
| name = "wb41" |
| regex = '\b' |
| haystack = "a b c" |
| matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5]] |
| unicode = false |
| |
| [[test]] |
| name = "wb42" |
| regex = '\bfoo\b' |
| haystack = "zzz foo zzz" |
| matches = [[4, 7]] |
| unicode = false |
| |
| [[test]] |
| name = "wb43" |
| regex = '\b^' |
| haystack = "ab" |
| matches = [[0, 0]] |
| unicode = false |
| |
| [[test]] |
| name = "wb44" |
| regex = '$\b' |
| haystack = "ab" |
| matches = [[2, 2]] |
| unicode = false |
| |
| |
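| # Tests for \B. Note that ASCII \B (i.e., \B with Unicode mode disabled, as in |
| # all of the nb* tests) is not allowed if UTF-8 mode is enabled, so we have to |
| # disable UTF-8 mode (`utf8 = false`) for those tests. This is because ASCII |
| # \B can match at positions that fall inside a multi-byte UTF-8 encoded |
| # codepoint. |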
| [[test]] |
| name = "nb1" |
| regex = '\Bfoo\B' |
| haystack = "n foo xfoox that" |
| matches = [[7, 10]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb2" |
| regex = 'a\B' |
| haystack = "faoa x" |
| matches = [[1, 2]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb3" |
| regex = '\Bbar' |
| haystack = "bar x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb4" |
| regex = '\Bbar' |
| haystack = "foo\nbar x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb5" |
| regex = 'bar\B' |
| haystack = "foobar" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb6" |
| regex = 'bar\B' |
| haystack = "foobar\nxxx" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb7" |
| regex = '(?:foo|bar|[A-Z])\B' |
| haystack = "foox" |
| matches = [[0, 3]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb8" |
| regex = '(?:foo|bar|[A-Z])\B' |
| haystack = "foo\n" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb9" |
| regex = '\B' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb10" |
| regex = '\B' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb11" |
| regex = '\B(?:foo|bar|[A-Z])' |
| haystack = "foo" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb12" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "xXy" |
| matches = [[1, 2]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb13" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "XY" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb14" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "XYZ" |
| matches = [[1, 2]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb15" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "abara" |
| matches = [[1, 4]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb16" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "xfoo_" |
| matches = [[1, 4]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb17" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "xfoo\n" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb18" |
| regex = '\B(?:foo|bar|[A-Z])\B' |
| haystack = "foo bar vNX" |
| matches = [[9, 10]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb19" |
| regex = '\B(?:fo|foo)\B' |
| haystack = "xfoo" |
| matches = [[1, 3]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb20" |
| regex = '\B(?:foo|fo)\B' |
| haystack = "xfooo" |
| matches = [[1, 4]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb21" |
| regex = '\B\B' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb22" |
| regex = '\B\B' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb23" |
| regex = '\B$' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb24" |
| regex = '\B$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb25" |
| regex = '\B$' |
| haystack = "y x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb26" |
| regex = '\B.$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb27" |
| regex = '^\B(?:fo|foo)\B' |
| haystack = "fo" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| # Same pattern as nb27, but searched against "foo" (mirroring the wb30/wb31 |
| # pair) rather than repeating nb27's haystack. \B cannot hold at offset 0 |
| # because the haystack starts with a word byte, so there is still no match. |
| [[test]] |
| name = "nb28" |
| regex = '^\B(?:fo|foo)\B' |
| haystack = "foo" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb29" |
| regex = '^\B' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb30" |
| regex = '^\B' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb31" |
| regex = '^\B\B' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb32" |
| regex = '^\B\B' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb33" |
| regex = '^\B$' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb34" |
| regex = '^\B$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb35" |
| regex = '^\B.$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb36" |
| regex = '^\B.\B$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb37" |
| regex = '^^^^^\B$$$$$' |
| haystack = "" |
| matches = [[0, 0]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb38" |
| regex = '^^^^^\B.$$$$$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "nb39" |
| regex = '^^^^^\B$$$$$' |
| haystack = "x" |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| |
| # unicode1* and unicode2* work for both Unicode and ASCII because all matches |
| # are reported as byte offsets, and « and » are not word characters at either |
| # the character or byte level. |
| [[test]] |
| name = "unicode1" |
| regex = '\bx\b' |
| haystack = "«x" |
| matches = [[2, 3]] |
| |
| [[test]] |
| name = "unicode1-only-ascii" |
| regex = '\bx\b' |
| haystack = "«x" |
| matches = [[2, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "unicode2" |
| regex = '\bx\b' |
| haystack = "x»" |
| matches = [[0, 1]] |
| |
| [[test]] |
| name = "unicode2-only-ascii" |
| regex = '\bx\b' |
| haystack = "x»" |
| matches = [[0, 1]] |
| unicode = false |
| |
| # ASCII word boundaries are completely oblivious to Unicode characters, so |
| # even though β is a word character, an ASCII \b treats it as a non-word |
| # character and reports a word boundary when it is adjacent to an ASCII word |
| # character. (The ASCII \b only looks at the leading byte of β.) For Unicode |
| # \b, the tests are precisely inverted. |
| [[test]] |
| name = "unicode3" |
| regex = '\bx\b' |
| haystack = 'áxβ' |
| matches = [] |
| |
| [[test]] |
| name = "unicode3-only-ascii" |
| regex = '\bx\b' |
| haystack = 'áxβ' |
| matches = [[2, 3]] |
| unicode = false |
| |
| [[test]] |
| name = "unicode4" |
| regex = '\Bx\B' |
| haystack = 'áxβ' |
| matches = [[2, 3]] |
| |
| [[test]] |
| name = "unicode4-only-ascii" |
| regex = '\Bx\B' |
| haystack = 'áxβ' |
| matches = [] |
| unicode = false |
| utf8 = false |
| |
| # The same as the unicode5-not* tests below, but with \b instead of \B as a |
| # sanity check. |
| [[test]] |
| name = "unicode5" |
| regex = '\b' |
| haystack = "0\U0007EF5E" |
| matches = [[0, 0], [1, 1]] |
| |
| [[test]] |
| name = "unicode5-only-ascii" |
| regex = '\b' |
| haystack = "0\U0007EF5E" |
| matches = [[0, 0], [1, 1]] |
| unicode = false |
| utf8 = false |
| |
| [[test]] |
| name = "unicode5-noutf8" |
| regex = '\b' |
| haystack = '0\xFF\xFF\xFF\xFF' |
| matches = [[0, 0], [1, 1]] |
| unescape = true |
| utf8 = false |
| |
| [[test]] |
| name = "unicode5-noutf8-only-ascii" |
| regex = '\b' |
| haystack = '0\xFF\xFF\xFF\xFF' |
| matches = [[0, 0], [1, 1]] |
| unescape = true |
| unicode = false |
| utf8 = false |
| |
| # Weird special case to ensure that ASCII \B treats each individual code unit |
| # as a non-word byte. (The specific codepoint is irrelevant. It's an arbitrary |
| # codepoint that uses 4 bytes in its UTF-8 encoding and is not a member of the |
| # \w character class.) |
| [[test]] |
| name = "unicode5-not" |
| regex = '\B' |
| haystack = "0\U0007EF5E" |
| matches = [[5, 5]] |
| |
| [[test]] |
| name = "unicode5-not-only-ascii" |
| regex = '\B' |
| haystack = "0\U0007EF5E" |
| matches = [[2, 2], [3, 3], [4, 4], [5, 5]] |
| unicode = false |
| utf8 = false |
| |
| # This gets no matches since \B only matches in the presence of valid UTF-8 |
| # when Unicode is enabled, even when UTF-8 mode is disabled. |
| [[test]] |
| name = "unicode5-not-noutf8" |
| regex = '\B' |
| haystack = '0\xFF\xFF\xFF\xFF' |
| matches = [] |
| unescape = true |
| utf8 = false |
| |
| # But this DOES get matches since \B in ASCII mode only looks at individual |
| # bytes. |
| [[test]] |
| name = "unicode5-not-noutf8-only-ascii" |
| regex = '\B' |
| haystack = '0\xFF\xFF\xFF\xFF' |
| matches = [[2, 2], [3, 3], [4, 4], [5, 5]] |
| unescape = true |
| unicode = false |
| utf8 = false |
| |
| # Some tests of no particular significance. |
| [[test]] |
| name = "unicode6" |
| regex = '\b[0-9]+\b' |
| haystack = "foo 123 bar 456 quux 789" |
| matches = [[4, 7], [12, 15], [21, 24]] |
| |
| [[test]] |
| name = "unicode7" |
| regex = '\b[0-9]+\b' |
| haystack = "foo 123 bar a456 quux 789" |
| matches = [[4, 7], [22, 25]] |
| |
| [[test]] |
| name = "unicode8" |
| regex = '\b[0-9]+\b' |
| haystack = "foo 123 bar 456a quux 789" |
| matches = [[4, 7], [22, 25]] |
| |
| # A variant of the problem described here: |
| # https://github.com/google/re2/blob/89567f5de5b23bb5ad0c26cbafc10bdc7389d1fa/re2/dfa.cc#L658-L667 |
| [[test]] |
| name = "alt-with-assertion-repetition" |
| regex = '(?:\b|%)+' |
| haystack = "z%" |
| bounds = [1, 2] |
| anchored = true |
| matches = [[1, 1]] |