| //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements a glob pattern matcher. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Support/GlobPattern.h" |
| #include "llvm/ADT/StringRef.h" |
| #include "llvm/Support/Errc.h" |
| |
| using namespace llvm; |
| |
| // Expands character ranges and returns a bitmap. |
| // For example, "a-cf-hz" is expanded to "abcfghz". |
| static Expected<BitVector> expand(StringRef S, StringRef Original) { |
| BitVector BV(256, false); |
| |
| // Expand X-Y. |
| for (;;) { |
| if (S.size() < 3) |
| break; |
| |
| uint8_t Start = S[0]; |
| uint8_t End = S[2]; |
| |
| // If it doesn't start with something like X-Y, |
| // consume the first character and proceed. |
| if (S[1] != '-') { |
| BV[Start] = true; |
| S = S.substr(1); |
| continue; |
| } |
| |
| // It must be in the form of X-Y. |
| // Validate it and then interpret the range. |
| if (Start > End) |
| return make_error<StringError>("invalid glob pattern: " + Original, |
| errc::invalid_argument); |
| |
| for (int C = Start; C <= End; ++C) |
| BV[(uint8_t)C] = true; |
| S = S.substr(3); |
| } |
| |
| for (char C : S) |
| BV[(uint8_t)C] = true; |
| return BV; |
| } |
| |
| // Identify brace expansions in S and return the list of patterns they expand |
| // into. |
| static Expected<SmallVector<std::string, 1>> |
| parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) { |
| SmallVector<std::string> SubPatterns = {S.str()}; |
| if (!MaxSubPatterns || !S.contains('{')) |
| return std::move(SubPatterns); |
| |
| struct BraceExpansion { |
| size_t Start; |
| size_t Length; |
| SmallVector<StringRef, 2> Terms; |
| }; |
| SmallVector<BraceExpansion, 0> BraceExpansions; |
| |
| BraceExpansion *CurrentBE = nullptr; |
| size_t TermBegin; |
| for (size_t I = 0, E = S.size(); I != E; ++I) { |
| if (S[I] == '[') { |
| I = S.find(']', I + 2); |
| if (I == std::string::npos) |
| return make_error<StringError>("invalid glob pattern, unmatched '['", |
| errc::invalid_argument); |
| } else if (S[I] == '{') { |
| if (CurrentBE) |
| return make_error<StringError>( |
| "nested brace expansions are not supported", |
| errc::invalid_argument); |
| CurrentBE = &BraceExpansions.emplace_back(); |
| CurrentBE->Start = I; |
| TermBegin = I + 1; |
| } else if (S[I] == ',') { |
| if (!CurrentBE) |
| continue; |
| CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); |
| TermBegin = I + 1; |
| } else if (S[I] == '}') { |
| if (!CurrentBE) |
| continue; |
| if (CurrentBE->Terms.empty()) |
| return make_error<StringError>( |
| "empty or singleton brace expansions are not supported", |
| errc::invalid_argument); |
| CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin)); |
| CurrentBE->Length = I - CurrentBE->Start + 1; |
| CurrentBE = nullptr; |
| } else if (S[I] == '\\') { |
| if (++I == E) |
| return make_error<StringError>("invalid glob pattern, stray '\\'", |
| errc::invalid_argument); |
| } |
| } |
| if (CurrentBE) |
| return make_error<StringError>("incomplete brace expansion", |
| errc::invalid_argument); |
| |
| size_t NumSubPatterns = 1; |
| for (auto &BE : BraceExpansions) { |
| if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) { |
| NumSubPatterns = std::numeric_limits<size_t>::max(); |
| break; |
| } |
| NumSubPatterns *= BE.Terms.size(); |
| } |
| if (NumSubPatterns > *MaxSubPatterns) |
| return make_error<StringError>("too many brace expansions", |
| errc::invalid_argument); |
| // Replace brace expansions in reverse order so that we don't invalidate |
| // earlier start indices |
| for (auto &BE : reverse(BraceExpansions)) { |
| SmallVector<std::string> OrigSubPatterns; |
| std::swap(SubPatterns, OrigSubPatterns); |
| for (StringRef Term : BE.Terms) |
| for (StringRef Orig : OrigSubPatterns) |
| SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term); |
| } |
| return std::move(SubPatterns); |
| } |
| |
| Expected<GlobPattern> |
| GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) { |
| GlobPattern Pat; |
| |
| // Store the prefix that does not contain any metacharacter. |
| size_t PrefixSize = S.find_first_of("?*[{\\"); |
| Pat.Prefix = S.substr(0, PrefixSize); |
| if (PrefixSize == std::string::npos) |
| return Pat; |
| S = S.substr(PrefixSize); |
| |
| SmallVector<std::string, 1> SubPats; |
| if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats)) |
| return std::move(Err); |
| for (StringRef SubPat : SubPats) { |
| auto SubGlobOrErr = SubGlobPattern::create(SubPat); |
| if (!SubGlobOrErr) |
| return SubGlobOrErr.takeError(); |
| Pat.SubGlobs.push_back(*SubGlobOrErr); |
| } |
| |
| return Pat; |
| } |
| |
| Expected<GlobPattern::SubGlobPattern> |
| GlobPattern::SubGlobPattern::create(StringRef S) { |
| SubGlobPattern Pat; |
| |
| // Parse brackets. |
| Pat.Pat.assign(S.begin(), S.end()); |
| for (size_t I = 0, E = S.size(); I != E; ++I) { |
| if (S[I] == '[') { |
| // ']' is allowed as the first character of a character class. '[]' is |
| // invalid. So, just skip the first character. |
| ++I; |
| size_t J = S.find(']', I + 1); |
| if (J == StringRef::npos) |
| return make_error<StringError>("invalid glob pattern, unmatched '['", |
| errc::invalid_argument); |
| StringRef Chars = S.substr(I, J - I); |
| bool Invert = S[I] == '^' || S[I] == '!'; |
| Expected<BitVector> BV = |
| Invert ? expand(Chars.substr(1), S) : expand(Chars, S); |
| if (!BV) |
| return BV.takeError(); |
| if (Invert) |
| BV->flip(); |
| Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)}); |
| I = J; |
| } else if (S[I] == '\\') { |
| if (++I == E) |
| return make_error<StringError>("invalid glob pattern, stray '\\'", |
| errc::invalid_argument); |
| } |
| } |
| return Pat; |
| } |
| |
| bool GlobPattern::match(StringRef S) const { |
| if (!S.consume_front(Prefix)) |
| return false; |
| if (SubGlobs.empty() && S.empty()) |
| return true; |
| for (auto &Glob : SubGlobs) |
| if (Glob.match(S)) |
| return true; |
| return false; |
| } |
| |
| // Factor the pattern into segments split by '*'. The segment is matched |
| // sequentianlly by finding the first occurrence past the end of the previous |
| // match. |
| bool GlobPattern::SubGlobPattern::match(StringRef Str) const { |
| const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(), |
| *SavedS = S; |
| const char *const PEnd = P + Pat.size(), *const End = S + Str.size(); |
| size_t B = 0, SavedB = 0; |
| while (S != End) { |
| if (P == PEnd) |
| ; |
| else if (*P == '*') { |
| // The non-* substring on the left of '*' matches the tail of S. Save the |
| // positions to be used by backtracking if we see a mismatch later. |
| SegmentBegin = ++P; |
| SavedS = S; |
| SavedB = B; |
| continue; |
| } else if (*P == '[') { |
| if (Brackets[B].Bytes[uint8_t(*S)]) { |
| P = Pat.data() + Brackets[B++].NextOffset; |
| ++S; |
| continue; |
| } |
| } else if (*P == '\\') { |
| if (*++P == *S) { |
| ++P; |
| ++S; |
| continue; |
| } |
| } else if (*P == *S || *P == '?') { |
| ++P; |
| ++S; |
| continue; |
| } |
| if (!SegmentBegin) |
| return false; |
| // We have seen a '*'. Backtrack to the saved positions. Shift the S |
| // position to probe the next starting position in the segment. |
| P = SegmentBegin; |
| S = ++SavedS; |
| B = SavedB; |
| } |
| // All bytes in Str have been matched. Return true if the rest part of Pat is |
| // empty or contains only '*'. |
| return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos; |
| } |