Chris Wailes | 32f7835 | 2021-07-20 14:04:55 -0700 | [diff] [blame] | 1 | use ast; |
| 2 | use hir; |
| 3 | |
| 4 | use Result; |
| 5 | |
| 6 | /// A builder for a regular expression parser. |
| 7 | /// |
| 8 | /// This builder permits modifying configuration options for the parser. |
| 9 | /// |
| 10 | /// This type combines the builder options for both the |
| 11 | /// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) |
| 12 | /// and the |
| 13 | /// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). |
| 14 | #[derive(Clone, Debug, Default)] |
| 15 | pub struct ParserBuilder { |
| 16 | ast: ast::parse::ParserBuilder, |
| 17 | hir: hir::translate::TranslatorBuilder, |
| 18 | } |
| 19 | |
| 20 | impl ParserBuilder { |
| 21 | /// Create a new parser builder with a default configuration. |
| 22 | pub fn new() -> ParserBuilder { |
| 23 | ParserBuilder::default() |
| 24 | } |
| 25 | |
| 26 | /// Build a parser from this configuration with the given pattern. |
| 27 | pub fn build(&self) -> Parser { |
| 28 | Parser { ast: self.ast.build(), hir: self.hir.build() } |
| 29 | } |
| 30 | |
| 31 | /// Set the nesting limit for this parser. |
| 32 | /// |
| 33 | /// The nesting limit controls how deep the abstract syntax tree is allowed |
| 34 | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
| 35 | /// groups), then an error is returned by the parser. |
| 36 | /// |
| 37 | /// The purpose of this limit is to act as a heuristic to prevent stack |
| 38 | /// overflow for consumers that do structural induction on an `Ast` using |
| 39 | /// explicit recursion. While this crate never does this (instead using |
| 40 | /// constant stack space and moving the call stack to the heap), other |
| 41 | /// crates may. |
| 42 | /// |
| 43 | /// This limit is not checked until the entire Ast is parsed. Therefore, |
| 44 | /// if callers want to put a limit on the amount of heap space used, then |
| 45 | /// they should impose a limit on the length, in bytes, of the concrete |
| 46 | /// pattern string. In particular, this is viable since this parser |
| 47 | /// implementation will limit itself to heap space proportional to the |
| 48 | /// lenth of the pattern string. |
| 49 | /// |
| 50 | /// Note that a nest limit of `0` will return a nest limit error for most |
| 51 | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
| 52 | /// not `ab`, since `ab` requires a concatenation, which results in a nest |
| 53 | /// depth of `1`. In general, a nest limit is not something that manifests |
| 54 | /// in an obvious way in the concrete syntax, therefore, it should not be |
| 55 | /// used in a granular way. |
| 56 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { |
| 57 | self.ast.nest_limit(limit); |
| 58 | self |
| 59 | } |
| 60 | |
| 61 | /// Whether to support octal syntax or not. |
| 62 | /// |
| 63 | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
| 64 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
| 65 | /// `\141` are all equivalent regular expressions, where the last example |
| 66 | /// shows octal syntax. |
| 67 | /// |
| 68 | /// While supporting octal syntax isn't in and of itself a problem, it does |
| 69 | /// make good error messages harder. That is, in PCRE based regex engines, |
| 70 | /// syntax like `\0` invokes a backreference, which is explicitly |
| 71 | /// unsupported in Rust's regex engine. However, many users expect it to |
| 72 | /// be supported. Therefore, when octal support is disabled, the error |
| 73 | /// message will explicitly mention that backreferences aren't supported. |
| 74 | /// |
| 75 | /// Octal syntax is disabled by default. |
| 76 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { |
| 77 | self.ast.octal(yes); |
| 78 | self |
| 79 | } |
| 80 | |
| 81 | /// When enabled, the parser will permit the construction of a regular |
| 82 | /// expression that may match invalid UTF-8. |
| 83 | /// |
| 84 | /// When disabled (the default), the parser is guaranteed to produce |
| 85 | /// an expression that will only ever match valid UTF-8 (otherwise, the |
| 86 | /// parser will return an error). |
| 87 | /// |
| 88 | /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII |
| 89 | /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause |
| 90 | /// the parser to return an error. Namely, a negated ASCII word boundary |
| 91 | /// can result in matching positions that aren't valid UTF-8 boundaries. |
| 92 | pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { |
| 93 | self.hir.allow_invalid_utf8(yes); |
| 94 | self |
| 95 | } |
| 96 | |
| 97 | /// Enable verbose mode in the regular expression. |
| 98 | /// |
| 99 | /// When enabled, verbose mode permits insigificant whitespace in many |
| 100 | /// places in the regular expression, as well as comments. Comments are |
| 101 | /// started using `#` and continue until the end of the line. |
| 102 | /// |
| 103 | /// By default, this is disabled. It may be selectively enabled in the |
| 104 | /// regular expression by using the `x` flag regardless of this setting. |
| 105 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { |
| 106 | self.ast.ignore_whitespace(yes); |
| 107 | self |
| 108 | } |
| 109 | |
| 110 | /// Enable or disable the case insensitive flag by default. |
| 111 | /// |
| 112 | /// By default this is disabled. It may alternatively be selectively |
| 113 | /// enabled in the regular expression itself via the `i` flag. |
| 114 | pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { |
| 115 | self.hir.case_insensitive(yes); |
| 116 | self |
| 117 | } |
| 118 | |
| 119 | /// Enable or disable the multi-line matching flag by default. |
| 120 | /// |
| 121 | /// By default this is disabled. It may alternatively be selectively |
| 122 | /// enabled in the regular expression itself via the `m` flag. |
| 123 | pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { |
| 124 | self.hir.multi_line(yes); |
| 125 | self |
| 126 | } |
| 127 | |
| 128 | /// Enable or disable the "dot matches any character" flag by default. |
| 129 | /// |
| 130 | /// By default this is disabled. It may alternatively be selectively |
| 131 | /// enabled in the regular expression itself via the `s` flag. |
| 132 | pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { |
| 133 | self.hir.dot_matches_new_line(yes); |
| 134 | self |
| 135 | } |
| 136 | |
| 137 | /// Enable or disable the "swap greed" flag by default. |
| 138 | /// |
| 139 | /// By default this is disabled. It may alternatively be selectively |
| 140 | /// enabled in the regular expression itself via the `U` flag. |
| 141 | pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { |
| 142 | self.hir.swap_greed(yes); |
| 143 | self |
| 144 | } |
| 145 | |
| 146 | /// Enable or disable the Unicode flag (`u`) by default. |
| 147 | /// |
| 148 | /// By default this is **enabled**. It may alternatively be selectively |
| 149 | /// disabled in the regular expression itself via the `u` flag. |
| 150 | /// |
| 151 | /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by |
| 152 | /// default), a regular expression will fail to parse if Unicode mode is |
| 153 | /// disabled and a sub-expression could possibly match invalid UTF-8. |
| 154 | pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { |
| 155 | self.hir.unicode(yes); |
| 156 | self |
| 157 | } |
| 158 | } |
| 159 | |
| 160 | /// A convenience parser for regular expressions. |
| 161 | /// |
| 162 | /// This parser takes as input a regular expression pattern string (the |
| 163 | /// "concrete syntax") and returns a high-level intermediate representation |
| 164 | /// (the HIR) suitable for most types of analysis. In particular, this parser |
| 165 | /// hides the intermediate state of producing an AST (the "abstract syntax"). |
| 166 | /// The AST is itself far more complex than the HIR, so this parser serves as a |
| 167 | /// convenience for never having to deal with it at all. |
| 168 | /// |
| 169 | /// If callers have more fine grained use cases that need an AST, then please |
| 170 | /// see the [`ast::parse`](ast/parse/index.html) module. |
| 171 | /// |
| 172 | /// A `Parser` can be configured in more detail via a |
| 173 | /// [`ParserBuilder`](struct.ParserBuilder.html). |
| 174 | #[derive(Clone, Debug)] |
| 175 | pub struct Parser { |
| 176 | ast: ast::parse::Parser, |
| 177 | hir: hir::translate::Translator, |
| 178 | } |
| 179 | |
| 180 | impl Parser { |
| 181 | /// Create a new parser with a default configuration. |
| 182 | /// |
| 183 | /// The parser can be run with `parse` method. The parse method returns |
| 184 | /// a high level intermediate representation of the given regular |
| 185 | /// expression. |
| 186 | /// |
| 187 | /// To set configuration options on the parser, use |
| 188 | /// [`ParserBuilder`](struct.ParserBuilder.html). |
| 189 | pub fn new() -> Parser { |
| 190 | ParserBuilder::new().build() |
| 191 | } |
| 192 | |
| 193 | /// Parse the regular expression into a high level intermediate |
| 194 | /// representation. |
| 195 | pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> { |
| 196 | let ast = self.ast.parse(pattern)?; |
| 197 | let hir = self.hir.translate(pattern, &ast)?; |
| 198 | Ok(hir) |
| 199 | } |
| 200 | } |