Jakub Kotur | 3bceaeb | 2020-12-21 17:28:16 +0100 | [diff] [blame] | 1 | use std::collections::BTreeMap; |
| 2 | use std::env; |
| 3 | use std::fmt::{self, Write}; |
| 4 | use std::thread; |
| 5 | |
| 6 | use regex; |
| 7 | use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA}; |
| 8 | use serde_bytes; |
| 9 | use toml; |
| 10 | |
/// Parses one embedded test file and merges its tests into a collection.
///
/// `$col` is the identifier of a value with an `extend` method accepting
/// `RegexTests`, and `$path` is the file's path relative to
/// `../data/tests/`. The file's bytes are embedded at compile time with
/// `include_bytes!`; the joined path string is passed to `RegexTests::load`
/// alongside the bytes, where it is used in the load-failure message.
macro_rules! load {
    ($col:ident, $path:expr) => {
        $col.extend(RegexTests::load(
            concat!("../data/tests/", $path),
            include_bytes!(concat!("../data/tests/", $path)),
        ));
    };
}
| 19 | |
lazy_static! {
    /// Every regex test from the embedded TOML files listed below, keyed by
    /// test name.
    pub static ref SUITE: RegexTestCollection = {
        let mut col = RegexTestCollection::new();
        // `extend` panics if a test name appears more than once across
        // these files.
        load!(col, "fowler/basic.toml");
        load!(col, "fowler/nullsubexpr.toml");
        load!(col, "fowler/repetition.toml");
        load!(col, "fowler/repetition-long.toml");
        load!(col, "crazy.toml");
        load!(col, "flags.toml");
        load!(col, "iter.toml");
        load!(col, "no-unicode.toml");
        load!(col, "unicode.toml");
        col
    };
}
| 35 | |
/// A set of regex tests gathered from one or more test files.
#[derive(Clone, Debug)]
pub struct RegexTestCollection {
    /// Tests keyed by their `name`; `extend` refuses duplicate names.
    pub by_name: BTreeMap<String, RegexTest>,
}
| 40 | |
/// The deserialized contents of a single test file.
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTests {
    /// The tests listed under the file's `tests` key.
    pub tests: Vec<RegexTest>,
}
| 45 | |
/// A single regex test case as deserialized from a test file.
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTest {
    /// Name of the test. Used as the key in `RegexTestCollection::by_name`
    /// and matched against the tester's whitelist/blacklist patterns.
    pub name: String,
    /// Options applied when building the regex for this test. Empty when
    /// the key is absent from the file.
    #[serde(default)]
    pub options: Vec<RegexTestOption>,
    /// The pattern handed to the regex builder.
    pub pattern: String,
    /// The bytes to search. `RegexTests::load` replaces this with its
    /// unescaped form when the `Escaped` option is present.
    #[serde(with = "serde_bytes")]
    pub input: Vec<u8>,
    /// Expected matches, in order. An empty list means no match is
    /// expected; the first entry is what `find` is compared against.
    #[serde(rename = "matches")]
    pub matches: Vec<Match>,
    /// Not read by any code visible in this file; empty when absent.
    // NOTE(review): presumably expected capture-group spans — confirm.
    #[serde(default)]
    pub captures: Vec<Option<Match>>,
    /// Not read by any code visible in this file; `None` when absent.
    // NOTE(review): presumably the line in the original Fowler test data
    // this case came from — confirm.
    #[serde(default)]
    pub fowler_line_number: Option<u64>,
}
| 61 | |
/// Per-test switches. Names are deserialized in kebab-case.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum RegexTestOption {
    /// Build the regex with `anchored(true)`.
    Anchored,
    /// Build the regex with `case_insensitive(true)`.
    CaseInsensitive,
    /// Build the regex with `unicode(false)`.
    NoUnicode,
    /// The test's `input` is written with escapes and is unescaped at load
    /// time. Has no effect on the regex builder.
    Escaped,
    /// Build the regex with `allow_invalid_utf8(true)`. Explicitly named
    /// `invalid-utf8` in test files.
    #[serde(rename = "invalid-utf8")]
    InvalidUTF8,
}
| 72 | |
/// The span of one match, built from the `(start, end)` pairs returned by
/// the regex's `find` and `find_iter`.
// NOTE(review): presumably byte offsets into the test input — confirm
// against the regex-automata documentation.
#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
pub struct Match {
    /// Where the match begins.
    pub start: usize,
    /// Where the match ends.
    pub end: usize,
}
| 78 | |
| 79 | impl RegexTestCollection { |
| 80 | fn new() -> RegexTestCollection { |
| 81 | RegexTestCollection { by_name: BTreeMap::new() } |
| 82 | } |
| 83 | |
| 84 | fn extend(&mut self, tests: RegexTests) { |
| 85 | for test in tests.tests { |
| 86 | let name = test.name.clone(); |
| 87 | if self.by_name.contains_key(&name) { |
| 88 | panic!("found duplicate test {}", name); |
| 89 | } |
| 90 | self.by_name.insert(name, test); |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | pub fn tests(&self) -> Vec<&RegexTest> { |
| 95 | self.by_name.values().collect() |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | impl RegexTests { |
| 100 | fn load(path: &str, slice: &[u8]) -> RegexTests { |
| 101 | let mut data: RegexTests = toml::from_slice(slice) |
| 102 | .expect(&format!("failed to load {}", path)); |
| 103 | for test in &mut data.tests { |
| 104 | if test.options.contains(&RegexTestOption::Escaped) { |
| 105 | test.input = unescape_bytes(&test.input); |
| 106 | } |
| 107 | } |
| 108 | data |
| 109 | } |
| 110 | } |
| 111 | |
/// Runs regex tests, accumulates their outcomes, and reports them when
/// `assert` is called.
#[derive(Debug)]
pub struct RegexTester {
    /// Whether `assert` has been called since the last recorded result.
    /// Dropping the tester while this is false panics.
    asserted: bool,
    /// Outcomes recorded so far.
    results: RegexTestResults,
    /// When true, tests whose name starts with `repetition-long` are
    /// skipped.
    skip_expensive: bool,
    /// If non-empty, only tests whose name matches at least one of these
    /// patterns are run.
    whitelist: Vec<regex::Regex>,
    /// Tests whose name matches any of these patterns are skipped.
    blacklist: Vec<regex::Regex>,
}
| 120 | |
| 121 | impl Drop for RegexTester { |
| 122 | fn drop(&mut self) { |
| 123 | // If we haven't asserted yet, then the test is probably buggy, so |
| 124 | // fail it. But if we're already panicking (e.g., a bug in the regex |
| 125 | // engine), then don't double-panic, which causes an immediate abort. |
| 126 | if !thread::panicking() && !self.asserted { |
| 127 | panic!("must call RegexTester::assert at end of test"); |
| 128 | } |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | impl RegexTester { |
| 133 | pub fn new() -> RegexTester { |
| 134 | let mut tester = RegexTester { |
| 135 | asserted: false, |
| 136 | results: RegexTestResults::default(), |
| 137 | skip_expensive: false, |
| 138 | whitelist: vec![], |
| 139 | blacklist: vec![], |
| 140 | }; |
| 141 | for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") { |
| 142 | let x = x.trim(); |
| 143 | if x.is_empty() { |
| 144 | continue; |
| 145 | } |
| 146 | if x.starts_with("-") { |
| 147 | tester = tester.blacklist(&x[1..]); |
| 148 | } else { |
| 149 | tester = tester.whitelist(x); |
| 150 | } |
| 151 | } |
| 152 | tester |
| 153 | } |
| 154 | |
| 155 | pub fn skip_expensive(mut self) -> RegexTester { |
| 156 | self.skip_expensive = true; |
| 157 | self |
| 158 | } |
| 159 | |
| 160 | pub fn whitelist(mut self, name: &str) -> RegexTester { |
| 161 | self.whitelist.push(regex::Regex::new(name).unwrap()); |
| 162 | self |
| 163 | } |
| 164 | |
| 165 | pub fn blacklist(mut self, name: &str) -> RegexTester { |
| 166 | self.blacklist.push(regex::Regex::new(name).unwrap()); |
| 167 | self |
| 168 | } |
| 169 | |
| 170 | pub fn assert(&mut self) { |
| 171 | self.asserted = true; |
| 172 | self.results.assert(); |
| 173 | } |
| 174 | |
| 175 | pub fn build_regex<S: StateID>( |
| 176 | &self, |
| 177 | mut builder: RegexBuilder, |
| 178 | test: &RegexTest, |
| 179 | ) -> Option<Regex<DenseDFA<Vec<S>, S>>> { |
| 180 | if self.skip(test) { |
| 181 | return None; |
| 182 | } |
| 183 | self.apply_options(test, &mut builder); |
| 184 | |
| 185 | match builder.build_with_size::<S>(&test.pattern) { |
| 186 | Ok(re) => Some(re), |
| 187 | Err(err) => { |
| 188 | if let ErrorKind::Unsupported(_) = *err.kind() { |
| 189 | None |
| 190 | } else { |
| 191 | panic!( |
| 192 | "failed to build {:?} with pattern '{:?}': {}", |
| 193 | test.name, test.pattern, err |
| 194 | ); |
| 195 | } |
| 196 | } |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I) |
| 201 | where |
| 202 | I: IntoIterator<IntoIter = T, Item = &'a RegexTest>, |
| 203 | T: Iterator<Item = &'a RegexTest>, |
| 204 | { |
| 205 | for test in tests { |
| 206 | let builder = builder.clone(); |
| 207 | let re: Regex = match self.build_regex(builder, test) { |
| 208 | None => continue, |
| 209 | Some(re) => re, |
| 210 | }; |
| 211 | self.test(test, &re); |
| 212 | } |
| 213 | } |
| 214 | |
| 215 | pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) { |
| 216 | self.test_is_match(test, re); |
| 217 | self.test_find(test, re); |
| 218 | // Some tests (namely, fowler) are designed only to detect the |
| 219 | // first match even if there are more subsequent matches. To that |
| 220 | // end, we only test match iteration when the number of matches |
| 221 | // expected is not 1, or if the test name has 'iter' in it. |
| 222 | if test.name.contains("iter") || test.matches.len() != 1 { |
| 223 | self.test_find_iter(test, re); |
| 224 | } |
| 225 | } |
| 226 | |
| 227 | pub fn test_is_match<'a, D: DFA>( |
| 228 | &mut self, |
| 229 | test: &RegexTest, |
| 230 | re: &Regex<D>, |
| 231 | ) { |
| 232 | self.asserted = false; |
| 233 | |
| 234 | let got = re.is_match(&test.input); |
| 235 | let expected = test.matches.len() >= 1; |
| 236 | if got == expected { |
| 237 | self.results.succeeded.push(test.clone()); |
| 238 | return; |
| 239 | } |
| 240 | self.results.failed.push(RegexTestFailure { |
| 241 | test: test.clone(), |
| 242 | kind: RegexTestFailureKind::IsMatch, |
| 243 | }); |
| 244 | } |
| 245 | |
| 246 | pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) { |
| 247 | self.asserted = false; |
| 248 | |
| 249 | let got = |
| 250 | re.find(&test.input).map(|(start, end)| Match { start, end }); |
| 251 | if got == test.matches.get(0).map(|&m| m) { |
| 252 | self.results.succeeded.push(test.clone()); |
| 253 | return; |
| 254 | } |
| 255 | self.results.failed.push(RegexTestFailure { |
| 256 | test: test.clone(), |
| 257 | kind: RegexTestFailureKind::Find { got }, |
| 258 | }); |
| 259 | } |
| 260 | |
| 261 | pub fn test_find_iter<'a, D: DFA>( |
| 262 | &mut self, |
| 263 | test: &RegexTest, |
| 264 | re: &Regex<D>, |
| 265 | ) { |
| 266 | self.asserted = false; |
| 267 | |
| 268 | let got: Vec<Match> = re |
| 269 | .find_iter(&test.input) |
| 270 | .map(|(start, end)| Match { start, end }) |
| 271 | .collect(); |
| 272 | if got == test.matches { |
| 273 | self.results.succeeded.push(test.clone()); |
| 274 | return; |
| 275 | } |
| 276 | self.results.failed.push(RegexTestFailure { |
| 277 | test: test.clone(), |
| 278 | kind: RegexTestFailureKind::FindIter { got }, |
| 279 | }); |
| 280 | } |
| 281 | |
| 282 | fn skip(&self, test: &RegexTest) -> bool { |
| 283 | if self.skip_expensive { |
| 284 | if test.name.starts_with("repetition-long") { |
| 285 | return true; |
| 286 | } |
| 287 | } |
| 288 | if !self.blacklist.is_empty() { |
| 289 | if self.blacklist.iter().any(|re| re.is_match(&test.name)) { |
| 290 | return true; |
| 291 | } |
| 292 | } |
| 293 | if !self.whitelist.is_empty() { |
| 294 | if !self.whitelist.iter().any(|re| re.is_match(&test.name)) { |
| 295 | return true; |
| 296 | } |
| 297 | } |
| 298 | false |
| 299 | } |
| 300 | |
| 301 | fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) { |
| 302 | for opt in &test.options { |
| 303 | match *opt { |
| 304 | RegexTestOption::Anchored => { |
| 305 | builder.anchored(true); |
| 306 | } |
| 307 | RegexTestOption::CaseInsensitive => { |
| 308 | builder.case_insensitive(true); |
| 309 | } |
| 310 | RegexTestOption::NoUnicode => { |
| 311 | builder.unicode(false); |
| 312 | } |
| 313 | RegexTestOption::Escaped => {} |
| 314 | RegexTestOption::InvalidUTF8 => { |
| 315 | builder.allow_invalid_utf8(true); |
| 316 | } |
| 317 | } |
| 318 | } |
| 319 | } |
| 320 | } |
| 321 | |
/// The outcomes of all checks run by a `RegexTester`.
#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
    /// One entry per check that passed. A test that passes several checks
    /// appears once for each.
    pub succeeded: Vec<RegexTest>,
    /// One entry per check that failed, in the order the failures were
    /// recorded.
    pub failed: Vec<RegexTestFailure>,
}
| 329 | |
/// A single failed check: the test it belongs to and what went wrong.
#[derive(Clone, Debug)]
pub struct RegexTestFailure {
    /// The test whose check failed.
    test: RegexTest,
    /// Which check failed, along with what the regex actually returned.
    kind: RegexTestFailureKind,
}
| 335 | |
/// Identifies the check that failed and carries the value the regex
/// produced, where there is one.
#[derive(Clone, Debug)]
pub enum RegexTestFailureKind {
    /// `is_match` disagreed with whether the test expects a match.
    IsMatch,
    /// `find` returned `got` instead of the first expected match.
    Find { got: Option<Match> },
    /// `find_iter` yielded `got` instead of the expected matches.
    FindIter { got: Vec<Match> },
}
| 342 | |
| 343 | impl RegexTestResults { |
| 344 | pub fn assert(&self) { |
| 345 | if self.failed.is_empty() { |
| 346 | return; |
| 347 | } |
| 348 | let failures = self |
| 349 | .failed |
| 350 | .iter() |
| 351 | .map(|f| f.to_string()) |
| 352 | .collect::<Vec<String>>() |
| 353 | .join("\n\n"); |
| 354 | panic!( |
| 355 | "found {} failures:\n{}\n{}\n{}\n\n\ |
| 356 | Set the REGEX_TEST environment variable to filter tests, \n\ |
| 357 | e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\ |
| 358 | whose name contains crazy-misc but not crazy-misc2\n\n", |
| 359 | self.failed.len(), |
| 360 | "~".repeat(79), |
| 361 | failures.trim(), |
| 362 | "~".repeat(79) |
| 363 | ) |
| 364 | } |
| 365 | } |
| 366 | |
// Renders a failure as a multi-line report: the test name and a
// description of the failure on the first line, followed by the test's
// options, its pattern (raw and escaped), and its input (as text when
// valid UTF-8, escaped, and hex-encoded).
impl fmt::Display for RegexTestFailure {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Whitespace before each trailing `\` is part of the output (it
        // indents the following line); whitespace after the line break is
        // discarded by the string continuation.
        write!(
            f,
            "{}: {}\n \
            options: {:?}\n \
            pattern: {}\n \
            pattern (escape): {}\n \
            input: {}\n \
            input (escape): {}\n \
            input (hex): {}",
            self.test.name,
            // The inherent `RegexTestFailureKind::fmt`, which returns the
            // description as a `String`.
            self.kind.fmt(&self.test)?,
            self.test.options,
            self.test.pattern,
            escape_default(&self.test.pattern),
            nice_raw_bytes(&self.test.input),
            escape_bytes(&self.test.input),
            hex_bytes(&self.test.input)
        )
    }
}
| 389 | |
| 390 | impl RegexTestFailureKind { |
| 391 | fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> { |
| 392 | let mut buf = String::new(); |
| 393 | match *self { |
| 394 | RegexTestFailureKind::IsMatch => { |
| 395 | if let Some(&m) = test.matches.get(0) { |
| 396 | write!(buf, "expected match (at {}), but none found", m)? |
| 397 | } else { |
| 398 | write!(buf, "expected no match, but found a match")? |
| 399 | } |
| 400 | } |
| 401 | RegexTestFailureKind::Find { got } => write!( |
| 402 | buf, |
| 403 | "expected {:?}, but found {:?}", |
| 404 | test.matches.get(0), |
| 405 | got |
| 406 | )?, |
| 407 | RegexTestFailureKind::FindIter { ref got } => write!( |
| 408 | buf, |
| 409 | "expected {:?}, but found {:?}", |
| 410 | test.matches, got |
| 411 | )?, |
| 412 | } |
| 413 | Ok(buf) |
| 414 | } |
| 415 | } |
| 416 | |
| 417 | impl fmt::Display for Match { |
| 418 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 419 | write!(f, "({}, {})", self.start, self.end) |
| 420 | } |
| 421 | } |
| 422 | |
| 423 | impl fmt::Debug for Match { |
| 424 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 425 | write!(f, "({}, {})", self.start, self.end) |
| 426 | } |
| 427 | } |
| 428 | |
| 429 | fn nice_raw_bytes(bytes: &[u8]) -> String { |
| 430 | use std::str; |
| 431 | |
| 432 | match str::from_utf8(bytes) { |
| 433 | Ok(s) => s.to_string(), |
| 434 | Err(_) => escape_bytes(bytes), |
| 435 | } |
| 436 | } |
| 437 | |
/// Escapes every byte with `std::ascii::escape_default` and concatenates
/// the results into a string.
fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let mut out = String::with_capacity(bytes.len());
    for &byte in bytes {
        // Each escape sequence consists solely of ASCII bytes, so every
        // byte maps directly to a `char`.
        out.extend(ascii::escape_default(byte).map(char::from));
    }
    out
}
| 447 | |
/// Renders every byte as `\x` followed by two uppercase hex digits.
fn hex_bytes(bytes: &[u8]) -> String {
    // Each byte expands to exactly four characters.
    let mut out = String::with_capacity(4 * bytes.len());
    for byte in bytes {
        write!(out, r"\x{:02X}", byte)
            .expect("writing to a String cannot fail");
    }
    out
}
| 451 | |
/// Escapes every character of `s` with `char::escape_default` and
/// concatenates the results.
fn escape_default(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        out.extend(ch.escape_default());
    }
    out
}
| 455 | |
| 456 | fn unescape_bytes(bytes: &[u8]) -> Vec<u8> { |
| 457 | use std::str; |
| 458 | use unescape::unescape; |
| 459 | |
| 460 | unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8")) |
| 461 | } |