vendor/regex-automata/tests/hybrid/api.rs - toolchain/rustc - Git at Google

 use std::error::Error;

 use regex_automata::{
     hybrid::{
         dfa::{self, DFA},
         regex::Regex,
         OverlappingState,
     },
     nfa::thompson,
     HalfMatch, MatchError, MatchKind, MultiMatch,
 };

 use crate::util::{BunkPrefilter, SubstringPrefilter};

 // Tests that too many cache resets cause the lazy DFA to quit.
 //
 // We only test this on 64-bit because the test is gingerly crafted based on
 // implementation details of cache sizes. It's not a great test because of
 // that, but it does check some interesting properties around how positions are
 // reported when a search "gives up."
 #[test]
 #[cfg(target_pointer_width = "64")]
 fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
     // This is a carefully chosen regex. The idea is to pick one that requires
     // some decent number of states (hence the bounded repetition). But we
     // specifically choose to create a class with an ASCII letter and a
     // non-ASCII letter so that we can check that no new states are created
     // once the cache is full. Namely, if we fill up the cache on a haystack
     // of 'a's, then in order to match one 'β', a new state will need to be
     // created since a 'β' is encoded with multiple bytes. Since there's no
     // room for this state, the search should quit at the very first position.
     let pattern = r"[aβ]{100}";
     let dfa = DFA::builder()
         .configure(
             // Configure it so that we have the minimum cache capacity
             // possible. And that if any resets occur, the search quits.
             DFA::config()
                 .skip_cache_capacity_check(true)
                 .cache_capacity(0)
                 .minimum_cache_clear_count(Some(0)),
         )
         .build(pattern)?;
     let mut cache = dfa.create_cache();

     let haystack = "a".repeat(101).into_bytes();
     let err = MatchError::GaveUp { offset: 25 };
     assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
     assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
     assert_eq!(
         dfa.find_overlapping_fwd(
             &mut cache,
             &haystack,
             &mut OverlappingState::start()
         ),
         Err(err.clone())
     );

     let haystack = "β".repeat(101).into_bytes();
     let err = MatchError::GaveUp { offset: 0 };
     assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
     // no need to test that other find routines quit, since we did that above

     // OK, if we reset the cache, then we should be able to create more states
     // and make more progress with searching for betas.
     cache.reset(&dfa);
     let err = MatchError::GaveUp { offset: 26 };
     assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));

     // ... switching back to ASCII still makes progress since it just needs to
     // set transitions on existing states!
     let haystack = "a".repeat(101).into_bytes();
     let err = MatchError::GaveUp { offset: 13 };
     assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));

     Ok(())
 }

 // Tests that quit bytes in the forward direction work correctly.
 #[test]
 fn quit_fwd() -> Result<(), Box<dyn Error>> {
     let dfa = DFA::builder()
         .configure(DFA::config().quit(b'x', true))
         .build("[[:word:]]+$")?;
     let mut cache = dfa.create_cache();

     assert_eq!(
         dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
         Err(MatchError::Quit { byte: b'x', offset: 3 })
     );
     assert_eq!(
         dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
         Err(MatchError::Quit { byte: b'x', offset: 3 })
     );
     assert_eq!(
         dfa.find_overlapping_fwd(
             &mut cache,
             b"abcxyz",
             &mut OverlappingState::start()
         ),
         Err(MatchError::Quit { byte: b'x', offset: 3 })
     );

     Ok(())
 }

 // Tests that quit bytes in the reverse direction work correctly.
 #[test]
 fn quit_rev() -> Result<(), Box<dyn Error>> {
     let dfa = DFA::builder()
         .configure(DFA::config().quit(b'x', true))
         .thompson(thompson::Config::new().reverse(true))
         .build("^[[:word:]]+")?;
     let mut cache = dfa.create_cache();

     assert_eq!(
         dfa.find_earliest_rev(&mut cache, b"abcxyz"),
         Err(MatchError::Quit { byte: b'x', offset: 3 })
     );
     assert_eq!(
         dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
         Err(MatchError::Quit { byte: b'x', offset: 3 })
     );

     Ok(())
 }

 // Tests that if we heuristically enable Unicode word boundaries but then
 // instruct that a non-ASCII byte should NOT be a quit byte, then the builder
 // will panic.
 #[test]
 #[should_panic]
 fn quit_panics() {
     DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
 }

 // This tests an intesting case where even if the Unicode word boundary option
 // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
 // word boundaries to be enabled.
 #[test]
 fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
     let mut config = DFA::config();
     for b in 0x80..=0xFF {
         config = config.quit(b, true);
     }
     let dfa = DFA::builder().configure(config).build(r"\b")?;
     let mut cache = dfa.create_cache();
     let expected = HalfMatch::must(0, 1);
     assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
     Ok(())
 }

 // Tests that we can provide a prefilter to a Regex, and the search reports
 // correct results.
 #[test]
 fn prefilter_works() -> Result<(), Box<dyn Error>> {
     let mut re = Regex::new(r"a[0-9]+").unwrap();
     re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
     let mut cache = re.create_cache();

     let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
     let matches: Vec<(usize, usize)> = re
         .find_leftmost_iter(&mut cache, text)
         .map(|m| (m.start(), m.end()))
         .collect();
     assert_eq!(
         matches,
         vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
     );
     Ok(())
 }

 // This test confirms that a prefilter is active by using a prefilter that
 // reports false negatives.
 #[test]
 fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
     let text = b"za123";
     let mut re = Regex::new(r"a[0-9]+").unwrap();
     let mut cache = re.create_cache();

     re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
     assert_eq!(
         re.find_leftmost(&mut cache, b"za123"),
         Some(MultiMatch::must(0, 1, 5))
     );
     assert_eq!(
         re.find_leftmost(&mut cache, b"a123"),
         Some(MultiMatch::must(0, 0, 4))
     );
     re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
     assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
     // This checks that the prefilter is used when first starting the search,
     // instead of waiting until at least one transition has occurred.
     assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
     Ok(())
 }
	use std::error::Error;

	use regex_automata::{
	hybrid::{
	dfa::{self, DFA},
	regex::Regex,
	OverlappingState,
	},
	nfa::thompson,
	HalfMatch, MatchError, MatchKind, MultiMatch,
	};

	use crate::util::{BunkPrefilter, SubstringPrefilter};

	// Tests that too many cache resets cause the lazy DFA to quit.
	//
	// We only test this on 64-bit because the test is gingerly crafted based on
	// implementation details of cache sizes. It's not a great test because of
	// that, but it does check some interesting properties around how positions are
	// reported when a search "gives up."
	#[test]
	#[cfg(target_pointer_width = "64")]
	fn too_many_cache_resets_cause_quit() -> Result<(), Box<dyn Error>> {
	// This is a carefully chosen regex. The idea is to pick one that requires
	// some decent number of states (hence the bounded repetition). But we
	// specifically choose to create a class with an ASCII letter and a
	// non-ASCII letter so that we can check that no new states are created
	// once the cache is full. Namely, if we fill up the cache on a haystack
	// of 'a's, then in order to match one 'β', a new state will need to be
	// created since a 'β' is encoded with multiple bytes. Since there's no
	// room for this state, the search should quit at the very first position.
	let pattern = r"[aβ]{100}";
	let dfa = DFA::builder()
	.configure(
	// Configure it so that we have the minimum cache capacity
	// possible. And that if any resets occur, the search quits.
	DFA::config()
	.skip_cache_capacity_check(true)
	.cache_capacity(0)
	.minimum_cache_clear_count(Some(0)),
	)
	.build(pattern)?;
	let mut cache = dfa.create_cache();

	let haystack = "a".repeat(101).into_bytes();
	let err = MatchError::GaveUp { offset: 25 };
	assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err.clone()));
	assert_eq!(dfa.find_leftmost_fwd(&mut cache, &haystack), Err(err.clone()));
	assert_eq!(
	dfa.find_overlapping_fwd(
	&mut cache,
	&haystack,
	&mut OverlappingState::start()
	),
	Err(err.clone())
	);

	let haystack = "β".repeat(101).into_bytes();
	let err = MatchError::GaveUp { offset: 0 };
	assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));
	// no need to test that other find routines quit, since we did that above

	// OK, if we reset the cache, then we should be able to create more states
	// and make more progress with searching for betas.
	cache.reset(&dfa);
	let err = MatchError::GaveUp { offset: 26 };
	assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));

	// ... switching back to ASCII still makes progress since it just needs to
	// set transitions on existing states!
	let haystack = "a".repeat(101).into_bytes();
	let err = MatchError::GaveUp { offset: 13 };
	assert_eq!(dfa.find_earliest_fwd(&mut cache, &haystack), Err(err));

	Ok(())
	}

	// Tests that quit bytes in the forward direction work correctly.
	#[test]
	fn quit_fwd() -> Result<(), Box<dyn Error>> {
	let dfa = DFA::builder()
	.configure(DFA::config().quit(b'x', true))
	.build("[[:word:]]+$")?;
	let mut cache = dfa.create_cache();

	assert_eq!(
	dfa.find_earliest_fwd(&mut cache, b"abcxyz"),
	Err(MatchError::Quit { byte: b'x', offset: 3 })
	);
	assert_eq!(
	dfa.find_leftmost_fwd(&mut cache, b"abcxyz"),
	Err(MatchError::Quit { byte: b'x', offset: 3 })
	);
	assert_eq!(
	dfa.find_overlapping_fwd(
	&mut cache,
	b"abcxyz",
	&mut OverlappingState::start()
	),
	Err(MatchError::Quit { byte: b'x', offset: 3 })
	);

	Ok(())
	}

	// Tests that quit bytes in the reverse direction work correctly.
	#[test]
	fn quit_rev() -> Result<(), Box<dyn Error>> {
	let dfa = DFA::builder()
	.configure(DFA::config().quit(b'x', true))
	.thompson(thompson::Config::new().reverse(true))
	.build("^[[:word:]]+")?;
	let mut cache = dfa.create_cache();

	assert_eq!(
	dfa.find_earliest_rev(&mut cache, b"abcxyz"),
	Err(MatchError::Quit { byte: b'x', offset: 3 })
	);
	assert_eq!(
	dfa.find_leftmost_rev(&mut cache, b"abcxyz"),
	Err(MatchError::Quit { byte: b'x', offset: 3 })
	);

	Ok(())
	}

	// Tests that if we heuristically enable Unicode word boundaries but then
	// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
	// will panic.
	#[test]
	#[should_panic]
	fn quit_panics() {
	DFA::config().unicode_word_boundary(true).quit(b'\xFF', false);
	}

	// This tests an intesting case where even if the Unicode word boundary option
	// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
	// word boundaries to be enabled.
	#[test]
	fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
	let mut config = DFA::config();
	for b in 0x80..=0xFF {
	config = config.quit(b, true);
	}
	let dfa = DFA::builder().configure(config).build(r"\b")?;
	let mut cache = dfa.create_cache();
	let expected = HalfMatch::must(0, 1);
	assert_eq!(dfa.find_leftmost_fwd(&mut cache, b" a"), Ok(Some(expected)));
	Ok(())
	}

	// Tests that we can provide a prefilter to a Regex, and the search reports
	// correct results.
	#[test]
	fn prefilter_works() -> Result<(), Box<dyn Error>> {
	let mut re = Regex::new(r"a[0-9]+").unwrap();
	re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
	let mut cache = re.create_cache();

	let text = b"foo abc foo a1a2a3 foo a123 bar aa456";
	let matches: Vec<(usize, usize)> = re
	.find_leftmost_iter(&mut cache, text)
	.map(\|m\| (m.start(), m.end()))
	.collect();
	assert_eq!(
	matches,
	vec![(12, 14), (14, 16), (16, 18), (23, 27), (33, 37),]
	);
	Ok(())
	}

	// This test confirms that a prefilter is active by using a prefilter that
	// reports false negatives.
	#[test]
	fn prefilter_is_active() -> Result<(), Box<dyn Error>> {
	let text = b"za123";
	let mut re = Regex::new(r"a[0-9]+").unwrap();
	let mut cache = re.create_cache();

	re.set_prefilter(Some(Box::new(SubstringPrefilter::new("a"))));
	assert_eq!(
	re.find_leftmost(&mut cache, b"za123"),
	Some(MultiMatch::must(0, 1, 5))
	);
	assert_eq!(
	re.find_leftmost(&mut cache, b"a123"),
	Some(MultiMatch::must(0, 0, 4))
	);
	re.set_prefilter(Some(Box::new(BunkPrefilter::new())));
	assert_eq!(re.find_leftmost(&mut cache, b"za123"), None);
	// This checks that the prefilter is used when first starting the search,
	// instead of waiting until at least one transition has occurred.
	assert_eq!(re.find_leftmost(&mut cache, b"a123"), None);
	Ok(())
	}