vendor/regex-1.5.5/src/backtrack.rs - toolchain/rustc - Git at Google

 // This is the backtracking matching engine. It has the same exact capability
 // as the full NFA simulation, except it is artificially restricted to small
 // regexes on small inputs because of its memory requirements.
 //
 // In particular, this is a *bounded* backtracking engine. It retains worst
 // case linear time by keeping track of the states that it has visited (using a
 // bitmap). Namely, once a state is visited, it is never visited again. Since a
 // state is keyed by `(instruction index, input index)`, we have that its time
 // complexity is `O(mn)` (i.e., linear in the size of the search text).
 //
 // The backtracking engine can beat out the NFA simulation on small
 // regexes/inputs because it doesn't have to keep track of multiple copies of
 // the capture groups. In benchmarks, the backtracking engine is roughly twice
 // as fast as the full NFA simulation. Note though that its performance doesn't
 // scale, even if you're willing to live with the memory requirements. Namely,
 // the bitset has to be zeroed on each execution, which becomes quite expensive
 // on large bitsets.

 use crate::exec::ProgramCache;
 use crate::input::{Input, InputAt};
 use crate::prog::{InstPtr, Program};
 use crate::re_trait::Slot;

 type Bits = u32;

 const BIT_SIZE: usize = 32;
 const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

 /// Returns true iff the given regex and input should be executed by this
 /// engine with reasonable memory usage.
 pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
     // Total memory usage in bytes is determined by:
     //
     //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
     //
     // The actual limit picked is pretty much a heuristic.
     // See: https://github.com/rust-lang/regex/issues/215
     let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
     size <= MAX_SIZE_BYTES
 }

 /// A backtracking matching engine.
 #[derive(Debug)]
 pub struct Bounded<'a, 'm, 'r, 's, I> {
     prog: &'r Program,
     input: I,
     matches: &'m mut [bool],
     slots: &'s mut [Slot],
     m: &'a mut Cache,
 }

 /// Shared cached state between multiple invocations of a backtracking engine
 /// in the same thread.
 #[derive(Clone, Debug)]
 pub struct Cache {
     jobs: Vec<Job>,
     visited: Vec<Bits>,
 }

 impl Cache {
     /// Create new empty cache for the backtracking engine.
     pub fn new(_prog: &Program) -> Self {
         Cache { jobs: vec![], visited: vec![] }
     }
 }

 /// A job is an explicit unit of stack space in the backtracking engine.
 ///
 /// The "normal" representation is a single state transition, which corresponds
 /// to an NFA state and a character in the input. However, the backtracking
 /// engine must keep track of old capture group values. We use the explicit
 /// stack to do it.
 #[derive(Clone, Copy, Debug)]
 enum Job {
     Inst { ip: InstPtr, at: InputAt },
     SaveRestore { slot: usize, old_pos: Option<usize> },
 }

 impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
     /// Execute the backtracking matching engine.
     ///
     /// If there's a match, `exec` returns `true` and populates the given
     /// captures accordingly.
     pub fn exec(
         prog: &'r Program,
         cache: &ProgramCache,
         matches: &'m mut [bool],
         slots: &'s mut [Slot],
         input: I,
         start: usize,
         end: usize,
     ) -> bool {
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.backtrack;
         let start = input.at(start);
         let mut b = Bounded {
             prog: prog,
             input: input,
             matches: matches,
             slots: slots,
             m: cache,
         };
         b.exec_(start, end)
     }

     /// Clears the cache such that the backtracking engine can be executed
     /// on some input of fixed length.
     fn clear(&mut self) {
         // Reset the job memory so that we start fresh.
         self.m.jobs.clear();

         // Now we need to clear the bit state set.
         // We do this by figuring out how much space we need to keep track
         // of the states we've visited.
         // Then we reset all existing allocated space to 0.
         // Finally, we request more space if we need it.
         //
         // This is all a little circuitous, but doing this using unchecked
         // operations doesn't seem to have a measurable impact on performance.
         // (Probably because backtracking is limited to such small
         // inputs/regexes in the first place.)
         let visited_len =
             (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
                 / BIT_SIZE;
         self.m.visited.truncate(visited_len);
         for v in &mut self.m.visited {
             *v = 0;
         }
         if visited_len > self.m.visited.len() {
             let len = self.m.visited.len();
             self.m.visited.reserve_exact(visited_len - len);
             for _ in 0..(visited_len - len) {
                 self.m.visited.push(0);
             }
         }
     }

     /// Start backtracking at the given position in the input, but also look
     /// for literal prefixes.
     fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
         self.clear();
         // If this is an anchored regex at the beginning of the input, then
         // we're either already done or we only need to try backtracking once.
         if self.prog.is_anchored_start {
             return if !at.is_start() { false } else { self.backtrack(at) };
         }
         let mut matched = false;
         loop {
             if !self.prog.prefixes.is_empty() {
                 at = match self.input.prefix_at(&self.prog.prefixes, at) {
                     None => break,
                     Some(at) => at,
                 };
             }
             matched = self.backtrack(at) || matched;
             if matched && self.prog.matches.len() == 1 {
                 return true;
             }
             if at.pos() >= end {
                 break;
             }
             at = self.input.at(at.next_pos());
         }
         matched
     }

     /// The main backtracking loop starting at the given input position.
     fn backtrack(&mut self, start: InputAt) -> bool {
         // N.B. We use an explicit stack to avoid recursion.
         // To avoid excessive pushing and popping, most transitions are handled
         // in the `step` helper function, which only pushes to the stack when
         // there's a capture or a branch.
         let mut matched = false;
         self.m.jobs.push(Job::Inst { ip: 0, at: start });
         while let Some(job) = self.m.jobs.pop() {
             match job {
                 Job::Inst { ip, at } => {
                     if self.step(ip, at) {
                         // Only quit if we're matching one regex.
                         // If we're matching a regex set, then mush on and
                         // try to find other matches (if we want them).
                         if self.prog.matches.len() == 1 {
                             return true;
                         }
                         matched = true;
                     }
                 }
                 Job::SaveRestore { slot, old_pos } => {
                     if slot < self.slots.len() {
                         self.slots[slot] = old_pos;
                     }
                 }
             }
         }
         matched
     }

     fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
         use crate::prog::Inst::*;
         loop {
             // This loop is an optimization to avoid constantly pushing/popping
             // from the stack. Namely, if we're pushing a job only to run it
             // next, avoid the push and just mutate `ip` (and possibly `at`)
             // in place.
             if self.has_visited(ip, at) {
                 return false;
             }
             match self.prog[ip] {
                 Match(slot) => {
                     if slot < self.matches.len() {
                         self.matches[slot] = true;
                     }
                     return true;
                 }
                 Save(ref inst) => {
                     if let Some(&old_pos) = self.slots.get(inst.slot) {
                         // If this path doesn't work out, then we save the old
                         // capture index (if one exists) in an alternate
                         // job. If the next path fails, then the alternate
                         // job is popped and the old capture index is restored.
                         self.m.jobs.push(Job::SaveRestore {
                             slot: inst.slot,
                             old_pos: old_pos,
                         });
                         self.slots[inst.slot] = Some(at.pos());
                     }
                     ip = inst.goto;
                 }
                 Split(ref inst) => {
                     self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
                     ip = inst.goto1;
                 }
                 EmptyLook(ref inst) => {
                     if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     } else {
                         return false;
                     }
                 }
                 Char(ref inst) => {
                     if inst.c == at.char() {
                         ip = inst.goto;
                         at = self.input.at(at.next_pos());
                     } else {
                         return false;
                     }
                 }
                 Ranges(ref inst) => {
                     if inst.matches(at.char()) {
                         ip = inst.goto;
                         at = self.input.at(at.next_pos());
                     } else {
                         return false;
                     }
                 }
                 Bytes(ref inst) => {
                     if let Some(b) = at.byte() {
                         if inst.matches(b) {
                             ip = inst.goto;
                             at = self.input.at(at.next_pos());
                             continue;
                         }
                     }
                     return false;
                 }
             }
         }
     }

     fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
         let k = ip * (self.input.len() + 1) + at.pos();
         let k1 = k / BIT_SIZE;
         let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
         if self.m.visited[k1] & k2 == 0 {
             self.m.visited[k1] |= k2;
             false
         } else {
             true
         }
     }
 }

 fn usize_to_u32(n: usize) -> u32 {
     if (n as u64) > (::std::u32::MAX as u64) {
         panic!("BUG: {} is too big to fit into u32", n)
     }
     n as u32
 }
	// This is the backtracking matching engine. It has the same exact capability
	// as the full NFA simulation, except it is artificially restricted to small
	// regexes on small inputs because of its memory requirements.
	//
	// In particular, this is a bounded backtracking engine. It retains worst
	// case linear time by keeping track of the states that it has visited (using a
	// bitmap). Namely, once a state is visited, it is never visited again. Since a
	// state is keyed by `(instruction index, input index)`, we have that its time
	// complexity is `O(mn)` (i.e., linear in the size of the search text).
	//
	// The backtracking engine can beat out the NFA simulation on small
	// regexes/inputs because it doesn't have to keep track of multiple copies of
	// the capture groups. In benchmarks, the backtracking engine is roughly twice
	// as fast as the full NFA simulation. Note though that its performance doesn't
	// scale, even if you're willing to live with the memory requirements. Namely,
	// the bitset has to be zeroed on each execution, which becomes quite expensive
	// on large bitsets.

	use crate::exec::ProgramCache;
	use crate::input::{Input, InputAt};
	use crate::prog::{InstPtr, Program};
	use crate::re_trait::Slot;

	type Bits = u32;

	const BIT_SIZE: usize = 32;
	const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

	/// Returns true iff the given regex and input should be executed by this
	/// engine with reasonable memory usage.
	pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
	// Total memory usage in bytes is determined by:
	//
	// ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
	//
	// The actual limit picked is pretty much a heuristic.
	// See: https://github.com/rust-lang/regex/issues/215
	let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
	size <= MAX_SIZE_BYTES
	}

	/// A backtracking matching engine.
	#[derive(Debug)]
	pub struct Bounded<'a, 'm, 'r, 's, I> {
	prog: &'r Program,
	input: I,
	matches: &'m mut [bool],
	slots: &'s mut [Slot],
	m: &'a mut Cache,
	}

	/// Shared cached state between multiple invocations of a backtracking engine
	/// in the same thread.
	#[derive(Clone, Debug)]
	pub struct Cache {
	jobs: Vec<Job>,
	visited: Vec<Bits>,
	}

	impl Cache {
	/// Create new empty cache for the backtracking engine.
	pub fn new(_prog: &Program) -> Self {
	Cache { jobs: vec![], visited: vec![] }
	}
	}

	/// A job is an explicit unit of stack space in the backtracking engine.
	///
	/// The "normal" representation is a single state transition, which corresponds
	/// to an NFA state and a character in the input. However, the backtracking
	/// engine must keep track of old capture group values. We use the explicit
	/// stack to do it.
	#[derive(Clone, Copy, Debug)]
	enum Job {
	Inst { ip: InstPtr, at: InputAt },
	SaveRestore { slot: usize, old_pos: Option<usize> },
	}

	impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
	/// Execute the backtracking matching engine.
	///
	/// If there's a match, `exec` returns `true` and populates the given
	/// captures accordingly.
	pub fn exec(
	prog: &'r Program,
	cache: &ProgramCache,
	matches: &'m mut [bool],
	slots: &'s mut [Slot],
	input: I,
	start: usize,
	end: usize,
	) -> bool {
	let mut cache = cache.borrow_mut();
	let cache = &mut cache.backtrack;
	let start = input.at(start);
	let mut b = Bounded {
	prog: prog,
	input: input,
	matches: matches,
	slots: slots,
	m: cache,
	};
	b.exec_(start, end)
	}

	/// Clears the cache such that the backtracking engine can be executed
	/// on some input of fixed length.
	fn clear(&mut self) {
	// Reset the job memory so that we start fresh.
	self.m.jobs.clear();

	// Now we need to clear the bit state set.
	// We do this by figuring out how much space we need to keep track
	// of the states we've visited.
	// Then we reset all existing allocated space to 0.
	// Finally, we request more space if we need it.
	//
	// This is all a little circuitous, but doing this using unchecked
	// operations doesn't seem to have a measurable impact on performance.
	// (Probably because backtracking is limited to such small
	// inputs/regexes in the first place.)
	let visited_len =
	(self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
	/ BIT_SIZE;
	self.m.visited.truncate(visited_len);
	for v in &mut self.m.visited {
	*v = 0;
	}
	if visited_len > self.m.visited.len() {
	let len = self.m.visited.len();
	self.m.visited.reserve_exact(visited_len - len);
	for _ in 0..(visited_len - len) {
	self.m.visited.push(0);
	}
	}
	}

	/// Start backtracking at the given position in the input, but also look
	/// for literal prefixes.
	fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
	self.clear();
	// If this is an anchored regex at the beginning of the input, then
	// we're either already done or we only need to try backtracking once.
	if self.prog.is_anchored_start {
	return if !at.is_start() { false } else { self.backtrack(at) };
	}
	let mut matched = false;
	loop {
	if !self.prog.prefixes.is_empty() {
	at = match self.input.prefix_at(&self.prog.prefixes, at) {
	None => break,
	Some(at) => at,
	};
	}
	matched = self.backtrack(at) \|\| matched;
	if matched && self.prog.matches.len() == 1 {
	return true;
	}
	if at.pos() >= end {
	break;
	}
	at = self.input.at(at.next_pos());
	}
	matched
	}

	/// The main backtracking loop starting at the given input position.
	fn backtrack(&mut self, start: InputAt) -> bool {
	// N.B. We use an explicit stack to avoid recursion.
	// To avoid excessive pushing and popping, most transitions are handled
	// in the `step` helper function, which only pushes to the stack when
	// there's a capture or a branch.
	let mut matched = false;
	self.m.jobs.push(Job::Inst { ip: 0, at: start });
	while let Some(job) = self.m.jobs.pop() {
	match job {
	Job::Inst { ip, at } => {
	if self.step(ip, at) {
	// Only quit if we're matching one regex.
	// If we're matching a regex set, then mush on and
	// try to find other matches (if we want them).
	if self.prog.matches.len() == 1 {
	return true;
	}
	matched = true;
	}
	}
	Job::SaveRestore { slot, old_pos } => {
	if slot < self.slots.len() {
	self.slots[slot] = old_pos;
	}
	}
	}
	}
	matched
	}

	fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
	use crate::prog::Inst::*;
	loop {
	// This loop is an optimization to avoid constantly pushing/popping
	// from the stack. Namely, if we're pushing a job only to run it
	// next, avoid the push and just mutate `ip` (and possibly `at`)
	// in place.
	if self.has_visited(ip, at) {
	return false;
	}
	match self.prog[ip] {
	Match(slot) => {
	if slot < self.matches.len() {
	self.matches[slot] = true;
	}
	return true;
	}
	Save(ref inst) => {
	if let Some(&old_pos) = self.slots.get(inst.slot) {
	// If this path doesn't work out, then we save the old
	// capture index (if one exists) in an alternate
	// job. If the next path fails, then the alternate
	// job is popped and the old capture index is restored.
	self.m.jobs.push(Job::SaveRestore {
	slot: inst.slot,
	old_pos: old_pos,
	});
	self.slots[inst.slot] = Some(at.pos());
	}
	ip = inst.goto;
	}
	Split(ref inst) => {
	self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
	ip = inst.goto1;
	}
	EmptyLook(ref inst) => {
	if self.input.is_empty_match(at, inst) {
	ip = inst.goto;
	} else {
	return false;
	}
	}
	Char(ref inst) => {
	if inst.c == at.char() {
	ip = inst.goto;
	at = self.input.at(at.next_pos());
	} else {
	return false;
	}
	}
	Ranges(ref inst) => {
	if inst.matches(at.char()) {
	ip = inst.goto;
	at = self.input.at(at.next_pos());
	} else {
	return false;
	}
	}
	Bytes(ref inst) => {
	if let Some(b) = at.byte() {
	if inst.matches(b) {
	ip = inst.goto;
	at = self.input.at(at.next_pos());
	continue;
	}
	}
	return false;
	}
	}
	}
	}

	fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
	let k = ip * (self.input.len() + 1) + at.pos();
	let k1 = k / BIT_SIZE;
	let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
	if self.m.visited[k1] & k2 == 0 {
	self.m.visited[k1] \|= k2;
	false
	} else {
	true
	}
	}
	}

	fn usize_to_u32(n: usize) -> u32 {
	if (n as u64) > (::std::u32::MAX as u64) {
	panic!("BUG: {} is too big to fit into u32", n)
	}
	n as u32
	}