vendor/regex-1.8.4/src/compile.rs - toolchain/rustc - Git at Google

 use std::collections::HashMap;
 use std::fmt;
 use std::iter;
 use std::result;
 use std::sync::Arc;

 use regex_syntax::hir::{self, Hir, Look};
 use regex_syntax::is_word_byte;
 use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};

 use crate::prog::{
     EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
     InstSave, InstSplit, Program,
 };

 use crate::Error;

 type Result = result::Result<Patch, Error>;
 type ResultOrEmpty = result::Result<Option<Patch>, Error>;

 #[derive(Debug)]
 struct Patch {
     hole: Hole,
     entry: InstPtr,
 }

 /// A compiler translates a regular expression AST to a sequence of
 /// instructions. The sequence of instructions represents an NFA.
 // `Compiler` is only public via the `internal` module, so avoid deriving
 // `Debug`.
 #[allow(missing_debug_implementations)]
 pub struct Compiler {
     insts: Vec<MaybeInst>,
     compiled: Program,
     capture_name_idx: HashMap<String, usize>,
     num_exprs: usize,
     size_limit: usize,
     suffix_cache: SuffixCache,
     utf8_seqs: Option<Utf8Sequences>,
     byte_classes: ByteClassSet,
     // This keeps track of extra bytes allocated while compiling the regex
     // program. Currently, this corresponds to two things. First is the heap
     // memory allocated by Unicode character classes ('InstRanges'). Second is
     // a "fake" amount of memory used by empty sub-expressions, so that enough
     // empty sub-expressions will ultimately trigger the compiler to bail
     // because of a size limit restriction. (That empty sub-expressions don't
     // add to heap memory usage is more-or-less an implementation detail.) In
     // the second case, if we don't bail, then an excessively large repetition
     // on an empty sub-expression can result in the compiler using a very large
     // amount of CPU time.
     extra_inst_bytes: usize,
 }

 impl Compiler {
     /// Create a new regular expression compiler.
     ///
     /// Various options can be set before calling `compile` on an expression.
     pub fn new() -> Self {
         Compiler {
             insts: vec![],
             compiled: Program::new(),
             capture_name_idx: HashMap::new(),
             num_exprs: 0,
             size_limit: 10 * (1 << 20),
             suffix_cache: SuffixCache::new(1000),
             utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
             byte_classes: ByteClassSet::new(),
             extra_inst_bytes: 0,
         }
     }

     /// The size of the resulting program is limited by size_limit. If
     /// the program approximately exceeds the given size (in bytes), then
     /// compilation will stop and return an error.
     pub fn size_limit(mut self, size_limit: usize) -> Self {
         self.size_limit = size_limit;
         self
     }

     /// If bytes is true, then the program is compiled as a byte based
     /// automaton, which incorporates UTF-8 decoding into the machine. If it's
     /// false, then the automaton is Unicode scalar value based, e.g., an
     /// engine utilizing such an automaton is responsible for UTF-8 decoding.
     ///
     /// The specific invariant is that when returning a byte based machine,
     /// the neither the `Char` nor `Ranges` instructions are produced.
     /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
     /// instruction is never produced.
     ///
     /// Note that `dfa(true)` implies `bytes(true)`.
     pub fn bytes(mut self, yes: bool) -> Self {
         self.compiled.is_bytes = yes;
         self
     }

     /// When disabled, the program compiled may match arbitrary bytes.
     ///
     /// When enabled (the default), all compiled programs exclusively match
     /// valid UTF-8 bytes.
     pub fn only_utf8(mut self, yes: bool) -> Self {
         self.compiled.only_utf8 = yes;
         self
     }

     /// When set, the machine returned is suitable for use in the DFA matching
     /// engine.
     ///
     /// In particular, this ensures that if the regex is not anchored in the
     /// beginning, then a preceding `.*?` is included in the program. (The NFA
     /// based engines handle the preceding `.*?` explicitly, which is difficult
     /// or impossible in the DFA engine.)
     pub fn dfa(mut self, yes: bool) -> Self {
         self.compiled.is_dfa = yes;
         self
     }

     /// When set, the machine returned is suitable for matching text in
     /// reverse. In particular, all concatenations are flipped.
     pub fn reverse(mut self, yes: bool) -> Self {
         self.compiled.is_reverse = yes;
         self
     }

     /// Compile a regular expression given its AST.
     ///
     /// The compiler is guaranteed to succeed unless the program exceeds the
     /// specified size limit. If the size limit is exceeded, then compilation
     /// stops and returns an error.
     pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
         debug_assert!(!exprs.is_empty());
         self.num_exprs = exprs.len();
         if exprs.len() == 1 {
             self.compile_one(&exprs[0])
         } else {
             self.compile_many(exprs)
         }
     }

     fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
         if self.compiled.only_utf8
             && expr.properties().look_set().contains(Look::WordAsciiNegate)
         {
             return Err(Error::Syntax(
                 "ASCII-only \\B is not allowed in Unicode regexes \
                  because it may result in invalid UTF-8 matches"
                     .to_string(),
             ));
         }
         // If we're compiling a forward DFA and we aren't anchored, then
         // add a `.*?` before the first capture group.
         // Other matching engines handle this by baking the logic into the
         // matching engine itself.
         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
         self.compiled.is_anchored_start =
             expr.properties().look_set_prefix().contains(Look::Start);
         self.compiled.is_anchored_end =
             expr.properties().look_set_suffix().contains(Look::End);
         if self.compiled.needs_dotstar() {
             dotstar_patch = self.c_dotstar()?;
             self.compiled.start = dotstar_patch.entry;
         }
         self.compiled.captures = vec![None];
         let patch =
             self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
         if self.compiled.needs_dotstar() {
             self.fill(dotstar_patch.hole, patch.entry);
         } else {
             self.compiled.start = patch.entry;
         }
         self.fill_to_next(patch.hole);
         self.compiled.matches = vec![self.insts.len()];
         self.push_compiled(Inst::Match(0));
         self.compiled.static_captures_len =
             expr.properties().static_explicit_captures_len();
         self.compile_finish()
     }

     fn compile_many(
         mut self,
         exprs: &[Hir],
     ) -> result::Result<Program, Error> {
         debug_assert!(exprs.len() > 1);

         self.compiled.is_anchored_start = exprs
             .iter()
             .all(|e| e.properties().look_set_prefix().contains(Look::Start));
         self.compiled.is_anchored_end = exprs
             .iter()
             .all(|e| e.properties().look_set_suffix().contains(Look::End));
         let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
         if self.compiled.needs_dotstar() {
             dotstar_patch = self.c_dotstar()?;
             self.compiled.start = dotstar_patch.entry;
         } else {
             self.compiled.start = 0; // first instruction is always split
         }
         self.fill_to_next(dotstar_patch.hole);

         let mut prev_hole = Hole::None;
         for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
             self.fill_to_next(prev_hole);
             let split = self.push_split_hole();
             let Patch { hole, entry } =
                 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
             self.fill_to_next(hole);
             self.compiled.matches.push(self.insts.len());
             self.push_compiled(Inst::Match(i));
             prev_hole = self.fill_split(split, Some(entry), None);
         }
         let i = exprs.len() - 1;
         let Patch { hole, entry } =
             self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
         self.fill(prev_hole, entry);
         self.fill_to_next(hole);
         self.compiled.matches.push(self.insts.len());
         self.push_compiled(Inst::Match(i));
         self.compile_finish()
     }

     fn compile_finish(mut self) -> result::Result<Program, Error> {
         self.compiled.insts =
             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
         self.compiled.byte_classes = self.byte_classes.byte_classes();
         self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
         Ok(self.compiled)
     }

     /// Compile expr into self.insts, returning a patch on success,
     /// or an error if we run out of memory.
     ///
     /// All of the c_* methods of the compiler share the contract outlined
     /// here.
     ///
     /// The main thing that a c_* method does is mutate `self.insts`
     /// to add a list of mostly compiled instructions required to execute
     /// the given expression. `self.insts` contains MaybeInsts rather than
     /// Insts because there is some backpatching required.
     ///
     /// The `Patch` value returned by each c_* method provides metadata
     /// about the compiled instructions emitted to `self.insts`. The
     /// `entry` member of the patch refers to the first instruction
     /// (the entry point), while the `hole` member contains zero or
     /// more offsets to partial instructions that need to be backpatched.
     /// The c_* routine can't know where its list of instructions are going to
     /// jump to after execution, so it is up to the caller to patch
     /// these jumps to point to the right place. So compiling some
     /// expression, e, we would end up with a situation that looked like:
     ///
     /// ```text
     /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
     ///                     ^              ^             ^
     ///                     |                \         /
     ///                   entry                \     /
     ///                                         hole
     /// ```
     ///
     /// To compile two expressions, e1 and e2, concatenated together we
     /// would do:
     ///
     /// ```ignore
     /// let patch1 = self.c(e1);
     /// let patch2 = self.c(e2);
     /// ```
     ///
     /// while leaves us with a situation that looks like
     ///
     /// ```text
     /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
     ///                     ^        ^            ^        ^
     ///                     |        |            |        |
     ///                entry1        hole1   entry2        hole2
     /// ```
     ///
     /// Then to merge the two patches together into one we would backpatch
     /// hole1 with entry2 and return a new patch that enters at entry1
     /// and has hole2 for a hole. In fact, if you look at the c_concat
     /// method you will see that it does exactly this, though it handles
     /// a list of expressions rather than just the two that we use for
     /// an example.
     ///
     /// Ok(None) is returned when an expression is compiled to no
     /// instruction, and so no patch.entry value makes sense.
     fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
         use crate::prog;
         use regex_syntax::hir::HirKind::*;

         self.check_size()?;
         match *expr.kind() {
             Empty => self.c_empty(),
             Literal(hir::Literal(ref bytes)) => {
                 if self.compiled.is_reverse {
                     let mut bytes = bytes.to_vec();
                     bytes.reverse();
                     self.c_literal(&bytes)
                 } else {
                     self.c_literal(bytes)
                 }
             }
             Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
             Class(hir::Class::Bytes(ref cls)) => {
                 if self.compiled.uses_bytes() {
                     self.c_class_bytes(cls.ranges())
                 } else {
                     assert!(cls.is_ascii());
                     let mut char_ranges = vec![];
                     for r in cls.iter() {
                         let (s, e) = (r.start() as char, r.end() as char);
                         char_ranges.push(hir::ClassUnicodeRange::new(s, e));
                     }
                     self.c_class(&char_ranges)
                 }
             }
             Look(ref look) => match *look {
                 hir::Look::Start if self.compiled.is_reverse => {
                     self.c_empty_look(prog::EmptyLook::EndText)
                 }
                 hir::Look::Start => {
                     self.c_empty_look(prog::EmptyLook::StartText)
                 }
                 hir::Look::End if self.compiled.is_reverse => {
                     self.c_empty_look(prog::EmptyLook::StartText)
                 }
                 hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
                 hir::Look::StartLF if self.compiled.is_reverse => {
                     self.byte_classes.set_range(b'\n', b'\n');
                     self.c_empty_look(prog::EmptyLook::EndLine)
                 }
                 hir::Look::StartLF => {
                     self.byte_classes.set_range(b'\n', b'\n');
                     self.c_empty_look(prog::EmptyLook::StartLine)
                 }
                 hir::Look::EndLF if self.compiled.is_reverse => {
                     self.byte_classes.set_range(b'\n', b'\n');
                     self.c_empty_look(prog::EmptyLook::StartLine)
                 }
                 hir::Look::EndLF => {
                     self.byte_classes.set_range(b'\n', b'\n');
                     self.c_empty_look(prog::EmptyLook::EndLine)
                 }
                 hir::Look::StartCRLF | hir::Look::EndCRLF => {
                     return Err(Error::Syntax(
                         "CRLF-aware line anchors are not supported yet"
                             .to_string(),
                     ));
                 }
                 hir::Look::WordAscii => {
                     self.byte_classes.set_word_boundary();
                     self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
                 }
                 hir::Look::WordAsciiNegate => {
                     self.byte_classes.set_word_boundary();
                     self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
                 }
                 hir::Look::WordUnicode => {
                     if !cfg!(feature = "unicode-perl") {
                         return Err(Error::Syntax(
                             "Unicode word boundaries are unavailable when \
                          the unicode-perl feature is disabled"
                                 .to_string(),
                         ));
                     }
                     self.compiled.has_unicode_word_boundary = true;
                     self.byte_classes.set_word_boundary();
                     // We also make sure that all ASCII bytes are in a different
                     // class from non-ASCII bytes. Otherwise, it's possible for
                     // ASCII bytes to get lumped into the same class as non-ASCII
                     // bytes. This in turn may cause the lazy DFA to falsely start
                     // when it sees an ASCII byte that maps to a byte class with
                     // non-ASCII bytes. This ensures that never happens.
                     self.byte_classes.set_range(0, 0x7F);
                     self.c_empty_look(prog::EmptyLook::WordBoundary)
                 }
                 hir::Look::WordUnicodeNegate => {
                     if !cfg!(feature = "unicode-perl") {
                         return Err(Error::Syntax(
                             "Unicode word boundaries are unavailable when \
                          the unicode-perl feature is disabled"
                                 .to_string(),
                         ));
                     }
                     self.compiled.has_unicode_word_boundary = true;
                     self.byte_classes.set_word_boundary();
                     // See comments above for why we set the ASCII range here.
                     self.byte_classes.set_range(0, 0x7F);
                     self.c_empty_look(prog::EmptyLook::NotWordBoundary)
                 }
             },
             Capture(hir::Capture { index, ref name, ref sub }) => {
                 if index as usize >= self.compiled.captures.len() {
                     let name = match *name {
                         None => None,
                         Some(ref boxed_str) => Some(boxed_str.to_string()),
                     };
                     self.compiled.captures.push(name.clone());
                     if let Some(name) = name {
                         self.capture_name_idx.insert(name, index as usize);
                     }
                 }
                 self.c_capture(2 * index as usize, sub)
             }
             Concat(ref es) => {
                 if self.compiled.is_reverse {
                     self.c_concat(es.iter().rev())
                 } else {
                     self.c_concat(es)
                 }
             }
             Alternation(ref es) => self.c_alternate(&**es),
             Repetition(ref rep) => self.c_repeat(rep),
         }
     }

     fn c_empty(&mut self) -> ResultOrEmpty {
         // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
         // See: CVE-2022-24713
         //
         // Since 'empty' sub-expressions don't increase the size of
         // the actual compiled object, we "fake" an increase in its
         // size so that our 'check_size_limit' routine will eventually
         // stop compilation if there are too many empty sub-expressions
         // (e.g., via a large repetition).
         self.extra_inst_bytes += std::mem::size_of::<Inst>();
         Ok(None)
     }

     fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
         if self.num_exprs > 1 || self.compiled.is_dfa {
             // Don't ever compile Save instructions for regex sets because
             // they are never used. They are also never used in DFA programs
             // because DFAs can't handle captures.
             self.c(expr)
         } else {
             let entry = self.insts.len();
             let hole = self.push_hole(InstHole::Save { slot: first_slot });
             let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
             self.fill(hole, patch.entry);
             self.fill_to_next(patch.hole);
             let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
             Ok(Some(Patch { hole, entry }))
         }
     }

     fn c_dotstar(&mut self) -> Result {
         let hir = if self.compiled.only_utf8() {
             Hir::dot(hir::Dot::AnyChar)
         } else {
             Hir::dot(hir::Dot::AnyByte)
         };
         Ok(self
             .c(&Hir::repetition(hir::Repetition {
                 min: 0,
                 max: None,
                 greedy: false,
                 sub: Box::new(hir),
             }))?
             .unwrap())
     }

     fn c_char(&mut self, c: char) -> ResultOrEmpty {
         if self.compiled.uses_bytes() {
             if c.is_ascii() {
                 let b = c as u8;
                 let hole =
                     self.push_hole(InstHole::Bytes { start: b, end: b });
                 self.byte_classes.set_range(b, b);
                 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
             } else {
                 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
             }
         } else {
             let hole = self.push_hole(InstHole::Char { c });
             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
         }
     }

     fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
         use std::mem::size_of;

         if ranges.is_empty() {
             return Err(Error::Syntax(
                 "empty character classes are not allowed".to_string(),
             ));
         }
         if self.compiled.uses_bytes() {
             Ok(Some(CompileClass { c: self, ranges }.compile()?))
         } else {
             let ranges: Vec<(char, char)> =
                 ranges.iter().map(|r| (r.start(), r.end())).collect();
             let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
                 self.push_hole(InstHole::Char { c: ranges[0].0 })
             } else {
                 self.extra_inst_bytes +=
                     ranges.len() * (size_of::<char>() * 2);
                 self.push_hole(InstHole::Ranges { ranges })
             };
             Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
         }
     }

     fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
         self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
     }

     fn c_class_bytes(
         &mut self,
         ranges: &[hir::ClassBytesRange],
     ) -> ResultOrEmpty {
         if ranges.is_empty() {
             return Err(Error::Syntax(
                 "empty character classes are not allowed".to_string(),
             ));
         }

         let first_split_entry = self.insts.len();
         let mut holes = vec![];
         let mut prev_hole = Hole::None;
         for r in &ranges[0..ranges.len() - 1] {
             self.fill_to_next(prev_hole);
             let split = self.push_split_hole();
             let next = self.insts.len();
             self.byte_classes.set_range(r.start(), r.end());
             holes.push(self.push_hole(InstHole::Bytes {
                 start: r.start(),
                 end: r.end(),
             }));
             prev_hole = self.fill_split(split, Some(next), None);
         }
         let next = self.insts.len();
         let r = &ranges[ranges.len() - 1];
         self.byte_classes.set_range(r.start(), r.end());
         holes.push(
             self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
         );
         self.fill(prev_hole, next);
         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
     }

     fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
         let hole = self.push_hole(InstHole::EmptyLook { look });
         Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
     }

     fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
         match core::str::from_utf8(bytes) {
             Ok(string) => {
                 let mut it = string.chars();
                 let Patch { mut hole, entry } = loop {
                     match it.next() {
                         None => return self.c_empty(),
                         Some(ch) => {
                             if let Some(p) = self.c_char(ch)? {
                                 break p;
                             }
                         }
                     }
                 };
                 for ch in it {
                     if let Some(p) = self.c_char(ch)? {
                         self.fill(hole, p.entry);
                         hole = p.hole;
                     }
                 }
                 Ok(Some(Patch { hole, entry }))
             }
             Err(_) => {
                 assert!(self.compiled.uses_bytes());
                 let mut it = bytes.iter().copied();
                 let Patch { mut hole, entry } = loop {
                     match it.next() {
                         None => return self.c_empty(),
                         Some(byte) => {
                             if let Some(p) = self.c_byte(byte)? {
                                 break p;
                             }
                         }
                     }
                 };
                 for byte in it {
                     if let Some(p) = self.c_byte(byte)? {
                         self.fill(hole, p.entry);
                         hole = p.hole;
                     }
                 }
                 Ok(Some(Patch { hole, entry }))
             }
         }
     }

     fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
     where
         I: IntoIterator<Item = &'a Hir>,
     {
         let mut exprs = exprs.into_iter();
         let Patch { mut hole, entry } = loop {
             match exprs.next() {
                 None => return self.c_empty(),
                 Some(e) => {
                     if let Some(p) = self.c(e)? {
                         break p;
                     }
                 }
             }
         };
         for e in exprs {
             if let Some(p) = self.c(e)? {
                 self.fill(hole, p.entry);
                 hole = p.hole;
             }
         }
         Ok(Some(Patch { hole, entry }))
     }

     fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
         debug_assert!(
             exprs.len() >= 2,
             "alternates must have at least 2 exprs"
         );

         // Initial entry point is always the first split.
         let first_split_entry = self.insts.len();

         // Save up all of the holes from each alternate. They will all get
         // patched to point to the same location.
         let mut holes = vec![];

         // true indicates that the hole is a split where we want to fill
         // the second branch.
         let mut prev_hole = (Hole::None, false);
         for e in &exprs[0..exprs.len() - 1] {
             if prev_hole.1 {
                 let next = self.insts.len();
                 self.fill_split(prev_hole.0, None, Some(next));
             } else {
                 self.fill_to_next(prev_hole.0);
             }
             let split = self.push_split_hole();
             if let Some(Patch { hole, entry }) = self.c(e)? {
                 holes.push(hole);
                 prev_hole = (self.fill_split(split, Some(entry), None), false);
             } else {
                 let (split1, split2) = split.dup_one();
                 holes.push(split1);
                 prev_hole = (split2, true);
             }
         }
         if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
             holes.push(hole);
             if prev_hole.1 {
                 self.fill_split(prev_hole.0, None, Some(entry));
             } else {
                 self.fill(prev_hole.0, entry);
             }
         } else {
             // We ignore prev_hole.1. When it's true, it means we have two
             // empty branches both pushing prev_hole.0 into holes, so both
             // branches will go to the same place anyway.
             holes.push(prev_hole.0);
         }
         Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
     }

     fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
         match (rep.min, rep.max) {
             (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
             (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
             (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
             (min, None) => {
                 self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
             }
             (min, Some(max)) => {
                 self.c_repeat_range(&rep.sub, rep.greedy, min, max)
             }
         }
     }

     fn c_repeat_zero_or_one(
         &mut self,
         expr: &Hir,
         greedy: bool,
     ) -> ResultOrEmpty {
         let split_entry = self.insts.len();
         let split = self.push_split_hole();
         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
             Some(p) => p,
             None => return self.pop_split_hole(),
         };
         let split_hole = if greedy {
             self.fill_split(split, Some(entry_rep), None)
         } else {
             self.fill_split(split, None, Some(entry_rep))
         };
         let holes = vec![hole_rep, split_hole];
         Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
     }

     fn c_repeat_zero_or_more(
         &mut self,
         expr: &Hir,
         greedy: bool,
     ) -> ResultOrEmpty {
         let split_entry = self.insts.len();
         let split = self.push_split_hole();
         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
             Some(p) => p,
             None => return self.pop_split_hole(),
         };

         self.fill(hole_rep, split_entry);
         let split_hole = if greedy {
             self.fill_split(split, Some(entry_rep), None)
         } else {
             self.fill_split(split, None, Some(entry_rep))
         };
         Ok(Some(Patch { hole: split_hole, entry: split_entry }))
     }

     fn c_repeat_one_or_more(
         &mut self,
         expr: &Hir,
         greedy: bool,
     ) -> ResultOrEmpty {
         let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
             Some(p) => p,
             None => return Ok(None),
         };
         self.fill_to_next(hole_rep);
         let split = self.push_split_hole();

         let split_hole = if greedy {
             self.fill_split(split, Some(entry_rep), None)
         } else {
             self.fill_split(split, None, Some(entry_rep))
         };
         Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
     }

     fn c_repeat_range_min_or_more(
         &mut self,
         expr: &Hir,
         greedy: bool,
         min: u32,
     ) -> ResultOrEmpty {
         let min = u32_to_usize(min);
         // Using next_inst() is ok, because we can't return it (concat would
         // have to return Some(_) while c_repeat_range_min_or_more returns
         // None).
         let patch_concat = self
             .c_concat(iter::repeat(expr).take(min))?
             .unwrap_or_else(|| self.next_inst());
         if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
             self.fill(patch_concat.hole, patch_rep.entry);
             Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
         } else {
             Ok(None)
         }
     }

     fn c_repeat_range(
         &mut self,
         expr: &Hir,
         greedy: bool,
         min: u32,
         max: u32,
     ) -> ResultOrEmpty {
         let (min, max) = (u32_to_usize(min), u32_to_usize(max));
         debug_assert!(min <= max);
         let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
         if min == max {
             return Ok(patch_concat);
         }
         // Same reasoning as in c_repeat_range_min_or_more (we know that min <
         // max at this point).
         let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
         let initial_entry = patch_concat.entry;
         // It is much simpler to compile, e.g., `a{2,5}` as:
         //
         //     aaa?a?a?
         //
         // But you end up with a sequence of instructions like this:
         //
         //     0: 'a'
         //     1: 'a',
         //     2: split(3, 4)
         //     3: 'a'
         //     4: split(5, 6)
         //     5: 'a'
         //     6: split(7, 8)
         //     7: 'a'
         //     8: MATCH
         //
         // This is *incredibly* inefficient because the splits end
         // up forming a chain, which has to be resolved everything a
         // transition is followed.
         let mut holes = vec![];
         let mut prev_hole = patch_concat.hole;
         for _ in min..max {
             self.fill_to_next(prev_hole);
             let split = self.push_split_hole();
             let Patch { hole, entry } = match self.c(expr)? {
                 Some(p) => p,
                 None => return self.pop_split_hole(),
             };
             prev_hole = hole;
             if greedy {
                 holes.push(self.fill_split(split, Some(entry), None));
             } else {
                 holes.push(self.fill_split(split, None, Some(entry)));
             }
         }
         holes.push(prev_hole);
         Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
     }

     /// Can be used as a default value for the c_* functions when the call to
     /// c_function is followed by inserting at least one instruction that is
     /// always executed after the ones written by the c* function.
     fn next_inst(&self) -> Patch {
         Patch { hole: Hole::None, entry: self.insts.len() }
     }

     fn fill(&mut self, hole: Hole, goto: InstPtr) {
         match hole {
             Hole::None => {}
             Hole::One(pc) => {
                 self.insts[pc].fill(goto);
             }
             Hole::Many(holes) => {
                 for hole in holes {
                     self.fill(hole, goto);
                 }
             }
         }
     }

     fn fill_to_next(&mut self, hole: Hole) {
         let next = self.insts.len();
         self.fill(hole, next);
     }

     fn fill_split(
         &mut self,
         hole: Hole,
         goto1: Option<InstPtr>,
         goto2: Option<InstPtr>,
     ) -> Hole {
         match hole {
             Hole::None => Hole::None,
             Hole::One(pc) => match (goto1, goto2) {
                 (Some(goto1), Some(goto2)) => {
                     self.insts[pc].fill_split(goto1, goto2);
                     Hole::None
                 }
                 (Some(goto1), None) => {
                     self.insts[pc].half_fill_split_goto1(goto1);
                     Hole::One(pc)
                 }
                 (None, Some(goto2)) => {
                     self.insts[pc].half_fill_split_goto2(goto2);
                     Hole::One(pc)
                 }
                 (None, None) => unreachable!(
                     "at least one of the split \
                      holes must be filled"
                 ),
             },
             Hole::Many(holes) => {
                 let mut new_holes = vec![];
                 for hole in holes {
                     new_holes.push(self.fill_split(hole, goto1, goto2));
                 }
                 if new_holes.is_empty() {
                     Hole::None
                 } else if new_holes.len() == 1 {
                     new_holes.pop().unwrap()
                 } else {
                     Hole::Many(new_holes)
                 }
             }
         }
     }

     fn push_compiled(&mut self, inst: Inst) {
         self.insts.push(MaybeInst::Compiled(inst));
     }

     fn push_hole(&mut self, inst: InstHole) -> Hole {
         let hole = self.insts.len();
         self.insts.push(MaybeInst::Uncompiled(inst));
         Hole::One(hole)
     }

     fn push_split_hole(&mut self) -> Hole {
         let hole = self.insts.len();
         self.insts.push(MaybeInst::Split);
         Hole::One(hole)
     }

     fn pop_split_hole(&mut self) -> ResultOrEmpty {
         self.insts.pop();
         Ok(None)
     }

     fn check_size(&self) -> result::Result<(), Error> {
         use std::mem::size_of;

         let size =
             self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
         if size > self.size_limit {
             Err(Error::CompiledTooBig(self.size_limit))
         } else {
             Ok(())
         }
     }
 }

 #[derive(Debug)]
 enum Hole {
     None,
     One(InstPtr),
     Many(Vec<Hole>),
 }

 impl Hole {
     fn dup_one(self) -> (Self, Self) {
         match self {
             Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
             Hole::None | Hole::Many(_) => {
                 unreachable!("must be called on single hole")
             }
         }
     }
 }

 #[derive(Clone, Debug)]
 enum MaybeInst {
     Compiled(Inst),
     Uncompiled(InstHole),
     Split,
     Split1(InstPtr),
     Split2(InstPtr),
 }

 impl MaybeInst {
     fn fill(&mut self, goto: InstPtr) {
         let maybeinst = match *self {
             MaybeInst::Split => MaybeInst::Split1(goto),
             MaybeInst::Uncompiled(ref inst) => {
                 MaybeInst::Compiled(inst.fill(goto))
             }
             MaybeInst::Split1(goto1) => {
                 MaybeInst::Compiled(Inst::Split(InstSplit {
                     goto1,
                     goto2: goto,
                 }))
             }
             MaybeInst::Split2(goto2) => {
                 MaybeInst::Compiled(Inst::Split(InstSplit {
                     goto1: goto,
                     goto2,
                 }))
             }
             _ => unreachable!(
                 "not all instructions were compiled! \
                  found uncompiled instruction: {:?}",
                 self
             ),
         };
         *self = maybeinst;
     }

     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
         let filled = match *self {
             MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
             _ => unreachable!(
                 "must be called on Split instruction, \
                  instead it was called on: {:?}",
                 self
             ),
         };
         *self = MaybeInst::Compiled(filled);
     }

     fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
         let half_filled = match *self {
             MaybeInst::Split => goto1,
             _ => unreachable!(
                 "must be called on Split instruction, \
                  instead it was called on: {:?}",
                 self
             ),
         };
         *self = MaybeInst::Split1(half_filled);
     }

     fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
         let half_filled = match *self {
             MaybeInst::Split => goto2,
             _ => unreachable!(
                 "must be called on Split instruction, \
                  instead it was called on: {:?}",
                 self
             ),
         };
         *self = MaybeInst::Split2(half_filled);
     }

     fn unwrap(self) -> Inst {
         match self {
             MaybeInst::Compiled(inst) => inst,
             _ => unreachable!(
                 "must be called on a compiled instruction, \
                  instead it was called on: {:?}",
                 self
             ),
         }
     }
 }

 #[derive(Clone, Debug)]
 enum InstHole {
     Save { slot: usize },
     EmptyLook { look: EmptyLook },
     Char { c: char },
     Ranges { ranges: Vec<(char, char)> },
     Bytes { start: u8, end: u8 },
 }

 impl InstHole {
     fn fill(&self, goto: InstPtr) -> Inst {
         match *self {
             InstHole::Save { slot } => Inst::Save(InstSave { goto, slot }),
             InstHole::EmptyLook { look } => {
                 Inst::EmptyLook(InstEmptyLook { goto, look })
             }
             InstHole::Char { c } => Inst::Char(InstChar { goto, c }),
             InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
                 goto,
                 ranges: ranges.clone().into_boxed_slice(),
             }),
             InstHole::Bytes { start, end } => {
                 Inst::Bytes(InstBytes { goto, start, end })
             }
         }
     }
 }

 struct CompileClass<'a, 'b> {
     c: &'a mut Compiler,
     ranges: &'b [hir::ClassUnicodeRange],
 }

 impl<'a, 'b> CompileClass<'a, 'b> {
     fn compile(mut self) -> Result {
         let mut holes = vec![];
         let mut initial_entry = None;
         let mut last_split = Hole::None;
         let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
         self.c.suffix_cache.clear();

         for (i, range) in self.ranges.iter().enumerate() {
             let is_last_range = i + 1 == self.ranges.len();
             utf8_seqs.reset(range.start(), range.end());
             let mut it = (&mut utf8_seqs).peekable();
             loop {
                 let utf8_seq = match it.next() {
                     None => break,
                     Some(utf8_seq) => utf8_seq,
                 };
                 if is_last_range && it.peek().is_none() {
                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                     holes.push(hole);
                     self.c.fill(last_split, entry);
                     last_split = Hole::None;
                     if initial_entry.is_none() {
                         initial_entry = Some(entry);
                     }
                 } else {
                     if initial_entry.is_none() {
                         initial_entry = Some(self.c.insts.len());
                     }
                     self.c.fill_to_next(last_split);
                     last_split = self.c.push_split_hole();
                     let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
                     holes.push(hole);
                     last_split =
                         self.c.fill_split(last_split, Some(entry), None);
                 }
             }
         }
         self.c.utf8_seqs = Some(utf8_seqs);
         Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
     }

     fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
         if self.c.compiled.is_reverse {
             self.c_utf8_seq_(seq)
         } else {
             self.c_utf8_seq_(seq.into_iter().rev())
         }
     }

     fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
     where
         I: IntoIterator<Item = &'r Utf8Range>,
     {
         // The initial instruction for each UTF-8 sequence should be the same.
         let mut from_inst = ::std::usize::MAX;
         let mut last_hole = Hole::None;
         for byte_range in seq {
             let key = SuffixCacheKey {
                 from_inst,
                 start: byte_range.start,
                 end: byte_range.end,
             };
             {
                 let pc = self.c.insts.len();
                 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
                     from_inst = cached_pc;
                     continue;
                 }
             }
             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
             if from_inst == ::std::usize::MAX {
                 last_hole = self.c.push_hole(InstHole::Bytes {
                     start: byte_range.start,
                     end: byte_range.end,
                 });
             } else {
                 self.c.push_compiled(Inst::Bytes(InstBytes {
                     goto: from_inst,
                     start: byte_range.start,
                     end: byte_range.end,
                 }));
             }
             from_inst = self.c.insts.len().checked_sub(1).unwrap();
             debug_assert!(from_inst < ::std::usize::MAX);
         }
         debug_assert!(from_inst < ::std::usize::MAX);
         Ok(Patch { hole: last_hole, entry: from_inst })
     }
 }

 /// `SuffixCache` is a simple bounded hash map for caching suffix entries in
 /// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
 /// The set of byte ranges looks like this:
 ///
 /// [0-7F]
 /// [C2-DF][80-BF]
 /// [E0][A0-BF][80-BF]
 /// [E1-EC][80-BF][80-BF]
 /// [ED][80-9F][80-BF]
 /// [EE-EF][80-BF][80-BF]
 ///
 /// Each line above translates to one alternate in the compiled regex program.
 /// However, all but one of the alternates end in the same suffix, which is
 /// a waste of an instruction. The suffix cache facilitates reusing them across
 /// alternates.
 ///
 /// Note that a HashMap could be trivially used for this, but we don't need its
 /// overhead. Some small bounded space (LRU style) is more than enough.
 ///
 /// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
 /// except it uses hashes as original indices and then compares full keys for
 /// validation against `dense` array.
 #[derive(Debug)]
 struct SuffixCache {
     sparse: Box<[usize]>,
     dense: Vec<SuffixCacheEntry>,
 }

 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
 struct SuffixCacheEntry {
     key: SuffixCacheKey,
     pc: InstPtr,
 }

 #[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
 struct SuffixCacheKey {
     from_inst: InstPtr,
     start: u8,
     end: u8,
 }

 impl SuffixCache {
     fn new(size: usize) -> Self {
         SuffixCache {
             sparse: vec![0usize; size].into(),
             dense: Vec::with_capacity(size),
         }
     }

     fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
         let hash = self.hash(&key);
         let pos = &mut self.sparse[hash];
         if let Some(entry) = self.dense.get(*pos) {
             if entry.key == key {
                 return Some(entry.pc);
             }
         }
         *pos = self.dense.len();
         self.dense.push(SuffixCacheEntry { key, pc });
         None
     }

     fn clear(&mut self) {
         self.dense.clear();
     }

     fn hash(&self, suffix: &SuffixCacheKey) -> usize {
         // Basic FNV-1a hash as described:
         // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
         const FNV_PRIME: u64 = 1_099_511_628_211;
         let mut h = 14_695_981_039_346_656_037;
         h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
         h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
         h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
         (h as usize) % self.sparse.len()
     }
 }

 struct ByteClassSet([bool; 256]);

 impl ByteClassSet {
     fn new() -> Self {
         ByteClassSet([false; 256])
     }

     fn set_range(&mut self, start: u8, end: u8) {
         debug_assert!(start <= end);
         if start > 0 {
             self.0[start as usize - 1] = true;
         }
         self.0[end as usize] = true;
     }

     fn set_word_boundary(&mut self) {
         // We need to mark all ranges of bytes whose pairs result in
         // evaluating \b differently.
         let iswb = is_word_byte;
         let mut b1: u16 = 0;
         let mut b2: u16;
         while b1 <= 255 {
             b2 = b1 + 1;
             while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
                 b2 += 1;
             }
             self.set_range(b1 as u8, (b2 - 1) as u8);
             b1 = b2;
         }
     }

     fn byte_classes(&self) -> Vec<u8> {
         // N.B. If you're debugging the DFA, it's useful to simply return
         // `(0..256).collect()`, which effectively removes the byte classes
         // and makes the transitions easier to read.
         // (0usize..256).map(|x| x as u8).collect()
         let mut byte_classes = vec![0; 256];
         let mut class = 0u8;
         let mut i = 0;
         loop {
             byte_classes[i] = class as u8;
             if i >= 255 {
                 break;
             }
             if self.0[i] {
                 class = class.checked_add(1).unwrap();
             }
             i += 1;
         }
         byte_classes
     }
 }

 impl fmt::Debug for ByteClassSet {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
     }
 }

 fn u32_to_usize(n: u32) -> usize {
     // In case usize is less than 32 bits, we need to guard against overflow.
     // On most platforms this compiles to nothing.
     // TODO Use `std::convert::TryFrom` once it's stable.
     if (n as u64) > (::std::usize::MAX as u64) {
         panic!("BUG: {} is too big to be pointer sized", n)
     }
     n as usize
 }

 #[cfg(test)]
 mod tests {
     use super::ByteClassSet;

     #[test]
     fn byte_classes() {
         let mut set = ByteClassSet::new();
         set.set_range(b'a', b'z');
         let classes = set.byte_classes();
         assert_eq!(classes[0], 0);
         assert_eq!(classes[1], 0);
         assert_eq!(classes[2], 0);
         assert_eq!(classes[b'a' as usize - 1], 0);
         assert_eq!(classes[b'a' as usize], 1);
         assert_eq!(classes[b'm' as usize], 1);
         assert_eq!(classes[b'z' as usize], 1);
         assert_eq!(classes[b'z' as usize + 1], 2);
         assert_eq!(classes[254], 2);
         assert_eq!(classes[255], 2);

         let mut set = ByteClassSet::new();
         set.set_range(0, 2);
         set.set_range(4, 6);
         let classes = set.byte_classes();
         assert_eq!(classes[0], 0);
         assert_eq!(classes[1], 0);
         assert_eq!(classes[2], 0);
         assert_eq!(classes[3], 1);
         assert_eq!(classes[4], 2);
         assert_eq!(classes[5], 2);
         assert_eq!(classes[6], 2);
         assert_eq!(classes[7], 3);
         assert_eq!(classes[255], 3);
     }

     #[test]
     fn full_byte_classes() {
         let mut set = ByteClassSet::new();
         for i in 0..256u16 {
             set.set_range(i as u8, i as u8);
         }
         assert_eq!(set.byte_classes().len(), 256);
     }
 }