re2: memory diet part 2

Change program representation from Inst* graph
with forwarding pointers to contiguous array of Inst
with forwarding indices.  Cuts sizeof(Inst) from 24 to 8 bytes,
fewer allocated objects, presumably better cache locality.

The SparseArray<Inst*> would now be SparseArray<int>
except that the instruction number was always the array index,
so the value is redundant.  Use SparseSets instead, saving 60%
of the storage footprint due to SparseArrays.

Convert all back ends to use new program representation.

Free unused storage at end of IsOnePass (good for ~30%).

Build with -DNDEBUG by default, to disable DCHECKs.
Add obj/dbg/* target to Makefile to build in debug mode.

R=r
CC=re2-dev
http://codereview.appspot.com/1129043
diff --git a/Makefile b/Makefile
index 8997293..ca7b61b 100644
--- a/Makefile
+++ b/Makefile
@@ -39,6 +39,7 @@
 	util/pcre.h\
 	util/random.h\
 	util/sparse_array.h\
+	util/sparse_set.h\
 	util/test.h\
 	util/utf.h\
 	util/util.h\
@@ -105,6 +106,7 @@
 	obj/test/possible_match_test\
 	obj/test/re2_test\
 	obj/test/re2_arg_test\
+	obj/test/regexp_test\
 	obj/test/required_prefix_test\
 	obj/test/search_test\
 	obj/test/simplify_test\
@@ -120,28 +122,44 @@
 STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES))
 STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
 
+DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
+DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
+DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
+
 obj/%.o: %.cc $(HFILES)
 	@mkdir -p $$(dirname $@)
-	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
+	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
 
-obj/so/%.o: %.cc $(HFILES)
+obj/dbg/%.o: %.cc $(HFILES)
 	@mkdir -p $$(dirname $@)
 	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
 
+obj/so/%.o: %.cc $(HFILES)
+	@mkdir -p $$(dirname $@)
+	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
+
 obj/%.o: %.c $(HFILES)
 	@mkdir -p $$(dirname $@)
+	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
+
+obj/dbg/%.o: %.c $(HFILES)
+	@mkdir -p $$(dirname $@)
 	$(CC) -o $@ $(CXXFLAGS) $(RE2_CXXFLAGS) $*.c
 
 obj/so/%.o: %.c $(HFILES)
 	@mkdir -p $$(dirname $@)
-	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) $*.c
+	$(CC) -o $@ -fPIC $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.c
 
 obj/libre2.a: $(OFILES)
 	@mkdir -p obj
 	$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
 
+obj/dbg/libre2.a: $(DOFILES)
+	@mkdir -p obj/dbg
+	$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
+
 obj/so/libre2.so: $(SOFILES)
-	@mkdir -p obj
+	@mkdir -p obj/so
 	$(MAKE_SHARED_LIBRARY) -o [email protected] $(SOFILES)
 	ln -sf libre2.so.0 $@
 
@@ -149,6 +167,10 @@
 	@mkdir -p obj/test
 	$(CC) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
 
+obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
+	@mkdir -p obj/dbg/test
+	$(CC) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
+
 obj/so/test/%: obj/so/libre2.so obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
 	@mkdir -p obj/so/test
 	$(CC) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
@@ -174,7 +196,10 @@
 
 testofiles: $(TESTOFILES)
 
-test: static-test shared-test
+test: debug-test static-test shared-test
+
+debug-test: $(DTESTS)
+	@./runtests $(DTESTS)
 
 static-test: $(TESTS)
 	@./runtests $(TESTS)
@@ -205,3 +230,11 @@
 	  (uname -a; g++ --version; hg identify; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
 	  echo; \
 	  ./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
+
+# Keep gmake from deleting intermediate files it creates.
+# This makes repeated builds faster and preserves debug info on OS X.
+
+.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
+	obj/dbg/libre2.a obj/so/libre2.a \
+	obj/test/% obj/so/test/% obj/dbg/test/%
+
diff --git a/re2/bitstate.cc b/re2/bitstate.cc
index bdc3c5a..518d642 100644
--- a/re2/bitstate.cc
+++ b/re2/bitstate.cc
@@ -23,9 +23,9 @@
 namespace re2 {
 
 struct Job {
-  Inst* ip;
-  const char* p;
+  int id;
   int arg;
+  const char* p;
 };
 
 class BitState {
@@ -40,10 +40,10 @@
               StringPiece* submatch, int nsubmatch);
 
  private:
-  inline bool ShouldVisit(Inst* ip, const char* p);
-  void Push(Inst* ip, const char* p, int arg);
+  inline bool ShouldVisit(int id, const char* p);
+  void Push(int id, const char* p, int arg);
   bool GrowStack();
-  bool TrySearch(Inst* ip, const char* p);
+  bool TrySearch(int id, const char* p);
 
   // Search parameters
   Prog* prog_;              // program being run
@@ -93,8 +93,8 @@
 // Should the search visit the pair ip, p?
 // If so, remember that it was visited so that the next time,
 // we don't repeat the visit.
-bool BitState::ShouldVisit(Inst* ip, const char* p) {
-  uint n = ip->id_unchecked() * (text_.size() + 1) + (p - text_.begin());
+bool BitState::ShouldVisit(int id, const char* p) {
+  uint n = id * (text_.size() + 1) + (p - text_.begin());
   if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
     return false;
   visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
@@ -116,38 +116,38 @@
   return true;
 }
 
-// Push the triple (ip, p, arg) onto the stack, growing it if necessary.
-void BitState::Push(Inst* ip, const char* p, int arg) {
+// Push the triple (id, p, arg) onto the stack, growing it if necessary.
+void BitState::Push(int id, const char* p, int arg) {
   if (njob_ >= maxjob_) {
     if (!GrowStack())
       return;
   }
-  int op = ip->opcode();
+  int op = prog_->inst(id)->opcode();
   if (op == kInstFail)
     return;
 
   // Only check ShouldVisit when arg == 0.
   // When arg > 0, we are continuing a previous visit.
-  if (arg == 0 && !ShouldVisit(ip, p))
+  if (arg == 0 && !ShouldVisit(id, p))
     return;
 
   Job* j = &job_[njob_++];
-  j->ip = ip;
+  j->id = id;
   j->p = p;
   j->arg = arg;
 }
 
-// Try a search from instruction ip0 in state p0.
+// Try a search from instruction id0 in state p0.
 // Return whether it succeeded.
-bool BitState::TrySearch(Inst* ip0, const char* p0) {
+bool BitState::TrySearch(int id0, const char* p0) {
   bool matched = false;
   const char* end = text_.end();
   njob_ = 0;
-  Push(ip0, p0, 0);
+  Push(id0, p0, 0);
   while (njob_ > 0) {
     // Pop job off stack.
     --njob_;
-    Inst* ip = job_[njob_].ip;
+    int id = job_[njob_].id;
     const char* p = job_[njob_].p;
     int arg = job_[njob_].arg;
 
@@ -160,13 +160,14 @@
     // manipulation.
     if (0) {
     CheckAndLoop:
-      if (!ShouldVisit(ip, p))
+      if (!ShouldVisit(id, p))
         continue;
     }
 
     // Visit ip, p.
     // VLOG(0) << "Job: " << ip->id() << " "
     //         << (p - text_.begin()) << " " << arg;
+    Prog::Inst* ip = prog_->inst(id);
     switch (ip->opcode()) {
       case kInstFail:
       default:
@@ -183,14 +184,14 @@
         // ip with arg==1 as a reminder to push ip->out1() later.
         switch (arg) {
           case 0:
-            Push(ip, p, 1);  // come back when we're done
-            ip = ip->out();
+            Push(id, p, 1);  // come back when we're done
+            id = ip->out();
             goto CheckAndLoop;
 
           case 1:
             // Finished ip->out(); try ip->out1().
             arg = 0;
-            ip = ip->out1();
+            id = ip->out1();
             goto CheckAndLoop;
         }
         LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
@@ -198,16 +199,16 @@
 
       case kInstAltMatch:
         // One opcode is byte range; the other leads to match.
-        if (ip->greedy()) {
+        if (ip->greedy(prog_)) {
           // out1 is the match
           Push(ip->out1(), p, 0);
-          ip = ip->out1();
+          id = ip->out1();
           p = end;
           goto CheckAndLoop;
         }
         // out is the match - non-greedy
         Push(ip->out(), end, 0);
-        ip = ip->out();
+        id = ip->out();
         goto CheckAndLoop;
 
       case kInstByteRange: {
@@ -215,7 +216,7 @@
         if (p < end)
           c = *p & 0xFF;
         if (ip->Matches(c)) {
-          ip = ip->out();
+          id = ip->out();
           p++;
           goto CheckAndLoop;
         }
@@ -227,11 +228,11 @@
           case 0:
             if (0 <= ip->cap() && ip->cap() < ncap_) {
               // Capture p to register, but save old value.
-              Push(ip, cap_[ip->cap()], 1);  // come back when we're done
+              Push(id, cap_[ip->cap()], 1);  // come back when we're done
               cap_[ip->cap()] = p;
             }
             // Continue on.
-            ip = ip->out();
+            id = ip->out();
             goto CheckAndLoop;
           case 1:
             // Finished ip->out(); restore the old value.
@@ -244,11 +245,11 @@
       case kInstEmptyWidth:
         if (ip->empty() & ~Prog::EmptyFlags(context_, p))
           continue;
-        ip = ip->out();
+        id = ip->out();
         goto CheckAndLoop;
 
       case kInstNop:
-        ip = ip->out();
+        id = ip->out();
         goto CheckAndLoop;
 
       case kInstMatch: {
diff --git a/re2/compile.cc b/re2/compile.cc
index 8f6ef36..08871c2 100644
--- a/re2/compile.cc
+++ b/re2/compile.cc
@@ -19,74 +19,97 @@
 // we can use the Inst* word to hold the list's "next" pointer.
 // It's kind of sleazy, but it works well in practice.
 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
+//
+// Because the out and out1 fields in Inst are no longer pointers,
+// we can't use pointers directly here either.  Instead, p refers
+// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
+// p == 0 represents the NULL list.  This is okay because instruction #0
+// is always the fail instruction, which never appears on a list.
 
 struct PatchList {
-  union {
-    PatchList* next;
-    Inst* val;
-  };
+  uint32 p;
 
   // Returns patch list containing just p.
-  static PatchList* Mk(Inst** p);
+  static PatchList Mk(uint32 p);
 
   // Patches all the entries on l to have value v.
   // Caller must not ever use patch list again.
-  static void Patch(PatchList* l, Inst* v);
+  static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
+
+  // Deref returns the next pointer pointed at by p.
+  static PatchList Deref(Prog::Inst *inst0, PatchList l);
 
   // Appends two patch lists and returns result.
-  static PatchList* Append(PatchList* l1, PatchList* l2);
+  static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
 };
 
+static PatchList nullPatchList = { 0 };
+
 // Returns patch list containing just p.
-PatchList* PatchList::Mk(Inst** p) {
-  COMPILE_ASSERT(sizeof(PatchList) == sizeof(*p), PatchListAssert);
-  return reinterpret_cast<PatchList*>(p);
+PatchList PatchList::Mk(uint32 p) {
+  PatchList l;
+  l.p = p;
+  return l;
+}
+
+// Returns the next pointer pointed at by l.
+PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
+  Prog::Inst* ip = &inst0[l.p>>1];
+  if (l.p&1)
+    l.p = ip->out1();
+  else
+    l.p = ip->out();
+  return l;
 }
 
 // Patches all the entries on l to have value v.
-void PatchList::Patch(PatchList* l, Inst* val) {
-  PatchList* next;
-  for (PatchList *ll = l; ll; ll = next) {
-    next = ll->next;
-    ll->val = val;
+void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
+  while (l.p != 0) {
+    Prog::Inst* ip = &inst0[l.p>>1];
+    if (l.p&1) {
+      l.p = ip->out1();
+      ip->out1_ = val;
+    } else {
+      l.p = ip->out();
+      ip->set_out(val);
+    }
   }
 }
 
 // Appends two patch lists and returns result.
-PatchList* PatchList::Append(PatchList* l1, PatchList* l2) {
-  if (l1 == NULL)
+PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
+  if (l1.p == 0)
     return l2;
-  if (l2 == NULL)
+  if (l2.p == 0)
     return l1;
 
-  PatchList *l = l1;
-  while (l->next)
-    l = l->next;
-  l->next = l2;
+  PatchList l = l1;
+  for (;;) {
+    PatchList next = PatchList::Deref(inst0, l);
+    if (next.p == 0)
+      break;
+    l = next;
+  }
+  
+  Prog::Inst* ip = &inst0[l.p>>1];
+  if(l.p&1)
+    ip->out1_ = l2.p;
+  else
+    ip->set_out(l2.p);
+
   return l1;
 }
 
-
 // Compiled program fragment.
 struct Frag {
-  Inst* begin;
-  PatchList* end;
+  uint32 begin;
+  PatchList end;
 
-
-  Frag() : begin(NULL), end(NULL) {}  // needed so Frag can go in vector
-  Frag(Inst* begin, PatchList* end) : begin(begin), end(end) {}
+  Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
+  Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
 };
 
-static Frag kNullFrag(NULL, NULL);
-
-// Rune cache entry.
-struct RuneCacheLine {
-  uint64 key;
-  Inst* value;
-
-  RuneCacheLine() : key(0), value(NULL) {}  // so RuneCacheLine can go in vector
-  RuneCacheLine(uint64 key, Inst* value) : key(key), value(value) {}
-};
+static Frag kNullFrag;
 
 // Input encodings.
 enum Encoding {
@@ -146,12 +169,13 @@
   // Returns a fragment matching an empty-width special op.
   Frag EmptyWidth(EmptyOp op);
 
-  // Maintains the instruction budget.
-  // Must be called before each AllocInst to check whether
-  // it is okay to allocate a new instruction.
-  // Adds 1 to inst_count_, returning true if it hasn't reached max_inst_.
-  // Otherwise returns false, sets failed_ = true.
-  bool CanAllocInst();
+  // Adds n instructions to the program.
+  // Returns the index of the first one.
+  // Returns -1 if no more instructions are available.
+  int AllocInst(int n);
+  
+  // Deletes unused instructions.
+  void Trim();
 
   // Rune range compiler.
 
@@ -165,11 +189,11 @@
   void Add_80_10ffff();
 
   // New suffix that matches the byte range lo-hi, then goes to next.
-  Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
-  Inst* UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
+  int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
+  int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
 
   // Adds a suffix to alternation.
-  void AddSuffix(Inst* ip);
+  void AddSuffix(int id);
 
   // Returns the alternation of all the added suffixes.
   Frag EndRange();
@@ -179,15 +203,17 @@
 
  private:
   Prog* prog_;         // Program being built.
-  Inst* fail_inst_;    // kInstFail instruction in program
   bool failed_;        // Did we give up compiling?
   Encoding encoding_;  // Input encoding
   bool reversed_;      // Should program run backward over text?
 
-  int64 inst_count_;   // Number of instructions allocated.
-  int64 max_inst_;     // Maximum number of instructions.
+  int max_inst_;       // Maximum number of instructions.
+  
+  Prog::Inst* inst_;   // Pointer to first instruction.
+  int inst_len_;       // Number of instructions used.
+  int inst_cap_;       // Number of instructions allocated.
 
-  map<uint64, Inst*> rune_cache_;
+  map<uint64, int> rune_cache_;
   Frag rune_range_;
 
   DISALLOW_EVIL_CONSTRUCTORS(Compiler);
@@ -195,27 +221,53 @@
 
 Compiler::Compiler() {
   prog_ = new Prog();
-  fail_inst_ = prog_->AllocInst();
-  fail_inst_->InitFail();
   failed_ = false;
   encoding_ = kEncodingUTF8;
   reversed_ = false;
-  inst_count_ = 1;
+  inst_ = NULL;
+  inst_len_ = 0;
+  inst_cap_ = 0;
+  max_inst_ = 1;  // make AllocInst for fail instruction okay
+  int fail = AllocInst(1);
+  inst_[fail].InitFail();
   max_inst_ = 0;  // Caller must change
 }
 
 Compiler::~Compiler() {
   delete prog_;
+  delete[] inst_;
 }
 
-bool Compiler::CanAllocInst() {
-  if (failed_)
-    return false;
-  if (inst_count_++ > max_inst_) {
+int Compiler::AllocInst(int n) {
+  if (failed_ || inst_len_ + n > max_inst_) {
     failed_ = true;
-    return false;
+    return -1;
   }
-  return true;
+
+  if (inst_len_ + n > inst_cap_) {
+    if (inst_cap_ == 0)
+      inst_cap_ = 8;
+    while (inst_len_ + n > inst_cap_)
+      inst_cap_ *= 2;
+    Prog::Inst* ip = new Prog::Inst[inst_cap_];
+    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
+    memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
+    delete[] inst_;
+    inst_ = ip;
+  }
+  int id = inst_len_;
+  inst_len_ += n;
+  return id;
+}
+
+void Compiler::Trim() {
+  if (inst_len_ < inst_cap_) {
+    Prog::Inst* ip = new Prog::Inst[inst_len_];
+    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
+    delete[] inst_;
+    inst_ = ip;
+    inst_cap_ = inst_len_;
+  }
 }
 
 // These routines are somewhat hard to visualize in text --
@@ -224,12 +276,12 @@
 
 // Returns an unmatchable fragment.
 Frag Compiler::NoMatch() {
-  return Frag(fail_inst_, NULL);
+  return Frag(0, nullPatchList);
 }
 
 // Is a an unmatchable fragment?
 static bool IsNoMatch(Frag a) {
-  return a.begin == NULL || a.begin->opcode() == kInstFail;
+  return a.begin == 0;
 }
 
 // Given fragments a and b, returns fragment for ab.
@@ -238,20 +290,21 @@
     return NoMatch();
 
   // Elide no-op.
-  if (a.begin->opcode() == kInstNop &&
-      a.end == PatchList::Mk(&a.begin->out_) &&
-      a.begin->out() == NULL) {
-    PatchList::Patch(a.end, b.begin);  // in case refs to a somewhere
+  Prog::Inst* begin = &inst_[a.begin];
+  if (begin->opcode() == kInstNop &&
+      a.end.p == (a.begin << 1) &&
+      begin->out() == 0) {
+    PatchList::Patch(inst_, a.end, b.begin);  // in case refs to a somewhere
     return b;
   }
 
   // To run backward over string, reverse all concatenations.
   if (reversed_) {
-    PatchList::Patch(b.end, a.begin);
+    PatchList::Patch(inst_, b.end, a.begin);
     return Frag(b.begin, a.end);
   }
 
-  PatchList::Patch(a.end, b.begin);
+  PatchList::Patch(inst_, a.end, b.begin);
   return Frag(a.begin, b.end);
 }
 
@@ -263,12 +316,12 @@
   if (IsNoMatch(b))
     return a;
 
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
 
-  Inst* ip = prog_->AllocInst();
-  ip->InitAlt(a.begin, b.begin);
-  return Frag(ip, PatchList::Append(a.end, b.end));
+  inst_[id].InitAlt(a.begin, b.begin);
+  return Frag(id, PatchList::Append(inst_, a.end, b.end));
 }
 
 // When capturing submatches in like-Perl mode, a kOpAlt Inst
@@ -280,18 +333,17 @@
 
 // Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
 Frag Compiler::Star(Frag a, bool nongreedy) {
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
-
-  Inst* ip = prog_->AllocInst();
-  ip->InitAlt(NULL, NULL);
-  PatchList::Patch(a.end, ip);
+  inst_[id].InitAlt(0, 0);
+  PatchList::Patch(inst_, a.end, id);
   if (nongreedy) {
-    ip->out1_ = a.begin;
-    return Frag(ip, PatchList::Mk(&ip->out_));
+    inst_[id].out1_ = a.begin;
+    return Frag(id, PatchList::Mk(id << 1));
   } else {
-    ip->out_ = a.begin;
-    return Frag(ip, PatchList::Mk(&ip->out1_));
+    inst_[id].set_out(a.begin);
+    return Frag(id, PatchList::Mk((id << 1) | 1));
   }
 }
 
@@ -304,29 +356,27 @@
 
 // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
 Frag Compiler::Quest(Frag a, bool nongreedy) {
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
-
-  Inst* ip = prog_->AllocInst();
-  PatchList* pl;
+  PatchList pl;
   if (nongreedy) {
-    ip->InitAlt(NULL, a.begin);
-    pl = PatchList::Mk(&ip->out_);
+    inst_[id].InitAlt(0, a.begin);
+    pl = PatchList::Mk(id << 1);
   } else {
-    ip->InitAlt(a.begin, NULL);
-    pl = PatchList::Mk(&ip->out1_);
+    inst_[id].InitAlt(a.begin, 0);
+    pl = PatchList::Mk((id << 1) | 1);
   }
-  return Frag(ip, PatchList::Append(pl, a.end));
+  return Frag(id, PatchList::Append(inst_, pl, a.end));
 }
 
 // Returns a fragment for the byte range lo-hi.
 Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
-
+  inst_[id].InitByteRange(lo, hi, foldcase, NULL);
   prog_->byte_inst_count_++;
-  Inst* ip = prog_->AllocInst();
-  ip->InitByteRange(lo, hi, foldcase, NULL);
   prog_->MarkByteRange(lo, hi);
   if (foldcase && lo <= 'z' && hi >= 'a') {
     if (lo < 'a')
@@ -336,36 +386,33 @@
     if (lo <= hi)
       prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
   }
-  return Frag(ip, PatchList::Mk(&ip->out_));
+  return Frag(id, PatchList::Mk(id << 1));
 }
 
 // Returns a no-op fragment.  Sometimes unavoidable.
 Frag Compiler::Nop() {
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
-
-  Inst* ip = prog_->AllocInst();
-  ip->InitNop(NULL);
-  return Frag(ip, PatchList::Mk(&ip->out_));
+  inst_[id].InitNop(0);
+  return Frag(id, PatchList::Mk(id << 1));
 }
 
 // Returns a fragment that signals a match.
 Frag Compiler::Match() {
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
-
-  Inst* ip = prog_->AllocInst();
-  ip->InitMatch();
-  return Frag(ip, NULL);
+  inst_[id].InitMatch();
+  return Frag(id, nullPatchList);
 }
 
 // Returns a fragment matching a particular empty-width op (like ^ or $)
 Frag Compiler::EmptyWidth(EmptyOp empty) {
-  if (!CanAllocInst())
+  int id = AllocInst(1);
+  if (id < 0)
     return NoMatch();
-
-  Inst* ip = prog_->AllocInst();
-  ip->InitEmptyWidth(empty, NULL);
+  inst_[id].InitEmptyWidth(empty, 0);
   if (empty & (kEmptyBeginLine|kEmptyEndLine))
     prog_->MarkByteRange('\n', '\n');
   if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
@@ -376,22 +423,19 @@
       prog_->MarkByteRange(i, j-1);
     }
   }
-  return Frag(ip, PatchList::Mk(&ip->out_));
+  return Frag(id, PatchList::Mk(id << 1));
 }
 
 // Given a fragment a, returns a fragment with capturing parens around a.
 Frag Compiler::Capture(Frag a, int n) {
-  if (!CanAllocInst() || !CanAllocInst())
+  int id = AllocInst(2);
+  if (id < 0)
     return NoMatch();
+  inst_[id].InitCapture(2*n, a.begin);
+  inst_[id+1].InitCapture(2*n+1, 0);
+  PatchList::Patch(inst_, a.end, id+1);
 
-  Inst* left = prog_->AllocInst();
-  left->InitCapture(2*n, a.begin);
-
-  Inst* right = prog_->AllocInst();
-  right->InitCapture(2*n+1, NULL);
-  PatchList::Patch(a.end, right);
-
-  return Frag(left, PatchList::Mk(&right->out_));
+  return Frag(id, PatchList::Mk((id+1) << 1));
 }
 
 // A Rune is a name for a Unicode code point.
@@ -416,20 +460,20 @@
 void Compiler::BeginRange() {
   rune_cache_.clear();
   rune_range_.begin = NULL;
-  rune_range_.end = NULL;
+  rune_range_.end = nullPatchList;
 }
 
-Inst* Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next) {
+int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
   Frag f = ByteRange(lo, hi, foldcase);
-  if (next != NULL) {
-    PatchList::Patch(f.end, next);
+  if (next != 0) {
+    PatchList::Patch(inst_, f.end, next);
   } else {
-    rune_range_.end = PatchList::Append(rune_range_.end, f.end);
+    rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
   }
   return f.begin;
 }
 
-Inst* Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next) {
+int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
   // In Latin1 mode, there's no point in caching.
   // In forward UTF-8 mode, only need to cache continuation bytes.
   if (encoding_ == kEncodingLatin1 ||
@@ -437,26 +481,27 @@
     return UncachedRuneByteSuffix(lo, hi, foldcase, next);
   }
 
-  uint64 key =  ((uint64)next->id() << 17) | (lo<<9) | (hi<<1) | foldcase;
-  map<uint64, Inst*>::iterator it = rune_cache_.find(key);
+  uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
+  map<uint64, int>::iterator it = rune_cache_.find(key);
   if (it != rune_cache_.end())
     return it->second;
-  Inst* inst = UncachedRuneByteSuffix(lo, hi, foldcase, next);
-  rune_cache_[key] = inst;
-  return inst;
+  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
+  rune_cache_[key] = id;
+  return id;
 }
 
-void Compiler::AddSuffix(Inst* ip) {
-  if (rune_range_.begin == NULL) {
-    rune_range_.begin = ip;
+void Compiler::AddSuffix(int id) {
+  if (rune_range_.begin == 0) {
+    rune_range_.begin = id;
     return;
   }
 
-  if (!CanAllocInst())
-    rune_range_.begin = fail_inst_;
-
-  Inst* alt = prog_->AllocInst();
-  alt->InitAlt(rune_range_.begin, ip);
+  int alt = AllocInst(1);
+  if (alt < 0) {
+    rune_range_.begin = 0;
+    return;
+  }
+  inst_[alt].InitAlt(rune_range_.begin, id);
   rune_range_.begin = alt;
 }
 
@@ -488,7 +533,7 @@
     return;
   if (hi > 0xFF)
     hi = 0xFF;
-  AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
+  AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
 }
 
 // Table describing how to make a UTF-8 matching machine
@@ -523,10 +568,10 @@
 };
 
 void Compiler::Add_80_10ffff() {
-  Inst* inst[arraysize(prog_80_10ffff)];
+  int inst[arraysize(prog_80_10ffff)];
   for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
     const ByteRangeProg& p = prog_80_10ffff[i];
-    Inst* next = NULL;
+    int next = 0;
     if (p.next >= 0)
       next = inst[p.next];
     inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
@@ -558,7 +603,7 @@
 
   // ASCII range is always a special case.
   if (hi < Runeself) {
-    AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
+    AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
     return;
   }
 
@@ -583,17 +628,18 @@
   uint8 ulo[UTFmax], uhi[UTFmax];
   int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
   int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
+  (void)m;  // USED(m)
   DCHECK_EQ(n, m);
 
-  Inst* ip = NULL;
+  int id = 0;
   if (reversed_) {
     for (int i = 0; i < n; i++)
-      ip = RuneByteSuffix(ulo[i], uhi[i], false, ip);
+      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
   } else {
     for (int i = n-1; i >= 0; i--)
-      ip = RuneByteSuffix(ulo[i], uhi[i], false, ip);
+      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
   }
-  AddSuffix(ip);
+  AddSuffix(id);
 }
 
 // Should not be called.
@@ -853,7 +899,7 @@
     // No room for anything.
     c.max_inst_ = 0;
   } else {
-    int64 m = (max_mem - sizeof(Prog)) / sizeof(Inst);
+    int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
     // Limit instruction count so that inst->id() fits nicely in an int.
     // SparseArray also assumes that the indices (inst->id()) are ints.
     // The call to WalkExponential uses 2*c.max_inst_ below,
@@ -864,6 +910,11 @@
     // on the program.)
     if (m >= 1<<24)
       m = 1<<24;
+    
+    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
+    if (m > Prog::Inst::kMaxInst)
+      m = Prog::Inst::kMaxInst;
+
     c.max_inst_ = m;
   }
 
@@ -909,6 +960,17 @@
     Frag unanchored = c.Cat(dotloop, all);
     c.prog_->set_start_unanchored(unanchored.begin);
   }
+  
+  if (c.prog_->start() == 0 && c.prog_->start_unanchored() == 0) {
+    // No possible matches; keep Fail instruction only.
+    c.inst_len_ = 1;
+  }
+
+  // Trim instruction to minimum array and transfer to Prog.
+  c.Trim();
+  c.prog_->inst_ = c.inst_;
+  c.prog_->size_ = c.inst_len_;
+  c.inst_ = NULL;
 
   // Record whether prog is reversed.
   c.prog_->set_reversed(reversed);
@@ -922,7 +984,7 @@
   if (max_mem <= 0) {
     c.prog_->set_dfa_mem(1<<20);
   } else {
-    int64 m = max_mem - sizeof(Prog) - c.inst_count_*sizeof(Inst);
+    int64 m = max_mem - sizeof(Prog) - c.inst_len_*sizeof(Prog::Inst);
     if (m < 0)
       m = 0;
     c.prog_->set_dfa_mem(m);
diff --git a/re2/dfa.cc b/re2/dfa.cc
index 7b7ec7b..8e8daf6 100644
--- a/re2/dfa.cc
+++ b/re2/dfa.cc
@@ -25,7 +25,7 @@
 #include "re2/stringpiece.h"
 #include "util/atomicops.h"
 #include "util/flags.h"
-#include "util/sparse_array.h"
+#include "util/sparse_set.h"
 
 DEFINE_bool(re2_dfa_bail_when_slow, true,
             "Whether the RE2 DFA should bail out early "
@@ -95,7 +95,7 @@
   // byte c, the next state should be s->next_[c].
   struct State {
     inline bool IsMatch() const { return flag_ & kFlagMatch; }
-    Inst** inst_;       // Instruction pointers in the state.
+    int* inst_;         // Instruction pointers in the state.
     int ninst_;         // # of inst_ pointers.
     uint flag_;         // Empty string bitfield flags in effect on the way
                         // into this state, along with kFlagMatch if this
@@ -180,7 +180,7 @@
 
   // Looks up and returns a State matching the inst, ninst, and flag.
   // L >= mutex_
-  State* CachedState(Inst** inst, int ninst, uint flag);
+  State* CachedState(int* inst, int ninst, uint flag);
 
   // Clear the cache entirely.
   // Must hold cache_mutex_.w or be in destructor.
@@ -201,16 +201,16 @@
   void RunWorkqOnByte(Workq* q, Workq* nq,
                              int c, uint flag, bool* ismatch,
                              Prog::MatchKind kind,
-                             Inst* new_byte_loop);
+                             int new_byte_loop);
 
   // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
   // L >= mutex_
   void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag);
 
-  // Adds the instruction ip to the Workq, following empty arrows
+  // Adds the instruction id to the Workq, following empty arrows
   // according to flag.
   // L >= mutex_
-  void AddToQueue(Workq* q, Inst* ip, uint flag);
+  void AddToQueue(Workq* q, int id, uint flag);
 
   // For debugging, returns a text representation of State.
   static string DumpState(State* state);
@@ -309,7 +309,7 @@
   // Constant after initialization.
   Prog* prog_;              // The regular expression program to run.
   Prog::MatchKind kind_;    // The kind of DFA.
-  Inst* start_unanchored_;  // start of unanchored program
+  int start_unanchored_;  // start of unanchored program
   bool init_failed_;        // initialization failed (out of memory)
 
   Mutex mutex_;  // mutex_ >= cache_mutex_.r
@@ -317,7 +317,7 @@
   // Scratch areas, protected by mutex_.
   Workq* q0_;             // Two pre-allocated work queues.
   Workq* q1_;
-  Inst** astack_;         // Pre-allocated stack for AddToQueue
+  int* astack_;         // Pre-allocated stack for AddToQueue
   int nastack_;
 
   // State* cache.  Many threads use and add to the cache simultaneously,
@@ -343,31 +343,30 @@
 
 // Marks separate thread groups of different priority
 // in the work queue when in leftmost-longest matching mode.
-// NOTE(rsc): The code would work if Mark == NULL too,
-// but I want to catch unexpected NULLs from other sources.
-// Mark is never dereferenced.
-#define Mark reinterpret_cast<Inst*>(1)
+#define Mark (-1)
 
 // Internally, the DFA uses a sparse array of
 // program instruction pointers as a work queue.
 // In leftmost longest mode, marks separate sections
 // of workq that started executing at different
 // locations in the string (earlier locations first).
-class DFA::Workq : public SparseArray<Inst*> {
+class DFA::Workq : public SparseSet {
  public:
   // Constructor: n is number of normal slots, maxmark number of mark slots.
   Workq(int n, int maxmark) :
-    SparseArray<Inst*>(n+maxmark),
+    SparseSet(n+maxmark),
     n_(n),
     maxmark_(maxmark),
     nextmark_(n),
     last_was_mark_(true) {
   }
 
+  bool is_mark(int i) { return i >= n_; }
+
   int maxmark() { return maxmark_; }
 
   void clear() {
-    SparseArray<Inst*>::clear();
+    SparseSet::clear();
     nextmark_ = n_;
   }
 
@@ -375,22 +374,22 @@
     if (last_was_mark_)
       return;
     last_was_mark_ = false;
-    SparseArray<Inst*>::set_new(nextmark_++, Mark);
+    SparseSet::insert_new(nextmark_++);
   }
 
   int size() {
     return n_ + maxmark_;
   }
 
-  void set(int id, Inst* inst) {
-    if (has_index(id))
+  void insert(int id) {
+    if (contains(id))
       return;
-    set_new(id, inst);
+    insert_new(id);
   }
 
-  void set_new(int id, Inst* inst) {
+  void insert_new(int id) {
     last_was_mark_ = false;
-    SparseArray<Inst*>::set_new(id, inst);
+    SparseSet::insert_new(id);
   }
 
  private:
@@ -423,8 +422,8 @@
   // Account for space needed for DFA, q0, q1, astack.
   mem_budget_ -= sizeof(DFA);
   mem_budget_ -= (prog_->size() + nmark) *
-                 (sizeof(int)+sizeof(int)+sizeof(Inst*)) * 2;  // q0, q1
-  mem_budget_ -= nastack_ * sizeof(Inst*);  // astack
+                 (sizeof(int)+sizeof(int)) * 2;  // q0, q1
+  mem_budget_ -= nastack_ * sizeof(int);  // astack
   if (mem_budget_ < 0) {
     LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
                               prog_->size(), max_mem);
@@ -438,7 +437,7 @@
   // At minimum, the search requires room for two states in order
   // to limp along, restarting frequently.  We'll get better performance
   // if there is room for a larger number of states, say 20.
-  int one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(Inst*) +
+  int one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) +
                   (prog_->bytemap_range()+1)*sizeof(State*);
   if (state_budget_ < 20*one_state) {
     LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
@@ -449,7 +448,7 @@
 
   q0_ = new Workq(prog->size(), nmark);
   q1_ = new Workq(prog->size(), nmark);
-  astack_ = new Inst*[nastack_];
+  astack_ = new int[nastack_];
 }
 
 DFA::~DFA() {
@@ -478,11 +477,11 @@
   string s;
   const char* sep = "";
   for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) {
-    if (it->value() == Mark) {
+    if (q->is_mark(*it)) {
       StringAppendF(&s, "|");
       sep = "";
     } else {
-      StringAppendF(&s, "%s%d", sep, it->value()->id());
+      StringAppendF(&s, "%s%d", sep, *it);
       sep = ",";
     }
   }
@@ -505,7 +504,7 @@
       StringAppendF(&s, "|");
       sep = "";
     } else {
-      StringAppendF(&s, "%s%d", sep, state->inst_[i]->id());
+      StringAppendF(&s, "%s%d", sep, state->inst_[i]);
       sep = ",";
     }
   }
@@ -524,9 +523,9 @@
 // requirements and also avoids duplication of effort across the two
 // identical states.
 //
-// A State is defined by an ordered list of Inst* pointers and a flag word.
+// A State is defined by an ordered list of instruction ids and a flag word.
 //
-// The choice of an ordered list of Inst* pointers differs from a typical
+// The choice of an ordered list of instructions differs from a typical
 // textbook DFA implementation, which would use an unordered set.
 // Textbook descriptions, however, only care about whether
 // the DFA matches, not where it matches in the text.  To decide where the
@@ -534,7 +533,7 @@
 // implementations like PCRE, which try one possible regular expression
 // execution, then another, then another, stopping when one of them succeeds.
 // The DFA execution tries these many executions in parallel, representing
-// each by an Inst* pointer.  These pointers are ordered in the State.inst_
+// each by an instruction id.  These pointers are ordered in the State.inst_
 // list in the same order that the executions would happen in a backtracking
 // search: if a match is found during execution of inst_[2], inst_[i] for i>=3
 // can be discarded.
@@ -552,8 +551,8 @@
 // any kInstEmptyWidth instructions in the state.  These provide a useful
 // summary indicating when new flags might be useful.
 //
-// The permanent representation of a State's Inst* pointers is just an array,
-// but while a state is being analyzed, these Inst* pointers are represented
+// The permanent representation of a State's instruction ids is just an array,
+// but while a state is being analyzed, these instruction ids are represented
 // as a Workq, which is an array that allows iteration in insertion order.
 
 // NOTE(rsc): The choice of State construction determines whether the DFA
@@ -562,9 +561,9 @@
 // prescribed by POSIX).  This implementation chooses to mimic the
 // backtracking implementations, because we want to replace PCRE.  To get
 // POSIX behavior, the states would need to be considered not as a simple
-// ordered list of Inst* pointers, but as a list of unordered sets of Inst*
-// pointers.  A match by a state in one set would inhibit the running of sets
-// farther down the list but not other Inst* pointers in the same set.  Each
+// ordered list of instruction ids, but as a list of unordered sets of instruction
+// ids.  A match by a state in one set would inhibit the running of sets
+// farther down the list but not other instruction ids in the same set.  Each
 // set would correspond to matches beginning at a given point in the string.
 // This is implemented by separating different sets with Mark pointers.
 
@@ -575,28 +574,29 @@
   if (DEBUG_MODE)
     mutex_.AssertHeld();
 
-  // Construct array of Inst* for the new state.
+  // Construct array of instruction ids for the new state.
   // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
   // those are the only operators with any effect in
   // RunWorkqOnEmptyString or RunWorkqOnByte.
-  Inst** inst = new Inst*[q->size()];
+  int* inst = new int[q->size()];
   int n = 0;
   uint needflags = 0;     // flags needed by kInstEmptyWidth instructions
   bool sawmatch = false;  // whether queue contains guaranteed kInstMatch
   if (DebugDFA)
     fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
   for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
-    Inst* ip = it->value();
-    if (sawmatch && (kind_ == Prog::kFirstMatch || ip == Mark))
+    int id = *it;
+    if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
       break;
-    if (ip == Mark) {
+    if (q->is_mark(id)) {
       if (n > 0 && inst[n-1] != Mark)
         inst[n++] = Mark;
       continue;
     }
+    Prog::Inst* ip = prog_->inst(id);
     switch (ip->opcode()) {
       case kInstAltMatch:
-        if (kind_ != Prog::kFirstMatch || (it == q->begin() && ip->greedy())) {
+        if (kind_ != Prog::kFirstMatch || (it == q->begin() && ip->greedy(prog_))) {
           delete[] inst;
           return FullMatchState;
         }
@@ -605,7 +605,7 @@
       case kInstEmptyWidth:
       case kInstMatch:
       case kInstAlt:          // Not useful, but necessary [*]
-        inst[n++] = it->value();
+        inst[n++] = *it;
         if (ip->opcode() == kInstEmptyWidth)
           needflags |= ip->empty();
         if (ip->opcode() == kInstMatch && !prog_->anchor_end())
@@ -664,10 +664,10 @@
   // unordered state sets separated by Marks.  Sort each set
   // to canonicalize, to reduce the number of distinct sets stored.
   if (kind_ == Prog::kLongestMatch) {
-    Inst** ip = inst;
-    Inst** ep = ip + n;
+    int* ip = inst;
+    int* ep = ip + n;
     while (ip < ep) {
-      Inst** markp = ip;
+      int* markp = ip;
       while (markp < ep && *markp != Mark)
         markp++;
       sort(ip, markp);
@@ -688,7 +688,7 @@
 // Looks in the State cache for a State matching inst, ninst, flag.
 // If one is found, returns it.  If one is not found, allocates one,
 // inserts it in the cache, and returns it.
-DFA::State* DFA::CachedState(Inst** inst, int ninst, uint flag) {
+DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) {
   if (DEBUG_MODE)
     mutex_.AssertHeld();
 
@@ -707,7 +707,7 @@
   // State*, empirically.
   const int kStateCacheOverhead = 32;
   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
-  int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(Inst*);
+  int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int);
   if (mem_budget_ < mem + kStateCacheOverhead) {
     mem_budget_ = -1;
     return NULL;
@@ -718,7 +718,7 @@
   char* space = new char[mem];
   State* s = reinterpret_cast<State*>(space);
   s->next_ = reinterpret_cast<State**>(s + 1);
-  s->inst_ = reinterpret_cast<Inst**>(s->next_ + nnext);
+  s->inst_ = reinterpret_cast<int*>(s->next_ + nnext);
   memset(s->next_, 0, nnext*sizeof s->next_[0]);
   memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
   s->ninst_ = ninst;
@@ -752,13 +752,13 @@
     if (s->inst_[i] == Mark)
       q->mark();
     else
-      q->set_new(s->inst_[i]->id(), s->inst_[i]);
+      q->insert_new(s->inst_[i]);
   }
 }
 
 // Adds ip to the work queue, following empty arrows according to flag
 // and expanding kInstAlt instructions (two-target gotos).
-void DFA::AddToQueue(Workq* q, Inst* ip, uint flag) {
+void DFA::AddToQueue(Workq* q, int id, uint flag) {
 
   // Use astack_ to hold our stack of states yet to process.
   // It is sized to have room for nastack_ == 2*prog->size() + nmark
@@ -766,33 +766,34 @@
   // processed by the switch below only once, and the processing
   // pushes at most two instructions plus maybe a mark.
   // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.)
-  Inst** stk = astack_;
+  int* stk = astack_;
   int nstk = 0;
 
-  stk[nstk++] = ip;
+  stk[nstk++] = id;
   while (nstk > 0) {
     DCHECK_LE(nstk, nastack_);
-    ip = stk[--nstk];
+    id = stk[--nstk];
 
-    if (ip == Mark) {
+    if (id == Mark) {
       q->mark();
       continue;
     }
 
-    if (ip == NULL || ip->opcode() == kInstFail)
+    if (id == 0)
       continue;
 
     // If ip is already on the queue, nothing to do.
     // Otherwise add it.  We don't actually keep all the ones
     // that get added -- for example, kInstAlt is ignored
     // when on a work queue -- but adding all ip's here
-    // increases the likelihood of q->has_index(ip->id()),
+    // increases the likelihood of q->contains(id),
     // reducing the amount of duplicated work.
-    if (q->has_index(ip->id()))
+    if (q->contains(id))
       continue;
-    q->set_new(ip->id(), ip);
+    q->insert_new(id);
 
     // Process instruction.
+    Prog::Inst* ip = prog_->inst(id);
     switch (ip->opcode()) {
       case kInstFail:       // can't happen: discarded above
         break;
@@ -816,7 +817,7 @@
         // than the current ones.
         stk[nstk++] = ip->out1();
         if (q->maxmark() > 0 &&
-            ip == prog_->start_unanchored() && ip != prog_->start())
+            id == prog_->start_unanchored() && id != prog_->start())
           stk[nstk++] = Mark;
         stk[nstk++] = ip->out();
         break;
@@ -847,8 +848,12 @@
 // exhibited by existing implementations).
 void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) {
   newq->clear();
-  for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i)
-    AddToQueue(newq, i->value(), flag);
+  for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
+    if (oldq->is_mark(*i))
+      AddToQueue(newq, Mark, flag);
+    else
+      AddToQueue(newq, *i, flag);
+  }
 }
 
 // Runs the work queue, processing the single byte c followed by any empty
@@ -858,19 +863,20 @@
 void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
                          int c, uint flag, bool* ismatch,
                          Prog::MatchKind kind,
-                         Inst* new_byte_loop) {
+                         int new_byte_loop) {
   if (DEBUG_MODE)
     mutex_.AssertHeld();
 
   newq->clear();
   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
-    Inst* ip = i->value();
-    if (ip == Mark) {
+    if (oldq->is_mark(*i)) {
       if (*ismatch)
         return;
       newq->mark();
       continue;
     }
+    int id = *i;
+    Prog::Inst* ip = prog_->inst(id);
     switch (ip->opcode()) {
       case kInstFail:        // never succeeds
       case kInstCapture:     // already followed
@@ -1140,7 +1146,7 @@
 
  private:
   DFA* dfa_;         // the DFA to use
-  Inst** inst_;      // saved info from State
+  int* inst_;        // saved info from State
   int ninst_;
   uint flag_;
   bool is_special_;  // whether original state was special
@@ -1163,7 +1169,7 @@
   special_ = NULL;
   flag_ = state->flag_;
   ninst_ = state->ninst_;
-  inst_ = new Inst*[ninst_];
+  inst_ = new int[ninst_];
   memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
 }
 
diff --git a/re2/nfa.cc b/re2/nfa.cc
index c0d6ffc..2b1786f 100644
--- a/re2/nfa.cc
+++ b/re2/nfa.cc
@@ -27,6 +27,7 @@
 #include "re2/prog.h"
 #include "re2/regexp.h"
 #include "util/sparse_array.h"
+#include "util/sparse_set.h"
 
 namespace re2 {
 
@@ -55,7 +56,7 @@
  private:
   struct Thread {
     union {
-      Inst* ip;
+      int id;
       Thread* next;  // when on free list
     };
     const char** capture;
@@ -63,16 +64,16 @@
 
   // State for explicit stack in AddToThreadq.
   struct AddState {
-    Inst* ip;           // Inst to process
-    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
+    int id;           // Inst to process
     int j;
+    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
 
     AddState()
-      : ip(NULL), cap_j(NULL), j(-1) {}
-    explicit AddState(Inst* ip)
-      : ip(ip), cap_j(NULL), j(-1) {}
-    AddState(Inst* ip, const char* cap_j, int j)
-      : ip(ip), cap_j(cap_j), j(j) {}
+      : id(0), j(-1), cap_j(NULL) {}
+    explicit AddState(int id)
+      : id(id), j(-1), cap_j(NULL) {}
+    AddState(int id, const char* cap_j, int j)
+      : id(id), j(j), cap_j(cap_j) {}
   };
 
   // Threadq is a list of threads.  The list is sorted by the order
@@ -85,7 +86,7 @@
 
   // Add r (or its children, following unlabeled arrows)
   // to the workqueue q with associated capture info.
-  void AddToThreadq(Threadq* q, Inst* ip, int flag,
+  void AddToThreadq(Threadq* q, int id, int flag,
                     const char* p, const char** capture);
 
   // Run runq on byte c, appending new states to nextq.
@@ -94,7 +95,7 @@
   // in the input string, used when processing capturing parens.
   // flag is the bitwise or of Bol, Eol, etc., specifying whether
   // ^, $ and \b match the current input point (after c).
-  inline Inst* Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
+  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
 
   // Returns text version of capture information, for debugging.
   string FormatCapture(const char** capture);
@@ -106,7 +107,7 @@
   int ComputeFirstByte();
 
   Prog* prog_;          // underlying program
-  Inst* start_;         // start instruction in program
+  int start_;           // start instruction in program
   int ncapture_;        // number of submatches to track
   bool longest_;        // whether searching for longest match
   bool endmatch_;       // whether match must end at text.end()
@@ -182,9 +183,9 @@
 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
 // The pointer p is the current input position, and m is the
 // current set of match boundaries.
-void NFA::AddToThreadq(Threadq* q, Inst* ip0, int flag,
+void NFA::AddToThreadq(Threadq* q, int id0, int flag,
                        const char* p, const char** capture) {
-  if (ip0 == NULL)
+  if (id0 == 0)
     return;
 
   // Astack_ is pre-allocated to avoid resize operations.
@@ -194,7 +195,7 @@
 
   int nstk = 0;
   AddState* stk = astack_;
-  stk[nstk++] = AddState(ip0);
+  stk[nstk++] = AddState(id0);
 
   while (nstk > 0) {
     DCHECK_LE(nstk, nastack_);
@@ -202,23 +203,24 @@
     if (a.j >= 0)
       capture[a.j] = a.cap_j;
 
-    Inst* ip = a.ip;
-    if (ip == NULL)
+    int id = a.id;
+    if (id == 0)
       continue;
-    if (q->has_index(ip->id())) {
+    if (q->has_index(id)) {
       if (Debug)
-        fprintf(stderr, "  [%d%s]\n", ip->id(), FormatCapture(capture).c_str());
+        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
       continue;
     }
 
     // Create entry in q no matter what.  We might fill it in below,
     // or we might not.  Even if not, it is necessary to have it,
     // so that we don't revisit r during the recursion.
-    q->set_new(ip->id(), NULL);
+    q->set_new(id, NULL);
 
-    Thread** tp = &q->find(ip->id())->second;
+    Thread** tp = &q->find(id)->second;
     int j;
     Thread* t;
+    Prog::Inst* ip = prog_->inst(id);
     switch (ip->opcode()) {
     default:
       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
@@ -230,7 +232,7 @@
     case kInstAltMatch:
       // Save state; will pick up at next byte.
       t = AllocThread();
-      t->ip = ip;
+      t->id = id;
       CopyCapture(t->capture, capture);
       *tp = t;
       // fall through
@@ -262,11 +264,11 @@
     case kInstByteRange:
       // Save state; will pick up at next byte.
       t = AllocThread();
-      t->ip = ip;
+      t->id = id;
       CopyCapture(t->capture, capture);
       *tp = t;
       if (Debug)
-        fprintf(stderr, " + %d%s [%p]\n", ip->id(), FormatCapture(t->capture).c_str(), t);
+        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
       break;
 
     case kInstEmptyWidth:
@@ -287,7 +289,7 @@
 // ^, $ and \b match the current input point (after c).
 // Frees all the threads on runq.
 // If there is a shortcut to the end, returns that shortcut.
-Inst* NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
+int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
   nextq->clear();
 
   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
@@ -295,8 +297,6 @@
     if (t == NULL)
       continue;
 
-    Inst* ip = t->ip;
-
     if (longest_) {
       // Can skip any threads started after our current best match.
       if (matched_ && match_[0] < t->capture[0]) {
@@ -305,6 +305,9 @@
       }
     }
 
+    int id = t->id;
+    Prog::Inst* ip = prog_->inst(id);
+
     switch (ip->opcode()) {
       default:
         // Should only see the values handled below.
@@ -320,14 +323,14 @@
         if (i != runq->begin())
           break;
         // The match is ours if we want it.
-        if (ip->greedy() || longest_) {
+        if (ip->greedy(prog_) || longest_) {
           CopyCapture((const char**)match_, t->capture);
           FreeThread(t);
           for (++i; i != runq->end(); ++i)
             FreeThread(i->second);
           runq->clear();
           matched_ = true;
-          if (ip->greedy())
+          if (ip->greedy(prog_))
             return ip->out1();
           return ip->out();
         }
@@ -397,11 +400,8 @@
 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
             bool anchored, bool longest,
             StringPiece* submatch, int nsubmatch) {
-  // Can't happen -- set during constructor.
-  if (start_ == NULL) {
-    LOG(DFATAL) << "start_ == NULL";
+  if (start_ == 0)
     return false;
-  }
 
   StringPiece context = const_context;
   if (context.begin() == NULL)
@@ -504,7 +504,7 @@
         Thread* t = i->second;
         if (t == NULL)
           continue;
-        fprintf(stderr, " %d%s", t->ip->id(),
+        fprintf(stderr, " %d%s", t->id,
                 FormatCapture((const char**)t->capture).c_str());
       }
       fprintf(stderr, "\n");
@@ -514,14 +514,15 @@
     // repeating the flag computation above).
     // This is a no-op the first time around the loop, because
     // runq is empty.
-    Inst* ip = Step(runq, nextq, c, flag, p-1);
+    int id = Step(runq, nextq, c, flag, p-1);
     DCHECK_EQ(runq->size(), 0);
     swap(nextq, runq);
     nextq->clear();
-    if (ip != NULL) {
+    if (id != 0) {
       // We're done: full match ahead.
       p = text.end();
       for (;;) {
+        Prog::Inst* ip = prog_->inst(id);
         switch (ip->opcode()) {
           default:
             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
@@ -529,11 +530,11 @@
 
           case kInstCapture:
             match_[ip->cap()] = p;
-            ip = ip->out();
+            id = ip->out();
             continue;
 
           case kInstNop:
-            ip = ip->out();
+            id = ip->out();
             continue;
 
           case kInstMatch:
@@ -546,7 +547,7 @@
               LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
               break;
             }
-            ip = ip->out();
+            id = ip->out();
             continue;
         }
         break;
@@ -620,16 +621,17 @@
 // Computes whether all successful matches have a common first byte,
 // and if so, returns that byte.  If not, returns -1.
 int NFA::ComputeFirstByte() {
-  if (start_ == NULL)
+  if (start_ == 0)
     return -1;
 
   int b = -1;  // first byte, not yet computed
 
-  typedef SparseArray<Inst*> Workq;
+  typedef SparseSet Workq;
   Workq q(prog_->size());
-  q.set(start_->id(), start_);
+  q.insert(start_);
   for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
-    Inst* ip = it->value();
+    int id = *it;
+    Prog::Inst* ip = prog_->inst(id);
     switch (ip->opcode()) {
       default:
         LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
@@ -661,16 +663,16 @@
         // in order to be as conservative as possible
         // (assume all possible empty-width flags are true).
         if (ip->out())
-          q.set(ip->out()->id(), ip->out());
+          q.insert(ip->out());
         break;
 
       case kInstAlt:
       case kInstAltMatch:
         // Explore alternatives.
         if (ip->out())
-          q.set(ip->out()->id(), ip->out());
+          q.insert(ip->out());
         if (ip->out1())
-          q.set(ip->out1()->id(), ip->out1());
+          q.insert(ip->out1());
         break;
 
       case kInstFail:
diff --git a/re2/onepass.cc b/re2/onepass.cc
index 6650f9e..1c49988 100644
--- a/re2/onepass.cc
+++ b/re2/onepass.cc
@@ -54,7 +54,7 @@
 #include <map>
 #include "util/util.h"
 #include "util/arena.h"
-#include "util/sparse_array.h"
+#include "util/sparse_set.h"
 #include "re2/prog.h"
 #include "re2/stringpiece.h"
 
@@ -197,7 +197,7 @@
 
 // Compute a node pointer.
 // Basically (OneState*)(nodes + statesize*nodeindex)
-// but the version wih the C++ casts overflows 80 characters (and is ugly).
+// but the version with the C++ casts overflows 80 characters (and is ugly).
 static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
                                     int nodeindex) {
   return reinterpret_cast<OneState*>(
@@ -345,18 +345,18 @@
 // If ip is not on workq, adds ip to work queue and returns true.
 // If ip is already on work queue, does nothing and returns false.
 // If ip is NULL, does nothing and returns true (pretends to add it).
-typedef SparseArray<Inst*> Instq;
-static bool AddQ(Instq *q, Inst *ip) {
-  if (ip == NULL)
+typedef SparseSet Instq;
+static bool AddQ(Instq *q, int id) {
+  if (id == 0)
     return true;
-  if (q->has_index(ip->id()))
+  if (q->contains(id))
     return false;
-  q->set(ip->id(), ip);
+  q->insert(id);
   return true;
 }
 
 struct InstCond {
-  Inst* ip;
+  int id;
   uint32 cond;
 };
 
@@ -380,6 +380,9 @@
     return onepass_start_ != NULL;
   did_onepass_ = true;
 
+  if (start() == 0)  // no match
+    return false;
+
   // Steal memory for the one-pass NFA from the overall DFA budget.
   // Willing to use at most 1/4 of the DFA budget (heuristic).
   // Limit max node count to 65000 as a conservative estimate to
@@ -395,7 +398,7 @@
   int size = this->size();
   InstCond *stack = new InstCond[size];
 
-  int* nodebyid = new int[size];  // indexed by ip->id()
+  int* nodebyid = new int[size];  // indexed by ip
   memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
 
   uint8* nodes = new uint8[maxnodes*statesize];
@@ -403,12 +406,12 @@
 
   Instq tovisit(size), workq(size);
   AddQ(&tovisit, start());
-  nodebyid[start()->id()] = 0;
+  nodebyid[start()] = 0;
   nodep += statesize;
   int nalloc = 1;
   for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
-    Inst* ip = it->second;
-    int nodeindex = nodebyid[ip->id()];
+    int id = *it;
+    int nodeindex = nodebyid[id];
     OneState* node = IndexToNode(nodes, statesize, nodeindex);
 
     // Flood graph using manual stack, filling in actions as found.
@@ -420,10 +423,11 @@
     workq.clear();
     bool matched = false;
     int nstack = 0;
-    stack[nstack].ip = ip;
+    stack[nstack].id = id;
     stack[nstack++].cond = 0;
     while (nstack > 0) {
-      Inst* ip = stack[--nstack].ip;
+      int id = stack[--nstack].id;
+      Prog::Inst* ip = inst(id);
       uint32 cond = stack[nstack].cond;
       switch (ip->opcode()) {
         case kInstAltMatch:
@@ -434,14 +438,14 @@
           // If already on work queue, (1) is violated: bail out.
           if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
             goto fail;
-          stack[nstack].ip = ip->out1();
+          stack[nstack].id = ip->out1();
           stack[nstack++].cond = cond;
-          stack[nstack].ip = ip->out();
+          stack[nstack].id = ip->out();
           stack[nstack++].cond = cond;
           break;
 
         case kInstByteRange: {
-          int nextindex = nodebyid[ip->out()->id()];
+          int nextindex = nodebyid[ip->out()];
           if (nextindex == -1) {
             if (nalloc >= maxnodes) {
               if (Debug)
@@ -452,7 +456,7 @@
             }
             nextindex = nalloc;
             nodep += statesize;
-            nodebyid[ip->out()->id()] = nextindex;
+            nodebyid[ip->out()] = nextindex;
             nalloc++;
             AddQ(&tovisit, ip->out());
           }
@@ -470,7 +474,7 @@
                 LOG(ERROR)
                   << StringPrintf("Not OnePass: conflict on byte "
                                   "%#x at state %d",
-                                  c, it->second->id());
+                                  c, *it);
               }
               goto fail;
             }
@@ -490,7 +494,7 @@
                   LOG(ERROR)
                     << StringPrintf("Not OnePass: conflict on byte "
                                     "%#x at state %d",
-                                    c, it->second->id());
+                                    c, *it);
                 }
                 goto fail;
               }
@@ -521,11 +525,11 @@
             if (Debug) {
               LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
                                          " %d -> %d\n",
-                                         it->second->id(), ip->id());
+                                         *it, ip->out());
             }
             goto fail;
           }
-          stack[nstack].ip = ip->out();
+          stack[nstack].id = ip->out();
           stack[nstack++].cond = cond;
           break;
 
@@ -534,7 +538,7 @@
             // (3) is violated
             if (Debug) {
               LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
-                                         " from %d\n", it->second->id());
+                                         " from %d\n", *it);
             }
             goto fail;
           }
@@ -565,14 +569,14 @@
     }
 
     for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
-      Inst* ip = it->second;
-      int nodeindex = nodebyid[ip->id()];
+      int id = *it;
+      int nodeindex = nodebyid[id];
       if (nodeindex == -1)
       	continue;
       OneState* node = IndexToNode(nodes, statesize, nodeindex);
       string s;
       StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
-                    nodeindex, ip->id(), node->matchcond);
+                    nodeindex, id, node->matchcond);
       for (int i = 0; i < bytemap_range_; i++) {
         if ((node->action[i] & kImpossible) == kImpossible)
           continue;
@@ -585,7 +589,13 @@
     LOG(ERROR) << dump;
   }
 
-  onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()->id()]);
+  // Overallocated earlier; cut down to actual size.
+  nodep = new uint8[nalloc*statesize];
+  memmove(nodep, nodes, nalloc*statesize);
+  delete[] nodes;
+  nodes = nodep;
+
+  onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
   onepass_nodes_ = nodes;
   onepass_statesize_ = statesize;
   dfa_mem_ -= nalloc*statesize;
diff --git a/re2/prog.cc b/re2/prog.cc
index 3c3f33a..af216b8 100644
--- a/re2/prog.cc
+++ b/re2/prog.cc
@@ -3,10 +3,10 @@
 // license that can be found in the LICENSE file.
 
 // Compiled regular expression representation.
-// Tested by compile_unittest.cc
+// Tested by compile_test.cc
 
 #include "util/util.h"
-#include "util/sparse_array.h"
+#include "util/sparse_set.h"
 #include "re2/prog.h"
 #include "re2/stringpiece.h"
 
@@ -14,80 +14,75 @@
 
 // Constructors per Inst opcode
 
-void Inst::InitAlt(Inst* out, Inst* out1) {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstAlt;
-  out_ = out;
+void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
+  DCHECK_EQ(out_opcode_, 0);
+  set_out_opcode(out, kInstAlt);
   out1_ = out1;
 }
 
-void Inst::InitByteRange(int lo, int hi, int foldcase, Inst* out) {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstByteRange;
+void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  set_out_opcode(out, kInstByteRange);
   lo_ = lo & 0xFF;
   hi_ = hi & 0xFF;
   foldcase_ = foldcase;
-  out_ = out;
 }
 
-void Inst::InitCapture(int cap, Inst* out) {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstCapture;
+void Prog::Inst::InitCapture(int cap, uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  set_out_opcode(out, kInstCapture);
   cap_ = cap;
-  out_ = out;
 }
 
-void Inst::InitEmptyWidth(EmptyOp empty, Inst* out) {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstEmptyWidth;
+void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  set_out_opcode(out, kInstEmptyWidth);
   empty_ = empty;
-  out_ = out;
 }
 
-void Inst::InitMatch() {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstMatch;
+void Prog::Inst::InitMatch() {
+  DCHECK_EQ(out_opcode_, 0);
+  set_opcode(kInstMatch);
 }
 
-void Inst::InitNop(Inst* out) {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstNop;
-  out_ = out;
+void Prog::Inst::InitNop(uint32 out) {
+  DCHECK_EQ(out_opcode_, 0);
+  set_opcode(kInstNop);
 }
 
-void Inst::InitFail() {
-  DCHECK_EQ(opcode_, 0);
-  opcode_ = kInstFail;
+void Prog::Inst::InitFail() {
+  DCHECK_EQ(out_opcode_, 0);
+  set_opcode(kInstFail);
 }
 
-string Inst::Dump() {
-  switch (opcode_) {
+string Prog::Inst::Dump() {
+  switch (opcode()) {
     default:
-      return StringPrintf("opcode %d", static_cast<int>(opcode_));
+      return StringPrintf("opcode %d", static_cast<int>(opcode()));
 
     case kInstAlt:
-      return StringPrintf("alt -> %d | %d", out_->id(), out1_->id());
+      return StringPrintf("alt -> %d | %d", out(), out1_);
 
     case kInstAltMatch:
-      return StringPrintf("altmatch -> %d | %d", out_->id(), out1_->id());
+      return StringPrintf("altmatch -> %d | %d", out(), out1_);
 
     case kInstByteRange:
       return StringPrintf("byte%s [%02x-%02x] -> %d",
                           foldcase_ ? "/i" : "",
-                          lo_, hi_, out_->id());
+                          lo_, hi_, out());
 
     case kInstCapture:
-      return StringPrintf("capture %d -> %d", cap_, out_->id());
+      return StringPrintf("capture %d -> %d", cap_, out());
 
     case kInstEmptyWidth:
       return StringPrintf("emptywidth %#x -> %d",
-                          static_cast<int>(empty_), out_->id());
+                          static_cast<int>(empty_), out());
 
     case kInstMatch:
       return StringPrintf("match!");
 
     case kInstNop:
-      return StringPrintf("nop -> %d", out_->id());
+      return StringPrintf("nop -> %d", out());
 
     case kInstFail:
       return StringPrintf("fail");
@@ -97,17 +92,18 @@
 Prog::Prog()
   : anchor_start_(false),
     anchor_end_(false),
+    did_onepass_(false),
     start_(NULL),
     size_(0),
     byte_inst_count_(0),
-    arena_(40*sizeof(Inst)),        // about 1kB on 64-bit
+    bytemap_range_(0),
+    flags_(0),
+    inst_(NULL),
     dfa_first_(NULL),
     dfa_longest_(NULL),
     dfa_mem_(0),
     delete_dfa_(NULL),
-    bytemap_range_(0),
-    flags_(0),
-    did_onepass_(false),
+    unbytemap_(NULL),
     onepass_nodes_(NULL),
     onepass_start_(NULL) {
 }
@@ -120,25 +116,24 @@
       delete_dfa_(dfa_longest_);
   }
   delete[] onepass_nodes_;
+  delete[] inst_;
+  delete[] unbytemap_;
 }
 
-Inst* Prog::AllocInst() {
-  return new (AllocateInArena, &arena_) Inst(size_++);
+typedef SparseSet Workq;
+
+static inline void AddToQueue(Workq* q, int id) {
+  if (id != 0)
+    q->insert(id);
 }
 
-typedef SparseArray<Inst*> Workq;
-
-static inline void AddToQueue(Workq* q, Inst* ip) {
-  if (ip != NULL)
-    q->set(ip->id(), ip);
-}
-
-static string ProgToString(Workq *q) {
+static string ProgToString(Prog* prog, Workq* q) {
   string s;
 
   for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
-    Inst* ip = i->value();
-    StringAppendF(&s, "%d. %s\n", ip->id(), ip->Dump().c_str());
+    int id = *i;
+    Prog::Inst* ip = prog->inst(id);
+    StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
     AddToQueue(q, ip->out());
     if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
       AddToQueue(q, ip->out1());
@@ -160,16 +155,16 @@
 
   Workq q(size_);
   AddToQueue(&q, start_);
-  return map + ProgToString(&q);
+  return map + ProgToString(this, &q);
 }
 
 string Prog::DumpUnanchored() {
   Workq q(size_);
   AddToQueue(&q, start_unanchored_);
-  return ProgToString(&q);
+  return ProgToString(this, &q);
 }
 
-static bool IsMatch(Inst*);
+static bool IsMatch(Prog*, Prog::Inst*);
 
 // Peep-hole optimizer.
 void Prog::Optimize() {
@@ -180,19 +175,21 @@
   q.clear();
   AddToQueue(&q, start_);
   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
-    Inst* ip = i->value();
+    int id = *i;
 
-    Inst* j = ip->out();
-    while (j != NULL && j->opcode() == kInstNop) {
-      j = j->out();
+    Inst* ip = inst(id);
+    int j = ip->out();
+    Inst* jp;
+    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+      j = jp->out();
     }
-    ip->out_ = j;
+    ip->set_out(j);
     AddToQueue(&q, ip->out());
 
     if (ip->opcode() == kInstAlt) {
       j = ip->out1();
-      while (j != NULL && j->opcode() == kInstNop) {
-        j = j->out();
+      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
+        j = jp->out();
       }
       ip->out1_ = j;
       AddToQueue(&q, ip->out1());
@@ -209,31 +206,32 @@
   q.clear();
   AddToQueue(&q, start_);
   for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
-    Inst* ip = i->value();
+    int id = *i;
+    Inst* ip = inst(id);
     AddToQueue(&q, ip->out());
     if (ip->opcode() == kInstAlt)
       AddToQueue(&q, ip->out1());
 
     if (ip->opcode() == kInstAlt) {
-      Inst* j = ip->out();
-      Inst* k = ip->out1();
-      if (j->opcode() == kInstByteRange && j->out() == ip &&
+      Inst* j = inst(ip->out());
+      Inst* k = inst(ip->out1());
+      if (j->opcode() == kInstByteRange && j->out() == id &&
           j->lo() == 0x00 && j->hi() == 0xFF &&
-          IsMatch(k)) {
-        ip->opcode_ = kInstAltMatch;
+          IsMatch(this, k)) {
+        ip->set_opcode(kInstAltMatch);
         continue;
       }
-      if (IsMatch(j) &&
-          k->opcode() == kInstByteRange && k->out() == ip &&
+      if (IsMatch(this, j) &&
+          k->opcode() == kInstByteRange && k->out() == id &&
           k->lo() == 0x00 && k->hi() == 0xFF) {
-        ip->opcode_ = kInstAltMatch;
+        ip->set_opcode(kInstAltMatch);
       }
     }
   }
 }
 
 // Is ip a guaranteed match at end of text, perhaps after some capturing?
-static bool IsMatch(Inst *ip) {
+static bool IsMatch(Prog* prog, Prog::Inst* ip) {
   for (;;) {
     switch (ip->opcode()) {
       default:
@@ -249,7 +247,7 @@
 
       case kInstCapture:
       case kInstNop:
-        ip = ip->out();
+        ip = prog->inst(ip->out());
         break;
 
       case kInstMatch:
@@ -317,11 +315,13 @@
     if ((i&31) == 0)
       bits = v.Word(i >> 5);
     bytemap_[i] = n;
-    unbytemap_[n] = i;
     n += bits & 1;
     bits >>= 1;
   }
   bytemap_range_ = bytemap_[255] + 1;
+  unbytemap_ = new uint8[bytemap_range_];
+  for (int i = 0; i < 256; i++)
+    unbytemap_[bytemap_[i]] = i;  
 
   if (0) {  // For debugging: use trivial byte map.
     for (int i = 0; i < 256; i++) {
diff --git a/re2/prog.h b/re2/prog.h
index 4d30359..6232a6e 100644
--- a/re2/prog.h
+++ b/re2/prog.h
@@ -47,14 +47,14 @@
 
 // Opcodes for Inst
 enum InstOp {
-  kInstAlt = 1,      // choose between out_ and out1_
+  kInstAlt = 0,      // choose between out_ and out1_
+  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
   kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
   kInstCapture,      // capturing parenthesis number cap_
   kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
   kInstMatch,        // found a match!
   kInstNop,          // no-op; occasionally unavoidable
   kInstFail,         // never match; occasionally unavoidable
-  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
 };
 
 // Bit flags for empty-width specials
@@ -68,79 +68,8 @@
   kEmptyAllFlags         = (1<<6)-1,
 };
 
-struct Frag;
-class Prog;
 class Regexp;
 
-// Single instruction in regexp program.
-class Inst {
- public:
-  Inst(int id) : id_(id), opcode_(static_cast<InstOp>(0)), out_(NULL) { }
-
-  // Constructors per opcode
-  void InitAlt(Inst* out, Inst* out1);
-  void InitByteRange(int lo, int hi, int foldcase, Inst* out);
-  void InitCapture(int cap, Inst* out);
-  void InitEmptyWidth(EmptyOp empty, Inst* out);
-  void InitMatch();
-  void InitNop(Inst* out);
-  void InitFail();
-
-  // Getters
-  int id()        { return (this == NULL) ? -1 : id_; }
-  int id_unchecked() { return id_; }
-  InstOp opcode() { return opcode_; }
-  Inst* out()     { return out_; }
-  Inst* out1()    { DCHECK(opcode_ == kInstAlt || opcode_ == kInstAltMatch); return out1_; }
-  int cap()       { DCHECK_EQ(opcode_, kInstCapture); return cap_; }
-  int lo()        { DCHECK_EQ(opcode_, kInstByteRange); return lo_; }
-  int hi()        { DCHECK_EQ(opcode_, kInstByteRange); return hi_; }
-  int foldcase()  { DCHECK_EQ(opcode_, kInstByteRange); return foldcase_; }
-  EmptyOp empty() { DCHECK_EQ(opcode_, kInstEmptyWidth); return empty_; }
-  bool greedy()   { DCHECK_EQ(opcode_, kInstAltMatch); return out_->opcode() == kInstByteRange; }
-
-  // Does this inst (an kInstByteRange) match c?
-  inline bool Matches(int c) {
-    DCHECK_EQ(opcode_, kInstByteRange);
-    if (foldcase_ && 'A' <= c && c <= 'Z')
-      c += 'a' - 'A';
-    return lo_ <= c && c <= hi_;
-  }
-
-  // Returns string representation for debugging.
-  string Dump();
-
- private:
-  int id_;          // sequence number in Prog
-  InstOp opcode_;   // type of instruction
-  Inst* out_;       // next instruction to run
-  union {           // additional instruction arguments:
-    Inst* out1_;       // opcode == kInstAlt
-                       //   alternate next instruction
-
-    int cap_;          // opcode == kInstCapture
-                       //   Index of capture register (holds text
-                       //   position recorded by capturing parentheses).
-                       //   For \n (the submatch for the nth parentheses),
-                       //   the left parenthesis captures into register 2*n
-                       //   and the right one captures into register 2*n+1.
-
-    struct {           // opcode == kInstByteRange
-      int16 lo_;       //   byte range is lo_-hi_ inclusive
-      int16 hi_;       //
-      bool foldcase_;  //   convert A-Z to a-z before checking range.
-    };
-
-    EmptyOp empty_;    // opcode == kInstEmptyWidth
-                       //   empty_ is bitwise OR of kEmpty* flags above.
-  };
-
-  friend class Compiler;
-  friend class Prog;
-
-  DISALLOW_EVIL_CONSTRUCTORS(Inst);
-};
-
 class DFA;
 class OneState;
 
@@ -150,6 +79,92 @@
   Prog();
   ~Prog();
 
+  // Single instruction in regexp program.
+  class Inst {
+   public:
+    Inst() : out_opcode_(0), out1_(0) { }
+  
+    // Constructors per opcode
+    void InitAlt(uint32 out, uint32 out1);
+    void InitByteRange(int lo, int hi, int foldcase, uint32 out);
+    void InitCapture(int cap, uint32 out);
+    void InitEmptyWidth(EmptyOp empty, uint32 out);
+    void InitMatch();
+    void InitNop(uint32 out);
+    void InitFail();
+  
+    // Getters
+    int id(Prog* p) { return this - p->inst_; }
+    InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
+    int out()     { return out_opcode_>>3; }
+    int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
+    int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
+    int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
+    int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
+    int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
+    EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
+    bool greedy(Prog *p) {
+      DCHECK_EQ(opcode(), kInstAltMatch);
+      return p->inst(out())->opcode() == kInstByteRange;
+    }
+  
+    // Does this inst (an kInstByteRange) match c?
+    inline bool Matches(int c) {
+      DCHECK_EQ(opcode(), kInstByteRange);
+      if (foldcase_ && 'A' <= c && c <= 'Z')
+        c += 'a' - 'A';
+      return lo_ <= c && c <= hi_;
+    }
+  
+    // Returns string representation for debugging.
+    string Dump();
+  
+    // Maximum instruction id.
+    // (Must fit in out_opcode_, and PatchList steals another bit.)
+    static const int kMaxInst = (1<<28) - 1;
+  
+   private:
+    void set_opcode(InstOp opcode) {
+      out_opcode_ = (out()<<3) | opcode;
+    }
+  
+    void set_out(int out) {
+      out_opcode_ = (out<<3) | opcode();
+    }
+    
+    void set_out_opcode(int out, InstOp opcode) {
+      out_opcode_ = (out<<3) | opcode;
+    }
+    
+    uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
+    union {              // additional instruction arguments:
+      uint32 out1_;      // opcode == kInstAlt
+                         //   alternate next instruction
+  
+      int32 cap_;        // opcode == kInstCapture
+                         //   Index of capture register (holds text
+                         //   position recorded by capturing parentheses).
+                         //   For \n (the submatch for the nth parentheses),
+                         //   the left parenthesis captures into register 2*n
+                         //   and the right one captures into register 2*n+1.
+  
+      struct {           // opcode == kInstByteRange
+        uint8 lo_;       //   byte range is lo_-hi_ inclusive
+        uint8 hi_;       //
+        uint8 foldcase_; //   convert A-Z to a-z before checking range.
+      };
+  
+      EmptyOp empty_;    // opcode == kInstEmptyWidth
+                         //   empty_ is bitwise OR of kEmpty* flags above.
+    };
+  
+    friend class Compiler;
+    friend class PatchList;
+    friend class Prog;
+  
+    DISALLOW_EVIL_CONSTRUCTORS(Inst);
+  };
+
   // Whether to anchor the search.
   enum Anchor {
     kUnanchored,  // match anywhere
@@ -174,10 +189,11 @@
     kFullMatch       // match only entire text; implies anchor==kAnchored
   };
 
-  Inst* start() { return start_; }
-  Inst* start_unanchored() { return start_unanchored_; }
-  void set_start(Inst *start) { start_ = start; }
-  void set_start_unanchored(Inst *start) { start_unanchored_ = start; }
+  Inst *inst(int id) { return &inst_[id]; }
+  int start() { return start_; }
+  int start_unanchored() { return start_unanchored_; }
+  void set_start(int start) { start_ = start; }
+  void set_start_unanchored(int start) { start_unanchored_ = start; }
   int64 size() { return size_; }
   bool reversed() { return reversed_; }
   void set_reversed(bool reversed) { reversed_ = reversed; }
@@ -198,9 +214,6 @@
   string Dump();
   string DumpUnanchored();
 
-  // Allocates and returns a new instruction.
-  Inst* AllocInst();
-
   // Record that at some point in the prog, the bytes in the range
   // lo-hi (inclusive) are treated as different from bytes outside the range.
   // Tracking this lets the DFA collapse commonly-treated byte ranges
@@ -311,15 +324,21 @@
   friend class Compiler;
 
   DFA* GetDFA(MatchKind kind);
-
+  
   bool anchor_start_;       // regexp has explicit start anchor
   bool anchor_end_;         // regexp has explicit end anchor
-  Inst* start_;             // entry point for program
-  Inst* start_unanchored_;  // unanchored entry point for program
-  int64 size_;              // number of instructions
-  int64 byte_inst_count_;   // number of kInstByteRange instructions
-  UnsafeArena arena_;       // allocation arena for Inst structures
   bool reversed_;           // whether program runs backward over input
+  bool did_onepass_;        // has IsOnePass been called?
+
+  int start_;               // entry point for program
+  int start_unanchored_;    // unanchored entry point for program
+  int size_;                // number of instructions
+  int byte_inst_count_;     // number of kInstByteRange instructions
+  int bytemap_range_;       // bytemap_[x] < bytemap_range_
+  int flags_;               // regexp parse flags
+  int onepass_statesize_;   // byte size of each OneState* node
+
+  Inst* inst_;              // pointer to instruction array
 
   Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
   DFA* dfa_first_;     // DFA cached for kFirstMatch
@@ -330,13 +349,9 @@
   Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
                              // commonly-treated byte range.
   uint8 bytemap_[256];       // map from input bytes to byte classes
-  uint8 unbytemap_[256];     // bytemap_[unbytemap_[x]] == x
-  int bytemap_range_;        // bytemap_[x] < bytemap_range_
-  int flags_;                // regexp parse flags
+  uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x
 
-  bool did_onepass_;         // has IsOnePass been called?
   uint8* onepass_nodes_;     // data for OnePass nodes
-  int onepass_statesize_;    // byte size of each node
   OneState* onepass_start_;  // start node for OnePass program
 
   DISALLOW_EVIL_CONSTRUCTORS(Prog);
diff --git a/re2/testing/backtrack.cc b/re2/testing/backtrack.cc
index af02eac..b2dd6db 100644
--- a/re2/testing/backtrack.cc
+++ b/re2/testing/backtrack.cc
@@ -57,7 +57,7 @@
  private:
   // Explores from instruction ip at string position p looking for a match.
   // Returns true if found (so that caller can stop trying other possibilities).
-  bool Visit(Inst* ip, const char* p);
+  bool Visit(int id, const char* p);
 
   // Search parameters
   Prog* prog_;              // program being run
@@ -145,12 +145,12 @@
 
 // Explores from instruction ip at string position p looking for a match.
 // Return true if found (so that caller can stop trying other possibilities).
-bool Backtracker::Visit(Inst* ip, const char* p) {
+bool Backtracker::Visit(int id, const char* p) {
   // Check bitmap.  If we've already explored from here,
   // either it didn't match or it did but we're hoping for a better match.
   // Either way, don't go down that road again.
   CHECK(p <= text_.end());
-  int n = ip->id()*(text_.size()+1) + (p - text_.begin());
+  int n = id*(text_.size()+1) + (p - text_.begin());
   CHECK_LT(n/32, nvisited_);
   if (visited_[n/32] & (1 << (n&31)))
     return false;
@@ -162,6 +162,7 @@
   if (p < text_.end())
     c = *p & 0xFF;
 
+  Prog::Inst* ip = prog_->inst(id);
   switch (ip->opcode()) {
     default:
       LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
diff --git a/util/sparse_set.h b/util/sparse_set.h
new file mode 100644
index 0000000..7243ff2
--- /dev/null
+++ b/util/sparse_set.h
@@ -0,0 +1,162 @@
+// Copyright 2006 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// DESCRIPTION
+// 
+// SparseSet<T>(m) is a set of integers in [0, m).
+// It requires sizeof(int)*m memory, but it provides
+// fast iteration through the elements in the set and fast clearing
+// of the set.
+// 
+// Insertion and deletion are constant time operations.
+// 
+// Allocating the set is a constant time operation 
+// when memory allocation is a constant time operation.
+// 
+// Clearing the set is a constant time operation (unusual!).
+// 
+// Iterating through the set is an O(n) operation, where n
+// is the number of items in the set (not O(m)).
+//
+// The set iterator visits entries in the order they were first 
+// inserted into the array.  It is safe to add items to the set while
+// using an iterator: the iterator will visit indices added to the set
+// during the iteration, but will not re-visit indices whose values
+// change after visiting.  Thus SparseSet can be a convenient
+// implementation of a work queue.
+// 
+// The SparseSet implementation is NOT thread-safe.  It is up to the
+// caller to make sure only one thread is accessing the set.  (Typically
+// these sets are temporary values and used in situations where speed is
+// important.)
+// 
+// The SparseSet interface does not present all the usual STL bells and
+// whistles.
+// 
+// Implemented with reference to Briggs & Torczon, An Efficient
+// Representation for Sparse Sets, ACM Letters on Programming Languages
+// and Systems, Volume 2, Issue 1-4 (March-Dec.  1993), pp.  59-69.
+// 
+// For a generalization to sparse array, see sparse_array.h.
+
+// IMPLEMENTATION
+//
+// See sparse_array.h for implementation details
+
+#ifndef RE2_UTIL_SPARSE_SET_H__
+#define RE2_UTIL_SPARSE_SET_H__
+
+#include "util/util.h"
+
+namespace re2 {
+
+class SparseSet {
+ public:
+  SparseSet()
+    : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {}
+
+  SparseSet(int max_size) {
+    max_size_ = max_size;
+    sparse_to_dense_ = new int[max_size];
+    dense_ = new int[max_size];
+    // Don't need to zero the memory.
+    size_ = 0;
+  }
+
+  ~SparseSet() {
+    delete[] sparse_to_dense_;
+    delete[] dense_;
+  }
+
+  typedef int* iterator;
+  typedef const int* const_iterator;
+
+  int size() const { return size_; }
+  iterator begin() { return dense_; }
+  iterator end() { return dense_ + size_; }
+  const_iterator begin() const { return dense_; }
+  const_iterator end() const { return dense_ + size_; }
+
+  // Change the maximum size of the array.
+  // Invalidates all iterators.
+  void resize(int max_size) {
+    if (size_ > max_size_)
+      size_ = max_size_;
+    if (max_size > max_size_) {
+      int* a = new int[max_size];
+      if (sparse_to_dense_) {
+        memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
+        delete[] sparse_to_dense_;
+      }
+      sparse_to_dense_ = a;
+
+      a = new int[max_size_];
+      if (dense_) {
+        memmove(a, dense_, size_*sizeof a[0]);
+        delete[] dense_;
+      }
+      dense_ = a;
+    }
+    max_size_ = max_size;
+  }
+
+  // Return the maximum size of the array.
+  // Indices can be in the range [0, max_size).
+  int max_size() const { return max_size_; }
+
+  // Clear the array.
+  void clear() { size_ = 0; }
+
+  // Check whether i is in the array.
+  bool contains(int i) const {
+    DCHECK_GE(i, 0);
+    DCHECK_LT(i, max_size_);
+    if (static_cast<uint>(i) >= max_size_) {
+      return false;
+    }
+    // Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
+    return (uint)sparse_to_dense_[i] < (uint)size_ && 
+      dense_[sparse_to_dense_[i]] == i;
+  }
+
+  // Adds i to the set.
+  void insert(int i) {
+    if (!contains(i))
+      insert_new(i);
+  }
+
+  // Set the value at the new index i to v.
+  // Fast but unsafe: only use if contains(i) is false.
+  void insert_new(int i) {
+    if (static_cast<uint>(i) >= max_size_) {
+      // Semantically, end() would be better here, but we already know
+      // the user did something stupid, so begin() insulates them from
+      // dereferencing an invalid pointer.
+      return;
+    }
+    DCHECK(!contains(i));
+    DCHECK_LT(size_, max_size_);
+    sparse_to_dense_[i] = size_;
+    dense_[size_] = i;
+    size_++;
+  }
+
+  // Comparison function for sorting.
+  // Can sort the sparse array so that future iterations
+  // will visit indices in increasing order using
+  // sort(arr.begin(), arr.end(), arr.less);
+  static bool less(int a, int b) { return a < b; }
+
+ private:
+  int size_;
+  int max_size_;
+  int* sparse_to_dense_;
+  int* dense_;
+
+  DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
+};
+
+}  // namespace re2
+
+#endif  // RE2_UTIL_SPARSE_SET_H__