[subset/cff] Cache CFF accelerator in hb_subset_plan_t

This shows a 7% speedup in:
BM_subset/subset_glyphs/SourceHanSans-Regular_subset.otf/retaingids/10
diff --git a/src/hb-atomic.hh b/src/hb-atomic.hh
index a6283de..303dfe6 100644
--- a/src/hb-atomic.hh
+++ b/src/hb-atomic.hh
@@ -204,6 +204,7 @@
 
   hb_atomic_ptr_t () = default;
   constexpr hb_atomic_ptr_t (T* v) : v (v) {}
+  hb_atomic_ptr_t (const hb_atomic_ptr_t &other) = delete;
 
   void init (T* v_ = nullptr) { set_relaxed (v_); }
   void set_relaxed (T* v_) { hb_atomic_ptr_impl_set_relaxed (&v, v_); }
diff --git a/src/hb-machinery.hh b/src/hb-machinery.hh
index 1084725..556d52c 100644
--- a/src/hb-machinery.hh
+++ b/src/hb-machinery.hh
@@ -180,6 +180,9 @@
 				 hb_lazy_loader_t<Returned,Subclass,Data,WheresData,Stored>
 				>::value Funcs;
 
+  hb_lazy_loader_t () = default;
+  hb_lazy_loader_t (const hb_lazy_loader_t &other) = delete;
+
   void init0 () {} /* Init, when memory is already set to 0. No-op for us. */
   void init ()  { instance.set_relaxed (nullptr); }
   void fini ()  { do_destroy (instance.get_acquire ()); init (); }
@@ -278,7 +281,11 @@
 template <typename T, unsigned int WheresFace>
 struct hb_face_lazy_loader_t : hb_lazy_loader_t<T,
 						hb_face_lazy_loader_t<T, WheresFace>,
-						hb_face_t, WheresFace> {};
+						hb_face_t, WheresFace>
+{
+  // Hack: get_blob () is provided here for API parity with hb_table_lazy_loader_t.
+  hb_blob_t *get_blob () { return this->get ()->get_blob (); } /* forwards to the loaded object's own get_blob () */
+};
 
 template <typename T, unsigned int WheresFace, bool core=false>
 struct hb_table_lazy_loader_t : hb_lazy_loader_t<T,
diff --git a/src/hb-ot-cff1-table.hh b/src/hb-ot-cff1-table.hh
index d320594..1b19e0d 100644
--- a/src/hb-ot-cff1-table.hh
+++ b/src/hb-ot-cff1-table.hh
@@ -1214,6 +1214,8 @@
       blob = nullptr;
     }
 
+    hb_blob_t *get_blob () const { return blob; }
+
     bool is_valid () const { return blob; }
     bool   is_CID () const { return topDict.is_CID (); }
 
@@ -1484,7 +1486,14 @@
     typedef accelerator_templ_t<cff1_private_dict_opset_t, cff1_private_dict_values_t> SUPER;
   };
 
-  typedef accelerator_templ_t<cff1_private_dict_opset_subset_t, cff1_private_dict_values_subset_t> accelerator_subset_t;
+  struct accelerator_subset_t : accelerator_templ_t<cff1_private_dict_opset_subset_t, cff1_private_dict_values_subset_t>
+  { /* Subsetting accelerator; a struct rather than the former typedef so that it can declare subset (). */
+    accelerator_subset_t (hb_face_t *face) : SUPER (face) {} /* passes the face straight to the templated base */
+
+    HB_INTERNAL bool subset (hb_subset_context_t *c) const; /* defined out of line in hb-subset-cff1.cc */
+
+    typedef accelerator_templ_t<cff1_private_dict_opset_subset_t, cff1_private_dict_values_subset_t> SUPER; /* alias for the base class named in the constructor above */
+  };
 
   HB_INTERNAL bool subset (hb_subset_context_t *c) const;
 
@@ -1510,6 +1519,10 @@
   cff1_accelerator_t (hb_face_t *face) : cff1::accelerator_t (face) {}
 };
 
+struct cff1_subset_accelerator_t : cff1::accelerator_subset_t { /* OT-namespace name that hb-subset-plan.hh and hb-subset-accelerator.hh forward-declare for their lazy loaders */
+  cff1_subset_accelerator_t (hb_face_t *face) : cff1::accelerator_subset_t (face) {} /* adds nothing beyond the base; only forwards the face */
+};
+
 } /* namespace OT */
 
 #endif /* HB_OT_CFF1_TABLE_HH */
diff --git a/src/hb-ot-cff2-table.hh b/src/hb-ot-cff2-table.hh
index 93690d0..c5d2f48 100644
--- a/src/hb-ot-cff2-table.hh
+++ b/src/hb-ot-cff2-table.hh
@@ -483,6 +483,8 @@
       return nullptr;
     }
 
+    hb_blob_t *get_blob () const { return blob; }
+
     bool is_valid () const { return blob; }
 
     protected:
@@ -515,7 +517,14 @@
     HB_INTERNAL bool get_path (hb_font_t *font, hb_codepoint_t glyph, hb_draw_session_t &draw_session) const;
   };
 
-  typedef accelerator_templ_t<cff2_private_dict_opset_subset_t, cff2_private_dict_values_subset_t> accelerator_subset_t;
+  struct accelerator_subset_t : accelerator_templ_t<cff2_private_dict_opset_subset_t, cff2_private_dict_values_subset_t>
+  { /* Subsetting accelerator; a struct rather than the former typedef so that it can declare subset (). */
+    accelerator_subset_t (hb_face_t *face) : SUPER (face) {} /* passes the face straight to the templated base */
+
+    HB_INTERNAL bool subset (hb_subset_context_t *c) const; /* defined out of line in hb-subset-cff2.cc */
+
+    typedef accelerator_templ_t<cff2_private_dict_opset_subset_t, cff2_private_dict_values_subset_t> SUPER; /* alias for the base class named in the constructor above */
+  };
 
   HB_INTERNAL bool subset (hb_subset_context_t *c) const;
 
@@ -532,6 +541,10 @@
   cff2_accelerator_t (hb_face_t *face) : cff2::accelerator_t (face) {}
 };
 
+struct cff2_subset_accelerator_t : cff2::accelerator_subset_t { /* OT-namespace name that hb-subset-plan.hh and hb-subset-accelerator.hh forward-declare for their lazy loaders */
+  cff2_subset_accelerator_t (hb_face_t *face) : cff2::accelerator_subset_t (face) {} /* adds nothing beyond the base; only forwards the face */
+};
+
 } /* namespace OT */
 
 #endif /* HB_OT_CFF2_TABLE_HH */
diff --git a/src/hb-subset-accelerator.hh b/src/hb-subset-accelerator.hh
index 925b447..bdf7546 100644
--- a/src/hb-subset-accelerator.hh
+++ b/src/hb-subset-accelerator.hh
@@ -42,7 +42,9 @@
 
 namespace OT {
 struct SubtableUnicodesCache;
-};
+struct cff1_subset_accelerator_t;
+struct cff2_subset_accelerator_t;
+}
 
 struct hb_subset_accelerator_t
 {
@@ -51,7 +53,8 @@
     return &_hb_subset_accelerator_user_data_key;
   }
 
-  static hb_subset_accelerator_t* create(const hb_map_t& unicode_to_gid_,
+  static hb_subset_accelerator_t* create(hb_face_t *source,
+					 const hb_map_t& unicode_to_gid_,
 					 const hb_set_t& unicodes_,
 					 bool has_seac_) {
     hb_subset_accelerator_t* accel =
@@ -59,7 +62,8 @@
 
     if (unlikely (!accel)) return accel;
 
-    new (accel) hb_subset_accelerator_t (unicode_to_gid_,
+    new (accel) hb_subset_accelerator_t (source,
+					 unicode_to_gid_,
 					 unicodes_,
 					 has_seac_);
 
@@ -77,7 +81,8 @@
     hb_free (accel);
   }
 
-  hb_subset_accelerator_t (const hb_map_t& unicode_to_gid_,
+  hb_subset_accelerator_t (hb_face_t *source,
+			   const hb_map_t& unicode_to_gid_,
 			   const hb_set_t& unicodes_,
 			   bool has_seac_) :
     unicode_to_gid(unicode_to_gid_),
@@ -86,7 +91,8 @@
     destroy_cmap_cache(nullptr),
     has_seac(has_seac_),
     cff_accelerator(nullptr),
-    destroy_cff_accelerator(nullptr)
+    destroy_cff_accelerator(nullptr),
+    source(hb_face_reference (source))
   {
     gid_to_unicodes.resize (unicode_to_gid.get_population ());
     for (const auto &_ : unicode_to_gid)
@@ -97,14 +103,7 @@
     }
   }
 
-  ~hb_subset_accelerator_t ()
-  {
-    if (cff_accelerator && destroy_cff_accelerator)
-      destroy_cff_accelerator ((void*) cff_accelerator);
-
-    if (cmap_cache && destroy_cmap_cache)
-      destroy_cmap_cache ((void*) cmap_cache);
-  }
+  HB_INTERNAL ~hb_subset_accelerator_t ();
 
   // Generic
 
@@ -133,6 +132,10 @@
 	   unicodes.in_error () ||
 	   sanitized_table_cache.in_error ();
   }
+
+  hb_face_t *source;
+  mutable hb_face_lazy_loader_t<OT::cff1_subset_accelerator_t, 1> cff1_accel;
+  mutable hb_face_lazy_loader_t<OT::cff2_subset_accelerator_t, 2> cff2_accel;
 };
 
 
diff --git a/src/hb-subset-cff-common.hh b/src/hb-subset-cff-common.hh
index 8c61a53..9c0b2f4 100644
--- a/src/hb-subset-cff-common.hh
+++ b/src/hb-subset-cff-common.hh
@@ -525,10 +525,7 @@
   parsed_cs_str_vec_t parsed_charstrings;
   parsed_cs_str_vec_t parsed_global_subrs;
   hb_vector_t<parsed_cs_str_vec_t> parsed_local_subrs;
-  mutable hb_atomic_ptr_t<hb_vector_t<uint16_t>> glyph_to_sid_map = nullptr;
-
-  void *cff1_acc = nullptr;
-  void *cff2_acc = nullptr;
+  mutable hb_atomic_ptr_t<hb_vector_t<uint16_t>> glyph_to_sid_map;
 
  private:
   hb_blob_t* original_blob;
diff --git a/src/hb-subset-cff1.cc b/src/hb-subset-cff1.cc
index a106681..22ea1c3 100644
--- a/src/hb-subset-cff1.cc
+++ b/src/hb-subset-cff1.cc
@@ -491,7 +491,7 @@
     }
 
     hb_vector_t<uint16_t> *glyph_to_sid_map = (plan->accelerator && plan->accelerator->cff_accelerator) ?
-					       plan->accelerator->cff_accelerator->glyph_to_sid_map :
+					       plan->accelerator->cff_accelerator->glyph_to_sid_map.get_acquire () :
 					       nullptr;
     bool created_map = false;
     if (!glyph_to_sid_map &&
@@ -938,26 +938,25 @@
   }
 }
 
-static bool
-_hb_subset_cff1 (const OT::cff1::accelerator_subset_t  &acc,
-		hb_subset_context_t	*c)
+bool /* true when the subset CFF table was planned and serialized successfully */
+OT::cff1::accelerator_subset_t::subset (hb_subset_context_t *c) const /* was the file-static helper _hb_subset_cff1 (acc, c) */
 {
   cff_subset_plan cff_plan;
 
-  if (unlikely (!cff_plan.create (acc, c->plan)))
+  if (unlikely (!cff_plan.create (*this, c->plan))) /* the accelerator itself now plays the role of the old `acc` argument */
   {
     DEBUG_MSG(SUBSET, nullptr, "Failed to generate a cff subsetting plan.");
     return false;
   }
 
-  return _serialize_cff1 (c->serializer, cff_plan, acc);
+  return _serialize_cff1 (c->serializer, cff_plan, *this); /* output goes through the context's serializer */
 }
 
 bool
 OT::cff1::subset (hb_subset_context_t *c) const
 {
   OT::cff1::accelerator_subset_t acc (c->plan->source);
-  return acc.is_valid () && _hb_subset_cff1 (acc, c);
+  return acc.is_valid () && acc.subset (c); /* subset only if the freshly built accelerator is valid */
 }
 
 
diff --git a/src/hb-subset-cff2.cc b/src/hb-subset-cff2.cc
index f2afff5..2a2b164 100644
--- a/src/hb-subset-cff2.cc
+++ b/src/hb-subset-cff2.cc
@@ -642,14 +642,13 @@
   }
 }
 
-static bool
-_hb_subset_cff2 (const OT::cff2::accelerator_subset_t  &acc,
-		 hb_subset_context_t	*c)
+bool /* true when the subset CFF2 table was planned and serialized successfully */
+OT::cff2::accelerator_subset_t::subset (hb_subset_context_t *c) const /* was the file-static helper _hb_subset_cff2 (acc, c) */
 {
   cff2_subset_plan cff2_plan;
 
-  if (unlikely (!cff2_plan.create (acc, c->plan))) return false;
-  return _serialize_cff2 (c->serializer, cff2_plan, acc,
+  if (unlikely (!cff2_plan.create (*this, c->plan))) return false; /* bail out if no plan could be built */
+  return _serialize_cff2 (c->serializer, cff2_plan, *this, /* the accelerator itself replaces the old `acc` argument */
 			  c->plan->normalized_coords.as_array ());
 }
 
@@ -657,7 +656,7 @@
 OT::cff2::subset (hb_subset_context_t *c) const
 {
   OT::cff2::accelerator_subset_t acc (c->plan->source);
-  return acc.is_valid () && _hb_subset_cff2 (acc, c);
+  return acc.is_valid () && acc.subset (c);
 }
 
 #endif
diff --git a/src/hb-subset-plan.cc b/src/hb-subset-plan.cc
index 178f5fe..7927d09 100644
--- a/src/hb-subset-plan.cc
+++ b/src/hb-subset-plan.cc
@@ -48,6 +48,22 @@
 using OT::Layout::GSUB;
 using OT::Layout::GPOS;
 
+
+hb_subset_accelerator_t::~hb_subset_accelerator_t () /* out-of-line version of the destructor that used to be inline in hb-subset-accelerator.hh */
+{
+  if (cff_accelerator && destroy_cff_accelerator) /* only call the callback when both the object and its destroy function are set */
+    destroy_cff_accelerator ((void*) cff_accelerator);
+
+  if (cmap_cache && destroy_cmap_cache) /* same pattern for the cmap cache */
+    destroy_cmap_cache ((void*) cmap_cache);
+
+  cff1_accel.fini (); /* tear down the lazily loaded CFF accelerators ... */
+  cff2_accel.fini (); /* ... before the face they were created from is released below */
+
+  hb_face_destroy (source); /* drops the reference taken with hb_face_reference () in the constructor */
+}
+
+
 typedef hb_hashmap_t<unsigned, hb::unique_ptr<hb_set_t>> script_langsys_map;
 #ifndef HB_NO_SUBSET_CFF
 static inline bool
@@ -1130,7 +1146,8 @@
   if (attach_accelerator_data)
   {
     inprogress_accelerator =
-      hb_subset_accelerator_t::create (*codepoint_to_glyph,
+      hb_subset_accelerator_t::create (source,
+				       *codepoint_to_glyph,
                                        unicodes,
 				       has_seac);
 
@@ -1142,6 +1159,27 @@
 #undef HB_SUBSET_PLAN_MEMBER
 }
 
+hb_subset_plan_t::~hb_subset_plan_t() /* out-of-line version of the destructor that used to be inline in hb-subset-plan.hh */
+{
+  hb_face_destroy (dest);
+
+  hb_map_destroy (codepoint_to_glyph);
+  hb_map_destroy (glyph_map);
+  hb_map_destroy (reverse_glyph_map);
+  cff1_accel.fini (); /* finalize the lazily loaded CFF accelerators while `source` is still alive */
+  cff2_accel.fini ();
+  hb_face_destroy (source); /* released only after the loaders declared right after it are finalized, as in ~hb_subset_accelerator_t () */
+
+#ifdef HB_EXPERIMENTAL_API
+  for (auto _ : name_table_overrides.iter_ref ())
+    _.second.fini ();
+#endif
+
+  if (inprogress_accelerator) /* an accelerator is only present when one was created for this plan */
+    hb_subset_accelerator_t::destroy ((void*) inprogress_accelerator);
+}
+
+
 /**
  * hb_subset_plan_create_or_fail:
  * @face: font face to create the plan for.
diff --git a/src/hb-subset-plan.hh b/src/hb-subset-plan.hh
index f777f85..b5a8d7f 100644
--- a/src/hb-subset-plan.hh
+++ b/src/hb-subset-plan.hh
@@ -67,28 +67,17 @@
 
 typedef struct head_maxp_info_t head_maxp_info_t;
 
+namespace OT {
+  struct cff1_subset_accelerator_t;
+  struct cff2_subset_accelerator_t;
+}
+
 struct hb_subset_plan_t
 {
   HB_INTERNAL hb_subset_plan_t (hb_face_t *,
 				const hb_subset_input_t *input);
 
-  ~hb_subset_plan_t()
-  {
-    hb_face_destroy (source);
-    hb_face_destroy (dest);
-
-    hb_map_destroy (codepoint_to_glyph);
-    hb_map_destroy (glyph_map);
-    hb_map_destroy (reverse_glyph_map);
-
-#ifdef HB_EXPERIMENTAL_API
-    for (auto _ : name_table_overrides.iter_ref ())
-      _.second.fini ();
-#endif
-
-    if (inprogress_accelerator)
-      hb_subset_accelerator_t::destroy ((void*) inprogress_accelerator);
-  }
+  HB_INTERNAL ~hb_subset_plan_t();
 
   hb_object_header_t header;
 
@@ -106,6 +95,10 @@
 
   // Plan is only good for a specific source/dest so keep them with it
   hb_face_t *source;
+  // These have to be immediately after source:
+  hb_face_lazy_loader_t<OT::cff1_subset_accelerator_t, 1> cff1_accel;
+  hb_face_lazy_loader_t<OT::cff2_subset_accelerator_t, 2> cff2_accel;
+
   hb_face_t *dest;
 
   unsigned int _num_output_glyphs;
@@ -131,25 +124,31 @@
  public:
 
   template<typename T>
-  hb_blob_ptr_t<T> source_table()
+  struct source_table_loader /* functor form of the old source_table () body, so it can be specialized per table type */
   {
-    hb_lock_t lock (accelerator ? &accelerator->sanitized_table_cache_lock : nullptr);
+    hb_blob_ptr_t<T> operator () (hb_subset_plan_t *plan) /* the plan is passed explicitly because this is no longer a member function of it */
+    {
+      hb_lock_t lock (plan->accelerator ? &plan->accelerator->sanitized_table_cache_lock : nullptr); /* the lock is only supplied when the accelerator's cache is used */
 
-    auto *cache = accelerator ? &accelerator->sanitized_table_cache : &sanitized_table_cache;
-    if (cache
-        && !cache->in_error ()
-        && cache->has (+T::tableTag)) {
-      return hb_blob_reference (cache->get (+T::tableTag).get ());
+      auto *cache = plan->accelerator ? &plan->accelerator->sanitized_table_cache : &plan->sanitized_table_cache; /* prefer the accelerator's cache over the plan's own */
+      if (cache
+	  && !cache->in_error ()
+	  && cache->has (+T::tableTag)) {
+	return hb_blob_reference (cache->get (+T::tableTag).get ()); /* cache hit: return a new reference to the cached blob */
+      }
+
+      hb::unique_ptr<hb_blob_t> table_blob {hb_sanitize_context_t ().reference_table<T> (plan->source)}; /* cache miss: sanitize the table from the source face */
+      hb_blob_t* ret = hb_blob_reference (table_blob.get ()); /* extra reference for the caller, taken before the blob is moved into the cache */
+
+      if (likely (cache))
+	cache->set (+T::tableTag, std::move (table_blob)); /* the cache takes over the unique_ptr */
+
+      return ret;
     }
+  };
 
-    hb::unique_ptr<hb_blob_t> table_blob {hb_sanitize_context_t ().reference_table<T> (source)};
-    hb_blob_t* ret = hb_blob_reference (table_blob.get ());
-
-    if (likely (cache))
-      cache->set (+T::tableTag, std::move (table_blob));
-
-    return ret;
-  }
+  template<typename T>
+  auto source_table() HB_AUTO_RETURN (source_table_loader<T> {} (this)) /* return type follows whatever the (possibly specialized) loader for T returns */
 
   bool in_error () const { return !successful; }
 
@@ -237,4 +236,5 @@
   }
 };
 
+
 #endif /* HB_SUBSET_PLAN_HH */
diff --git a/src/hb-subset.cc b/src/hb-subset.cc
index e644fb8..cc875ce 100644
--- a/src/hb-subset.cc
+++ b/src/hb-subset.cc
@@ -62,6 +62,21 @@
 using OT::Layout::GSUB;
 using OT::Layout::GPOS;
 
+
+template<>
+struct hb_subset_plan_t::source_table_loader<const OT::cff1> /* CFF1 specialization: yields a cached accelerator loader rather than a table blob */
+{
+  auto operator () (hb_subset_plan_t *plan)
+  HB_AUTO_RETURN (plan->accelerator ? plan->accelerator->cff1_accel : plan->cff1_accel) /* use the accelerator's loader when the plan has an accelerator, else the plan's own */
+};
+template<>
+struct hb_subset_plan_t::source_table_loader<const OT::cff2> /* CFF2 specialization: yields a cached accelerator loader rather than a table blob */
+{
+  auto operator () (hb_subset_plan_t *plan)
+  HB_AUTO_RETURN (plan->accelerator ? plan->accelerator->cff2_accel : plan->cff2_accel) /* use the accelerator's loader when the plan has an accelerator, else the plan's own */
+};
+
+
 /**
  * SECTION:hb-subset
  * @title: hb-subset
@@ -262,19 +277,26 @@
   return _try_subset (table, buf, c);
 }
 
+template <typename T>
+auto _do_destroy (T &t, hb_priority<1>) HB_RETURN (void, t.destroy ()) /* NOTE(review): presumably chosen only when t.destroy () is well-formed; HB_RETURN is defined elsewhere, confirm */
+
+template <typename T>
+void _do_destroy (T &t, hb_priority<0>) {} /* lower-priority overload: does nothing */
+
 template<typename TableType>
 static bool
 _subset (hb_subset_plan_t *plan, hb_vector_t<char> &buf)
 {
-  auto source_blob = plan->source_table<TableType> ();
-  const TableType *table = source_blob.get ();
+  auto &&source_blob = plan->source_table<TableType> ();
+  auto *table = source_blob.get ();
 
   hb_tag_t tag = TableType::tableTag;
-  if (!source_blob.get_blob()->data)
+  hb_blob_t *blob = source_blob.get_blob();
+  if (unlikely (!blob || !blob->data))
   {
     DEBUG_MSG (SUBSET, nullptr,
                "OT::%c%c%c%c::subset sanitize failed on source table.", HB_UNTAG (tag));
-    source_blob.destroy ();
+    _do_destroy (source_blob, hb_prioritize);
     return false;
   }
 
@@ -284,23 +306,23 @@
 			 TableType::tableTag == HB_OT_TAG_GPOS ||
 			 TableType::tableTag == HB_OT_TAG_name;
 
-  unsigned buf_size = _plan_estimate_subset_table_size (plan, source_blob.get_length (), same_size_table);
+  unsigned buf_size = _plan_estimate_subset_table_size (plan, blob->length, same_size_table);
   DEBUG_MSG (SUBSET, nullptr,
              "OT::%c%c%c%c initial estimated table size: %u bytes.", HB_UNTAG (tag), buf_size);
   if (unlikely (!buf.alloc (buf_size)))
   {
     DEBUG_MSG (SUBSET, nullptr, "OT::%c%c%c%c failed to allocate %u bytes.", HB_UNTAG (tag), buf_size);
-    source_blob.destroy ();
+    _do_destroy (source_blob, hb_prioritize);
     return false;
   }
 
   bool needed = false;
   hb_serialize_context_t serializer (buf.arrayZ, buf.allocated);
   {
-    hb_subset_context_t c (source_blob.get_blob (), plan, &serializer, tag);
+    hb_subset_context_t c (blob, plan, &serializer, tag);
     needed = _try_subset (table, &buf, &c);
   }
-  source_blob.destroy ();
+  _do_destroy (source_blob, hb_prioritize);
 
   if (serializer.in_error () && !serializer.only_offset_overflow ())
   {