Add South-East Asian shaper

Handles Tai Tham, Cham, and New Tai Lue for now.
diff --git a/src/hb-ot-shape-complex-indic.cc b/src/hb-ot-shape-complex-indic.cc
index 9c704fb..fe62783 100644
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@@ -27,6 +27,245 @@
 #include "hb-ot-shape-complex-indic-private.hh"
 #include "hb-ot-layout-private.hh"
 
+/* buffer var allocations: category and position live in the two complex_var_u8 slots */
+#define indic_category() complex_var_u8_0() /* indic_category_t */
+#define indic_position() complex_var_u8_1() /* indic_position_t */
+
+
+/*
+ * Indic shaper.
+ */
+
+
+#define IN_HALF_BLOCK(u, Base) (((u) & ~0x7F) == (Base)) /* u with its low 7 bits cleared equals Base */
+
+#define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900)) /* Devanagari */
+#define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980)) /* Bengali */
+#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00)) /* Gurmukhi */
+#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80)) /* Gujarati */
+#define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00)) /* Oriya */
+#define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80)) /* Tamil */
+#define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00)) /* Telugu */
+#define IS_KNDA(u) (IN_HALF_BLOCK (u, 0x0C80)) /* Kannada */
+#define IS_MLYM(u) (IN_HALF_BLOCK (u, 0x0D00)) /* Malayalam */
+#define IS_SINH(u) (IN_HALF_BLOCK (u, 0x0D80)) /* Sinhala */
+#define IS_KHMR(u) (IN_HALF_BLOCK (u, 0x1780)) /* Khmer */
+
+
+#define MATRA_POS_LEFT(u)	POS_PRE_M /* Left matras get POS_PRE_M regardless of script. */
+#define MATRA_POS_RIGHT(u)	( /* Used for POS_POST_C matras; one position per script. */ \
+				  IS_DEVA(u) ? POS_AFTER_SUB  : \
+				  IS_BENG(u) ? POS_AFTER_POST : \
+				  IS_GURU(u) ? POS_AFTER_POST : \
+				  IS_GUJR(u) ? POS_AFTER_POST : \
+				  IS_ORYA(u) ? POS_AFTER_POST : \
+				  IS_TAML(u) ? POS_AFTER_POST : \
+				  IS_TELU(u) ? ((u) <= 0x0C42 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
+				  IS_KNDA(u) ? ((u) < 0x0CC3 || (u) > 0x0CD6 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
+				  IS_MLYM(u) ? POS_AFTER_POST : \
+				  IS_SINH(u) ? POS_AFTER_SUB  : \
+				  IS_KHMR(u) ? POS_AFTER_POST : \
+				  /*default*/  POS_AFTER_SUB    \
+				)
+#define MATRA_POS_TOP(u)	( /* Used for POS_ABOVE_C matras.  BENG and MLYM don't have top matras. */ \
+				  IS_DEVA(u) ? POS_AFTER_SUB  : \
+				  IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
+				  IS_GUJR(u) ? POS_AFTER_SUB  : \
+				  IS_ORYA(u) ? POS_AFTER_MAIN : \
+				  IS_TAML(u) ? POS_AFTER_SUB  : \
+				  IS_TELU(u) ? POS_BEFORE_SUB : \
+				  IS_KNDA(u) ? POS_BEFORE_SUB : \
+				  IS_SINH(u) ? POS_AFTER_SUB  : \
+				  IS_KHMR(u) ? POS_AFTER_POST : \
+				  /*default*/  POS_AFTER_SUB    \
+				)
+#define MATRA_POS_BOTTOM(u)	( /* Used for POS_BELOW_C matras; one position per script. */ \
+				  IS_DEVA(u) ? POS_AFTER_SUB  : \
+				  IS_BENG(u) ? POS_AFTER_SUB  : \
+				  IS_GURU(u) ? POS_AFTER_POST : \
+				  IS_GUJR(u) ? POS_AFTER_POST : \
+				  IS_ORYA(u) ? POS_AFTER_SUB  : \
+				  IS_TAML(u) ? POS_AFTER_POST : \
+				  IS_TELU(u) ? POS_BEFORE_SUB : \
+				  IS_KNDA(u) ? POS_BEFORE_SUB : \
+				  IS_MLYM(u) ? POS_AFTER_POST : \
+				  IS_SINH(u) ? POS_AFTER_SUB  : \
+				  IS_KHMR(u) ? POS_AFTER_POST : \
+				  /*default*/  POS_AFTER_SUB    \
+				)
+
+static inline indic_position_t
+matra_position (hb_codepoint_t u, indic_position_t side)
+{
+  switch ((int) side) /* Map the side a matra attaches on to its per-script position. */
+  {
+    case POS_PRE_C:	return MATRA_POS_LEFT (u);
+    case POS_POST_C:	return MATRA_POS_RIGHT (u);
+    case POS_ABOVE_C:	return MATRA_POS_TOP (u);
+    case POS_BELOW_C:	return MATRA_POS_BOTTOM (u);
+    default:		return side; /* Not one of the four sides: pass it through. */
+  }
+}
+
+/* XXX
+ * This is a hack for now: a hard-coded list of Ra codepoints, scanned by is_ra().
+ * We should move this data into the main Indic table, or remove it and check the tables.
+ */
+static const hb_codepoint_t ra_chars[] = {
+  0x0930, /* Devanagari */
+  0x09B0, /* Bengali */
+  0x09F0, /* Bengali */
+  0x0A30, /* Gurmukhi */	/* No Reph */
+  0x0AB0, /* Gujarati */
+  0x0B30, /* Oriya */
+  0x0BB0, /* Tamil */		/* No Reph */
+  0x0C30, /* Telugu */		/* Reph formed only with ZWJ */
+  0x0CB0, /* Kannada */
+  0x0D30, /* Malayalam */	/* No Reph, Logical Repha */
+
+  0x0DBB, /* Sinhala */		/* Reph formed only with ZWJ */
+
+  0x179A, /* Khmer */		/* No Reph, Visual Repha */
+};
+
+static inline indic_position_t
+consonant_position (hb_codepoint_t  u)
+{
+  if (IS_KHMR (u)) /* Same block test the matra position tables use. */
+    return POS_BELOW_C; /* In Khmer coeng model, post and below forms should not be reordered. */
+  return POS_BASE_C; /* Will recategorize later based on font lookups. */
+}
+
+static inline bool
+is_ra (hb_codepoint_t u)
+{
+  unsigned int idx = 0; /* Linear scan of ra_chars; stops at the first match. */
+  while (idx < ARRAY_LENGTH (ra_chars) && ra_chars[idx] != u)
+    idx++;
+  return idx < ARRAY_LENGTH (ra_chars); /* Running off the end means u is not a Ra. */
+}
+
+static inline bool
+is_one_of (const hb_glyph_info_t &info, unsigned int flags)
+{
+  /* Once a glyph has ligated, all bets are off: never report a match. */
+  bool ligated = is_a_ligature (info);
+  return !ligated && (FLAG (info.indic_category()) & flags) != 0;
+}
+
+#define JOINER_FLAGS (FLAG (OT_ZWJ) | FLAG (OT_ZWNJ)) /* Bitmask of the two joiner categories. */
+static inline bool
+is_joiner (const hb_glyph_info_t &info)
+{
+  return is_one_of (info, JOINER_FLAGS); /* Always false for ligated glyphs, see is_one_of(). */
+}
+
+/* Note:
+ *
+ * We treat Vowels (OT_V) and placeholders (OT_NBSP, OT_DOTTEDCIRCLE) as if they were consonants.
+ * This is safe because Vowels cannot happen in a consonant syllable.  The upside is that we can
+ * call the consonant syllable logic from the vowel syllable function and get it all right! */
+#define CONSONANT_FLAGS (FLAG (OT_C) | FLAG (OT_CM) | FLAG (OT_Ra) | FLAG (OT_V) | FLAG (OT_NBSP) | FLAG (OT_DOTTEDCIRCLE))
+static inline bool
+is_consonant (const hb_glyph_info_t &info)
+{
+  return is_one_of (info, CONSONANT_FLAGS); /* Always false for ligated glyphs, see is_one_of(). */
+}
+
+#define HALANT_OR_COENG_FLAGS (FLAG (OT_H) | FLAG (OT_Coeng)) /* Bitmask of the OT_H and OT_Coeng categories. */
+static inline bool
+is_halant_or_coeng (const hb_glyph_info_t &info)
+{
+  return is_one_of (info, HALANT_OR_COENG_FLAGS); /* Always false for ligated glyphs, see is_one_of(). */
+}
+
+static inline void
+set_indic_properties (hb_glyph_info_t &info)
+{
+  hb_codepoint_t u = info.codepoint;
+  unsigned int type = hb_indic_get_categories (u); /* Packed category + position. */
+  indic_category_t cat = (indic_category_t) (type & 0x7F); /* Low 7 bits. */
+  indic_position_t pos = (indic_position_t) (type >> 8); /* Bits 8 and above. */
+  /* Start from the table values for this codepoint, then apply the overrides below. */
+
+  /*
+   * Re-assign category
+   */
+
+
+  /* The spec says U+0952 is OT_A.  However, testing shows that Uniscribe
+   * treats U+0951..U+0952 all as OT_VD; we do so for all of U+0951..U+0954.
+   * TESTS:
+   * U+092E,U+0947,U+0952
+   * U+092E,U+0952,U+0947
+   * U+092E,U+0947,U+0951
+   * U+092E,U+0951,U+0947
+   * */
+  if (unlikely (hb_in_range<hb_codepoint_t> (u, 0x0951, 0x0954)))
+    cat = OT_VD;
+
+  if (unlikely (u == 0x17D1))
+    cat = OT_X; /* Force OT_X so the range check below turns it into a top matra too. */
+  if (cat == OT_X &&
+      unlikely (hb_in_range<hb_codepoint_t> (u, 0x17CB, 0x17D3))) /* Khmer Various signs */
+  {
+    /* These are like Top Matras. */
+    cat = OT_M;
+    pos = POS_ABOVE_C;
+  }
+  if (u == 0x17C6) /* Khmer Bindu doesn't like to be repositioned. */
+    cat = OT_N;
+
+  if (unlikely (u == 0x17D2)) cat = OT_Coeng; /* Khmer coeng */
+  else if (unlikely (u == 0x200C)) cat = OT_ZWNJ;
+  else if (unlikely (u == 0x200D)) cat = OT_ZWJ;
+  else if (unlikely (u == 0x25CC)) cat = OT_DOTTEDCIRCLE;
+  else if (unlikely (u == 0x0A71)) cat = OT_SM; /* GURMUKHI ADDAK.  More like a consonant medial, like U+0A75. */
+
+  if (cat == OT_Repha) {
+    /* There are two kinds of characters marked as Repha:
+     * - The ones that are GenCat=Mn are already positioned visually, ie. after base. (eg. Khmer)
+     * - The ones that are GenCat=Lo are encoded logically, ie. beginning of syllable. (eg. Malayalam)
+     *
+     * We recategorize the first kind to look like a Nukta and attached to the base directly.
+     */
+    if (_hb_glyph_info_get_general_category (&info) == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK)
+      cat = OT_N;
+  }
+
+
+
+  /*
+   * Re-assign position.
+   */
+
+  if ((FLAG (cat) & CONSONANT_FLAGS))
+  {
+    pos = consonant_position (u);
+    if (is_ra (u))
+      cat = OT_Ra;
+  }
+  else if (cat == OT_M)
+  {
+    pos = matra_position (u, pos);
+  }
+  else if (cat == OT_SM || cat == OT_VD)
+  {
+    pos = POS_SMVD;
+  }
+
+  if (unlikely (u == 0x0B01)) pos = POS_BEFORE_SUB; /* Oriya Bindu is BeforeSub in the spec. */
+
+
+  /* Store the final category and position in the glyph's buffer vars. */
+  info.indic_category() = cat;
+  info.indic_position() = pos;
+}
+
+/*
+ * Things above this line should ideally be moved to the Indic table itself.
+ */
+
 
 /*
  * Global Indic shaper options.