| #!/usr/bin/env python3 |
| |
| """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt |
| |
| Input files: |
| * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt |
| * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt |
| * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt |
| """ |
| |
| import sys |
| |
| if len (sys.argv) != 4: |
| sys.exit (__doc__) |
| |
| ALLOWED_SINGLES = [0x00A0, 0x25CC] |
| ALLOWED_BLOCKS = [ |
| 'Basic Latin', |
| 'Latin-1 Supplement', |
| 'Devanagari', |
| 'Bengali', |
| 'Gurmukhi', |
| 'Gujarati', |
| 'Oriya', |
| 'Tamil', |
| 'Telugu', |
| 'Kannada', |
| 'Malayalam', |
| 'Myanmar', |
| 'Khmer', |
| 'Vedic Extensions', |
| 'General Punctuation', |
| 'Superscripts and Subscripts', |
| 'Devanagari Extended', |
| 'Myanmar Extended-B', |
| 'Myanmar Extended-A', |
| 'Myanmar Extended-C', |
| ] |
| |
| files = [open (x, encoding='utf-8') for x in sys.argv[1:]] |
| |
| headers = [[f.readline () for i in range (2)] for f in files] |
| |
| unicode_data = [{} for _ in files] |
| for i, f in enumerate (files): |
| for line in f: |
| |
| j = line.find ('#') |
| if j >= 0: |
| line = line[:j] |
| |
| fields = [x.strip () for x in line.split (';')] |
| if len (fields) == 1: |
| continue |
| |
| uu = fields[0].split ('..') |
| start = int (uu[0], 16) |
| if len (uu) == 1: |
| end = start |
| else: |
| end = int (uu[1], 16) |
| |
| t = fields[1] |
| |
| for u in range (start, end + 1): |
| unicode_data[i][u] = t |
| |
| # Merge data into one dict: |
| defaults = ('Other', 'Not_Applicable', 'No_Block') |
| combined = {} |
| for i,d in enumerate (unicode_data): |
| for u,v in d.items (): |
| if i == 2 and not u in combined: |
| continue |
| if not u in combined: |
| combined[u] = list (defaults) |
| combined[u][i] = v |
| combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS} |
| |
| |
| # Convert categories & positions types |
| |
| categories = { |
| 'indic' : [ |
| 'X', |
| 'C', |
| 'V', |
| 'N', |
| 'H', |
| 'ZWNJ', |
| 'ZWJ', |
| 'M', |
| 'SM', |
| 'A', |
| 'VD', |
| 'PLACEHOLDER', |
| 'DOTTEDCIRCLE', |
| 'RS', |
| 'MPst', |
| 'Repha', |
| 'Ra', |
| 'CM', |
| 'Symbol', |
| 'CS', |
| 'SMPst', |
| ], |
| 'khmer' : [ |
| 'VAbv', |
| 'VBlw', |
| 'VPre', |
| 'VPst', |
| |
| 'Robatic', |
| 'Xgroup', |
| 'Ygroup', |
| ], |
| 'myanmar' : [ |
| 'VAbv', |
| 'VBlw', |
| 'VPre', |
| 'VPst', |
| |
| 'IV', |
| 'As', |
| 'DB', |
| 'GB', |
| 'MH', |
| 'MR', |
| 'MW', |
| 'MY', |
| 'PT', |
| 'VS', |
| 'ML', |
| ], |
| } |
| |
| category_map = { |
| 'Other' : 'X', |
| 'Avagraha' : 'Symbol', |
| 'Bindu' : 'SM', |
| 'Brahmi_Joining_Number' : 'PLACEHOLDER', # Don't care. |
| 'Cantillation_Mark' : 'A', |
| 'Consonant' : 'C', |
| 'Consonant_Dead' : 'C', |
| 'Consonant_Final' : 'CM', |
| 'Consonant_Head_Letter' : 'C', |
| 'Consonant_Initial_Postfixed' : 'C', # TODO |
| 'Consonant_Killer' : 'M', # U+17CD only. |
| 'Consonant_Medial' : 'CM', |
| 'Consonant_Placeholder' : 'PLACEHOLDER', |
| 'Consonant_Preceding_Repha' : 'Repha', |
| 'Consonant_Prefixed' : 'X', # Don't care. |
| 'Consonant_Subjoined' : 'CM', |
| 'Consonant_Succeeding_Repha' : 'CM', |
| 'Consonant_With_Stacker' : 'CS', |
| 'Gemination_Mark' : 'SM', # https://github.com/harfbuzz/harfbuzz/issues/552 |
| 'Invisible_Stacker' : 'H', |
| 'Joiner' : 'ZWJ', |
| 'Modifying_Letter' : 'X', |
| 'Non_Joiner' : 'ZWNJ', |
| 'Nukta' : 'N', |
| 'Number' : 'PLACEHOLDER', |
| 'Number_Joiner' : 'PLACEHOLDER', # Don't care. |
| 'Pure_Killer' : 'M', # Is like a vowel matra. |
| 'Register_Shifter' : 'RS', |
| 'Syllable_Modifier' : 'SM', |
| 'Tone_Letter' : 'X', |
| 'Tone_Mark' : 'N', |
| 'Virama' : 'H', |
| 'Visarga' : 'SM', |
| 'Vowel' : 'V', |
| 'Vowel_Dependent' : 'M', |
| 'Vowel_Independent' : 'V', |
| } |
| position_map = { |
| 'Not_Applicable' : 'END', |
| |
| 'Left' : 'PRE_C', |
| 'Top' : 'ABOVE_C', |
| 'Bottom' : 'BELOW_C', |
| 'Right' : 'POST_C', |
| |
| # These should resolve to the position of the last part of the split sequence. |
| 'Bottom_And_Right' : 'POST_C', |
| 'Left_And_Right' : 'POST_C', |
| 'Top_And_Bottom' : 'BELOW_C', |
| 'Top_And_Bottom_And_Left' : 'BELOW_C', |
| 'Top_And_Bottom_And_Right' : 'POST_C', |
| 'Top_And_Left' : 'ABOVE_C', |
| 'Top_And_Left_And_Right' : 'POST_C', |
| 'Top_And_Right' : 'POST_C', |
| |
| 'Overstruck' : 'AFTER_MAIN', |
| 'Visual_order_left' : 'PRE_M', |
| } |
| |
| category_overrides = { |
| |
| # These are the variation-selectors. They only appear in the Myanmar grammar |
| # but are not Myanmar-specific |
| 0xFE00: 'VS', |
| 0xFE01: 'VS', |
| 0xFE02: 'VS', |
| 0xFE03: 'VS', |
| 0xFE04: 'VS', |
| 0xFE05: 'VS', |
| 0xFE06: 'VS', |
| 0xFE07: 'VS', |
| 0xFE08: 'VS', |
| 0xFE09: 'VS', |
| 0xFE0A: 'VS', |
| 0xFE0B: 'VS', |
| 0xFE0C: 'VS', |
| 0xFE0D: 'VS', |
| 0xFE0E: 'VS', |
| 0xFE0F: 'VS', |
| |
| # These appear in the OT Myanmar spec, but are not Myanmar-specific |
| 0x2015: 'PLACEHOLDER', |
| 0x2022: 'PLACEHOLDER', |
| 0x25FB: 'PLACEHOLDER', |
| 0x25FC: 'PLACEHOLDER', |
| 0x25FD: 'PLACEHOLDER', |
| 0x25FE: 'PLACEHOLDER', |
| |
| |
| # Indic |
| |
| 0x0930: 'Ra', # Devanagari |
| 0x09B0: 'Ra', # Bengali |
| 0x09F0: 'Ra', # Bengali |
| 0x0A30: 'Ra', # Gurmukhi No Reph |
| 0x0AB0: 'Ra', # Gujarati |
| 0x0B30: 'Ra', # Oriya |
| 0x0BB0: 'Ra', # Tamil No Reph |
| 0x0C30: 'Ra', # Telugu Reph formed only with ZWJ |
| 0x0CB0: 'Ra', # Kannada |
| 0x0D30: 'Ra', # Malayalam No Reph, Logical Repha |
| |
| # The following act more like the Bindus. |
| 0x0953: 'SM', |
| 0x0954: 'SM', |
| |
| # U+0A40 GURMUKHI VOWEL SIGN II may be preceded by U+0A02 GURMUKHI SIGN BINDI. |
| 0x0A40: 'MPst', |
| |
| # The following act like consonants. |
| 0x0A72: 'C', |
| 0x0A73: 'C', |
| 0x1CF5: 'C', |
| 0x1CF6: 'C', |
| |
| # TODO: The following should only be allowed after a Visarga. |
| # For now, just treat them like regular tone marks. |
| 0x1CE2: 'A', |
| 0x1CE3: 'A', |
| 0x1CE4: 'A', |
| 0x1CE5: 'A', |
| 0x1CE6: 'A', |
| 0x1CE7: 'A', |
| 0x1CE8: 'A', |
| |
| # TODO: The following should only be allowed after some of |
| # the nasalization marks, maybe only for U+1CE9..U+1CF1. |
| # For now, just treat them like tone marks. |
| 0x1CED: 'A', |
| |
| # The following take marks in standalone clusters, similar to Avagraha. |
| 0xA8F2: 'Symbol', |
| 0xA8F3: 'Symbol', |
| 0xA8F4: 'Symbol', |
| 0xA8F5: 'Symbol', |
| 0xA8F6: 'Symbol', |
| 0xA8F7: 'Symbol', |
| 0x1CE9: 'Symbol', |
| 0x1CEA: 'Symbol', |
| 0x1CEB: 'Symbol', |
| 0x1CEC: 'Symbol', |
| 0x1CEE: 'Symbol', |
| 0x1CEF: 'Symbol', |
| 0x1CF0: 'Symbol', |
| 0x1CF1: 'Symbol', |
| |
| 0x0A51: 'M', # https://github.com/harfbuzz/harfbuzz/issues/524 |
| |
| # According to ScriptExtensions.txt, these Grantha marks may also be used in Tamil, |
| # so the Indic shaper needs to know their categories. |
| 0x11301: 'SM', |
| 0x11302: 'SM', |
| 0x11303: 'SM', |
| 0x1133B: 'N', |
| 0x1133C: 'N', |
| |
| 0x0AFB: 'N', # https://github.com/harfbuzz/harfbuzz/issues/552 |
| 0x0B55: 'N', # https://github.com/harfbuzz/harfbuzz/issues/2849 |
| |
| 0x09FC: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/1613 |
| 0x0C80: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/623 |
| 0x0D04: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/pull/3511 |
| |
| 0x25CC: 'DOTTEDCIRCLE', |
| |
| |
| # Khmer |
| |
| 0x179A: 'Ra', |
| |
| 0x17CC: 'Robatic', |
| 0x17C9: 'Robatic', |
| 0x17CA: 'Robatic', |
| |
| 0x17C6: 'Xgroup', |
| 0x17CB: 'Xgroup', |
| 0x17CD: 'Xgroup', |
| 0x17CE: 'Xgroup', |
| 0x17CF: 'Xgroup', |
| 0x17D0: 'Xgroup', |
| 0x17D1: 'Xgroup', |
| |
| 0x17C7: 'Ygroup', |
| 0x17C8: 'Ygroup', |
| 0x17DD: 'Ygroup', |
| 0x17D3: 'Ygroup', # Just guessing. Uniscribe doesn't categorize it. |
| |
| 0x17D9: 'PLACEHOLDER', # https://github.com/harfbuzz/harfbuzz/issues/2384 |
| |
| |
| # Myanmar |
| |
| # https://docs.microsoft.com/en-us/typography/script-development/myanmar#analyze |
| |
| 0x104E: 'C', # The spec says C, IndicSyllableCategory says Consonant_Placeholder |
| |
| 0x1004: 'Ra', |
| 0x101B: 'Ra', |
| 0x105A: 'Ra', |
| |
| 0x1032: 'A', |
| 0x1036: 'A', |
| |
| 0x103A: 'As', |
| |
| #0x1040: 'D0', # XXX The spec says D0, but Uniscribe doesn't seem to do. |
| |
| 0x103E: 'MH', |
| 0x1060: 'ML', |
| 0x103C: 'MR', |
| 0x103D: 'MW', |
| 0x1082: 'MW', |
| 0x103B: 'MY', |
| 0x105E: 'MY', |
| 0x105F: 'MY', |
| |
| 0x1063: 'PT', |
| 0x1064: 'PT', |
| 0x1069: 'PT', |
| 0x106A: 'PT', |
| 0x106B: 'PT', |
| 0x106C: 'PT', |
| 0x106D: 'PT', |
| 0xAA7B: 'PT', |
| |
| 0x1038: 'SM', |
| 0x1087: 'SM', |
| 0x1088: 'SM', |
| 0x1089: 'SM', |
| 0x108A: 'SM', |
| 0x108B: 'SM', |
| 0x108C: 'SM', |
| 0x108D: 'SM', |
| 0x108F: 'SM', |
| 0x109A: 'SM', |
| 0x109B: 'SM', |
| 0x109C: 'SM', |
| |
| 0x104A: 'PLACEHOLDER', |
| } |
| position_overrides = { |
| |
| 0x0A51: 'BELOW_C', # https://github.com/harfbuzz/harfbuzz/issues/524 |
| |
| 0x0B01: 'BEFORE_SUB', # Oriya Bindu is BeforeSub in the spec. |
| } |
| |
| def matra_pos_left(u, block): |
| return "PRE_M" |
| def matra_pos_right(u, block): |
| if block == 'Devanagari': return 'AFTER_SUB' |
| if block == 'Bengali': return 'AFTER_POST' |
| if block == 'Gurmukhi': return 'AFTER_POST' |
| if block == 'Gujarati': return 'AFTER_POST' |
| if block == 'Oriya': return 'AFTER_POST' |
| if block == 'Tamil': return 'AFTER_POST' |
| if block == 'Telugu': return 'BEFORE_SUB' if u <= 0x0C42 else 'AFTER_SUB' |
| if block == 'Kannada': return 'BEFORE_SUB' if u < 0x0CC3 or u > 0x0CD6 else 'AFTER_SUB' |
| if block == 'Malayalam': return 'AFTER_POST' |
| return 'AFTER_SUB' |
| def matra_pos_top(u, block): |
| # BENG and MLYM don't have top matras. |
| if block == 'Devanagari': return 'AFTER_SUB' |
| if block == 'Gurmukhi': return 'AFTER_POST' # Deviate from spec |
| if block == 'Gujarati': return 'AFTER_SUB' |
| if block == 'Oriya': return 'AFTER_MAIN' |
| if block == 'Tamil': return 'AFTER_SUB' |
| if block == 'Telugu': return 'BEFORE_SUB' |
| if block == 'Kannada': return 'BEFORE_SUB' |
| return 'AFTER_SUB' |
| def matra_pos_bottom(u, block): |
| if block == 'Devanagari': return 'AFTER_SUB' |
| if block == 'Bengali': return 'AFTER_SUB' |
| if block == 'Gurmukhi': return 'AFTER_POST' |
| if block == 'Gujarati': return 'AFTER_POST' |
| if block == 'Oriya': return 'AFTER_SUB' |
| if block == 'Tamil': return 'AFTER_POST' |
| if block == 'Telugu': return 'BEFORE_SUB' |
| if block == 'Kannada': return 'BEFORE_SUB' |
| if block == 'Malayalam': return 'AFTER_POST' |
| return "AFTER_SUB" |
| def indic_matra_position(u, pos, block): # Reposition matra |
| if pos == 'PRE_C': return matra_pos_left(u, block) |
| if pos == 'POST_C': return matra_pos_right(u, block) |
| if pos == 'ABOVE_C': return matra_pos_top(u, block) |
| if pos == 'BELOW_C': return matra_pos_bottom(u, block) |
| assert (False) |
| |
| def position_to_category(pos): |
| if pos == 'PRE_C': return 'VPre' |
| if pos == 'ABOVE_C': return 'VAbv' |
| if pos == 'BELOW_C': return 'VBlw' |
| if pos == 'POST_C': return 'VPst' |
| assert(False) |
| |
| |
| defaults = (category_map[defaults[0]], position_map[defaults[1]], defaults[2]) |
| |
| indic_data = {} |
| for k, (cat, pos, block) in combined.items(): |
| cat = category_map[cat] |
| if cat == 'SM' and pos == 'Not_Applicable': |
| cat = 'SMPst' |
| pos = position_map[pos] |
| indic_data[k] = (cat, pos, block) |
| |
| for k,new_cat in category_overrides.items(): |
| (cat, pos, _) = indic_data.get(k, defaults) |
| indic_data[k] = (new_cat, pos, unicode_data[2][k]) |
| |
| # We only expect position for certain types |
| positioned_categories = ('CM', 'SM', 'RS', 'H', 'M', 'MPst') |
| for k, (cat, pos, block) in indic_data.items(): |
| if cat not in positioned_categories: |
| pos = 'END' |
| indic_data[k] = (cat, pos, block) |
| |
| # Position overrides are more complicated |
| |
| # Keep in sync with CONSONANT_FLAGS in the shaper |
| consonant_categories = ('C', 'CS', 'Ra','CM', 'V', 'PLACEHOLDER', 'DOTTEDCIRCLE') |
| matra_categories = ('M', 'MPst') |
| smvd_categories = ('SM', 'SMPst', 'VD', 'A', 'Symbol') |
| for k, (cat, pos, block) in indic_data.items(): |
| if cat in consonant_categories: |
| pos = 'BASE_C' |
| elif cat in matra_categories: |
| if block.startswith('Khmer') or block.startswith('Myanmar'): |
| cat = position_to_category(pos) |
| else: |
| pos = indic_matra_position(k, pos, block) |
| elif cat in smvd_categories: |
| pos = 'SMVD'; |
| indic_data[k] = (cat, pos, block) |
| |
| for k,new_pos in position_overrides.items(): |
| (cat, pos, _) = indic_data.get(k, defaults) |
| indic_data[k] = (cat, new_pos, unicode_data[2][k]) |
| |
| |
| values = [{_: 1} for _ in defaults] |
| for vv in indic_data.values(): |
| for i,v in enumerate(vv): |
| values[i][v] = values[i].get (v, 0) + 1 |
| |
| |
| |
| |
| # Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out |
| singles = {} |
| for u in ALLOWED_SINGLES: |
| singles[u] = indic_data[u] |
| del indic_data[u] |
| |
| print ("/* == Start of generated table == */") |
| print ("/*") |
| print (" * The following table is generated by running:") |
| print (" *") |
| print (" * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt") |
| print (" *") |
| print (" * on files with these headers:") |
| print (" *") |
| for h in headers: |
| for l in h: |
| print (" * %s" % (l.strip())) |
| print (" */") |
| print () |
| print ('#include "hb.hh"') |
| print () |
| print ('#ifndef HB_NO_OT_SHAPE') |
| print () |
| print ('#include "hb-ot-shaper-indic.hh"') |
| print () |
| print ('#pragma GCC diagnostic push') |
| print ('#pragma GCC diagnostic ignored "-Wunused-macros"') |
| print () |
| |
| # Print categories |
| for shaper in categories: |
| print ('#include "hb-ot-shaper-%s-machine.hh"' % shaper) |
| print () |
| done = {} |
| for shaper, shaper_cats in categories.items(): |
| print ('/* %s */' % shaper) |
| for cat in shaper_cats: |
| v = shaper[0].upper() |
| if cat not in done: |
| print ("#define OT_%s %s_Cat(%s)" % (cat, v, cat)) |
| done[cat] = v |
| else: |
| print ('static_assert (OT_%s == %s_Cat(%s), "");' % (cat, v, cat)) |
| print () |
| |
| # Shorten values |
| short = [{ |
| "Repha": 'Rf', |
| "PLACEHOLDER": 'GB', |
| "DOTTEDCIRCLE": 'DC', |
| "SMPst": 'SP', |
| "VPst": 'VR', |
| "VPre": 'VL', |
| "Robatic": 'Rt', |
| "Xgroup": 'Xg', |
| "Ygroup": 'Yg', |
| "As": 'As', |
| },{ |
| "END": 'X', |
| "BASE_C": 'C', |
| "ABOVE_C": 'T', |
| "BELOW_C": 'B', |
| "POST_C": 'R', |
| "PRE_C": 'L', |
| "PRE_M": 'LM', |
| "AFTER_MAIN": 'A', |
| "AFTER_SUB": 'AS', |
| "BEFORE_SUB": 'BS', |
| "AFTER_POST": 'AP', |
| "SMVD": 'SM', |
| }] |
| all_shorts = [{},{}] |
| |
| # Add some of the values, to make them more readable, and to avoid duplicates |
| |
| for i in range (2): |
| for v,s in short[i].items (): |
| all_shorts[i][s] = v |
| |
| what = ["OT", "POS"] |
| what_short = ["_OT", "_POS"] |
| cat_defs = [] |
| for i in range (2): |
| vv = sorted (values[i].keys ()) |
| for v in vv: |
| v_no_and = v.replace ('_And_', '_') |
| if v in short[i]: |
| s = short[i][v] |
| else: |
| s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')]) |
| if s in all_shorts[i]: |
| raise Exception ("Duplicate short value alias", v, all_shorts[i][s]) |
| all_shorts[i][s] = v |
| short[i][v] = s |
| cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + (v.upper () if i else v), str (values[i][v]), v)) |
| |
| maxlen_s = max ([len (c[0]) for c in cat_defs]) |
| maxlen_l = max ([len (c[1]) for c in cat_defs]) |
| maxlen_n = max ([len (c[2]) for c in cat_defs]) |
| for s in what_short: |
| print () |
| for c in [c for c in cat_defs if s in c[0]]: |
| print ("#define %s %s /* %s chars; %s */" % |
| (c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3])) |
| print () |
| print ('#pragma GCC diagnostic pop') |
| print () |
| print ("#define INDIC_COMBINE_CATEGORIES(S,M) ((S) | ((M) << 8))") |
| print () |
| print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (%s_##S, %s_##M)" % tuple(what_short)) |
| print () |
| print () |
| |
| total = 0 |
| used = 0 |
| last_block = None |
| def print_block (block, start, end, data): |
| global total, used, last_block |
| if block and block != last_block: |
| print () |
| print () |
| print (" /* %s */" % block) |
| num = 0 |
| assert start % 8 == 0 |
| assert (end+1) % 8 == 0 |
| for u in range (start, end+1): |
| if u % 8 == 0: |
| print () |
| print (" /* %04X */" % u, end="") |
| if u in data: |
| num += 1 |
| d = data.get (u, defaults) |
| print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="") |
| |
| total += end - start + 1 |
| used += num |
| if block: |
| last_block = block |
| |
| uu = sorted (indic_data) |
| |
| last = -100000 |
| num = 0 |
| offset = 0 |
| starts = [] |
| ends = [] |
| print ("static const uint16_t indic_table[] = {") |
| for u in uu: |
| if u <= last: |
| continue |
| block = indic_data[u][2] |
| |
| start = u//8*8 |
| end = start+1 |
| while end in uu and block == indic_data[end][2]: |
| end += 1 |
| end = (end-1)//8*8 + 7 |
| |
| if start != last + 1: |
| if start - last <= 1+16*2: |
| print_block (None, last+1, start-1, indic_data) |
| else: |
| if last >= 0: |
| ends.append (last + 1) |
| offset += ends[-1] - starts[-1] |
| print () |
| print () |
| print ("#define indic_offset_0x%04xu %d" % (start, offset)) |
| starts.append (start) |
| |
| print_block (block, start, end, indic_data) |
| last = end |
| ends.append (last + 1) |
| offset += ends[-1] - starts[-1] |
| print () |
| print () |
| occupancy = used * 100. / total |
| page_bits = 12 |
| print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)) |
| print () |
| print ("uint16_t") |
| print ("hb_indic_get_categories (hb_codepoint_t u)") |
| print ("{") |
| print (" switch (u >> %d)" % page_bits) |
| print (" {") |
| pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())]) |
| for p in sorted(pages): |
| print (" case 0x%0Xu:" % p) |
| for u,d in singles.items (): |
| if p != u>>page_bits: continue |
| print (" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]])) |
| for (start,end) in zip (starts, ends): |
| if p not in [start>>page_bits, end>>page_bits]: continue |
| offset = "indic_offset_0x%04xu" % start |
| print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)) |
| print (" break;") |
| print ("") |
| print (" default:") |
| print (" break;") |
| print (" }") |
| print (" return _(X,X);") |
| print ("}") |
| print () |
| print ("#undef _") |
| print ("#undef INDIC_COMBINE_CATEGORIES") |
| for i in range (2): |
| print () |
| vv = sorted (values[i].keys ()) |
| for v in vv: |
| print ("#undef %s_%s" % |
| (what_short[i], short[i][v])) |
| print () |
| print ('#endif') |
| print () |
| print ("/* == End of generated table == */") |
| |
| # Maintain at least 50% occupancy in the table */ |
| if occupancy < 50: |
| raise Exception ("Table too sparse, please investigate: ", occupancy) |