blob: 4e86806501467650eafcc5afe97df97561469610 [file] [log] [blame]
Ebrahim Byagowi8d199072020-02-19 14:56:55 +03301#!/usr/bin/env python3
Ebrahim Byagowi08f1d952020-05-28 15:01:15 +04302# flake8: noqa: F821
Behdad Esfahbode2c95112015-07-20 11:32:48 +01003
David Corbett3e635cf2021-10-08 17:13:22 -04004"""usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
Ebrahim Byagowi6a390df2020-02-10 17:19:23 +03305
David Corbett06f49fc2020-08-13 13:37:45 -04006Input files:
Ebrahim Byagowi6a390df2020-02-10 17:19:23 +03307* https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
8* https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
David Corbett3e635cf2021-10-08 17:13:22 -04009* https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
David Corbett8eaee2f2021-10-07 20:10:31 -040010* https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
Ebrahim Byagowi6a390df2020-02-10 17:19:23 +033011* https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
Ebrahim Byagowi08f1d952020-05-28 15:01:15 +043012* https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
David Corbett3e635cf2021-10-08 17:13:22 -040013* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
David Corbett06f49fc2020-08-13 13:37:45 -040014* ms-use/IndicSyllabicCategory-Additional.txt
David Corbettc39ab822020-10-06 16:51:40 -040015* ms-use/IndicPositionalCategory-Additional.txt
Ebrahim Byagowi08f1d952020-05-28 15:01:15 +043016"""
17
import logging
import sys

# Log records are rendered as "LEVEL: message"; INFO and above are emitted.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# The script name plus the nine input files named in the usage text are
# required; with any other argument count, exit with the module docstring.
if len(sys.argv) - 1 != 9:
    sys.exit(__doc__)

# Code points whose script value is in this set are dropped from the merged
# data before the table is generated.
DISABLED_SCRIPTS = {'Arabic', 'Lao', 'Samaritan', 'Syriac', 'Thai'}
Behdad Esfahbode2c95112015-07-20 11:32:48 +010034
# Input files, in command-line order (see the usage text):
#   0 IndicSyllabicCategory.txt      1 IndicPositionalCategory.txt
#   2 ArabicShaping.txt              3 DerivedCoreProperties.txt
#   4 UnicodeData.txt                5 Blocks.txt
#   6 Scripts.txt                    7 IndicSyllabicCategory-Additional.txt
#   8 IndicPositionalCategory-Additional.txt
files = [open(x, encoding='utf-8') for x in sys.argv[1:]]
try:
    # Keep the first two lines of every file except UnicodeData.txt (index 4)
    # as its header.  Skipping index 4 shifts later files down by one slot,
    # so file j >= 5 lives at headers[j - 1].
    headers = [[f.readline() for i in range(2)] for j, f in enumerate(files) if j != 4]
    # The two "-Additional" files contribute every further line up to the
    # first blank one.
    for j in range(7, 9):
        for line in files[j]:
            line = line.rstrip()
            if not line:
                break
            headers[j - 1].append(line)
    headers.append(["UnicodeData.txt does not have a header."])

    # unicode_data[k] maps code point -> value; values[k] counts code points
    # per value.  Files 7 and 8 are folded into slots 0 and 1 respectively.
    unicode_data = [{} for _ in files]
    values = [{} for _ in files]
    for i, f in enumerate(files):
        for line in f:

            # Drop the trailing comment, if any.
            line = line.partition('#')[0]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                continue

            # First field is either "XXXX" or "XXXX..YYYY" (hexadecimal).
            uu = fields[0].split('..')
            start = int(uu[0], 16)
            if len(uu) == 1:
                end = start
            else:
                end = int(uu[1], 16)

            # ArabicShaping.txt and UnicodeData.txt carry the wanted value in
            # the third field; every other file has it in the second.
            t = fields[2 if i in (2, 4) else 1]

            if i == 2:
                t = 'jt_' + t
            elif i == 3 and t != 'Default_Ignorable_Code_Point':
                continue
            elif i == 7 and t == 'Consonant_Final_Modifier':
                # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
                t = 'Syllable_Modifier'
            elif i == 8 and t == 'NA':
                t = 'Not_Applicable'

            i0 = i if i < 7 else i - 7
            for u in range(start, end + 1):
                unicode_data[i0][u] = t
            values[i0][t] = values[i0].get(t, 0) + end - start + 1
finally:
    # Every input has been fully consumed (or parsing failed): release the
    # file handles instead of leaving them open for the rest of the run.
    for f in files:
        f.close()
Behdad Esfahbode2c95112015-07-20 11:32:48 +010082
# Value assumed for a code point that one of the first seven inputs does not
# list, in the same order as those inputs.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
# Count each default once in the per-input value tallies.
for tally, fallback in zip(values, defaults):
    tally[fallback] = tally.get(fallback, 0) + 1

combined = {}
for column, per_input in enumerate(unicode_data):
    # Only the first four inputs may introduce a code point; later inputs
    # merely fill in columns of code points that already have a row.
    may_create = column < 4
    for codepoint, value in per_input.items():
        row = combined.get(codepoint)
        if row is None:
            if not may_create:
                continue
            row = combined[codepoint] = list(defaults)
        row[column] = value

# Column 6 holds the script; discard code points of disabled scripts.
combined = {
    codepoint: row
    for codepoint, row in combined.items()
    if row[6] not in DISABLED_SCRIPTS
}
Behdad Esfahbode2c95112015-07-20 11:32:48 +010097
Behdad Esfahbod20e246e2015-07-20 15:56:19 +010098
# Every property value name that the classification code below refers to by
# bare identifier, grouped by the property it belongs to.
property_names = [
    # General_Category
    'Cc', 'Cf', 'Cn', 'Co', 'Cs',
    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn',
    'Nd', 'Nl', 'No',
    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
    'Sc', 'Sk', 'Sm', 'So',
    'Zl', 'Zp', 'Zs',
    # Indic_Syllabic_Category
    'Other', 'Bindu', 'Visarga', 'Avagraha', 'Nukta', 'Virama',
    'Pure_Killer', 'Reordering_Killer', 'Invisible_Stacker',
    'Vowel_Independent', 'Vowel_Dependent', 'Vowel',
    'Consonant_Placeholder', 'Consonant', 'Consonant_Dead',
    'Consonant_With_Stacker', 'Consonant_Prefixed',
    'Consonant_Preceding_Repha', 'Consonant_Succeeding_Repha',
    'Consonant_Subjoined', 'Consonant_Medial', 'Consonant_Final',
    'Consonant_Head_Letter', 'Consonant_Initial_Postfixed',
    'Modifying_Letter', 'Tone_Letter', 'Tone_Mark', 'Gemination_Mark',
    'Cantillation_Mark', 'Register_Shifter', 'Syllable_Modifier',
    'Consonant_Killer', 'Non_Joiner', 'Joiner', 'Number_Joiner', 'Number',
    'Brahmi_Joining_Number', 'Symbol_Modifier',
    'Hieroglyph', 'Hieroglyph_Joiner',
    'Hieroglyph_Mark_Begin', 'Hieroglyph_Mark_End',
    'Hieroglyph_Mirror', 'Hieroglyph_Modifier',
    'Hieroglyph_Segment_Begin', 'Hieroglyph_Segment_End',
    # Indic_Positional_Category
    'Not_Applicable', 'Right', 'Left', 'Visual_Order_Left', 'Left_And_Right',
    'Top', 'Bottom', 'Top_And_Bottom', 'Top_And_Bottom_And_Left',
    'Top_And_Right', 'Top_And_Left', 'Top_And_Left_And_Right',
    'Bottom_And_Left', 'Bottom_And_Right', 'Top_And_Bottom_And_Right',
    'Overstruck',
    # Joining_Type
    'jt_C', 'jt_D', 'jt_L', 'jt_R', 'jt_T', 'jt_U', 'jt_X',
]
177
class PropertyValue(object):
    """A named property value that compares and hashes like its own name.

    An instance is equal to a plain string with the same text and to any
    other PropertyValue with the same name, so values read from the data
    files (strings) can be tested directly against these objects.
    """

    def __init__(self, name_):
        self.name = name_

    def __str__(self):
        return self.name

    def __repr__(self):
        return '%s(%r)' % (type(self).__name__, self.name)

    def __eq__(self, other):
        if isinstance(other, str):
            return self.name == other
        if isinstance(other, PropertyValue):
            return self.name == other.name
        # Unrelated types (None, numbers, ...) are simply "not equal" rather
        # than an AttributeError on the missing .name attribute.
        return NotImplemented

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        # Must match hash(name) so lookups against string keys succeed.
        return hash(str(self))
Behdad Esfahbod20e246e2015-07-20 15:56:19 +0100189
# Registry of name -> PropertyValue for every entry of property_names.
property_values = {}

for name in property_names:
    value = PropertyValue(name)
    # Each name must be unique in the list ...
    assert not (value in property_values)
    # ... and must not collide with anything already defined in this module.
    assert not (value in globals())
    property_values[name] = value

# Publish every value as a module-level name so the code below can refer to
# e.g. Consonant or Top as a bare identifier.
globals().update(property_values)
198
199
def is_BASE(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'B'.

    True when any of these holds:
    * the syllabic category is Number, Consonant, Consonant_Head_Letter,
      Tone_Letter or Vowel_Independent;
    * the joining type is C, D, L or R and the syllabic category is not
      Joiner;
    * the general category is Lo and the syllabic category is one of the
      categories listed in the final test.
    """
    if UISC in (Number, Consonant, Consonant_Head_Letter, Tone_Letter, Vowel_Independent):
        return True
    # TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
    if AJT in (jt_C, jt_D, jt_L, jt_R) and UISC != Joiner:
        return True
    return UGC == Lo and UISC in (Avagraha, Bindu, Consonant_Final, Consonant_Medial,
                                  Consonant_Subjoined, Vowel, Vowel_Dependent)
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'N': syllabic category is Brahmi_Joining_Number."""
    return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'GB'.

    True for Consonant_Placeholder and for a fixed set of six code points.
    """
    return (UISC == Consonant_Placeholder
            or U in (0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE))
def is_CGJ(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'CGJ'.

    Truthy for Joiner and for default-ignorable marks (Mc, Me, Mn).  When the
    code point is neither a Joiner nor default-ignorable, the falsy UDI
    value itself is returned.
    """
    # Also includes VARIATION_SELECTOR and ZWJ
    if UISC == Joiner:
        return True
    return UDI and UGC in (Mc, Me, Mn)
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'F'.

    True for Consonant_Succeeding_Repha, and for Consonant_Final unless the
    general category is Lo.
    """
    if UISC == Consonant_Succeeding_Repha:
        return True
    return UISC == Consonant_Final and UGC != Lo
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'FM': syllabic category is Syllable_Modifier."""
    return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'M'.

    True for Consonant_Initial_Postfixed, and for Consonant_Medial unless the
    general category is Lo.
    """
    # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
    if UISC == Consonant_Initial_Postfixed:
        return True
    return UISC == Consonant_Medial and UGC != Lo
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'CM': Nukta, Gemination_Mark or Consonant_Killer."""
    return UISC in (Nukta, Gemination_Mark, Consonant_Killer)
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SUB': Consonant_Subjoined that is not Lo."""
    if UISC != Consonant_Subjoined:
        return False
    return UGC != Lo
David Corbett8eaee2f2021-10-07 20:10:31 -0400230def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
Behdad Esfahbode07669f2017-10-03 14:57:14 +0200231 return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'H'.

    True for Virama, except for code points claimed by
    is_HALANT_OR_VOWEL_MODIFIER.
    """
    if UISC != Virama:
        return False
    return not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'HVM': true only for U+0DCA."""
    # Split off of HALANT
    return 0x0DCA == U
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'HN': syllabic category is Number_Joiner."""
    return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'G': syllabic category is Hieroglyph."""
    return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'J': syllabic category is Hieroglyph_Joiner."""
    return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'HR': syllabic category is Hieroglyph_Mirror."""
    return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'HM': syllabic category is Hieroglyph_Modifier."""
    return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SB': a hieroglyph mark or segment opener."""
    return UISC in (Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin)
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SE': a hieroglyph mark or segment closer."""
    return UISC in (Hieroglyph_Mark_End, Hieroglyph_Segment_End)
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'IS'.

    True for Invisible_Stacker, except for code points claimed by is_SAKOT.
    """
    # Split off of HALANT
    if UISC != Invisible_Stacker:
        return False
    return not is_SAKOT(U, UISC, UDI, UGC, AJT)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'ZWNJ': syllabic category is Non_Joiner."""
    return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'O'.

    A candidate is general category Po, or syllabic category Consonant_Dead,
    Joiner, Modifying_Letter or Other.  A candidate only counts when none of
    the more specific predicates tried below accepts it.
    """
    # Also includes BASE_IND and SYM
    if not (UGC == Po or UISC in (Consonant_Dead, Joiner, Modifying_Letter, Other)):
        return False
    more_specific = (is_BASE, is_BASE_OTHER, is_CGJ, is_SYM_MOD, is_Word_Joiner)
    return not any(claims(U, UISC, UDI, UGC, AJT) for claims in more_specific)
def is_REORDERING_KILLER(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'RK': syllabic category is Reordering_Killer."""
    return UISC == Reordering_Killer
def is_REPHA(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'R': Consonant_Preceding_Repha or Consonant_Prefixed."""
    return UISC in (Consonant_Preceding_Repha, Consonant_Prefixed)
def is_SAKOT(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'Sk': true only for U+1A60."""
    # Split off of HALANT
    return 0x1A60 == U
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'SM': syllabic category is Symbol_Modifier."""
    return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'V'.

    True for Pure_Killer, and for Vowel or Vowel_Dependent unless the general
    category is Lo.
    """
    if UISC == Pure_Killer:
        return True
    return UGC != Lo and UISC in (Vowel, Vowel_Dependent)
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'VM'.

    True for Tone_Mark, Cantillation_Mark, Register_Shifter and Visarga, and
    for Bindu unless the general category is Lo.
    """
    if UISC in (Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga):
        return True
    return UGC != Lo and UISC == Bindu
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
    """Predicate for USE category 'WJ'.

    True for every code point of general category Cn.  Otherwise true for a
    default-ignorable code point of syllabic category Other that is not one
    of the eight listed exceptions and is not accepted by is_CGJ.
    """
    # Also includes Rsv
    if UGC == Cn:
        return True
    exceptions = (0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3)
    if not UDI or U in exceptions:
        return False
    return UISC == Other and not is_CGJ(U, UISC, UDI, UGC, AJT)
Behdad Esfahbod20e246e2015-07-20 15:56:19 +0100288
# USE category tag -> predicate deciding membership.  map_to_use() requires
# that exactly one predicate accepts any given code point.
use_mapping = {
    # Bases
    'B': is_BASE,
    'N': is_BASE_NUM,
    'GB': is_BASE_OTHER,
    'CGJ': is_CGJ,
    # Consonant-related marks
    'F': is_CONS_FINAL,
    'FM': is_CONS_FINAL_MOD,
    'M': is_CONS_MED,
    'CM': is_CONS_MOD,
    'SUB': is_CONS_SUB,
    'CS': is_CONS_WITH_STACKER,
    # Halants and stackers
    'H': is_HALANT,
    'HVM': is_HALANT_OR_VOWEL_MODIFIER,
    'HN': is_HALANT_NUM,
    'IS': is_INVISIBLE_STACKER,
    # Hieroglyphs
    'G': is_HIEROGLYPH,
    'HM': is_HIEROGLYPH_MOD,
    'HR': is_HIEROGLYPH_MIRROR,
    'J': is_HIEROGLYPH_JOINER,
    'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
    'SE': is_HIEROGLYPH_SEGMENT_END,
    # Everything else
    'ZWNJ': is_ZWNJ,
    'O': is_OTHER,
    'RK': is_REORDERING_KILLER,
    'R': is_REPHA,
    'Sk': is_SAKOT,
    'SM': is_SYM_MOD,
    'V': is_VOWEL,
    'VM': is_VOWEL_MOD,
    'WJ': is_Word_Joiner,
}
320
# USE category tag -> positional refinement.
#
# A dict value maps a suffix to the Indic_Positional_Category values that
# select it; map_to_use() appends the matching suffix to the tag.  A None
# value means the category may carry a positional category but is never
# given a suffix.
use_positions = {
    'F': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Right],
    },
    'M': {
        'Abv': [Top],
        'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Bottom_And_Left],
    },
    'CM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
    },
    'V': {
        'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
        'Blw': [Bottom, Overstruck, Bottom_And_Right],
        'Pst': [Right],
        'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
    },
    'VM': {
        'Abv': [Top],
        'Blw': [Bottom, Overstruck],
        'Pst': [Right],
        'Pre': [Left],
    },
    'SM': {
        'Abv': [Top],
        'Blw': [Bottom],
    },
    'H': None,
    'HM': None,
    'HR': None,
    'HVM': None,
    'IS': None,
    'B': None,
    'FM': {
        'Abv': [Top],
        'Blw': [Bottom],
        'Pst': [Not_Applicable],
    },
    'R': None,
    'RK': None,
    'SUB': None,
}
368
def map_to_use(data):
    """Classify every code point of *data* into a USE category.

    *data* maps a code point to its seven merged values: syllabic category,
    positional category, joining type, default-ignorable flag, general
    category, block and script (the script is ignored here).

    Returns a dict mapping each code point to ``(USE, UBlock)``, where USE is
    a key of ``use_mapping``, extended with a suffix from ``use_positions``
    when that category has positional variants.  An assertion fails when a
    code point does not resolve to exactly one category or suffix.
    """
    out = {}
    predicates = use_mapping.items()
    for U, (UISC, UIPC, AJT, UDI, UGC, UBlock, _) in data.items():

        # Resolve Indic_Syllabic_Category

        if 0x1CE2 <= U <= 0x1CE8:
            # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
            UISC = Cantillation_Mark
        elif U in (0x0F18, 0x0F19, 0x0F3E, 0x0F3F):
            # Tibetan:
            # TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
            UISC = Vowel_Dependent
        elif U == 0x1CED:
            # TODO: U+1CED should only be allowed after some of
            # the nasalization marks, maybe only for U+1CE9..U+1CF1.
            UISC = Tone_Mark

        matches = [tag for tag, accepts in predicates if accepts(U, UISC, UDI, UGC, AJT)]
        assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(U), UISC, UDI, UGC, AJT, matches)
        USE = matches[0]

        # Resolve Indic_Positional_Category

        if U in (0x11302, 0x11303, 0x114C1):
            # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
            # and https://github.com/harfbuzz/harfbuzz/issues/1631
            UIPC = Top
        if U == 0x113CF:
            # TODO: https://github.com/microsoft/font-tools/issues/17#issuecomment-2346952091
            UIPC = Bottom

        # A positioned code point must belong to a category that
        # use_positions knows about, apart from two listed exceptions.
        assert (UIPC in [Not_Applicable, Visual_Order_Left] or
                U in {0x0F7F, 0x11A3A} or
                USE in use_positions), "%s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT)

        suffixes = use_positions.get(USE)
        if suffixes:
            matches = [suffix for suffix, positions in suffixes.items()
                       if positions and UIPC in positions]
            assert len(matches) == 1, "%s %s %s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UDI, UGC, AJT, matches)
            USE = USE + matches[0]

        out[U] = (USE, UBlock)
    return out
412
use_data = map_to_use(combined)

# Opening comment of the generated header: how it was produced ...
for text in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format(sys.argv[0]),
    " *",
    " * on files with these headers:",
    " *",
):
    print(text)

# ... followed by the header lines captured from each input file.
for header in headers:
    for header_line in header:
        print(" * %s" % header_line.strip())

# Include guard and the includes the table depends on.
for text in (
    " */",
    "",
    "#ifndef HB_OT_SHAPER_USE_TABLE_HH",
    "#define HB_OT_SHAPER_USE_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
    '#include "hb-ot-shaper-use-machine.hh"',
    "",
):
    print(text)
Behdad Esfahbode2c95112015-07-20 11:32:48 +0100435
# Running statistics maintained by print_block().
total = 0
used = 0
last_block = None


def print_block(block, start, end, use_data):
    """Print the table cells for code points *start* through *end* inclusive.

    A block heading is printed when *block* is truthy and differs from the
    previously printed block.  Cells are laid out 16 per row, each row
    starting with a comment giving its first code point.  *start* and
    ``end + 1`` must be multiples of 8.

    A cell shows the USE category recorded in *use_data*; a code point
    without one is printed as 'O' when it appears in unicode_data[4] and as
    'WJ' otherwise.  The module-level counters ``total``, ``used`` and
    ``last_block`` are updated.
    """
    global total, used, last_block

    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)

    # Pad a row that starts mid-way through a 16-cell line.
    column = start % 16
    if column:
        print(' ' * (20 + column * 6), end='')

    assert start % 8 == 0
    assert (end + 1) % 8 == 0

    covered = 0
    for codepoint in range(start, end + 1):
        if codepoint % 16 == 0:
            print()
            print(" /* %04X */" % codepoint, end='')
        if codepoint in use_data:
            covered += 1
        entry = use_data.get(codepoint)
        if entry is not None:
            cell = entry[0]
        elif codepoint in unicode_data[4]:
            cell = 'O'
        else:
            cell = 'WJ'
        print("%6s," % cell, end='')

    total += end - start + 1
    used += covered
    if block:
        last_block = block
469
uu = sorted(use_data)

last = -100000
num = 0
offset = 0
starts = []
ends = []

# Emit one short macro per USE tag, silencing the unused-macro warning for
# the ones the table ends up not using.
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')

# Categories without positional variants: the macro is the bare tag, and the
# comment is the predicate's name minus its "is_" prefix.
for tag, predicate in sorted(use_mapping.items()):
    if use_positions.get(tag):
        continue
    print("#define %s USE(%s) /* %s */" % (tag, tag, predicate.__name__[3:]))

# Categories with positional variants: one macro per tag + suffix.
for base, suffixes in sorted(use_positions.items()):
    for suffix in suffixes or ():
        name = base + suffix
        print("#define %s USE(%s)" % (name, name))

print('#pragma GCC diagnostic pop')
print("")
Behdad Esfahbode2c95112015-07-20 11:32:48 +0100489
Behdad Esfahbode2c95112015-07-20 11:32:48 +0100490
import packTab

# Code point -> USE category, without the block name.
data = {codepoint: entry[0] for codepoint, entry in use_data.items()}

DEFAULT = 5
COMPACT = 9

# Generate the lookup twice, once per compression level; the second variant
# is emitted in the preprocessor's #else branch of HB_OPTIMIZE_SIZE.
for compression, guard in ((DEFAULT, '#ifndef HB_OPTIMIZE_SIZE'), (COMPACT, '#else')):

    logging.info(' Compression=%d:', compression)
    print()
    print(guard)
    print()

    code = packTab.Code('hb_use')
    sol = packTab.pack_table(data, compression=compression, default='O')
    logging.info(' FullCost=%d', sol.fullCost)
    sol.genCode(code, 'get_category')
    code.print_c(linkage='static inline')
    print()

print('#endif')
Behdad Esfahbode2c95112015-07-20 11:32:48 +0100516
print()

# Undefine every macro emitted earlier: bare tags for categories without
# positional variants, then tag + suffix for those with variants.
for tag in sorted(use_mapping):
    if use_positions.get(tag):
        continue
    print("#undef %s" % tag)
for base, suffixes in sorted(use_positions.items()):
    for suffix in suffixes or ():
        print("#undef %s" % (base + suffix))

print()
print()
print("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print("/* == End of generated table == */")