blob: 3c1f6211eb452ede44ccc654315ecaa1cbfc1987 [file] [log] [blame]
Ebrahim Byagowi8d199072020-02-19 14:56:55 +03301#!/usr/bin/env python3
David Corbett205737a2018-10-12 16:54:54 -04002
3"""Generator of the function to prohibit certain vowel sequences.
4
Behdad Esfahbod6d40eb82018-10-23 02:51:42 -07005It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
David Corbett205737a2018-10-12 16:54:54 -04006circles into sequences prohibited by the USE script development spec.
7This function should be used as the ``preprocess_text`` of an
Behdad Esfahbod44a7b3b2022-06-03 02:42:34 -06008``hb_ot_shaper_t``.
Evgeniy Reizner4dc87362020-02-09 18:39:33 +02009
Ebrahim Byagowi08f1d952020-05-28 15:01:15 +043010usage: ./gen-vowel-constraints.py ms-use/IndicShapingInvalidCluster.txt Scripts.txt
11
12Input file:
13* https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
David Corbett205737a2018-10-12 16:54:54 -040014"""
15
David Corbett205737a2018-10-12 16:54:54 -040016import collections
def write (s):
    """Emit the string ``s`` on standard output as UTF-8 encoded bytes.

    The bytes are written straight to the binary buffer underneath
    ``sys.stdout`` rather than through the text layer.
    """
    stream = sys.stdout
    # Flush first so that text already queued by the text layer is handed
    # to the binary buffer before the bytes written below.
    stream.flush ()
    payload = s.encode ('utf-8')
    stream.buffer.write (payload)
David Corbett205737a2018-10-12 16:54:54 -040020import sys
21
# Exactly two command-line arguments are required; otherwise exit, using
# the module docstring as the exit message.
if len (sys.argv) != 3:
    sys.exit (__doc__)

# Parse the second argument: a semicolon-separated table whose first field
# is a code point or a ``start..end`` range (hexadecimal) and whose second
# field is a script name. ``#`` starts a comment.
with open (sys.argv[2], encoding='utf-8') as f:
    # The first two lines are kept verbatim; they are echoed later in the
    # banner of the generated file.
    scripts_header = [f.readline (), f.readline ()]
    # Maps every listed code point to its script name.
    scripts = {}
    # Maps each script name to the start of the first range seen for it.
    script_order = {}
    for line in f:
        # Discard everything from the first '#' onwards.
        data = line.split ('#', 1)[0]
        fields = [field.strip () for field in data.split (';')]
        if len (fields) == 1:
            # No ';' on this line, so it carries no record.
            continue
        low, *high = fields[0].split ('..')
        start = int (low, 16)
        end = int (high[0], 16) if high else start
        script = fields[1]
        scripts.update (dict.fromkeys (range (start, end + 1), script))
        script_order.setdefault (script, start)
47
class ConstraintSet (object):
    """A set of prohibited code point sequences.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        A sequence that extends one already in the set is ignored, and a
        sequence that is a prefix of stored sequences replaces them, so
        only the shortest prohibited prefixes are kept.

        Args:
            constraint (List[int]): A prohibited code point sequence. An
                empty sequence is ignored.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # Neither is a prefix of the other: switch to the
                # dictionary form and let the code below insert the new
                # sequence next to the old one.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        if isinstance (self._c, dict):
            if rest and first in self._c:
                self._c[first].add (rest)
            else:
                # Either `first` is new, or the sequence ends here. In the
                # latter case it is a prefix of everything stored under
                # `first` and must replace it; delegating an empty `rest`
                # to the child would silently drop the constraint.
                self._c[first] = ConstraintSet (rest)

    @staticmethod
    def _indent (depth):
        """Return the leading whitespace for nesting level ``depth``."""
        # NOTE(review): as written, every space becomes a tab, so this
        # yields `depth` tabs. The two whitespace literals may have lost
        # characters in the copy under review -- confirm against the
        # checked-in generated output before relying on this.
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return the generated code that sets ``matched`` for this set.

        Args:
            index (int): Offset, relative to ``buffer->idx``, of the code
                point at which this set starts matching.
            depth (int): Nesting level used to indent the emitted lines.

        Raises:
            AssertionError: If an empty or single-element tail occurs at
                an offset the emitted ``matched`` logic does not support.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                assert index == 2, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                assert index == 1, 'Cannot use `matched` for this constraint; the general case has not been implemented'
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
            else:
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index or ''))
                # Offset of the last code point this condition reads. The
                # loop emitted around this code only guarantees offsets 0
                # and 1, so anything further needs an explicit bounds
                # check covering the whole tail, not just two elements.
                last_offset = index + len (self._c) - 1
                if last_offset > 1:
                    s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), last_offset))
                for i, cp in enumerate (self._c[1:], start=1):
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
                s.append ('{}{{\n'.format (indent))
                # One `next_glyph` per code point already consumed by the
                # enclosing switches.
                for i in range (index):
                    s.append ('{}(void) buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
                s.append ('{}matched = true;\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group the keys by the text generated for their tails so that
            # identical bodies share one run of `case` labels.
            cases = collections.defaultdict (set)
            for first, rest in sorted (self._c.items ()):
                cases[rest.__str__ (index + 1, depth + 2)].add (first)
            # Order the groups by their smallest label.
            for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
                # Emit at most four labels per output line.
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)
126
# Maps a script name to the ConstraintSet of sequences prohibited for it.
constraints = {}
with open (sys.argv[1], encoding='utf-8') as f:
    # Header: every line (stripped) up to, but not including, the first
    # line consisting solely of '#'. Iterating the file instead of calling
    # readline() in an unconditional loop ends the header at end-of-file
    # too, so an input without the terminator line cannot loop forever.
    constraints_header = []
    for line in f:
        line = line.strip ()
        if line == '#':
            break
        constraints_header.append (line)
    # Body: one prohibited sequence per line, written as whitespace-separated
    # hexadecimal code points before the first ';'. '#' starts a comment.
    for line in f:
        line = line.split ('#', 1)[0]
        constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
        if not constraint:
            continue
        assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
        # A sequence is filed under the script of its first code point.
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
    assert constraints, 'No constraints found'
David Corbett205737a2018-10-12 16:54:54 -0400148
# Banner of the generated file: how it was produced, followed by the header
# lines captured from both input files.
# NOTE(review): whitespace inside the literals in this block is reproduced
# exactly as found; runs of spaces may have been lost in the copy under
# review -- compare with the checked-in generated file.
print ('\n'.join ([
    '/* == Start of generated functions == */',
    '/*',
    ' * The following functions are generated by running:',
    ' *',
    ' * {} ms-use/IndicShapingInvalidCluster.txt Scripts.txt'.format (sys.argv[0]),
    ' *',
    ' * on files with these headers:',
    ' *',
]))
for line in constraints_header:
    print (' * {}'.format (line.strip ()))
print (' *')
for line in scripts_header:
    print (' * {}'.format (line.strip ()))

# Fixed text: includes, the two dotted-circle helpers, and the opening of
# `_hb_preprocess_text_vowel_constraints` up to the brace of its
# `switch` on the buffer's script. An empty string yields a blank line.
print ('\n'.join ([
    ' */',
    '',
    '#include "hb.hh"',
    '',
    '#ifndef HB_NO_OT_SHAPE',
    '',
    '#include "hb-ot-shaper-vowel-constraints.hh"',
    '',
    'static void',
    '_output_dotted_circle (hb_buffer_t *buffer)',
    '{',
    ' (void) buffer->output_glyph (0x25CCu);',
    ' _hb_glyph_info_reset_continuation (&buffer->prev());',
    '}',
    '',
    'static void',
    '_output_with_dotted_circle (hb_buffer_t *buffer)',
    '{',
    ' _output_dotted_circle (buffer);',
    ' (void) buffer->next_glyph ();',
    '}',
    '',
    'void',
    '_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,',
    '\t\t\t\t hb_buffer_t *buffer,',
    '\t\t\t\t hb_font_t *font HB_UNUSED)',
    '{',
    '#ifdef HB_NO_OT_SHAPER_VOWEL_CONSTRAINTS',
    ' return;',
    '#endif',
    ' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)',
    ' return;',
    '',
    ' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
    ' * vowel-sequences that look like another vowel. Data for each script',
    ' * collected from the USE script development spec.',
    ' *',
    ' * https://github.com/harfbuzz/harfbuzz/issues/1019',
    ' */',
    ' buffer->clear_output ();',
    ' unsigned int count = buffer->len;',
    ' switch ((unsigned) buffer->props.script)',
    ' {',
]))
207
# Emit one `case` per script, ordered by the position recorded for the script
# in `script_order`. Each case is a loop over the buffer whose body is the
# code generated from that script's ConstraintSet.
# The loop variable is deliberately not called `constraints`: reusing that
# name would rebind the module-level dictionary being iterated here.
# NOTE(review): whitespace inside these literals is reproduced exactly as
# found; compare with the checked-in generated file.
for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
    print (' case HB_SCRIPT_{}:'.format (script.upper ()))
    print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
    print (' {')
    print ('\tbool matched = false;')
    # The generated matching code goes out through `write`, as UTF-8 bytes.
    write (str (constraint_set))
    print ('\t(void) buffer->next_glyph ();')
    print ('\tif (matched) _output_with_dotted_circle (buffer);')
    print (' }')
    print (' break;')
    print ()
219
# Close the `switch` and the generated function, then the `#ifndef` guard.
# An empty string yields a blank line.
# NOTE(review): whitespace inside these literals is reproduced exactly as
# found; compare with the checked-in generated file.
print ('\n'.join ([
    ' default:',
    ' break;',
    ' }',
    ' buffer->sync ();',
    '}',
    '',
    '',
    '#endif',
    '/* == End of generated functions == */',
]))