1#!/usr/bin/python
2
3"""Generator of the function to prohibit certain vowel sequences.
4
5It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
6circles into sequences prohibited by the USE script development spec.
7This function should be used as the ``preprocess_text`` of an
8``hb_ot_complex_shaper_t``.
9"""
10
11from __future__ import absolute_import, division, print_function, unicode_literals
12
13import collections
try:
	# Python 2 keeps the parser in the top-level ``HTMLParser`` module.
	from HTMLParser import HTMLParser
except ImportError:
	# Otherwise fall back to the ``html.parser`` location.
	from html.parser import HTMLParser
	def write (s):
		"""Write ``s`` to standard output as UTF-8 bytes; no newline is added."""
		# Flush the text layer before writing raw bytes -- presumably so that
		# earlier ``print`` output is not reordered behind them.
		sys.stdout.flush ()
		sys.stdout.buffer.write (s.encode ('utf-8'))
else:
	def write (s):
		"""Print ``s`` encoded as UTF-8; no newline is added."""
		print (s.encode ('utf-8'), end='')
23import itertools
24import io
25import sys
26
# Exactly two arguments must follow the program name: the vowel constraints
# file and Scripts.txt. Otherwise report the usage on stderr and exit with 1.
if len (sys.argv[1:]) != 2:
	print ('usage: ./gen-vowel-constraints.py HBIndicVowelConstraints.txt Scripts.txt', file=sys.stderr)
	raise SystemExit (1)
30
with io.open (sys.argv[2], encoding='utf-8') as f:
	# The first two lines are kept verbatim; they are echoed into the
	# header comment of the generated file.
	scripts_header = [f.readline () for _ in range (2)]
	# Code point -> script name.
	scripts = {}
	# Script name -> first code point of the first range listed for it;
	# used to order the scripts in the generated code.
	script_order = {}
	for line in f:
		# Discard everything from the first '#' on, then split the
		# remaining text into ';'-separated fields.
		fields = [x.strip () for x in line.partition ('#')[0].split (';')]
		if len (fields) == 1:
			# No ';' on this line, so it carries no data.
			continue
		# The first field is either one code point or 'START..END', in hex.
		cp_range = fields[0].split ('..')
		start = int (cp_range[0], 16)
		end = start if len (cp_range) == 1 else int (cp_range[1], 16)
		script = fields[1]
		scripts.update ((cp, script) for cp in range (start, end + 1))
		# Only the first range seen for a script decides its order.
		script_order.setdefault (script, start)
53
class ConstraintSet (object):
	"""A set of prohibited code point sequences.

	The set is stored as a prefix tree. ``str ()`` of a set is the C code
	that tests the buffer against it.

	Args:
		constraint (List[int]): A prohibited code point sequence.

	"""
	def __init__ (self, constraint):
		# Either a list or a dictionary. As a list of code points, it
		# represents a prohibited code point sequence. As a dictionary,
		# it represents a set of prohibited sequences, where each item
		# represents the set of prohibited sequences starting with the
		# key (a code point) concatenated with any of the values
		# (ConstraintSets).
		self._c = constraint

	def add (self, constraint):
		"""Add a constraint to this set.

		A sequence that is a prefix of a stored sequence replaces it, and
		a sequence that extends a stored sequence is ignored, so the
		result does not depend on insertion order.

		Args:
			constraint (List[int]): A prohibited code point sequence.
				An empty list is ignored.

		"""
		if not constraint:
			return
		first = constraint[0]
		rest = constraint[1:]
		if isinstance (self._c, list):
			if constraint == self._c[:len (constraint)]:
				# The new sequence is a prefix of the stored one: keep the
				# shorter sequence.
				self._c = constraint
			elif self._c != constraint[:len (self._c)]:
				# The sequences diverge: turn this node into a dictionary
				# keyed by the first code point. The new sequence is then
				# inserted by the dictionary branch below.
				self._c = {self._c[0]: ConstraintSet (self._c[1:])}
			# Otherwise the stored sequence is a prefix of the new one,
			# and there is nothing to add.
		if isinstance (self._c, dict):
			if first not in self._c:
				self._c[first] = ConstraintSet (rest)
			elif rest:
				self._c[first].add (rest)
			else:
				# The new sequence ends at this key, so it is a prefix of
				# every sequence stored below it. Replace the subtree with
				# an empty leaf. Recursing with an empty list would be a
				# no-op and the shorter sequence would be silently lost.
				self._c[first] = ConstraintSet ([])

	def _indent (self, depth):
		"""Return indentation for ``depth`` levels of two spaces each.

		Every run of eight spaces is replaced by a tab.
		"""
		return ('  ' * depth).replace ('        ', '\t')

	def __str__ (self, index=0, depth=4):
		"""Return the C code that tests the buffer against this set.

		Args:
			index (int): Offset passed to ``buffer->cur`` for the code
				point tested at this node.
			depth (int): Indentation depth of the emitted code.

		"""
		s = []
		indent = self._indent (depth)
		if isinstance (self._c, list):
			if len (self._c) == 0:
				# Nothing left to test.
				s.append ('{}matched = true;\n'.format (indent))
			elif len (self._c) == 1:
				# A single remaining code point: a plain comparison. An
				# index of 0 is emitted as an empty argument list.
				s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
			else:
				# Several remaining code points: emit an ``if`` that tests
				# them all, then advances one glyph per code point and
				# outputs the dotted circle itself.
				s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index))
				s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), len (self._c)))
				for i, cp in enumerate (self._c[1:], start=1):
					s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
						self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
				s.append ('{}{{\n'.format (indent))
				for _ in self._c:
					s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
				s.append ('{}_output_dotted_circle (buffer);\n'.format (self._indent (depth + 1)))
				s.append ('{}}}\n'.format (indent))
		else:
			s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
			s.append ('{}{{\n'.format (indent))
			# Group the keys by the code generated for their subtrees, so
			# that keys with identical bodies share one run of labels.
			cases = collections.defaultdict (set)
			for first, rest in sorted (self._c.items ()):
				cases[rest.__str__ (index + 1, depth + 2)].add (first)
			# Order the groups by their smallest label; four labels per line.
			for body, labels in sorted (cases.items (), key=lambda b_ls: sorted (b_ls[1])[0]):
				for i, cp in enumerate (sorted (labels)):
					if i % 4 == 0:
						s.append (self._indent (depth + 1))
					else:
						s.append (' ')
					s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
				if len (labels) % 4 != 0:
					s.append ('\n')
				s.append (body)
				s.append ('{}break;\n'.format (self._indent (depth + 2)))
			s.append ('{}}}\n'.format (indent))
		return ''.join (s)
128
# Script name -> ConstraintSet of the sequences prohibited in that script.
constraints = {}
with io.open (sys.argv[1], encoding='utf-8') as f:
	# The first two lines, stripped, are echoed into the header comment of
	# the generated file.
	constraints_header = [f.readline ().strip () for i in range (2)]
	for line in f:
		# Discard everything from the first '#' on.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]
		# The text before the first ';' is a whitespace-separated list of
		# hexadecimal code points.
		constraint = [int (cp, 16) for cp in line.split (';')[0].split ()]
		if not constraint: continue
		assert 2 <= len (constraint), 'Prohibited sequence is too short: {}'.format (constraint)
		# A sequence is filed under the script of its first code point.
		script = scripts[constraint[0]]
		if script in constraints:
			constraints[script].add (constraint)
		else:
			constraints[script] = ConstraintSet (constraint)
	# Checked once the whole file has been read. Inside the loop this
	# assertion could never fail, because it ran only after an insertion.
	assert constraints, 'No constraints found'
145
# Emit the opening comment of the generated file: the command that produces
# it and the header lines of both input files.
print ('/* == Start of generated functions == */')
print ('/*')
print (' * The following functions are generated by running:')
print (' *')
# Name the same two input files as the usage message does.
print (' *   %s HBIndicVowelConstraints.txt Scripts.txt' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
for line in constraints_header:
	print (' * %s' % line.strip ())
print (' *')
for line in scripts_header:
	print (' * %s' % line.strip ())
print (' */')
160
# Emit the includes and the two static C helpers used by the generated
# function. An empty string produces a blank line.
for line in (
	'',
	'#include "hb.hh"',
	'',
	'#ifndef HB_NO_OT_SHAPE',
	'',
	'#include "hb-ot-shape-complex-vowel-constraints.hh"',
	'',
	'static void',
	'_output_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);',
	'  _hb_glyph_info_reset_continuation (&dottedcircle);',
	'}',
	'',
	'static void',
	'_output_with_dotted_circle (hb_buffer_t *buffer)',
	'{',
	'  _output_dotted_circle (buffer);',
	'  buffer->next_glyph ();',
	'}',
	'',
):
	print (line)
182
# Emit the signature and prologue of the generated function, up to the
# opening brace of the switch on the buffer's script. An empty string
# produces a blank line.
for line in (
	'void',
	'_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,',
	'\t\t\t\t       hb_buffer_t              *buffer,',
	'\t\t\t\t       hb_font_t                *font HB_UNUSED)',
	'{',
	'#if defined(HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS)',
	'  return;',
	'#endif',
	'  if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)',
	'    return;',
	'',
	'  /* UGLY UGLY UGLY business of adding dotted-circle in the middle of',
	'   * vowel-sequences that look like another vowel.  Data for each script',
	'   * collected from the USE script development spec.',
	'   *',
	'   * https://github.com/harfbuzz/harfbuzz/issues/1019',
	'   */',
	'  bool processed = false;',
	'  buffer->clear_output ();',
	'  unsigned int count = buffer->len;',
	'  switch ((unsigned) buffer->props.script)',
	'  {',
):
	print (line)
205
# Emit one ``case`` per script, ordered by the position recorded in
# script_order. The loop variable has its own name so that the module-level
# ``constraints`` dictionary is not rebound while it is being iterated.
for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
	print ('    case HB_SCRIPT_{}:'.format (script.upper ()))
	print ('      for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
	print ('      {')
	print ('\tbool matched = false;')
	# The generated matching code is written through ``write``, not ``print``.
	write (str (constraint_set))
	print ('\tbuffer->next_glyph ();')
	print ('\tif (matched) _output_with_dotted_circle (buffer);')
	print ('      }')
	print ('      processed = true;')
	print ('      break;')
	print ()
218
# Emit the default case, the end of the generated function and the closing
# marker of the file. An empty string produces a blank line.
for line in (
	'    default:',
	'      break;',
	'  }',
	'  if (processed)',
	'  {',
	'    if (buffer->idx < count)',
	'      buffer->next_glyph ();',
	'    buffer->swap_buffers ();',
	'  }',
	'}',
	'',
	'',
	'#endif',
	'/* == End of generated functions == */',
):
	print (line)
234