#!/usr/bin/python

"""Generator of the function to prohibit certain vowel sequences.

It creates ``_hb_preprocess_text_vowel_constraints``, which inserts dotted
circles into sequences prohibited by the USE script development spec.
This function should be used as the ``preprocess_text`` of an
``hb_ot_complex_shaper_t``.
"""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
# The import that succeeds selects the matching ``write`` implementation:
# ``HTMLParser`` for Python 2, ``html.parser`` for Python 3.
try:
    from HTMLParser import HTMLParser
    def write (s):
        """Write ``s`` to standard output as UTF-8, without a newline."""
        print (s.encode ('utf-8'), end='')
except ImportError:
    from html.parser import HTMLParser
    def write (s):
        """Write ``s`` to standard output as UTF-8, without a newline."""
        # Flush the text layer first so the raw bytes written below stay in
        # order with earlier ``print`` output.
        sys.stdout.flush ()
        sys.stdout.buffer.write (s.encode ('utf-8'))
import itertools
import io
import sys


def _strip_comment (line):
    """Return ``line`` with everything from the first ``#`` removed."""
    j = line.find ('#')
    if j >= 0:
        return line[:j]
    return line


def parse_scripts (f):
    """Parse script assignments from an open ``Scripts.txt``-style file.

    Each data line has the form ``XXXX ; Script`` or ``XXXX..YYYY ; Script``;
    ``#`` starts a comment.  Lines without a ``;`` are skipped.

    Args:
        f: A text file object (supports ``readline`` and iteration).

    Returns:
        A tuple ``(header, scripts, script_order)``: the first two lines of
        the file as read, a dict mapping each code point to its script name,
        and a dict mapping each script name to the start of the first range
        seen for that script.
    """
    header = [f.readline () for i in range (2)]
    scripts = {}
    script_order = {}
    for line in f:
        fields = [x.strip () for x in _strip_comment (line).split (';')]
        if len (fields) == 1:
            continue
        uu = fields[0].split ('..')
        start = int (uu[0], 16)
        end = start if len (uu) == 1 else int (uu[1], 16)
        script = fields[1]
        for u in range (start, end + 1):
            scripts[u] = script
        if script not in script_order:
            script_order[script] = start
    return header, scripts, script_order


class ConstraintSet (object):
    """A set of prohibited code point sequences.

    Args:
        constraint (List[int]): A prohibited code point sequence.

    """
    def __init__ (self, constraint):
        # Either a list or a dictionary. As a list of code points, it
        # represents a prohibited code point sequence. As a dictionary,
        # it represents a set of prohibited sequences, where each item
        # represents the set of prohibited sequences starting with the
        # key (a code point) concatenated with any of the values
        # (ConstraintSets).
        self._c = constraint

    def add (self, constraint):
        """Add a constraint to this set.

        An empty ``constraint`` is ignored.  When one sequence is a prefix
        of another, only the shorter one is kept.
        """
        if not constraint:
            return
        first = constraint[0]
        rest = constraint[1:]
        if isinstance (self._c, list):
            if constraint == self._c[:len (constraint)]:
                # The new sequence is a prefix of the stored one: keep the
                # shorter sequence.
                self._c = constraint
            elif self._c != constraint[:len (self._c)]:
                # Neither is a prefix of the other: branch on the first
                # code point.
                self._c = {self._c[0]: ConstraintSet (self._c[1:])}
        if isinstance (self._c, dict):
            if rest and first in self._c:
                self._c[first].add (rest)
            else:
                # Either a new branch, or the new sequence ends here and is
                # therefore a prefix of everything stored under ``first``;
                # as in the list case above, the shorter sequence wins.
                self._c[first] = ConstraintSet (rest)

    def _indent (self, depth):
        """Return the leading whitespace for nesting level ``depth``."""
        return (' ' * depth).replace (' ', '\t')

    def __str__ (self, index=0, depth=4):
        """Return C source text that tests the buffer against this set.

        Args:
            index: Offset from the current buffer position of the code point
                this node tests.
            depth: Nesting level passed to ``_indent``.
        """
        s = []
        indent = self._indent (depth)
        if isinstance (self._c, list):
            if len (self._c) == 0:
                s.append ('{}matched = true;\n'.format (indent))
            elif len (self._c) == 1:
                s.append ('{}matched = 0x{:04X}u == buffer->cur ({}).codepoint;\n'.format (indent, self._c[0], index or ''))
            else:
                s.append ('{}if (0x{:04X}u == buffer->cur ({}).codepoint &&\n'.format (indent, self._c[0], index))
                # NOTE(review): this bound uses len (self._c) regardless of
                # ``index``, while the furthest offset read below is
                # index + len (self._c) - 1; the two agree only when
                # index == 1 -- confirm other depths are intended.
                s.append ('{}buffer->idx + {} < count &&\n'.format (self._indent (depth + 2), len (self._c)))
                for i, cp in enumerate (self._c[1:], start=1):
                    s.append ('{}0x{:04X}u == buffer->cur ({}).codepoint{}\n'.format (
                        self._indent (depth + 2), cp, index + i, ')' if i == len (self._c) - 1 else ' &&'))
                s.append ('{}{{\n'.format (indent))
                for i in range (len (self._c)):
                    s.append ('{}buffer->next_glyph ();\n'.format (self._indent (depth + 1)))
                s.append ('{}_output_dotted_circle (buffer);\n'.format (self._indent (depth + 1)))
                s.append ('{}}}\n'.format (indent))
        else:
            s.append ('{}switch (buffer->cur ({}).codepoint)\n'.format(indent, index or ''))
            s.append ('{}{{\n'.format (indent))
            # Group first code points whose sub-trees generate identical
            # code so that they can share one run of ``case`` labels.
            cases = collections.defaultdict (set)
            for first in sorted (self._c):
                cases[self._c[first].__str__ (index + 1, depth + 2)].add (first)
            for body, labels in sorted (cases.items (), key=lambda b_ls: min (b_ls[1])):
                # Emit the labels four to a line.
                for i, cp in enumerate (sorted (labels)):
                    if i % 4 == 0:
                        s.append (self._indent (depth + 1))
                    else:
                        s.append (' ')
                    s.append ('case 0x{:04X}u:{}'.format (cp, '\n' if i % 4 == 3 else ''))
                if len (labels) % 4 != 0:
                    s.append ('\n')
                s.append (body)
                s.append ('{}break;\n'.format (self._indent (depth + 2)))
            s.append ('{}}}\n'.format (indent))
        return ''.join (s)


def parse_constraints (f, scripts):
    """Parse prohibited sequences from an open constraints file.

    Each data line holds space-separated hexadecimal code points before the
    first ``;``; ``#`` starts a comment.  Sequences are grouped by the script
    of their first code point.

    Args:
        f: A text file object (supports ``readline`` and iteration).
        scripts: Dict mapping code points to script names.

    Returns:
        A tuple ``(header, constraints)``: the first two lines of the file,
        stripped, and a dict mapping script names to ``ConstraintSet``s.

    Raises:
        ValueError: If a sequence has fewer than two code points, if the
            script of a sequence's first code point is unknown, or if the
            file contains no sequences at all.
    """
    header = [f.readline ().strip () for i in range (2)]
    constraints = {}
    for line in f:
        constraint = [int (cp, 16) for cp in _strip_comment (line).split (';')[0].split ()]
        if not constraint:
            continue
        if len (constraint) < 2:
            raise ValueError ('Prohibited sequence is too short: {}'.format (constraint))
        if constraint[0] not in scripts:
            raise ValueError ('Unknown script for code point: U+{:04X}'.format (constraint[0]))
        script = scripts[constraint[0]]
        if script in constraints:
            constraints[script].add (constraint)
        else:
            constraints[script] = ConstraintSet (constraint)
    if not constraints:
        raise ValueError ('No constraints found')
    return header, constraints


def generate (constraints_header, scripts_header, constraints, script_order):
    """Print the generated C source to standard output.

    Args:
        constraints_header: Header lines of the constraints file.
        scripts_header: Header lines of the scripts file.
        constraints: Dict mapping script names to ``ConstraintSet``s.
        script_order: Dict mapping script names to a sort key; scripts are
            emitted in ascending order of this key.
    """
    print ('/* == Start of generated functions == */')
    print ('/*')
    print (' * The following functions are generated by running:')
    print (' *')
    print (' * %s HBIndicVowelConstraints.txt Scripts.txt' % sys.argv[0])
    print (' *')
    print (' * on files with these headers:')
    print (' *')
    for line in constraints_header:
        print (' * %s' % line.strip ())
    print (' *')
    for line in scripts_header:
        print (' * %s' % line.strip ())
    print (' */')

    print ()
    print ('#include "hb.hh"')
    print ()
    print ('#ifndef HB_NO_OT_SHAPE')
    print ()
    print ('#include "hb-ot-shape-complex-vowel-constraints.hh"')
    print ()
    print ('static void')
    print ('_output_dotted_circle (hb_buffer_t *buffer)')
    print ('{')
    print (' hb_glyph_info_t &dottedcircle = buffer->output_glyph (0x25CCu);')
    print (' _hb_glyph_info_reset_continuation (&dottedcircle);')
    print ('}')
    print ()
    print ('static void')
    print ('_output_with_dotted_circle (hb_buffer_t *buffer)')
    print ('{')
    print (' _output_dotted_circle (buffer);')
    print (' buffer->next_glyph ();')
    print ('}')
    print ()

    print ('void')
    print ('_hb_preprocess_text_vowel_constraints (const hb_ot_shape_plan_t *plan HB_UNUSED,')
    print ('\t\t\t\t hb_buffer_t *buffer,')
    print ('\t\t\t\t hb_font_t *font HB_UNUSED)')
    print ('{')
    print ('#if defined(HB_NO_OT_SHAPE_COMPLEX_VOWEL_CONSTRAINTS)')
    print (' return;')
    print ('#endif')
    print (' if (buffer->flags & HB_BUFFER_FLAG_DO_NOT_INSERT_DOTTED_CIRCLE)')
    print (' return;')
    print ()
    print (' /* UGLY UGLY UGLY business of adding dotted-circle in the middle of')
    print (' * vowel-sequences that look like another vowel. Data for each script')
    print (' * collected from the USE script development spec.')
    print (' *')
    print (' * https://github.com/harfbuzz/harfbuzz/issues/1019')
    print (' */')
    print (' bool processed = false;')
    print (' buffer->clear_output ();')
    print (' unsigned int count = buffer->len;')
    print (' switch ((unsigned) buffer->props.script)')
    print (' {')

    for script, constraint_set in sorted (constraints.items (), key=lambda s_c: script_order[s_c[0]]):
        print (' case HB_SCRIPT_{}:'.format (script.upper ()))
        print (' for (buffer->idx = 0; buffer->idx + 1 < count && buffer->successful;)')
        print (' {')
        print ('\tbool matched = false;')
        write (str (constraint_set))
        print ('\tbuffer->next_glyph ();')
        print ('\tif (matched) _output_with_dotted_circle (buffer);')
        print (' }')
        print (' processed = true;')
        print (' break;')
        print ()

    print (' default:')
    print (' break;')
    print (' }')
    print (' if (processed)')
    print (' {')
    print (' if (buffer->idx < count)')
    print (' buffer->next_glyph ();')
    print (' buffer->swap_buffers ();')
    print (' }')
    print ('}')

    print ()
    print ()
    print ('#endif')
    print ('/* == End of generated functions == */')


def main ():
    """Read the two input files named on the command line and emit C code."""
    if len (sys.argv) != 3:
        print ('usage: ./gen-vowel-constraints.py HBIndicVowelConstraints.txt Scripts.txt', file=sys.stderr)
        sys.exit (1)

    with io.open (sys.argv[2], encoding='utf-8') as f:
        scripts_header, scripts, script_order = parse_scripts (f)

    with io.open (sys.argv[1], encoding='utf-8') as f:
        constraints_header, constraints = parse_constraints (f, scripts)

    generate (constraints_header, scripts_header, constraints, script_order)


if __name__ == '__main__':
    main ()