#!/usr/bin/env python

"""Generate the C lookup table of Indic syllabic and positional categories.

Usage:

    ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt

The three input files are parsed as ``range ; value`` records (``#`` starts a
comment).  The script prints to standard output a C source fragment holding
the category short-name macros, the ``indic_table`` array and the
``hb_indic_get_categories`` lookup function.

Raises an Exception when two category names collapse to the same short alias
or when the generated table is less than 30% occupied.
"""

from __future__ import print_function, division, absolute_import

import io
import sys

if len (sys.argv) != 4:
    print ("usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt", file=sys.stderr)
    sys.exit (1)

# Code points that are kept regardless of the block they belong to.
ALLOWED_SINGLES = [0x00A0, 0x25CC]
# Only code points from these blocks end up in the generated table.
ALLOWED_BLOCKS = [
    'Basic Latin',
    'Latin-1 Supplement',
    'Devanagari',
    'Bengali',
    'Gurmukhi',
    'Gujarati',
    'Oriya',
    'Tamil',
    'Telugu',
    'Kannada',
    'Malayalam',
    'Sinhala',
    'Myanmar',
    'Khmer',
    'Vedic Extensions',
    'General Punctuation',
    'Superscripts and Subscripts',
    'Devanagari Extended',
    'Myanmar Extended-B',
    'Myanmar Extended-A',
]

# Parse the input files one at a time so that every handle is closed as soon
# as it has been consumed.  For each file (in command-line order) we keep:
#   headers[i] - its first two lines, echoed into the generated comment
#   data[i]    - code point -> value
#   values[i]  - value -> number of code points carrying it
headers = []
data = []
values = []
for path in sys.argv[1:]:
    file_data = {}
    file_values = {}
    with io.open (path, encoding='utf-8') as f:
        headers.append ([f.readline () for _ in range (2)])
        for line in f:

            # Strip trailing comments.
            j = line.find ('#')
            if j >= 0:
                line = line[:j]

            fields = [x.strip () for x in line.split (';')]
            if len (fields) == 1:
                # No ';' on the line: blank or comment-only.
                continue

            # First field is either a single code point or "start..end" (hex).
            uu = fields[0].split ('..')
            start = int (uu[0], 16)
            if len (uu) == 1:
                end = start
            else:
                end = int (uu[1], 16)

            t = fields[1]

            for u in range (start, end + 1):
                file_data[u] = t
            file_values[t] = file_values.get (t, 0) + end - start + 1
    data.append (file_data)
    values.append (file_values)

# Merge data into one dict:
defaults = ('Other', 'Not_Applicable', 'No_Block')
for i,v in enumerate (defaults):
    values[i][v] = values[i].get (v, 0) + 1
combined = {}
for i,d in enumerate (data):
    for u,v in d.items ():
        # The third file only annotates code points already seen in the
        # first two; it never introduces new entries.
        if i == 2 and u not in combined:
            continue
        if u not in combined:
            combined[u] = list (defaults)
        combined[u][i] = v
combined = {k:v for k,v in combined.items() if k in ALLOWED_SINGLES or v[2] in ALLOWED_BLOCKS}
data = combined
del combined

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
singles = {}
for u in ALLOWED_SINGLES:
    singles[u] = data[u]
    del data[u]

print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt")
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
    for l in h:
        print (" * %s" % (l.strip()))
print (" */")
print ()
print ('#include "hb.hh"')
print ()
print ('#ifndef HB_NO_OT_SHAPE')
print ()
print ('#include "hb-ot-shape-complex-indic.hh"')
print ()

# Shorten values
short = [{
    "Bindu": 'Bi',
    "Cantillation_Mark": 'Ca',
    "Joiner": 'ZWJ',
    "Non_Joiner": 'ZWNJ',
    "Number": 'Nd',
    "Visarga": 'Vs',
    "Vowel": 'Vo',
    "Vowel_Dependent": 'M',
    "Consonant_Prefixed": 'CPrf',
    "Other": 'x',
},{
    "Not_Applicable": 'x',
}]
all_shorts = [{},{}]

# Add some of the values, to make them more readable, and to avoid duplicates


for i in range (2):
    for v,s in short[i].items ():
        all_shorts[i][s] = v

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
cat_defs = []
for i in range (2):
    vv = sorted (values[i].keys ())
    for v in vv:
        v_no_and = v.replace ('_And_', '_')
        if v in short[i]:
            s = short[i][v]
        else:
            # Derive an alias from the upper-case letters of the name and
            # refuse to continue if it clashes with an existing alias.
            s = ''.join ([c for c in v_no_and if ord ('A') <= ord (c) <= ord ('Z')])
            if s in all_shorts[i]:
                raise Exception ("Duplicate short value alias", v, all_shorts[i][s])
            all_shorts[i][s] = v
            short[i][v] = s
        cat_defs.append ((what_short[i] + '_' + s, what[i] + '_' + v.upper (), str (values[i][v]), v))

maxlen_s = max ([len (c[0]) for c in cat_defs])
maxlen_l = max ([len (c[1]) for c in cat_defs])
maxlen_n = max ([len (c[2]) for c in cat_defs])
for s in what_short:
    print ()
    for c in [c for c in cat_defs if s in c[0]]:
        print ("#define %s %s /* %s chars; %s */" %
            (c[0].ljust (maxlen_s), c[1].ljust (maxlen_l), c[2].rjust (maxlen_n), c[3]))
print ()
print ('#pragma GCC diagnostic pop')
print ()
print ("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print ()
print ()

# Running statistics updated by print_block ().
total = 0
used = 0
last_block = None
def print_block (block, start, end, data):
    """Print the table entries for code points start..end (inclusive).

    Entries are printed eight per row, each row prefixed with a comment
    giving the row's first code point.  Code points missing from `data`
    are printed with the `defaults` categories.  When `block` is truthy and
    differs from the previously printed block, a comment naming the block is
    printed first.

    `start` and `end + 1` must both be multiples of 8 (asserted).

    Updates the module-level counters `total` (entries printed), `used`
    (entries that were present in `data`) and `last_block`.
    """
    global total, used, last_block
    if block and block != last_block:
        print ()
        print ()
        print (" /* %s */" % block)
    num = 0
    assert start % 8 == 0
    assert (end+1) % 8 == 0
    for u in range (start, end+1):
        if u % 8 == 0:
            print ()
            print (" /* %04X */" % u, end="")
        if u in data:
            num += 1
        d = data.get (u, defaults)
        print ("%9s" % ("_(%s,%s)," % (short[0][d[0]], short[1][d[1]])), end="")

    total += end - start + 1
    used += num
    if block:
        last_block = block

uu = sorted (data.keys ())

last = -100000
offset = 0
starts = []
ends = []
print ("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
    if u <= last:
        continue
    block = data[u][2]

    # Extend the run while consecutive code points exist in the same block,
    # then round it out to whole rows of 8.  `data` has exactly the keys of
    # `uu`, so testing the dict gives the same answer as testing the list
    # without a linear scan per code point.
    start = u//8*8
    end = start+1
    while end in data and block == data[end][2]:
        end += 1
    end = (end-1)//8*8 + 7

    if start != last + 1:
        if start - last <= 1+16*3:
            # Small gap: pad it with default entries instead of starting a
            # new table segment.
            print_block (None, last+1, start-1, data)
            last = start-1
        else:
            # Large gap: close the previous segment (if there was one) and
            # emit the offset macro for the segment that starts here.
            if last >= 0:
                ends.append (last + 1)
                offset += ends[-1] - starts[-1]
            print ()
            print ()
            print ("#define indic_offset_0x%04xu %d" % (start, offset))
            starts.append (start)

    print_block (block, start, end, data)
    last = end
ends.append (last + 1)
offset += ends[-1] - starts[-1]
print ()
print ()
occupancy = used * 100. / total
page_bits = 12
print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
print ()
print ("INDIC_TABLE_ELEMENT_TYPE")
print ("hb_indic_get_categories (hb_codepoint_t u)")
print ("{")
print (" switch (u >> %d)" % page_bits)
print (" {")
pages = set ([u>>page_bits for u in starts+ends+list (singles.keys ())])
for p in sorted(pages):
    print (" case 0x%0Xu:" % p)
    for u,d in singles.items ():
        if p != u>>page_bits: continue
        print (" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (u, short[0][d[0]], short[1][d[1]]))
    for (start,end) in zip (starts, ends):
        if p not in [start>>page_bits, end>>page_bits]: continue
        # From here on `offset` is the macro name rather than the running
        # integer total, which is no longer needed.
        offset = "indic_offset_0x%04xu" % start
        print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset))
    print (" break;")
    print ("")
print (" default:")
print (" break;")
print (" }")
print (" return _(x,x);")
print ("}")
print ()
print ("#undef _")
for i in range (2):
    print ()
    vv = sorted (values[i].keys ())
    for v in vv:
        print ("#undef %s_%s" %
            (what_short[i], short[i][v]))
print ()
print ('#endif')
print ()
print ("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table
if occupancy < 30:
    raise Exception ("Table too sparse, please investigate: ", occupancy)