1#!/usr/bin/env python 2 3from __future__ import print_function, division, absolute_import 4 5import io, sys 6 7if len (sys.argv) != 5: 8 print ("usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt", file=sys.stderr) 9 sys.exit (1) 10 11BLACKLISTED_BLOCKS = ["Thai", "Lao"] 12 13files = [io.open (x, encoding='utf-8') for x in sys.argv[1:]] 14 15headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 2] 16headers.append (["UnicodeData.txt does not have a header."]) 17 18data = [{} for f in files] 19values = [{} for f in files] 20for i, f in enumerate (files): 21 for line in f: 22 23 j = line.find ('#') 24 if j >= 0: 25 line = line[:j] 26 27 fields = [x.strip () for x in line.split (';')] 28 if len (fields) == 1: 29 continue 30 31 uu = fields[0].split ('..') 32 start = int (uu[0], 16) 33 if len (uu) == 1: 34 end = start 35 else: 36 end = int (uu[1], 16) 37 38 t = fields[1 if i != 2 else 2] 39 40 for u in range (start, end + 1): 41 data[i][u] = t 42 values[i][t] = values[i].get (t, 0) + end - start + 1 43 44defaults = ('Other', 'Not_Applicable', 'Cn', 'No_Block') 45 46# TODO Characters that are not in Unicode Indic files, but used in USE 47data[0][0x034F] = defaults[0] 48data[0][0x2060] = defaults[0] 49data[0][0x20F0] = defaults[0] 50# TODO https://github.com/roozbehp/unicode-data/issues/9 51data[0][0x11C44] = 'Consonant_Placeholder' 52data[0][0x11C45] = 'Consonant_Placeholder' 53# TODO https://github.com/harfbuzz/harfbuzz/pull/1399 54data[0][0x111C8] = 'Consonant_Placeholder' 55for u in range (0xFE00, 0xFE0F + 1): 56 data[0][u] = defaults[0] 57 58# Merge data into one dict: 59for i,v in enumerate (defaults): 60 values[i][v] = values[i].get (v, 0) + 1 61combined = {} 62for i,d in enumerate (data): 63 for u,v in d.items (): 64 if i >= 2 and not u in combined: 65 continue 66 if not u in combined: 67 combined[u] = list (defaults) 68 combined[u][i] = v 69combined = {k:v for k,v in combined.items() if v[3] not in BLACKLISTED_BLOCKS} 70data = combined 71del combined 72num = len (data) 73 74 75property_names = [ 76 # General_Category 77 'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc', 78 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 79 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs', 80 # Indic_Syllabic_Category 81 'Other', 82 'Bindu', 83 'Visarga', 84 'Avagraha', 85 'Nukta', 86 'Virama', 87 'Pure_Killer', 88 'Invisible_Stacker', 89 'Vowel_Independent', 90 'Vowel_Dependent', 91 'Vowel', 92 'Consonant_Placeholder', 93 'Consonant', 94 'Consonant_Dead', 95 'Consonant_With_Stacker', 96 'Consonant_Prefixed', 97 'Consonant_Preceding_Repha', 98 'Consonant_Succeeding_Repha', 99 'Consonant_Subjoined', 100 'Consonant_Medial', 101 'Consonant_Final', 102 'Consonant_Head_Letter', 103 'Consonant_Initial_Postfixed', 104 'Modifying_Letter', 105 'Tone_Letter', 106 'Tone_Mark', 107 'Gemination_Mark', 108 'Cantillation_Mark', 109 'Register_Shifter', 110 'Syllable_Modifier', 111 'Consonant_Killer', 112 'Non_Joiner', 113 'Joiner', 114 'Number_Joiner', 115 'Number', 116 'Brahmi_Joining_Number', 117 # Indic_Positional_Category 118 'Not_Applicable', 119 'Right', 120 'Left', 121 'Visual_Order_Left', 122 'Left_And_Right', 123 'Top', 124 'Bottom', 125 'Top_And_Bottom', 126 'Top_And_Right', 127 'Top_And_Left', 128 'Top_And_Left_And_Right', 129 'Bottom_And_Left', 130 'Bottom_And_Right', 131 'Top_And_Bottom_And_Right', 132 'Overstruck', 133] 134 135try: 136 basestring 137except NameError: 138 basestring = str 139 140class PropertyValue(object): 141 def __init__(self, name_): 142 self.name = name_ 143 def __str__(self): 144 return self.name 145 def __eq__(self, other): 146 return self.name == (other if isinstance(other, basestring) else other.name) 147 def __ne__(self, other): 148 return not (self == other) 149 def __hash__(self): 150 return hash(str(self)) 151 152property_values = {} 153 154for name in property_names: 155 value = PropertyValue(name) 156 assert value not in property_values 157 assert value not in globals() 158 property_values[name] = value 159globals().update(property_values) 160 161 162def is_BASE(U, UISC, UGC): 163 return (UISC in [Number, Consonant, Consonant_Head_Letter, 164 #SPEC-DRAFT Consonant_Placeholder, 165 Tone_Letter, 166 Vowel_Independent #SPEC-DRAFT 167 ] or 168 (UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial, 169 Consonant_Subjoined, Vowel, Vowel_Dependent])) 170def is_BASE_IND(U, UISC, UGC): 171 #SPEC-DRAFT return (UISC in [Consonant_Dead, Modifying_Letter] or UGC == Po) 172 return (UISC in [Consonant_Dead, Modifying_Letter] or 173 (UGC == Po and not U in [0x104B, 0x104E, 0x2022, 0x111C8, 0x11A3F, 0x11A45, 0x11C44, 0x11C45]) or 174 False # SPEC-DRAFT-OUTDATED! U == 0x002D 175 ) 176def is_BASE_NUM(U, UISC, UGC): 177 return UISC == Brahmi_Joining_Number 178def is_BASE_OTHER(U, UISC, UGC): 179 if UISC == Consonant_Placeholder: return True #SPEC-DRAFT 180 #SPEC-DRAFT return U in [0x00A0, 0x00D7, 0x2015, 0x2022, 0x25CC, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 181 return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE] 182def is_CGJ(U, UISC, UGC): 183 return U == 0x034F 184def is_CONS_FINAL(U, UISC, UGC): 185 # Consonant_Initial_Postfixed is new in Unicode 11; not in the spec. 186 return ((UISC == Consonant_Final and UGC != Lo) or 187 UISC == Consonant_Initial_Postfixed or 188 UISC == Consonant_Succeeding_Repha) 189def is_CONS_FINAL_MOD(U, UISC, UGC): 190 #SPEC-DRAFT return UISC in [Consonant_Final_Modifier, Syllable_Modifier] 191 return UISC == Syllable_Modifier 192def is_CONS_MED(U, UISC, UGC): 193 return UISC == Consonant_Medial and UGC != Lo 194def is_CONS_MOD(U, UISC, UGC): 195 return UISC in [Nukta, Gemination_Mark, Consonant_Killer] 196def is_CONS_SUB(U, UISC, UGC): 197 #SPEC-DRAFT return UISC == Consonant_Subjoined 198 return UISC == Consonant_Subjoined and UGC != Lo 199def is_CONS_WITH_STACKER(U, UISC, UGC): 200 return UISC == Consonant_With_Stacker 201def is_HALANT(U, UISC, UGC): 202 return UISC in [Virama, Invisible_Stacker] and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC) 203def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UGC): 204 # https://github.com/harfbuzz/harfbuzz/issues/1102 205 # https://github.com/harfbuzz/harfbuzz/issues/1379 206 return U in [0x11046, 0x1134D] 207def is_HALANT_NUM(U, UISC, UGC): 208 return UISC == Number_Joiner 209def is_ZWNJ(U, UISC, UGC): 210 return UISC == Non_Joiner 211def is_ZWJ(U, UISC, UGC): 212 return UISC == Joiner 213def is_Word_Joiner(U, UISC, UGC): 214 return U == 0x2060 215def is_OTHER(U, UISC, UGC): 216 #SPEC-OUTDATED return UGC == Zs # or any other SCRIPT_COMMON characters 217 return (UISC == Other 218 and not is_SYM_MOD(U, UISC, UGC) 219 and not is_CGJ(U, UISC, UGC) 220 and not is_Word_Joiner(U, UISC, UGC) 221 and not is_VARIATION_SELECTOR(U, UISC, UGC) 222 ) 223def is_Reserved(U, UISC, UGC): 224 return UGC == 'Cn' 225def is_REPHA(U, UISC, UGC): 226 return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed] 227def is_SYM(U, UISC, UGC): 228 if U == 0x25CC: return False #SPEC-DRAFT 229 #SPEC-DRAFT return UGC in [So, Sc] or UISC == Symbol_Letter 230 return UGC in [So, Sc] 231def is_SYM_MOD(U, UISC, UGC): 232 return U in [0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73] 233def is_VARIATION_SELECTOR(U, UISC, UGC): 234 return 0xFE00 <= U <= 0xFE0F 235def is_VOWEL(U, UISC, UGC): 236 # https://github.com/roozbehp/unicode-data/issues/6 237 return (UISC == Pure_Killer or 238 (UGC != Lo and UISC in [Vowel, Vowel_Dependent] and U not in [0xAA29])) 239def is_VOWEL_MOD(U, UISC, UGC): 240 # https://github.com/roozbehp/unicode-data/issues/6 241 return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or 242 (UGC != Lo and (UISC == Bindu or U in [0xAA29]))) 243 244use_mapping = { 245 'B': is_BASE, 246 'IND': is_BASE_IND, 247 'N': is_BASE_NUM, 248 'GB': is_BASE_OTHER, 249 'CGJ': is_CGJ, 250 'F': is_CONS_FINAL, 251 'FM': is_CONS_FINAL_MOD, 252 'M': is_CONS_MED, 253 'CM': is_CONS_MOD, 254 'SUB': is_CONS_SUB, 255 'CS': is_CONS_WITH_STACKER, 256 'H': is_HALANT, 257 'HVM': is_HALANT_OR_VOWEL_MODIFIER, 258 'HN': is_HALANT_NUM, 259 'ZWNJ': is_ZWNJ, 260 'ZWJ': is_ZWJ, 261 'WJ': is_Word_Joiner, 262 'O': is_OTHER, 263 'Rsv': is_Reserved, 264 'R': is_REPHA, 265 'S': is_SYM, 266 'SM': is_SYM_MOD, 267 'VS': is_VARIATION_SELECTOR, 268 'V': is_VOWEL, 269 'VM': is_VOWEL_MOD, 270} 271 272use_positions = { 273 'F': { 274 'Abv': [Top], 275 'Blw': [Bottom], 276 'Pst': [Right], 277 }, 278 'M': { 279 'Abv': [Top], 280 'Blw': [Bottom, Bottom_And_Left], 281 'Pst': [Right], 282 'Pre': [Left], 283 }, 284 'CM': { 285 'Abv': [Top], 286 'Blw': [Bottom], 287 }, 288 'V': { 289 'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right], 290 'Blw': [Bottom, Overstruck, Bottom_And_Right], 291 'Pst': [Right, Top_And_Left, Top_And_Left_And_Right, Left_And_Right], 292 'Pre': [Left], 293 }, 294 'VM': { 295 'Abv': [Top], 296 'Blw': [Bottom, Overstruck], 297 'Pst': [Right], 298 'Pre': [Left], 299 }, 300 'SM': { 301 'Abv': [Top], 302 'Blw': [Bottom], 303 }, 304 'H': None, 305 'HVM': None, 306 'B': None, 307 'FM': None, 308 'SUB': None, 309} 310 311def map_to_use(data): 312 out = {} 313 items = use_mapping.items() 314 for U,(UISC,UIPC,UGC,UBlock) in data.items(): 315 316 # Resolve Indic_Syllabic_Category 317 318 # TODO: These don't have UISC assigned in Unicode 8.0, but have UIPC 319 if U == 0x17DD: UISC = Vowel_Dependent 320 if 0x1CE2 <= U <= 0x1CE8: UISC = Cantillation_Mark 321 322 # Tibetan: 323 # TODO: These don't have UISC assigned in Unicode 11.0, but have UIPC 324 if 0x0F18 <= U <= 0x0F19 or 0x0F3E <= U <= 0x0F3F: UISC = Vowel_Dependent 325 if 0x0F86 <= U <= 0x0F87: UISC = Tone_Mark 326 # Overrides to allow NFC order matching syllable 327 # https://github.com/harfbuzz/harfbuzz/issues/1012 328 if UBlock == 'Tibetan' and is_VOWEL (U, UISC, UGC): 329 if UIPC == Top: 330 UIPC = Bottom 331 332 # TODO: https://github.com/harfbuzz/harfbuzz/pull/982 333 # also https://github.com/harfbuzz/harfbuzz/issues/1012 334 if UBlock == 'Chakma' and is_VOWEL (U, UISC, UGC): 335 if UIPC == Top: 336 UIPC = Bottom 337 elif UIPC == Bottom: 338 UIPC = Top 339 340 # TODO: https://github.com/harfbuzz/harfbuzz/pull/627 341 if 0x1BF2 <= U <= 0x1BF3: UISC = Nukta; UIPC = Bottom 342 343 # TODO: U+1CED should only be allowed after some of 344 # the nasalization marks, maybe only for U+1CE9..U+1CF1. 345 if U == 0x1CED: UISC = Tone_Mark 346 347 # TODO: https://github.com/harfbuzz/harfbuzz/issues/525 348 if U == 0x1A7F: UISC = Consonant_Final; UIPC = Bottom 349 350 # TODO: https://github.com/harfbuzz/harfbuzz/pull/609 351 if U == 0x20F0: UISC = Cantillation_Mark; UIPC = Top 352 353 # TODO: https://github.com/harfbuzz/harfbuzz/pull/626 354 if U == 0xA8B4: UISC = Consonant_Medial 355 356 # TODO: https://github.com/harfbuzz/harfbuzz/issues/1105 357 if U == 0x11134: UISC = Gemination_Mark 358 359 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1399 360 if U == 0x111C9: UISC = Consonant_Final 361 362 values = [k for k,v in items if v(U,UISC,UGC)] 363 assert len(values) == 1, "%s %s %s %s" % (hex(U), UISC, UGC, values) 364 USE = values[0] 365 366 # Resolve Indic_Positional_Category 367 368 # TODO: Not in Unicode 8.0 yet, but in spec. 369 if U == 0x1B6C: UIPC = Bottom 370 371 # TODO: These should die, but have UIPC in Unicode 8.0 372 if U in [0x953, 0x954]: UIPC = Not_Applicable 373 374 # TODO: In USE's override list but not in Unicode 11.0 375 if U == 0x103C: UIPC = Left 376 377 # TODO: These are not in USE's override list that we have, nor are they in Unicode 11.0 378 if 0xA926 <= U <= 0xA92A: UIPC = Top 379 if U == 0x111CA: UIPC = Bottom 380 if U == 0x11300: UIPC = Top 381 # TODO: https://github.com/harfbuzz/harfbuzz/pull/1037 382 if U == 0x11302: UIPC = Top 383 if U == 0x1133C: UIPC = Bottom 384 if U == 0x1171E: UIPC = Left # Correct?! 385 if 0x1CF2 <= U <= 0x1CF3: UIPC = Right 386 if 0x1CF8 <= U <= 0x1CF9: UIPC = Top 387 # https://github.com/roozbehp/unicode-data/issues/8 388 if U == 0x0A51: UIPC = Bottom 389 390 assert (UIPC in [Not_Applicable, Visual_Order_Left] or 391 USE in use_positions), "%s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC) 392 393 pos_mapping = use_positions.get(USE, None) 394 if pos_mapping: 395 values = [k for k,v in pos_mapping.items() if v and UIPC in v] 396 assert len(values) == 1, "%s %s %s %s %s %s" % (hex(U), UIPC, USE, UISC, UGC, values) 397 USE = USE + values[0] 398 399 out[U] = (USE, UBlock) 400 return out 401 402defaults = ('O', 'No_Block') 403data = map_to_use(data) 404 405print ("/* == Start of generated table == */") 406print ("/*") 407print (" * The following table is generated by running:") 408print (" *") 409print (" * ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt UnicodeData.txt Blocks.txt") 410print (" *") 411print (" * on files with these headers:") 412print (" *") 413for h in headers: 414 for l in h: 415 print (" * %s" % (l.strip())) 416print (" */") 417print () 418print ('#include "hb-ot-shape-complex-use.hh"') 419print () 420 421total = 0 422used = 0 423last_block = None 424def print_block (block, start, end, data): 425 global total, used, last_block 426 if block and block != last_block: 427 print () 428 print () 429 print (" /* %s */" % block) 430 if start % 16: 431 print (' ' * (20 + (start % 16 * 6)), end='') 432 num = 0 433 assert start % 8 == 0 434 assert (end+1) % 8 == 0 435 for u in range (start, end+1): 436 if u % 16 == 0: 437 print () 438 print (" /* %04X */" % u, end='') 439 if u in data: 440 num += 1 441 d = data.get (u, defaults) 442 print ("%6s," % d[0], end='') 443 444 total += end - start + 1 445 used += num 446 if block: 447 last_block = block 448 449uu = sorted (data.keys ()) 450 451last = -100000 452num = 0 453offset = 0 454starts = [] 455ends = [] 456for k,v in sorted(use_mapping.items()): 457 if k in use_positions and use_positions[k]: continue 458 print ("#define %s USE_%s /* %s */" % (k, k, v.__name__[3:])) 459for k,v in sorted(use_positions.items()): 460 if not v: continue 461 for suf in v.keys(): 462 tag = k + suf 463 print ("#define %s USE_%s" % (tag, tag)) 464print ("") 465print ("static const USE_TABLE_ELEMENT_TYPE use_table[] = {") 466for u in uu: 467 if u <= last: 468 continue 469 block = data[u][1] 470 471 start = u//8*8 472 end = start+1 473 while end in uu and block == data[end][1]: 474 end += 1 475 end = (end-1)//8*8 + 7 476 477 if start != last + 1: 478 if start - last <= 1+16*3: 479 print_block (None, last+1, start-1, data) 480 last = start-1 481 else: 482 if last >= 0: 483 ends.append (last + 1) 484 offset += ends[-1] - starts[-1] 485 print () 486 print () 487 print ("#define use_offset_0x%04xu %d" % (start, offset)) 488 starts.append (start) 489 490 print_block (block, start, end, data) 491 last = end 492ends.append (last + 1) 493offset += ends[-1] - starts[-1] 494print () 495print () 496occupancy = used * 100. / total 497page_bits = 12 498print ("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy)) 499print () 500print ("USE_TABLE_ELEMENT_TYPE") 501print ("hb_use_get_category (hb_codepoint_t u)") 502print ("{") 503print (" switch (u >> %d)" % page_bits) 504print (" {") 505pages = set([u>>page_bits for u in starts+ends]) 506for p in sorted(pages): 507 print (" case 0x%0Xu:" % p) 508 for (start,end) in zip (starts, ends): 509 if p not in [start>>page_bits, end>>page_bits]: continue 510 offset = "use_offset_0x%04xu" % start 511 print (" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return use_table[u - 0x%04Xu + %s];" % (start, end-1, start, offset)) 512 print (" break;") 513 print ("") 514print (" default:") 515print (" break;") 516print (" }") 517print (" return USE_O;") 518print ("}") 519print () 520for k in sorted(use_mapping.keys()): 521 if k in use_positions and use_positions[k]: continue 522 print ("#undef %s" % k) 523for k,v in sorted(use_positions.items()): 524 if not v: continue 525 for suf in v.keys(): 526 tag = k + suf 527 print ("#undef %s" % tag) 528print () 529print ("/* == End of generated table == */") 530 531# Maintain at least 50% occupancy in the table */ 532if occupancy < 50: 533 raise Exception ("Table too sparse, please investigate: ", occupancy) 534