#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
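
# Example of how the mask bits above combine: 'A' (U+0041) should end up
# with flags == ALPHA_MASK | UPPER_MASK | PRINTABLE_MASK | XID_START_MASK |
# XID_CONTINUE_MASK | CASED_MASK == 0x2781 (illustrative; the actual value
# is computed per character in makeunicodetype() below).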

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
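
    # Worked example of the packing above (illustrative, from UnicodeData.txt):
    # U+00C0 decomposes canonically to "0041 0300" with no <prefix> tag, so
    # its run in decomp_data is [0 + (2<<8), 0x0041, 0x0300] and
    # decomp_index[0x00C0] points at the header int of that run.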

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)
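
    # All of these index pairs are consulted from C with the same two-level
    # lookup (the invariant documented in splitbins() below), e.g. for the
    # record index:
    #     index2[(index1[ch >> SHIFT] << SHIFT) + (ch & ((1 << SHIFT) - 1))]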

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()
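
# A sketch of how the generated delta tables are meant to be used (see
# merge_old_version() below for the encoding): for the "3.2.0" delta,
# get_change_3_2_0(ch) returns a change_record in which a field of 0xFF
# (or 0 for the numeric field) means "same as in the current UCD", and
# normalization_3_2_0(ch) returns the old 1:1 decomposition, or 0.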

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
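
    # Worked example of the extended-case packing above: for U+00DF (LATIN
    # SMALL LETTER SHARP S), SpecialCasing gives sc == ([0xDF], [0x53, 0x73],
    # [0x53, 0x53]) and CaseFolding gives cf == [0x73, 0x73], so "upper"
    # becomes <offset into extra_casing> | (2 << 24), and "lower" also
    # carries len(cf) in bits 20-23 because cf != sc[0].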

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()
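
# For reference, the generated _PyUnicode_ToNumeric() ends up looking
# roughly like this (abridged, illustrative code points):
#     case 0x0030:
#         return (double) 0.0;
#     case 0x00BD:
#         return (double) 1.0/2.0;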

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
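
    # Worked example of the escape encoding above: a word with index
    # i == 300 when short == 254 is emitted as the two bytes
    # (300 >> 8) + 254 == 255 and 300 & 255 == 44 (hypothetical index;
    # the real values depend on the word frequencies counted above).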

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()
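
# The PUA slots written above are what make alias lookup work: e.g.
# unicodedata.lookup('LATIN CAPITAL LETTER GHA') resolves through the
# phrasebook entry stored at a U+F00xx code point and is mapped back to
# U+01A2 by the name_aliases[] table in the generated header.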

def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), old.table[i][k], new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception): pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))

def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')
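
# For example, open_data(UNICODE_DATA, "9.0.0") reads a cached local copy of
# "UnicodeData-9.0.0.txt", downloading it first from
# http://www.unicode.org/Public/9.0.0/ucd/UnicodeData.txt if necessary.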

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
            if cjk_check and cjk_ranges != cjk_ranges_found:
                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])
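
        # An EastAsianWidth line such as "3400..4DB5;W" (trailing comments
        # are stripped above) assigns width "W" to every code point in the
        # CJK Ideograph Extension A range.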

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])
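
        # Example of the packing above: a "0300..034E ; NFC_QC; M" line
        # (illustrative range) stores 'MN'.index('M') + 1 == 1 shifted by
        # qc_order.index('NFC_QC') * 2 == 4, i.e. bits 4-5 of quickchecks[ch]
        # encode "Maybe" for each ch in the range.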

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
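    # A toy illustration of the invariant above (the shift actually chosen
    # depends on the byte-size search below): for t = [1, 1, 1, 1, 2, 2, 2, 2]
    # a valid result is t1 = [0, 0, 1, 1], t2 = [1, 1, 2, 2], shift = 1,
    # and e.g. t[5] == t2[(t1[5 >> 1] << 1) + (5 & 1)] == 2.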

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)