#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import os
import sys
import zipfile

from textwrap import dedent

SCRIPT = sys.argv[0]
VERSION = "3.2"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "9.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP and in planes 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
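
# Illustrative sketch only (not used by the generator): recover the names of
# the mask bits set in a flags word, assuming the bit assignments stay exactly
# as defined above.
def _describe_type_flags(flags):
    names = ["ALPHA", "DECIMAL", "DIGIT", "LOWER", "LINEBREAK", "SPACE",
             "TITLE", "UPPER", "XID_START", "XID_CONTINUE", "PRINTABLE",
             "NUMERIC", "CASE_IGNORABLE", "CASED", "EXTENDED_CASE"]
    return [name for bit, name in enumerate(names) if flags & (1 << bit)]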

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DB5'),
    ('4E00', '9FD5'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
]
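
# Illustrative sketch only (not part of the build): a Python membership test
# equivalent to the C check in unicodedata.c:is_unified_ideograph, treating
# the hex bounds above as inclusive.
def _in_cjk_range(code):
    return any(int(first, 16) <= code <= int(last, 16)
               for first, last in cjk_ranges)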

def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    version = ""
    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            normalizationquickcheck = record[17]
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
    print("/* a list of unique database records */", file=fp)
    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* Reindexing of NFC first characters. */", file=fp)
    print("#define TOTAL_FIRST",total_first, file=fp)
    print("#define TOTAL_LAST",total_last, file=fp)
    print("struct reindex{int start;short count,index;};", file=fp)
    print("static struct reindex nfc_first[] = {", file=fp)
    for start,end in comp_first_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)
    print("static struct reindex nfc_last[] = {", file=fp)
    for start,end in comp_last_ranges:
        print("  { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
    print("  {0,0,0}", file=fp)
    print("};\n", file=fp)

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print("/* string literals */", file=fp)
    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
    for name in CATEGORY_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
    for name in BIDIRECTIONAL_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
    for name in EASTASIANWIDTH_NAMES:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    print("static const char *decomp_prefix[] = {", file=fp)
    for name in decomp_prefix:
        print("    \"%s\"," % name, file=fp)
    print("    NULL", file=fp)
    print("};", file=fp)

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print("/* index tables for the database records */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print("/* decomposition data */", file=fp)
    Array("decomp_data", decomp_data).dump(fp, trace)

    print("/* index tables for the decomposition data */", file=fp)
    print("#define DECOMP_SHIFT", shift, file=fp)
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print("/* NFC pairs */", file=fp)
    print("#define COMP_SHIFT", shift, file=fp)
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
        for record in records:
            print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
        print("};", file=fp)
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tint index;", file=fp)
        print("\tif (n >= 0x110000) index = 0;", file=fp)
        print("\telse {", file=fp)
        print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
        print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1)), file=fp)
        print("\t}", file=fp)
        print("\treturn change_records_%s+index;" % cversion, file=fp)
        print("}\n", file=fp)
        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
        print("{", file=fp)
        print("\tswitch(n) {", file=fp)
        for k, v in normalization:
            print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
        print("\tdefault: return 0;", file=fp)
        print("\t}\n}\n", file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than
                # one character in uppercase, lowercase, or titlecase, or when
                # the casefolded version of the character differs from the
                # lowercase version.  The extra characters are stored in a
                # separate array.
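                # For example (from SpecialCasing.txt), LATIN SMALL LETTER
                # SHARP S (U+00DF) uppercases to the two-character sequence
                # "SS", so its mappings go through this branch.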
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("/* a list of unique character type descriptors */", file=fp)
    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
    for item in table:
        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
    print("};", file=fp)
    print(file=fp)

    print("/* extended case mappings */", file=fp)
    print(file=fp)
    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
    for c in extra_casing:
        print("    %d," % c, file=fp)
    print("};", file=fp)
    print(file=fp)

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print("/* type indexes */", file=fp)
    print("#define SHIFT", shift, file=fp)
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # Generate code for _PyUnicode_ToNumeric()
    numeric_items = sorted(numeric.items())
    print('/* Returns the numeric value as double for Unicode characters', file=fp)
    print(' * having this property, -1.0 otherwise.', file=fp)
    print(' */', file=fp)
    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for value, codepoints in numeric_items:
        # Turn text into float literals
        parts = value.split('/')
        parts = [repr(float(part)) for part in parts]
        value = '/'.join(parts)

        codepoints.sort()
        for codepoint in codepoints:
            print('    case 0x%04X:' % (codepoint,), file=fp)
        print('        return (double) %s;' % (value,), file=fp)
    print('    }', file=fp)
    print('    return -1.0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsWhitespace()
    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)

    for codepoint in sorted(spaces):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    # Generate code for _PyUnicode_IsLinebreak()
    print("/* Returns 1 for Unicode characters having the line break", file=fp)
    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
    print(" * type 'B', 0 otherwise.", file=fp)
    print(" */", file=fp)
    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
    print('{', file=fp)
    print('    switch (ch) {', file=fp)
    for codepoint in sorted(linebreaks):
        print('    case 0x%04X:' % (codepoint,), file=fp)
    print('        return 1;', file=fp)

    print('    }', file=fp)
    print('    return 0;', file=fp)
    print('}', file=fp)
    print(file=fp)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len(list(n for n in names if n is not None)), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
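        # For example, a word stored as "CAT" becomes "CA" + chr(ord("T")+128),
        # so a decoder can find the end of each word from the high bit alone
        # (illustrative only; the real decoding lives in Modules/unicodedata.c).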
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
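
    # Decoding sketch (illustrative; the real decoder is in
    # Modules/unicodedata.c): a phrasebook byte b < short is a direct word
    # index, otherwise the word index is ((b - short) << 8) + next_byte.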

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    fp = open(FILE, "w")
    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
    print(file=fp)
    print("#define NAME_MAXLEN", 256, file=fp)
    print(file=fp)
    print("/* lexicon */", file=fp)
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print("/* code->name phrasebook */", file=fp)
    print("#define phrasebook_shift", shift, file=fp)
    print("#define phrasebook_short", short, file=fp)

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print("/* name->code dictionary */", file=fp)
    codehash.dump(fp, trace)

    print(file=fp)
    print('static const unsigned int aliases_start = %#x;' %
          NAME_ALIASES_START, file=fp)
    print('static const unsigned int aliases_end = %#x;' %
          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)

    print('static const unsigned int name_aliases[] = {', file=fp)
    for name, codepoint in unicode.aliases:
        print('    0x%04X,' % codepoint, file=fp)
    print('};', file=fp)

    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
    # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
    # sequences or sequences with non-BMP chars are added.
    # unicodedata_lookup should be adapted too.
    print(dedent("""
        typedef struct NamedSequence {
            int seqlen;
            Py_UCS2 seq[4];
        } named_sequence;
        """), file=fp)

    print('static const unsigned int named_sequences_start = %#x;' %
          NAMED_SEQUENCES_START, file=fp)
    print('static const unsigned int named_sequences_end = %#x;' %
          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)

    print('static const named_sequence named_sequences[] = {', file=fp)
    for name, sequence in unicode.named_sequences:
        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
    print('};', file=fp)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value must not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))

def open_data(template, version):
    local = template % ('-'+version,)
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = 'http://www.unicode.org/Public/3.2-Update/' + local
        else:
            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')
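
# Usage example (illustrative): open_data(UNICODE_DATA, "9.0.0") opens
# "UnicodeData-9.0.0.txt", downloading it from unicode.org first if it is
# not already present in the current directory.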

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # Record structure:
    # [ID, name, category, combining, bidi, decomp,  (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
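    #
    # Illustrative example (assumed, abbreviated): the raw UnicodeData.txt
    # line for U+0041 looks roughly like
    #   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
    # and the ea-width, derived-props (and quickcheck) fields are appended
    # to each record by __init__ below.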

    def __init__(self, version,
                 linebreakprops=False,
                 expand=1,
                 cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        with open_data(UNICODE_DATA, version) as file:
            while 1:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                table[char] = s

        cjk_ranges_found = []

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        if s[1].startswith("<CJK Ideograph"):
                            cjk_ranges_found.append((field[0],
                                                     s[0]))
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2
            if cjk_check and cjk_ranges != cjk_ranges_found:
                raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
                    name, chars = s.split(';')
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in makeunicodename is OK
                    assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                    assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                        "the NamedSequence struct and in unicodedata_lookup")
                    self.named_sequences.append((name, chars))
                    # also store these in the PUA 15
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                char = int(s.split()[0],16)
                self.exclusions[char] = 1

        widths = [None] * 0x110000
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == '#':
                    continue
                s = s.split()[0].split(';')
                if '..' in s[0]:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split('#', 1)[0].strip()
                if not s:
                    continue

                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split('..')]
                    chars = list(range(first, last+1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition('#')[0]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    table[char][-1].add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
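        # Encoding note (illustrative): each quickcheck gets two bits, in the
        # order NFD, NFKD, NFC, NFKC; a field value of 0 means Yes, 1 Maybe,
        # 2 No (e.g. bits 4-5 hold the NFC_QC answer).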
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if '#' in s:
                    s = s[:s.index('#')]
                s = [i.strip() for i in s.split(';')]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
                quickcheck_shift = qc_order.index(s[1])*2
                quickcheck <<= quickcheck_shift
                if '..' not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split('..')]
                for char in range(first, last+1):
                    assert not (quickchecks[char]>>quickcheck_shift)&3
                    quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i][8] = value
        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split('#', 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)
        cf = self.case_folding = {}
        if version != '3.2.0':
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split('#', 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
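
# Lookup sketch (illustrative only): a name lookup built on the generated
# tables would compute h = myhash(name, code_magic), start probing at
# (~h) & (code_size - 1), and step by (h ^ (h >> 3)) & mask, mirroring the
# probe sequence in Hash.__init__ below.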

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
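
    # Worked example (illustrative values only): for t = [5, 5, 5, 5, 9, 9, 9, 9]
    # a shift of 2 could give t1 = [0, 1] and t2 = [5, 5, 5, 5, 9, 9, 9, 9],
    # and then t[6] == t2[(t1[6 >> 2] << 2) + (6 & 3)] == t2[4 + 2] == 9.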

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), \
              "bytes", file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)