#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01    benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "13.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- the first is in the BMP, the others in planes 15 and 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
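# Illustrative note (not used by the generator): these masks are OR-ed
# together per character in makeunicodetype() below; 'A', for instance,
# ends up with ALPHA_MASK, UPPER_MASK, PRINTABLE_MASK, XID_START_MASK,
# XID_CONTINUE_MASK and CASED_MASK set in its type record.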

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFC'),
    ('20000', '2A6DD'),
    ('2A700', '2B734'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
]
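# (For Unicode 13.0 these correspond to the CJK Unified Ideographs block and
# Extensions A through G.)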


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
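                # (Illustrative: the header entry packs the prefix index into
                # its low 8 bits and the decomposition length into the bits
                # above, e.g. a two-code-point canonical decomposition with
                # prefix index 0 gets the header 0 + (2 << 8) == 0x200.)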
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("  {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
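                # Packed layout (a sketch of what the code below builds): each
                # of lower/upper/title becomes an offset into
                # _PyUnicode_ExtendedCase in its low bits, with the mapping
                # length in bits 24..31; the lower slot may additionally carry
                # the length of a distinct casefold mapping in bits 20..23.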
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
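            # (Note: UcdRecord names its fields positionally, so
            # record.decomposition_mapping actually holds UnicodeData.txt
            # field 6, the decimal digit value, and record.numeric_type holds
            # field 7, the digit value.)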
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split char type index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # includes the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))
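
    # Decoding sketch (illustrative only, mirroring the encoding above): a
    # word read from the lexicon ends at the first byte with bit 7 set, and
    # masking that byte with 0x7F recovers its final character; a terminating
    # byte of exactly 128 (the encoded NUL) marks a word that ends a name.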

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

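    # Illustrative decoding of the two-byte form above: when a phrasebook
    # byte b1 is >= short, the word index is ((b1 - short) << 8) | b2, where
    # b2 is the following byte; bytes below short are word indexes directly.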
    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split the phrasebook offset table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
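        # (k below indexes the UcdRecord fields in declaration order:
        # 0=codepoint, 1=name, 2=general_category, ...; a changed field with
        # no branch of its own falls through to the Difference error below.)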
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value must not itself be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    local = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular url structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')


def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges
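
    For example, the range "0600..0605" expands to the six code points
    0x0600 through 0x0605, while a single field such as "00AD" yields just
    that one code point.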
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    for char in range(first, last+1):
        yield char


class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest


@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt.  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
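    # (Layout, as built in UnicodeData.__init__ below: two bits per property
    # in the order NFD_QC, NFKD_QC, NFC_QC, NFKC_QC, with 0 = Yes, 1 = Maybe
    # and 2 = No.)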
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    return UcdRecord(*row, None, set(), 0)


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO).
        # The parsing will incorrectly determine these as "yes";
        # however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
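# Each (size, poly) pair above combines a power-of-two table size with a
# constant that, added to the size, is used in Hash.__init__ below to perturb
# the probe increment when it overflows the table; the first size larger than
# the data set is the one chosen.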


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
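    # Illustrative example (not exercised by the generator): for
    # t = [5, 5, 5, 5, 7, 7, 7, 7] this returns t1 = [0, 0, 1, 1],
    # t2 = [5, 5, 7, 7] and shift = 1, so that e.g.
    # t[5] == t2[(t1[5 >> 1] << 1) + (5 & 1)] == 7.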

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best


if __name__ == "__main__":
    maketables(1)
