#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
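#
# Run this script with Python 3; it downloads the tables listed above from
# UCD_URL for UNICODE_VERSION and writes tables.rs and normalization_tests.rs
# to the current working directory.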
import collections
import urllib.request

UNICODE_VERSION = "13.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 of:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT

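# Illustrative only (not used by the generator): Hangul syllables are skipped
# by the table builder below and decomposed arithmetically instead.  A minimal
# sketch of the Section 3.12 algorithm, assuming the standard Jamo base values:
L_BASE, V_BASE, T_BASE = 0x1100, 0x1161, 0x11A7

def hangul_decompose_example(s):
    """E.g. 0xAC01 (HANGUL SYLLABLE GAG) -> [0x1100, 0x1161, 0x11A8]."""
    s_index = s - S_BASE
    assert 0 <= s_index < S_COUNT, "not a precomposed Hangul syllable"
    l_jamo = L_BASE + s_index // (V_COUNT * T_COUNT)
    v_jamo = V_BASE + (s_index % (V_COUNT * T_COUNT)) // T_COUNT
    t_index = s_index % T_COUNT
    return [l_jamo, v_jamo] if t_index == 0 else [l_jamo, v_jamo, T_BASE + t_index]
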
class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = pieces[1].strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

    def _load_cjk_compat_ideograph_variants(self):
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue

            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()

            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue

            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue

            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue

            char_int = self.name_to_char_int[description]

            assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

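    # Invert the canonical decompositions into a (first, second) -> composed
    # map, skipping anything with the Full_Composition_Exclusion property.
    # That property already excludes singleton decompositions, which is why
    # the remaining decompositions can be asserted to be pairs below.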
    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size.  For these precomputed tables, we note that
        1) the set of canonically decomposable characters is a subset of the
        compatibility-decomposable ones and 2) the two fully decomposed forms
        mostly agree where they overlap.  Therefore, we don't store entries
        in the compatibility table for characters that decompose the same way
        under canonical decomposition.

            Decomposition table stats:
            Canonical decomp: 2060 chars => 3085 decomposed chars
            Compatible decomp: 3662 chars => 5440 decomposed chars
            Canonical fully decomp: 2060 chars => 3404 decomposed chars
            Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code-size cost.
        """
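        # For example, U+1E9B (LATIN SMALL LETTER LONG S WITH DOT ABOVE)
        # canonically decomposes to <U+017F, U+0307>, but U+017F itself has a
        # compatibility decomposition to U+0073, so the two fully decomposed
        # forms differ and the character keeps an entry in both tables.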
        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree.  When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
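        # For example, U+0344 (COMBINING GREEK DIALYTIKA AND TONOS) decomposes
        # to <U+0308, U+0301>, both of which are non-starters, so it gets a
        # leading and a trailing count of 2.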
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

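# Emit one minimal-perfect-hash table as two Rust arrays: <NAME>_SALT holds the
# per-bucket displacement salts and <NAME>_KV holds the keys (or packed
# key/value entries) in their final slots; see minimal_perfect_hash() below.
# Note that this writes to the module-level `out` handle opened in __main__.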
def gen_mph_data(name, d, kv_type, kv_callback):
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

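# Each KV entry packs the codepoint into the upper bits and its canonical
# combining class (1..=254; zero classes are never stored) into the low 8 bits,
# so a single u32 carries both key and value.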
def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

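# Composition pairs with both characters in the BMP are packed as
# (c1 << 16) | c2 and emitted through the MPH tables; pairs where both
# characters are astral become a small generated match function.  Mixed
# BMP/astral pairs are assumed not to occur and would be silently dropped here.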
def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('composition_table', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

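# Emit the body of a quick-check function from a DerivedNormalizationProps
# table: 'N' entries become IsNormalized::No, 'M' entries become Maybe, and
# any character not listed falls through to Yes (the NFx_QC defaults).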
def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

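# The leading counts are emitted as a small generated match (the table is
# tiny); the trailing counts go through the MPH tables using the same
# `count | (codepoint << 8)` packing as the combining classes.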
def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to return a value less than n.
def my_hash(x, salt, n):
    # A multiplicative hash, chosen because multiplication is cheap.
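    # 2654435769 is 0x9E3779B9, the 32-bit golden-ratio (Fibonacci hashing)
    # constant, and 0x31415926 is just pi's digits used as a second mixing
    # constant; the final `(y * n) >> 32` step maps the 32-bit mix onto 0..n-1.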
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                exit(1)
    return (salts, keys)

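# Illustrative only (not used by the generator): how a consumer of the
# (salts, keys) pair produced above would look a key up.  The generated Rust
# tables are meant to be probed the same two-level way, though the actual Rust
# lookup helper lives in the crate rather than in this script.
def mph_lookup_example(x, salts, keys):
    n = len(salts)
    salt = salts[my_hash(x, 0, n)]  # first-level hash selects the bucket's salt
    index = my_hash(x, salt, n)     # second-level hash selects the final slot
    return index if keys[index] == x else None
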
if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)