#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since the tables should not require frequent updates, we generate them
# out-of-line with this script and check the resulting tables.rs and
# normalization_tests.rs files into git.
import collections
import sys
import urllib.request

UNICODE_VERSION = "13.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT
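
# Hangul syllables are deliberately excluded from the generated decomposition
# tables (see the S_BASE range checks below): the Rust crate decomposes them
# arithmetically instead. The sketch below illustrates that arithmetic from the
# same Section 3.12 (L_BASE/V_BASE/T_BASE are the jamo base code points); it is
# a reference only and is not used by the generator itself.
L_BASE, V_BASE, T_BASE = 0x1100, 0x1161, 0x11A7
N_COUNT = V_COUNT * T_COUNT

def hangul_decompose_reference(s):
    """Decompose a precomposed Hangul syllable into its conjoining jamo."""
    assert S_BASE <= s < S_BASE + S_COUNT
    s_index = s - S_BASE
    l_jamo = L_BASE + s_index // N_COUNT
    v_jamo = V_BASE + (s_index % N_COUNT) // T_COUNT
    t_index = s_index % T_COUNT
    if t_index == 0:
        return [l_jamo, v_jamo]
    return [l_jamo, v_jamo, T_BASE + t_index]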

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = pieces[1].strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            # Compatibility mappings are prefixed with a formatting tag such as
            # "<compat>" or "<noBreak>"; canonical mappings are a bare sequence
            # of code points.
            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)
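
    # Each data line of StandardizedVariants.txt has the form
    # "variation sequence; description; shaping environments"; the CJK entries
    # pair a base ideograph plus a variation selector with a description naming
    # the corresponding CJK COMPATIBILITY IDEOGRAPH.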
    def _load_cjk_compat_ideograph_variants(self):
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue

            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()

            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue

            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue

            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue

            char_int = self.name_to_char_int[description]

            assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase in table size. Then, for these precomputed tables, we
        note that 1) canonical decomposition is a subset of compatible
        decomposition and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatible table for
        characters that decompose the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that decomposition code is very simple and easy to inline
        at mild code size cost.
        """
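        # Illustrative example of the precomputation: U+01D5 (LATIN CAPITAL
        # LETTER U WITH DIAERESIS AND MACRON) has the canonical decomposition
        # 00DC 0304, and U+00DC in turn decomposes to 0055 0308, so the fully
        # decomposed form stored for U+01D5 is 0055 0308 0304.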
        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        # For example, U+0344 (COMBINING GREEK DIALYTIKA TONOS) fully decomposes
        # to 0308 0301, both non-starters, so it has a leading count of 2 and a
        # trailing count of 2.
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

def gen_mph_data(name, d, kv_type, kv_callback):
    # Note: writes to the module-level `out` file handle opened in __main__.
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

# For reference, with name="canonical_combining_class" and kv_type="u32",
# gen_mph_data emits Rust roughly of the form:
#
#     pub(crate) const CANONICAL_COMBINING_CLASS_SALT: &[u16] = &[ ... ];
#     pub(crate) const CANONICAL_COMBINING_CLASS_KV: &[u32] = &[ ... ];
#
# i.e. one per-bucket salt array plus one key-value array indexed by the salted
# hash (see my_hash and minimal_perfect_hash below).

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    # Note: this assumes no canonical composition ever pairs a BMP character
    # with a supplementary-plane character (true of current Unicode data);
    # such a pair would be emitted by neither table.
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Returns a value guaranteed to be less than n.
def my_hash(x, salt, n):
    # This hash is based on the theory that multiplication is efficient.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute a minimal perfect hash function; d can be either a dict or a list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                sys.exit(1)
    return (salts, keys)

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)
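
# Reference only (not used by this script): the lookup side of the _SALT/_KV
# tables emitted by gen_mph_data mirrors minimal_perfect_hash and my_hash
# above. A sketch of what a consumer of the (salts, keys) pair is expected
# to do:
def mph_lookup_reference(x, salts, keys):
    n = len(salts)
    salt = salts[my_hash(x, 0, n)]
    idx = my_hash(x, salt, n)
    # The slot holds x only if x was one of the hashed keys.
    return idx if keys[idx] == x else None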