1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Normalization table generator. 8// Data read from the web. 9// See forminfo.go for a description of the trie values associated with each rune. 10 11package main 12 13import ( 14 "bytes" 15 "flag" 16 "fmt" 17 "io" 18 "log" 19 "sort" 20 "strconv" 21 "strings" 22 23 "golang.org/x/text/internal/gen" 24 "golang.org/x/text/internal/triegen" 25 "golang.org/x/text/internal/ucd" 26) 27 28func main() { 29 gen.Init() 30 loadUnicodeData() 31 compactCCC() 32 loadCompositionExclusions() 33 completeCharFields(FCanonical) 34 completeCharFields(FCompatibility) 35 computeNonStarterCounts() 36 verifyComputed() 37 printChars() 38 testDerived() 39 printTestdata() 40 makeTables() 41} 42 43var ( 44 tablelist = flag.String("tables", 45 "all", 46 "comma-separated list of which tables to generate; "+ 47 "can be 'decomp', 'recomp', 'info' and 'all'") 48 test = flag.Bool("test", 49 false, 50 "test existing tables against DerivedNormalizationProps and generate test data for regression testing") 51 verbose = flag.Bool("verbose", 52 false, 53 "write data to stdout as it is parsed") 54) 55 56const MaxChar = 0x10FFFF // anything above this shouldn't exist 57 58// Quick Check properties of runes allow us to quickly 59// determine whether a rune may occur in a normal form. 60// For a given normal form, a rune may be guaranteed to occur 61// verbatim (QC=Yes), may or may not combine with another 62// rune (QC=Maybe), or may not occur (QC=No). 63type QCResult int 64 65const ( 66 QCUnknown QCResult = iota 67 QCYes 68 QCNo 69 QCMaybe 70) 71 72func (r QCResult) String() string { 73 switch r { 74 case QCYes: 75 return "Yes" 76 case QCNo: 77 return "No" 78 case QCMaybe: 79 return "Maybe" 80 } 81 return "***UNKNOWN***" 82} 83 84const ( 85 FCanonical = iota // NFC or NFD 86 FCompatibility // NFKC or NFKD 87 FNumberOfFormTypes 88) 89 90const ( 91 MComposed = iota // NFC or NFKC 92 MDecomposed // NFD or NFKD 93 MNumberOfModes 94) 95 96// This contains only the properties we're interested in. 97type Char struct { 98 name string 99 codePoint rune // if zero, this index is not a valid code point. 100 ccc uint8 // canonical combining class 101 origCCC uint8 102 excludeInComp bool // from CompositionExclusions.txt 103 compatDecomp bool // it has a compatibility expansion 104 105 nTrailingNonStarters uint8 106 nLeadingNonStarters uint8 // must be equal to trailing if non-zero 107 108 forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility 109 110 state State 111} 112 113var chars = make([]Char, MaxChar+1) 114var cccMap = make(map[uint8]uint8) 115 116func (c Char) String() string { 117 buf := new(bytes.Buffer) 118 119 fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) 120 fmt.Fprintf(buf, " ccc: %v\n", c.ccc) 121 fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) 122 fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) 123 fmt.Fprintf(buf, " state: %v\n", c.state) 124 fmt.Fprintf(buf, " NFC:\n") 125 fmt.Fprint(buf, c.forms[FCanonical]) 126 fmt.Fprintf(buf, " NFKC:\n") 127 fmt.Fprint(buf, c.forms[FCompatibility]) 128 129 return buf.String() 130} 131 132// In UnicodeData.txt, some ranges are marked like this: 133// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 134// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 135// parseCharacter keeps a state variable indicating the weirdness. 136type State int 137 138const ( 139 SNormal State = iota // known to be zero for the type 140 SFirst 141 SLast 142 SMissing 143) 144 145var lastChar = rune('\u0000') 146 147func (c Char) isValid() bool { 148 return c.codePoint != 0 && c.state != SMissing 149} 150 151type FormInfo struct { 152 quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed 153 verified [MNumberOfModes]bool // index: MComposed or MDecomposed 154 155 combinesForward bool // May combine with rune on the right 156 combinesBackward bool // May combine with rune on the left 157 isOneWay bool // Never appears in result 158 inDecomp bool // Some decompositions result in this char. 159 decomp Decomposition 160 expandedDecomp Decomposition 161} 162 163func (f FormInfo) String() string { 164 buf := bytes.NewBuffer(make([]byte, 0)) 165 166 fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) 167 fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) 168 fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) 169 fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) 170 fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) 171 fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) 172 fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) 173 fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) 174 175 return buf.String() 176} 177 178type Decomposition []rune 179 180func parseDecomposition(s string, skipfirst bool) (a []rune, err error) { 181 decomp := strings.Split(s, " ") 182 if len(decomp) > 0 && skipfirst { 183 decomp = decomp[1:] 184 } 185 for _, d := range decomp { 186 point, err := strconv.ParseUint(d, 16, 64) 187 if err != nil { 188 return a, err 189 } 190 a = append(a, rune(point)) 191 } 192 return a, nil 193} 194 195func loadUnicodeData() { 196 f := gen.OpenUCDFile("UnicodeData.txt") 197 defer f.Close() 198 p := ucd.New(f) 199 for p.Next() { 200 r := p.Rune(ucd.CodePoint) 201 char := &chars[r] 202 203 char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) 204 decmap := p.String(ucd.DecompMapping) 205 206 exp, err := parseDecomposition(decmap, false) 207 isCompat := false 208 if err != nil { 209 if len(decmap) > 0 { 210 exp, err = parseDecomposition(decmap, true) 211 if err != nil { 212 log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) 213 } 214 isCompat = true 215 } 216 } 217 218 char.name = p.String(ucd.Name) 219 char.codePoint = r 220 char.forms[FCompatibility].decomp = exp 221 if !isCompat { 222 char.forms[FCanonical].decomp = exp 223 } else { 224 char.compatDecomp = true 225 } 226 if len(decmap) > 0 { 227 char.forms[FCompatibility].decomp = exp 228 } 229 } 230 if err := p.Err(); err != nil { 231 log.Fatal(err) 232 } 233} 234 235// compactCCC converts the sparse set of CCC values to a continguous one, 236// reducing the number of bits needed from 8 to 6. 237func compactCCC() { 238 m := make(map[uint8]uint8) 239 for i := range chars { 240 c := &chars[i] 241 m[c.ccc] = 0 242 } 243 cccs := []int{} 244 for v, _ := range m { 245 cccs = append(cccs, int(v)) 246 } 247 sort.Ints(cccs) 248 for i, c := range cccs { 249 cccMap[uint8(i)] = uint8(c) 250 m[uint8(c)] = uint8(i) 251 } 252 for i := range chars { 253 c := &chars[i] 254 c.origCCC = c.ccc 255 c.ccc = m[c.ccc] 256 } 257 if len(m) >= 1<<6 { 258 log.Fatalf("too many difference CCC values: %d >= 64", len(m)) 259 } 260} 261 262// CompositionExclusions.txt has form: 263// 0958 # ... 264// See http://unicode.org/reports/tr44/ for full explanation 265func loadCompositionExclusions() { 266 f := gen.OpenUCDFile("CompositionExclusions.txt") 267 defer f.Close() 268 p := ucd.New(f) 269 for p.Next() { 270 c := &chars[p.Rune(0)] 271 if c.excludeInComp { 272 log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) 273 } 274 c.excludeInComp = true 275 } 276 if e := p.Err(); e != nil { 277 log.Fatal(e) 278 } 279} 280 281// hasCompatDecomp returns true if any of the recursive 282// decompositions contains a compatibility expansion. 283// In this case, the character may not occur in NFK*. 284func hasCompatDecomp(r rune) bool { 285 c := &chars[r] 286 if c.compatDecomp { 287 return true 288 } 289 for _, d := range c.forms[FCompatibility].decomp { 290 if hasCompatDecomp(d) { 291 return true 292 } 293 } 294 return false 295} 296 297// Hangul related constants. 298const ( 299 HangulBase = 0xAC00 300 HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28) 301 302 JamoLBase = 0x1100 303 JamoLEnd = 0x1113 304 JamoVBase = 0x1161 305 JamoVEnd = 0x1176 306 JamoTBase = 0x11A8 307 JamoTEnd = 0x11C3 308 309 JamoLVTCount = 19 * 21 * 28 310 JamoTCount = 28 311) 312 313func isHangul(r rune) bool { 314 return HangulBase <= r && r < HangulEnd 315} 316 317func isHangulWithoutJamoT(r rune) bool { 318 if !isHangul(r) { 319 return false 320 } 321 r -= HangulBase 322 return r < JamoLVTCount && r%JamoTCount == 0 323} 324 325func ccc(r rune) uint8 { 326 return chars[r].ccc 327} 328 329// Insert a rune in a buffer, ordered by Canonical Combining Class. 330func insertOrdered(b Decomposition, r rune) Decomposition { 331 n := len(b) 332 b = append(b, 0) 333 cc := ccc(r) 334 if cc > 0 { 335 // Use bubble sort. 336 for ; n > 0; n-- { 337 if ccc(b[n-1]) <= cc { 338 break 339 } 340 b[n] = b[n-1] 341 } 342 } 343 b[n] = r 344 return b 345} 346 347// Recursively decompose. 348func decomposeRecursive(form int, r rune, d Decomposition) Decomposition { 349 dcomp := chars[r].forms[form].decomp 350 if len(dcomp) == 0 { 351 return insertOrdered(d, r) 352 } 353 for _, c := range dcomp { 354 d = decomposeRecursive(form, c, d) 355 } 356 return d 357} 358 359func completeCharFields(form int) { 360 // Phase 0: pre-expand decomposition. 361 for i := range chars { 362 f := &chars[i].forms[form] 363 if len(f.decomp) == 0 { 364 continue 365 } 366 exp := make(Decomposition, 0) 367 for _, c := range f.decomp { 368 exp = decomposeRecursive(form, c, exp) 369 } 370 f.expandedDecomp = exp 371 } 372 373 // Phase 1: composition exclusion, mark decomposition. 374 for i := range chars { 375 c := &chars[i] 376 f := &c.forms[form] 377 378 // Marks script-specific exclusions and version restricted. 379 f.isOneWay = c.excludeInComp 380 381 // Singletons 382 f.isOneWay = f.isOneWay || len(f.decomp) == 1 383 384 // Non-starter decompositions 385 if len(f.decomp) > 1 { 386 chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0 387 f.isOneWay = f.isOneWay || chk 388 } 389 390 // Runes that decompose into more than two runes. 391 f.isOneWay = f.isOneWay || len(f.decomp) > 2 392 393 if form == FCompatibility { 394 f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint) 395 } 396 397 for _, r := range f.decomp { 398 chars[r].forms[form].inDecomp = true 399 } 400 } 401 402 // Phase 2: forward and backward combining. 403 for i := range chars { 404 c := &chars[i] 405 f := &c.forms[form] 406 407 if !f.isOneWay && len(f.decomp) == 2 { 408 f0 := &chars[f.decomp[0]].forms[form] 409 f1 := &chars[f.decomp[1]].forms[form] 410 if !f0.isOneWay { 411 f0.combinesForward = true 412 } 413 if !f1.isOneWay { 414 f1.combinesBackward = true 415 } 416 } 417 if isHangulWithoutJamoT(rune(i)) { 418 f.combinesForward = true 419 } 420 } 421 422 // Phase 3: quick check values. 423 for i := range chars { 424 c := &chars[i] 425 f := &c.forms[form] 426 427 switch { 428 case len(f.decomp) > 0: 429 f.quickCheck[MDecomposed] = QCNo 430 case isHangul(rune(i)): 431 f.quickCheck[MDecomposed] = QCNo 432 default: 433 f.quickCheck[MDecomposed] = QCYes 434 } 435 switch { 436 case f.isOneWay: 437 f.quickCheck[MComposed] = QCNo 438 case (i & 0xffff00) == JamoLBase: 439 f.quickCheck[MComposed] = QCYes 440 if JamoLBase <= i && i < JamoLEnd { 441 f.combinesForward = true 442 } 443 if JamoVBase <= i && i < JamoVEnd { 444 f.quickCheck[MComposed] = QCMaybe 445 f.combinesBackward = true 446 f.combinesForward = true 447 } 448 if JamoTBase <= i && i < JamoTEnd { 449 f.quickCheck[MComposed] = QCMaybe 450 f.combinesBackward = true 451 } 452 case !f.combinesBackward: 453 f.quickCheck[MComposed] = QCYes 454 default: 455 f.quickCheck[MComposed] = QCMaybe 456 } 457 } 458} 459 460func computeNonStarterCounts() { 461 // Phase 4: leading and trailing non-starter count 462 for i := range chars { 463 c := &chars[i] 464 465 runes := []rune{rune(i)} 466 // We always use FCompatibility so that the CGJ insertion points do not 467 // change for repeated normalizations with different forms. 468 if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { 469 runes = exp 470 } 471 // We consider runes that combine backwards to be non-starters for the 472 // purpose of Stream-Safe Text Processing. 473 for _, r := range runes { 474 if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { 475 break 476 } 477 c.nLeadingNonStarters++ 478 } 479 for i := len(runes) - 1; i >= 0; i-- { 480 if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { 481 break 482 } 483 c.nTrailingNonStarters++ 484 } 485 if c.nTrailingNonStarters > 3 { 486 log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) 487 } 488 489 if isHangul(rune(i)) { 490 c.nTrailingNonStarters = 2 491 if isHangulWithoutJamoT(rune(i)) { 492 c.nTrailingNonStarters = 1 493 } 494 } 495 496 if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { 497 log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) 498 } 499 if t := c.nTrailingNonStarters; t > 3 { 500 log.Fatalf("%U: number of trailing non-starters is %d > 3", t) 501 } 502 } 503} 504 505func printBytes(w io.Writer, b []byte, name string) { 506 fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b)) 507 fmt.Fprintf(w, "var %s = [...]byte {", name) 508 for i, c := range b { 509 switch { 510 case i%64 == 0: 511 fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63) 512 case i%8 == 0: 513 fmt.Fprintf(w, "\n") 514 } 515 fmt.Fprintf(w, "0x%.2X, ", c) 516 } 517 fmt.Fprint(w, "\n}\n\n") 518} 519 520// See forminfo.go for format. 521func makeEntry(f *FormInfo, c *Char) uint16 { 522 e := uint16(0) 523 if r := c.codePoint; HangulBase <= r && r < HangulEnd { 524 e |= 0x40 525 } 526 if f.combinesForward { 527 e |= 0x20 528 } 529 if f.quickCheck[MDecomposed] == QCNo { 530 e |= 0x4 531 } 532 switch f.quickCheck[MComposed] { 533 case QCYes: 534 case QCNo: 535 e |= 0x10 536 case QCMaybe: 537 e |= 0x18 538 default: 539 log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed]) 540 } 541 e |= uint16(c.nTrailingNonStarters) 542 return e 543} 544 545// decompSet keeps track of unique decompositions, grouped by whether 546// the decomposition is followed by a trailing and/or leading CCC. 547type decompSet [7]map[string]bool 548 549const ( 550 normalDecomp = iota 551 firstMulti 552 firstCCC 553 endMulti 554 firstLeadingCCC 555 firstCCCZeroExcept 556 firstStarterWithNLead 557 lastDecomp 558) 559 560var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"} 561 562func makeDecompSet() decompSet { 563 m := decompSet{} 564 for i := range m { 565 m[i] = make(map[string]bool) 566 } 567 return m 568} 569func (m *decompSet) insert(key int, s string) { 570 m[key][s] = true 571} 572 573func printCharInfoTables(w io.Writer) int { 574 mkstr := func(r rune, f *FormInfo) (int, string) { 575 d := f.expandedDecomp 576 s := string([]rune(d)) 577 if max := 1 << 6; len(s) >= max { 578 const msg = "%U: too many bytes in decomposition: %d >= %d" 579 log.Fatalf(msg, r, len(s), max) 580 } 581 head := uint8(len(s)) 582 if f.quickCheck[MComposed] != QCYes { 583 head |= 0x40 584 } 585 if f.combinesForward { 586 head |= 0x80 587 } 588 s = string([]byte{head}) + s 589 590 lccc := ccc(d[0]) 591 tccc := ccc(d[len(d)-1]) 592 cc := ccc(r) 593 if cc != 0 && lccc == 0 && tccc == 0 { 594 log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc) 595 } 596 if tccc < lccc && lccc != 0 { 597 const msg = "%U: lccc (%d) must be <= tcc (%d)" 598 log.Fatalf(msg, r, lccc, tccc) 599 } 600 index := normalDecomp 601 nTrail := chars[r].nTrailingNonStarters 602 nLead := chars[r].nLeadingNonStarters 603 if tccc > 0 || lccc > 0 || nTrail > 0 { 604 tccc <<= 2 605 tccc |= nTrail 606 s += string([]byte{tccc}) 607 index = endMulti 608 for _, r := range d[1:] { 609 if ccc(r) == 0 { 610 index = firstCCC 611 } 612 } 613 if lccc > 0 || nLead > 0 { 614 s += string([]byte{lccc}) 615 if index == firstCCC { 616 log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r) 617 } 618 index = firstLeadingCCC 619 } 620 if cc != lccc { 621 if cc != 0 { 622 log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc) 623 } 624 index = firstCCCZeroExcept 625 } 626 } else if len(d) > 1 { 627 index = firstMulti 628 } 629 return index, s 630 } 631 632 decompSet := makeDecompSet() 633 const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail. 634 decompSet.insert(firstStarterWithNLead, nLeadStr) 635 636 // Store the uniqued decompositions in a byte buffer, 637 // preceded by their byte length. 638 for _, c := range chars { 639 for _, f := range c.forms { 640 if len(f.expandedDecomp) == 0 { 641 continue 642 } 643 if f.combinesBackward { 644 log.Fatalf("%U: combinesBackward and decompose", c.codePoint) 645 } 646 index, s := mkstr(c.codePoint, &f) 647 decompSet.insert(index, s) 648 } 649 } 650 651 decompositions := bytes.NewBuffer(make([]byte, 0, 10000)) 652 size := 0 653 positionMap := make(map[string]uint16) 654 decompositions.WriteString("\000") 655 fmt.Fprintln(w, "const (") 656 for i, m := range decompSet { 657 sa := []string{} 658 for s := range m { 659 sa = append(sa, s) 660 } 661 sort.Strings(sa) 662 for _, s := range sa { 663 p := decompositions.Len() 664 decompositions.WriteString(s) 665 positionMap[s] = uint16(p) 666 } 667 if cname[i] != "" { 668 fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len()) 669 } 670 } 671 fmt.Fprintln(w, "maxDecomp = 0x8000") 672 fmt.Fprintln(w, ")") 673 b := decompositions.Bytes() 674 printBytes(w, b, "decomps") 675 size += len(b) 676 677 varnames := []string{"nfc", "nfkc"} 678 for i := 0; i < FNumberOfFormTypes; i++ { 679 trie := triegen.NewTrie(varnames[i]) 680 681 for r, c := range chars { 682 f := c.forms[i] 683 d := f.expandedDecomp 684 if len(d) != 0 { 685 _, key := mkstr(c.codePoint, &f) 686 trie.Insert(rune(r), uint64(positionMap[key])) 687 if c.ccc != ccc(d[0]) { 688 // We assume the lead ccc of a decomposition !=0 in this case. 689 if ccc(d[0]) == 0 { 690 log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc) 691 } 692 } 693 } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward { 694 // Handle cases where it can't be detected that the nLead should be equal 695 // to nTrail. 696 trie.Insert(c.codePoint, uint64(positionMap[nLeadStr])) 697 } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 { 698 trie.Insert(c.codePoint, uint64(0x8000|v)) 699 } 700 } 701 sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]})) 702 if err != nil { 703 log.Fatal(err) 704 } 705 size += sz 706 } 707 return size 708} 709 710func contains(sa []string, s string) bool { 711 for _, a := range sa { 712 if a == s { 713 return true 714 } 715 } 716 return false 717} 718 719func makeTables() { 720 w := &bytes.Buffer{} 721 722 size := 0 723 if *tablelist == "" { 724 return 725 } 726 list := strings.Split(*tablelist, ",") 727 if *tablelist == "all" { 728 list = []string{"recomp", "info"} 729 } 730 731 // Compute maximum decomposition size. 732 max := 0 733 for _, c := range chars { 734 if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { 735 max = n 736 } 737 } 738 739 fmt.Fprintln(w, "const (") 740 fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") 741 fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) 742 fmt.Fprintln(w) 743 fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") 744 fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") 745 fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") 746 fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") 747 fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) 748 fmt.Fprintln(w, ")\n") 749 750 // Print the CCC remap table. 751 size += len(cccMap) 752 fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) 753 for i := 0; i < len(cccMap); i++ { 754 if i%8 == 0 { 755 fmt.Fprintln(w) 756 } 757 fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) 758 } 759 fmt.Fprintln(w, "\n}\n") 760 761 if contains(list, "info") { 762 size += printCharInfoTables(w) 763 } 764 765 if contains(list, "recomp") { 766 // Note that we use 32 bit keys, instead of 64 bit. 767 // This clips the bits of three entries, but we know 768 // this won't cause a collision. The compiler will catch 769 // any changes made to UnicodeData.txt that introduces 770 // a collision. 771 // Note that the recomposition map for NFC and NFKC 772 // are identical. 773 774 // Recomposition map 775 nrentries := 0 776 for _, c := range chars { 777 f := c.forms[FCanonical] 778 if !f.isOneWay && len(f.decomp) > 0 { 779 nrentries++ 780 } 781 } 782 sz := nrentries * 8 783 size += sz 784 fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) 785 fmt.Fprintln(w, "var recompMap = map[uint32]rune{") 786 for i, c := range chars { 787 f := c.forms[FCanonical] 788 d := f.decomp 789 if !f.isOneWay && len(d) > 0 { 790 key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) 791 fmt.Fprintf(w, "0x%.8X: 0x%.4X,\n", key, i) 792 } 793 } 794 fmt.Fprintf(w, "}\n\n") 795 } 796 797 fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) 798 gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes()) 799} 800 801func printChars() { 802 if *verbose { 803 for _, c := range chars { 804 if !c.isValid() || c.state == SMissing { 805 continue 806 } 807 fmt.Println(c) 808 } 809 } 810} 811 812// verifyComputed does various consistency tests. 813func verifyComputed() { 814 for i, c := range chars { 815 for _, f := range c.forms { 816 isNo := (f.quickCheck[MDecomposed] == QCNo) 817 if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) { 818 log.Fatalf("%U: NF*D QC must be No if rune decomposes", i) 819 } 820 821 isMaybe := f.quickCheck[MComposed] == QCMaybe 822 if f.combinesBackward != isMaybe { 823 log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i) 824 } 825 if len(f.decomp) > 0 && f.combinesForward && isMaybe { 826 log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i) 827 } 828 829 if len(f.expandedDecomp) != 0 { 830 continue 831 } 832 if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b { 833 // We accept these runes to be treated differently (it only affects 834 // segment breaking in iteration, most likely on improper use), but 835 // reconsider if more characters are added. 836 // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;; 837 // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;; 838 // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;; 839 // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;; 840 // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;; 841 // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;; 842 if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) { 843 log.Fatalf("%U: nLead was %v; want %v", i, a, b) 844 } 845 } 846 } 847 nfc := c.forms[FCanonical] 848 nfkc := c.forms[FCompatibility] 849 if nfc.combinesBackward != nfkc.combinesBackward { 850 log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint) 851 } 852 } 853} 854 855// Use values in DerivedNormalizationProps.txt to compare against the 856// values we computed. 857// DerivedNormalizationProps.txt has form: 858// 00C0..00C5 ; NFD_QC; N # ... 859// 0374 ; NFD_QC; N # ... 860// See http://unicode.org/reports/tr44/ for full explanation 861func testDerived() { 862 f := gen.OpenUCDFile("DerivedNormalizationProps.txt") 863 defer f.Close() 864 p := ucd.New(f) 865 for p.Next() { 866 r := p.Rune(0) 867 c := &chars[r] 868 869 var ftype, mode int 870 qt := p.String(1) 871 switch qt { 872 case "NFC_QC": 873 ftype, mode = FCanonical, MComposed 874 case "NFD_QC": 875 ftype, mode = FCanonical, MDecomposed 876 case "NFKC_QC": 877 ftype, mode = FCompatibility, MComposed 878 case "NFKD_QC": 879 ftype, mode = FCompatibility, MDecomposed 880 default: 881 continue 882 } 883 var qr QCResult 884 switch p.String(2) { 885 case "Y": 886 qr = QCYes 887 case "N": 888 qr = QCNo 889 case "M": 890 qr = QCMaybe 891 default: 892 log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) 893 } 894 if got := c.forms[ftype].quickCheck[mode]; got != qr { 895 log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) 896 } 897 c.forms[ftype].verified[mode] = true 898 } 899 if err := p.Err(); err != nil { 900 log.Fatal(err) 901 } 902 // Any unspecified value must be QCYes. Verify this. 903 for i, c := range chars { 904 for j, fd := range c.forms { 905 for k, qr := range fd.quickCheck { 906 if !fd.verified[k] && qr != QCYes { 907 m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" 908 log.Printf(m, i, j, k, qr, c.name) 909 } 910 } 911 } 912 } 913} 914 915var testHeader = `const ( 916 Yes = iota 917 No 918 Maybe 919) 920 921type formData struct { 922 qc uint8 923 combinesForward bool 924 decomposition string 925} 926 927type runeData struct { 928 r rune 929 ccc uint8 930 nLead uint8 931 nTrail uint8 932 f [2]formData // 0: canonical; 1: compatibility 933} 934 935func f(qc uint8, cf bool, dec string) [2]formData { 936 return [2]formData{{qc, cf, dec}, {qc, cf, dec}} 937} 938 939func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData { 940 return [2]formData{{qc, cf, d}, {qck, cfk, dk}} 941} 942 943var testData = []runeData{ 944` 945 946func printTestdata() { 947 type lastInfo struct { 948 ccc uint8 949 nLead uint8 950 nTrail uint8 951 f string 952 } 953 954 last := lastInfo{} 955 w := &bytes.Buffer{} 956 fmt.Fprintf(w, testHeader) 957 for r, c := range chars { 958 f := c.forms[FCanonical] 959 qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) 960 f = c.forms[FCompatibility] 961 qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) 962 s := "" 963 if d == dk && qc == qck && cf == cfk { 964 s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) 965 } else { 966 s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) 967 } 968 current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} 969 if last != current { 970 fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) 971 last = current 972 } 973 } 974 fmt.Fprintln(w, "}") 975 gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes()) 976} 977