1#!/usr/bin/env python 2""" 3Unicode case folding database conversion utility 4 5Parses the database and generates a C++ function which implements the case 6folding algorithm. The database entries are of the form: 7 8 <code>; <status>; <mapping>; # <name> 9 10<status> can be one of four characters: 11 C - Common mappings 12 S - mappings for Simple case folding 13 F - mappings for Full case folding 14 T - special case for Turkish I characters 15 16Right now this generates a function which implements simple case folding (C+S 17entries). 18""" 19 20import sys 21import re 22import urllib2 23 24# This variable will body of the mappings function 25body = "" 26 27# Reads file line-by-line, extracts Common and Simple case fold mappings and 28# returns a (from_char, to_char, from_name) tuple. 29def mappings(f): 30 previous_from = -1 31 expr = re.compile(r'^(.*); [CS]; (.*); # (.*)') 32 for line in f: 33 m = expr.match(line) 34 if not m: continue 35 from_char = int(m.group(1), 16) 36 to_char = int(m.group(2), 16) 37 from_name = m.group(3) 38 39 if from_char <= previous_from: 40 raise Exception("Duplicate or unsorted characters in input") 41 yield from_char, to_char, from_name 42 previous_from = from_char 43 44# Computes the shift (to_char - from_char) in a mapping. 45def shift(mapping): 46 return mapping[1] - mapping[0] 47 48# Computes the stride (from_char2 - from_char1) of two mappings. 49def stride2(mapping1, mapping2): 50 return mapping2[0] - mapping1[0] 51 52# Computes the stride of a list of mappings. The list should have at least two 53# mappings. All mappings in the list are assumed to have the same stride. 54def stride(block): 55 return stride2(block[0], block[1]) 56 57 58# b is a list of mappings. All the mappings are assumed to have the same 59# shift and the stride between adjecant mappings (if any) is constant. 60def dump_block(b): 61 global body 62 63 if len(b) == 1: 64 # Special case for handling blocks of length 1. We don't even need to 65 # emit the "if (C < X) return C" check below as all characters in this 66 # range will be caught by the "C < X" check emitted by the first 67 # non-trivial block. 68 body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0]) 69 return 70 71 first = b[0][0] 72 last = first + stride(b) * (len(b)-1) 73 modulo = first % stride(b) 74 75 # All characters before this block map to themselves. 76 body += " if (C < {0:#06x})\n return C;\n".format(first) 77 body += " // {0} characters\n".format(len(b)) 78 79 # Generic pattern: check upper bound (lower bound is checked by the "if" 80 # above) and modulo of C, return C+shift. 81 pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n" 82 83 if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0: 84 # Special case: 85 # We can elide the modulo-check because the expression "C|1" will map 86 # the intervening characters to themselves. 87 pattern = " if (C <= {0:#06x})\n return C | 1;\n" 88 elif stride(b) == 1: 89 # Another special case: X % 1 is always zero, so don't emit the 90 # modulo-check. 91 pattern = " if (C <= {0:#06x})\n return C + {3};\n" 92 93 body += pattern.format(last, stride(b), modulo, shift(b[0])) 94 95current_block = [] 96f = urllib2.urlopen(sys.argv[1]) 97for m in mappings(f): 98 if len(current_block) == 0: 99 current_block.append(m) 100 continue 101 102 if shift(current_block[0]) != shift(m): 103 # Incompatible shift, start a new block. 104 dump_block(current_block) 105 current_block = [m] 106 continue 107 108 if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m): 109 current_block.append(m) 110 continue 111 112 # Incompatible stride, start a new block. 113 dump_block(current_block) 114 current_block = [m] 115f.close() 116 117dump_block(current_block) 118 119print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//' 120print '//' 121print '// This file was generated by utils/unicode-case-fold.py from the Unicode' 122print '// case folding database at' 123print '// ', sys.argv[1] 124print '//' 125print '// To regenerate this file, run:' 126print '// utils/unicode-case-fold.py \\' 127print '// "{}" \\'.format(sys.argv[1]) 128print '// > lib/Support/UnicodeCaseFold.cpp' 129print '//' 130print '//===----------------------------------------------------------------------===//' 131print '' 132print '#include "llvm/Support/Unicode.h"' 133print '' 134print "int llvm::sys::unicode::foldCharSimple(int C) {" 135print body 136print " return C;" 137print "}" 138