1#!/usr/bin/env python 2 3# Copyright (C) 2015 The Android Open Source Project 4# 5# Licensed under the Apache License, Version 2.0 (the 'License'); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an 'AS IS' BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17""" 18Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) 19into binary format. See doc/hyb_file_format.md for more information. 20 21Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb 22 23Optional -v parameter turns on verbose debugging. 24 25""" 26 27from __future__ import print_function 28 29import io 30import sys 31import struct 32import math 33import getopt 34 35 36VERBOSE = False 37 38# U+00DF is LATIN SMALL LETTER SHARP S 39# U+1E9E is LATIN CAPITAL LETTER SHARP S 40SHARP_S_TO_DOUBLE = u'\u00dfSS' 41SHARP_S_TO_CAPITAL = u'\u00df\u1e9e' 42 43if sys.version_info[0] >= 3: 44 def unichr(x): 45 return chr(x) 46 47 48# number of bits required to represent numbers up to n inclusive 49def num_bits(n): 50 return 1 + int(math.log(n, 2)) if n > 0 else 0 51 52 53class Node: 54 55 def __init__(self): 56 self.succ = {} 57 self.res = None 58 self.fsm_pat = None 59 self.fail = None 60 61 62# List of free slots, implemented as doubly linked list 63class Freelist: 64 65 def __init__(self): 66 self.first = None 67 self.last = None 68 self.pred = [] 69 self.succ = [] 70 71 def grow(self): 72 this = len(self.pred) 73 self.pred.append(self.last) 74 self.succ.append(None) 75 if self.last is None: 76 self.first = this 77 else: 78 self.succ[self.last] = this 79 self.last = this 80 81 def next(self, cursor): 82 if cursor == 0: 83 cursor = self.first 84 if cursor is None: 85 self.grow() 86 result = self.last 87 else: 88 result = cursor 89 return result, self.succ[result] 90 91 def is_free(self, ix): 92 while ix >= len(self.pred): 93 self.grow() 94 return self.pred[ix] != -1 95 96 def use(self, ix): 97 if self.pred[ix] is None: 98 self.first = self.succ[ix] 99 else: 100 self.succ[self.pred[ix]] = self.succ[ix] 101 if self.succ[ix] is None: 102 self.last = self.pred[ix] 103 else: 104 self.pred[self.succ[ix]] = self.pred[ix] 105 if self.pred[ix] == -1: 106 assert self.pred[ix] != -1, 'double free!' 107 self.pred[ix] = -1 108 109 110def combine(a, b): 111 if a is None: return b 112 if b is None: return a 113 if len(b) < len(a): a, b = b, a 114 res = b[:len(b) - len(a)] 115 for i in range(len(a)): 116 res.append(max(a[i], b[i + len(b) - len(a)])) 117 return res 118 119 120def trim(pattern): 121 for ix in range(len(pattern)): 122 if pattern[ix] != 0: 123 return pattern[ix:] 124 125 126def pat_to_binary(pattern): 127 return b''.join(struct.pack('B', x) for x in pattern) 128 129 130class Hyph: 131 132 def __init__(self): 133 self.root = Node() 134 self.root.str = '<root>' 135 self.node_list = [self.root] 136 137 # Add a pattern (word fragment with numeric codes, such as ".ad4der") 138 def add_pat(self, pat): 139 lastWasLetter = False 140 haveSeenNumber = False 141 result = [] 142 word = '' 143 for c in pat: 144 if c.isdigit(): 145 result.append(int(c)) 146 lastWasLetter = False 147 haveSeenNumber = True 148 else: 149 word += c 150 if lastWasLetter and haveSeenNumber: 151 result.append(0) 152 lastWasLetter = True 153 if lastWasLetter: 154 result.append(0) 155 156 self.add_word_res(word, result) 157 158 # Add an exception (word with hyphens, such as "ta-ble") 159 def add_exception(self, hyph_word): 160 res = [] 161 word = ['.'] 162 need_10 = False 163 for c in hyph_word: 164 if c == '-': 165 res.append(11) 166 need_10 = False 167 else: 168 if need_10: 169 res.append(10) 170 word.append(c) 171 need_10 = True 172 word.append('.') 173 res.append(0) 174 res.append(0) 175 if VERBOSE: 176 print(word, res) 177 self.add_word_res(''.join(word), res) 178 179 def add_word_res(self, word, result): 180 if VERBOSE: 181 print(word, result) 182 183 t = self.root 184 s = '' 185 for c in word: 186 s += c 187 if c not in t.succ: 188 new_node = Node() 189 new_node.str = s 190 self.node_list.append(new_node) 191 t.succ[c] = new_node 192 t = t.succ[c] 193 t.res = result 194 195 def pack(self, node_list, ch_map, use_node=False): 196 size = 0 197 self.node_map = {} 198 nodes = Freelist() 199 edges = Freelist() 200 edge_start = 1 if use_node else 0 201 for node in node_list: 202 succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) 203 if len(succ): 204 cursor = 0 205 while True: 206 edge_ix, cursor = edges.next(cursor) 207 ix = edge_ix - succ[0] 208 if (ix >= 0 and nodes.is_free(ix) and 209 all(edges.is_free(ix + s) for s in succ) and 210 ((not use_node) or edges.is_free(ix))): 211 break 212 elif use_node: 213 ix, _ = edges.next(0) 214 nodes.is_free(ix) # actually don't need nodes at all when use_node, 215 # but keep it happy 216 else: 217 ix, _ = nodes.next(0) 218 node.ix = ix 219 self.node_map[ix] = node 220 nodes.use(ix) 221 size = max(size, ix) 222 if use_node: 223 edges.use(ix) 224 for s in succ: 225 edges.use(ix + s) 226 size += max(ch_map.values()) + 1 227 return size 228 229 # return list of nodes in bfs order 230 def bfs(self, ch_map): 231 result = [self.root] 232 ix = 0 233 while ix < len(result): 234 node = result[ix] 235 node.bfs_ix = ix 236 mapped = {} 237 for c, next in node.succ.items(): 238 assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) 239 mapped[ch_map[c]] = next 240 for i in sorted(mapped.keys()): 241 result.append(mapped[i]) 242 ix += 1 243 self.bfs_order = result 244 return result 245 246 # suffix compression - convert the trie into an acyclic digraph, merging nodes when 247 # the subtries are identical 248 def dedup(self): 249 uniques = [] 250 dupmap = {} 251 dedup_ix = [0] * len(self.bfs_order) 252 for ix in reversed(range(len(self.bfs_order))): 253 # construct string representation of node 254 node = self.bfs_order[ix] 255 if node.res is None: 256 s = '' 257 else: 258 s = ''.join(str(c) for c in node.res) 259 for c in sorted(node.succ.keys()): 260 succ = node.succ[c] 261 s += ' ' + c + str(dedup_ix[succ.bfs_ix]) 262 if s in dupmap: 263 dedup_ix[ix] = dupmap[s] 264 else: 265 uniques.append(node) 266 dedup_ix[ix] = ix 267 dupmap[s] = dedup_ix[ix] 268 uniques.reverse() 269 print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') 270 return dedup_ix, uniques 271 272 273# load the ".pat" file, which contains patterns such as a1b2c3 274def load(fn): 275 hyph = Hyph() 276 with io.open(fn, encoding='UTF-8') as f: 277 for l in f: 278 pat = l.strip() 279 hyph.add_pat(pat) 280 return hyph 281 282 283# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc. 284def load_chr(fn): 285 ch_map = {'.': 0} 286 with io.open(fn, encoding='UTF-8') as f: 287 for i, l in enumerate(f): 288 l = l.strip() 289 if len(l) > 2: 290 if l == SHARP_S_TO_DOUBLE: 291 # replace with lowercasing from capital letter sharp s 292 l = SHARP_S_TO_CAPITAL 293 else: 294 # lowercase maps to multi-character uppercase sequence, ignore uppercase for now 295 l = l[:1] 296 else: 297 assert len(l) == 2, 'expected 2 chars in chr' 298 for c in l: 299 ch_map[c] = i + 1 300 return ch_map 301 302 303# load exceptions with explicit hyphens 304def load_hyp(hyph, fn): 305 with io.open(fn, encoding='UTF-8') as f: 306 for l in f: 307 hyph.add_exception(l.strip()) 308 309 310def generate_header(alphabet, trie, pattern): 311 alphabet_off = 6 * 4 312 trie_off = alphabet_off + len(alphabet) 313 pattern_off = trie_off + len(trie) 314 file_size = pattern_off + len(pattern) 315 data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] 316 return struct.pack('<6I', *data) 317 318 319def generate_alphabet(ch_map): 320 ch_map = ch_map.copy() 321 del ch_map['.'] 322 min_ch = ord(min(ch_map)) 323 max_ch = ord(max(ch_map)) 324 if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: 325 # generate format 0 326 data = [0] * (max_ch - min_ch + 1) 327 for c, val in ch_map.items(): 328 data[ord(c) - min_ch] = val 329 result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] 330 for b in data: 331 result.append(struct.pack('<B', b)) 332 else: 333 # generate format 1 334 assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded' 335 result = [struct.pack('<2I', 1, len(ch_map))] 336 for c, val in sorted(ch_map.items()): 337 data = (ord(c) << 11) | val 338 result.append(struct.pack('<I', data)) 339 binary = b''.join(result) 340 if len(binary) % 4 != 0: 341 binary += b'\x00' * (4 - len(binary) % 4) 342 return binary 343 344 345# assumes hyph structure has been packed, ie node.ix values have been set 346def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap): 347 ch_array = [0] * n_trie 348 link_array = [0] * n_trie 349 pat_array = [0] * n_trie 350 link_shift = num_bits(max(ch_map.values())) 351 char_mask = (1 << link_shift) - 1 352 pattern_shift = link_shift + num_bits(n_trie - 1) 353 link_mask = (1 << pattern_shift) - (1 << link_shift) 354 result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)] 355 356 for node in dedup_nodes: 357 ix = node.ix 358 if node.res is not None: 359 pat_array[ix] = patmap[pat_to_binary(node.res)] 360 for c, next in node.succ.items(): 361 c_num = ch_map[c] 362 link_ix = ix + c_num 363 ch_array[link_ix] = c_num 364 if dedup_ix is None: 365 dedup_next = next 366 else: 367 dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]] 368 link_array[link_ix] = dedup_next.ix 369 370 for i in range(n_trie): 371 #print((pat_array[i], link_array[i], ch_array[i])) 372 packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i] 373 result.append(struct.pack('<I', packed)) 374 return b''.join(result) 375 376 377def generate_pattern(pats): 378 pat_array = [0] 379 patmap = {b'': 0} 380 381 raw_pat_array = [] 382 raw_pat_size = 0 383 raw_patmap = {} 384 385 for pat in pats: 386 if pat is None: 387 continue 388 pat_str = pat_to_binary(pat) 389 if pat_str not in patmap: 390 shift = 0 391 while shift < len(pat) and pat[len(pat) - shift - 1] == 0: 392 shift += 1 393 rawpat = pat_str[:len(pat) - shift] 394 if rawpat not in raw_patmap: 395 raw_patmap[rawpat] = raw_pat_size 396 raw_pat_array.append(rawpat) 397 raw_pat_size += len(rawpat) 398 data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat] 399 patmap[pat_str] = len(pat_array) 400 pat_array.append(data) 401 data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size] 402 result = [struct.pack('<4I', *data)] 403 for x in pat_array: 404 result.append(struct.pack('<I', x)) 405 result.extend(raw_pat_array) 406 return patmap, b''.join(result) 407 408 409def generate_hyb_file(hyph, ch_map, hyb_fn): 410 bfs = hyph.bfs(ch_map) 411 dedup_ix, dedup_nodes = hyph.dedup() 412 n_trie = hyph.pack(dedup_nodes, ch_map) 413 alphabet = generate_alphabet(ch_map) 414 patmap, pattern = generate_pattern([n.res for n in hyph.node_list]) 415 trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap) 416 header = generate_header(alphabet, trie, pattern) 417 418 with open(hyb_fn, 'wb') as f: 419 f.write(header) 420 f.write(alphabet) 421 f.write(trie) 422 f.write(pattern) 423 424 425# Verify that the file contains the same lines as the lines argument, in arbitrary order 426def verify_file_sorted(lines, fn): 427 file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')] 428 line_set = set(lines) 429 file_set = set(file_lines) 430 if SHARP_S_TO_DOUBLE in file_set: 431 # ignore difference of double capital letter s and capital letter sharp s 432 file_set.symmetric_difference_update([SHARP_S_TO_DOUBLE, SHARP_S_TO_CAPITAL]) 433 if line_set == file_set: 434 return True 435 for line in line_set - file_set: 436 print(repr(line) + ' in reconstruction, not in file') 437 for line in file_set - line_set: 438 print(repr(line) + ' in file, not in reconstruction') 439 return False 440 441 442def map_to_chr(alphabet_map): 443 result = [] 444 ch_map = {} 445 for val in alphabet_map.values(): 446 chs = [ch for ch in alphabet_map if alphabet_map[ch] == val] 447 # non-cased characters (like Ethopic) are in both, matching chr file 448 lowercase = [ch for ch in chs if not ch.isupper()] 449 uppercase = [ch for ch in chs if not ch.islower()] 450 # print(val, `lowercase`, `uppercase`) 451 assert len(lowercase) == 1, 'expected 1 lowercase character' 452 assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character' 453 ch_map[val] = lowercase[0] 454 result.append(''.join(lowercase + uppercase)) 455 ch_map[0] = '.' 456 return (ch_map, result) 457 458 459def get_pattern(pattern_data, ix): 460 pattern_offset = struct.unpack('<I', pattern_data[8:12])[0] 461 entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0] 462 pat_len = entry >> 26 463 pat_shift = (entry >> 20) & 0x1f 464 offset = pattern_offset + (entry & 0xfffff) 465 return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift 466 467 468def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): 469 (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) 470 node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0] 471 pattern = node_entry >> pattern_shift 472 if pattern: 473 result = [] 474 is_exception = False 475 pat = get_pattern(pattern_data, pattern) 476 for i in range(len(s) + 1): 477 pat_off = i - 1 + len(pat) - len(s) 478 if pat_off < 0: 479 code = 0 480 else: 481 code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] 482 if 1 <= code <= 9: 483 result.append('%d' % code) 484 elif code == 10: 485 is_exception = True 486 elif code == 11: 487 result.append('-') 488 is_exception = True 489 else: 490 assert code == 0, 'unexpected code' 491 if i < len(s): 492 result.append(s[i]) 493 pat_str = ''.join(result) 494 #print(`pat_str`, `pat`) 495 if is_exception: 496 assert pat_str[0] == '.', "expected leading '.'" 497 assert pat_str[-1] == '.', "expected trailing '.'" 498 exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' 499 else: 500 patterns.append(pat_str) 501 for ch in ch_map: 502 edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0] 503 link = (edge_entry & link_mask) >> link_shift 504 if link != 0 and ch == (edge_entry & char_mask): 505 sch = s + ch_map[ch] 506 traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) 507 508 509# Verify the generated binary file by reconstructing the textual representations 510# from the binary hyb file, then checking that they're identical (mod the order of 511# lines within the file, which is irrelevant). This function makes assumptions that 512# are stronger than absolutely necessary (in particular, that the patterns are in 513# lowercase as defined by python islower). 514def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): 515 with open(hyb_fn, 'rb') as f: 516 hyb_data = f.read() 517 header = hyb_data[0: 6 * 4] 518 (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) 519 alphabet_data = hyb_data[alphabet_off:trie_off] 520 trie_data = hyb_data[trie_off:pattern_off] 521 pattern_data = hyb_data[pattern_off:file_size] 522 523 # reconstruct alphabet table 524 alphabet_version = struct.unpack('<I', alphabet_data[:4])[0] 525 alphabet_map = {} 526 if alphabet_version == 0: 527 (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12]) 528 for ch in range(min_ch, max_ch): 529 offset = 12 + ch - min_ch 530 b = struct.unpack('B', alphabet_data[offset : offset + 1])[0] 531 if b != 0: 532 alphabet_map[unichr(ch)] = b 533 else: 534 assert alphabet_version == 1 535 n_entries = struct.unpack('<I', alphabet_data[4:8])[0] 536 for i in range(n_entries): 537 entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0] 538 alphabet_map[unichr(entry >> 11)] = entry & 0x7ff 539 540 ch_map, reconstructed_chr = map_to_chr(alphabet_map) 541 542 # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 543 if u'\u0587' in reconstructed_chr: 544 reconstructed_chr.remove(u'\u0587') 545 reconstructed_chr.append(u'\u0587\u0535\u0552') 546 547 assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' 548 549 # reconstruct trie 550 patterns = [] 551 exceptions = [] 552 traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) 553 554 # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0> 555 if u'\u044c' in patterns: 556 patterns.remove(u'\u044c') 557 patterns.append(u'0\u044c0') 558 559 assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' 560 assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' 561 562 563def main(): 564 global VERBOSE 565 try: 566 opts, args = getopt.getopt(sys.argv[1:], 'v') 567 except getopt.GetoptError as err: 568 print(str(err)) 569 sys.exit(1) 570 for o, _ in opts: 571 if o == '-v': 572 VERBOSE = True 573 pat_fn, out_fn = args 574 hyph = load(pat_fn) 575 if pat_fn.endswith('.pat.txt'): 576 chr_fn = pat_fn[:-8] + '.chr.txt' 577 ch_map = load_chr(chr_fn) 578 hyp_fn = pat_fn[:-8] + '.hyp.txt' 579 load_hyp(hyph, hyp_fn) 580 generate_hyb_file(hyph, ch_map, out_fn) 581 verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) 582 583if __name__ == '__main__': 584 main() 585