1#!/usr/bin/env python 2 3# Copyright (C) 2015 The Android Open Source Project 4# 5# Licensed under the Apache License, Version 2.0 (the 'License'); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an 'AS IS' BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17""" 18Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) 19into binary format. See doc/hyb_file_format.md for more information. 20 21Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb 22 23Optional -v parameter turns on verbose debugging. 24 25""" 26 27from __future__ import print_function 28 29import io 30import sys 31import struct 32import math 33import getopt 34 35 36VERBOSE = False 37 38 39if sys.version_info[0] >= 3: 40 def unichr(x): 41 return chr(x) 42 43 44# number of bits required to represent numbers up to n inclusive 45def num_bits(n): 46 return 1 + int(math.log(n, 2)) if n > 0 else 0 47 48 49class Node: 50 51 def __init__(self): 52 self.succ = {} 53 self.res = None 54 self.fsm_pat = None 55 self.fail = None 56 57 58# List of free slots, implemented as doubly linked list 59class Freelist: 60 61 def __init__(self): 62 self.first = None 63 self.last = None 64 self.pred = [] 65 self.succ = [] 66 67 def grow(self): 68 this = len(self.pred) 69 self.pred.append(self.last) 70 self.succ.append(None) 71 if self.last is None: 72 self.first = this 73 else: 74 self.succ[self.last] = this 75 self.last = this 76 77 def next(self, cursor): 78 if cursor == 0: 79 cursor = self.first 80 if cursor is None: 81 self.grow() 82 result = self.last 83 else: 84 result = cursor 85 return result, self.succ[result] 86 87 def is_free(self, ix): 88 while ix >= len(self.pred): 89 self.grow() 90 return self.pred[ix] != -1 91 92 def use(self, ix): 93 if self.pred[ix] is None: 94 self.first = self.succ[ix] 95 else: 96 self.succ[self.pred[ix]] = self.succ[ix] 97 if self.succ[ix] is None: 98 self.last = self.pred[ix] 99 else: 100 self.pred[self.succ[ix]] = self.pred[ix] 101 if self.pred[ix] == -1: 102 assert self.pred[ix] != -1, 'double free!' 103 self.pred[ix] = -1 104 105 106def combine(a, b): 107 if a is None: return b 108 if b is None: return a 109 if len(b) < len(a): a, b = b, a 110 res = b[:len(b) - len(a)] 111 for i in range(len(a)): 112 res.append(max(a[i], b[i + len(b) - len(a)])) 113 return res 114 115 116def trim(pattern): 117 for ix in range(len(pattern)): 118 if pattern[ix] != 0: 119 return pattern[ix:] 120 121 122def pat_to_binary(pattern): 123 return b''.join(struct.pack('B', x) for x in pattern) 124 125 126class Hyph: 127 128 def __init__(self): 129 self.root = Node() 130 self.root.str = '<root>' 131 self.node_list = [self.root] 132 133 # Add a pattern (word fragment with numeric codes, such as ".ad4der") 134 def add_pat(self, pat): 135 lastWasLetter = False 136 haveSeenNumber = False 137 result = [] 138 word = '' 139 for c in pat: 140 if c.isdigit(): 141 result.append(int(c)) 142 lastWasLetter = False 143 haveSeenNumber = True 144 else: 145 word += c 146 if lastWasLetter and haveSeenNumber: 147 result.append(0) 148 lastWasLetter = True 149 if lastWasLetter: 150 result.append(0) 151 152 self.add_word_res(word, result) 153 154 # Add an exception (word with hyphens, such as "ta-ble") 155 def add_exception(self, hyph_word): 156 res = [] 157 word = ['.'] 158 need_10 = False 159 for c in hyph_word: 160 if c == '-': 161 res.append(11) 162 need_10 = False 163 else: 164 if need_10: 165 res.append(10) 166 word.append(c) 167 need_10 = True 168 word.append('.') 169 res.append(0) 170 res.append(0) 171 if VERBOSE: 172 print(word, res) 173 self.add_word_res(''.join(word), res) 174 175 def add_word_res(self, word, result): 176 if VERBOSE: 177 print(word, result) 178 179 t = self.root 180 s = '' 181 for c in word: 182 s += c 183 if c not in t.succ: 184 new_node = Node() 185 new_node.str = s 186 self.node_list.append(new_node) 187 t.succ[c] = new_node 188 t = t.succ[c] 189 t.res = result 190 191 def pack(self, node_list, ch_map, use_node=False): 192 size = 0 193 self.node_map = {} 194 nodes = Freelist() 195 edges = Freelist() 196 edge_start = 1 if use_node else 0 197 for node in node_list: 198 succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) 199 if len(succ): 200 cursor = 0 201 while True: 202 edge_ix, cursor = edges.next(cursor) 203 ix = edge_ix - succ[0] 204 if (ix >= 0 and nodes.is_free(ix) and 205 all(edges.is_free(ix + s) for s in succ) and 206 ((not use_node) or edges.is_free(ix))): 207 break 208 elif use_node: 209 ix, _ = edges.next(0) 210 nodes.is_free(ix) # actually don't need nodes at all when use_node, 211 # but keep it happy 212 else: 213 ix, _ = nodes.next(0) 214 node.ix = ix 215 self.node_map[ix] = node 216 nodes.use(ix) 217 size = max(size, ix) 218 if use_node: 219 edges.use(ix) 220 for s in succ: 221 edges.use(ix + s) 222 size += max(ch_map.values()) + 1 223 return size 224 225 # return list of nodes in bfs order 226 def bfs(self, ch_map): 227 result = [self.root] 228 ix = 0 229 while ix < len(result): 230 node = result[ix] 231 node.bfs_ix = ix 232 mapped = {} 233 for c, next in node.succ.items(): 234 assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) 235 mapped[ch_map[c]] = next 236 for i in sorted(mapped.keys()): 237 result.append(mapped[i]) 238 ix += 1 239 self.bfs_order = result 240 return result 241 242 # suffix compression - convert the trie into an acyclic digraph, merging nodes when 243 # the subtries are identical 244 def dedup(self): 245 uniques = [] 246 dupmap = {} 247 dedup_ix = [0] * len(self.bfs_order) 248 for ix in reversed(range(len(self.bfs_order))): 249 # construct string representation of node 250 node = self.bfs_order[ix] 251 if node.res is None: 252 s = '' 253 else: 254 s = ''.join(str(c) for c in node.res) 255 for c in sorted(node.succ.keys()): 256 succ = node.succ[c] 257 s += ' ' + c + str(dedup_ix[succ.bfs_ix]) 258 if s in dupmap: 259 dedup_ix[ix] = dupmap[s] 260 else: 261 uniques.append(node) 262 dedup_ix[ix] = ix 263 dupmap[s] = dedup_ix[ix] 264 uniques.reverse() 265 print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') 266 return dedup_ix, uniques 267 268 269# load the ".pat" file, which contains patterns such as a1b2c3 270def load(fn): 271 hyph = Hyph() 272 with io.open(fn, encoding='UTF-8') as f: 273 for l in f: 274 pat = l.strip() 275 hyph.add_pat(pat) 276 return hyph 277 278 279# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc. 280def load_chr(fn): 281 ch_map = {'.': 0} 282 with io.open(fn, encoding='UTF-8') as f: 283 for i, l in enumerate(f): 284 l = l.strip() 285 if len(l) > 2: 286 # lowercase maps to multi-character uppercase sequence, ignore uppercase for now 287 l = l[:1] 288 else: 289 assert len(l) == 2, 'expected 2 chars in chr' 290 for c in l: 291 ch_map[c] = i + 1 292 return ch_map 293 294 295# load exceptions with explicit hyphens 296def load_hyp(hyph, fn): 297 with io.open(fn, encoding='UTF-8') as f: 298 for l in f: 299 hyph.add_exception(l.strip()) 300 301 302def generate_header(alphabet, trie, pattern): 303 alphabet_off = 6 * 4 304 trie_off = alphabet_off + len(alphabet) 305 pattern_off = trie_off + len(trie) 306 file_size = pattern_off + len(pattern) 307 data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] 308 return struct.pack('<6I', *data) 309 310 311def generate_alphabet(ch_map): 312 ch_map = ch_map.copy() 313 del ch_map['.'] 314 min_ch = ord(min(ch_map)) 315 max_ch = ord(max(ch_map)) 316 if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: 317 # generate format 0 318 data = [0] * (max_ch - min_ch + 1) 319 for c, val in ch_map.items(): 320 data[ord(c) - min_ch] = val 321 result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] 322 for b in data: 323 result.append(struct.pack('<B', b)) 324 else: 325 # generate format 1 326 assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded' 327 result = [struct.pack('<2I', 1, len(ch_map))] 328 for c, val in sorted(ch_map.items()): 329 data = (ord(c) << 11) | val 330 result.append(struct.pack('<I', data)) 331 binary = b''.join(result) 332 if len(binary) % 4 != 0: 333 binary += b'\x00' * (4 - len(binary) % 4) 334 return binary 335 336 337# assumes hyph structure has been packed, ie node.ix values have been set 338def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap): 339 ch_array = [0] * n_trie 340 link_array = [0] * n_trie 341 pat_array = [0] * n_trie 342 link_shift = num_bits(max(ch_map.values())) 343 char_mask = (1 << link_shift) - 1 344 pattern_shift = link_shift + num_bits(n_trie - 1) 345 link_mask = (1 << pattern_shift) - (1 << link_shift) 346 result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)] 347 348 for node in dedup_nodes: 349 ix = node.ix 350 if node.res is not None: 351 pat_array[ix] = patmap[pat_to_binary(node.res)] 352 for c, next in node.succ.items(): 353 c_num = ch_map[c] 354 link_ix = ix + c_num 355 ch_array[link_ix] = c_num 356 if dedup_ix is None: 357 dedup_next = next 358 else: 359 dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]] 360 link_array[link_ix] = dedup_next.ix 361 362 for i in range(n_trie): 363 #print((pat_array[i], link_array[i], ch_array[i])) 364 packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i] 365 result.append(struct.pack('<I', packed)) 366 return b''.join(result) 367 368 369def generate_pattern(pats): 370 pat_array = [0] 371 patmap = {b'': 0} 372 373 raw_pat_array = [] 374 raw_pat_size = 0 375 raw_patmap = {} 376 377 for pat in pats: 378 if pat is None: 379 continue 380 pat_str = pat_to_binary(pat) 381 if pat_str not in patmap: 382 shift = 0 383 while shift < len(pat) and pat[len(pat) - shift - 1] == 0: 384 shift += 1 385 rawpat = pat_str[:len(pat) - shift] 386 if rawpat not in raw_patmap: 387 raw_patmap[rawpat] = raw_pat_size 388 raw_pat_array.append(rawpat) 389 raw_pat_size += len(rawpat) 390 data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat] 391 patmap[pat_str] = len(pat_array) 392 pat_array.append(data) 393 data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size] 394 result = [struct.pack('<4I', *data)] 395 for x in pat_array: 396 result.append(struct.pack('<I', x)) 397 result.extend(raw_pat_array) 398 return patmap, b''.join(result) 399 400 401def generate_hyb_file(hyph, ch_map, hyb_fn): 402 bfs = hyph.bfs(ch_map) 403 dedup_ix, dedup_nodes = hyph.dedup() 404 n_trie = hyph.pack(dedup_nodes, ch_map) 405 alphabet = generate_alphabet(ch_map) 406 patmap, pattern = generate_pattern([n.res for n in hyph.node_list]) 407 trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap) 408 header = generate_header(alphabet, trie, pattern) 409 410 with open(hyb_fn, 'wb') as f: 411 f.write(header) 412 f.write(alphabet) 413 f.write(trie) 414 f.write(pattern) 415 416 417# Verify that the file contains the same lines as the lines argument, in arbitrary order 418def verify_file_sorted(lines, fn): 419 file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')] 420 line_set = set(lines) 421 file_set = set(file_lines) 422 if line_set == file_set: 423 return True 424 for line in line_set - file_set: 425 print(repr(line) + ' in reconstruction, not in file') 426 for line in file_set - line_set: 427 print(repr(line) + ' in file, not in reconstruction') 428 return False 429 430 431def map_to_chr(alphabet_map): 432 result = [] 433 ch_map = {} 434 for val in alphabet_map.values(): 435 chs = [ch for ch in alphabet_map if alphabet_map[ch] == val] 436 # non-cased characters (like Ethopic) are in both, matching chr file 437 lowercase = [ch for ch in chs if not ch.isupper()] 438 uppercase = [ch for ch in chs if not ch.islower()] 439 # print(val, `lowercase`, `uppercase`) 440 assert len(lowercase) == 1, 'expected 1 lowercase character' 441 assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character' 442 ch_map[val] = lowercase[0] 443 result.append(''.join(lowercase + uppercase)) 444 ch_map[0] = '.' 445 return (ch_map, result) 446 447 448def get_pattern(pattern_data, ix): 449 pattern_offset = struct.unpack('<I', pattern_data[8:12])[0] 450 entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0] 451 pat_len = entry >> 26 452 pat_shift = (entry >> 20) & 0x1f 453 offset = pattern_offset + (entry & 0xfffff) 454 return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift 455 456 457def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): 458 (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) 459 node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0] 460 pattern = node_entry >> pattern_shift 461 if pattern: 462 result = [] 463 is_exception = False 464 pat = get_pattern(pattern_data, pattern) 465 for i in range(len(s) + 1): 466 pat_off = i - 1 + len(pat) - len(s) 467 if pat_off < 0: 468 code = 0 469 else: 470 code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] 471 if 1 <= code <= 9: 472 result.append('%d' % code) 473 elif code == 10: 474 is_exception = True 475 elif code == 11: 476 result.append('-') 477 is_exception = True 478 else: 479 assert code == 0, 'unexpected code' 480 if i < len(s): 481 result.append(s[i]) 482 pat_str = ''.join(result) 483 #print(`pat_str`, `pat`) 484 if is_exception: 485 assert pat_str[0] == '.', "expected leading '.'" 486 assert pat_str[-1] == '.', "expected trailing '.'" 487 exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' 488 else: 489 patterns.append(pat_str) 490 for ch in ch_map: 491 edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0] 492 link = (edge_entry & link_mask) >> link_shift 493 if link != 0 and ch == (edge_entry & char_mask): 494 sch = s + ch_map[ch] 495 traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) 496 497 498# Verify the generated binary file by reconstructing the textual representations 499# from the binary hyb file, then checking that they're identical (mod the order of 500# lines within the file, which is irrelevant). This function makes assumptions that 501# are stronger than absolutely necessary (in particular, that the patterns are in 502# lowercase as defined by python islower). 503def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): 504 with open(hyb_fn, 'rb') as f: 505 hyb_data = f.read() 506 header = hyb_data[0: 6 * 4] 507 (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) 508 alphabet_data = hyb_data[alphabet_off:trie_off] 509 trie_data = hyb_data[trie_off:pattern_off] 510 pattern_data = hyb_data[pattern_off:file_size] 511 512 # reconstruct alphabet table 513 alphabet_version = struct.unpack('<I', alphabet_data[:4])[0] 514 alphabet_map = {} 515 if alphabet_version == 0: 516 (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12]) 517 for ch in range(min_ch, max_ch): 518 offset = 12 + ch - min_ch 519 b = struct.unpack('B', alphabet_data[offset : offset + 1])[0] 520 if b != 0: 521 alphabet_map[unichr(ch)] = b 522 else: 523 assert alphabet_version == 1 524 n_entries = struct.unpack('<I', alphabet_data[4:8])[0] 525 for i in range(n_entries): 526 entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0] 527 alphabet_map[unichr(entry >> 11)] = entry & 0x7ff 528 529 ch_map, reconstructed_chr = map_to_chr(alphabet_map) 530 531 # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 532 if u'\u0587' in reconstructed_chr: 533 reconstructed_chr.remove(u'\u0587') 534 reconstructed_chr.append(u'\u0587\u0535\u0552') 535 536 assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' 537 538 # reconstruct trie 539 patterns = [] 540 exceptions = [] 541 traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) 542 assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' 543 assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' 544 545 546def main(): 547 global VERBOSE 548 try: 549 opts, args = getopt.getopt(sys.argv[1:], 'v') 550 except getopt.GetoptError as err: 551 print(str(err)) 552 sys.exit(1) 553 for o, _ in opts: 554 if o == '-v': 555 VERBOSE = True 556 pat_fn, out_fn = args 557 hyph = load(pat_fn) 558 if pat_fn.endswith('.pat.txt'): 559 chr_fn = pat_fn[:-8] + '.chr.txt' 560 ch_map = load_chr(chr_fn) 561 hyp_fn = pat_fn[:-8] + '.hyp.txt' 562 load_hyp(hyph, hyp_fn) 563 generate_hyb_file(hyph, ch_map, out_fn) 564 verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) 565 566if __name__ == '__main__': 567 main() 568