1#!/usr/bin/env python
2
3# Copyright (C) 2015 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the 'License');
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an 'AS IS' BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""
18Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
19into binary format. See doc/hyb_file_format.md for more information.
20
21Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb
22
23Optional -v parameter turns on verbose debugging.
24
25"""
26
27from __future__ import print_function
28
29import io
30import sys
31import struct
32import math
33import getopt
34
35
36VERBOSE = False
37
38# U+00DF is LATIN SMALL LETTER SHARP S
39# U+1E9E is LATIN CAPITAL LETTER SHARP S
40SHARP_S_TO_DOUBLE = u'\u00dfSS'
41SHARP_S_TO_CAPITAL = u'\u00df\u1e9e'
42
43if sys.version_info[0] >= 3:
44    def unichr(x):
45        return chr(x)
46
47
48# number of bits required to represent numbers up to n inclusive
49def num_bits(n):
50    return 1 + int(math.log(n, 2)) if n > 0 else 0
51
52
53class Node:
54
55    def __init__(self):
56        self.succ = {}
57        self.res = None
58        self.fsm_pat = None
59        self.fail = None
60
61
62# List of free slots, implemented as doubly linked list
63class Freelist:
64
65    def __init__(self):
66        self.first = None
67        self.last = None
68        self.pred = []
69        self.succ = []
70
71    def grow(self):
72        this = len(self.pred)
73        self.pred.append(self.last)
74        self.succ.append(None)
75        if self.last is None:
76            self.first = this
77        else:
78            self.succ[self.last] = this
79        self.last = this
80
81    def next(self, cursor):
82        if cursor == 0:
83            cursor = self.first
84        if cursor is None:
85            self.grow()
86            result = self.last
87        else:
88            result = cursor
89        return result, self.succ[result]
90
91    def is_free(self, ix):
92        while ix >= len(self.pred):
93            self.grow()
94        return self.pred[ix] != -1
95
96    def use(self, ix):
97        if self.pred[ix] is None:
98            self.first = self.succ[ix]
99        else:
100            self.succ[self.pred[ix]] = self.succ[ix]
101        if self.succ[ix] is None:
102            self.last = self.pred[ix]
103        else:
104            self.pred[self.succ[ix]] = self.pred[ix]
105        if self.pred[ix] == -1:
106            assert self.pred[ix] != -1, 'double free!'
107        self.pred[ix] = -1
108
109
110def combine(a, b):
111    if a is None: return b
112    if b is None: return a
113    if len(b) < len(a): a, b = b, a
114    res = b[:len(b) - len(a)]
115    for i in range(len(a)):
116        res.append(max(a[i], b[i + len(b) - len(a)]))
117    return res
118
119
120def trim(pattern):
121    for ix in range(len(pattern)):
122        if pattern[ix] != 0:
123            return pattern[ix:]
124
125
126def pat_to_binary(pattern):
127    return b''.join(struct.pack('B', x) for x in pattern)
128
129
130class Hyph:
131
132    def __init__(self):
133        self.root = Node()
134        self.root.str = '<root>'
135        self.node_list = [self.root]
136
137    # Add a pattern (word fragment with numeric codes, such as ".ad4der")
138    def add_pat(self, pat):
139        lastWasLetter = False
140        haveSeenNumber = False
141        result = []
142        word = ''
143        for c in pat:
144            if c.isdigit():
145                result.append(int(c))
146                lastWasLetter = False
147                haveSeenNumber = True
148            else:
149                word += c
150                if lastWasLetter and haveSeenNumber:
151                    result.append(0)
152                lastWasLetter = True
153        if lastWasLetter:
154            result.append(0)
155
156        self.add_word_res(word, result)
157
158    # Add an exception (word with hyphens, such as "ta-ble")
159    def add_exception(self, hyph_word):
160        res = []
161        word = ['.']
162        need_10 = False
163        for c in hyph_word:
164            if c == '-':
165                res.append(11)
166                need_10 = False
167            else:
168                if need_10:
169                    res.append(10)
170                word.append(c)
171                need_10 = True
172        word.append('.')
173        res.append(0)
174        res.append(0)
175        if VERBOSE:
176            print(word, res)
177        self.add_word_res(''.join(word), res)
178
179    def add_word_res(self, word, result):
180        if VERBOSE:
181            print(word, result)
182
183        t = self.root
184        s = ''
185        for c in word:
186            s += c
187            if c not in t.succ:
188                new_node = Node()
189                new_node.str = s
190                self.node_list.append(new_node)
191                t.succ[c] = new_node
192            t = t.succ[c]
193        t.res = result
194
195    def pack(self, node_list, ch_map, use_node=False):
196        size = 0
197        self.node_map = {}
198        nodes = Freelist()
199        edges = Freelist()
200        edge_start = 1 if use_node else 0
201        for node in node_list:
202            succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
203            if len(succ):
204                cursor = 0
205                while True:
206                    edge_ix, cursor = edges.next(cursor)
207                    ix = edge_ix - succ[0]
208                    if (ix >= 0 and nodes.is_free(ix) and
209                            all(edges.is_free(ix + s) for s in succ) and
210                            ((not use_node) or edges.is_free(ix))):
211                        break
212            elif use_node:
213                ix, _ = edges.next(0)
214                nodes.is_free(ix)  # actually don't need nodes at all when use_node,
215                # but keep it happy
216            else:
217                ix, _ = nodes.next(0)
218            node.ix = ix
219            self.node_map[ix] = node
220            nodes.use(ix)
221            size = max(size, ix)
222            if use_node:
223                edges.use(ix)
224            for s in succ:
225                edges.use(ix + s)
226        size += max(ch_map.values()) + 1
227        return size
228
229    # return list of nodes in bfs order
230    def bfs(self, ch_map):
231        result = [self.root]
232        ix = 0
233        while ix < len(result):
234            node = result[ix]
235            node.bfs_ix = ix
236            mapped = {}
237            for c, next in node.succ.items():
238                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
239                mapped[ch_map[c]] = next
240            for i in sorted(mapped.keys()):
241                result.append(mapped[i])
242            ix += 1
243        self.bfs_order = result
244        return result
245
246    # suffix compression - convert the trie into an acyclic digraph, merging nodes when
247    # the subtries are identical
248    def dedup(self):
249        uniques = []
250        dupmap = {}
251        dedup_ix = [0] * len(self.bfs_order)
252        for ix in reversed(range(len(self.bfs_order))):
253            # construct string representation of node
254            node = self.bfs_order[ix]
255            if node.res is None:
256                s = ''
257            else:
258                s = ''.join(str(c) for c in node.res)
259            for c in sorted(node.succ.keys()):
260                succ = node.succ[c]
261                s += ' ' + c + str(dedup_ix[succ.bfs_ix])
262            if s in dupmap:
263                dedup_ix[ix] = dupmap[s]
264            else:
265                uniques.append(node)
266                dedup_ix[ix] = ix
267            dupmap[s] = dedup_ix[ix]
268        uniques.reverse()
269        print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
270        return dedup_ix, uniques
271
272
273# load the ".pat" file, which contains patterns such as a1b2c3
274def load(fn):
275    hyph = Hyph()
276    with io.open(fn, encoding='UTF-8') as f:
277        for l in f:
278            pat = l.strip()
279            hyph.add_pat(pat)
280    return hyph
281
282
283# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
284def load_chr(fn):
285    ch_map = {'.': 0}
286    with io.open(fn, encoding='UTF-8') as f:
287        for i, l in enumerate(f):
288            l = l.strip()
289            if len(l) > 2:
290                if l == SHARP_S_TO_DOUBLE:
291                    # replace with lowercasing from capital letter sharp s
292                    l = SHARP_S_TO_CAPITAL
293                else:
294                    # lowercase maps to multi-character uppercase sequence, ignore uppercase for now
295                    l = l[:1]
296            else:
297                assert len(l) == 2, 'expected 2 chars in chr'
298            for c in l:
299                ch_map[c] = i + 1
300    return ch_map
301
302
303# load exceptions with explicit hyphens
304def load_hyp(hyph, fn):
305    with io.open(fn, encoding='UTF-8') as f:
306        for l in f:
307            hyph.add_exception(l.strip())
308
309
310def generate_header(alphabet, trie, pattern):
311    alphabet_off = 6 * 4
312    trie_off = alphabet_off + len(alphabet)
313    pattern_off = trie_off + len(trie)
314    file_size = pattern_off + len(pattern)
315    data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size]
316    return struct.pack('<6I', *data)
317
318
319def generate_alphabet(ch_map):
320    ch_map = ch_map.copy()
321    del ch_map['.']
322    min_ch = ord(min(ch_map))
323    max_ch = ord(max(ch_map))
324    if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
325        # generate format 0
326        data = [0] * (max_ch - min_ch + 1)
327        for c, val in ch_map.items():
328            data[ord(c) - min_ch] = val
329        result = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
330        for b in data:
331            result.append(struct.pack('<B', b))
332    else:
333        # generate format 1
334        assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
335        result = [struct.pack('<2I', 1, len(ch_map))]
336        for c, val in sorted(ch_map.items()):
337            data = (ord(c) << 11) | val
338            result.append(struct.pack('<I', data))
339    binary = b''.join(result)
340    if len(binary) % 4 != 0:
341        binary += b'\x00' * (4 - len(binary) % 4)
342    return binary
343
344
345# assumes hyph structure has been packed, ie node.ix values have been set
346def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
347    ch_array = [0] * n_trie
348    link_array = [0] * n_trie
349    pat_array = [0] * n_trie
350    link_shift = num_bits(max(ch_map.values()))
351    char_mask = (1 << link_shift) - 1
352    pattern_shift = link_shift + num_bits(n_trie - 1)
353    link_mask = (1 << pattern_shift) - (1 << link_shift)
354    result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]
355
356    for node in dedup_nodes:
357        ix = node.ix
358        if node.res is not None:
359            pat_array[ix] = patmap[pat_to_binary(node.res)]
360        for c, next in node.succ.items():
361            c_num = ch_map[c]
362            link_ix = ix + c_num
363            ch_array[link_ix] = c_num
364            if dedup_ix is None:
365                dedup_next = next
366            else:
367                dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]]
368            link_array[link_ix] = dedup_next.ix
369
370    for i in range(n_trie):
371        #print((pat_array[i], link_array[i], ch_array[i]))
372        packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
373        result.append(struct.pack('<I', packed))
374    return b''.join(result)
375
376
377def generate_pattern(pats):
378    pat_array = [0]
379    patmap = {b'': 0}
380
381    raw_pat_array = []
382    raw_pat_size = 0
383    raw_patmap = {}
384
385    for pat in pats:
386        if pat is None:
387            continue
388        pat_str = pat_to_binary(pat)
389        if pat_str not in patmap:
390            shift = 0
391            while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
392                shift += 1
393            rawpat = pat_str[:len(pat) - shift]
394            if rawpat not in raw_patmap:
395                raw_patmap[rawpat] = raw_pat_size
396                raw_pat_array.append(rawpat)
397                raw_pat_size += len(rawpat)
398            data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat]
399            patmap[pat_str] = len(pat_array)
400            pat_array.append(data)
401    data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size]
402    result = [struct.pack('<4I', *data)]
403    for x in pat_array:
404        result.append(struct.pack('<I', x))
405    result.extend(raw_pat_array)
406    return patmap, b''.join(result)
407
408
409def generate_hyb_file(hyph, ch_map, hyb_fn):
410    bfs = hyph.bfs(ch_map)
411    dedup_ix, dedup_nodes = hyph.dedup()
412    n_trie = hyph.pack(dedup_nodes, ch_map)
413    alphabet = generate_alphabet(ch_map)
414    patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
415    trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
416    header = generate_header(alphabet, trie, pattern)
417
418    with open(hyb_fn, 'wb') as f:
419        f.write(header)
420        f.write(alphabet)
421        f.write(trie)
422        f.write(pattern)
423
424
425# Verify that the file contains the same lines as the lines argument, in arbitrary order
426def verify_file_sorted(lines, fn):
427    file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')]
428    line_set = set(lines)
429    file_set = set(file_lines)
430    if SHARP_S_TO_DOUBLE in file_set:
431        # ignore difference of double capital letter s and capital letter sharp s
432        file_set.symmetric_difference_update([SHARP_S_TO_DOUBLE, SHARP_S_TO_CAPITAL])
433    if line_set == file_set:
434        return True
435    for line in line_set - file_set:
436        print(repr(line) + ' in reconstruction, not in file')
437    for line in file_set - line_set:
438        print(repr(line) + ' in file, not in reconstruction')
439    return False
440
441
442def map_to_chr(alphabet_map):
443    result = []
444    ch_map = {}
445    for val in alphabet_map.values():
446        chs = [ch for ch in alphabet_map if alphabet_map[ch] == val]
447        # non-cased characters (like Ethopic) are in both, matching chr file
448        lowercase = [ch for ch in chs if not ch.isupper()]
449        uppercase = [ch for ch in chs if not ch.islower()]
450        # print(val, `lowercase`, `uppercase`)
451        assert len(lowercase) == 1, 'expected 1 lowercase character'
452        assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
453        ch_map[val] = lowercase[0]
454        result.append(''.join(lowercase + uppercase))
455    ch_map[0] = '.'
456    return (ch_map, result)
457
458
459def get_pattern(pattern_data, ix):
460    pattern_offset = struct.unpack('<I', pattern_data[8:12])[0]
461    entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0]
462    pat_len = entry >> 26
463    pat_shift = (entry >> 20) & 0x1f
464    offset = pattern_offset + (entry & 0xfffff)
465    return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift
466
467
468def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
469    (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
470    node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
471    pattern = node_entry >> pattern_shift
472    if pattern:
473        result = []
474        is_exception = False
475        pat = get_pattern(pattern_data, pattern)
476        for i in range(len(s) + 1):
477            pat_off = i - 1 + len(pat) - len(s)
478            if pat_off < 0:
479                code = 0
480            else:
481                code = struct.unpack('B', pat[pat_off : pat_off + 1])[0]
482            if 1 <= code <= 9:
483                result.append('%d' % code)
484            elif code == 10:
485                is_exception = True
486            elif code == 11:
487                result.append('-')
488                is_exception = True
489            else:
490                assert code == 0, 'unexpected code'
491            if i < len(s):
492                result.append(s[i])
493        pat_str = ''.join(result)
494        #print(`pat_str`, `pat`)
495        if is_exception:
496            assert pat_str[0] == '.', "expected leading '.'"
497            assert pat_str[-1] == '.', "expected trailing '.'"
498            exceptions.append(pat_str[1:-1])  # strip leading and trailing '.'
499        else:
500            patterns.append(pat_str)
501    for ch in ch_map:
502        edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
503        link = (edge_entry & link_mask) >> link_shift
504        if link != 0 and ch == (edge_entry & char_mask):
505            sch = s + ch_map[ch]
506            traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)
507
508
509# Verify the generated binary file by reconstructing the textual representations
510# from the binary hyb file, then checking that they're identical (mod the order of
511# lines within the file, which is irrelevant). This function makes assumptions that
512# are stronger than absolutely necessary (in particular, that the patterns are in
513# lowercase as defined by python islower).
514def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
515    with open(hyb_fn, 'rb') as f:
516        hyb_data = f.read()
517    header = hyb_data[0: 6 * 4]
518    (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
519    alphabet_data = hyb_data[alphabet_off:trie_off]
520    trie_data = hyb_data[trie_off:pattern_off]
521    pattern_data = hyb_data[pattern_off:file_size]
522
523    # reconstruct alphabet table
524    alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
525    alphabet_map = {}
526    if alphabet_version == 0:
527        (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
528        for ch in range(min_ch, max_ch):
529            offset = 12 + ch - min_ch
530            b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
531            if b != 0:
532                alphabet_map[unichr(ch)] = b
533    else:
534        assert alphabet_version == 1
535        n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
536        for i in range(n_entries):
537            entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
538            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff
539
540    ch_map, reconstructed_chr = map_to_chr(alphabet_map)
541
542    # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
543    if u'\u0587' in reconstructed_chr:
544        reconstructed_chr.remove(u'\u0587')
545        reconstructed_chr.append(u'\u0587\u0535\u0552')
546
547    assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified'
548
549    # reconstruct trie
550    patterns = []
551    exceptions = []
552    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)
553
554    # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0>
555    if u'\u044c' in patterns:
556        patterns.remove(u'\u044c')
557        patterns.append(u'0\u044c0')
558
559    assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified'
560    assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified'
561
562
563def main():
564    global VERBOSE
565    try:
566        opts, args = getopt.getopt(sys.argv[1:], 'v')
567    except getopt.GetoptError as err:
568        print(str(err))
569        sys.exit(1)
570    for o, _ in opts:
571        if o == '-v':
572            VERBOSE = True
573    pat_fn, out_fn = args
574    hyph = load(pat_fn)
575    if pat_fn.endswith('.pat.txt'):
576        chr_fn = pat_fn[:-8] + '.chr.txt'
577        ch_map = load_chr(chr_fn)
578        hyp_fn = pat_fn[:-8] + '.hyp.txt'
579        load_hyp(hyph, hyp_fn)
580        generate_hyb_file(hyph, ch_map, out_fn)
581        verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)
582
583if __name__ == '__main__':
584    main()
585