# Step 04 - generate Java literals.
#
# Java byte-code has ridiculous restrictions. There is no such thing as
# "array literal" - those are implemented as series of data[x] = y;
# as a consequence N-byte array will use 7N bytes in class, plus N bytes
# in instantiated variable. Also no literal could be longer than 64KiB.
#
# To keep dictionary data compact both in source code and in compiled format
# we use the following tricks:
#  * use String as a data container
#  * store only lowest 7 bits; i.e. all characters fit ASCII table; this allows
#    efficient conversion to byte array; also ASCII characters use only 1 byte
#    of memory (UTF-8 encoding)
#  * RLE-compress sequence of 8-th bits
#
# This script generates literals used in Java code.
#
# Runs unchanged on Python 2 and Python 3.

try:
    _unichr = unichr  # Python 2: chr() is limited to 0..255 there.
except NameError:
    _unichr = chr  # Python 3: chr() already covers all code points.

# Input: raw dictionary bytes.  Output: Java source fragment with the literals.
bin_path = "dictionary.bin"
src_path = "DictionaryData.inc.java"

# Bias added to every run length stored in SKIP_FLIP.
# NOTE(review): presumably chosen so that typical run lengths map to printable
# ASCII characters that need no escaping; the Java-side decoder has to
# subtract the same value - confirm against the consumer.
skip_flip_offset = 36

# Largest value a single Java `char` (and a 4-digit \uXXXX escape) can hold.
_MAX_JAVA_CHAR = 0xFFFF

# Characters that have a dedicated short escape in Java string literals.
_JAVA_ESCAPES = {
    "\r": "\\r",
    "\n": "\\n",
    "\t": "\\t",
    "\"": "\\\"",
    "\\": "\\\\",
}


def _run_char(count):
    """Return the character that stores one (already biased) run counter.

    Raises:
        ValueError: if the counter does not fit into a 16-bit Java char; a
            longer value would be emitted as a 5-digit escape, which Java
            would silently misread as a 4-digit escape plus a stray digit.
    """
    if count > _MAX_JAVA_CHAR:
        raise ValueError(
            "run counter %d does not fit into a Java char" % count)
    return _unichr(count)


def split_dictionary(data):
    """Split raw bytes into 7-bit characters and RLE-coded 8th bits.

    Args:
        data: the dictionary contents (bytes / bytearray / Python 2 str).

    Returns:
        A `(low, hi)` tuple of lists of one-character strings:
        `low` holds the lowest 7 bits of every input byte, in order;
        `hi` holds run lengths (each biased by `skip_flip_offset`) of
        consecutive bytes with equal 8th bit.  Runs alternate, starting with
        a run of bytes whose 8th bit is clear; that first run may be empty.

    Raises:
        ValueError: if a biased run length exceeds 0xFFFF.
    """
    low = []
    hi = []
    is_skip = True  # True while counting bytes with the 8th bit clear.
    cntr = skip_flip_offset
    # bytearray() yields integers on both Python 2 and Python 3.
    for value in bytearray(data):
        low.append(chr(value & 0x7F))
        if (value >= 0x80) != is_skip:
            # Byte belongs to the current run.
            cntr += 1
        else:
            # 8th bit changed: flush the finished run and start the next one,
            # which already contains the current byte.
            is_skip = not is_skip
            hi.append(_run_char(cntr))
            cntr = skip_flip_offset + 1
    # Flush the trailing run.
    hi.append(_run_char(cntr))
    return low, hi


def escape(chars):
    """Escape characters for use inside a Java string literal.

    Args:
        chars: iterable of one-character strings.

    Returns:
        A list with one entry per input character: a short escape for CR, LF,
        TAB, double quote and backslash; a `\\uXXXX` escape for anything
        below 32 or at/above 127; the character itself otherwise.
    """
    result = []
    for c in chars:
        if c in _JAVA_ESCAPES:
            result.append(_JAVA_ESCAPES[c])
        elif ord(c) < 32 or ord(c) >= 127:
            result.append("\\u%04X" % ord(c))
        else:
            result.append(c)
    return result


def build_source(data):
    """Return the Java source text declaring DATA0, DATA1 and SKIP_FLIP.

    The 7-bit payload is split into two halves (DATA0 gets the first
    `len // 2` characters, DATA1 the rest).
    """
    low, hi = split_dictionary(data)
    half = len(low) // 2
    source_code = [
        " private static final String DATA0 = \"",
        "".join(escape(low[:half])), "\";\n",
        " private static final String DATA1 = \"",
        "".join(escape(low[half:])), "\";\n",
        " private static final String SKIP_FLIP = \"",
        "".join(escape(hi)), "\";\n",
    ]
    return "".join(source_code)


def main():
    """Read `bin_path` and write the generated literals to `src_path`."""
    with open(bin_path, "rb") as raw:
        data = raw.read()

    # The generated text is pure ASCII (everything else is \u-escaped), so the
    # platform default text encoding is safe here.
    with open(src_path, "w") as source:
        source.write(build_source(data))


if __name__ == "__main__":
    main()