1"""Extend the Python codecs module with a few encodings that are used in OpenType (name table) 2but missing from Python. See https://github.com/fonttools/fonttools/issues/236 for details.""" 3 4from __future__ import print_function, division, absolute_import 5from fontTools.misc.py23 import * 6import codecs 7import encodings 8 9class ExtendCodec(codecs.Codec): 10 11 def __init__(self, name, base_encoding, mapping): 12 self.name = name 13 self.base_encoding = base_encoding 14 self.mapping = mapping 15 self.reverse = {v:k for k,v in mapping.items()} 16 self.max_len = max(len(v) for v in mapping.values()) 17 self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode) 18 codecs.register_error(name, self.error) 19 20 def encode(self, input, errors='strict'): 21 assert errors == 'strict' 22 #return codecs.encode(input, self.base_encoding, self.name), len(input) 23 24 # The above line could totally be all we needed, relying on the error 25 # handling to replace the unencodable Unicode characters with our extended 26 # byte sequences. 27 # 28 # However, there seems to be a design bug in Python (probably intentional): 29 # the error handler for encoding is supposed to return a **Unicode** character, 30 # that then needs to be encodable itself... Ugh. 31 # 32 # So we implement what codecs.encode() should have been doing: which is expect 33 # error handler to return bytes() to be added to the output. 34 # 35 # This seems to have been fixed in Python 3.3. We should try using that and 36 # use fallback only if that failed. 37 # https://docs.python.org/3.3/library/codecs.html#codecs.register_error 38 39 length = len(input) 40 out = b'' 41 while input: 42 try: 43 part = codecs.encode(input, self.base_encoding) 44 out += part 45 input = '' # All converted 46 except UnicodeEncodeError as e: 47 # Convert the correct part 48 out += codecs.encode(input[:e.start], self.base_encoding) 49 replacement, pos = self.error(e) 50 out += replacement 51 input = input[pos:] 52 return out, length 53 54 def decode(self, input, errors='strict'): 55 assert errors == 'strict' 56 return codecs.decode(input, self.base_encoding, self.name), len(input) 57 58 def error(self, e): 59 if isinstance(e, UnicodeDecodeError): 60 for end in range(e.start + 1, e.end + 1): 61 s = e.object[e.start:end] 62 if s in self.mapping: 63 return self.mapping[s], end 64 elif isinstance(e, UnicodeEncodeError): 65 for end in range(e.start + 1, e.start + self.max_len + 1): 66 s = e.object[e.start:end] 67 if s in self.reverse: 68 return self.reverse[s], end 69 e.encoding = self.name 70 raise e 71 72 73_extended_encodings = { 74 "x_mac_japanese_ttx": ("shift_jis", { 75 b"\xFC": unichr(0x007C), 76 b"\x7E": unichr(0x007E), 77 b"\x80": unichr(0x005C), 78 b"\xA0": unichr(0x00A0), 79 b"\xFD": unichr(0x00A9), 80 b"\xFE": unichr(0x2122), 81 b"\xFF": unichr(0x2026), 82 }), 83 "x_mac_trad_chinese_ttx": ("big5", { 84 b"\x80": unichr(0x005C), 85 b"\xA0": unichr(0x00A0), 86 b"\xFD": unichr(0x00A9), 87 b"\xFE": unichr(0x2122), 88 b"\xFF": unichr(0x2026), 89 }), 90 "x_mac_korean_ttx": ("euc_kr", { 91 b"\x80": unichr(0x00A0), 92 b"\x81": unichr(0x20A9), 93 b"\x82": unichr(0x2014), 94 b"\x83": unichr(0x00A9), 95 b"\xFE": unichr(0x2122), 96 b"\xFF": unichr(0x2026), 97 }), 98 "x_mac_simp_chinese_ttx": ("gb2312", { 99 b"\x80": unichr(0x00FC), 100 b"\xA0": unichr(0x00A0), 101 b"\xFD": unichr(0x00A9), 102 b"\xFE": unichr(0x2122), 103 b"\xFF": unichr(0x2026), 104 }), 105} 106 107_cache = {} 108 109def search_function(name): 110 name = encodings.normalize_encoding(name) # Rather undocumented... 111 if name in _extended_encodings: 112 if name not in _cache: 113 base_encoding, mapping = _extended_encodings[name] 114 assert(name[-4:] == "_ttx") 115 # Python 2 didn't have any of the encodings that we are implementing 116 # in this file. Python 3 added aliases for the East Asian ones, mapping 117 # them "temporarily" to the same base encoding as us, with a comment 118 # suggesting that full implementation will appear some time later. 119 # As such, try the Python version of the x_mac_... first, if that is found, 120 # use *that* as our base encoding. This would make our encoding upgrade 121 # to the full encoding when and if Python finally implements that. 122 # http://bugs.python.org/issue24041 123 base_encodings = [name[:-4], base_encoding] 124 for base_encoding in base_encodings: 125 try: 126 codecs.lookup(base_encoding) 127 except LookupError: 128 continue 129 _cache[name] = ExtendCodec(name, base_encoding, mapping) 130 break 131 return _cache[name].info 132 133 return None 134 135codecs.register(search_function) 136