""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import builtins
import sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The instance is the 4-tuple (encode, decode, streamreader,
    streamwriter); the same objects, plus the incremental factories and
    the codec name, are also available as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        # Only shadow the class-level default when the caller was explicit.
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered leftovers plus input and return the output;
        whatever _buffer_encode() did not consume is kept for the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the encoder and drop any buffered input.
        """
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """
        Return the buffered input, or 0 when nothing is buffered.
        """
        return self.buffer or 0

    def setstate(self, state):
        """
        Restore the buffer from a value returned by getstate(); a false
        state (such as 0) restores the empty buffer.
        """
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered leftovers plus input and return the output;
        whatever _buffer_decode() did not consume is kept for the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the decoder and drop any buffered input.
        """
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """
        Return (buffered_input, 0).
        """
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """
        Restore the buffered input from a getstate() tuple.
        """
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    """ Codec subclass that encodes objects and writes the result to
        an underlying stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the output stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReader(Codec):

    """ Codec subclass that reads from an underlying stream and
        returns the decoded data.

    """

    # Type of the decoded data; an empty instance seeds the buffers.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been handed out yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines (set by readline()), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes before the error; this is
                    # acceptable only if they contain a complete first line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read and decode data via the reader; see StreamReader.read()."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Read one decoded line via the reader."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Read all decoded lines via the reader."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Encode data and write it via the writer."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Encode and write a list of strings via the writer."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position.

            The reader is always reset; the writer is reset only when
            seeking to the very start of the stream.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader and return the data re-encoded with
            the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader and return it re-encoded."""
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the reader, re-encode it and return it
            split into lines (line endings kept). sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result via the writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the chunks in list, decode them with the
            frontend decode function and write the result via the writer.

            The chunks are joined with an empty object of their own
            type, so bytes chunks (the usual frontend format) work as
            well as str chunks. An empty list is treated as b''.
        """
        chunks = tuple(list)
        # ''.join() would raise TypeError for bytes chunks; derive the
        # separator from the first chunk instead.
        empty = chunks[0][:0] if chunks else b''
        data = empty.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position.

            The seek is propagated through the reader and the writer
            (rather than going straight to the stream) so they get a
            chance to reset their internal buffers.
        """
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the underlying file is always
        opened in binary mode ('b' is appended to mode if missing).
        The default file mode is 'r', meaning to open the file in
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception
        propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except BaseException:
        # Don't leak the file handle when the codec is unknown or unusable
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.
    Empty outputs are skipped.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    # flush whatever the encoder is still holding back
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.
    Empty outputs are skipped.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    # flush whatever the decoder is still holding back
    output = decoder.decode(b"", True)
    if output:
        yield output

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {i:i for i in rng}

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # ambiguous reverse mapping: mark the target as undefined
            m[v] = None
    return m

### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')