1""" codecs -- Python Codec Registry, API and helpers. 2 3 4Written by Marc-Andre Lemburg (mal@lemburg.com). 5 6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. 7 8"""#" 9 10import __builtin__, sys 11 12### Registry and builtin stateless codec functions 13 14try: 15 from _codecs import * 16except ImportError, why: 17 raise SystemError('Failed to load the builtin codecs: %s' % why) 18 19__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE", 20 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE", 21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE", 22 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE", 23 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder", 24 "StreamReader", "StreamWriter", 25 "StreamReaderWriter", "StreamRecoder", 26 "getencoder", "getdecoder", "getincrementalencoder", 27 "getincrementaldecoder", "getreader", "getwriter", 28 "encode", "decode", "iterencode", "iterdecode", 29 "strict_errors", "ignore_errors", "replace_errors", 30 "xmlcharrefreplace_errors", "backslashreplace_errors", 31 "register_error", "lookup_error"] 32 33### Constants 34 35# 36# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) 37# and its possible byte string values 38# for UTF8/UTF16/UTF32 output and little/big endian machines 39# 40 41# UTF-8 42BOM_UTF8 = '\xef\xbb\xbf' 43 44# UTF-16, little endian 45BOM_LE = BOM_UTF16_LE = '\xff\xfe' 46 47# UTF-16, big endian 48BOM_BE = BOM_UTF16_BE = '\xfe\xff' 49 50# UTF-32, little endian 51BOM_UTF32_LE = '\xff\xfe\x00\x00' 52 53# UTF-32, big endian 54BOM_UTF32_BE = '\x00\x00\xfe\xff' 55 56if sys.byteorder == 'little': 57 58 # UTF-16, native endianness 59 BOM = BOM_UTF16 = BOM_UTF16_LE 60 61 # UTF-32, native endianness 62 BOM_UTF32 = BOM_UTF32_LE 63 64else: 65 66 # UTF-16, native endianness 67 BOM = BOM_UTF16 = BOM_UTF16_BE 68 69 # UTF-32, native endianness 70 BOM_UTF32 = BOM_UTF32_BE 71 72# Old broken names (don't use in new code) 73BOM32_LE = BOM_UTF16_LE 74BOM32_BE = BOM_UTF16_BE 75BOM64_LE = BOM_UTF32_LE 76BOM64_BE = BOM_UTF32_BE 77 78 79### Codec base classes (defining the API) 80 81class CodecInfo(tuple): 82 """Codec details when looking up the codec registry""" 83 84 # Private API to allow Python to blacklist the known non-Unicode 85 # codecs in the standard library. A more general mechanism to 86 # reliably distinguish test encodings from other codecs will hopefully 87 # be defined for Python 3.5 88 # 89 # See http://bugs.python.org/issue19619 90 _is_text_encoding = True # Assume codecs are text encodings by default 91 92 def __new__(cls, encode, decode, streamreader=None, streamwriter=None, 93 incrementalencoder=None, incrementaldecoder=None, name=None, 94 _is_text_encoding=None): 95 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) 96 self.name = name 97 self.encode = encode 98 self.decode = decode 99 self.incrementalencoder = incrementalencoder 100 self.incrementaldecoder = incrementaldecoder 101 self.streamwriter = streamwriter 102 self.streamreader = streamreader 103 if _is_text_encoding is not None: 104 self._is_text_encoding = _is_text_encoding 105 return self 106 107 def __repr__(self): 108 return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) 109 110class Codec: 111 112 """ Defines the interface for stateless encoders/decoders. 113 114 The .encode()/.decode() methods may use different error 115 handling schemes by providing the errors argument. These 116 string values are predefined: 117 118 'strict' - raise a ValueError error (or a subclass) 119 'ignore' - ignore the character and continue with the next 120 'replace' - replace with a suitable replacement character; 121 Python will use the official U+FFFD REPLACEMENT 122 CHARACTER for the builtin Unicode codecs on 123 decoding and '?' on encoding. 124 'xmlcharrefreplace' - Replace with the appropriate XML 125 character reference (only for encoding). 126 'backslashreplace' - Replace with backslashed escape sequences 127 (only for encoding). 128 129 The set of allowed values can be extended via register_error. 130 131 """ 132 def encode(self, input, errors='strict'): 133 134 """ Encodes the object input and returns a tuple (output 135 object, length consumed). 136 137 errors defines the error handling to apply. It defaults to 138 'strict' handling. 139 140 The method may not store state in the Codec instance. Use 141 StreamWriter for codecs which have to keep state in order to 142 make encoding efficient. 143 144 The encoder must be able to handle zero length input and 145 return an empty object of the output object type in this 146 situation. 147 148 """ 149 raise NotImplementedError 150 151 def decode(self, input, errors='strict'): 152 153 """ Decodes the object input and returns a tuple (output 154 object, length consumed). 155 156 input must be an object which provides the bf_getreadbuf 157 buffer slot. Python strings, buffer objects and memory 158 mapped files are examples of objects providing this slot. 159 160 errors defines the error handling to apply. It defaults to 161 'strict' handling. 162 163 The method may not store state in the Codec instance. Use 164 StreamReader for codecs which have to keep state in order to 165 make decoding efficient. 166 167 The decoder must be able to handle zero length input and 168 return an empty object of the output object type in this 169 situation. 170 171 """ 172 raise NotImplementedError 173 174class IncrementalEncoder(object): 175 """ 176 An IncrementalEncoder encodes an input in multiple steps. The input can be 177 passed piece by piece to the encode() method. The IncrementalEncoder remembers 178 the state of the Encoding process between calls to encode(). 179 """ 180 def __init__(self, errors='strict'): 181 """ 182 Creates an IncrementalEncoder instance. 183 184 The IncrementalEncoder may use different error handling schemes by 185 providing the errors keyword argument. See the module docstring 186 for a list of possible values. 187 """ 188 self.errors = errors 189 self.buffer = "" 190 191 def encode(self, input, final=False): 192 """ 193 Encodes input and returns the resulting object. 194 """ 195 raise NotImplementedError 196 197 def reset(self): 198 """ 199 Resets the encoder to the initial state. 200 """ 201 202 def getstate(self): 203 """ 204 Return the current state of the encoder. 205 """ 206 return 0 207 208 def setstate(self, state): 209 """ 210 Set the current state of the encoder. state must have been 211 returned by getstate(). 212 """ 213 214class BufferedIncrementalEncoder(IncrementalEncoder): 215 """ 216 This subclass of IncrementalEncoder can be used as the baseclass for an 217 incremental encoder if the encoder must keep some of the output in a 218 buffer between calls to encode(). 219 """ 220 def __init__(self, errors='strict'): 221 IncrementalEncoder.__init__(self, errors) 222 self.buffer = "" # unencoded input that is kept between calls to encode() 223 224 def _buffer_encode(self, input, errors, final): 225 # Overwrite this method in subclasses: It must encode input 226 # and return an (output, length consumed) tuple 227 raise NotImplementedError 228 229 def encode(self, input, final=False): 230 # encode input (taking the buffer into account) 231 data = self.buffer + input 232 (result, consumed) = self._buffer_encode(data, self.errors, final) 233 # keep unencoded input until the next call 234 self.buffer = data[consumed:] 235 return result 236 237 def reset(self): 238 IncrementalEncoder.reset(self) 239 self.buffer = "" 240 241 def getstate(self): 242 return self.buffer or 0 243 244 def setstate(self, state): 245 self.buffer = state or "" 246 247class IncrementalDecoder(object): 248 """ 249 An IncrementalDecoder decodes an input in multiple steps. The input can be 250 passed piece by piece to the decode() method. The IncrementalDecoder 251 remembers the state of the decoding process between calls to decode(). 252 """ 253 def __init__(self, errors='strict'): 254 """ 255 Creates an IncrementalDecoder instance. 256 257 The IncrementalDecoder may use different error handling schemes by 258 providing the errors keyword argument. See the module docstring 259 for a list of possible values. 260 """ 261 self.errors = errors 262 263 def decode(self, input, final=False): 264 """ 265 Decodes input and returns the resulting object. 266 """ 267 raise NotImplementedError 268 269 def reset(self): 270 """ 271 Resets the decoder to the initial state. 272 """ 273 274 def getstate(self): 275 """ 276 Return the current state of the decoder. 277 278 This must be a (buffered_input, additional_state_info) tuple. 279 buffered_input must be a bytes object containing bytes that 280 were passed to decode() that have not yet been converted. 281 additional_state_info must be a non-negative integer 282 representing the state of the decoder WITHOUT yet having 283 processed the contents of buffered_input. In the initial state 284 and after reset(), getstate() must return (b"", 0). 285 """ 286 return (b"", 0) 287 288 def setstate(self, state): 289 """ 290 Set the current state of the decoder. 291 292 state must have been returned by getstate(). The effect of 293 setstate((b"", 0)) must be equivalent to reset(). 294 """ 295 296class BufferedIncrementalDecoder(IncrementalDecoder): 297 """ 298 This subclass of IncrementalDecoder can be used as the baseclass for an 299 incremental decoder if the decoder must be able to handle incomplete byte 300 sequences. 301 """ 302 def __init__(self, errors='strict'): 303 IncrementalDecoder.__init__(self, errors) 304 self.buffer = "" # undecoded input that is kept between calls to decode() 305 306 def _buffer_decode(self, input, errors, final): 307 # Overwrite this method in subclasses: It must decode input 308 # and return an (output, length consumed) tuple 309 raise NotImplementedError 310 311 def decode(self, input, final=False): 312 # decode input (taking the buffer into account) 313 data = self.buffer + input 314 (result, consumed) = self._buffer_decode(data, self.errors, final) 315 # keep undecoded input until the next call 316 self.buffer = data[consumed:] 317 return result 318 319 def reset(self): 320 IncrementalDecoder.reset(self) 321 self.buffer = "" 322 323 def getstate(self): 324 # additional state info is always 0 325 return (self.buffer, 0) 326 327 def setstate(self, state): 328 # ignore additional state info 329 self.buffer = state[0] 330 331# 332# The StreamWriter and StreamReader class provide generic working 333# interfaces which can be used to implement new encoding submodules 334# very easily. See encodings/utf_8.py for an example on how this is 335# done. 336# 337 338class StreamWriter(Codec): 339 340 def __init__(self, stream, errors='strict'): 341 342 """ Creates a StreamWriter instance. 343 344 stream must be a file-like object open for writing 345 (binary) data. 346 347 The StreamWriter may use different error handling 348 schemes by providing the errors keyword argument. These 349 parameters are predefined: 350 351 'strict' - raise a ValueError (or a subclass) 352 'ignore' - ignore the character and continue with the next 353 'replace'- replace with a suitable replacement character 354 'xmlcharrefreplace' - Replace with the appropriate XML 355 character reference. 356 'backslashreplace' - Replace with backslashed escape 357 sequences (only for encoding). 358 359 The set of allowed parameter values can be extended via 360 register_error. 361 """ 362 self.stream = stream 363 self.errors = errors 364 365 def write(self, object): 366 367 """ Writes the object's contents encoded to self.stream. 368 """ 369 data, consumed = self.encode(object, self.errors) 370 self.stream.write(data) 371 372 def writelines(self, list): 373 374 """ Writes the concatenated list of strings to the stream 375 using .write(). 376 """ 377 self.write(''.join(list)) 378 379 def reset(self): 380 381 """ Flushes and resets the codec buffers used for keeping state. 382 383 Calling this method should ensure that the data on the 384 output is put into a clean state, that allows appending 385 of new fresh data without having to rescan the whole 386 stream to recover state. 387 388 """ 389 pass 390 391 def seek(self, offset, whence=0): 392 self.stream.seek(offset, whence) 393 if whence == 0 and offset == 0: 394 self.reset() 395 396 def __getattr__(self, name, 397 getattr=getattr): 398 399 """ Inherit all other methods from the underlying stream. 400 """ 401 return getattr(self.stream, name) 402 403 def __enter__(self): 404 return self 405 406 def __exit__(self, type, value, tb): 407 self.stream.close() 408 409### 410 411class StreamReader(Codec): 412 413 def __init__(self, stream, errors='strict'): 414 415 """ Creates a StreamReader instance. 416 417 stream must be a file-like object open for reading 418 (binary) data. 419 420 The StreamReader may use different error handling 421 schemes by providing the errors keyword argument. These 422 parameters are predefined: 423 424 'strict' - raise a ValueError (or a subclass) 425 'ignore' - ignore the character and continue with the next 426 'replace'- replace with a suitable replacement character; 427 428 The set of allowed parameter values can be extended via 429 register_error. 430 """ 431 self.stream = stream 432 self.errors = errors 433 self.bytebuffer = "" 434 # For str->str decoding this will stay a str 435 # For str->unicode decoding the first read will promote it to unicode 436 self.charbuffer = "" 437 self.linebuffer = None 438 439 def decode(self, input, errors='strict'): 440 raise NotImplementedError 441 442 def read(self, size=-1, chars=-1, firstline=False): 443 444 """ Decodes data from the stream self.stream and returns the 445 resulting object. 446 447 chars indicates the number of characters to read from the 448 stream. read() will never return more than chars 449 characters, but it might return less, if there are not enough 450 characters available. 451 452 size indicates the approximate maximum number of bytes to 453 read from the stream for decoding purposes. The decoder 454 can modify this setting as appropriate. The default value 455 -1 indicates to read and decode as much as possible. size 456 is intended to prevent having to decode huge files in one 457 step. 458 459 If firstline is true, and a UnicodeDecodeError happens 460 after the first line terminator in the input only the first line 461 will be returned, the rest of the input will be kept until the 462 next call to read(). 463 464 The method should use a greedy read strategy meaning that 465 it should read as much data as is allowed within the 466 definition of the encoding and the given size, e.g. if 467 optional encoding endings or state markers are available 468 on the stream, these should be read too. 469 """ 470 # If we have lines cached, first merge them back into characters 471 if self.linebuffer: 472 self.charbuffer = "".join(self.linebuffer) 473 self.linebuffer = None 474 475 if chars < 0: 476 # For compatibility with other read() methods that take a 477 # single argument 478 chars = size 479 480 # read until we get the required number of characters (if available) 481 while True: 482 # can the request be satisfied from the character buffer? 483 if chars >= 0: 484 if len(self.charbuffer) >= chars: 485 break 486 # we need more data 487 if size < 0: 488 newdata = self.stream.read() 489 else: 490 newdata = self.stream.read(size) 491 # decode bytes (those remaining from the last call included) 492 data = self.bytebuffer + newdata 493 try: 494 newchars, decodedbytes = self.decode(data, self.errors) 495 except UnicodeDecodeError, exc: 496 if firstline: 497 newchars, decodedbytes = self.decode(data[:exc.start], self.errors) 498 lines = newchars.splitlines(True) 499 if len(lines)<=1: 500 raise 501 else: 502 raise 503 # keep undecoded bytes until the next call 504 self.bytebuffer = data[decodedbytes:] 505 # put new characters in the character buffer 506 self.charbuffer += newchars 507 # there was no data available 508 if not newdata: 509 break 510 if chars < 0: 511 # Return everything we've got 512 result = self.charbuffer 513 self.charbuffer = "" 514 else: 515 # Return the first chars characters 516 result = self.charbuffer[:chars] 517 self.charbuffer = self.charbuffer[chars:] 518 return result 519 520 def readline(self, size=None, keepends=True): 521 522 """ Read one line from the input stream and return the 523 decoded data. 524 525 size, if given, is passed as size argument to the 526 read() method. 527 528 """ 529 # If we have lines cached from an earlier read, return 530 # them unconditionally 531 if self.linebuffer: 532 line = self.linebuffer[0] 533 del self.linebuffer[0] 534 if len(self.linebuffer) == 1: 535 # revert to charbuffer mode; we might need more data 536 # next time 537 self.charbuffer = self.linebuffer[0] 538 self.linebuffer = None 539 if not keepends: 540 line = line.splitlines(False)[0] 541 return line 542 543 readsize = size or 72 544 line = "" 545 # If size is given, we call read() only once 546 while True: 547 data = self.read(readsize, firstline=True) 548 if data: 549 # If we're at a "\r" read one extra character (which might 550 # be a "\n") to get a proper line ending. If the stream is 551 # temporarily exhausted we return the wrong line ending. 552 if data.endswith("\r"): 553 data += self.read(size=1, chars=1) 554 555 line += data 556 lines = line.splitlines(True) 557 if lines: 558 if len(lines) > 1: 559 # More than one line result; the first line is a full line 560 # to return 561 line = lines[0] 562 del lines[0] 563 if len(lines) > 1: 564 # cache the remaining lines 565 lines[-1] += self.charbuffer 566 self.linebuffer = lines 567 self.charbuffer = None 568 else: 569 # only one remaining line, put it back into charbuffer 570 self.charbuffer = lines[0] + self.charbuffer 571 if not keepends: 572 line = line.splitlines(False)[0] 573 break 574 line0withend = lines[0] 575 line0withoutend = lines[0].splitlines(False)[0] 576 if line0withend != line0withoutend: # We really have a line end 577 # Put the rest back together and keep it until the next call 578 self.charbuffer = "".join(lines[1:]) + self.charbuffer 579 if keepends: 580 line = line0withend 581 else: 582 line = line0withoutend 583 break 584 # we didn't get anything or this was our only try 585 if not data or size is not None: 586 if line and not keepends: 587 line = line.splitlines(False)[0] 588 break 589 if readsize<8000: 590 readsize *= 2 591 return line 592 593 def readlines(self, sizehint=None, keepends=True): 594 595 """ Read all lines available on the input stream 596 and return them as list of lines. 597 598 Line breaks are implemented using the codec's decoder 599 method and are included in the list entries. 600 601 sizehint, if given, is ignored since there is no efficient 602 way to finding the true end-of-line. 603 604 """ 605 data = self.read() 606 return data.splitlines(keepends) 607 608 def reset(self): 609 610 """ Resets the codec buffers used for keeping state. 611 612 Note that no stream repositioning should take place. 613 This method is primarily intended to be able to recover 614 from decoding errors. 615 616 """ 617 self.bytebuffer = "" 618 self.charbuffer = u"" 619 self.linebuffer = None 620 621 def seek(self, offset, whence=0): 622 """ Set the input stream's current position. 623 624 Resets the codec buffers used for keeping state. 625 """ 626 self.stream.seek(offset, whence) 627 self.reset() 628 629 def next(self): 630 631 """ Return the next decoded line from the input stream.""" 632 line = self.readline() 633 if line: 634 return line 635 raise StopIteration 636 637 def __iter__(self): 638 return self 639 640 def __getattr__(self, name, 641 getattr=getattr): 642 643 """ Inherit all other methods from the underlying stream. 644 """ 645 return getattr(self.stream, name) 646 647 def __enter__(self): 648 return self 649 650 def __exit__(self, type, value, tb): 651 self.stream.close() 652 653### 654 655class StreamReaderWriter: 656 657 """ StreamReaderWriter instances allow wrapping streams which 658 work in both read and write modes. 659 660 The design is such that one can use the factory functions 661 returned by the codec.lookup() function to construct the 662 instance. 663 664 """ 665 # Optional attributes set by the file wrappers below 666 encoding = 'unknown' 667 668 def __init__(self, stream, Reader, Writer, errors='strict'): 669 670 """ Creates a StreamReaderWriter instance. 671 672 stream must be a Stream-like object. 673 674 Reader, Writer must be factory functions or classes 675 providing the StreamReader, StreamWriter interface resp. 676 677 Error handling is done in the same way as defined for the 678 StreamWriter/Readers. 679 680 """ 681 self.stream = stream 682 self.reader = Reader(stream, errors) 683 self.writer = Writer(stream, errors) 684 self.errors = errors 685 686 def read(self, size=-1): 687 688 return self.reader.read(size) 689 690 def readline(self, size=None): 691 692 return self.reader.readline(size) 693 694 def readlines(self, sizehint=None): 695 696 return self.reader.readlines(sizehint) 697 698 def next(self): 699 700 """ Return the next decoded line from the input stream.""" 701 return self.reader.next() 702 703 def __iter__(self): 704 return self 705 706 def write(self, data): 707 708 return self.writer.write(data) 709 710 def writelines(self, list): 711 712 return self.writer.writelines(list) 713 714 def reset(self): 715 716 self.reader.reset() 717 self.writer.reset() 718 719 def seek(self, offset, whence=0): 720 self.stream.seek(offset, whence) 721 self.reader.reset() 722 if whence == 0 and offset == 0: 723 self.writer.reset() 724 725 def __getattr__(self, name, 726 getattr=getattr): 727 728 """ Inherit all other methods from the underlying stream. 729 """ 730 return getattr(self.stream, name) 731 732 # these are needed to make "with codecs.open(...)" work properly 733 734 def __enter__(self): 735 return self 736 737 def __exit__(self, type, value, tb): 738 self.stream.close() 739 740### 741 742class StreamRecoder: 743 744 """ StreamRecoder instances provide a frontend - backend 745 view of encoding data. 746 747 They use the complete set of APIs returned by the 748 codecs.lookup() function to implement their task. 749 750 Data written to the stream is first decoded into an 751 intermediate format (which is dependent on the given codec 752 combination) and then written to the stream using an instance 753 of the provided Writer class. 754 755 In the other direction, data is read from the stream using a 756 Reader instance and then return encoded data to the caller. 757 758 """ 759 # Optional attributes set by the file wrappers below 760 data_encoding = 'unknown' 761 file_encoding = 'unknown' 762 763 def __init__(self, stream, encode, decode, Reader, Writer, 764 errors='strict'): 765 766 """ Creates a StreamRecoder instance which implements a two-way 767 conversion: encode and decode work on the frontend (the 768 input to .read() and output of .write()) while 769 Reader and Writer work on the backend (reading and 770 writing to the stream). 771 772 You can use these objects to do transparent direct 773 recodings from e.g. latin-1 to utf-8 and back. 774 775 stream must be a file-like object. 776 777 encode, decode must adhere to the Codec interface, Reader, 778 Writer must be factory functions or classes providing the 779 StreamReader, StreamWriter interface resp. 780 781 encode and decode are needed for the frontend translation, 782 Reader and Writer for the backend translation. Unicode is 783 used as intermediate encoding. 784 785 Error handling is done in the same way as defined for the 786 StreamWriter/Readers. 787 788 """ 789 self.stream = stream 790 self.encode = encode 791 self.decode = decode 792 self.reader = Reader(stream, errors) 793 self.writer = Writer(stream, errors) 794 self.errors = errors 795 796 def read(self, size=-1): 797 798 data = self.reader.read(size) 799 data, bytesencoded = self.encode(data, self.errors) 800 return data 801 802 def readline(self, size=None): 803 804 if size is None: 805 data = self.reader.readline() 806 else: 807 data = self.reader.readline(size) 808 data, bytesencoded = self.encode(data, self.errors) 809 return data 810 811 def readlines(self, sizehint=None): 812 813 data = self.reader.read() 814 data, bytesencoded = self.encode(data, self.errors) 815 return data.splitlines(1) 816 817 def next(self): 818 819 """ Return the next decoded line from the input stream.""" 820 data = self.reader.next() 821 data, bytesencoded = self.encode(data, self.errors) 822 return data 823 824 def __iter__(self): 825 return self 826 827 def write(self, data): 828 829 data, bytesdecoded = self.decode(data, self.errors) 830 return self.writer.write(data) 831 832 def writelines(self, list): 833 834 data = ''.join(list) 835 data, bytesdecoded = self.decode(data, self.errors) 836 return self.writer.write(data) 837 838 def reset(self): 839 840 self.reader.reset() 841 self.writer.reset() 842 843 def __getattr__(self, name, 844 getattr=getattr): 845 846 """ Inherit all other methods from the underlying stream. 847 """ 848 return getattr(self.stream, name) 849 850 def __enter__(self): 851 return self 852 853 def __exit__(self, type, value, tb): 854 self.stream.close() 855 856### Shortcuts 857 858def open(filename, mode='rb', encoding=None, errors='strict', buffering=1): 859 860 """ Open an encoded file using the given mode and return 861 a wrapped version providing transparent encoding/decoding. 862 863 Note: The wrapped version will only accept the object format 864 defined by the codecs, i.e. Unicode objects for most builtin 865 codecs. Output is also codec dependent and will usually be 866 Unicode as well. 867 868 Files are always opened in binary mode, even if no binary mode 869 was specified. This is done to avoid data loss due to encodings 870 using 8-bit values. The default file mode is 'rb' meaning to 871 open the file in binary read mode. 872 873 encoding specifies the encoding which is to be used for the 874 file. 875 876 errors may be given to define the error handling. It defaults 877 to 'strict' which causes ValueErrors to be raised in case an 878 encoding error occurs. 879 880 buffering has the same meaning as for the builtin open() API. 881 It defaults to line buffered. 882 883 The returned wrapped file object provides an extra attribute 884 .encoding which allows querying the used encoding. This 885 attribute is only available if an encoding was specified as 886 parameter. 887 888 """ 889 if encoding is not None: 890 if 'U' in mode: 891 # No automatic conversion of '\n' is done on reading and writing 892 mode = mode.strip().replace('U', '') 893 if mode[:1] not in set('rwa'): 894 mode = 'r' + mode 895 if 'b' not in mode: 896 # Force opening of the file in binary mode 897 mode = mode + 'b' 898 file = __builtin__.open(filename, mode, buffering) 899 if encoding is None: 900 return file 901 info = lookup(encoding) 902 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors) 903 # Add attributes to simplify introspection 904 srw.encoding = encoding 905 return srw 906 907def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'): 908 909 """ Return a wrapped version of file which provides transparent 910 encoding translation. 911 912 Strings written to the wrapped file are interpreted according 913 to the given data_encoding and then written to the original 914 file as string using file_encoding. The intermediate encoding 915 will usually be Unicode but depends on the specified codecs. 916 917 Strings are read from the file using file_encoding and then 918 passed back to the caller as string using data_encoding. 919 920 If file_encoding is not given, it defaults to data_encoding. 921 922 errors may be given to define the error handling. It defaults 923 to 'strict' which causes ValueErrors to be raised in case an 924 encoding error occurs. 925 926 The returned wrapped file object provides two extra attributes 927 .data_encoding and .file_encoding which reflect the given 928 parameters of the same name. The attributes can be used for 929 introspection by Python programs. 930 931 """ 932 if file_encoding is None: 933 file_encoding = data_encoding 934 data_info = lookup(data_encoding) 935 file_info = lookup(file_encoding) 936 sr = StreamRecoder(file, data_info.encode, data_info.decode, 937 file_info.streamreader, file_info.streamwriter, errors) 938 # Add attributes to simplify introspection 939 sr.data_encoding = data_encoding 940 sr.file_encoding = file_encoding 941 return sr 942 943### Helpers for codec lookup 944 945def getencoder(encoding): 946 947 """ Lookup up the codec for the given encoding and return 948 its encoder function. 949 950 Raises a LookupError in case the encoding cannot be found. 951 952 """ 953 return lookup(encoding).encode 954 955def getdecoder(encoding): 956 957 """ Lookup up the codec for the given encoding and return 958 its decoder function. 959 960 Raises a LookupError in case the encoding cannot be found. 961 962 """ 963 return lookup(encoding).decode 964 965def getincrementalencoder(encoding): 966 967 """ Lookup up the codec for the given encoding and return 968 its IncrementalEncoder class or factory function. 969 970 Raises a LookupError in case the encoding cannot be found 971 or the codecs doesn't provide an incremental encoder. 972 973 """ 974 encoder = lookup(encoding).incrementalencoder 975 if encoder is None: 976 raise LookupError(encoding) 977 return encoder 978 979def getincrementaldecoder(encoding): 980 981 """ Lookup up the codec for the given encoding and return 982 its IncrementalDecoder class or factory function. 983 984 Raises a LookupError in case the encoding cannot be found 985 or the codecs doesn't provide an incremental decoder. 986 987 """ 988 decoder = lookup(encoding).incrementaldecoder 989 if decoder is None: 990 raise LookupError(encoding) 991 return decoder 992 993def getreader(encoding): 994 995 """ Lookup up the codec for the given encoding and return 996 its StreamReader class or factory function. 997 998 Raises a LookupError in case the encoding cannot be found. 999 1000 """ 1001 return lookup(encoding).streamreader 1002 1003def getwriter(encoding): 1004 1005 """ Lookup up the codec for the given encoding and return 1006 its StreamWriter class or factory function. 1007 1008 Raises a LookupError in case the encoding cannot be found. 1009 1010 """ 1011 return lookup(encoding).streamwriter 1012 1013def iterencode(iterator, encoding, errors='strict', **kwargs): 1014 """ 1015 Encoding iterator. 1016 1017 Encodes the input strings from the iterator using an IncrementalEncoder. 1018 1019 errors and kwargs are passed through to the IncrementalEncoder 1020 constructor. 1021 """ 1022 encoder = getincrementalencoder(encoding)(errors, **kwargs) 1023 for input in iterator: 1024 output = encoder.encode(input) 1025 if output: 1026 yield output 1027 output = encoder.encode("", True) 1028 if output: 1029 yield output 1030 1031def iterdecode(iterator, encoding, errors='strict', **kwargs): 1032 """ 1033 Decoding iterator. 1034 1035 Decodes the input strings from the iterator using an IncrementalDecoder. 1036 1037 errors and kwargs are passed through to the IncrementalDecoder 1038 constructor. 1039 """ 1040 decoder = getincrementaldecoder(encoding)(errors, **kwargs) 1041 for input in iterator: 1042 output = decoder.decode(input) 1043 if output: 1044 yield output 1045 output = decoder.decode("", True) 1046 if output: 1047 yield output 1048 1049### Helpers for charmap-based codecs 1050 1051def make_identity_dict(rng): 1052 1053 """ make_identity_dict(rng) -> dict 1054 1055 Return a dictionary where elements of the rng sequence are 1056 mapped to themselves. 1057 1058 """ 1059 res = {} 1060 for i in rng: 1061 res[i]=i 1062 return res 1063 1064def make_encoding_map(decoding_map): 1065 1066 """ Creates an encoding map from a decoding map. 1067 1068 If a target mapping in the decoding map occurs multiple 1069 times, then that target is mapped to None (undefined mapping), 1070 causing an exception when encountered by the charmap codec 1071 during translation. 1072 1073 One example where this happens is cp875.py which decodes 1074 multiple character to \\u001a. 1075 1076 """ 1077 m = {} 1078 for k,v in decoding_map.items(): 1079 if not v in m: 1080 m[v] = k 1081 else: 1082 m[v] = None 1083 return m 1084 1085### error handlers 1086 1087try: 1088 strict_errors = lookup_error("strict") 1089 ignore_errors = lookup_error("ignore") 1090 replace_errors = lookup_error("replace") 1091 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace") 1092 backslashreplace_errors = lookup_error("backslashreplace") 1093except LookupError: 1094 # In --disable-unicode builds, these error handler are missing 1095 strict_errors = None 1096 ignore_errors = None 1097 replace_errors = None 1098 xmlcharrefreplace_errors = None 1099 backslashreplace_errors = None 1100 1101# Tell modulefinder that using codecs probably needs the encodings 1102# package 1103_false = 0 1104if _false: 1105 import encodings 1106 1107### Tests 1108 1109if __name__ == '__main__': 1110 1111 # Make stdout translate Latin-1 output into UTF-8 output 1112 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8') 1113 1114 # Have stdin translate Latin-1 input into UTF-8 input 1115 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1') 1116