1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""
9
10import builtins
11import sys
12
13### Registry and builtin stateless codec functions
14
15try:
16    from _codecs import *
17except ImportError as why:
18    raise SystemError('Failed to load the builtin codecs: %s' % why)
19
# Names exported by "from codecs import *".
# NOTE(review): several entries (e.g. "register", "lookup", "encode",
# "decode" and the *_errors handlers) are not defined in the part of the
# module that follows the constants and base classes; presumably they are
# supplied by the "from _codecs import *" statement or defined further
# down in the file -- confirm before pruning or reordering this list.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
34
35### Constants
36
37#
38# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
39# and its possible byte string values
40# for UTF8/UTF16/UTF32 output and little/big endian machines
41#
42
# UTF-8 signature
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16 marks for both byte orders
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 marks for both byte orders
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Marks matching the byte order of the running interpreter
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in
# their names they hold the UTF-16 and UTF-32 marks respectively.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
79
80
81### Codec base classes (defining the API)
82
class CodecInfo(tuple):
    """Codec details returned when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter).  The same four entries, the two incremental coder
    factories and the codec name are also available as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # codecs count as text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        # Only the four classic entries make up the tuple value; all the
        # rest is stored as plain instance attributes.
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Shadow the class-level default only when a value was passed in.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__qualname__, self.name, id(self))
        return "<%s.%s object for encoding %s at %#x>" % details
113
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict'            - raise a ValueError (or a subclass)
         'ignore'            - skip the offending character and go on
                               with the next one
         'replace'           - substitute a suitable replacement
                               character; the builtin Unicode codecs
                               use the official U+FFFD REPLACEMENT
                               CHARACTER when decoding and '?' when
                               encoding
         'surrogateescape'   - substitute private code points U+DCnn
         'xmlcharrefreplace' - substitute the appropriate XML character
                               reference (encoding only)
         'backslashreplace'  - substitute backslashed escape sequences
         'namereplace'       - substitute \\N{...} escape sequences
                               (encoding only)

        Further values can be made available with register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return the tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input has to be supported and must yield an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return the tuple
            (output object, length consumed).

            input must be an object exposing the read-only buffer
            interface; bytes objects, buffer objects and memory mapped
            files are examples.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input has to be supported and must yield an
            empty object of the output object type.

        """
        raise NotImplementedError()
179
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input piece by piece.

    The pieces are handed to encode() one after another; the encoder
    keeps whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme (see the module
        docstring for the possible values) and is stored on the
        instance together with an empty string buffer.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Has to be provided by subclasses.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        The base implementation ignores state.
        """
        return None
219
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    consume everything they are given: input that _buffer_encode()
    leaves unconsumed is kept and prepended to the next chunk.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input handed to encode() that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return the tuple
        # (output, length consumed)
        raise NotImplementedError()

    def encode(self, input, final=False):
        # leftovers of earlier calls go in front of the new input
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) clears the buffer
        self.buffer = state if state else ""
253
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input piece by piece.

    The pieces are handed to decode() one after another; the decoder
    keeps whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme (see the module
        docstring for the possible values) and is stored on the
        instance.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Has to be provided by subclasses.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset() the
        result must be (b"", 0), which is what the base implementation
        always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        setstate((b"", 0)) must have the same effect as reset().  The
        base implementation ignores state.
        """
        return None
302
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: bytes that _buffer_decode() leaves
    unconsumed are kept and prepended to the next chunk.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes handed to decode() that have not been decoded yet
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return the tuple
        # (output, length consumed)
        raise NotImplementedError()

    def decode(self, input, final=False):
        # leftovers of earlier calls go in front of the new input
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the additional state info is always 0
        leftover = self.buffer
        return (leftover, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state
        # info in state[1] is ignored
        self.buffer = state[0]
338
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
345
class StreamWriter(Codec):

    """ Base class for writers that encode objects and write the
        result to an underlying stream.

        Subclasses supply the actual codec by implementing .encode().
        Attributes that are not defined on the writer are looked up on
        the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using the error
            handling scheme stored in self.errors.
        """
        # .encode() returns (output, length consumed); only the output
        # is needed here.
        data, _ = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Sets the position of the underlying stream.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with block closes the wrapped stream.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Refuse to be pickled or copied.  Both operations rebuild the
        # object without calling __init__ and then probe it for special
        # attributes; with self.stream missing, __getattr__ would call
        # itself for 'stream' without end.  Fail early and clearly.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
416
417###
418
class StreamReader(Codec):

    """ Base class for readers that read from an underlying stream and
        decode what they read.

        Subclasses supply the actual codec by implementing .decode().
        Attributes that are not defined on the reader are looked up on
        the wrapped stream.

    """

    # Type of the decoded data; calling it without arguments must give
    # the empty value that the character buffer starts out with.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been handed out yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines; while it is set, it is used
        # instead of charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Decodes input and returns (output object, length consumed).

            Has to be provided by subclasses.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.
            If chars is negative, size is used in its place.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes in front of the error;
                    # this is acceptable only if it yields more than
                    # one line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; in that case read() is called only once.

            If keepends is false, the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # grow the chunk size for the next attempt, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with block closes the wrapped stream.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Refuse to be pickled or copied.  Both operations rebuild the
        # object without calling __init__ and then probe it for special
        # attributes; with self.stream missing, __getattr__ would call
        # itself for 'stream' without end.  Fail early and clearly.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
665
666###
667
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is forwarded to a reader object and writing to a
        writer object, both created on the same stream.  Attributes
        that are not defined here are looked up on the stream itself.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's read(size)."""
        return self.reader.read(size)

    def readline(self, size=None, keepends=True):

        """ Return the result of the reader's readline().

            size and keepends are passed on unchanged, so the line
            ending can be dropped with keepends=False just as on the
            reader itself.
        """
        return self.reader.readline(size, keepends)

    def readlines(self, sizehint=None, keepends=True):

        """ Return the result of the reader's readlines().

            sizehint and keepends are passed on unchanged.
        """
        return self.reader.readlines(sizehint, keepends)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Return the result of the writer's write(data)."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Return the result of the writer's writelines(list)."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the position of the underlying stream.

            The reader is always reset because its buffers no longer
            match the new position; the writer is reset only when
            seeking to the very start (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with block closes the wrapped stream.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Refuse to be pickled or copied.  Both operations rebuild the
        # object without calling __init__ and then probe it for special
        # attributes; with self.stream missing, __getattr__ would call
        # itself for 'stream' without end.  Fail early and clearly.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
752
753###
754
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

        Attributes that are not defined here are looked up on the
        underlying stream.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader and return the data re-encoded with
            the frontend encode function.
        """
        data = self.reader.read(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader and return it re-encoded with
            the frontend encode function.

            size is passed to the reader only when it is given.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the reader, re-encode it with the
            frontend encode function and return it split into lines
            (line endings included).  sizehint is ignored.
        """
        data = self.reader.read()
        data, _ = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = next(self.reader)
        data, _ = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and pass the
            result to the writer.
        """
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the items of list, decode the result with the
            frontend decode function and pass it to the writer.

            The items are joined before decoding so that sequences
            split across items are decoded as a whole.
        """
        # Materialize first: list may be a one-shot iterable and the
        # first item has to be inspected.
        chunks = tuple(list)
        # Frontend data is normally bytes, which cannot be joined with
        # a str separator; str items are still joined with ''.
        joiner = '' if chunks and isinstance(chunks[0], str) else b''
        data, _ = self.decode(joiner.join(chunks), self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the position of the underlying stream and return
            whatever the stream's own seek() returns.

            The reader is always reset, because data it has buffered
            belongs to the old position; the writer is reset only when
            seeking to the very start (offset 0 with whence 0).
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with block closes the wrapped stream.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Refuse to be pickled or copied.  Both operations rebuild the
        # object without calling __init__ and then probe it for special
        # attributes; with self.stream missing, __getattr__ would call
        # itself for 'stream' without end.  Fail early and clearly.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
862
863### Shortcuts
864
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is given, the underlying file is always opened in
        binary mode ('b' is appended to mode when missing).
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to 1.
        NOTE(review): the builtin open() documents 1 as line buffering
        that is only usable in text mode; confirm what it means for
        the binary-mode files opened here when an encoding is given.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Nobody else holds a reference to the file yet, so close it
        # here instead of leaking the handle.
        file.close()
        raise
906
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that transparently translates
        between two encodings.

        Data written to the wrapper is decoded with data_encoding and
        then encoded with file_encoding before it reaches the
        underlying file. The intermediate data type will usually be
        Unicode but depends on the specified codecs.

        Bytes read from the file take the opposite route: they are
        decoded with file_encoding and returned to the caller encoded
        with data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors selects the error handling. It defaults to 'strict',
        which causes ValueErrors to be raised in case an encoding
        error occurs.

        For introspection, the returned object carries the attributes
        .data_encoding and .file_encoding holding the encodings that
        were used.

    """
    # A single encoding serves both sides unless a separate one was
    # requested for the file.
    file_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode, frontend.decode,
        backend.streamreader, backend.streamwriter,
        errors,
    )
    # Record the encodings on the wrapper itself
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
942
943### Helpers for codec lookup
944
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
954
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
964
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        A LookupError is raised if the encoding cannot be found
        or if the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder.
    raise LookupError(encoding)
978
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        A LookupError is raised if the encoding cannot be found
        or if the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder.
    raise LookupError(encoding)
992
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
1002
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1012
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each item of iterator to an IncrementalEncoder for the given
    encoding and yields every non-empty result.

    errors and kwargs are handed to the IncrementalEncoder constructor
    unchanged.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)

    def _encoded_chunks():
        for chunk in iterator:
            yield encoder.encode(chunk)
        # One last call with the final flag set, after all input.
        yield encoder.encode("", True)

    # filter(None, ...) drops falsy (empty) results.
    yield from filter(None, _encoded_chunks())
1030
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each item of iterator to an IncrementalDecoder for the given
    encoding and yields every non-empty result.

    errors and kwargs are handed to the IncrementalDecoder constructor
    unchanged.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)

    def _decoded_chunks():
        for chunk in iterator:
            yield decoder.decode(chunk)
        # One last call with the final flag set, after all input.
        yield decoder.decode(b"", True)

    # filter(None, ...) drops falsy (empty) results.
    yield from filter(None, _decoded_chunks())
1048
1049### Helpers for charmap-based codecs
1050
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of rng is both
        a key and the value stored under that key.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
1060
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target that appears more than once in the decoding map
        cannot be inverted unambiguously.  Such a target is mapped to
        None (undefined mapping), which causes an exception when the
        charmap codec encounters it during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # First sighting keeps the source; any repeat marks the target
        # as undefined, and it stays undefined from then on.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1081
1082### error handlers
1083
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # These error handlers are missing in --disable-unicode builds.
    # If any lookup fails, every name is bound to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
1099
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed, because _false is 0;
# it only has to be present in the source text.
_false = 0
if _false:
    import encodings
1105
1106### Tests
1107
if __name__ == '__main__':

    # Runs only when this file is executed as a script: both standard
    # streams are replaced by EncodedFile wrappers.
    # NOTE(review): EncodedFile's docstring describes translating bytes
    # read from / written to the wrapped file; sys.stdout and sys.stdin
    # are presumably text streams here -- confirm this still works as
    # intended.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
1115