1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import builtins, sys
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError as why:
17    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Names exported by "from codecs import *".
# NOTE(review): several entries (e.g. "register", "lookup", "getencoder")
# are not defined in this part of the file; presumably they come from the
# "from _codecs import *" above or are defined further down -- confirm
# before editing this list.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
33
34### Constants
35
36#
37# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
38# and its possible byte string values
39# for UTF8/UTF16/UTF32 output and little/big endian machines
40#
41
# UTF-8 encoded byte order mark
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16 byte order marks, one per byte order
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 byte order marks, one per byte order
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Marks in the byte order of the machine running this code
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the 32/64 in the
# names these are the UTF-16 and UTF-32 marks respectively
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
78
79
80### Codec base classes (defining the API)
81
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader, streamwriter)
    and additionally exposes those four entries, the incremental
    encoder/decoder factories and the codec name as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        *, _is_text_encoding=None):
        """Create the tuple and attach the codec entry points as attributes.

        _is_text_encoding is keyword-only; when it is left as None the
        class-level default (True) stays in effect for the instance.
        """
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))

    def __getnewargs__(self):
        # The inherited tuple.__getnewargs__ hands the whole 4-tuple to
        # __new__ as a single positional argument, which does not match the
        # signature above and makes copy.copy() and pickling fail.  Return
        # the entries unpacked instead; the remaining attributes are
        # restored from the instance __dict__ by the copy/pickle machinery.
        return tuple(self)
112
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be supported and has to yield an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object that exposes a readable buffer;
            bytes objects and memory mapped files are examples of
            such objects.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be supported and has to yield an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()
178
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance; the predefined values are listed in the docstring of
        the Codec class.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op
        here).
        """
        return None
218
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    consume all of their input at once: whatever _buffer_encode()
    leaves unconsumed is kept in self.buffer and prepended to the
    input of the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError()

    def encode(self, input, final=False):
        # leftovers from the previous call go first
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # any false state (such as 0) stands for an empty buffer
        self.buffer = state if state else ""
252
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance; the predefined values are listed in the docstring of
        the Codec class.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple:
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet, and
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset() the
        result must be (b"", 0), which is what this base implementation
        always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op
        here).

        setstate((b"", 0)) must have the same effect as reset().
        """
        return None
301
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the input of
    the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError()

    def decode(self, input, final=False):
        # leftovers from the previous call go first
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # there is no state besides the buffer, so the second item is 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the second item is ignored
        self.buffer = state[0]
337
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
344
class StreamWriter(Codec):

    """ Base class for codec stream writers.

        Objects passed to .write() are encoded with self.encode() and
        the result is written to the wrapped stream.  Attributes that
        are not found on the writer are looked up on the stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # the "length consumed" half of the encode() result is not needed
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation keeps no state and does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the output stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Copying or unpickling creates the new object without calling
        # __init__, so "stream" is missing and the attribute delegation in
        # __getattr__ recurses until RecursionError.  The wrapped stream
        # cannot be duplicated in a meaningful way anyway, so refuse
        # serialization with a clear error instead.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
415
416###
417
class StreamReader(Codec):

    """ Base class for codec stream readers.

        Data read from the wrapped stream is decoded with self.decode()
        and buffered as needed.  Attributes that are not found on the
        reader are looked up on the stream.

    """

    # type used to create the empty character buffer
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been handed out yet
        self.charbuffer = self._empty_charbuffer
        # lines cached by readline(); None while in charbuffer mode
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of encoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # decode only the part in front of the offending bytes
                    # and give up unless it contains a complete first line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # start with small reads; the size is doubled below on every retry
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            The lines are obtained by decoding everything that is
            left with read() and splitting the result; line endings
            are included in the list entries unless keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Copying or unpickling creates the new object without calling
        # __init__, so "stream" is missing and the attribute delegation in
        # __getattr__ recurses until RecursionError.  The wrapped stream
        # cannot be duplicated in a meaningful way anyway, so refuse
        # serialization with a clear error instead.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
662
663###
664
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Both are called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read and decode data via the reader; see its read()."""
        return self.reader.read(size)

    def readline(self, size=None, keepends=True):

        """ Read one decoded line via the reader.

            If keepends is false the reader is asked to strip the
            line ending.
        """
        if keepends:
            # default case: use the two-argument call that every reader
            # written against the previous signature understands
            return self.reader.readline(size)
        return self.reader.readline(size, keepends=False)

    def readlines(self, sizehint=None, keepends=True):

        """ Read all remaining decoded lines via the reader.

            If keepends is false the reader is asked to strip the
            line endings.
        """
        if keepends:
            # see readline() for why keepends is only passed when needed
            return self.reader.readlines(sizehint)
        return self.reader.readlines(sizehint, keepends=False)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Encode data and write it via the writer."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Encode the given lines and write them via the writer."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position.

            The reader buffers are always reset; the writer is reset
            only when seeking to the very start of the stream.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Copying or unpickling creates the new object without calling
        # __init__, so "stream" is missing and the attribute delegation in
        # __getattr__ recurses until RecursionError.  The wrapped stream
        # cannot be duplicated in a meaningful way anyway, so refuse
        # serialization with a clear error instead.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
749
750###
751
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader and return the data re-encoded with
            the frontend encoder."""
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader and return it re-encoded
            with the frontend encoder."""
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything that is left, re-encode it and return it
            split into lines (line endings included).  sizehint is
            ignored."""
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result via the writer."""
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, decode them with the
            frontend decoder and write the result via the writer."""
        chunks = tuple(list)
        # Frontend data is normally bytes, which ''.join() rejects.  Join
        # with an empty bytes object unless the caller passed str chunks
        # (text-to-text codecs), which keep working as before.
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position."""
        # Seeks must go through the reader and the writer (both reposition
        # the shared stream) so that they can drop their internal buffers;
        # delegating straight to the stream would leave stale decoded data
        # in the reader.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Copying or unpickling creates the new object without calling
        # __init__, so "stream" is missing and the attribute delegation in
        # __getattr__ recurses until RecursionError.  The wrapped stream
        # cannot be duplicated in a meaningful way anyway, so refuse
        # serialization with a clear error instead.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
859
860### Shortcuts
861
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is given, the underlying file is always opened in
        binary mode ('b' is appended to mode when missing).
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the file object returned by the builtin
        open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        The default of 1 requests line buffering, which only exists
        for text mode files; for binary files (always the case when an
        encoding is given) the default buffer size is used instead.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or creating the wrapper fails, the
        already opened file is closed before the exception propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    if buffering == 1 and 'b' in mode:
        # Line buffering does not exist for binary files: the builtin
        # open() uses the default buffer size in that case and newer
        # Python versions emit a RuntimeWarning about it.  Ask for the
        # default buffer size explicitly, which is what happens anyway.
        buffering = -1
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # do not leak the open file when the codec cannot be set up
        file.close()
        raise
903
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings and return the wrapper.

        Data written to the wrapper is decoded using data_encoding
        and then encoded to the underlying file using file_encoding.
        The intermediate data type will usually be Unicode but depends
        on the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        handed back to the caller encoded using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors selects the error handling. The default 'strict' causes
        ValueErrors to be raised in case an encoding error occurs.

        For introspection, the wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the encodings that
        are in effect (file_encoding after applying its default).

    """
    effective_file_encoding = (data_encoding if file_encoding is None
                               else file_encoding)
    data_codec = lookup(data_encoding)
    file_codec = lookup(effective_file_encoding)
    recoder = StreamRecoder(
        file,
        data_codec.encode, data_codec.decode,
        file_codec.streamreader, file_codec.streamwriter,
        errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = effective_file_encoding
    return recoder
939
940### Helpers for codec lookup
941
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        the encoder function stored on it.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
951
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        the decoder function stored on it.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
961
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        A LookupError is raised when the encoding cannot be found
        or when the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
975
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        A LookupError is raised when the encoding cannot be found
        or when the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
989
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
999
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1009
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator through one shared
    IncrementalEncoder and yields each non-empty piece of output.  Once
    the iterator is exhausted the encoder is flushed with a final call,
    and that output is yielded as well if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call: signals the end of the input to the encoder.
    trailer = encoder.encode("", True)
    if trailer:
        yield trailer
1027
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item produced by iterator through one shared
    IncrementalDecoder and yields each non-empty piece of output.  Once
    the iterator is exhausted the decoder is flushed with a final call,
    and that output is yielded as well if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call: signals the end of the input to the decoder.
    trailer = decoder.decode(b"", True)
    if trailer:
        yield trailer
1045
1046### Helpers for charmap-based codecs
1047
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is a key mapped to itself.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
1057
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map by swapping
        keys and values.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target that was already recorded is ambiguous: mark it as
        # undefined instead of keeping either source.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1078
1079### error handlers
1080
try:
    # Resolve every handler by its registered name in one go; the
    # module-level names are only bound once all lookups succeeded.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace",
                             "namereplace")]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # bind every name to None so that the names still exist.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
1096
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below sits behind a flag that is always false, so
# it is never executed; it only has to be present in this module.
# NOTE(review): presumably a variable is tested instead of a literal
# constant so that the branch is not dropped at compile time -- confirm
# before simplifying this.
_false = 0
if _false:
    import encodings
1102
1103### Tests
1104
if __name__ == '__main__':

    # When this module is run as a script, replace the process-wide
    # standard streams with recoding wrappers built by EncodedFile().
    # NOTE(review): EncodedFile() documents that it reads and writes
    # bytes on the wrapped file -- confirm that sys.stdout / sys.stdin
    # are suitable objects here (they may be text streams).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
1112