1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError, why:
17    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Names exported by "from codecs import *".
__all__ = [
    # registry access and file wrappers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks, including the old BOM32_*/BOM64_* aliases
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes and stream wrappers
    "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # getter helpers
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    # conversion helpers
    "encode", "decode", "iterencode", "iterdecode",
    # error handling
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
32
33### Constants
34
35#
36# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
39#
40
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, one mark per byte order
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32, one mark per byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native endianness: pick the mark that matches sys.byteorder
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
77
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Describes one codec as returned by a codec registry lookup.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter); the same four entries, the two incremental coder
    factories and the codec name are also available as attributes.
    """

    # Private API to allow Python to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        _is_text_encoding=None):
        # Only the four classic entries take part in the tuple itself.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Leave the class-level default in place unless the caller
        # supplied an explicit value.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
109
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - skip the offending character and carry on with
                    the next one
         'replace' - substitute a suitable replacement character;
                    the builtin Unicode codecs use the official
                    U+FFFD REPLACEMENT CHARACTER when decoding and
                    '?' when encoding.
         'xmlcharrefreplace' - substitute the appropriate XML
                               character reference (encoding only).
         'backslashreplace'  - substitute backslashed escape sequences
                               (encoding only).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; the result is a tuple (output
            object, length consumed).

            errors selects the error handling and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be supported and has to yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input; the result is a tuple (output
            object, length consumed).

            input must be an object providing the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects or
            memory mapped files.

            errors selects the error handling and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be supported and has to yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
173
class IncrementalEncoder(object):
    """
    Base class for encoders that work in several steps: the input is handed
    to encode() piece by piece and the encoder keeps whatever state it needs
    between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up an IncrementalEncoder.

        errors selects the error handling scheme and is stored on the
        instance. See the module docstring for a list of possible values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Report the current encoder state; the base class always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op here).
        """
        return None
213
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder flavour for encoders that cannot always process all
    of their input at once: whatever _buffer_encode() leaves unconsumed is
    held back in self.buffer and prepended to the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and hand back an
        # (output, length consumed) tuple.
        raise NotImplementedError()

    def encode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Anything beyond the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # Any false state (such as the 0 from getstate) means "no leftover".
        self.buffer = state if state else ""
246
class IncrementalDecoder(object):
    """
    Base class for decoders that work in several steps: the input is handed
    to decode() piece by piece and the decoder keeps whatever state it needs
    between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up an IncrementalDecoder.

        errors selects the error handling scheme and is stored on the
        instance. See the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Report the current decoder state.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that were
        passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset(),
        getstate() must return (b"", 0), which is what the base class
        always returns.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op here).

        setstate((b"", 0)) must have the same effect as reset().
        """
        return None
295
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder flavour for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves unconsumed
    is held back in self.buffer and prepended to the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and hand back an
        # (output, length consumed) tuple.
        raise NotImplementedError()

    def decode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Anything beyond the consumed prefix waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # The second slot (additional state info) is always 0.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered input is restored; the additional state
        # info in the second slot is ignored.
        self.buffer = state[0]
330
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
337
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and carry on
                        with the next one
             'replace'- substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference.
             'backslashreplace'  - substitute backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to
            .write() in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state. The base class keeps no
            state, so nothing happens here.

        """
        return None

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; the codec state is
            reset only when seeking to the very start (offset 0,
            whence 0).
        """
        self.stream.seek(offset, whence)
        if whence != 0 or offset != 0:
            return
        self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer on the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        # The writer itself is the context object.
        return self

    def __exit__(self, type, value, tb):
        # Leaving the "with" block closes the underlying stream.
        self.stream.close()
408
409###
410
411class StreamReader(Codec):
412
413    def __init__(self, stream, errors='strict'):
414
415        """ Creates a StreamReader instance.
416
417            stream must be a file-like object open for reading
418            (binary) data.
419
420            The StreamReader may use different error handling
421            schemes by providing the errors keyword argument. These
422            parameters are predefined:
423
424             'strict' - raise a ValueError (or a subclass)
425             'ignore' - ignore the character and continue with the next
426             'replace'- replace with a suitable replacement character;
427
428            The set of allowed parameter values can be extended via
429            register_error.
430        """
431        self.stream = stream
432        self.errors = errors
433        self.bytebuffer = ""
434        # For str->str decoding this will stay a str
435        # For str->unicode decoding the first read will promote it to unicode
436        self.charbuffer = ""
437        self.linebuffer = None
438
439    def decode(self, input, errors='strict'):
440        raise NotImplementedError
441
442    def read(self, size=-1, chars=-1, firstline=False):
443
444        """ Decodes data from the stream self.stream and returns the
445            resulting object.
446
447            chars indicates the number of characters to read from the
448            stream. read() will never return more than chars
449            characters, but it might return less, if there are not enough
450            characters available.
451
452            size indicates the approximate maximum number of bytes to
453            read from the stream for decoding purposes. The decoder
454            can modify this setting as appropriate. The default value
455            -1 indicates to read and decode as much as possible.  size
456            is intended to prevent having to decode huge files in one
457            step.
458
459            If firstline is true, and a UnicodeDecodeError happens
460            after the first line terminator in the input only the first line
461            will be returned, the rest of the input will be kept until the
462            next call to read().
463
464            The method should use a greedy read strategy meaning that
465            it should read as much data as is allowed within the
466            definition of the encoding and the given size, e.g.  if
467            optional encoding endings or state markers are available
468            on the stream, these should be read too.
469        """
470        # If we have lines cached, first merge them back into characters
471        if self.linebuffer:
472            self.charbuffer = "".join(self.linebuffer)
473            self.linebuffer = None
474
475        if chars < 0:
476            # For compatibility with other read() methods that take a
477            # single argument
478            chars = size
479
480        # read until we get the required number of characters (if available)
481        while True:
482            # can the request be satisfied from the character buffer?
483            if chars >= 0:
484                if len(self.charbuffer) >= chars:
485                    break
486            # we need more data
487            if size < 0:
488                newdata = self.stream.read()
489            else:
490                newdata = self.stream.read(size)
491            # decode bytes (those remaining from the last call included)
492            data = self.bytebuffer + newdata
493            try:
494                newchars, decodedbytes = self.decode(data, self.errors)
495            except UnicodeDecodeError, exc:
496                if firstline:
497                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
498                    lines = newchars.splitlines(True)
499                    if len(lines)<=1:
500                        raise
501                else:
502                    raise
503            # keep undecoded bytes until the next call
504            self.bytebuffer = data[decodedbytes:]
505            # put new characters in the character buffer
506            self.charbuffer += newchars
507            # there was no data available
508            if not newdata:
509                break
510        if chars < 0:
511            # Return everything we've got
512            result = self.charbuffer
513            self.charbuffer = ""
514        else:
515            # Return the first chars characters
516            result = self.charbuffer[:chars]
517            self.charbuffer = self.charbuffer[chars:]
518        return result
519
520    def readline(self, size=None, keepends=True):
521
522        """ Read one line from the input stream and return the
523            decoded data.
524
525            size, if given, is passed as size argument to the
526            read() method.
527
528        """
529        # If we have lines cached from an earlier read, return
530        # them unconditionally
531        if self.linebuffer:
532            line = self.linebuffer[0]
533            del self.linebuffer[0]
534            if len(self.linebuffer) == 1:
535                # revert to charbuffer mode; we might need more data
536                # next time
537                self.charbuffer = self.linebuffer[0]
538                self.linebuffer = None
539            if not keepends:
540                line = line.splitlines(False)[0]
541            return line
542
543        readsize = size or 72
544        line = ""
545        # If size is given, we call read() only once
546        while True:
547            data = self.read(readsize, firstline=True)
548            if data:
549                # If we're at a "\r" read one extra character (which might
550                # be a "\n") to get a proper line ending. If the stream is
551                # temporarily exhausted we return the wrong line ending.
552                if data.endswith("\r"):
553                    data += self.read(size=1, chars=1)
554
555            line += data
556            lines = line.splitlines(True)
557            if lines:
558                if len(lines) > 1:
559                    # More than one line result; the first line is a full line
560                    # to return
561                    line = lines[0]
562                    del lines[0]
563                    if len(lines) > 1:
564                        # cache the remaining lines
565                        lines[-1] += self.charbuffer
566                        self.linebuffer = lines
567                        self.charbuffer = None
568                    else:
569                        # only one remaining line, put it back into charbuffer
570                        self.charbuffer = lines[0] + self.charbuffer
571                    if not keepends:
572                        line = line.splitlines(False)[0]
573                    break
574                line0withend = lines[0]
575                line0withoutend = lines[0].splitlines(False)[0]
576                if line0withend != line0withoutend: # We really have a line end
577                    # Put the rest back together and keep it until the next call
578                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
579                    if keepends:
580                        line = line0withend
581                    else:
582                        line = line0withoutend
583                    break
584            # we didn't get anything or this was our only try
585            if not data or size is not None:
586                if line and not keepends:
587                    line = line.splitlines(False)[0]
588                break
589            if readsize<8000:
590                readsize *= 2
591        return line
592
593    def readlines(self, sizehint=None, keepends=True):
594
595        """ Read all lines available on the input stream
596            and return them as list of lines.
597
598            Line breaks are implemented using the codec's decoder
599            method and are included in the list entries.
600
601            sizehint, if given, is ignored since there is no efficient
602            way to finding the true end-of-line.
603
604        """
605        data = self.read()
606        return data.splitlines(keepends)
607
608    def reset(self):
609
610        """ Resets the codec buffers used for keeping state.
611
612            Note that no stream repositioning should take place.
613            This method is primarily intended to be able to recover
614            from decoding errors.
615
616        """
617        self.bytebuffer = ""
618        self.charbuffer = u""
619        self.linebuffer = None
620
621    def seek(self, offset, whence=0):
622        """ Set the input stream's current position.
623
624            Resets the codec buffers used for keeping state.
625        """
626        self.stream.seek(offset, whence)
627        self.reset()
628
629    def next(self):
630
631        """ Return the next decoded line from the input stream."""
632        line = self.readline()
633        if line:
634            return line
635        raise StopIteration
636
637    def __iter__(self):
638        return self
639
640    def __getattr__(self, name,
641                    getattr=getattr):
642
643        """ Inherit all other methods from the underlying stream.
644        """
645        return getattr(self.stream, name)
646
647    def __enter__(self):
648        return self
649
650    def __exit__(self, type, value, tb):
651        self.stream.close()
652
653###
654
655class StreamReaderWriter:
656
657    """ StreamReaderWriter instances allow wrapping streams which
658        work in both read and write modes.
659
660        The design is such that one can use the factory functions
661        returned by the codec.lookup() function to construct the
662        instance.
663
664    """
665    # Optional attributes set by the file wrappers below
666    encoding = 'unknown'
667
668    def __init__(self, stream, Reader, Writer, errors='strict'):
669
670        """ Creates a StreamReaderWriter instance.
671
672            stream must be a Stream-like object.
673
674            Reader, Writer must be factory functions or classes
675            providing the StreamReader, StreamWriter interface resp.
676
677            Error handling is done in the same way as defined for the
678            StreamWriter/Readers.
679
680        """
681        self.stream = stream
682        self.reader = Reader(stream, errors)
683        self.writer = Writer(stream, errors)
684        self.errors = errors
685
686    def read(self, size=-1):
687
688        return self.reader.read(size)
689
690    def readline(self, size=None):
691
692        return self.reader.readline(size)
693
694    def readlines(self, sizehint=None):
695
696        return self.reader.readlines(sizehint)
697
698    def next(self):
699
700        """ Return the next decoded line from the input stream."""
701        return self.reader.next()
702
703    def __iter__(self):
704        return self
705
706    def write(self, data):
707
708        return self.writer.write(data)
709
710    def writelines(self, list):
711
712        return self.writer.writelines(list)
713
714    def reset(self):
715
716        self.reader.reset()
717        self.writer.reset()
718
719    def seek(self, offset, whence=0):
720        self.stream.seek(offset, whence)
721        self.reader.reset()
722        if whence == 0 and offset == 0:
723            self.writer.reset()
724
725    def __getattr__(self, name,
726                    getattr=getattr):
727
728        """ Inherit all other methods from the underlying stream.
729        """
730        return getattr(self.stream, name)
731
732    # these are needed to make "with codecs.open(...)" work properly
733
734    def __enter__(self):
735        return self
736
737    def __exit__(self, type, value, tb):
738        self.stream.close()
739
740###
741
742class StreamRecoder:
743
744    """ StreamRecoder instances provide a frontend - backend
745        view of encoding data.
746
747        They use the complete set of APIs returned by the
748        codecs.lookup() function to implement their task.
749
750        Data written to the stream is first decoded into an
751        intermediate format (which is dependent on the given codec
752        combination) and then written to the stream using an instance
753        of the provided Writer class.
754
755        In the other direction, data is read from the stream using a
756        Reader instance and then return encoded data to the caller.
757
758    """
759    # Optional attributes set by the file wrappers below
760    data_encoding = 'unknown'
761    file_encoding = 'unknown'
762
763    def __init__(self, stream, encode, decode, Reader, Writer,
764                 errors='strict'):
765
766        """ Creates a StreamRecoder instance which implements a two-way
767            conversion: encode and decode work on the frontend (the
768            input to .read() and output of .write()) while
769            Reader and Writer work on the backend (reading and
770            writing to the stream).
771
772            You can use these objects to do transparent direct
773            recodings from e.g. latin-1 to utf-8 and back.
774
775            stream must be a file-like object.
776
777            encode, decode must adhere to the Codec interface, Reader,
778            Writer must be factory functions or classes providing the
779            StreamReader, StreamWriter interface resp.
780
781            encode and decode are needed for the frontend translation,
782            Reader and Writer for the backend translation. Unicode is
783            used as intermediate encoding.
784
785            Error handling is done in the same way as defined for the
786            StreamWriter/Readers.
787
788        """
789        self.stream = stream
790        self.encode = encode
791        self.decode = decode
792        self.reader = Reader(stream, errors)
793        self.writer = Writer(stream, errors)
794        self.errors = errors
795
796    def read(self, size=-1):
797
798        data = self.reader.read(size)
799        data, bytesencoded = self.encode(data, self.errors)
800        return data
801
802    def readline(self, size=None):
803
804        if size is None:
805            data = self.reader.readline()
806        else:
807            data = self.reader.readline(size)
808        data, bytesencoded = self.encode(data, self.errors)
809        return data
810
811    def readlines(self, sizehint=None):
812
813        data = self.reader.read()
814        data, bytesencoded = self.encode(data, self.errors)
815        return data.splitlines(1)
816
817    def next(self):
818
819        """ Return the next decoded line from the input stream."""
820        data = self.reader.next()
821        data, bytesencoded = self.encode(data, self.errors)
822        return data
823
824    def __iter__(self):
825        return self
826
827    def write(self, data):
828
829        data, bytesdecoded = self.decode(data, self.errors)
830        return self.writer.write(data)
831
832    def writelines(self, list):
833
834        data = ''.join(list)
835        data, bytesdecoded = self.decode(data, self.errors)
836        return self.writer.write(data)
837
838    def reset(self):
839
840        self.reader.reset()
841        self.writer.reset()
842
843    def __getattr__(self, name,
844                    getattr=getattr):
845
846        """ Inherit all other methods from the underlying stream.
847        """
848        return getattr(self.stream, name)
849
850    def __enter__(self):
851        return self
852
853    def __exit__(self, type, value, tb):
854        self.stream.close()
855
856### Shortcuts
857
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' in mode
        is dropped. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        already opened file is closed before the exception propagates.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Cleanup only: do not leak the open file handle when e.g. the
        # encoding is unknown. The original exception is re-raised.
        file.close()
        raise
906
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings and return the wrapper.

        Strings written to the wrapper are interpreted according to
        data_encoding and are written to the original file as strings
        in file_encoding. The intermediate representation is usually
        Unicode, but that depends on the codecs involved.

        Strings read through the wrapper are read from the file using
        file_encoding and are handed to the caller as strings in
        data_encoding.

        file_encoding falls back to data_encoding when it is omitted.

        errors selects the error handling scheme. The default,
        'strict', causes ValueErrors to be raised in case an encoding
        error occurs.

        For introspection, the returned object carries the attributes
        .data_encoding and .file_encoding, set to the (resolved)
        parameters of the same name.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Resolve the data codec first, then the file codec.
    data_codec = lookup(data_encoding)
    file_codec = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            data_codec.encode,
                            data_codec.decode,
                            file_codec.streamreader,
                            file_codec.streamwriter,
                            errors)
    # Record both encodings on the wrapper for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
942
943### Helpers for codec lookup
944
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        the encoder function stored in its registry entry.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
954
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        the decoder function stored in its registry entry.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
964
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder
    raise LookupError(encoding)
978
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder
    raise LookupError(encoding)
992
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
1002
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1012
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator through an
    IncrementalEncoder for the given encoding and yields each
    non-empty piece of encoded output. Once the iterator is
    exhausted, the encoder is flushed with a final call and any
    remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            # Nothing to hand out for this chunk
            continue
        yield encoded
    # Final call: signal the end of the input to the encoder
    tail = encoder.encode("", True)
    if tail:
        yield tail
1030
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string produced by iterator through an
    IncrementalDecoder for the given encoding and yields each
    non-empty piece of decoded output. Once the iterator is
    exhausted, the decoder is flushed with a final call and any
    remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            # Nothing to hand out for this chunk
            continue
        yield decoded
    # Final call: signal the end of the input to the decoder
    tail = decoder.decode("", True)
    if tail:
        yield tail
1048
1049### Helpers for charmap-based codecs
1050
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of
        the rng sequence is both a key and the value stored under
        that key.

    """
    return dict((item, item) for item in rng)
1063
def make_encoding_map(decoding_map):

    """ Invert a decoding map and return the resulting encoding map.

        Every value of the decoding map becomes a key that maps back
        to the original key. A target that occurs more than once in
        the decoding map is ambiguous and is therefore mapped to None
        (undefined mapping), causing an exception when encountered by
        the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # First sighting keeps the source; any repeat marks it undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
1084
1085### error handlers
1086
try:
    # Bind the standard error handling callbacks to module-level names;
    # all five names are listed in __all__.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing.
    # If any single lookup fails, all five names are bound to None so
    # that they still exist in the module namespace.
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
1100
# Tell modulefinder that using codecs probably needs the encodings
# package.  The guard below is always false here (_false is set to 0
# right before it is tested), so the import statement is never
# executed; it is only present in the source text as a hint.
_false = 0
if _false:
    import encodings
1106
1107### Tests
1108
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output:
    # strings written to sys.stdout are interpreted as Latin-1
    # (data_encoding) and written to the real stdout as UTF-8
    # (file_encoding).
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # data is read from the real stdin as Latin-1 (file_encoding)
    # and passed back to the caller as UTF-8 (data_encoding).
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
1116