1 /* ------------------------------------------------------------------------
2 
3    _codecs -- Provides access to the codec registry and the builtin
4               codecs.
5 
6    This module should never be imported directly. The standard library
7    module "codecs" wraps this builtin module for use within Python.
8 
9    The codec registry is accessible via:
10 
11      register(search_function) -> None
12 
13      lookup(encoding) -> CodecInfo object
14 
15    The builtin Unicode codecs use the following interface:
16 
17      <encoding>_encode(Unicode_object[,errors='strict']) ->
18         (string object, bytes consumed)
19 
20      <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21         (Unicode object, bytes consumed)
22 
   <encoding>_encode() interfaces also accept non-Unicode objects as
   input. Such objects are converted to Unicode using
   PyUnicode_FromObject() before the conversion is applied.
26 
   These <encoding>s are available: utf_7, utf_8, utf_16 and utf_32
   (each with _le and _be variants, plus a decode-only _ex variant),
   unicode_escape, raw_unicode_escape, unicode_internal, latin_1,
   ascii (7-bit), charmap, mbcs (on win32).
30 
31 
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
33 
34 Copyright (c) Corporation for National Research Initiatives.
35 
36    ------------------------------------------------------------------------ */
37 
38 #define PY_SSIZE_T_CLEAN
39 #include "Python.h"
40 
41 /* --- Registry ----------------------------------------------------------- */
42 
43 PyDoc_STRVAR(register__doc__,
44 "register(search_function)\n\
45 \n\
46 Register a codec search function. Search functions are expected to take\n\
47 one argument, the encoding name in all lower case letters, and return\n\
48 a tuple of functions (encoder, decoder, stream_reader, stream_writer)\n\
49 (or a CodecInfo object).");
50 
51 static
codec_register(PyObject * self,PyObject * search_function)52 PyObject *codec_register(PyObject *self, PyObject *search_function)
53 {
54     if (PyCodec_Register(search_function))
55         return NULL;
56 
57     Py_RETURN_NONE;
58 }
59 
60 PyDoc_STRVAR(lookup__doc__,
61 "lookup(encoding) -> CodecInfo\n\
62 \n\
63 Looks up a codec tuple in the Python codec registry and returns\n\
64 a CodecInfo object.");
65 
66 static
codec_lookup(PyObject * self,PyObject * args)67 PyObject *codec_lookup(PyObject *self, PyObject *args)
68 {
69     char *encoding;
70 
71     if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
72         return NULL;
73 
74     return _PyCodec_Lookup(encoding);
75 }
76 
77 PyDoc_STRVAR(encode__doc__,
78 "encode(obj, [encoding[,errors]]) -> object\n\
79 \n\
80 Encodes obj using the codec registered for encoding. encoding defaults\n\
81 to the default encoding. errors may be given to set a different error\n\
82 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
83 a ValueError. Other possible values are 'ignore', 'replace' and\n\
84 'xmlcharrefreplace' as well as any other name registered with\n\
85 codecs.register_error that can handle ValueErrors.");
86 
87 static PyObject *
codec_encode(PyObject * self,PyObject * args)88 codec_encode(PyObject *self, PyObject *args)
89 {
90     const char *encoding = NULL;
91     const char *errors = NULL;
92     PyObject *v;
93 
94     if (!PyArg_ParseTuple(args, "O|ss:encode", &v, &encoding, &errors))
95         return NULL;
96 
97 #ifdef Py_USING_UNICODE
98     if (encoding == NULL)
99         encoding = PyUnicode_GetDefaultEncoding();
100 #else
101     if (encoding == NULL) {
102         PyErr_SetString(PyExc_ValueError, "no encoding specified");
103         return NULL;
104     }
105 #endif
106 
107     /* Encode via the codec registry */
108     return PyCodec_Encode(v, encoding, errors);
109 }
110 
111 PyDoc_STRVAR(decode__doc__,
112 "decode(obj, [encoding[,errors]]) -> object\n\
113 \n\
114 Decodes obj using the codec registered for encoding. encoding defaults\n\
115 to the default encoding. errors may be given to set a different error\n\
116 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
117 a ValueError. Other possible values are 'ignore' and 'replace'\n\
118 as well as any other name registered with codecs.register_error that is\n\
119 able to handle ValueErrors.");
120 
121 static PyObject *
codec_decode(PyObject * self,PyObject * args)122 codec_decode(PyObject *self, PyObject *args)
123 {
124     const char *encoding = NULL;
125     const char *errors = NULL;
126     PyObject *v;
127 
128     if (!PyArg_ParseTuple(args, "O|ss:decode", &v, &encoding, &errors))
129         return NULL;
130 
131 #ifdef Py_USING_UNICODE
132     if (encoding == NULL)
133         encoding = PyUnicode_GetDefaultEncoding();
134 #else
135     if (encoding == NULL) {
136         PyErr_SetString(PyExc_ValueError, "no encoding specified");
137         return NULL;
138     }
139 #endif
140 
141     /* Decode via the codec registry */
142     return PyCodec_Decode(v, encoding, errors);
143 }
144 
145 /* --- Helpers ------------------------------------------------------------ */
146 
147 static
codec_tuple(PyObject * unicode,Py_ssize_t len)148 PyObject *codec_tuple(PyObject *unicode,
149                       Py_ssize_t len)
150 {
151     PyObject *v;
152     if (unicode == NULL)
153         return NULL;
154     v = Py_BuildValue("On", unicode, len);
155     Py_DECREF(unicode);
156     return v;
157 }
158 
159 /* --- String codecs ------------------------------------------------------ */
160 static PyObject *
escape_decode(PyObject * self,PyObject * args)161 escape_decode(PyObject *self,
162               PyObject *args)
163 {
164     const char *errors = NULL;
165     const char *data;
166     Py_ssize_t size;
167 
168     if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
169                           &data, &size, &errors))
170         return NULL;
171     return codec_tuple(PyString_DecodeEscape(data, size, errors, 0, NULL),
172                        size);
173 }
174 
175 static PyObject *
escape_encode(PyObject * self,PyObject * args)176 escape_encode(PyObject *self,
177               PyObject *args)
178 {
179     PyObject *str;
180     const char *errors = NULL;
181     char *buf;
182     Py_ssize_t consumed, len;
183 
184     if (!PyArg_ParseTuple(args, "S|z:escape_encode",
185                           &str, &errors))
186         return NULL;
187 
188     consumed = PyString_GET_SIZE(str);
189     str = PyString_Repr(str, 0);
190     if (!str)
191         return NULL;
192 
193     /* The string will be quoted. Unquote, similar to unicode-escape. */
194     buf = PyString_AS_STRING (str);
195     len = PyString_GET_SIZE (str);
196     memmove(buf, buf+1, len-2);
197     if (_PyString_Resize(&str, len-2) < 0)
198         return NULL;
199 
200     return codec_tuple(str, consumed);
201 }
202 
203 #ifdef Py_USING_UNICODE
204 /* --- Decoder ------------------------------------------------------------ */
205 
206 static PyObject *
unicode_internal_decode(PyObject * self,PyObject * args)207 unicode_internal_decode(PyObject *self,
208                         PyObject *args)
209 {
210     PyObject *obj;
211     const char *errors = NULL;
212     const char *data;
213     Py_ssize_t size;
214 
215     if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
216                           &obj, &errors))
217         return NULL;
218 
219     if (PyUnicode_Check(obj)) {
220         Py_INCREF(obj);
221         return codec_tuple(obj, PyUnicode_GET_SIZE(obj));
222     }
223     else {
224         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
225             return NULL;
226 
227         return codec_tuple(_PyUnicode_DecodeUnicodeInternal(data, size, errors),
228                            size);
229     }
230 }
231 
232 static PyObject *
utf_7_decode(PyObject * self,PyObject * args)233 utf_7_decode(PyObject *self,
234              PyObject *args)
235 {
236     Py_buffer pbuf;
237     const char *errors = NULL;
238     int final = 0;
239     Py_ssize_t consumed;
240     PyObject *decoded = NULL;
241 
242     if (!PyArg_ParseTuple(args, "s*|zi:utf_7_decode",
243                           &pbuf, &errors, &final))
244         return NULL;
245     consumed = pbuf.len;
246 
247     decoded = PyUnicode_DecodeUTF7Stateful(pbuf.buf, pbuf.len, errors,
248                                            final ? NULL : &consumed);
249     PyBuffer_Release(&pbuf);
250     if (decoded == NULL)
251         return NULL;
252     return codec_tuple(decoded, consumed);
253 }
254 
255 static PyObject *
utf_8_decode(PyObject * self,PyObject * args)256 utf_8_decode(PyObject *self,
257             PyObject *args)
258 {
259     Py_buffer pbuf;
260     const char *errors = NULL;
261     int final = 0;
262     Py_ssize_t consumed;
263     PyObject *decoded = NULL;
264 
265     if (!PyArg_ParseTuple(args, "s*|zi:utf_8_decode",
266                           &pbuf, &errors, &final))
267         return NULL;
268     consumed = pbuf.len;
269 
270     decoded = PyUnicode_DecodeUTF8Stateful(pbuf.buf, pbuf.len, errors,
271                                            final ? NULL : &consumed);
272     PyBuffer_Release(&pbuf);
273     if (decoded == NULL)
274         return NULL;
275     return codec_tuple(decoded, consumed);
276 }
277 
278 static PyObject *
utf_16_decode(PyObject * self,PyObject * args)279 utf_16_decode(PyObject *self,
280             PyObject *args)
281 {
282     Py_buffer pbuf;
283     const char *errors = NULL;
284     int byteorder = 0;
285     int final = 0;
286     Py_ssize_t consumed;
287     PyObject *decoded;
288 
289     if (!PyArg_ParseTuple(args, "s*|zi:utf_16_decode",
290                           &pbuf, &errors, &final))
291         return NULL;
292     consumed = pbuf.len; /* This is overwritten unless final is true. */
293     decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
294                                         &byteorder, final ? NULL : &consumed);
295     PyBuffer_Release(&pbuf);
296     if (decoded == NULL)
297         return NULL;
298     return codec_tuple(decoded, consumed);
299 }
300 
301 static PyObject *
utf_16_le_decode(PyObject * self,PyObject * args)302 utf_16_le_decode(PyObject *self,
303                  PyObject *args)
304 {
305     Py_buffer pbuf;
306     const char *errors = NULL;
307     int byteorder = -1;
308     int final = 0;
309     Py_ssize_t consumed;
310     PyObject *decoded = NULL;
311 
312     if (!PyArg_ParseTuple(args, "s*|zi:utf_16_le_decode",
313                           &pbuf, &errors, &final))
314         return NULL;
315 
316     consumed = pbuf.len; /* This is overwritten unless final is true. */
317     decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
318         &byteorder, final ? NULL : &consumed);
319     PyBuffer_Release(&pbuf);
320     if (decoded == NULL)
321         return NULL;
322     return codec_tuple(decoded, consumed);
323 }
324 
325 static PyObject *
utf_16_be_decode(PyObject * self,PyObject * args)326 utf_16_be_decode(PyObject *self,
327                  PyObject *args)
328 {
329     Py_buffer pbuf;
330     const char *errors = NULL;
331     int byteorder = 1;
332     int final = 0;
333     Py_ssize_t consumed;
334     PyObject *decoded = NULL;
335 
336     if (!PyArg_ParseTuple(args, "s*|zi:utf_16_be_decode",
337                           &pbuf, &errors, &final))
338         return NULL;
339 
340     consumed = pbuf.len; /* This is overwritten unless final is true. */
341     decoded = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
342         &byteorder, final ? NULL : &consumed);
343     PyBuffer_Release(&pbuf);
344     if (decoded == NULL)
345         return NULL;
346     return codec_tuple(decoded, consumed);
347 }
348 
349 /* This non-standard version also provides access to the byteorder
350    parameter of the builtin UTF-16 codec.
351 
352    It returns a tuple (unicode, bytesread, byteorder) with byteorder
353    being the value in effect at the end of data.
354 
355 */
356 
357 static PyObject *
utf_16_ex_decode(PyObject * self,PyObject * args)358 utf_16_ex_decode(PyObject *self,
359                  PyObject *args)
360 {
361     Py_buffer pbuf;
362     const char *errors = NULL;
363     int byteorder = 0;
364     PyObject *unicode, *tuple;
365     int final = 0;
366     Py_ssize_t consumed;
367 
368     if (!PyArg_ParseTuple(args, "s*|zii:utf_16_ex_decode",
369                           &pbuf, &errors, &byteorder, &final))
370         return NULL;
371     consumed = pbuf.len; /* This is overwritten unless final is true. */
372     unicode = PyUnicode_DecodeUTF16Stateful(pbuf.buf, pbuf.len, errors,
373                                         &byteorder, final ? NULL : &consumed);
374     PyBuffer_Release(&pbuf);
375     if (unicode == NULL)
376         return NULL;
377     tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
378     Py_DECREF(unicode);
379     return tuple;
380 }
381 
382 static PyObject *
utf_32_decode(PyObject * self,PyObject * args)383 utf_32_decode(PyObject *self,
384             PyObject *args)
385 {
386     Py_buffer pbuf;
387     const char *errors = NULL;
388     int byteorder = 0;
389     int final = 0;
390     Py_ssize_t consumed;
391     PyObject *decoded;
392 
393     if (!PyArg_ParseTuple(args, "s*|zi:utf_32_decode",
394                           &pbuf, &errors, &final))
395         return NULL;
396     consumed = pbuf.len; /* This is overwritten unless final is true. */
397     decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
398                                         &byteorder, final ? NULL : &consumed);
399     PyBuffer_Release(&pbuf);
400     if (decoded == NULL)
401         return NULL;
402     return codec_tuple(decoded, consumed);
403 }
404 
405 static PyObject *
utf_32_le_decode(PyObject * self,PyObject * args)406 utf_32_le_decode(PyObject *self,
407                  PyObject *args)
408 {
409     Py_buffer pbuf;
410     const char *errors = NULL;
411     int byteorder = -1;
412     int final = 0;
413     Py_ssize_t consumed;
414     PyObject *decoded;
415 
416     if (!PyArg_ParseTuple(args, "s*|zi:utf_32_le_decode",
417                           &pbuf, &errors, &final))
418         return NULL;
419     consumed = pbuf.len; /* This is overwritten unless final is true. */
420     decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
421                                         &byteorder, final ? NULL : &consumed);
422     PyBuffer_Release(&pbuf);
423     if (decoded == NULL)
424         return NULL;
425     return codec_tuple(decoded, consumed);
426 }
427 
428 static PyObject *
utf_32_be_decode(PyObject * self,PyObject * args)429 utf_32_be_decode(PyObject *self,
430                  PyObject *args)
431 {
432     Py_buffer pbuf;
433     const char *errors = NULL;
434     int byteorder = 1;
435     int final = 0;
436     Py_ssize_t consumed;
437     PyObject *decoded;
438 
439     if (!PyArg_ParseTuple(args, "s*|zi:utf_32_be_decode",
440                           &pbuf, &errors, &final))
441         return NULL;
442     consumed = pbuf.len; /* This is overwritten unless final is true. */
443     decoded = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
444                                         &byteorder, final ? NULL : &consumed);
445     PyBuffer_Release(&pbuf);
446     if (decoded == NULL)
447         return NULL;
448     return codec_tuple(decoded, consumed);
449 }
450 
451 /* This non-standard version also provides access to the byteorder
452    parameter of the builtin UTF-32 codec.
453 
454    It returns a tuple (unicode, bytesread, byteorder) with byteorder
455    being the value in effect at the end of data.
456 
457 */
458 
459 static PyObject *
utf_32_ex_decode(PyObject * self,PyObject * args)460 utf_32_ex_decode(PyObject *self,
461                  PyObject *args)
462 {
463     Py_buffer pbuf;
464     const char *errors = NULL;
465     int byteorder = 0;
466     PyObject *unicode, *tuple;
467     int final = 0;
468     Py_ssize_t consumed;
469 
470     if (!PyArg_ParseTuple(args, "s*|zii:utf_32_ex_decode",
471                           &pbuf, &errors, &byteorder, &final))
472         return NULL;
473     consumed = pbuf.len; /* This is overwritten unless final is true. */
474     unicode = PyUnicode_DecodeUTF32Stateful(pbuf.buf, pbuf.len, errors,
475                                         &byteorder, final ? NULL : &consumed);
476     PyBuffer_Release(&pbuf);
477     if (unicode == NULL)
478         return NULL;
479     tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
480     Py_DECREF(unicode);
481     return tuple;
482 }
483 
484 static PyObject *
unicode_escape_decode(PyObject * self,PyObject * args)485 unicode_escape_decode(PyObject *self,
486                      PyObject *args)
487 {
488     Py_buffer pbuf;
489     const char *errors = NULL;
490         PyObject *unicode;
491 
492     if (!PyArg_ParseTuple(args, "s*|z:unicode_escape_decode",
493                           &pbuf, &errors))
494         return NULL;
495 
496     unicode = PyUnicode_DecodeUnicodeEscape(pbuf.buf, pbuf.len, errors);
497     PyBuffer_Release(&pbuf);
498     return codec_tuple(unicode, pbuf.len);
499 }
500 
501 static PyObject *
raw_unicode_escape_decode(PyObject * self,PyObject * args)502 raw_unicode_escape_decode(PyObject *self,
503                         PyObject *args)
504 {
505     Py_buffer pbuf;
506     const char *errors = NULL;
507     PyObject *unicode;
508 
509     if (!PyArg_ParseTuple(args, "s*|z:raw_unicode_escape_decode",
510                           &pbuf, &errors))
511         return NULL;
512 
513     unicode = PyUnicode_DecodeRawUnicodeEscape(pbuf.buf, pbuf.len, errors);
514     PyBuffer_Release(&pbuf);
515     return codec_tuple(unicode, pbuf.len);
516 }
517 
518 static PyObject *
latin_1_decode(PyObject * self,PyObject * args)519 latin_1_decode(PyObject *self,
520                PyObject *args)
521 {
522     Py_buffer pbuf;
523     PyObject *unicode;
524     const char *errors = NULL;
525 
526     if (!PyArg_ParseTuple(args, "s*|z:latin_1_decode",
527                           &pbuf, &errors))
528         return NULL;
529 
530     unicode = PyUnicode_DecodeLatin1(pbuf.buf, pbuf.len, errors);
531     PyBuffer_Release(&pbuf);
532     return codec_tuple(unicode, pbuf.len);
533 }
534 
535 static PyObject *
ascii_decode(PyObject * self,PyObject * args)536 ascii_decode(PyObject *self,
537              PyObject *args)
538 {
539     Py_buffer pbuf;
540     PyObject *unicode;
541     const char *errors = NULL;
542 
543     if (!PyArg_ParseTuple(args, "s*|z:ascii_decode",
544                           &pbuf, &errors))
545         return NULL;
546 
547     unicode = PyUnicode_DecodeASCII(pbuf.buf, pbuf.len, errors);
548     PyBuffer_Release(&pbuf);
549     return codec_tuple(unicode, pbuf.len);
550 }
551 
552 static PyObject *
charmap_decode(PyObject * self,PyObject * args)553 charmap_decode(PyObject *self,
554                PyObject *args)
555 {
556     Py_buffer pbuf;
557     PyObject *unicode;
558     const char *errors = NULL;
559     PyObject *mapping = NULL;
560 
561     if (!PyArg_ParseTuple(args, "s*|zO:charmap_decode",
562                           &pbuf, &errors, &mapping))
563         return NULL;
564     if (mapping == Py_None)
565         mapping = NULL;
566 
567     unicode = PyUnicode_DecodeCharmap(pbuf.buf, pbuf.len, mapping, errors);
568     PyBuffer_Release(&pbuf);
569     return codec_tuple(unicode, pbuf.len);
570 }
571 
572 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
573 
574 static PyObject *
mbcs_decode(PyObject * self,PyObject * args)575 mbcs_decode(PyObject *self,
576             PyObject *args)
577 {
578     Py_buffer pbuf;
579     const char *errors = NULL;
580     int final = 0;
581     Py_ssize_t consumed;
582     PyObject *decoded = NULL;
583 
584     if (!PyArg_ParseTuple(args, "s*|zi:mbcs_decode",
585                           &pbuf, &errors, &final))
586         return NULL;
587     consumed = pbuf.len;
588 
589     decoded = PyUnicode_DecodeMBCSStateful(pbuf.buf, pbuf.len, errors,
590                                            final ? NULL : &consumed);
591     PyBuffer_Release(&pbuf);
592     if (decoded == NULL)
593         return NULL;
594     return codec_tuple(decoded, consumed);
595 }
596 
597 #endif /* MS_WINDOWS */
598 
599 /* --- Encoder ------------------------------------------------------------ */
600 
601 static PyObject *
readbuffer_encode(PyObject * self,PyObject * args)602 readbuffer_encode(PyObject *self,
603                   PyObject *args)
604 {
605     const char *data;
606     Py_ssize_t size;
607     const char *errors = NULL;
608 
609     if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
610                           &data, &size, &errors))
611         return NULL;
612 
613     return codec_tuple(PyString_FromStringAndSize(data, size),
614                        size);
615 }
616 
617 static PyObject *
charbuffer_encode(PyObject * self,PyObject * args)618 charbuffer_encode(PyObject *self,
619                   PyObject *args)
620 {
621     const char *data;
622     Py_ssize_t size;
623     const char *errors = NULL;
624 
625     if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
626                           &data, &size, &errors))
627         return NULL;
628 
629     return codec_tuple(PyString_FromStringAndSize(data, size),
630                        size);
631 }
632 
633 static PyObject *
unicode_internal_encode(PyObject * self,PyObject * args)634 unicode_internal_encode(PyObject *self,
635                         PyObject *args)
636 {
637     PyObject *obj;
638     const char *errors = NULL;
639     const char *data;
640     Py_ssize_t size;
641 
642     if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
643                           &obj, &errors))
644         return NULL;
645 
646     if (PyUnicode_Check(obj)) {
647         data = PyUnicode_AS_DATA(obj);
648         size = PyUnicode_GET_DATA_SIZE(obj);
649         return codec_tuple(PyString_FromStringAndSize(data, size),
650                            PyUnicode_GET_SIZE(obj));
651     }
652     else {
653         if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
654             return NULL;
655         return codec_tuple(PyString_FromStringAndSize(data, size),
656                            size);
657     }
658 }
659 
660 static PyObject *
utf_7_encode(PyObject * self,PyObject * args)661 utf_7_encode(PyObject *self,
662             PyObject *args)
663 {
664     PyObject *str, *v;
665     const char *errors = NULL;
666 
667     if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
668                           &str, &errors))
669         return NULL;
670 
671     str = PyUnicode_FromObject(str);
672     if (str == NULL)
673         return NULL;
674     v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
675                                          PyUnicode_GET_SIZE(str),
676                                          0,
677                                          0,
678                                          errors),
679                     PyUnicode_GET_SIZE(str));
680     Py_DECREF(str);
681     return v;
682 }
683 
684 static PyObject *
utf_8_encode(PyObject * self,PyObject * args)685 utf_8_encode(PyObject *self,
686             PyObject *args)
687 {
688     PyObject *str, *v;
689     const char *errors = NULL;
690 
691     if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
692                           &str, &errors))
693         return NULL;
694 
695     str = PyUnicode_FromObject(str);
696     if (str == NULL)
697         return NULL;
698     v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
699                                          PyUnicode_GET_SIZE(str),
700                                          errors),
701                     PyUnicode_GET_SIZE(str));
702     Py_DECREF(str);
703     return v;
704 }
705 
706 /* This version provides access to the byteorder parameter of the
707    builtin UTF-16 codecs as optional third argument. It defaults to 0
708    which means: use the native byte order and prepend the data with a
709    BOM mark.
710 
711 */
712 
713 static PyObject *
utf_16_encode(PyObject * self,PyObject * args)714 utf_16_encode(PyObject *self,
715             PyObject *args)
716 {
717     PyObject *str, *v;
718     const char *errors = NULL;
719     int byteorder = 0;
720 
721     if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
722                           &str, &errors, &byteorder))
723         return NULL;
724 
725     str = PyUnicode_FromObject(str);
726     if (str == NULL)
727         return NULL;
728     v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
729                                           PyUnicode_GET_SIZE(str),
730                                           errors,
731                                           byteorder),
732                     PyUnicode_GET_SIZE(str));
733     Py_DECREF(str);
734     return v;
735 }
736 
737 static PyObject *
utf_16_le_encode(PyObject * self,PyObject * args)738 utf_16_le_encode(PyObject *self,
739                  PyObject *args)
740 {
741     PyObject *str, *v;
742     const char *errors = NULL;
743 
744     if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
745                           &str, &errors))
746         return NULL;
747 
748     str = PyUnicode_FromObject(str);
749     if (str == NULL)
750         return NULL;
751     v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
752                                              PyUnicode_GET_SIZE(str),
753                                              errors,
754                                              -1),
755                        PyUnicode_GET_SIZE(str));
756     Py_DECREF(str);
757     return v;
758 }
759 
760 static PyObject *
utf_16_be_encode(PyObject * self,PyObject * args)761 utf_16_be_encode(PyObject *self,
762                  PyObject *args)
763 {
764     PyObject *str, *v;
765     const char *errors = NULL;
766 
767     if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
768                           &str, &errors))
769         return NULL;
770 
771     str = PyUnicode_FromObject(str);
772     if (str == NULL)
773         return NULL;
774     v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
775                                           PyUnicode_GET_SIZE(str),
776                                           errors,
777                                           +1),
778                     PyUnicode_GET_SIZE(str));
779     Py_DECREF(str);
780     return v;
781 }
782 
783 /* This version provides access to the byteorder parameter of the
784    builtin UTF-32 codecs as optional third argument. It defaults to 0
785    which means: use the native byte order and prepend the data with a
786    BOM mark.
787 
788 */
789 
790 static PyObject *
utf_32_encode(PyObject * self,PyObject * args)791 utf_32_encode(PyObject *self,
792             PyObject *args)
793 {
794     PyObject *str, *v;
795     const char *errors = NULL;
796     int byteorder = 0;
797 
798     if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
799                           &str, &errors, &byteorder))
800         return NULL;
801 
802     str = PyUnicode_FromObject(str);
803     if (str == NULL)
804         return NULL;
805     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
806                                           PyUnicode_GET_SIZE(str),
807                                           errors,
808                                           byteorder),
809                     PyUnicode_GET_SIZE(str));
810     Py_DECREF(str);
811     return v;
812 }
813 
814 static PyObject *
utf_32_le_encode(PyObject * self,PyObject * args)815 utf_32_le_encode(PyObject *self,
816                  PyObject *args)
817 {
818     PyObject *str, *v;
819     const char *errors = NULL;
820 
821     if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
822                           &str, &errors))
823         return NULL;
824 
825     str = PyUnicode_FromObject(str);
826     if (str == NULL)
827         return NULL;
828     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
829                                              PyUnicode_GET_SIZE(str),
830                                              errors,
831                                              -1),
832                        PyUnicode_GET_SIZE(str));
833     Py_DECREF(str);
834     return v;
835 }
836 
837 static PyObject *
utf_32_be_encode(PyObject * self,PyObject * args)838 utf_32_be_encode(PyObject *self,
839                  PyObject *args)
840 {
841     PyObject *str, *v;
842     const char *errors = NULL;
843 
844     if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
845                           &str, &errors))
846         return NULL;
847 
848     str = PyUnicode_FromObject(str);
849     if (str == NULL)
850         return NULL;
851     v = codec_tuple(PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
852                                           PyUnicode_GET_SIZE(str),
853                                           errors,
854                                           +1),
855                     PyUnicode_GET_SIZE(str));
856     Py_DECREF(str);
857     return v;
858 }
859 
860 static PyObject *
unicode_escape_encode(PyObject * self,PyObject * args)861 unicode_escape_encode(PyObject *self,
862                      PyObject *args)
863 {
864     PyObject *str, *v;
865     const char *errors = NULL;
866 
867     if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
868                           &str, &errors))
869         return NULL;
870 
871     str = PyUnicode_FromObject(str);
872     if (str == NULL)
873         return NULL;
874     v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str),
875                                                   PyUnicode_GET_SIZE(str)),
876                     PyUnicode_GET_SIZE(str));
877     Py_DECREF(str);
878     return v;
879 }
880 
881 static PyObject *
raw_unicode_escape_encode(PyObject * self,PyObject * args)882 raw_unicode_escape_encode(PyObject *self,
883                         PyObject *args)
884 {
885     PyObject *str, *v;
886     const char *errors = NULL;
887 
888     if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
889                           &str, &errors))
890         return NULL;
891 
892     str = PyUnicode_FromObject(str);
893     if (str == NULL)
894         return NULL;
895     v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
896                                PyUnicode_AS_UNICODE(str),
897                                PyUnicode_GET_SIZE(str)),
898                     PyUnicode_GET_SIZE(str));
899     Py_DECREF(str);
900     return v;
901 }
902 
903 static PyObject *
latin_1_encode(PyObject * self,PyObject * args)904 latin_1_encode(PyObject *self,
905                PyObject *args)
906 {
907     PyObject *str, *v;
908     const char *errors = NULL;
909 
910     if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
911                           &str, &errors))
912         return NULL;
913 
914     str = PyUnicode_FromObject(str);
915     if (str == NULL)
916         return NULL;
917     v = codec_tuple(PyUnicode_EncodeLatin1(
918                                PyUnicode_AS_UNICODE(str),
919                                PyUnicode_GET_SIZE(str),
920                                errors),
921                     PyUnicode_GET_SIZE(str));
922     Py_DECREF(str);
923     return v;
924 }
925 
926 static PyObject *
ascii_encode(PyObject * self,PyObject * args)927 ascii_encode(PyObject *self,
928              PyObject *args)
929 {
930     PyObject *str, *v;
931     const char *errors = NULL;
932 
933     if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
934                           &str, &errors))
935         return NULL;
936 
937     str = PyUnicode_FromObject(str);
938     if (str == NULL)
939         return NULL;
940     v = codec_tuple(PyUnicode_EncodeASCII(
941                                PyUnicode_AS_UNICODE(str),
942                                PyUnicode_GET_SIZE(str),
943                                errors),
944                     PyUnicode_GET_SIZE(str));
945     Py_DECREF(str);
946     return v;
947 }
948 
949 static PyObject *
charmap_encode(PyObject * self,PyObject * args)950 charmap_encode(PyObject *self,
951              PyObject *args)
952 {
953     PyObject *str, *v;
954     const char *errors = NULL;
955     PyObject *mapping = NULL;
956 
957     if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
958                           &str, &errors, &mapping))
959         return NULL;
960     if (mapping == Py_None)
961         mapping = NULL;
962 
963     str = PyUnicode_FromObject(str);
964     if (str == NULL)
965         return NULL;
966     v = codec_tuple(PyUnicode_EncodeCharmap(
967                                PyUnicode_AS_UNICODE(str),
968                                PyUnicode_GET_SIZE(str),
969                                mapping,
970                                errors),
971                     PyUnicode_GET_SIZE(str));
972     Py_DECREF(str);
973     return v;
974 }
975 
976 static PyObject*
charmap_build(PyObject * self,PyObject * args)977 charmap_build(PyObject *self, PyObject *args)
978 {
979     PyObject *map;
980     if (!PyArg_ParseTuple(args, "U:charmap_build", &map))
981         return NULL;
982     return PyUnicode_BuildEncodingMap(map);
983 }
984 
985 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
986 
987 static PyObject *
mbcs_encode(PyObject * self,PyObject * args)988 mbcs_encode(PyObject *self,
989             PyObject *args)
990 {
991     PyObject *str, *v;
992     const char *errors = NULL;
993 
994     if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
995                           &str, &errors))
996         return NULL;
997 
998     str = PyUnicode_FromObject(str);
999     if (str == NULL)
1000         return NULL;
1001     v = codec_tuple(PyUnicode_EncodeMBCS(
1002                                PyUnicode_AS_UNICODE(str),
1003                                PyUnicode_GET_SIZE(str),
1004                                errors),
1005                     PyUnicode_GET_SIZE(str));
1006     Py_DECREF(str);
1007     return v;
1008 }
1009 
#endif /* MS_WINDOWS && HAVE_USABLE_WCHAR_T */
1011 #endif /* Py_USING_UNICODE */
1012 
1013 /* --- Error handler registry --------------------------------------------- */
1014 
1015 PyDoc_STRVAR(register_error__doc__,
1016 "register_error(errors, handler)\n\
1017 \n\
1018 Register the specified error handler under the name\n\
1019 errors. handler must be a callable object, that\n\
1020 will be called with an exception instance containing\n\
1021 information about the location of the encoding/decoding\n\
1022 error and must return a (replacement, new position) tuple.");
1023 
register_error(PyObject * self,PyObject * args)1024 static PyObject *register_error(PyObject *self, PyObject *args)
1025 {
1026     const char *name;
1027     PyObject *handler;
1028 
1029     if (!PyArg_ParseTuple(args, "sO:register_error",
1030                           &name, &handler))
1031         return NULL;
1032     if (PyCodec_RegisterError(name, handler))
1033         return NULL;
1034     Py_RETURN_NONE;
1035 }
1036 
1037 PyDoc_STRVAR(lookup_error__doc__,
1038 "lookup_error(errors) -> handler\n\
1039 \n\
1040 Return the error handler for the specified error handling name\n\
1041 or raise a LookupError, if no handler exists under this name.");
1042 
lookup_error(PyObject * self,PyObject * args)1043 static PyObject *lookup_error(PyObject *self, PyObject *args)
1044 {
1045     const char *name;
1046 
1047     if (!PyArg_ParseTuple(args, "s:lookup_error",
1048                           &name))
1049         return NULL;
1050     return PyCodec_LookupError(name);
1051 }
1052 
1053 /* --- Module API --------------------------------------------------------- */
1054 
/* Method table for the _codecs module; init_codecs() passes it to
   Py_InitModule().

   Each entry is {python name, C function, calling convention[, docstring]}.
   Entries that list only three initializers leave the docstring slot
   zero-initialized (i.e. no docstring). */
static PyMethodDef _codecs_functions[] = {
    /* Codec registry access and generic encode/decode.  "register" is
       METH_O: codec_register receives its single argument directly rather
       than an argument tuple. */
    {"register",                codec_register,                 METH_O,
        register__doc__},
    {"lookup",                  codec_lookup,                   METH_VARARGS,
        lookup__doc__},
    {"encode",                  codec_encode,                   METH_VARARGS,
        encode__doc__},
    {"decode",                  codec_decode,                   METH_VARARGS,
        decode__doc__},
    {"escape_encode",           escape_encode,                  METH_VARARGS},
    {"escape_decode",           escape_decode,                  METH_VARARGS},
    /* The builtin Unicode codecs are only compiled in when
       Py_USING_UNICODE is defined. */
#ifdef Py_USING_UNICODE
    {"utf_8_encode",            utf_8_encode,                   METH_VARARGS},
    {"utf_8_decode",            utf_8_decode,                   METH_VARARGS},
    {"utf_7_encode",            utf_7_encode,                   METH_VARARGS},
    {"utf_7_decode",            utf_7_decode,                   METH_VARARGS},
    {"utf_16_encode",           utf_16_encode,                  METH_VARARGS},
    {"utf_16_le_encode",        utf_16_le_encode,               METH_VARARGS},
    {"utf_16_be_encode",        utf_16_be_encode,               METH_VARARGS},
    {"utf_16_decode",           utf_16_decode,                  METH_VARARGS},
    {"utf_16_le_decode",        utf_16_le_decode,               METH_VARARGS},
    {"utf_16_be_decode",        utf_16_be_decode,               METH_VARARGS},
    {"utf_16_ex_decode",        utf_16_ex_decode,               METH_VARARGS},
    {"utf_32_encode",           utf_32_encode,                  METH_VARARGS},
    {"utf_32_le_encode",        utf_32_le_encode,               METH_VARARGS},
    {"utf_32_be_encode",        utf_32_be_encode,               METH_VARARGS},
    {"utf_32_decode",           utf_32_decode,                  METH_VARARGS},
    {"utf_32_le_decode",        utf_32_le_decode,               METH_VARARGS},
    {"utf_32_be_decode",        utf_32_be_decode,               METH_VARARGS},
    {"utf_32_ex_decode",        utf_32_ex_decode,               METH_VARARGS},
    {"unicode_escape_encode",   unicode_escape_encode,          METH_VARARGS},
    {"unicode_escape_decode",   unicode_escape_decode,          METH_VARARGS},
    {"unicode_internal_encode", unicode_internal_encode,        METH_VARARGS},
    {"unicode_internal_decode", unicode_internal_decode,        METH_VARARGS},
    {"raw_unicode_escape_encode", raw_unicode_escape_encode,    METH_VARARGS},
    {"raw_unicode_escape_decode", raw_unicode_escape_decode,    METH_VARARGS},
    {"latin_1_encode",          latin_1_encode,                 METH_VARARGS},
    {"latin_1_decode",          latin_1_decode,                 METH_VARARGS},
    {"ascii_encode",            ascii_encode,                   METH_VARARGS},
    {"ascii_decode",            ascii_decode,                   METH_VARARGS},
    {"charmap_encode",          charmap_encode,                 METH_VARARGS},
    {"charmap_decode",          charmap_decode,                 METH_VARARGS},
    {"charmap_build",           charmap_build,                  METH_VARARGS},
    {"readbuffer_encode",       readbuffer_encode,              METH_VARARGS},
    {"charbuffer_encode",       charbuffer_encode,              METH_VARARGS},
    /* The MBCS codec is guarded by the same condition as the definitions
       of mbcs_encode/mbcs_decode. */
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    {"mbcs_encode",             mbcs_encode,                    METH_VARARGS},
    {"mbcs_decode",             mbcs_decode,                    METH_VARARGS},
#endif
#endif /* Py_USING_UNICODE */
    /* Error handler registry. */
    {"register_error",          register_error,                 METH_VARARGS,
        register_error__doc__},
    {"lookup_error",            lookup_error,                   METH_VARARGS,
        lookup_error__doc__},
    {NULL, NULL}                /* sentinel */
};
1111 
1112 PyMODINIT_FUNC
init_codecs(void)1113 init_codecs(void)
1114 {
1115     Py_InitModule("_codecs", _codecs_functions);
1116 }
1117