1 /* ------------------------------------------------------------------------
2 
3    _codecs -- Provides access to the codec registry and the builtin
4               codecs.
5 
6    This module should never be imported directly. The standard library
7    module "codecs" wraps this builtin module for use within Python.
8 
9    The codec registry is accessible via:
10 
11      register(search_function) -> None
12 
13      lookup(encoding) -> CodecInfo object
14 
15    The builtin Unicode codecs use the following interface:
16 
     <encoding>_encode(Unicode_object[,errors='strict']) ->
        (bytes object, characters consumed)
19 
20      <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21         (Unicode object, bytes consumed)
22 
23    These <encoding>s are available: utf_8, unicode_escape,
24    raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
25    mbcs (on win32).
26 
27 
28 Written by Marc-Andre Lemburg (mal@lemburg.com).
29 
30 Copyright (c) Corporation for National Research Initiatives.
31 
32    ------------------------------------------------------------------------ */
33 
34 #define PY_SSIZE_T_CLEAN
35 #include "Python.h"
36 
37 #ifdef MS_WINDOWS
38 #include <windows.h>
39 #endif
40 
41 /*[clinic input]
42 module _codecs
43 [clinic start generated code]*/
44 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e1390e3da3cb9deb]*/
45 
46 #include "clinic/_codecsmodule.c.h"
47 
48 /* --- Registry ----------------------------------------------------------- */
49 
50 /*[clinic input]
51 _codecs.register
52     search_function: object
53     /
54 
55 Register a codec search function.
56 
57 Search functions are expected to take one argument, the encoding name in
58 all lower case letters, and either return None, or a tuple of functions
59 (encoder, decoder, stream_reader, stream_writer) (or a CodecInfo object).
60 [clinic start generated code]*/
61 
62 static PyObject *
_codecs_register(PyObject * module,PyObject * search_function)63 _codecs_register(PyObject *module, PyObject *search_function)
64 /*[clinic end generated code: output=d1bf21e99db7d6d3 input=369578467955cae4]*/
65 {
66     if (PyCodec_Register(search_function))
67         return NULL;
68 
69     Py_RETURN_NONE;
70 }
71 
72 /*[clinic input]
73 _codecs.lookup
74     encoding: str
75     /
76 
77 Looks up a codec tuple in the Python codec registry and returns a CodecInfo object.
78 [clinic start generated code]*/
79 
80 static PyObject *
_codecs_lookup_impl(PyObject * module,const char * encoding)81 _codecs_lookup_impl(PyObject *module, const char *encoding)
82 /*[clinic end generated code: output=9f0afa572080c36d input=3c572c0db3febe9c]*/
83 {
84     return _PyCodec_Lookup(encoding);
85 }
86 
87 /*[clinic input]
88 _codecs.encode
89     obj: object
90     encoding: str(c_default="NULL") = "utf-8"
91     errors: str(c_default="NULL") = "strict"
92 
93 Encodes obj using the codec registered for encoding.
94 
95 The default encoding is 'utf-8'.  errors may be given to set a
96 different error handling scheme.  Default is 'strict' meaning that encoding
97 errors raise a ValueError.  Other possible values are 'ignore', 'replace'
98 and 'backslashreplace' as well as any other name registered with
99 codecs.register_error that can handle ValueErrors.
100 [clinic start generated code]*/
101 
102 static PyObject *
_codecs_encode_impl(PyObject * module,PyObject * obj,const char * encoding,const char * errors)103 _codecs_encode_impl(PyObject *module, PyObject *obj, const char *encoding,
104                     const char *errors)
105 /*[clinic end generated code: output=385148eb9a067c86 input=cd5b685040ff61f0]*/
106 {
107     if (encoding == NULL)
108         encoding = PyUnicode_GetDefaultEncoding();
109 
110     /* Encode via the codec registry */
111     return PyCodec_Encode(obj, encoding, errors);
112 }
113 
114 /*[clinic input]
115 _codecs.decode
116     obj: object
117     encoding: str(c_default="NULL") = "utf-8"
118     errors: str(c_default="NULL") = "strict"
119 
120 Decodes obj using the codec registered for encoding.
121 
122 Default encoding is 'utf-8'.  errors may be given to set a
123 different error handling scheme.  Default is 'strict' meaning that encoding
124 errors raise a ValueError.  Other possible values are 'ignore', 'replace'
125 and 'backslashreplace' as well as any other name registered with
126 codecs.register_error that can handle ValueErrors.
127 [clinic start generated code]*/
128 
129 static PyObject *
_codecs_decode_impl(PyObject * module,PyObject * obj,const char * encoding,const char * errors)130 _codecs_decode_impl(PyObject *module, PyObject *obj, const char *encoding,
131                     const char *errors)
132 /*[clinic end generated code: output=679882417dc3a0bd input=7702c0cc2fa1add6]*/
133 {
134     if (encoding == NULL)
135         encoding = PyUnicode_GetDefaultEncoding();
136 
137     /* Decode via the codec registry */
138     return PyCodec_Decode(obj, encoding, errors);
139 }
140 
141 /* --- Helpers ------------------------------------------------------------ */
142 
143 /*[clinic input]
144 _codecs._forget_codec
145 
146     encoding: str
147     /
148 
149 Purge the named codec from the internal codec lookup cache
150 [clinic start generated code]*/
151 
152 static PyObject *
_codecs__forget_codec_impl(PyObject * module,const char * encoding)153 _codecs__forget_codec_impl(PyObject *module, const char *encoding)
154 /*[clinic end generated code: output=0bde9f0a5b084aa2 input=18d5d92d0e386c38]*/
155 {
156     if (_PyCodec_Forget(encoding) < 0) {
157         return NULL;
158     };
159     Py_RETURN_NONE;
160 }
161 
162 static
codec_tuple(PyObject * decoded,Py_ssize_t len)163 PyObject *codec_tuple(PyObject *decoded,
164                       Py_ssize_t len)
165 {
166     if (decoded == NULL)
167         return NULL;
168     return Py_BuildValue("Nn", decoded, len);
169 }
170 
171 /* --- String codecs ------------------------------------------------------ */
172 /*[clinic input]
173 _codecs.escape_decode
174     data: Py_buffer(accept={str, buffer})
175     errors: str(accept={str, NoneType}) = NULL
176     /
177 [clinic start generated code]*/
178 
179 static PyObject *
_codecs_escape_decode_impl(PyObject * module,Py_buffer * data,const char * errors)180 _codecs_escape_decode_impl(PyObject *module, Py_buffer *data,
181                            const char *errors)
182 /*[clinic end generated code: output=505200ba8056979a input=0018edfd99db714d]*/
183 {
184     PyObject *decoded = PyBytes_DecodeEscape(data->buf, data->len,
185                                              errors, 0, NULL);
186     return codec_tuple(decoded, data->len);
187 }
188 
189 /*[clinic input]
190 _codecs.escape_encode
191     data: object(subclass_of='&PyBytes_Type')
192     errors: str(accept={str, NoneType}) = NULL
193     /
194 [clinic start generated code]*/
195 
196 static PyObject *
_codecs_escape_encode_impl(PyObject * module,PyObject * data,const char * errors)197 _codecs_escape_encode_impl(PyObject *module, PyObject *data,
198                            const char *errors)
199 /*[clinic end generated code: output=4af1d477834bab34 input=da9ded00992f32f2]*/
200 {
201     Py_ssize_t size;
202     Py_ssize_t newsize;
203     PyObject *v;
204 
205     size = PyBytes_GET_SIZE(data);
206     if (size > PY_SSIZE_T_MAX / 4) {
207         PyErr_SetString(PyExc_OverflowError,
208             "string is too large to encode");
209             return NULL;
210     }
211     newsize = 4*size;
212     v = PyBytes_FromStringAndSize(NULL, newsize);
213 
214     if (v == NULL) {
215         return NULL;
216     }
217     else {
218         Py_ssize_t i;
219         char c;
220         char *p = PyBytes_AS_STRING(v);
221 
222         for (i = 0; i < size; i++) {
223             /* There's at least enough room for a hex escape */
224             assert(newsize - (p - PyBytes_AS_STRING(v)) >= 4);
225             c = PyBytes_AS_STRING(data)[i];
226             if (c == '\'' || c == '\\')
227                 *p++ = '\\', *p++ = c;
228             else if (c == '\t')
229                 *p++ = '\\', *p++ = 't';
230             else if (c == '\n')
231                 *p++ = '\\', *p++ = 'n';
232             else if (c == '\r')
233                 *p++ = '\\', *p++ = 'r';
234             else if (c < ' ' || c >= 0x7f) {
235                 *p++ = '\\';
236                 *p++ = 'x';
237                 *p++ = Py_hexdigits[(c & 0xf0) >> 4];
238                 *p++ = Py_hexdigits[c & 0xf];
239             }
240             else
241                 *p++ = c;
242         }
243         *p = '\0';
244         if (_PyBytes_Resize(&v, (p - PyBytes_AS_STRING(v)))) {
245             return NULL;
246         }
247     }
248 
249     return codec_tuple(v, size);
250 }
251 
252 /* --- Decoder ------------------------------------------------------------ */
253 /*[clinic input]
254 _codecs.unicode_internal_decode
255     obj: object
256     errors: str(accept={str, NoneType}) = NULL
257     /
258 [clinic start generated code]*/
259 
260 static PyObject *
_codecs_unicode_internal_decode_impl(PyObject * module,PyObject * obj,const char * errors)261 _codecs_unicode_internal_decode_impl(PyObject *module, PyObject *obj,
262                                      const char *errors)
263 /*[clinic end generated code: output=edbfe175e09eff9a input=8d57930aeda170c6]*/
264 {
265     if (PyUnicode_Check(obj)) {
266         if (PyUnicode_READY(obj) < 0)
267             return NULL;
268         Py_INCREF(obj);
269         return codec_tuple(obj, PyUnicode_GET_LENGTH(obj));
270     }
271     else {
272         Py_buffer view;
273         PyObject *result;
274         if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) != 0)
275             return NULL;
276 
277         result = codec_tuple(
278                 _PyUnicode_DecodeUnicodeInternal(view.buf, view.len, errors),
279                 view.len);
280         PyBuffer_Release(&view);
281         return result;
282     }
283 }
284 
285 /*[clinic input]
286 _codecs.utf_7_decode
287     data: Py_buffer
288     errors: str(accept={str, NoneType}) = NULL
289     final: bool(accept={int}) = False
290     /
291 [clinic start generated code]*/
292 
293 static PyObject *
_codecs_utf_7_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)294 _codecs_utf_7_decode_impl(PyObject *module, Py_buffer *data,
295                           const char *errors, int final)
296 /*[clinic end generated code: output=0cd3a944a32a4089 input=2d94a5a1f170c8ae]*/
297 {
298     Py_ssize_t consumed = data->len;
299     PyObject *decoded = PyUnicode_DecodeUTF7Stateful(data->buf, data->len,
300                                                      errors,
301                                                      final ? NULL : &consumed);
302     return codec_tuple(decoded, consumed);
303 }
304 
305 /*[clinic input]
306 _codecs.utf_8_decode
307     data: Py_buffer
308     errors: str(accept={str, NoneType}) = NULL
309     final: bool(accept={int}) = False
310     /
311 [clinic start generated code]*/
312 
313 static PyObject *
_codecs_utf_8_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)314 _codecs_utf_8_decode_impl(PyObject *module, Py_buffer *data,
315                           const char *errors, int final)
316 /*[clinic end generated code: output=10f74dec8d9bb8bf input=1ea6c21492e8bcbe]*/
317 {
318     Py_ssize_t consumed = data->len;
319     PyObject *decoded = PyUnicode_DecodeUTF8Stateful(data->buf, data->len,
320                                                      errors,
321                                                      final ? NULL : &consumed);
322     return codec_tuple(decoded, consumed);
323 }
324 
325 /*[clinic input]
326 _codecs.utf_16_decode
327     data: Py_buffer
328     errors: str(accept={str, NoneType}) = NULL
329     final: bool(accept={int}) = False
330     /
331 [clinic start generated code]*/
332 
333 static PyObject *
_codecs_utf_16_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)334 _codecs_utf_16_decode_impl(PyObject *module, Py_buffer *data,
335                            const char *errors, int final)
336 /*[clinic end generated code: output=783b442abcbcc2d0 input=2ba128c28ea0bb40]*/
337 {
338     int byteorder = 0;
339     /* This is overwritten unless final is true. */
340     Py_ssize_t consumed = data->len;
341     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
342                                                       errors, &byteorder,
343                                                       final ? NULL : &consumed);
344     return codec_tuple(decoded, consumed);
345 }
346 
347 /*[clinic input]
348 _codecs.utf_16_le_decode
349     data: Py_buffer
350     errors: str(accept={str, NoneType}) = NULL
351     final: bool(accept={int}) = False
352     /
353 [clinic start generated code]*/
354 
355 static PyObject *
_codecs_utf_16_le_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)356 _codecs_utf_16_le_decode_impl(PyObject *module, Py_buffer *data,
357                               const char *errors, int final)
358 /*[clinic end generated code: output=899b9e6364379dcd input=43aeb8b0461cace5]*/
359 {
360     int byteorder = -1;
361     /* This is overwritten unless final is true. */
362     Py_ssize_t consumed = data->len;
363     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
364                                                       errors, &byteorder,
365                                                       final ? NULL : &consumed);
366     return codec_tuple(decoded, consumed);
367 }
368 
369 /*[clinic input]
370 _codecs.utf_16_be_decode
371     data: Py_buffer
372     errors: str(accept={str, NoneType}) = NULL
373     final: bool(accept={int}) = False
374     /
375 [clinic start generated code]*/
376 
377 static PyObject *
_codecs_utf_16_be_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)378 _codecs_utf_16_be_decode_impl(PyObject *module, Py_buffer *data,
379                               const char *errors, int final)
380 /*[clinic end generated code: output=49f6465ea07669c8 input=339e554c804f34b2]*/
381 {
382     int byteorder = 1;
383     /* This is overwritten unless final is true. */
384     Py_ssize_t consumed = data->len;
385     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
386                                                       errors, &byteorder,
387                                                       final ? NULL : &consumed);
388     return codec_tuple(decoded, consumed);
389 }
390 
391 /* This non-standard version also provides access to the byteorder
392    parameter of the builtin UTF-16 codec.
393 
394    It returns a tuple (unicode, bytesread, byteorder) with byteorder
395    being the value in effect at the end of data.
396 
397 */
398 /*[clinic input]
399 _codecs.utf_16_ex_decode
400     data: Py_buffer
401     errors: str(accept={str, NoneType}) = NULL
402     byteorder: int = 0
403     final: bool(accept={int}) = False
404     /
405 [clinic start generated code]*/
406 
407 static PyObject *
_codecs_utf_16_ex_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int byteorder,int final)408 _codecs_utf_16_ex_decode_impl(PyObject *module, Py_buffer *data,
409                               const char *errors, int byteorder, int final)
410 /*[clinic end generated code: output=0f385f251ecc1988 input=3201aeddb9636889]*/
411 {
412     /* This is overwritten unless final is true. */
413     Py_ssize_t consumed = data->len;
414 
415     PyObject *decoded = PyUnicode_DecodeUTF16Stateful(data->buf, data->len,
416                                                       errors, &byteorder,
417                                                       final ? NULL : &consumed);
418     if (decoded == NULL)
419         return NULL;
420     return Py_BuildValue("Nni", decoded, consumed, byteorder);
421 }
422 
423 /*[clinic input]
424 _codecs.utf_32_decode
425     data: Py_buffer
426     errors: str(accept={str, NoneType}) = NULL
427     final: bool(accept={int}) = False
428     /
429 [clinic start generated code]*/
430 
431 static PyObject *
_codecs_utf_32_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)432 _codecs_utf_32_decode_impl(PyObject *module, Py_buffer *data,
433                            const char *errors, int final)
434 /*[clinic end generated code: output=2fc961807f7b145f input=155a5c673a4e2514]*/
435 {
436     int byteorder = 0;
437     /* This is overwritten unless final is true. */
438     Py_ssize_t consumed = data->len;
439     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
440                                                       errors, &byteorder,
441                                                       final ? NULL : &consumed);
442     return codec_tuple(decoded, consumed);
443 }
444 
445 /*[clinic input]
446 _codecs.utf_32_le_decode
447     data: Py_buffer
448     errors: str(accept={str, NoneType}) = NULL
449     final: bool(accept={int}) = False
450     /
451 [clinic start generated code]*/
452 
453 static PyObject *
_codecs_utf_32_le_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)454 _codecs_utf_32_le_decode_impl(PyObject *module, Py_buffer *data,
455                               const char *errors, int final)
456 /*[clinic end generated code: output=ec8f46b67a94f3e6 input=7baf061069e92d3b]*/
457 {
458     int byteorder = -1;
459     /* This is overwritten unless final is true. */
460     Py_ssize_t consumed = data->len;
461     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
462                                                       errors, &byteorder,
463                                                       final ? NULL : &consumed);
464     return codec_tuple(decoded, consumed);
465 }
466 
467 /*[clinic input]
468 _codecs.utf_32_be_decode
469     data: Py_buffer
470     errors: str(accept={str, NoneType}) = NULL
471     final: bool(accept={int}) = False
472     /
473 [clinic start generated code]*/
474 
475 static PyObject *
_codecs_utf_32_be_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)476 _codecs_utf_32_be_decode_impl(PyObject *module, Py_buffer *data,
477                               const char *errors, int final)
478 /*[clinic end generated code: output=ff82bae862c92c4e input=b182026300dae595]*/
479 {
480     int byteorder = 1;
481     /* This is overwritten unless final is true. */
482     Py_ssize_t consumed = data->len;
483     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
484                                                       errors, &byteorder,
485                                                       final ? NULL : &consumed);
486     return codec_tuple(decoded, consumed);
487 }
488 
489 /* This non-standard version also provides access to the byteorder
490    parameter of the builtin UTF-32 codec.
491 
492    It returns a tuple (unicode, bytesread, byteorder) with byteorder
493    being the value in effect at the end of data.
494 
495 */
496 /*[clinic input]
497 _codecs.utf_32_ex_decode
498     data: Py_buffer
499     errors: str(accept={str, NoneType}) = NULL
500     byteorder: int = 0
501     final: bool(accept={int}) = False
502     /
503 [clinic start generated code]*/
504 
505 static PyObject *
_codecs_utf_32_ex_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int byteorder,int final)506 _codecs_utf_32_ex_decode_impl(PyObject *module, Py_buffer *data,
507                               const char *errors, int byteorder, int final)
508 /*[clinic end generated code: output=6bfb177dceaf4848 input=7b9c2cb819fb237a]*/
509 {
510     Py_ssize_t consumed = data->len;
511     PyObject *decoded = PyUnicode_DecodeUTF32Stateful(data->buf, data->len,
512                                                       errors, &byteorder,
513                                                       final ? NULL : &consumed);
514     if (decoded == NULL)
515         return NULL;
516     return Py_BuildValue("Nni", decoded, consumed, byteorder);
517 }
518 
519 /*[clinic input]
520 _codecs.unicode_escape_decode
521     data: Py_buffer(accept={str, buffer})
522     errors: str(accept={str, NoneType}) = NULL
523     /
524 [clinic start generated code]*/
525 
526 static PyObject *
_codecs_unicode_escape_decode_impl(PyObject * module,Py_buffer * data,const char * errors)527 _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
528                                    const char *errors)
529 /*[clinic end generated code: output=3ca3c917176b82ab input=49fd27d06813a7f5]*/
530 {
531     PyObject *decoded = PyUnicode_DecodeUnicodeEscape(data->buf, data->len,
532                                                       errors);
533     return codec_tuple(decoded, data->len);
534 }
535 
536 /*[clinic input]
537 _codecs.raw_unicode_escape_decode
538     data: Py_buffer(accept={str, buffer})
539     errors: str(accept={str, NoneType}) = NULL
540     /
541 [clinic start generated code]*/
542 
543 static PyObject *
_codecs_raw_unicode_escape_decode_impl(PyObject * module,Py_buffer * data,const char * errors)544 _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
545                                        const char *errors)
546 /*[clinic end generated code: output=c98eeb56028070a6 input=770903a211434ebc]*/
547 {
548     PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
549                                                          errors);
550     return codec_tuple(decoded, data->len);
551 }
552 
553 /*[clinic input]
554 _codecs.latin_1_decode
555     data: Py_buffer
556     errors: str(accept={str, NoneType}) = NULL
557     /
558 [clinic start generated code]*/
559 
560 static PyObject *
_codecs_latin_1_decode_impl(PyObject * module,Py_buffer * data,const char * errors)561 _codecs_latin_1_decode_impl(PyObject *module, Py_buffer *data,
562                             const char *errors)
563 /*[clinic end generated code: output=07f3dfa3f72c7d8f input=5cad0f1759c618ec]*/
564 {
565     PyObject *decoded = PyUnicode_DecodeLatin1(data->buf, data->len, errors);
566     return codec_tuple(decoded, data->len);
567 }
568 
569 /*[clinic input]
570 _codecs.ascii_decode
571     data: Py_buffer
572     errors: str(accept={str, NoneType}) = NULL
573     /
574 [clinic start generated code]*/
575 
576 static PyObject *
_codecs_ascii_decode_impl(PyObject * module,Py_buffer * data,const char * errors)577 _codecs_ascii_decode_impl(PyObject *module, Py_buffer *data,
578                           const char *errors)
579 /*[clinic end generated code: output=2627d72058d42429 input=ad1106f64037bd16]*/
580 {
581     PyObject *decoded = PyUnicode_DecodeASCII(data->buf, data->len, errors);
582     return codec_tuple(decoded, data->len);
583 }
584 
585 /*[clinic input]
586 _codecs.charmap_decode
587     data: Py_buffer
588     errors: str(accept={str, NoneType}) = NULL
589     mapping: object = NULL
590     /
591 [clinic start generated code]*/
592 
593 static PyObject *
_codecs_charmap_decode_impl(PyObject * module,Py_buffer * data,const char * errors,PyObject * mapping)594 _codecs_charmap_decode_impl(PyObject *module, Py_buffer *data,
595                             const char *errors, PyObject *mapping)
596 /*[clinic end generated code: output=2c335b09778cf895 input=19712ca35c5a80e2]*/
597 {
598     PyObject *decoded;
599 
600     if (mapping == Py_None)
601         mapping = NULL;
602 
603     decoded = PyUnicode_DecodeCharmap(data->buf, data->len, mapping, errors);
604     return codec_tuple(decoded, data->len);
605 }
606 
607 #ifdef MS_WINDOWS
608 
609 /*[clinic input]
610 _codecs.mbcs_decode
611     data: Py_buffer
612     errors: str(accept={str, NoneType}) = NULL
613     final: bool(accept={int}) = False
614     /
615 [clinic start generated code]*/
616 
617 static PyObject *
_codecs_mbcs_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)618 _codecs_mbcs_decode_impl(PyObject *module, Py_buffer *data,
619                          const char *errors, int final)
620 /*[clinic end generated code: output=39b65b8598938c4b input=b5f2fe568f311297]*/
621 {
622     Py_ssize_t consumed = data->len;
623     PyObject *decoded = PyUnicode_DecodeMBCSStateful(data->buf, data->len,
624             errors, final ? NULL : &consumed);
625     return codec_tuple(decoded, consumed);
626 }
627 
628 /*[clinic input]
629 _codecs.oem_decode
630     data: Py_buffer
631     errors: str(accept={str, NoneType}) = NULL
632     final: bool(accept={int}) = False
633     /
634 [clinic start generated code]*/
635 
636 static PyObject *
_codecs_oem_decode_impl(PyObject * module,Py_buffer * data,const char * errors,int final)637 _codecs_oem_decode_impl(PyObject *module, Py_buffer *data,
638                         const char *errors, int final)
639 /*[clinic end generated code: output=da1617612f3fcad8 input=278709bcfd374a9c]*/
640 {
641     Py_ssize_t consumed = data->len;
642     PyObject *decoded = PyUnicode_DecodeCodePageStateful(CP_OEMCP,
643         data->buf, data->len, errors, final ? NULL : &consumed);
644     return codec_tuple(decoded, consumed);
645 }
646 
647 /*[clinic input]
648 _codecs.code_page_decode
649     codepage: int
650     data: Py_buffer
651     errors: str(accept={str, NoneType}) = NULL
652     final: bool(accept={int}) = False
653     /
654 [clinic start generated code]*/
655 
656 static PyObject *
_codecs_code_page_decode_impl(PyObject * module,int codepage,Py_buffer * data,const char * errors,int final)657 _codecs_code_page_decode_impl(PyObject *module, int codepage,
658                               Py_buffer *data, const char *errors, int final)
659 /*[clinic end generated code: output=53008ea967da3fff input=51f6169021c68dd5]*/
660 {
661     Py_ssize_t consumed = data->len;
662     PyObject *decoded = PyUnicode_DecodeCodePageStateful(codepage,
663                                                          data->buf, data->len,
664                                                          errors,
665                                                          final ? NULL : &consumed);
666     return codec_tuple(decoded, consumed);
667 }
668 
669 #endif /* MS_WINDOWS */
670 
671 /* --- Encoder ------------------------------------------------------------ */
672 
673 /*[clinic input]
674 _codecs.readbuffer_encode
675     data: Py_buffer(accept={str, buffer})
676     errors: str(accept={str, NoneType}) = NULL
677     /
678 [clinic start generated code]*/
679 
680 static PyObject *
_codecs_readbuffer_encode_impl(PyObject * module,Py_buffer * data,const char * errors)681 _codecs_readbuffer_encode_impl(PyObject *module, Py_buffer *data,
682                                const char *errors)
683 /*[clinic end generated code: output=c645ea7cdb3d6e86 input=b7c322b89d4ab923]*/
684 {
685     PyObject *result = PyBytes_FromStringAndSize(data->buf, data->len);
686     return codec_tuple(result, data->len);
687 }
688 
689 /*[clinic input]
690 _codecs.unicode_internal_encode
691     obj: object
692     errors: str(accept={str, NoneType}) = NULL
693     /
694 [clinic start generated code]*/
695 
696 static PyObject *
_codecs_unicode_internal_encode_impl(PyObject * module,PyObject * obj,const char * errors)697 _codecs_unicode_internal_encode_impl(PyObject *module, PyObject *obj,
698                                      const char *errors)
699 /*[clinic end generated code: output=a72507dde4ea558f input=8628f0280cf5ba61]*/
700 {
701     if (PyErr_WarnEx(PyExc_DeprecationWarning,
702                      "unicode_internal codec has been deprecated",
703                      1))
704         return NULL;
705 
706     if (PyUnicode_Check(obj)) {
707         Py_UNICODE *u;
708         Py_ssize_t len, size;
709 
710         if (PyUnicode_READY(obj) < 0)
711             return NULL;
712 
713         u = PyUnicode_AsUnicodeAndSize(obj, &len);
714         if (u == NULL)
715             return NULL;
716         if ((size_t)len > (size_t)PY_SSIZE_T_MAX / sizeof(Py_UNICODE))
717             return PyErr_NoMemory();
718         size = len * sizeof(Py_UNICODE);
719         return codec_tuple(PyBytes_FromStringAndSize((const char*)u, size),
720                            PyUnicode_GET_LENGTH(obj));
721     }
722     else {
723         Py_buffer view;
724         PyObject *result;
725         if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) != 0)
726             return NULL;
727         result = codec_tuple(PyBytes_FromStringAndSize(view.buf, view.len),
728                              view.len);
729         PyBuffer_Release(&view);
730         return result;
731     }
732 }
733 
734 /*[clinic input]
735 _codecs.utf_7_encode
736     str: unicode
737     errors: str(accept={str, NoneType}) = NULL
738     /
739 [clinic start generated code]*/
740 
741 static PyObject *
_codecs_utf_7_encode_impl(PyObject * module,PyObject * str,const char * errors)742 _codecs_utf_7_encode_impl(PyObject *module, PyObject *str,
743                           const char *errors)
744 /*[clinic end generated code: output=0feda21ffc921bc8 input=d1a47579e79cbe15]*/
745 {
746     return codec_tuple(_PyUnicode_EncodeUTF7(str, 0, 0, errors),
747                        PyUnicode_GET_LENGTH(str));
748 }
749 
750 /*[clinic input]
751 _codecs.utf_8_encode
752     str: unicode
753     errors: str(accept={str, NoneType}) = NULL
754     /
755 [clinic start generated code]*/
756 
757 static PyObject *
_codecs_utf_8_encode_impl(PyObject * module,PyObject * str,const char * errors)758 _codecs_utf_8_encode_impl(PyObject *module, PyObject *str,
759                           const char *errors)
760 /*[clinic end generated code: output=02bf47332b9c796c input=42e3ba73c4392eef]*/
761 {
762     return codec_tuple(_PyUnicode_AsUTF8String(str, errors),
763                        PyUnicode_GET_LENGTH(str));
764 }
765 
766 /* This version provides access to the byteorder parameter of the
767    builtin UTF-16 codecs as optional third argument. It defaults to 0
768    which means: use the native byte order and prepend the data with a
769    BOM mark.
770 
771 */
772 
773 /*[clinic input]
774 _codecs.utf_16_encode
775     str: unicode
776     errors: str(accept={str, NoneType}) = NULL
777     byteorder: int = 0
778     /
779 [clinic start generated code]*/
780 
781 static PyObject *
_codecs_utf_16_encode_impl(PyObject * module,PyObject * str,const char * errors,int byteorder)782 _codecs_utf_16_encode_impl(PyObject *module, PyObject *str,
783                            const char *errors, int byteorder)
784 /*[clinic end generated code: output=c654e13efa2e64e4 input=ff46416b04edb944]*/
785 {
786     return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, byteorder),
787                        PyUnicode_GET_LENGTH(str));
788 }
789 
790 /*[clinic input]
791 _codecs.utf_16_le_encode
792     str: unicode
793     errors: str(accept={str, NoneType}) = NULL
794     /
795 [clinic start generated code]*/
796 
797 static PyObject *
_codecs_utf_16_le_encode_impl(PyObject * module,PyObject * str,const char * errors)798 _codecs_utf_16_le_encode_impl(PyObject *module, PyObject *str,
799                               const char *errors)
800 /*[clinic end generated code: output=431b01e55f2d4995 input=cb385455ea8f2fe0]*/
801 {
802     return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, -1),
803                        PyUnicode_GET_LENGTH(str));
804 }
805 
806 /*[clinic input]
807 _codecs.utf_16_be_encode
808     str: unicode
809     errors: str(accept={str, NoneType}) = NULL
810     /
811 [clinic start generated code]*/
812 
813 static PyObject *
_codecs_utf_16_be_encode_impl(PyObject * module,PyObject * str,const char * errors)814 _codecs_utf_16_be_encode_impl(PyObject *module, PyObject *str,
815                               const char *errors)
816 /*[clinic end generated code: output=96886a6fd54dcae3 input=9119997066bdaefd]*/
817 {
818     return codec_tuple(_PyUnicode_EncodeUTF16(str, errors, +1),
819                        PyUnicode_GET_LENGTH(str));
820 }
821 
/* This version provides access to the byteorder parameter of the
   builtin UTF-32 codecs as an optional third argument. It defaults to 0,
   which means: use the native byte order and prepend a BOM (byte order
   mark) to the data.

*/
828 
829 /*[clinic input]
830 _codecs.utf_32_encode
831     str: unicode
832     errors: str(accept={str, NoneType}) = NULL
833     byteorder: int = 0
834     /
835 [clinic start generated code]*/
836 
837 static PyObject *
_codecs_utf_32_encode_impl(PyObject * module,PyObject * str,const char * errors,int byteorder)838 _codecs_utf_32_encode_impl(PyObject *module, PyObject *str,
839                            const char *errors, int byteorder)
840 /*[clinic end generated code: output=5c760da0c09a8b83 input=c5e77da82fbe5c2a]*/
841 {
842     return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, byteorder),
843                        PyUnicode_GET_LENGTH(str));
844 }
845 
846 /*[clinic input]
847 _codecs.utf_32_le_encode
848     str: unicode
849     errors: str(accept={str, NoneType}) = NULL
850     /
851 [clinic start generated code]*/
852 
853 static PyObject *
_codecs_utf_32_le_encode_impl(PyObject * module,PyObject * str,const char * errors)854 _codecs_utf_32_le_encode_impl(PyObject *module, PyObject *str,
855                               const char *errors)
856 /*[clinic end generated code: output=b65cd176de8e36d6 input=9993b25fe0877848]*/
857 {
858     return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, -1),
859                        PyUnicode_GET_LENGTH(str));
860 }
861 
862 /*[clinic input]
863 _codecs.utf_32_be_encode
864     str: unicode
865     errors: str(accept={str, NoneType}) = NULL
866     /
867 [clinic start generated code]*/
868 
869 static PyObject *
_codecs_utf_32_be_encode_impl(PyObject * module,PyObject * str,const char * errors)870 _codecs_utf_32_be_encode_impl(PyObject *module, PyObject *str,
871                               const char *errors)
872 /*[clinic end generated code: output=1d9e71a9358709e9 input=d3e0ccaa02920431]*/
873 {
874     return codec_tuple(_PyUnicode_EncodeUTF32(str, errors, +1),
875                        PyUnicode_GET_LENGTH(str));
876 }
877 
878 /*[clinic input]
879 _codecs.unicode_escape_encode
880     str: unicode
881     errors: str(accept={str, NoneType}) = NULL
882     /
883 [clinic start generated code]*/
884 
885 static PyObject *
_codecs_unicode_escape_encode_impl(PyObject * module,PyObject * str,const char * errors)886 _codecs_unicode_escape_encode_impl(PyObject *module, PyObject *str,
887                                    const char *errors)
888 /*[clinic end generated code: output=66271b30bc4f7a3c input=65d9eefca65b455a]*/
889 {
890     return codec_tuple(PyUnicode_AsUnicodeEscapeString(str),
891                        PyUnicode_GET_LENGTH(str));
892 }
893 
894 /*[clinic input]
895 _codecs.raw_unicode_escape_encode
896     str: unicode
897     errors: str(accept={str, NoneType}) = NULL
898     /
899 [clinic start generated code]*/
900 
901 static PyObject *
_codecs_raw_unicode_escape_encode_impl(PyObject * module,PyObject * str,const char * errors)902 _codecs_raw_unicode_escape_encode_impl(PyObject *module, PyObject *str,
903                                        const char *errors)
904 /*[clinic end generated code: output=a66a806ed01c830a input=5aa33e4a133391ab]*/
905 {
906     return codec_tuple(PyUnicode_AsRawUnicodeEscapeString(str),
907                        PyUnicode_GET_LENGTH(str));
908 }
909 
910 /*[clinic input]
911 _codecs.latin_1_encode
912     str: unicode
913     errors: str(accept={str, NoneType}) = NULL
914     /
915 [clinic start generated code]*/
916 
917 static PyObject *
_codecs_latin_1_encode_impl(PyObject * module,PyObject * str,const char * errors)918 _codecs_latin_1_encode_impl(PyObject *module, PyObject *str,
919                             const char *errors)
920 /*[clinic end generated code: output=2c28c83a27884e08 input=30b11c9e49a65150]*/
921 {
922     return codec_tuple(_PyUnicode_AsLatin1String(str, errors),
923                        PyUnicode_GET_LENGTH(str));
924 }
925 
926 /*[clinic input]
927 _codecs.ascii_encode
928     str: unicode
929     errors: str(accept={str, NoneType}) = NULL
930     /
931 [clinic start generated code]*/
932 
933 static PyObject *
_codecs_ascii_encode_impl(PyObject * module,PyObject * str,const char * errors)934 _codecs_ascii_encode_impl(PyObject *module, PyObject *str,
935                           const char *errors)
936 /*[clinic end generated code: output=b5e035182d33befc input=843a1d268e6dfa8e]*/
937 {
938     return codec_tuple(_PyUnicode_AsASCIIString(str, errors),
939                        PyUnicode_GET_LENGTH(str));
940 }
941 
942 /*[clinic input]
943 _codecs.charmap_encode
944     str: unicode
945     errors: str(accept={str, NoneType}) = NULL
946     mapping: object = NULL
947     /
948 [clinic start generated code]*/
949 
950 static PyObject *
_codecs_charmap_encode_impl(PyObject * module,PyObject * str,const char * errors,PyObject * mapping)951 _codecs_charmap_encode_impl(PyObject *module, PyObject *str,
952                             const char *errors, PyObject *mapping)
953 /*[clinic end generated code: output=047476f48495a9e9 input=0752cde07a6d6d00]*/
954 {
955     if (mapping == Py_None)
956         mapping = NULL;
957 
958     return codec_tuple(_PyUnicode_EncodeCharmap(str, mapping, errors),
959                        PyUnicode_GET_LENGTH(str));
960 }
961 
962 /*[clinic input]
963 _codecs.charmap_build
964     map: unicode
965     /
966 [clinic start generated code]*/
967 
968 static PyObject *
_codecs_charmap_build_impl(PyObject * module,PyObject * map)969 _codecs_charmap_build_impl(PyObject *module, PyObject *map)
970 /*[clinic end generated code: output=bb073c27031db9ac input=d91a91d1717dbc6d]*/
971 {
972     return PyUnicode_BuildEncodingMap(map);
973 }
974 
975 #ifdef MS_WINDOWS
976 
977 /*[clinic input]
978 _codecs.mbcs_encode
979     str: unicode
980     errors: str(accept={str, NoneType}) = NULL
981     /
982 [clinic start generated code]*/
983 
984 static PyObject *
_codecs_mbcs_encode_impl(PyObject * module,PyObject * str,const char * errors)985 _codecs_mbcs_encode_impl(PyObject *module, PyObject *str, const char *errors)
986 /*[clinic end generated code: output=76e2e170c966c080 input=de471e0815947553]*/
987 {
988     return codec_tuple(PyUnicode_EncodeCodePage(CP_ACP, str, errors),
989                        PyUnicode_GET_LENGTH(str));
990 }
991 
992 /*[clinic input]
993 _codecs.oem_encode
994     str: unicode
995     errors: str(accept={str, NoneType}) = NULL
996     /
997 [clinic start generated code]*/
998 
999 static PyObject *
_codecs_oem_encode_impl(PyObject * module,PyObject * str,const char * errors)1000 _codecs_oem_encode_impl(PyObject *module, PyObject *str, const char *errors)
1001 /*[clinic end generated code: output=65d5982c737de649 input=3fc5f0028aad3cda]*/
1002 {
1003     return codec_tuple(PyUnicode_EncodeCodePage(CP_OEMCP, str, errors),
1004         PyUnicode_GET_LENGTH(str));
1005 }
1006 
1007 /*[clinic input]
1008 _codecs.code_page_encode
1009     code_page: int
1010     str: unicode
1011     errors: str(accept={str, NoneType}) = NULL
1012     /
1013 [clinic start generated code]*/
1014 
1015 static PyObject *
_codecs_code_page_encode_impl(PyObject * module,int code_page,PyObject * str,const char * errors)1016 _codecs_code_page_encode_impl(PyObject *module, int code_page, PyObject *str,
1017                               const char *errors)
1018 /*[clinic end generated code: output=45673f6085657a9e input=786421ae617d680b]*/
1019 {
1020     return codec_tuple(PyUnicode_EncodeCodePage(code_page, str, errors),
1021                        PyUnicode_GET_LENGTH(str));
1022 }
1023 
1024 #endif /* MS_WINDOWS */
1025 
1026 /* --- Error handler registry --------------------------------------------- */
1027 
1028 /*[clinic input]
1029 _codecs.register_error
1030     errors: str
1031     handler: object
1032     /
1033 
1034 Register the specified error handler under the name errors.
1035 
1036 handler must be a callable object, that will be called with an exception
1037 instance containing information about the location of the encoding/decoding
1038 error and must return a (replacement, new position) tuple.
1039 [clinic start generated code]*/
1040 
1041 static PyObject *
_codecs_register_error_impl(PyObject * module,const char * errors,PyObject * handler)1042 _codecs_register_error_impl(PyObject *module, const char *errors,
1043                             PyObject *handler)
1044 /*[clinic end generated code: output=fa2f7d1879b3067d input=5e6709203c2e33fe]*/
1045 {
1046     if (PyCodec_RegisterError(errors, handler))
1047         return NULL;
1048     Py_RETURN_NONE;
1049 }
1050 
1051 /*[clinic input]
1052 _codecs.lookup_error
1053     name: str
1054     /
1055 
1056 lookup_error(errors) -> handler
1057 
1058 Return the error handler for the specified error handling name or raise a
1059 LookupError, if no handler exists under this name.
1060 [clinic start generated code]*/
1061 
1062 static PyObject *
_codecs_lookup_error_impl(PyObject * module,const char * name)1063 _codecs_lookup_error_impl(PyObject *module, const char *name)
1064 /*[clinic end generated code: output=087f05dc0c9a98cc input=4775dd65e6235aba]*/
1065 {
1066     return PyCodec_LookupError(name);
1067 }
1068 
1069 /* --- Module API --------------------------------------------------------- */
1070 
/* Method table for the _codecs module; it is referenced by the
   codecsmodule definition below.

   Every entry is a *_METHODDEF macro rather than a literal initializer.
   NOTE(review): these macros are presumably provided by the generated
   "clinic/_codecsmodule.c.h" header included near the top of the file and
   each presumably expands to one complete table entry including its
   trailing comma -- confirm against that header. */
static PyMethodDef _codecs_functions[] = {
    /* Codec registry and generic encode/decode entry points. */
    _CODECS_REGISTER_METHODDEF
    _CODECS_LOOKUP_METHODDEF
    _CODECS_ENCODE_METHODDEF
    _CODECS_DECODE_METHODDEF
    /* Builtin codecs, as encode/decode pairs. */
    _CODECS_ESCAPE_ENCODE_METHODDEF
    _CODECS_ESCAPE_DECODE_METHODDEF
    _CODECS_UTF_8_ENCODE_METHODDEF
    _CODECS_UTF_8_DECODE_METHODDEF
    _CODECS_UTF_7_ENCODE_METHODDEF
    _CODECS_UTF_7_DECODE_METHODDEF
    _CODECS_UTF_16_ENCODE_METHODDEF
    _CODECS_UTF_16_LE_ENCODE_METHODDEF
    _CODECS_UTF_16_BE_ENCODE_METHODDEF
    _CODECS_UTF_16_DECODE_METHODDEF
    _CODECS_UTF_16_LE_DECODE_METHODDEF
    _CODECS_UTF_16_BE_DECODE_METHODDEF
    _CODECS_UTF_16_EX_DECODE_METHODDEF
    _CODECS_UTF_32_ENCODE_METHODDEF
    _CODECS_UTF_32_LE_ENCODE_METHODDEF
    _CODECS_UTF_32_BE_ENCODE_METHODDEF
    _CODECS_UTF_32_DECODE_METHODDEF
    _CODECS_UTF_32_LE_DECODE_METHODDEF
    _CODECS_UTF_32_BE_DECODE_METHODDEF
    _CODECS_UTF_32_EX_DECODE_METHODDEF
    _CODECS_UNICODE_ESCAPE_ENCODE_METHODDEF
    _CODECS_UNICODE_ESCAPE_DECODE_METHODDEF
    _CODECS_UNICODE_INTERNAL_ENCODE_METHODDEF
    _CODECS_UNICODE_INTERNAL_DECODE_METHODDEF
    _CODECS_RAW_UNICODE_ESCAPE_ENCODE_METHODDEF
    _CODECS_RAW_UNICODE_ESCAPE_DECODE_METHODDEF
    _CODECS_LATIN_1_ENCODE_METHODDEF
    _CODECS_LATIN_1_DECODE_METHODDEF
    _CODECS_ASCII_ENCODE_METHODDEF
    _CODECS_ASCII_DECODE_METHODDEF
    _CODECS_CHARMAP_ENCODE_METHODDEF
    _CODECS_CHARMAP_DECODE_METHODDEF
    _CODECS_CHARMAP_BUILD_METHODDEF
    _CODECS_READBUFFER_ENCODE_METHODDEF
    /* The mbcs/oem/code_page encoder implementations in this file are
       compiled only under MS_WINDOWS, yet these entries are listed
       unconditionally. NOTE(review): presumably the macros expand to
       nothing on other platforms -- confirm in the clinic header. */
    _CODECS_MBCS_ENCODE_METHODDEF
    _CODECS_MBCS_DECODE_METHODDEF
    _CODECS_OEM_ENCODE_METHODDEF
    _CODECS_OEM_DECODE_METHODDEF
    _CODECS_CODE_PAGE_ENCODE_METHODDEF
    _CODECS_CODE_PAGE_DECODE_METHODDEF
    /* Error handler registry. */
    _CODECS_REGISTER_ERROR_METHODDEF
    _CODECS_LOOKUP_ERROR_METHODDEF
    _CODECS__FORGET_CODEC_METHODDEF
    {NULL, NULL}                /* sentinel */
};
1121 
1122 static struct PyModuleDef codecsmodule = {
1123         PyModuleDef_HEAD_INIT,
1124         "_codecs",
1125         NULL,
1126         -1,
1127         _codecs_functions,
1128         NULL,
1129         NULL,
1130         NULL,
1131         NULL
1132 };
1133 
1134 PyMODINIT_FUNC
PyInit__codecs(void)1135 PyInit__codecs(void)
1136 {
1137         return PyModule_Create(&codecsmodule);
1138 }
1139