1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3 
4 #include <stdarg.h>
5 
6 /*
7 
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 Unicode Integration Proposal. (See
11 http://www.egenix.com/files/python/unicode-proposal.txt).
12 
13 Copyright (c) Corporation for National Research Initiatives.
14 
15 
16  Original header:
17  --------------------------------------------------------------------
18 
19  * Yet another Unicode string type for Python.  This type supports the
20  * 16-bit Basic Multilingual Plane (BMP) only.
21  *
22  * Written by Fredrik Lundh, January 1999.
23  *
24  * Copyright (c) 1999 by Secret Labs AB.
25  * Copyright (c) 1999 by Fredrik Lundh.
26  *
27  * fredrik@pythonware.com
28  * http://www.pythonware.com
29  *
30  * --------------------------------------------------------------------
31  * This Unicode String Type is
32  *
33  * Copyright (c) 1999 by Secret Labs AB
34  * Copyright (c) 1999 by Fredrik Lundh
35  *
36  * By obtaining, using, and/or copying this software and/or its
37  * associated documentation, you agree that you have read, understood,
38  * and will comply with the following terms and conditions:
39  *
40  * Permission to use, copy, modify, and distribute this software and its
41  * associated documentation for any purpose and without fee is hereby
42  * granted, provided that the above copyright notice appears in all
43  * copies, and that both that copyright notice and this permission notice
44  * appear in supporting documentation, and that the name of Secret Labs
45  * AB or the author not be used in advertising or publicity pertaining to
46  * distribution of the software without specific, written prior
47  * permission.
48  *
49  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56  * -------------------------------------------------------------------- */
57 
58 #include <ctype.h>
59 
60 /* === Internal API ======================================================= */
61 
62 /* --- Internal Unicode Format -------------------------------------------- */
63 
64 /* Python 3.x requires unicode */
65 #define Py_USING_UNICODE
66 
67 #ifndef SIZEOF_WCHAR_T
68 #error Must define SIZEOF_WCHAR_T
69 #endif
70 
71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72 
73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74    Otherwise, Unicode strings are stored as UCS-2 (with limited support
75    for UTF-16) */
76 
77 #if Py_UNICODE_SIZE >= 4
78 #define Py_UNICODE_WIDE
79 #endif
80 
81 /* Set these flags if the platform has "wchar.h" and the
82    wchar_t type is a 16-bit unsigned type */
83 /* #define HAVE_WCHAR_H */
84 /* #define HAVE_USABLE_WCHAR_T */
85 
/* Py_UNICODE used to be the native Unicode storage format (code unit) of
   Python; it represents a single Unicode element in the Unicode type.
   Since PEP 393 it is deprecated and is simply a typedef to wchar_t. */

#if !defined(Py_LIMITED_API)
#  define PY_UNICODE_TYPE wchar_t
typedef PY_UNICODE_TYPE Py_UNICODE;
#endif
95 
/* When the compiler provides a wchar_t type, an attempt is made to support
   it through the interface functions PyUnicode_FromWideChar(),
   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */

/* HAVE_USABLE_WCHAR_T forces HAVE_WCHAR_H on. */
#if defined(HAVE_USABLE_WCHAR_T) && !defined(HAVE_WCHAR_H)
#  define HAVE_WCHAR_H
#endif

#if defined(HAVE_WCHAR_H)
#  if defined(_HAVE_BSDI)
/* BSDI 4.x wchar.h has a cosmetic bug; <time.h> is included first as a
   workaround (thanks to Thomas Wouters). */
#    include <time.h>
#  endif
#  include <wchar.h>
#endif
113 
/* Fixed-width typedefs for the respective Unicode representations
   (Py_UCS1, Py_UCS2 and Py_UCS4). */
typedef uint8_t  Py_UCS1;
typedef uint16_t Py_UCS2;
typedef uint32_t Py_UCS4;
119 
120 /* --- Internal Unicode Operations ---------------------------------------- */
121 
122 /* Since splitting on whitespace is an important use case, and
123    whitespace in most situations is solely ASCII whitespace, we
124    optimize for the common case by using a quick look-up table
125    _Py_ascii_whitespace (see below) with an inlined check.
126 
127  */
128 #ifndef Py_LIMITED_API
/* Values below 128 are answered from the _Py_ascii_whitespace table; all
   other values are passed on to _PyUnicode_IsWhitespace().  ch may be
   evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch)                                  \
    (((ch) >= 128U) ? _PyUnicode_IsWhitespace(ch)               \
                    : _Py_ascii_whitespace[(ch)])
131 
/* Case and line-break predicates: thin aliases for the _PyUnicode_Is*()
   helpers. */
#define Py_UNICODE_ISLOWER(ch)      _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch)      _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch)      _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch)  _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch)      _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch)      _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch)      _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch)    _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch)      _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch)    _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch)  _PyUnicode_IsPrintable(ch)

/* Numeric conversions. */
#define Py_UNICODE_TODECIMAL(ch)    _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch)      _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch)    _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch)      _PyUnicode_IsAlpha(ch)
151 
/* True when ch passes any of the alpha, decimal, digit or numeric checks.
   The checks short-circuit in that order, so ch may be evaluated up to
   four times. */
#define Py_UNICODE_ISALNUM(ch)                                  \
    (Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch) ||      \
     Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch))
157 
/* Copy length Py_UNICODE units from source to target using memcpy(). */
#define Py_UNICODE_COPY(target, source, length)                 \
    memcpy((target), (source), sizeof(Py_UNICODE) * (length))
160 
/* Store value into the first length Py_UNICODE units of target.
   Every argument is evaluated exactly once (target, then value, then
   length), so arguments with side effects are safe; the count is captured
   in n_ instead of being re-evaluated on each loop iteration. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
165 
/* Surrogate helpers.  The range predicates evaluate ch twice. */
#define Py_UNICODE_IS_SURROGATE(ch)      ((ch) >= 0xD800 && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch)  ((ch) >= 0xDC00 && (ch) <= 0xDFFF)

/* Combine a high and a low surrogate into a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)           \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |              \
      ((Py_UCS4)(low) & 0x03FF))                        \
     + 0x10000)

/* High surrogate: the top 10 bits added to D800 (after removing the
   0x10000 offset, i.e. 0xD800 - (0x10000 >> 10) == 0xD7C0). */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD7C0 + ((ch) >> 10))

/* Low surrogate: the bottom 10 bits added to DC00. */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
178 
/* Check if substring matches string at the given offset, comparing the
   wchar_t (wstr) representations.  The offset must be valid, and the
   substring must not be empty.

   The wstr pointer lives in PyASCIIObject while wstr_length lives in
   PyCompactUnicodeObject, so no single pointer type offers both members
   directly.  The buffers are therefore reached through an explicit
   PyASCIIObject cast and the length through PyUnicode_WSTR_LENGTH().
   The first and last units are compared before falling back to memcmp().
   Arguments are evaluated more than once.

   NOTE(review): the wstr fields are read as-is; this assumes both objects
   already have their wstr representation -- confirm at call sites. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*(((PyASCIIObject *)(string))->wstr + (offset)) == \
      *(((PyASCIIObject *)(substring))->wstr)) && \
     (*(((PyASCIIObject *)(string))->wstr + (offset) + \
        PyUnicode_WSTR_LENGTH(substring) - 1) == \
      *(((PyASCIIObject *)(substring))->wstr + \
        PyUnicode_WSTR_LENGTH(substring) - 1)) && \
     !memcmp(((PyASCIIObject *)(string))->wstr + (offset), \
             ((PyASCIIObject *)(substring))->wstr, \
             PyUnicode_WSTR_LENGTH(substring) * sizeof(Py_UNICODE)))
186 
187 #endif /* Py_LIMITED_API */
188 
189 #ifdef __cplusplus
190 extern "C" {
191 #endif
192 
193 /* --- Unicode Type ------------------------------------------------------- */
194 
195 #ifndef Py_LIMITED_API
196 
197 /* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
198    structure. state.ascii and state.compact are set, and the data
199    immediately follow the structure. utf8_length and wstr_length can be found
200    in the length field; the utf8 pointer is equal to the data pointer. */
201 typedef struct {
202     /* There are 4 forms of Unicode strings:
203 
204        - compact ascii:
205 
206          * structure = PyASCIIObject
207          * test: PyUnicode_IS_COMPACT_ASCII(op)
208          * kind = PyUnicode_1BYTE_KIND
209          * compact = 1
210          * ascii = 1
211          * ready = 1
212          * (length is the length of the utf8 and wstr strings)
213          * (data starts just after the structure)
214          * (since ASCII is decoded from UTF-8, the utf8 string are the data)
215 
216        - compact:
217 
218          * structure = PyCompactUnicodeObject
219          * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
220          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
221            PyUnicode_4BYTE_KIND
222          * compact = 1
223          * ready = 1
224          * ascii = 0
225          * utf8 is not shared with data
226          * utf8_length = 0 if utf8 is NULL
227          * wstr is shared with data and wstr_length=length
228            if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
229            or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
230          * wstr_length = 0 if wstr is NULL
231          * (data starts just after the structure)
232 
233        - legacy string, not ready:
234 
235          * structure = PyUnicodeObject
236          * test: kind == PyUnicode_WCHAR_KIND
237          * length = 0 (use wstr_length)
238          * hash = -1
239          * kind = PyUnicode_WCHAR_KIND
240          * compact = 0
241          * ascii = 0
242          * ready = 0
243          * interned = SSTATE_NOT_INTERNED
244          * wstr is not NULL
245          * data.any is NULL
246          * utf8 is NULL
247          * utf8_length = 0
248 
249        - legacy string, ready:
250 
251          * structure = PyUnicodeObject structure
252          * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
253          * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
254            PyUnicode_4BYTE_KIND
255          * compact = 0
256          * ready = 1
257          * data.any is not NULL
258          * utf8 is shared and utf8_length = length with data.any if ascii = 1
259          * utf8_length = 0 if utf8 is NULL
260          * wstr is shared with data.any and wstr_length = length
261            if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
262            or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_4)=4
263          * wstr_length = 0 if wstr is NULL
264 
265        Compact strings use only one memory block (structure + characters),
266        whereas legacy strings use one block for the structure and one block
267        for characters.
268 
269        Legacy strings are created by PyUnicode_FromUnicode() and
270        PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
271        when PyUnicode_READY() is called.
272 
273        See also _PyUnicode_CheckConsistency().
274     */
275     PyObject_HEAD
276     Py_ssize_t length;          /* Number of code points in the string */
277     Py_hash_t hash;             /* Hash value; -1 if not set */
278     struct {
279         /*
280            SSTATE_NOT_INTERNED (0)
281            SSTATE_INTERNED_MORTAL (1)
282            SSTATE_INTERNED_IMMORTAL (2)
283 
284            If interned != SSTATE_NOT_INTERNED, the two references from the
285            dictionary to this object are *not* counted in ob_refcnt.
286          */
287         unsigned int interned:2;
288         /* Character size:
289 
290            - PyUnicode_WCHAR_KIND (0):
291 
292              * character type = wchar_t (16 or 32 bits, depending on the
293                platform)
294 
295            - PyUnicode_1BYTE_KIND (1):
296 
297              * character type = Py_UCS1 (8 bits, unsigned)
298              * all characters are in the range U+0000-U+00FF (latin1)
299              * if ascii is set, all characters are in the range U+0000-U+007F
300                (ASCII), otherwise at least one character is in the range
301                U+0080-U+00FF
302 
303            - PyUnicode_2BYTE_KIND (2):
304 
305              * character type = Py_UCS2 (16 bits, unsigned)
306              * all characters are in the range U+0000-U+FFFF (BMP)
307              * at least one character is in the range U+0100-U+FFFF
308 
309            - PyUnicode_4BYTE_KIND (4):
310 
311              * character type = Py_UCS4 (32 bits, unsigned)
312              * all characters are in the range U+0000-U+10FFFF
313              * at least one character is in the range U+10000-U+10FFFF
314          */
315         unsigned int kind:3;
316         /* Compact is with respect to the allocation scheme. Compact unicode
317            objects only require one memory block while non-compact objects use
318            one block for the PyUnicodeObject struct and another for its data
319            buffer. */
320         unsigned int compact:1;
321         /* The string only contains characters in the range U+0000-U+007F (ASCII)
322            and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
323            set, use the PyASCIIObject structure. */
324         unsigned int ascii:1;
325         /* The ready flag indicates whether the object layout is initialized
326            completely. This means that this is either a compact object, or
327            the data pointer is filled out. The bit is redundant, and helps
328            to minimize the test in PyUnicode_IS_READY(). */
329         unsigned int ready:1;
330         /* Padding to ensure that PyUnicode_DATA() is always aligned to
331            4 bytes (see issue #19537 on m68k). */
332         unsigned int :24;
333     } state;
334     wchar_t *wstr;              /* wchar_t representation (null-terminated) */
335 } PyASCIIObject;
336 
337 /* Non-ASCII strings allocated through PyUnicode_New use the
338    PyCompactUnicodeObject structure. state.compact is set, and the data
339    immediately follow the structure. */
340 typedef struct {
341     PyASCIIObject _base;
342     Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
343                                  * terminating \0. */
344     char *utf8;                 /* UTF-8 representation (null-terminated) */
345     Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
346                                  * surrogates count as two code points. */
347 } PyCompactUnicodeObject;
348 
349 /* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
350    PyUnicodeObject structure. The actual string data is initially in the wstr
351    block, and copied into the data block using _PyUnicode_Ready. */
352 typedef struct {
353     PyCompactUnicodeObject _base;
354     union {
355         void *any;
356         Py_UCS1 *latin1;
357         Py_UCS2 *ucs2;
358         Py_UCS4 *ucs4;
359     } data;                     /* Canonical, smallest-form Unicode buffer */
360 } PyUnicodeObject;
361 #endif
362 
363 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
364 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
365 
366 #define PyUnicode_Check(op) \
367                  PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
368 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
369 
370 /* Fast access macros */
371 #ifndef Py_LIMITED_API
372 
/* Length of the wstr representation: the length field for compact ASCII
   strings, the wstr_length field otherwise.
   op is parenthesized inside the casts so that an argument which is an
   expression (e.g. a conditional) is cast as a whole instead of having the
   cast bind to its first operand only. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
     ((PyASCIIObject *)(op))->length :                 \
     ((PyCompactUnicodeObject *)(op))->wstr_length)
377 
/* Size of the deprecated Py_UNICODE representation in code units (a
   surrogate pair counts as 2 units).  When that representation is not
   available yet, it is computed on request through PyUnicode_AsUnicode().
   For the length in code points use PyUnicode_GET_LENGTH(). */

#define PyUnicode_GET_SIZE(op)                          \
    (assert(PyUnicode_Check(op)),                       \
     !(((PyASCIIObject *)(op))->wstr) ?                 \
      ((void)PyUnicode_AsUnicode((PyObject *)(op)),     \
       assert(((PyASCIIObject *)(op))->wstr),           \
       PyUnicode_WSTR_LENGTH(op)) :                     \
      PyUnicode_WSTR_LENGTH(op))

/* PyUnicode_GET_SIZE() multiplied by Py_UNICODE_SIZE. */
#define PyUnicode_GET_DATA_SIZE(op) \
    (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
393 
/* Alias for PyUnicode_AsUnicode(): the existing wstr pointer is returned
   when there is one, otherwise the wchar_t/Py_UNICODE representation is
   created on demand.  Using this macro is very inefficient now; try to
   port your code to the new PyUnicode_*BYTE_DATA() macros or use
   PyUnicode_WRITE() and PyUnicode_READ(). */

#define PyUnicode_AS_UNICODE(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     !(((PyASCIIObject *)(op))->wstr) ?                 \
      PyUnicode_AsUnicode((PyObject *)(op)) :           \
      (((PyASCIIObject *)(op))->wstr))

/* The PyUnicode_AS_UNICODE() buffer viewed as const char *. */
#define PyUnicode_AS_DATA(op) \
    ((const char *)(PyUnicode_AS_UNICODE(op)))
406 
407 
408 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
409 
/* Values stored in PyASCIIObject.state: */

/* -- interning state (state.interned) -- */
#define SSTATE_NOT_INTERNED      0
#define SSTATE_INTERNED_MORTAL   1
#define SSTATE_INTERNED_IMMORTAL 2
416 
/* Return true if the string contains only ASCII characters, or 0 if not.
   The string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must
   be ready.
   op is parenthesized inside the cast so that an expression argument is
   cast as a whole. */
#define PyUnicode_IS_ASCII(op)                   \
    (assert(PyUnicode_Check(op)),                \
     assert(PyUnicode_IS_READY(op)),             \
     ((PyASCIIObject *)(op))->state.ascii)
424 
/* Value of state.compact: true for a compact string, 0 otherwise.
   Neither a type check nor a Ready call is performed. */
#define PyUnicode_IS_COMPACT(op) (((PyASCIIObject *)(op))->state.compact)
429 
/* Return true if the string is a compact ASCII string (use PyASCIIObject
   structure), or 0 if not.  No type checks or Ready calls are performed.
   op is parenthesized inside the cast so that an expression argument is
   cast as a whole. */
#define PyUnicode_IS_COMPACT_ASCII(op)                 \
    (((PyASCIIObject *)(op))->state.ascii && PyUnicode_IS_COMPACT(op))
434 
enum PyUnicode_Kind {
    /* The string contains only wstr byte characters.  This is only
       possible when the string was created with a legacy API and
       _PyUnicode_Ready() has not been called yet. */
    PyUnicode_WCHAR_KIND = 0,

    /* Return values of the PyUnicode_KIND() macro: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
445 
/* Typed views of the canonical representation, for direct character
   access: the PyUnicode_DATA() pointer cast to Py_UCS1 *, Py_UCS2 * or
   Py_UCS4 *.  No checks are performed; use PyUnicode_KIND() beforehand to
   make sure the chosen view is the right one. */

#define PyUnicode_1BYTE_DATA(op)    ((Py_UCS1 *)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op)    ((Py_UCS2 *)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op)    ((Py_UCS4 *)PyUnicode_DATA(op))
454 
/* Value of state.kind: one of the PyUnicode_*_KIND values defined above.
   In debug builds the object is asserted to be a ready Unicode object. */
#define PyUnicode_KIND(op)                      \
    (assert(PyUnicode_Check(op)),               \
     assert(PyUnicode_IS_READY(op)),            \
     ((PyASCIIObject *)(op))->state.kind)
460 
/* Return a void pointer to the raw unicode buffer. */

/* Compact strings: the buffer starts right behind the structure, which is
   PyASCIIObject for ASCII strings and PyCompactUnicodeObject otherwise. */
#define _PyUnicode_COMPACT_DATA(op)                     \
    (!PyUnicode_IS_ASCII(op) ?                          \
     ((void *)((PyCompactUnicodeObject *)(op) + 1)) :   \
     ((void *)((PyASCIIObject *)(op) + 1)))

/* Non-compact strings: the buffer is the data.any pointer, which is
   asserted to be set. */
#define _PyUnicode_NONCOMPACT_DATA(op)                  \
    (assert(((PyUnicodeObject *)(op))->data.any),       \
     ((((PyUnicodeObject *)(op))->data.any)))

/* Dispatch on state.compact. */
#define PyUnicode_DATA(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     !PyUnicode_IS_COMPACT(op) ?                        \
      _PyUnicode_NONCOMPACT_DATA(op) :                  \
      _PyUnicode_COMPACT_DATA(op))
475 
/* In the access macros below, "kind" may be evaluated more than once.
   All other macro parameters are evaluated exactly once, so it is safe
   to put side effects into them (such as increasing the index). */

/* Store the code point value at position index (starting at 0) of the
   canonical representation.  No sanity checks are done; the macro is meant
   for loops in which the caller has cached the kind and the data pointer
   obtained from other macro calls.  Any kind other than 1BYTE and 2BYTE is
   handled as 4BYTE (asserted). */
#define PyUnicode_WRITE(kind, data, index, value)                       \
    do {                                                                \
        switch ((kind)) {                                               \
        case PyUnicode_1BYTE_KIND:                                      \
            ((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value);            \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value);            \
            break;                                                      \
        default:                                                        \
            assert((kind) == PyUnicode_4BYTE_KIND);                     \
            ((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value);            \
            break;                                                      \
        }                                                               \
    } while (0)
502 
/* Fetch the code point at index from the canonical representation as a
   Py_UCS4.  Neither checks nor ready calls are performed; any kind other
   than 1BYTE and 2BYTE is read as 4BYTE. */
#define PyUnicode_READ(kind, data, index)                               \
    ((Py_UCS4)(                                                         \
        (kind) == PyUnicode_1BYTE_KIND ?                                \
            ((const Py_UCS1 *)(data))[(index)] :                        \
        (kind) == PyUnicode_2BYTE_KIND ?                                \
            ((const Py_UCS2 *)(data))[(index)] :                        \
            ((const Py_UCS4 *)(data))[(index)]))
514 
/* Read the code point at index straight from a Unicode object.
   PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
   calls PyUnicode_KIND() and might call it twice.  Use it for single
   reads; for multiple consecutive reads, cache the kind and use
   PyUnicode_READ() instead. */
#define PyUnicode_READ_CHAR(unicode, index)                             \
    (assert(PyUnicode_Check(unicode)),                                  \
     assert(PyUnicode_IS_READY(unicode)),                               \
     (Py_UCS4)(                                                         \
        PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ?             \
            ((const Py_UCS1 *)(PyUnicode_DATA((unicode))))[(index)] :   \
        PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ?             \
            ((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] :   \
            ((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)]))
530 
/* Length of the unicode string, taken from the length field.  The caller
   has to make sure that the string has its canonical representation set
   before using this macro; call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op)                        \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     ((PyASCIIObject *)(op))->length)
538 
539 
/* Fast check to determine whether an object is ready.  Equivalent to
   (PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any).
   op is parenthesized inside the cast so that an expression argument is
   cast as a whole. */

#define PyUnicode_IS_READY(op) (((PyASCIIObject *)(op))->state.ready)
544 
/* Make sure the canonical representation is set.  In the best case this
   does less work than _PyUnicode_Ready(): that function is only called
   when the object is not ready yet.
   Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op)                             \
    (assert(PyUnicode_Check(op)),                       \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready((PyObject *)(op)) : 0))
553 
/* Return a maximum character value which is suitable for creating another
   string based on op: 0x7f for ASCII strings, otherwise the largest value
   of the string's kind.  This is always an approximation but more
   efficient than iterating over the string. */
#define PyUnicode_MAX_CHAR_VALUE(op)                                    \
    (assert(PyUnicode_IS_READY(op)),                                    \
     (PyUnicode_IS_ASCII(op) ? (0x7f) :                                 \
      PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? (0xffU) :            \
      PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? (0xffffU) :          \
      (0x10ffffU)))
566 
567 #endif
568 
569 /* --- Constants ---------------------------------------------------------- */
570 
/* Replacement character used during decoding when the errors argument is
   set to "replace".  Note: U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER    ((Py_UCS4) 0xFFFD)
577 
578 /* === Public API ========================================================= */
579 
580 /* --- Plain Py_UNICODE --------------------------------------------------- */
581 
582 /* With PEP 393, this is the recommended way to allocate a new unicode object.
583    This function will allocate the object and its buffer in a single memory
584    block.  Objects created using this function are not resizable. */
585 #ifndef Py_LIMITED_API
586 PyAPI_FUNC(PyObject*) PyUnicode_New(
587     Py_ssize_t size,            /* Number of code points in the new string */
588     Py_UCS4 maxchar             /* maximum code point value in the string */
589     );
590 #endif
591 
592 /* Initializes the canonical string representation from the deprecated
593    wstr/Py_UNICODE representation. This function is used to convert Unicode
594    objects which were created using the old API to the new flexible format
595    introduced with PEP 393.
596 
597    Don't call this function directly, use the public PyUnicode_READY() macro
598    instead. */
599 #ifndef Py_LIMITED_API
600 PyAPI_FUNC(int) _PyUnicode_Ready(
601     PyObject *unicode           /* Unicode object */
602     );
603 #endif
604 
605 /* Get a copy of a Unicode string. */
606 #ifndef Py_LIMITED_API
607 PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
608     PyObject *unicode
609     );
610 #endif
611 
612 /* Copy character from one unicode object into another, this function performs
613    character conversion when necessary and falls back to memcpy() if possible.
614 
615    Fail if to is too small (smaller than *how_many* or smaller than
616    len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
617    kind(to), or if *to* has more than 1 reference.
618 
619    Return the number of written character, or return -1 and raise an exception
620    on error.
621 
622    Pseudo-code:
623 
624        how_many = min(how_many, len(from) - from_start)
625        to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
626        return how_many
627 
628    Note: The function doesn't write a terminating null character.
629    */
630 #ifndef Py_LIMITED_API
631 PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
632     PyObject *to,
633     Py_ssize_t to_start,
634     PyObject *from,
635     Py_ssize_t from_start,
636     Py_ssize_t how_many
637     );
638 
639 /* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
640    may crash if parameters are invalid (e.g. if the output string
641    is too short). */
642 PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
643     PyObject *to,
644     Py_ssize_t to_start,
645     PyObject *from,
646     Py_ssize_t from_start,
647     Py_ssize_t how_many
648     );
649 #endif
650 
651 #ifndef Py_LIMITED_API
652 /* Fill a string with a character: write fill_char into
653    unicode[start:start+length].
654 
655    Fail if fill_char is bigger than the string maximum character, or if the
656    string has more than 1 reference.
657 
658    Return the number of written character, or return -1 and raise an exception
659    on error. */
660 PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
661     PyObject *unicode,
662     Py_ssize_t start,
663     Py_ssize_t length,
664     Py_UCS4 fill_char
665     );
666 
667 /* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
668    if parameters are invalid (e.g. if length is longer than the string). */
669 PyAPI_FUNC(void) _PyUnicode_FastFill(
670     PyObject *unicode,
671     Py_ssize_t start,
672     Py_ssize_t length,
673     Py_UCS4 fill_char
674     );
675 #endif
676 
677 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
678    size.
679 
680    u may be NULL which causes the contents to be undefined. It is the
681    user's responsibility to fill in the needed data afterwards. Note
682    that modifying the Unicode object contents after construction is
683    only allowed if u was set to NULL.
684 
685    The buffer is copied into the new object. */
686 
687 #ifndef Py_LIMITED_API
688 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
689     const Py_UNICODE *u,        /* Unicode buffer */
690     Py_ssize_t size             /* size of buffer */
691     );
692 #endif
693 
694 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */
695 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
696     const char *u,             /* UTF-8 encoded string */
697     Py_ssize_t size            /* size of buffer */
698     );
699 
700 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
701    UTF-8 encoded bytes.  The size is determined with strlen(). */
702 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
703     const char *u              /* UTF-8 encoded string */
704     );
705 
706 #ifndef Py_LIMITED_API
707 /* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
708    Scan the string to find the maximum character. */
709 PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
710     int kind,
711     const void *buffer,
712     Py_ssize_t size);
713 
714 /* Create a new string from a buffer of ASCII characters.
715    WARNING: Don't check if the string contains any non-ASCII character. */
716 PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
717     const char *buffer,
718     Py_ssize_t size);
719 #endif
720 
721 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
722 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
723     PyObject *str,
724     Py_ssize_t start,
725     Py_ssize_t end);
726 #endif
727 
728 #ifndef Py_LIMITED_API
729 /* Compute the maximum character of the substring unicode[start:end].
730    Return 127 for an empty string. */
731 PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
732     PyObject *unicode,
733     Py_ssize_t start,
734     Py_ssize_t end);
735 #endif
736 
737 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
738 /* Copy the string into a UCS4 buffer including the null character if copy_null
739    is set. Return NULL and raise an exception on error. Raise a SystemError if
740    the buffer is smaller than the string. Return buffer on success.
741 
742    buflen is the length of the buffer in (Py_UCS4) characters. */
743 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
744     PyObject *unicode,
745     Py_UCS4* buffer,
746     Py_ssize_t buflen,
747     int copy_null);
748 
749 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
750  * PyMem_Malloc; if this fails, NULL is returned with a memory error
751    exception set. */
752 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
753 #endif
754 
755 /* Return a read-only pointer to the Unicode object's internal
756    Py_UNICODE buffer.
757    If the wchar_t/Py_UNICODE representation is not yet available, this
758    function will calculate it. */
759 
760 #ifndef Py_LIMITED_API
761 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
762     PyObject *unicode           /* Unicode object */
763     );
764 #endif
765 
766 /* Return a read-only pointer to the Unicode object's internal
767    Py_UNICODE buffer and store the length in *size.
768    If the wchar_t/Py_UNICODE representation is not yet available, this
769    function will calculate it. */
770 
771 #ifndef Py_LIMITED_API  /* not part of the limited API */
772 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
773     PyObject *unicode,          /* Unicode object */
774     Py_ssize_t *size            /* location where the length is saved */
775     );
776 #endif /* !Py_LIMITED_API */
777 
778 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 /* also declared for limited API >= 0x03030000 */
779 /* Get the length of the Unicode object (presumably counted in characters rather than Py_UNICODE units -- confirm). */
780 
781 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
782     PyObject *unicode           /* Unicode object */
783 );
784 #endif
785 
786 /* Get the number of Py_UNICODE units in the
787    string representation. NOTE(review): presumably differs from the character count when a character needs several units -- confirm. */
788 
789 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
790     PyObject *unicode           /* Unicode object */
791     );
792 
793 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 /* also declared for limited API >= 0x03030000 */
794 /* Read the character at position index of the string. */
795 
796 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
797     PyObject *unicode,          /* string to read from */
798     Py_ssize_t index            /* position of the character */
799     );
800 
801 /* Write a character to the string. The string must have been created through
802    PyUnicode_New, must not be shared, and must not have been hashed yet.
803 
804    Return 0 on success, -1 on error. */
805 
806 PyAPI_FUNC(int) PyUnicode_WriteChar(
807     PyObject *unicode,          /* string to modify */
808     Py_ssize_t index,           /* position to write at */
809     Py_UCS4 character           /* character to store */
810     );
811 #endif
812 
813 #ifndef Py_LIMITED_API  /* not part of the limited API */
814 /* Get the maximum ordinal for a Unicode character, returned as a Py_UNICODE value. */
815 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
816 #endif /* !Py_LIMITED_API */
817 
818 /* Resize a Unicode object. The length is the number of characters, except
819    if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
820    is the number of Py_UNICODE characters.
821 
822    On success, *unicode is modified to point to the new (resized) object and 0
823    is returned.
824 
825    The function tries to resize the string in place (which is usually faster than
826    allocating a new string and copying the characters), or creates a new string.
827 
828    Error handling is implemented as follows: an exception is set, -1
829    is returned and *unicode is left untouched.
830 
831    WARNING: The function doesn't check the string content, so the result may not be a
832             string in canonical representation. */
833 
834 PyAPI_FUNC(int) PyUnicode_Resize(
835     PyObject **unicode,         /* Pointer to the Unicode object */
836     Py_ssize_t length           /* New length */
837     );
838 
839 /* Decode obj to a Unicode object.
840 
841    bytes, bytearray and other bytes-like objects are decoded according to the
842    given encoding and error handler. The encoding and error handler can be
843    NULL to have the interface use UTF-8 and "strict".
844 
845    All other objects (including Unicode objects) raise an exception.
846 
847    The API returns NULL in case of an error. The caller is responsible
848    for decref'ing the returned object.
849 
850 */
851 
852 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
853     PyObject *obj,              /* Object to decode */
854     const char *encoding,       /* encoding, or NULL for UTF-8 */
855     const char *errors          /* error handling, or NULL for "strict" */
856     );
857 
858 /* Copy an instance of a Unicode subtype to a new true Unicode object if
859    necessary. If obj is already a true Unicode object (not a subtype), return
860    obj itself with its refcount *incremented*.
861 
862    The API returns NULL in case of an error. The caller is responsible
863    for decref'ing the returned object.
864 
865 */
866 
867 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
868     PyObject *obj      /* Unicode object or instance of a Unicode subtype */
869     );
870 
871 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(  /* va_list counterpart of PyUnicode_FromFormat */
872     const char *format,   /* ASCII-encoded format string  */
873     va_list vargs         /* presumably the arguments consumed by format -- confirm */
874     );
875 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(   /* variadic counterpart of PyUnicode_FromFormatV */
876     const char *format,   /* ASCII-encoded format string  */
877     ...
878     );
879 
880 #ifndef Py_LIMITED_API
881 typedef struct {            /* state of a Unicode writer; the field layout must be kept as is */
882     PyObject *buffer;           /* string held by the writer; see readonly below */
883     void *data;                 /* presumably the character storage of buffer -- confirm */
884     enum PyUnicode_Kind kind;   /* compared against KIND by _PyUnicodeWriter_PrepareKind() */
885     Py_UCS4 maxchar;            /* compared against MAXCHAR by _PyUnicodeWriter_Prepare() */
886     Py_ssize_t size;            /* _PyUnicodeWriter_Prepare() treats size - pos as the room left */
887     Py_ssize_t pos;             /* presumably the current write position -- confirm */
888 
889     /* minimum number of allocated characters (default: 0) */
890     Py_ssize_t min_length;
891 
892     /* minimum character (default: 127, ASCII) */
893     Py_UCS4 min_char;
894 
895     /* If non-zero, overallocate the buffer (default: 0). */
896     unsigned char overallocate;
897 
898     /* If readonly is 1, buffer is a shared string (it cannot be modified)
899        and size is set to 0. */
900     unsigned char readonly;
901 } _PyUnicodeWriter ;
902 
903 /* Initialize a Unicode writer.
904  *
905  * By default, the minimum buffer size is 0 characters and overallocation is
906  * disabled. Set the min_length, min_char and overallocate attributes to control
907  * the allocation of the buffer. */
908 PyAPI_FUNC(void)
909 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
910 
911 /* Prepare the buffer to write 'length' characters
912    with the specified maximum character. The macro arguments may be evaluated more than once.
913    Fast path: evaluates to 0 without calling _PyUnicodeWriter_PrepareInternal() when MAXCHAR <= WRITER->maxchar and LENGTH <= WRITER->size - WRITER->pos, or when LENGTH == 0.
914    Return 0 on success, raise an exception and return -1 on error. */
915 #define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
916     (((MAXCHAR) <= (WRITER)->maxchar                                  \
917       && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
918      ? 0                                                              \
919      : (((LENGTH) == 0)                                               \
920         ? 0                                                           \
921         : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
922 
923 /* Slow path of the _PyUnicodeWriter_Prepare() macro. Don't call this function directly, use the macro
924    instead. */
925 PyAPI_FUNC(int)
926 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
927                                  Py_ssize_t length, Py_UCS4 maxchar);
928 
929 /* Prepare the buffer to have at least the kind KIND.
930    For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
931    support characters in range U+0000-U+FFFF.
932    KIND must not be PyUnicode_WCHAR_KIND (checked with assert); KIND may be evaluated more than once.
933    Return 0 on success, raise an exception and return -1 on error. */
934 #define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
935     (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
936      (KIND) <= (WRITER)->kind                                         \
937      ? 0                                                              \
938      : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
939 
940 /* Slow path of the _PyUnicodeWriter_PrepareKind() macro. Don't call this function directly, use the
941    macro instead. */
942 PyAPI_FUNC(int)
943 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
944                                      enum PyUnicode_Kind kind);
945 
946 /* Append the Unicode character ch to the writer.
947    Return 0 on success, raise an exception and return -1 on error. */
948 PyAPI_FUNC(int)
949 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
950     Py_UCS4 ch                  /* character to append */
951     );
952 
953 /* Append the Unicode string str to the writer.
954    Return 0 on success, raise an exception and return -1 on error. */
955 PyAPI_FUNC(int)
956 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
957     PyObject *str               /* Unicode string to append */
958     );
959 
960 /* Append a substring of the Unicode string str to the writer.
961    Return 0 on success, raise an exception and return -1 on error. */
962 PyAPI_FUNC(int)
963 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
964     PyObject *str,              /* Unicode string */
965     Py_ssize_t start,           /* presumably the first index of the substring -- confirm */
966     Py_ssize_t end              /* presumably the index just past the substring -- confirm */
967     );
968 
969 /* Append an ASCII-encoded byte string to the writer; len may be -1 if the length is unknown.
970    Return 0 on success, raise an exception and return -1 on error. */
971 PyAPI_FUNC(int)
972 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
973     const char *str,           /* ASCII-encoded byte string */
974     Py_ssize_t len             /* number of bytes, or -1 if unknown */
975     );
976 
977 /* Append a latin1-encoded byte string of len bytes to the writer.
978    Return 0 on success, raise an exception and return -1 on error. */
979 PyAPI_FUNC(int)
980 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
981     const char *str,           /* latin1-encoded byte string */
982     Py_ssize_t len             /* length in bytes */
983     );
984 
985 /* Get the value of the writer as a Unicode string and clear the
986    buffer of the writer. On error, raise an exception and return NULL.
987    */
988 PyAPI_FUNC(PyObject *)
989 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
990 
991 /* Deallocate the memory of a writer by clearing its internal buffer. */
992 PyAPI_FUNC(void)
993 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
994 #endif
995 
996 #ifndef Py_LIMITED_API  /* not part of the limited API */
997 /* Format the object based on the format_spec, as defined in PEP 3101
998    (Advanced String Formatting). NOTE(review): the int result and the role of start/end are not described here -- confirm against the implementation. */
999 PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
1000     _PyUnicodeWriter *writer,  /* presumably receives the formatted output -- confirm */
1001     PyObject *obj,             /* object to format */
1002     PyObject *format_spec,     /* format specification */
1003     Py_ssize_t start,
1004     Py_ssize_t end);
1005 #endif /* !Py_LIMITED_API */
1006 
1007 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);    /* takes the address of an object pointer; presumably may replace it -- confirm */
1008 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);   /* same signature as PyUnicode_InternInPlace */
1009 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
1010     const char *u              /* UTF-8 encoded string */
1011     );
1012 #ifndef Py_LIMITED_API  /* not part of the limited API */
1013 PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
1014 #endif /* !Py_LIMITED_API */
1015 
1016 /* Evaluate to the state.interned field of op. op is cast to PyASCIIObject * without any type check: use only if you know it's a string */
1017 #define PyUnicode_CHECK_INTERNED(op) \
1018     (((PyASCIIObject *)(op))->state.interned)
1019 
1020 /* --- wchar_t support for platforms which support it --------------------- */
1021 
1022 #ifdef HAVE_WCHAR_H
1023 
1024 /* Create a Unicode object from the wchar_t buffer w of the given
1025    size.
1026 
1027    The buffer is copied into the new object. */
1028 
1029 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
1030     const wchar_t *w,           /* wchar_t buffer to copy */
1031     Py_ssize_t size             /* size of buffer */
1032     );
1033 
1034 /* Copy the Unicode object contents into the wchar_t buffer w.  At
1035    most size wchar_t characters are copied.
1036 
1037    Note that the resulting wchar_t string may or may not be
1038    0-terminated.  It is the responsibility of the caller to make sure
1039    that the wchar_t string is 0-terminated in case this is required by
1040    the application.
1041 
1042    Return the number of wchar_t characters copied (excluding a
1043    possibly trailing 0-termination character) or -1 in case of an
1044    error. */
1045 
1046 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
1047     PyObject *unicode,          /* Unicode object */
1048     wchar_t *w,                 /* destination wchar_t buffer */
1049     Py_ssize_t size             /* size of buffer, in wchar_t characters */
1050     );
1051 
1052 /* Convert the Unicode object to a wide character string. The output string
1053    always ends with a null character. If size is not NULL, write the number of
1054    wide characters (excluding the null character) into *size.
1055 
1056    On success, return a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it).
1057    On error, return NULL and raise a MemoryError; *size is then
1058    undefined. */
1059 
1060 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
1061     PyObject *unicode,          /* Unicode object */
1062     Py_ssize_t *size            /* number of characters of the result; may be NULL */
1063     );
1064 
1065 #ifndef Py_LIMITED_API  /* not part of the limited API */
1066 PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);  /* NOTE(review): undocumented here; presumably returns the characters of s converted to kind -- confirm, including who owns the result */
1067 #endif /* !Py_LIMITED_API */
1068 
1069 #endif
1070 
1071 /* --- Unicode ordinals --------------------------------------------------- */
1072 
1073 /* Create a Unicode object from the given Unicode code point ordinal.
1074 
1075    The ordinal must be in range(0x110000), i.e. 0 <= ordinal < 0x110000. A ValueError is
1076    raised in case it is not.
1077 
1078 */
1079 
1080 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
1081 
1082 /* --- Free-list management ----------------------------------------------- */
1083 
1084 /* Clear the free list used by the Unicode implementation.
1085 
1086    This can be used to release memory used for objects on the free
1087    list back to the Python memory allocator.
1088    NOTE(review): the meaning of the int result is not described here -- confirm against the implementation.
1089 */
1090 
1091 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
1092 
1093 /* === Builtin Codecs =====================================================
1094 
1095    Many of these APIs take two arguments encoding and errors. These
1096    parameters encoding and errors have the same semantics as the ones
1097    of the builtin str() API.
1098 
1099    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
1100 
1101    Error handling is set by errors which may also be set to NULL
1102    meaning to use the default handling defined for the codec. Default
1103    error handling for all builtin codecs is "strict" (ValueErrors are
1104    raised).
1105 
1106    The codecs all use a similar interface. Only deviations from the
1107    generic interface are documented.
1108 
1109 */
1110 
1111 /* --- Manage the default encoding ---------------------------------------- */
1112 
1113 /* Returns a pointer to the default encoding (UTF-8) of the
1114    Unicode object unicode and stores the size of the encoded representation
1115    in bytes in *size.
1116 
1117    In case of an error, *size is not set.
1118 
1119    This function caches the UTF-8 encoded string in the unicodeobject
1120    and subsequent calls will return the same string.  The memory is released
1121    when the unicodeobject is deallocated.
1122 
1123    _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
1124    support the previous internal function with the same behaviour.
1125 
1126    *** This API is for interpreter INTERNAL USE ONLY and will likely
1127    *** be removed or changed in the future.
1128 
1129    *** If you need to access the Unicode object as UTF-8 bytes string,
1130    *** please use PyUnicode_AsUTF8String() instead.
1131 */
1132 
1133 #ifndef Py_LIMITED_API  /* not part of the limited API */
1134 PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
1135     PyObject *unicode,          /* Unicode object */
1136     Py_ssize_t *size);          /* receives the size in bytes */
1137 #define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
1138 #endif /* !Py_LIMITED_API */
1139 
1140 /* Returns a pointer to the default encoding (UTF-8) of the
1141    Unicode object unicode.
1142 
1143    Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
1144    in the unicodeobject.
1145 
1146    _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
1147    support the previous internal function with the same behaviour.
1148 
1149    Use of this API is DEPRECATED since no size information can be
1150    extracted from the returned data.
1151 
1152    *** This API is for interpreter INTERNAL USE ONLY and will likely
1153    *** be removed or changed for Python 3.1. (NOTE(review): this version reference looks stale -- confirm.)
1154 
1155    *** If you need to access the Unicode object as UTF-8 bytes string,
1156    *** please use PyUnicode_AsUTF8String() instead.
1157 
1158 */
1159 
1160 #ifndef Py_LIMITED_API  /* not part of the limited API */
1161 PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
1162 #define _PyUnicode_AsString PyUnicode_AsUTF8
1163 #endif /* !Py_LIMITED_API */
1164 
1165 /* Returns the name of the default encoding, "utf-8", as a const C string.  */
1166 
1167 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
1168 
1169 /* --- Generic Codecs ----------------------------------------------------- */
1170 
1171 /* Create a Unicode object by decoding the encoded string s of the
1172    given size, using the given encoding and error handling. */
1173 
1174 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
1175     const char *s,              /* encoded string */
1176     Py_ssize_t size,            /* size of buffer */
1177     const char *encoding,       /* encoding */
1178     const char *errors          /* error handling */
1179     );
1180 
1181 /* Decode a Unicode object unicode and return the result as a Python
1182    object.
1183 
1184    This API is DEPRECATED (marked Py_DEPRECATED(3.6)). The only supported standard encoding is rot13.
1185    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
1186    that decode from str. */
1187 
1188 PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
1189     PyObject *unicode,          /* Unicode object */
1190     const char *encoding,       /* encoding */
1191     const char *errors          /* error handling */
1192     ) Py_DEPRECATED(3.6);
1193 
1194 /* Decode a Unicode object unicode and return the result as a Unicode
1195    object.
1196 
1197    This API is DEPRECATED (marked Py_DEPRECATED(3.6)). The only supported standard encoding is rot13.
1198    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
1199    that decode from str to str. */
1200 
1201 PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
1202     PyObject *unicode,          /* Unicode object */
1203     const char *encoding,       /* encoding */
1204     const char *errors          /* error handling */
1205     ) Py_DEPRECATED(3.6);
1206 
1207 /* Encodes a Py_UNICODE buffer of the given size and returns a
1208    Python string object (NOTE(review): presumably a bytes object -- confirm). */
1209 
1210 #ifndef Py_LIMITED_API  /* not part of the limited API */
1211 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
1212     const Py_UNICODE *s,        /* Unicode char buffer */
1213     Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
1214     const char *encoding,       /* encoding */
1215     const char *errors          /* error handling */
1216     );
1217 #endif /* !Py_LIMITED_API */
1218 
1219 /* Encodes a Unicode object and returns the result as a Python
1220    object.
1221 
1222    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
1223    since all standard encodings (except rot13) encode str to bytes.
1224    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
1225    that encode from str to non-bytes. */
1226 
1227 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
1228     PyObject *unicode,          /* Unicode object */
1229     const char *encoding,       /* encoding */
1230     const char *errors          /* error handling */
1231     ) Py_DEPRECATED(3.6);
1232 
1233 /* Encodes a Unicode object and returns the result as a Python string
1234    object (NOTE(review): presumably a bytes object -- confirm). */
1235 
1236 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
1237     PyObject *unicode,          /* Unicode object */
1238     const char *encoding,       /* encoding */
1239     const char *errors          /* error handling */
1240     );
1241 
1242 /* Encodes a Unicode object and returns the result as a Unicode
1243    object.
1244 
1245    This API is DEPRECATED.  The only supported standard encoding is rot13.
1246    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
1247    that encode from str to str. */
1248 
1249 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
1250     PyObject *unicode,          /* Unicode object */
1251     const char *encoding,       /* encoding */
1252     const char *errors          /* error handling */
1253     ) Py_DEPRECATED(3.6);
1254 
1255 /* Build an encoding map from the given 256 character map. */
1256 
1257 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
1258     PyObject* string            /* 256 character map */
1259    );
1260 
1261 /* --- UTF-7 Codecs ------------------------------------------------------- */
1262 
1263 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(          /* decode a UTF-7 encoded string */
1264     const char *string,         /* UTF-7 encoded string */
1265     Py_ssize_t length,          /* size of string */
1266     const char *errors          /* error handling */
1267     );
1268 
1269 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(  /* as above, with an extra consumed out-parameter */
1270     const char *string,         /* UTF-7 encoded string */
1271     Py_ssize_t length,          /* size of string */
1272     const char *errors,         /* error handling */
1273     Py_ssize_t *consumed        /* receives the number of bytes consumed */
1274     );
1275 
1276 #ifndef Py_LIMITED_API  /* not part of the limited API */
1277 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(    /* UTF-7 encoder taking a Py_UNICODE buffer */
1278     const Py_UNICODE *data,     /* Unicode char buffer */
1279     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1280     int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1281     int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1282     const char *errors          /* error handling */
1283     );
1284 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(   /* UTF-7 encoder taking a Unicode object */
1285     PyObject *unicode,          /* Unicode object */
1286     int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
1287     int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
1288     const char *errors          /* error handling */
1289     );
1290 #endif /* !Py_LIMITED_API */
1291 
1292 /* --- UTF-8 Codecs ------------------------------------------------------- */
1293 
1294 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(          /* decode a UTF-8 encoded string */
1295     const char *string,         /* UTF-8 encoded string */
1296     Py_ssize_t length,          /* size of string */
1297     const char *errors          /* error handling */
1298     );
1299 
1300 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(  /* as above, with an extra consumed out-parameter */
1301     const char *string,         /* UTF-8 encoded string */
1302     Py_ssize_t length,          /* size of string */
1303     const char *errors,         /* error handling */
1304     Py_ssize_t *consumed        /* receives the number of bytes consumed */
1305     );
1306 
1307 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(        /* UTF-8 encoder taking a Unicode object */
1308     PyObject *unicode           /* Unicode object */
1309     );
1310 
1311 #ifndef Py_LIMITED_API  /* not part of the limited API */
1312 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(   /* variant of PyUnicode_AsUTF8String with an errors argument */
1313     PyObject *unicode,          /* Unicode object */
1314     const char *errors);        /* error handling */
1315 
1316 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(      /* UTF-8 encoder taking a Py_UNICODE buffer */
1317     const Py_UNICODE *data,     /* Unicode char buffer */
1318     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1319     const char *errors          /* error handling */
1320     );
1321 #endif /* !Py_LIMITED_API */
1322 
1323 /* --- UTF-32 Codecs ------------------------------------------------------ */
1324 
1325 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
1326    the corresponding Unicode object.
1327 
1328    errors (if non-NULL) defines the error handling. It defaults
1329    to "strict".
1330 
1331    If byteorder is non-NULL, the decoder starts decoding using the
1332    given byte order:
1333 
1334     *byteorder == -1: little endian
1335     *byteorder == 0:  native order
1336     *byteorder == 1:  big endian
1337 
1338    In native mode, the first four bytes of the stream are checked for a
1339    BOM. If found, the BOM is analysed, the byte order is
1340    adjusted and the BOM is skipped.  In the other modes, no BOM
1341    interpretation is done. After completion, *byteorder is set to the
1342    current byte order at the end of input data.
1343 
1344    If byteorder is NULL, the codec starts in native order mode.
1345 
1346 */
1347 
1348 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
1349     const char *string,         /* UTF-32 encoded string */
1350     Py_ssize_t length,          /* size of string, in bytes */
1351     const char *errors,         /* error handling */
1352     int *byteorder              /* pointer to byteorder to use
1353                                    0=native;-1=LE,1=BE; updated on
1354                                    exit; may be NULL */
1355     );
1356 
1357 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(  /* UTF-32 decoder with an extra consumed out-parameter */
1358     const char *string,         /* UTF-32 encoded string */
1359     Py_ssize_t length,          /* size of string */
1360     const char *errors,         /* error handling */
1361     int *byteorder,             /* pointer to byteorder to use
1362                                    0=native;-1=LE,1=BE; updated on
1363                                    exit */
1364     Py_ssize_t *consumed        /* receives the number of bytes consumed */
1365     );
1366 
1367 /* Returns a Python string using the UTF-32 encoding in native byte
1368    order. The string always starts with a BOM.  */
1369 
1370 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
1371     PyObject *unicode           /* Unicode object */
1372     );
1373 
1374 /* Returns a Python string object holding the UTF-32 encoded value of
1375    the Unicode data.
1376 
1377    The output is written according to byteorder, which selects the
1378    byte order as follows:
1379 
1380    byteorder == -1: little endian
1381    byteorder == 0:  native byte order (writes a BOM)
1382    byteorder == 1:  big endian
1383 
1384    If byteorder is 0, the output string will always start with the
1385    Unicode BOM (U+FEFF). In the other two modes, no BOM is
1386    prepended.
1387 
1388 */
1389 
1390 #ifndef Py_LIMITED_API  /* not part of the limited API */
1391 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(    /* takes a Py_UNICODE buffer */
1392     const Py_UNICODE *data,     /* Unicode char buffer */
1393     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1394     const char *errors,         /* error handling */
1395     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1396     );
1397 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(   /* takes a Unicode object */
1398     PyObject *object,           /* Unicode object */
1399     const char *errors,         /* error handling */
1400     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1401     );
1402 #endif /* !Py_LIMITED_API */
1403 
1404 /* --- UTF-16 Codecs ------------------------------------------------------ */
1405 
1406 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
1407    the corresponding Unicode object.
1408 
1409    errors (if non-NULL) defines the error handling. It defaults
1410    to "strict".
1411 
1412    If byteorder is non-NULL, the decoder starts decoding using the
1413    given byte order:
1414 
1415     *byteorder == -1: little endian
1416     *byteorder == 0:  native order
1417     *byteorder == 1:  big endian
1418 
1419    In native mode, the first two bytes of the stream are checked for a
1420    BOM. If found, the BOM is analysed, the byte order is
1421    adjusted and the BOM is skipped.  In the other modes, no BOM
1422    interpretation is done. After completion, *byteorder is set to the
1423    current byte order at the end of input data.
1424 
1425    If byteorder is NULL, the codec starts in native order mode.
1426 
1427 */
1428 
1429 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
1430     const char *string,         /* UTF-16 encoded string */
1431     Py_ssize_t length,          /* size of string, in bytes */
1432     const char *errors,         /* error handling */
1433     int *byteorder              /* pointer to byteorder to use
1434                                    0=native;-1=LE,1=BE; updated on
1435                                    exit; may be NULL */
1436     );
1437 
1438 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(  /* UTF-16 decoder with an extra consumed out-parameter */
1439     const char *string,         /* UTF-16 encoded string */
1440     Py_ssize_t length,          /* size of string */
1441     const char *errors,         /* error handling */
1442     int *byteorder,             /* pointer to byteorder to use
1443                                    0=native;-1=LE,1=BE; updated on
1444                                    exit */
1445     Py_ssize_t *consumed        /* receives the number of bytes consumed */
1446     );
1447 
1448 /* Returns a Python string using the UTF-16 encoding in native byte
1449    order. The string always starts with a BOM.  */
1450 
1451 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
1452     PyObject *unicode           /* Unicode object */
1453     );
1454 
1455 /* Returns a Python string object holding the UTF-16 encoded value of
1456    the Unicode data.
1457 
1458    The output is written according to byteorder, which selects the
1459    byte order as follows:
1460 
1461    byteorder == -1: little endian
1462    byteorder == 0:  native byte order (writes a BOM)
1463    byteorder == 1:  big endian
1464 
1465    If byteorder is 0, the output string will always start with the
1466    Unicode BOM (U+FEFF). In the other two modes, no BOM is
1467    prepended.
1468 
1469    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1470    UCS-2. This trick makes it possible to add full UTF-16 capabilities
1471    at a later point without compromising the APIs.
1472 
1473 */
1474 
1475 #ifndef Py_LIMITED_API  /* not part of the limited API */
1476 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(    /* takes a Py_UNICODE buffer */
1477     const Py_UNICODE *data,     /* Unicode char buffer */
1478     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1479     const char *errors,         /* error handling */
1480     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1481     );
1482 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(   /* takes a Unicode object */
1483     PyObject* unicode,          /* Unicode object */
1484     const char *errors,         /* error handling */
1485     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1486     );
1487 #endif /* !Py_LIMITED_API */
1488 
1489 /* --- Unicode-Escape Codecs ---------------------------------------------- */
1490 
1491 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(  /* decode a Unicode-Escape encoded string */
1492     const char *string,         /* Unicode-Escape encoded string */
1493     Py_ssize_t length,          /* size of string */
1494     const char *errors          /* error handling */
1495     );
1496 
1497 #ifndef Py_LIMITED_API  /* not part of the limited API */
1498 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
1499    chars and reports the first one through first_invalid_escape. */
1500 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
1501         const char *string,     /* Unicode-Escape encoded string */
1502         Py_ssize_t length,      /* size of string */
1503         const char *errors,     /* error handling */
1504         const char **first_invalid_escape  /* on return, points to the first
1505                                               invalid escaped char in
1506                                               string. */
1507 );
1508 #endif /* !Py_LIMITED_API */
1509 
1510 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(  /* Unicode-Escape encoder taking a Unicode object */
1511     PyObject *unicode           /* Unicode object */
1512     );
1513 
1514 #ifndef Py_LIMITED_API  /* not part of the limited API */
1515 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(    /* Unicode-Escape encoder taking a Py_UNICODE buffer */
1516     const Py_UNICODE *data,     /* Unicode char buffer */
1517     Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1518     );
1519 #endif /* !Py_LIMITED_API */
1520 
1521 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1522 
1523 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(    /* decode a Raw-Unicode-Escape encoded string */
1524     const char *string,         /* Raw-Unicode-Escape encoded string */
1525     Py_ssize_t length,          /* size of string */
1526     const char *errors          /* error handling */
1527     );
1528 
1529 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(  /* encoder taking a Unicode object */
1530     PyObject *unicode           /* Unicode object */
1531     );
1532 
1533 #ifndef Py_LIMITED_API  /* not part of the limited API */
1534 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(    /* encoder taking a Py_UNICODE buffer */
1535     const Py_UNICODE *data,     /* Unicode char buffer */
1536     Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
1537     );
1538 #endif /* !Py_LIMITED_API */
1539 
1540 /* --- Unicode Internal Codec ---------------------------------------------
1541 
1542     Only for internal use in _codecsmodule.c */
1543 
1544 #ifndef Py_LIMITED_API  /* not part of the limited API */
1545 PyObject *_PyUnicode_DecodeUnicodeInternal(  /* NOTE(review): declared without PyAPI_FUNC, unlike the neighbouring declarations -- confirm this is intentional */
1546     const char *string,         /* encoded input */
1547     Py_ssize_t length,          /* presumably the size of string -- confirm */
1548     const char *errors          /* error handling */
1549     );
1550 #endif /* !Py_LIMITED_API */
1551 
1552 /* --- Latin-1 Codecs -----------------------------------------------------
1553 
1554    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1555 
1556 */
1557 
1558 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(    /* decode a Latin-1 encoded string */
1559     const char *string,         /* Latin-1 encoded string */
1560     Py_ssize_t length,          /* size of string */
1561     const char *errors          /* error handling */
1562     );
1563 
1564 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(  /* Latin-1 encoder taking a Unicode object */
1565     PyObject *unicode           /* Unicode object */
1566     );
1567 
1568 #ifndef Py_LIMITED_API  /* not part of the limited API */
1569 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(  /* variant of PyUnicode_AsLatin1String with an errors argument */
1570     PyObject* unicode,          /* Unicode object */
1571     const char* errors);        /* error handling */
1572 
1573 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(     /* Latin-1 encoder taking a Py_UNICODE buffer */
1574     const Py_UNICODE *data,     /* Unicode char buffer */
1575     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1576     const char *errors          /* error handling */
1577     );
1578 #endif /* !Py_LIMITED_API */
1579 
1580 /* --- ASCII Codecs -------------------------------------------------------
1581 
1582    Only 7-bit ASCII data is accepted. All other codes generate errors.
1583 
1584 */
1585 
1586 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1587     const char *string,         /* ASCII encoded string */
1588     Py_ssize_t length,          /* size of string */
1589     const char *errors          /* error handling */
1590     );
1591 
1592 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1593     PyObject *unicode           /* Unicode object */
1594     );
1595 
1596 #ifndef Py_LIMITED_API
1597 PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
1598     PyObject* unicode,
1599     const char* errors);
1600 
1601 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1602     const Py_UNICODE *data,     /* Unicode char buffer */
1603     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1604     const char *errors          /* error handling */
1605     );
1606 #endif
1607 
1608 /* --- Character Map Codecs -----------------------------------------------
1609 
1610    This codec uses mappings to encode and decode characters.
1611 
1612    Decoding mappings must map single string characters to single
1613    Unicode characters, integers (which are then interpreted as Unicode
1614    ordinals) or None (meaning "undefined mapping" and causing an
1615    error).
1616 
1617    Encoding mappings must map single Unicode characters to single
1618    string characters, integers (which are then interpreted as Latin-1
1619    ordinals) or None (meaning "undefined mapping" and causing an
1620    error).
1621 
1622    If a character lookup fails with a LookupError, the character is
1623    copied as-is meaning that its ordinal value will be interpreted as
1624    Unicode or Latin-1 ordinal resp. Because of this mappings only need
1625    to contain those mappings which map characters to different code
1626    points.
1627 
1628 */
1629 
1630 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1631     const char *string,         /* Encoded string */
1632     Py_ssize_t length,          /* size of string */
1633     PyObject *mapping,          /* character mapping
1634                                    (char ordinal -> unicode ordinal) */
1635     const char *errors          /* error handling */
1636     );
1637 
1638 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1639     PyObject *unicode,          /* Unicode object */
1640     PyObject *mapping           /* character mapping
1641                                    (unicode ordinal -> char ordinal) */
1642     );
1643 
1644 #ifndef Py_LIMITED_API
1645 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1646     const Py_UNICODE *data,     /* Unicode char buffer */
1647     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1648     PyObject *mapping,          /* character mapping
1649                                    (unicode ordinal -> char ordinal) */
1650     const char *errors          /* error handling */
1651     );
1652 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
1653     PyObject *unicode,          /* Unicode object */
1654     PyObject *mapping,          /* character mapping
1655                                    (unicode ordinal -> char ordinal) */
1656     const char *errors          /* error handling */
1657     );
1658 #endif
1659 
1660 /* Translate a Py_UNICODE buffer of the given length by applying a
1661    character mapping table to it and return the resulting Unicode
1662    object.
1663 
1664    The mapping table must map Unicode ordinal integers to Unicode
1665    ordinal integers or None (causing deletion of the character).
1666 
1667    Mapping tables may be dictionaries or sequences. Unmapped character
1668    ordinals (ones which cause a LookupError) are left untouched and
1669    are copied as-is.
1670 
1671 */
1672 
1673 #ifndef Py_LIMITED_API
1674 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1675     const Py_UNICODE *data,     /* Unicode char buffer */
1676     Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
1677     PyObject *table,            /* Translate table */
1678     const char *errors          /* error handling */
1679     );
1680 #endif
1681 
1682 #ifdef MS_WINDOWS
1683 
1684 /* --- MBCS codecs for Windows -------------------------------------------- */
1685 
1686 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1687     const char *string,         /* MBCS encoded string */
1688     Py_ssize_t length,          /* size of string */
1689     const char *errors          /* error handling */
1690     );
1691 
1692 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1693     const char *string,         /* MBCS encoded string */
1694     Py_ssize_t length,          /* size of string */
1695     const char *errors,         /* error handling */
1696     Py_ssize_t *consumed        /* bytes consumed */
1697     );
1698 
1699 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
1700 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
1701     int code_page,              /* code page number */
1702     const char *string,         /* encoded string */
1703     Py_ssize_t length,          /* size of string */
1704     const char *errors,         /* error handling */
1705     Py_ssize_t *consumed        /* bytes consumed */
1706     );
1707 #endif
1708 
1709 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1710     PyObject *unicode           /* Unicode object */
1711     );
1712 
1713 #ifndef Py_LIMITED_API
1714 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1715     const Py_UNICODE *data,     /* Unicode char buffer */
1716     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
1717     const char *errors          /* error handling */
1718     );
1719 #endif
1720 
1721 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
1722 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
1723     int code_page,              /* code page number */
1724     PyObject *unicode,          /* Unicode object */
1725     const char *errors          /* error handling */
1726     );
1727 #endif
1728 
1729 #endif /* MS_WINDOWS */
1730 
1731 /* --- Decimal Encoder ---------------------------------------------------- */
1732 
1733 /* Takes a Unicode string holding a decimal value and writes it into
1734    an output buffer using standard ASCII digit codes.
1735 
1736    The output buffer has to provide at least length+1 bytes of storage
1737    area. The output string is 0-terminated.
1738 
1739    The encoder converts whitespace to ' ', decimal characters to their
1740    corresponding ASCII digit and all other Latin-1 characters except
1741    \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1742    are treated as errors. This includes embedded NUL characters.
1743 
1744    Error handling is defined by the errors argument:
1745 
1746       NULL or "strict": raise a ValueError
1747       "ignore": ignore the wrong characters (these are not copied to the
1748                 output buffer)
1749       "replace": replaces illegal characters with '?'
1750 
1751    Returns 0 on success, -1 on failure.
1752 
1753 */
1754 
1755 #ifndef Py_LIMITED_API
1756 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1757     Py_UNICODE *s,              /* Unicode buffer */
1758     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1759     char *output,               /* Output buffer; must have size >= length+1 */
1760     const char *errors          /* error handling */
1761     );
1762 #endif
1763 
1764 /* Transforms code points that have decimal digit property to the
1765    corresponding ASCII digit code points.
1766 
1767    Returns a new Unicode string on success, NULL on failure.
1768 */
1769 
1770 #ifndef Py_LIMITED_API
1771 PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
1772     Py_UNICODE *s,              /* Unicode buffer */
1773     Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
1774     );
1775 #endif
1776 
1777 /* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
1778    as argument instead of a raw buffer and length.  This function additionally
1779    transforms spaces to ASCII because this is what the callers in longobject,
1780    floatobject, and complexobject did anyways. */
1781 
1782 #ifndef Py_LIMITED_API
1783 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
1784     PyObject *unicode           /* Unicode object */
1785     );
1786 #endif
1787 
1788 /* --- Locale encoding --------------------------------------------------- */
1789 
1790 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
1791 /* Decode a string from the current locale encoding. The decoder is strict if
1792    *errors* is NULL or "strict"; if *errors* is "surrogateescape", undecodable
1793    bytes are escaped with the 'surrogateescape' error handler (PEP 383). If a
1794    byte sequence can be decoded as a surrogate character and the
1795    'surrogateescape' error handler is in use, the byte sequence is escaped
1796    instead of being decoded. *str* must end with a null character but cannot
1797    contain embedded null characters. */
1798 
1799 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
1800     const char *str,
1801     Py_ssize_t len,
1802     const char *errors);
1803 
1804 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
1805    length using strlen(). */
1806 
1807 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
1808     const char *str,
1809     const char *errors);
1810 
1811 /* Encode a Unicode object to the current locale encoding. The encoder is
1812    strict if *errors* is NULL or "strict"; if *errors* is "surrogateescape",
1813    the "surrogateescape" error handler is used. Return a bytes object. The
1814    string cannot contain embedded null characters. */
1815 
1816 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
1817     PyObject *unicode,
1818     const char *errors
1819     );
1820 #endif
1821 
1822 /* --- File system encoding ---------------------------------------------- */
1823 
1824 /* ParseTuple converter: encode str objects to bytes using
1825    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
1826 
1827 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1828 
1829 /* ParseTuple converter: decode bytes objects to unicode using
1830    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
1831 
1832 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
1833 
1834 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
1835    and the "surrogateescape" error handler.
1836 
1837    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1838    encoding.
1839 
1840    Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
1841 */
1842 
1843 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1844     const char *s               /* encoded string */
1845     );
1846 
1847 /* Decode a string using Py_FileSystemDefaultEncoding
1848    and the "surrogateescape" error handler.
1849 
1850    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1851    encoding.
1852 */
1853 
1854 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1855     const char *s,               /* encoded string */
1856     Py_ssize_t size              /* size */
1857     );
1858 
1859 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
1860    "surrogateescape" error handler, and return bytes.
1861 
1862    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
1863    encoding.
1864 */
1865 
1866 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
1867     PyObject *unicode
1868     );
1869 
1870 /* --- Methods & Slots ----------------------------------------------------
1871 
1872    These are capable of handling Unicode objects and strings on input
1873    (we refer to them as strings in the descriptions) and return
1874    Unicode objects or integers as appropriate. */
1875 
1876 /* Concat two strings giving a new Unicode string. */
1877 
1878 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1879     PyObject *left,             /* Left string */
1880     PyObject *right             /* Right string */
1881     );
1882 
1883 /* Concat two strings and put the result in *pleft
1884    (sets *pleft to NULL on error) */
1885 
1886 PyAPI_FUNC(void) PyUnicode_Append(
1887     PyObject **pleft,           /* Pointer to left string */
1888     PyObject *right             /* Right string */
1889     );
1890 
1891 /* Concat two strings, put the result in *pleft and drop the right object
1892    (sets *pleft to NULL on error) */
1893 
1894 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1895     PyObject **pleft,           /* Pointer to left string */
1896     PyObject *right             /* Right string */
1897     );
1898 
1899 /* Split a string giving a list of Unicode strings.
1900 
1901    If sep is NULL, splitting will be done at all whitespace
1902    substrings. Otherwise, splits occur at the given separator.
1903 
1904    At most maxsplit splits will be done. If negative, no limit is set.
1905 
1906    Separators are not included in the resulting list.
1907 
1908 */
1909 
1910 PyAPI_FUNC(PyObject*) PyUnicode_Split(
1911     PyObject *s,                /* String to split */
1912     PyObject *sep,              /* String separator */
1913     Py_ssize_t maxsplit         /* Maxsplit count */
1914     );
1915 
1916 /* Ditto, but split at line breaks.
1917 
1918    CRLF is considered to be one line break. Line breaks are not
1919    included in the resulting list unless keepends is true. */
1920 
1921 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1922     PyObject *s,                /* String to split */
1923     int keepends                /* If true, line end markers are included */
1924     );
1925 
1926 /* Partition a string using a given separator. */
1927 
1928 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1929     PyObject *s,                /* String to partition */
1930     PyObject *sep               /* String separator */
1931     );
1932 
1933 /* Partition a string using a given separator, searching from the end of the
1934    string. */
1935 
1936 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1937     PyObject *s,                /* String to partition */
1938     PyObject *sep               /* String separator */
1939     );
1940 
1941 /* Split a string giving a list of Unicode strings.
1942 
1943    If sep is NULL, splitting will be done at all whitespace
1944    substrings. Otherwise, splits occur at the given separator.
1945 
1946    At most maxsplit splits will be done. But unlike PyUnicode_Split
1947    PyUnicode_RSplit splits from the end of the string. If negative,
1948    no limit is set.
1949 
1950    Separators are not included in the resulting list.
1951 
1952 */
1953 
1954 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1955     PyObject *s,                /* String to split */
1956     PyObject *sep,              /* String separator */
1957     Py_ssize_t maxsplit         /* Maxsplit count */
1958     );
1959 
1960 /* Translate a string by applying a character mapping table to it and
1961    return the resulting Unicode object.
1962 
1963    The mapping table must map Unicode ordinal integers to Unicode
1964    ordinal integers or None (causing deletion of the character).
1965 
1966    Mapping tables may be dictionaries or sequences. Unmapped character
1967    ordinals (ones which cause a LookupError) are left untouched and
1968    are copied as-is.
1969 
1970 */
1971 
1972 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1973     PyObject *str,              /* String */
1974     PyObject *table,            /* Translate table */
1975     const char *errors          /* error handling */
1976     );
1977 
1978 /* Join a sequence of strings using the given separator and return
1979    the resulting Unicode string. */
1980 
1981 PyAPI_FUNC(PyObject*) PyUnicode_Join(
1982     PyObject *separator,        /* Separator string */
1983     PyObject *seq               /* Sequence object */
1984     );
1985 
1986 #ifndef Py_LIMITED_API
1987 PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
1988     PyObject *separator,
1989     PyObject **items,
1990     Py_ssize_t seqlen
1991     );
1992 #endif /* Py_LIMITED_API */
1993 
1994 /* Return 1 if substr matches str[start:end] at the given tail end, 0
1995    otherwise. Return -1 if an error occurred. */
1996 
1997 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1998     PyObject *str,              /* String */
1999     PyObject *substr,           /* Prefix or Suffix string */
2000     Py_ssize_t start,           /* Start index */
2001     Py_ssize_t end,             /* Stop index */
2002     int direction               /* Tail end: -1 prefix, +1 suffix */
2003     );
2004 
2005 /* Return the first position of substr in str[start:end] using the
2006    given search direction or -1 if not found. -2 is returned in case
2007    an error occurred and an exception is set. */
2008 
2009 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
2010     PyObject *str,              /* String */
2011     PyObject *substr,           /* Substring to find */
2012     Py_ssize_t start,           /* Start index */
2013     Py_ssize_t end,             /* Stop index */
2014     int direction               /* Find direction: +1 forward, -1 backward */
2015     );
2016 
2017 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
2018 /* Like PyUnicode_Find, but search for single character only. */
2019 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
2020     PyObject *str,
2021     Py_UCS4 ch,
2022     Py_ssize_t start,
2023     Py_ssize_t end,
2024     int direction
2025     );
2026 #endif
2027 
2028 /* Count the number of occurrences of substr in str[start:end]. */
2029 
2030 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
2031     PyObject *str,              /* String */
2032     PyObject *substr,           /* Substring to count */
2033     Py_ssize_t start,           /* Start index */
2034     Py_ssize_t end              /* Stop index */
2035     );
2036 
2037 /* Replace at most maxcount occurrences of substr in str with replstr
2038    and return the resulting Unicode object. */
2039 
2040 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
2041     PyObject *str,              /* String */
2042     PyObject *substr,           /* Substring to find */
2043     PyObject *replstr,          /* Substring to replace */
2044     Py_ssize_t maxcount         /* Max. number of replacements to apply;
2045                                    -1 = all */
2046     );
2047 
2048 /* Compare two strings and return -1, 0, 1 for less than, equal,
2049    greater than resp.
2050    Raise an exception and return -1 on error. */
2051 
2052 PyAPI_FUNC(int) PyUnicode_Compare(
2053     PyObject *left,             /* Left string */
2054     PyObject *right             /* Right string */
2055     );
2056 
2057 #ifndef Py_LIMITED_API
2058 /* Test whether a Unicode object is equal to an ASCII identifier.  Return 1
2059    if true, 0 otherwise.  The right argument must be an ASCII identifier.
2060    Any error that occurs inside is cleared before returning. */
2061 
2062 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
2063     PyObject *left,             /* Left string */
2064     _Py_Identifier *right       /* Right identifier */
2065     );
2066 #endif
2067 
2068 /* Compare a Unicode object with C string and return -1, 0, 1 for less than,
2069    equal, and greater than, respectively.  It is best to pass only
2070    ASCII-encoded strings, but the function interprets the input string as
2071    ISO-8859-1 if it contains non-ASCII characters.
2072    This function does not raise exceptions. */
2073 
2074 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
2075     PyObject *left,
2076     const char *right           /* ASCII-encoded string */
2077     );
2078 
2079 #ifndef Py_LIMITED_API
2080 /* Test whether a Unicode object is equal to an ASCII string.  Return 1 if
2081    true, 0 otherwise.  The right argument must be an ASCII-encoded string.
2082    Any error that occurs inside is cleared before returning. */
2083 
2084 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
2085     PyObject *left,
2086     const char *right           /* ASCII-encoded string */
2087     );
2088 #endif
2089 
2090 /* Rich compare two strings and return one of the following:
2091 
2092    - NULL in case an exception was raised
2093    - Py_True or Py_False for successful comparisons
2094    - Py_NotImplemented in case the type combination is unknown
2095 
2096    Possible values for op:
2097 
2098      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
2099 
2100 */
2101 
2102 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
2103     PyObject *left,             /* Left string */
2104     PyObject *right,            /* Right string */
2105     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
2106     );
2107 
2108 /* Apply an argument tuple or dictionary to a format string and return
2109    the resulting Unicode string. */
2110 
2111 PyAPI_FUNC(PyObject *) PyUnicode_Format(
2112     PyObject *format,           /* Format string */
2113     PyObject *args              /* Argument tuple or dictionary */
2114     );
2115 
2116 /* Check whether element is contained in container and return 1/0
2117    accordingly.
2118 
2119    element has to coerce to a one element Unicode string. -1 is
2120    returned in case of an error. */
2121 
2122 PyAPI_FUNC(int) PyUnicode_Contains(
2123     PyObject *container,        /* Container string */
2124     PyObject *element           /* Element string */
2125     );
2126 
2127 /* Checks whether argument is a valid identifier. */
2128 
2129 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
2130 
2131 #ifndef Py_LIMITED_API
2132 /* Externally visible for str.strip(unicode) */
2133 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
2134     PyObject *self,
2135     int striptype,
2136     PyObject *sepobj
2137     );
2138 #endif
2139 
2140 /* Using explicit passed-in values, insert the thousands grouping
2141    into the string pointed to by buffer.  For the argument descriptions,
2142    see Objects/stringlib/localeutil.h */
2143 #ifndef Py_LIMITED_API
2144 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
2145     PyObject *unicode,
2146     Py_ssize_t index,
2147     Py_ssize_t n_buffer,
2148     void *digits,
2149     Py_ssize_t n_digits,
2150     Py_ssize_t min_width,
2151     const char *grouping,
2152     PyObject *thousands_sep,
2153     Py_UCS4 *maxchar);
2154 #endif
2155 /* === Characters Type APIs =============================================== */
2156 
2157 /* Helper array used by Py_UNICODE_ISSPACE(). */
2158 
2159 #ifndef Py_LIMITED_API
2160 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
2161 
2162 /* These should not be used directly. Use the Py_UNICODE_IS* and
2163    Py_UNICODE_TO* macros instead.
2164 
2165    These APIs are implemented in Objects/unicodectype.c.
2166 
2167 */
2168 
2169 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
2170     Py_UCS4 ch       /* Unicode character */
2171     );
2172 
2173 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
2174     Py_UCS4 ch       /* Unicode character */
2175     );
2176 
2177 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
2178     Py_UCS4 ch       /* Unicode character */
2179     );
2180 
2181 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
2182     Py_UCS4 ch       /* Unicode character */
2183     );
2184 
2185 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
2186     Py_UCS4 ch       /* Unicode character */
2187     );
2188 
2189 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
2190     Py_UCS4 ch       /* Unicode character (passed by value; no const needed) */
2191     );
2192 
2193 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
2194     Py_UCS4 ch       /* Unicode character (passed by value; no const needed) */
2195     );
2196 
2197 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
2198     Py_UCS4 ch       /* Unicode character */
2199     );
2200 
2201 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
2202     Py_UCS4 ch       /* Unicode character */
2203     );
2204 
2205 PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
2206     Py_UCS4 ch       /* Unicode character */
2207     );
2208 
2209 PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
2210     Py_UCS4 ch,       /* Unicode character */
2211     Py_UCS4 *res
2212     );
2213 
2214 PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
2215     Py_UCS4 ch,       /* Unicode character */
2216     Py_UCS4 *res
2217     );
2218 
2219 PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
2220     Py_UCS4 ch,       /* Unicode character */
2221     Py_UCS4 *res
2222     );
2223 
2224 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
2225     Py_UCS4 ch,       /* Unicode character */
2226     Py_UCS4 *res
2227     );
2228 
2229 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
2230     Py_UCS4 ch         /* Unicode character */
2231     );
2232 
2233 PyAPI_FUNC(int) _PyUnicode_IsCased(
2234     Py_UCS4 ch         /* Unicode character */
2235     );
2236 
2237 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
2238     Py_UCS4 ch       /* Unicode character */
2239     );
2240 
2241 PyAPI_FUNC(int) _PyUnicode_ToDigit(
2242     Py_UCS4 ch       /* Unicode character */
2243     );
2244 
2245 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
2246     Py_UCS4 ch       /* Unicode character */
2247     );
2248 
2249 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
2250     Py_UCS4 ch       /* Unicode character */
2251     );
2252 
2253 PyAPI_FUNC(int) _PyUnicode_IsDigit(
2254     Py_UCS4 ch       /* Unicode character */
2255     );
2256 
2257 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
2258     Py_UCS4 ch       /* Unicode character */
2259     );
2260 
2261 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
2262     Py_UCS4 ch       /* Unicode character */
2263     );
2264 
2265 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
2266     Py_UCS4 ch       /* Unicode character */
2267     );
2268 
2269 PyAPI_FUNC(size_t) Py_UNICODE_strlen(
2270     const Py_UNICODE *u
2271     );
2272 
2273 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
2274     Py_UNICODE *s1,
2275     const Py_UNICODE *s2);
2276 
2277 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcat(
2278     Py_UNICODE *s1, const Py_UNICODE *s2);
2279 
2280 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
2281     Py_UNICODE *s1,
2282     const Py_UNICODE *s2,
2283     size_t n);
2284 
2285 PyAPI_FUNC(int) Py_UNICODE_strcmp(
2286     const Py_UNICODE *s1,
2287     const Py_UNICODE *s2
2288     );
2289 
2290 PyAPI_FUNC(int) Py_UNICODE_strncmp(
2291     const Py_UNICODE *s1,
2292     const Py_UNICODE *s2,
2293     size_t n
2294     );
2295 
2296 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
2297     const Py_UNICODE *s,
2298     Py_UNICODE c
2299     );
2300 
2301 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
2302     const Py_UNICODE *s,
2303     Py_UNICODE c
2304     );
2305 
2306 PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
2307 
2308 /* Create a copy of a unicode string ending with a nul character. Return NULL
2309    and raise a MemoryError exception on memory allocation failure, otherwise
2310    return a new allocated buffer (use PyMem_Free() to free the buffer). */
2311 
2312 PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy(
2313     PyObject *unicode
2314     );
2315 #endif /* Py_LIMITED_API */
2316 
2317 #if defined(Py_DEBUG) && !defined(Py_LIMITED_API)
2318 PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
2319     PyObject *op,
2320     int check_content);
2321 #endif
2322 
2323 #ifndef Py_LIMITED_API
2324 /* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
2325 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
2326 /* Clear all static strings. */
2327 PyAPI_FUNC(void) _PyUnicode_ClearStaticStrings(void);
2328 
2329 /* Fast equality check when the inputs are known to be exact unicode types
2330    and where the hash values are equal (i.e. a very probable match) */
2331 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
2332 #endif /* !Py_LIMITED_API */
2333 
2334 #ifdef __cplusplus
2335 }
2336 #endif
2337 #endif /* !Py_UNICODEOBJECT_H */
2338