1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 
10 Copyright (c) Corporation for National Research Initiatives.
11 
12 --------------------------------------------------------------------
13 The original string type implementation is:
14 
15   Copyright (c) 1999 by Secret Labs AB
16   Copyright (c) 1999 by Fredrik Lundh
17 
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21 
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30 
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39 
40 */
41 
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44 
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47 
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51 
52 /* Limit for the Unicode object free list */
53 
54 #define PyUnicode_MAXFREELIST       1024
55 
56 /* Limit for the Unicode object free list stay alive optimization.
57 
58    The implementation will keep allocated Unicode memory intact for
59    all objects on the free list having a size less than this
60    limit. This reduces malloc() overhead for small Unicode objects.
61 
62    At worst this will result in PyUnicode_MAXFREELIST *
63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64    malloc()-overhead) bytes of unused garbage.
65 
66    Setting the limit to 0 effectively turns the feature off.
67 
68    Note: This is an experimental feature ! If you get core dumps when
69    using Unicode objects, turn this feature off.
70 
71 */
72 
73 #define KEEPALIVE_SIZE_LIMIT       9
74 
75 /* Endianness switches; defaults to little endian */
76 
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
82 
83 /* --- Globals ------------------------------------------------------------
84 
85 NOTE: In the interpreter's initialization phase, some globals are currently
86       initialized dynamically as needed. In the process Unicode objects may
87       be created before the Unicode type is ready.
88 
89 */
90 
91 
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95 
96 /* Free list for Unicode objects */
97 static PyUnicodeObject *free_list = NULL;
98 static int numfree = 0;
99 
100 /* The empty Unicode object is shared to improve performance. */
101 static PyUnicodeObject *unicode_empty = NULL;
102 
/* Return the shared empty Unicode object from the *enclosing function*.

   NOTE: this macro expands to a `return` statement, so it may only be
   used inside functions returning PyObject *.

   If unicode_empty already exists, the caller gets a new reference to it.
   Otherwise it is created on demand with _PyUnicode_New(0); the reference
   returned by _PyUnicode_New() stays in the unicode_empty global and an
   additional reference is taken for the caller.  If that allocation
   fails, unicode_empty stays NULL and NULL is returned. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = _PyUnicode_New(0);          \
            if (unicode_empty != NULL)                  \
                Py_INCREF(unicode_empty);               \
        }                                               \
        return (PyObject *)unicode_empty;               \
    } while (0)
114 
115 /* Single character Unicode strings in the Latin-1 range are being
116    shared as well. */
117 static PyUnicodeObject *unicode_latin1[256] = {NULL};
118 
119 /* Default encoding to use and assume when NULL is passed as encoding
120    parameter; it is initialized by _PyUnicode_Init().
121 
122    Always use the PyUnicode_SetDefaultEncoding() and
123    PyUnicode_GetDefaultEncoding() APIs to access this global.
124 
125 */
126 static char unicode_default_encoding[100 + 1] = "ascii";
127 
128 /* Fast detection of the most frequent whitespace characters */
129 const unsigned char _Py_ascii_whitespace[] = {
130     0, 0, 0, 0, 0, 0, 0, 0,
131 /*     case 0x0009: * CHARACTER TABULATION */
132 /*     case 0x000A: * LINE FEED */
133 /*     case 0x000B: * LINE TABULATION */
134 /*     case 0x000C: * FORM FEED */
135 /*     case 0x000D: * CARRIAGE RETURN */
136     0, 1, 1, 1, 1, 1, 0, 0,
137     0, 0, 0, 0, 0, 0, 0, 0,
138 /*     case 0x001C: * FILE SEPARATOR */
139 /*     case 0x001D: * GROUP SEPARATOR */
140 /*     case 0x001E: * RECORD SEPARATOR */
141 /*     case 0x001F: * UNIT SEPARATOR */
142     0, 0, 0, 0, 1, 1, 1, 1,
143 /*     case 0x0020: * SPACE */
144     1, 0, 0, 0, 0, 0, 0, 0,
145     0, 0, 0, 0, 0, 0, 0, 0,
146     0, 0, 0, 0, 0, 0, 0, 0,
147     0, 0, 0, 0, 0, 0, 0, 0,
148 
149     0, 0, 0, 0, 0, 0, 0, 0,
150     0, 0, 0, 0, 0, 0, 0, 0,
151     0, 0, 0, 0, 0, 0, 0, 0,
152     0, 0, 0, 0, 0, 0, 0, 0,
153     0, 0, 0, 0, 0, 0, 0, 0,
154     0, 0, 0, 0, 0, 0, 0, 0,
155     0, 0, 0, 0, 0, 0, 0, 0,
156     0, 0, 0, 0, 0, 0, 0, 0
157 };
158 
159 /* Same for linebreaks */
160 static unsigned char ascii_linebreak[] = {
161     0, 0, 0, 0, 0, 0, 0, 0,
162 /*         0x000A, * LINE FEED */
163 /*         0x000B, * LINE TABULATION */
164 /*         0x000C, * FORM FEED */
165 /*         0x000D, * CARRIAGE RETURN */
166     0, 0, 1, 1, 1, 1, 0, 0,
167     0, 0, 0, 0, 0, 0, 0, 0,
168 /*         0x001C, * FILE SEPARATOR */
169 /*         0x001D, * GROUP SEPARATOR */
170 /*         0x001E, * RECORD SEPARATOR */
171     0, 0, 0, 0, 1, 1, 1, 0,
172     0, 0, 0, 0, 0, 0, 0, 0,
173     0, 0, 0, 0, 0, 0, 0, 0,
174     0, 0, 0, 0, 0, 0, 0, 0,
175     0, 0, 0, 0, 0, 0, 0, 0,
176 
177     0, 0, 0, 0, 0, 0, 0, 0,
178     0, 0, 0, 0, 0, 0, 0, 0,
179     0, 0, 0, 0, 0, 0, 0, 0,
180     0, 0, 0, 0, 0, 0, 0, 0,
181     0, 0, 0, 0, 0, 0, 0, 0,
182     0, 0, 0, 0, 0, 0, 0, 0,
183     0, 0, 0, 0, 0, 0, 0, 0,
184     0, 0, 0, 0, 0, 0, 0, 0
185 };
186 
187 
188 Py_UNICODE
PyUnicode_GetMax(void)189 PyUnicode_GetMax(void)
190 {
191 #ifdef Py_UNICODE_WIDE
192     return 0x10FFFF;
193 #else
194     /* This is actually an illegal character, so it should
195        not be passed to unichr. */
196     return 0xFFFF;
197 #endif
198 }
199 
200 /* --- Bloom Filters ----------------------------------------------------- */
201 
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used: the low bits of each
   Unicode character (masked with BLOOM_WIDTH - 1, i.e. 5, 6 or 7 bits
   depending on LONG_BIT) select the bit index. */
205 
206 /* the linebreak mask is set up by Unicode_Init below */
207 
208 #if LONG_BIT >= 128
209 #define BLOOM_WIDTH 128
210 #elif LONG_BIT >= 64
211 #define BLOOM_WIDTH 64
212 #elif LONG_BIT >= 32
213 #define BLOOM_WIDTH 32
214 #else
215 #error "LONG_BIT is smaller than 32"
216 #endif
217 
218 #define BLOOM_MASK unsigned long
219 
220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
221 
222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224 
225 #define BLOOM_LINEBREAK(ch)                                             \
226     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
227      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228 
make_bloom_mask(Py_UNICODE * ptr,Py_ssize_t len)229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 {
231     /* calculate simple bloom-style bitmask for a given unicode string */
232 
233     BLOOM_MASK mask;
234     Py_ssize_t i;
235 
236     mask = 0;
237     for (i = 0; i < len; i++)
238         BLOOM_ADD(mask, ptr[i]);
239 
240     return mask;
241 }
242 
unicode_member(Py_UNICODE chr,Py_UNICODE * set,Py_ssize_t setlen)243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 {
245     Py_ssize_t i;
246 
247     for (i = 0; i < setlen; i++)
248         if (set[i] == chr)
249             return 1;
250 
251     return 0;
252 }
253 
/* Test whether `chr` is a member of `set` (of length `setlen`): the bloom
   mask is consulted first as a cheap pre-filter, and only on a possible
   hit is the set scanned with unicode_member().

   The whole expansion is parenthesized so that the macro behaves as a
   single expression when combined with `!`, `||`, `?:` and friends.
   Note that `chr` is evaluated more than once, so it must be free of
   side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256 
257 /* --- Unicode Object ----------------------------------------------------- */
258 
259 static
unicode_resize(register PyUnicodeObject * unicode,Py_ssize_t length)260 int unicode_resize(register PyUnicodeObject *unicode,
261                    Py_ssize_t length)
262 {
263     void *oldstr;
264 
265     /* Shortcut if there's nothing much to do. */
266     if (unicode->length == length)
267         goto reset;
268 
269     /* Resizing shared object (unicode_empty or single character
270        objects) in-place is not allowed. Use PyUnicode_Resize()
271        instead ! */
272 
273     if (unicode == unicode_empty ||
274         (unicode->length == 1 &&
275          unicode->str[0] < 256U &&
276          unicode_latin1[unicode->str[0]] == unicode)) {
277         PyErr_SetString(PyExc_SystemError,
278                         "can't resize shared unicode objects");
279         return -1;
280     }
281 
282     /* We allocate one more byte to make sure the string is Ux0000 terminated.
283        The overallocation is also used by fastsearch, which assumes that it's
284        safe to look at str[length] (without making any assumptions about what
285        it contains). */
286 
287     oldstr = unicode->str;
288     unicode->str = PyObject_REALLOC(unicode->str,
289                                     sizeof(Py_UNICODE) * (length + 1));
290     if (!unicode->str) {
291         unicode->str = (Py_UNICODE *)oldstr;
292         PyErr_NoMemory();
293         return -1;
294     }
295     unicode->str[length] = 0;
296     unicode->length = length;
297 
298   reset:
299     /* Reset the object caches */
300     if (unicode->defenc) {
301         Py_CLEAR(unicode->defenc);
302     }
303     unicode->hash = -1;
304 
305     return 0;
306 }
307 
/* Allocate a new Unicode object with room for `length` characters.

   One extra Py_UNICODE (not just one byte) is allocated so that the
   string is always Ux0000 terminated; some code relies on that.

   Objects are taken from the free list when one is available; such an
   object may still own a character buffer (Keep-Alive optimization), in
   which case the buffer is reused and only ever grown.

   Returns a new reference, or NULL with MemoryError set on failure.
   Except for str[0] and the terminator str[length], the buffer contents
   are left uninitialized.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

static
PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
{
    register PyUnicodeObject *unicode;

    /* Optimization for empty strings: hand out the shared empty object
       once it exists. */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Ensure we won't overflow the size.
       NOTE(review): sizeof() makes this an unsigned comparison, so a
       negative length presumably also ends up here -- confirm. */
    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }

    /* Unicode freelist & memory allocation */
    if (free_list) {
        /* Pop the head of the free list; the link to the next free
           object is stored in the first pointer-sized slot of the
           object itself (see unicode_dealloc). */
        unicode = free_list;
        free_list = *(PyUnicodeObject **)unicode;
        numfree--;
        if (unicode->str) {
            /* Keep-Alive optimization: we only upsize the buffer,
               never downsize it. */
            if ((unicode->length < length) &&
                unicode_resize(unicode, length) < 0) {
                /* Growing failed: drop the buffer so the common
                   "no buffer" error path below is taken. */
                PyObject_DEL(unicode->str);
                unicode->str = NULL;
            }
        }
        else {
            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
        }
        /* Re-initialize the object header of the recycled object. */
        (void)PyObject_INIT(unicode, &PyUnicode_Type);
    }
    else {
        /* Free list is empty: allocate a fresh object and buffer. */
        size_t new_size;
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
        if (unicode == NULL)
            return NULL;
        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    }

    if (!unicode->str) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    unicode->str[0] = 0;
    unicode->str[length] = 0;
    unicode->length = length;
    unicode->hash = -1;
    unicode->defenc = NULL;
    return unicode;

  onError:
    /* The object header was already initialized above, so the reference
       bookkeeping is undone by hand before the memory is released. */
    /* XXX UNREF/NEWREF interface should be more symmetrical */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference((PyObject *)unicode);
    PyObject_Del(unicode);
    return NULL;
}
386 
387 static
unicode_dealloc(register PyUnicodeObject * unicode)388 void unicode_dealloc(register PyUnicodeObject *unicode)
389 {
390     if (PyUnicode_CheckExact(unicode) &&
391         numfree < PyUnicode_MAXFREELIST) {
392         /* Keep-Alive optimization */
393         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394             PyObject_DEL(unicode->str);
395             unicode->str = NULL;
396             unicode->length = 0;
397         }
398         if (unicode->defenc) {
399             Py_CLEAR(unicode->defenc);
400         }
401         /* Add to free list */
402         *(PyUnicodeObject **)unicode = free_list;
403         free_list = unicode;
404         numfree++;
405     }
406     else {
407         PyObject_DEL(unicode->str);
408         Py_XDECREF(unicode->defenc);
409         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410     }
411 }
412 
413 static
_PyUnicode_Resize(PyUnicodeObject ** unicode,Py_ssize_t length)414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415 {
416     register PyUnicodeObject *v;
417 
418     /* Argument checks */
419     if (unicode == NULL) {
420         PyErr_BadInternalCall();
421         return -1;
422     }
423     v = *unicode;
424     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425         PyErr_BadInternalCall();
426         return -1;
427     }
428 
429     /* Resizing unicode_empty and single character objects is not
430        possible since these are being shared. We simply return a fresh
431        copy with the same Unicode content. */
432     if (v->length != length &&
433         (v == unicode_empty || v->length == 1)) {
434         PyUnicodeObject *w = _PyUnicode_New(length);
435         if (w == NULL)
436             return -1;
437         Py_UNICODE_COPY(w->str, v->str,
438                         length < v->length ? length : v->length);
439         Py_SETREF(*unicode, w);
440         return 0;
441     }
442 
443     /* Note that we don't have to modify *unicode for unshared Unicode
444        objects, since we can modify them in-place. */
445     return unicode_resize(v, length);
446 }
447 
/* Public entry point for resizing: forwards to _PyUnicode_Resize() with
   the object pointer reinterpreted as PyUnicodeObject **.  Returns
   whatever _PyUnicode_Resize() returns. */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    PyUnicodeObject **target = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(target, length);
}
452 
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)453 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
454                                 Py_ssize_t size)
455 {
456     PyUnicodeObject *unicode;
457 
458     /* If the Unicode data is known at construction time, we can apply
459        some optimizations which share commonly used objects. */
460     if (u != NULL) {
461 
462         /* Optimization for empty strings */
463         if (size == 0)
464             _Py_RETURN_UNICODE_EMPTY();
465 
466         /* Single character Unicode objects in the Latin-1 range are
467            shared when using this constructor */
468         if (size == 1 && *u < 256) {
469             unicode = unicode_latin1[*u];
470             if (!unicode) {
471                 unicode = _PyUnicode_New(1);
472                 if (!unicode)
473                     return NULL;
474                 unicode->str[0] = *u;
475                 unicode_latin1[*u] = unicode;
476             }
477             Py_INCREF(unicode);
478             return (PyObject *)unicode;
479         }
480     }
481 
482     unicode = _PyUnicode_New(size);
483     if (!unicode)
484         return NULL;
485 
486     /* Copy the Unicode data into the new object */
487     if (u != NULL)
488         Py_UNICODE_COPY(unicode->str, u, size);
489 
490     return (PyObject *)unicode;
491 }
492 
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)493 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
494 {
495     PyUnicodeObject *unicode;
496 
497     if (size < 0) {
498         PyErr_SetString(PyExc_SystemError,
499                         "Negative size passed to PyUnicode_FromStringAndSize");
500         return NULL;
501     }
502 
503     /* If the Unicode data is known at construction time, we can apply
504        some optimizations which share commonly used objects.
505        Also, this means the input must be UTF-8, so fall back to the
506        UTF-8 decoder at the end. */
507     if (u != NULL) {
508 
509         /* Optimization for empty strings */
510         if (size == 0)
511             _Py_RETURN_UNICODE_EMPTY();
512 
513         /* Single characters are shared when using this constructor.
514            Restrict to ASCII, since the input must be UTF-8. */
515         if (size == 1 && Py_CHARMASK(*u) < 128) {
516             unicode = unicode_latin1[Py_CHARMASK(*u)];
517             if (!unicode) {
518                 unicode = _PyUnicode_New(1);
519                 if (!unicode)
520                     return NULL;
521                 unicode->str[0] = Py_CHARMASK(*u);
522                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
523             }
524             Py_INCREF(unicode);
525             return (PyObject *)unicode;
526         }
527 
528         return PyUnicode_DecodeUTF8(u, size, NULL);
529     }
530 
531     unicode = _PyUnicode_New(size);
532     if (!unicode)
533         return NULL;
534 
535     return (PyObject *)unicode;
536 }
537 
PyUnicode_FromString(const char * u)538 PyObject *PyUnicode_FromString(const char *u)
539 {
540     size_t size = strlen(u);
541     if (size > PY_SSIZE_T_MAX) {
542         PyErr_SetString(PyExc_OverflowError, "input too long");
543         return NULL;
544     }
545 
546     return PyUnicode_FromStringAndSize(u, size);
547 }
548 
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer
 * (one past the last character).
 * On narrow builds a high surrogate is only combined with the following
 * low surrogate when that low surrogate lies inside [ptr, end); a lone
 * surrogate in the last position is returned unchanged and the element at
 * 'end' is never read.
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to the next char, so it might have
 *       side-effects (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT; the argument is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* The bounds test comes first so that (ptr)[1] is only inspected when at
   least two characters remain before 'end'. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((end) - (ptr) > 1) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
579 
580 #ifdef HAVE_WCHAR_H
581 
582 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
583 # define CONVERT_WCHAR_TO_SURROGATES
584 #endif
585 
586 #ifdef CONVERT_WCHAR_TO_SURROGATES
587 
588 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
589    to convert from UTF32 to UTF16. */
590 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)591 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
592                                  Py_ssize_t size)
593 {
594     PyUnicodeObject *unicode;
595     register Py_ssize_t i;
596     Py_ssize_t alloc;
597     const wchar_t *orig_w;
598 
599     if (w == NULL) {
600         PyErr_BadInternalCall();
601         return NULL;
602     }
603 
604     alloc = size;
605     orig_w = w;
606     for (i = size; i > 0; i--) {
607         if (*w > 0xFFFF)
608             alloc++;
609         w++;
610     }
611     w = orig_w;
612     unicode = _PyUnicode_New(alloc);
613     if (!unicode)
614         return NULL;
615 
616     /* Copy the wchar_t data into the new object */
617     {
618         register Py_UNICODE *u;
619         u = PyUnicode_AS_UNICODE(unicode);
620         for (i = size; i > 0; i--) {
621             if (*w > 0xFFFF) {
622                 wchar_t ordinal = *w++;
623                 ordinal -= 0x10000;
624                 *u++ = 0xD800 | (ordinal >> 10);
625                 *u++ = 0xDC00 | (ordinal & 0x3FF);
626             }
627             else
628                 *u++ = *w++;
629         }
630     }
631     return (PyObject *)unicode;
632 }
633 
634 #else
635 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)636 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
637                                  Py_ssize_t size)
638 {
639     PyUnicodeObject *unicode;
640 
641     if (w == NULL) {
642         PyErr_BadInternalCall();
643         return NULL;
644     }
645 
646     unicode = _PyUnicode_New(size);
647     if (!unicode)
648         return NULL;
649 
650     /* Copy the wchar_t data into the new object */
651 #ifdef HAVE_USABLE_WCHAR_T
652     memcpy(unicode->str, w, size * sizeof(wchar_t));
653 #else
654     {
655         register Py_UNICODE *u;
656         register Py_ssize_t i;
657         u = PyUnicode_AS_UNICODE(unicode);
658         for (i = size; i > 0; i--)
659             *u++ = *w++;
660     }
661 #endif
662 
663     return (PyObject *)unicode;
664 }
665 
666 #endif /* CONVERT_WCHAR_TO_SURROGATES */
667 
668 #undef CONVERT_WCHAR_TO_SURROGATES
669 
670 static void
makefmt(char * fmt,int longflag,int size_tflag,int zeropad,int width,int precision,char c)671 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
672 {
673     *fmt++ = '%';
674     if (width) {
675         if (zeropad)
676             *fmt++ = '0';
677         fmt += sprintf(fmt, "%d", width);
678     }
679     if (precision)
680         fmt += sprintf(fmt, ".%d", precision);
681     if (longflag)
682         *fmt++ = 'l';
683     else if (size_tflag) {
684         char *f = PY_FORMAT_SIZE_T;
685         while (*f)
686             *fmt++ = *f++;
687     }
688     *fmt++ = c;
689     *fmt = '\0';
690 }
691 
/* Append the NUL-terminated char string `string` to the output, one
   character at a time, widening each char via unsigned char.

   This macro relies on two variables in the caller's scope: `s`, the
   output cursor (advanced past the appended characters), and `copy`, a
   scratch pointer used to walk `string`.  No bounds checking is done. */
#define appendstring(string) \
    do { \
        for (copy = string;*copy; copy++) { \
            *s++ = (unsigned char)*copy; \
        } \
    } while (0)
698 
699 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)700 PyUnicode_FromFormatV(const char *format, va_list vargs)
701 {
702     va_list count;
703     Py_ssize_t callcount = 0;
704     PyObject **callresults = NULL;
705     PyObject **callresult = NULL;
706     Py_ssize_t n = 0;
707     int width = 0;
708     int precision = 0;
709     int zeropad;
710     const char* f;
711     Py_UNICODE *s;
712     PyObject *string;
713     /* used by sprintf */
714     char buffer[21];
715     /* use abuffer instead of buffer, if we need more space
716      * (which can happen if there's a format specifier with width). */
717     char *abuffer = NULL;
718     char *realbuffer;
719     Py_ssize_t abuffersize = 0;
720     char fmt[60]; /* should be enough for %0width.precisionld */
721     const char *copy;
722 
723 #ifdef VA_LIST_IS_ARRAY
724     Py_MEMCPY(count, vargs, sizeof(va_list));
725 #else
726 #ifdef  __va_copy
727     __va_copy(count, vargs);
728 #else
729     count = vargs;
730 #endif
731 #endif
732      /* step 1: count the number of %S/%R/%s format specifications
733       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
734       * objects once during step 3 and put the result in an array) */
735     for (f = format; *f; f++) {
736          if (*f == '%') {
737              f++;
738              while (*f && *f != '%' && !isalpha((unsigned)*f))
739                  f++;
740              if (!*f)
741                  break;
742              if (*f == 's' || *f=='S' || *f=='R')
743                  ++callcount;
744          }
745     }
746     /* step 2: allocate memory for the results of
747      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
748     if (callcount) {
749         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
750         if (!callresults) {
751             PyErr_NoMemory();
752             return NULL;
753         }
754         callresult = callresults;
755     }
756     /* step 3: figure out how large a buffer we need */
757     for (f = format; *f; f++) {
758         if (*f == '%') {
759             const char* p = f++;
760             width = 0;
761             while (isdigit((unsigned)*f))
762                 width = (width*10) + *f++ - '0';
763             precision = 0;
764             if (*f == '.') {
765                 f++;
766                 while (isdigit((unsigned)*f))
767                     precision = (precision*10) + *f++ - '0';
768             }
769 
770             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
771              * they don't affect the amount of space we reserve.
772              */
773             if ((*f == 'l' || *f == 'z') &&
774                 (f[1] == 'd' || f[1] == 'u'))
775                 ++f;
776 
777             switch (*f) {
778             case 'c':
779             {
780                 int ordinal = va_arg(count, int);
781 #ifdef Py_UNICODE_WIDE
782                 if (ordinal < 0 || ordinal > 0x10ffff) {
783                     PyErr_SetString(PyExc_OverflowError,
784                                     "%c arg not in range(0x110000) "
785                                     "(wide Python build)");
786                     goto fail;
787                 }
788 #else
789                 if (ordinal < 0 || ordinal > 0xffff) {
790                     PyErr_SetString(PyExc_OverflowError,
791                                     "%c arg not in range(0x10000) "
792                                     "(narrow Python build)");
793                     goto fail;
794                 }
795 #endif
796                 /* fall through... */
797             }
798             case '%':
799                 n++;
800                 break;
801             case 'd': case 'u': case 'i': case 'x':
802                 (void) va_arg(count, int);
803                 if (width < precision)
804                     width = precision;
805                 /* 20 bytes is enough to hold a 64-bit
806                    integer.  Decimal takes the most space.
807                    This isn't enough for octal.
808                    If a width is specified we need more
809                    (which we allocate later). */
810                 if (width < 20)
811                     width = 20;
812                 n += width;
813                 if (abuffersize < width)
814                     abuffersize = width;
815                 break;
816             case 's':
817             {
818                 /* UTF-8 */
819                 const char *s = va_arg(count, const char*);
820                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
821                 if (!str)
822                     goto fail;
823                 n += PyUnicode_GET_SIZE(str);
824                 /* Remember the str and switch to the next slot */
825                 *callresult++ = str;
826                 break;
827             }
828             case 'U':
829             {
830                 PyObject *obj = va_arg(count, PyObject *);
831                 assert(obj && PyUnicode_Check(obj));
832                 n += PyUnicode_GET_SIZE(obj);
833                 break;
834             }
835             case 'V':
836             {
837                 PyObject *obj = va_arg(count, PyObject *);
838                 const char *str = va_arg(count, const char *);
839                 assert(obj || str);
840                 assert(!obj || PyUnicode_Check(obj));
841                 if (obj)
842                     n += PyUnicode_GET_SIZE(obj);
843                 else
844                     n += strlen(str);
845                 break;
846             }
847             case 'S':
848             {
849                 PyObject *obj = va_arg(count, PyObject *);
850                 PyObject *str;
851                 assert(obj);
852                 str = PyObject_Str(obj);
853                 if (!str)
854                     goto fail;
855                 n += PyString_GET_SIZE(str);
856                 /* Remember the str and switch to the next slot */
857                 *callresult++ = str;
858                 break;
859             }
860             case 'R':
861             {
862                 PyObject *obj = va_arg(count, PyObject *);
863                 PyObject *repr;
864                 assert(obj);
865                 repr = PyObject_Repr(obj);
866                 if (!repr)
867                     goto fail;
868                 n += PyUnicode_GET_SIZE(repr);
869                 /* Remember the repr and switch to the next slot */
870                 *callresult++ = repr;
871                 break;
872             }
873             case 'p':
874                 (void) va_arg(count, int);
875                 /* maximum 64-bit pointer representation:
876                  * 0xffffffffffffffff
877                  * so 19 characters is enough.
878                  * XXX I count 18 -- what's the extra for?
879                  */
880                 n += 19;
881                 break;
882             default:
883                 /* if we stumble upon an unknown
884                    formatting code, copy the rest of
885                    the format string to the output
886                    string. (we cannot just skip the
887                    code, since there's no way to know
888                    what's in the argument list) */
889                 n += strlen(p);
890                 goto expand;
891             }
892         } else
893             n++;
894     }
895   expand:
896     if (abuffersize > 20) {
897         /* add 1 for sprintf's trailing null byte */
898         abuffer = PyObject_Malloc(abuffersize + 1);
899         if (!abuffer) {
900             PyErr_NoMemory();
901             goto fail;
902         }
903         realbuffer = abuffer;
904     }
905     else
906         realbuffer = buffer;
907     /* step 4: fill the buffer */
908     /* Since we've analyzed how much space we need for the worst case,
909        we don't have to resize the string.
910        There can be no errors beyond this point. */
911     string = PyUnicode_FromUnicode(NULL, n);
912     if (!string)
913         goto fail;
914 
915     s = PyUnicode_AS_UNICODE(string);
916     callresult = callresults;
917 
918     for (f = format; *f; f++) {
919         if (*f == '%') {
920             const char* p = f++;
921             int longflag = 0;
922             int size_tflag = 0;
923             zeropad = (*f == '0');
924             /* parse the width.precision part */
925             width = 0;
926             while (isdigit((unsigned)*f))
927                 width = (width*10) + *f++ - '0';
928             precision = 0;
929             if (*f == '.') {
930                 f++;
931                 while (isdigit((unsigned)*f))
932                     precision = (precision*10) + *f++ - '0';
933             }
934             /* handle the long flag, but only for %ld and %lu.
935                others can be added when necessary. */
936             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
937                 longflag = 1;
938                 ++f;
939             }
940             /* handle the size_t flag. */
941             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
942                 size_tflag = 1;
943                 ++f;
944             }
945 
946             switch (*f) {
947             case 'c':
948                 *s++ = va_arg(vargs, int);
949                 break;
950             case 'd':
951                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
952                 if (longflag)
953                     sprintf(realbuffer, fmt, va_arg(vargs, long));
954                 else if (size_tflag)
955                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
956                 else
957                     sprintf(realbuffer, fmt, va_arg(vargs, int));
958                 appendstring(realbuffer);
959                 break;
960             case 'u':
961                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
962                 if (longflag)
963                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
964                 else if (size_tflag)
965                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
966                 else
967                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
968                 appendstring(realbuffer);
969                 break;
970             case 'i':
971                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
972                 sprintf(realbuffer, fmt, va_arg(vargs, int));
973                 appendstring(realbuffer);
974                 break;
975             case 'x':
976                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
977                 sprintf(realbuffer, fmt, va_arg(vargs, int));
978                 appendstring(realbuffer);
979                 break;
980             case 's':
981             {
982                 /* unused, since we already have the result */
983                 (void) va_arg(vargs, char *);
984                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
985                                 PyUnicode_GET_SIZE(*callresult));
986                 s += PyUnicode_GET_SIZE(*callresult);
987                 /* We're done with the unicode()/repr() => forget it */
988                 Py_DECREF(*callresult);
989                 /* switch to next unicode()/repr() result */
990                 ++callresult;
991                 break;
992             }
993             case 'U':
994             {
995                 PyObject *obj = va_arg(vargs, PyObject *);
996                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
997                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
998                 s += size;
999                 break;
1000             }
1001             case 'V':
1002             {
1003                 PyObject *obj = va_arg(vargs, PyObject *);
1004                 const char *str = va_arg(vargs, const char *);
1005                 if (obj) {
1006                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1007                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1008                     s += size;
1009                 } else {
1010                     appendstring(str);
1011                 }
1012                 break;
1013             }
1014             case 'S':
1015             case 'R':
1016             {
1017                 const char *str = PyString_AS_STRING(*callresult);
1018                 /* unused, since we already have the result */
1019                 (void) va_arg(vargs, PyObject *);
1020                 appendstring(str);
1021                 /* We're done with the unicode()/repr() => forget it */
1022                 Py_DECREF(*callresult);
1023                 /* switch to next unicode()/repr() result */
1024                 ++callresult;
1025                 break;
1026             }
1027             case 'p':
1028                 sprintf(buffer, "%p", va_arg(vargs, void*));
1029                 /* %p is ill-defined:  ensure leading 0x. */
1030                 if (buffer[1] == 'X')
1031                     buffer[1] = 'x';
1032                 else if (buffer[1] != 'x') {
1033                     memmove(buffer+2, buffer, strlen(buffer)+1);
1034                     buffer[0] = '0';
1035                     buffer[1] = 'x';
1036                 }
1037                 appendstring(buffer);
1038                 break;
1039             case '%':
1040                 *s++ = '%';
1041                 break;
1042             default:
1043                 appendstring(p);
1044                 goto end;
1045             }
1046         } else
1047             *s++ = *f;
1048     }
1049 
1050   end:
1051     if (callresults)
1052         PyObject_Free(callresults);
1053     if (abuffer)
1054         PyObject_Free(abuffer);
1055     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1056     return string;
1057   fail:
1058     if (callresults) {
1059         PyObject **callresult2 = callresults;
1060         while (callresult2 < callresult) {
1061             Py_DECREF(*callresult2);
1062             ++callresult2;
1063         }
1064         PyObject_Free(callresults);
1065     }
1066     if (abuffer)
1067         PyObject_Free(abuffer);
1068     return NULL;
1069 }
1070 
1071 #undef appendstring
1072 
1073 PyObject *
PyUnicode_FromFormat(const char * format,...)1074 PyUnicode_FromFormat(const char *format, ...)
1075 {
1076     PyObject* ret;
1077     va_list vargs;
1078 
1079 #ifdef HAVE_STDARG_PROTOTYPES
1080     va_start(vargs, format);
1081 #else
1082     va_start(vargs);
1083 #endif
1084     ret = PyUnicode_FromFormatV(format, vargs);
1085     va_end(vargs);
1086     return ret;
1087 }
1088 
/* Copy the contents of `unicode` into the wchar_t buffer `w`.

   At most `size` elements are written.  When `size` is larger than the
   length of the string, one extra element past the last character (the
   0-termination) is copied as well, but it is not counted in the result.

   Returns the number of characters copied, or -1 after raising a
   bad-internal-call error when `unicode` is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    Py_ssize_t length;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    length = PyUnicode_GET_SIZE(unicode);

    /* Room to spare: include the trailing 0 in the copy. */
    if (size > length)
        size = length + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        /* Py_UNICODE and wchar_t differ: convert element by element. */
        const Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t k;

        for (k = 0; k < size; k++)
            w[k] = src[k];
    }
#endif

    /* Do not report the terminator as a copied character. */
    return (size > length) ? length : size;
}
1119 
1120 #endif
1121 
PyUnicode_FromOrdinal(int ordinal)1122 PyObject *PyUnicode_FromOrdinal(int ordinal)
1123 {
1124     Py_UNICODE s[1];
1125 
1126 #ifdef Py_UNICODE_WIDE
1127     if (ordinal < 0 || ordinal > 0x10ffff) {
1128         PyErr_SetString(PyExc_ValueError,
1129                         "unichr() arg not in range(0x110000) "
1130                         "(wide Python build)");
1131         return NULL;
1132     }
1133 #else
1134     if (ordinal < 0 || ordinal > 0xffff) {
1135         PyErr_SetString(PyExc_ValueError,
1136                         "unichr() arg not in range(0x10000) "
1137                         "(narrow Python build)");
1138         return NULL;
1139     }
1140 #endif
1141 
1142     s[0] = (Py_UNICODE)ordinal;
1143     return PyUnicode_FromUnicode(s, 1);
1144 }
1145 
PyUnicode_FromObject(register PyObject * obj)1146 PyObject *PyUnicode_FromObject(register PyObject *obj)
1147 {
1148     /* XXX Perhaps we should make this API an alias of
1149        PyObject_Unicode() instead ?! */
1150     if (PyUnicode_CheckExact(obj)) {
1151         Py_INCREF(obj);
1152         return obj;
1153     }
1154     if (PyUnicode_Check(obj)) {
1155         /* For a Unicode subtype that's not a Unicode object,
1156            return a true Unicode object with the same data. */
1157         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1158                                      PyUnicode_GET_SIZE(obj));
1159     }
1160     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1161 }
1162 
PyUnicode_FromEncodedObject(register PyObject * obj,const char * encoding,const char * errors)1163 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1164                                       const char *encoding,
1165                                       const char *errors)
1166 {
1167     const char *s = NULL;
1168     Py_ssize_t len;
1169     PyObject *v;
1170 
1171     if (obj == NULL) {
1172         PyErr_BadInternalCall();
1173         return NULL;
1174     }
1175 
1176 #if 0
1177     /* For b/w compatibility we also accept Unicode objects provided
1178        that no encodings is given and then redirect to
1179        PyObject_Unicode() which then applies the additional logic for
1180        Unicode subclasses.
1181 
1182        NOTE: This API should really only be used for object which
1183        represent *encoded* Unicode !
1184 
1185     */
1186     if (PyUnicode_Check(obj)) {
1187         if (encoding) {
1188             PyErr_SetString(PyExc_TypeError,
1189                             "decoding Unicode is not supported");
1190             return NULL;
1191         }
1192         return PyObject_Unicode(obj);
1193     }
1194 #else
1195     if (PyUnicode_Check(obj)) {
1196         PyErr_SetString(PyExc_TypeError,
1197                         "decoding Unicode is not supported");
1198         return NULL;
1199     }
1200 #endif
1201 
1202     /* Coerce object */
1203     if (PyString_Check(obj)) {
1204         s = PyString_AS_STRING(obj);
1205         len = PyString_GET_SIZE(obj);
1206     }
1207     else if (PyByteArray_Check(obj)) {
1208         /* Python 2.x specific */
1209         PyErr_Format(PyExc_TypeError,
1210                      "decoding bytearray is not supported");
1211         return NULL;
1212     }
1213     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1214         /* Overwrite the error message with something more useful in
1215            case of a TypeError. */
1216         if (PyErr_ExceptionMatches(PyExc_TypeError))
1217             PyErr_Format(PyExc_TypeError,
1218                          "coercing to Unicode: need string or buffer, "
1219                          "%.80s found",
1220                          Py_TYPE(obj)->tp_name);
1221         goto onError;
1222     }
1223 
1224     /* Convert to Unicode */
1225     if (len == 0)
1226         _Py_RETURN_UNICODE_EMPTY();
1227 
1228     v = PyUnicode_Decode(s, len, encoding, errors);
1229     return v;
1230 
1231   onError:
1232     return NULL;
1233 }
1234 
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)1235 PyObject *PyUnicode_Decode(const char *s,
1236                            Py_ssize_t size,
1237                            const char *encoding,
1238                            const char *errors)
1239 {
1240     PyObject *buffer = NULL, *unicode;
1241 
1242     if (encoding == NULL)
1243         encoding = PyUnicode_GetDefaultEncoding();
1244 
1245     /* Shortcuts for common default encodings */
1246     if (strcmp(encoding, "utf-8") == 0)
1247         return PyUnicode_DecodeUTF8(s, size, errors);
1248     else if (strcmp(encoding, "latin-1") == 0)
1249         return PyUnicode_DecodeLatin1(s, size, errors);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251     else if (strcmp(encoding, "mbcs") == 0)
1252         return PyUnicode_DecodeMBCS(s, size, errors);
1253 #endif
1254     else if (strcmp(encoding, "ascii") == 0)
1255         return PyUnicode_DecodeASCII(s, size, errors);
1256 
1257     /* Decode via the codec registry */
1258     buffer = PyBuffer_FromMemory((void *)s, size);
1259     if (buffer == NULL)
1260         goto onError;
1261     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
1262     if (unicode == NULL)
1263         goto onError;
1264     if (!PyUnicode_Check(unicode)) {
1265         PyErr_Format(PyExc_TypeError,
1266                      "decoder did not return an unicode object (type=%.400s)",
1267                      Py_TYPE(unicode)->tp_name);
1268         Py_DECREF(unicode);
1269         goto onError;
1270     }
1271     Py_DECREF(buffer);
1272     return unicode;
1273 
1274   onError:
1275     Py_XDECREF(buffer);
1276     return NULL;
1277 }
1278 
/* Run a Unicode object through the decoder registered for `encoding`
   (the default encoding when `encoding` is NULL) and return the result.

   A non-Unicode argument is reported via PyErr_BadArgument().  A Py3k
   warning is issued before decoding; if issuing it fails, the call
   fails too.  Returns NULL on any error. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnPy3k("decoding Unicode is not supported in 3.x", 1) < 0)
        return NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry; NULL propagates to the caller. */
    return _PyCodec_DecodeText(unicode, encoding, errors);
}
1305 
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)1306 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1307                            Py_ssize_t size,
1308                            const char *encoding,
1309                            const char *errors)
1310 {
1311     PyObject *v, *unicode;
1312 
1313     unicode = PyUnicode_FromUnicode(s, size);
1314     if (unicode == NULL)
1315         return NULL;
1316     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1317     Py_DECREF(unicode);
1318     return v;
1319 }
1320 
/* Encode a Unicode object through the codec registry and return
   whatever object the encoder produces (its type is not checked here).

   A NULL `encoding` selects the default encoding.  A non-Unicode
   argument is reported via PyErr_BadArgument().  Returns NULL on
   error. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; NULL propagates to the caller. */
    return _PyCodec_EncodeText(unicode, encoding, errors);
}
1344 
/* Encode a Unicode object and return the result as a string object.

   A NULL `encoding` selects the default encoding.  When `errors` is
   NULL, the names "utf-8", "latin-1", "ascii" (and "mbcs" on Windows
   builds with a usable wchar_t) are matched exactly and handled by the
   builtin encoders.  Everything else goes through the codec registry,
   and an encoder that does not return a string object causes a
   TypeError.

   Returns NULL on error. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Fast paths are only taken with default error handling. */
    if (errors == NULL) {
        if (!strcmp(encoding, "utf-8"))
            return PyUnicode_AsUTF8String(unicode);
        if (!strcmp(encoding, "latin-1"))
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (!strcmp(encoding, "mbcs"))
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (!strcmp(encoding, "ascii"))
            return PyUnicode_AsASCIIString(unicode);
    }

    /* General case: go through the codec registry. */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;

    if (PyString_Check(encoded))
        return encoded;

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a string object (type=%.400s)",
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
1389 
/* Return `unicode` encoded with the default encoding.

   If the object already carries a cached encoded form in its `defenc`
   slot, that object is returned directly.  Otherwise the string is
   encoded with PyUnicode_AsEncodedString(); the fresh result is stored
   in `defenc` only when `errors` is NULL.  Returns NULL if encoding
   fails.

   NOTE(review): the argument is cast to PyUnicodeObject without a type
   check, so callers presumably guarantee it is a Unicode object. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded = self->defenc;

    if (encoded != NULL)
        return encoded;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;

    return encoded;
}
1402 
PyUnicode_AsUnicode(PyObject * unicode)1403 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1404 {
1405     if (!PyUnicode_Check(unicode)) {
1406         PyErr_BadArgument();
1407         goto onError;
1408     }
1409     return PyUnicode_AS_UNICODE(unicode);
1410 
1411   onError:
1412     return NULL;
1413 }
1414 
PyUnicode_GetSize(PyObject * unicode)1415 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1416 {
1417     if (!PyUnicode_Check(unicode)) {
1418         PyErr_BadArgument();
1419         goto onError;
1420     }
1421     return PyUnicode_GET_SIZE(unicode);
1422 
1423   onError:
1424     return -1;
1425 }
1426 
PyUnicode_GetDefaultEncoding(void)1427 const char *PyUnicode_GetDefaultEncoding(void)
1428 {
1429     return unicode_default_encoding;
1430 }
1431 
PyUnicode_SetDefaultEncoding(const char * encoding)1432 int PyUnicode_SetDefaultEncoding(const char *encoding)
1433 {
1434     PyObject *v;
1435 
1436     /* Make sure the encoding is valid. As side effect, this also
1437        loads the encoding into the codec registry cache. */
1438     v = _PyCodec_Lookup(encoding);
1439     if (v == NULL)
1440         goto onError;
1441     Py_DECREF(v);
1442     strncpy(unicode_default_encoding,
1443             encoding,
1444             sizeof(unicode_default_encoding) - 1);
1445     return 0;
1446 
1447   onError:
1448     return -1;
1449 }
1450 
1451 /* error handling callback helper:
1452    build arguments, call the callback and check the arguments,
1453    if no exception occurred, copy the replacement to the output
1454    and adjust various state variables.
1455    return 0 on success, -1 on error
1456 */
1457 
1458 static
unicode_decode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char * input,Py_ssize_t insize,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,PyUnicodeObject ** output,Py_ssize_t * outpos,Py_UNICODE ** outptr)1459 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1460                                      const char *encoding, const char *reason,
1461                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1462                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1463                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1464 {
1465     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1466 
1467     PyObject *restuple = NULL;
1468     PyObject *repunicode = NULL;
1469     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1470     Py_ssize_t requiredsize;
1471     Py_ssize_t newpos;
1472     Py_UNICODE *repptr;
1473     Py_ssize_t repsize;
1474     int res = -1;
1475 
1476     if (*errorHandler == NULL) {
1477         *errorHandler = PyCodec_LookupError(errors);
1478         if (*errorHandler == NULL)
1479             goto onError;
1480     }
1481 
1482     if (*exceptionObject == NULL) {
1483         *exceptionObject = PyUnicodeDecodeError_Create(
1484             encoding, input, insize, *startinpos, *endinpos, reason);
1485         if (*exceptionObject == NULL)
1486             goto onError;
1487     }
1488     else {
1489         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1490             goto onError;
1491         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1492             goto onError;
1493         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1494             goto onError;
1495     }
1496 
1497     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1498     if (restuple == NULL)
1499         goto onError;
1500     if (!PyTuple_Check(restuple)) {
1501         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1502         goto onError;
1503     }
1504     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1505         goto onError;
1506     if (newpos<0)
1507         newpos = insize+newpos;
1508     if (newpos<0 || newpos>insize) {
1509         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1510         goto onError;
1511     }
1512 
1513     /* need more space? (at least enough for what we
1514        have+the replacement+the rest of the string (starting
1515        at the new input position), so we won't have to check space
1516        when there are no errors in the rest of the string) */
1517     repptr = PyUnicode_AS_UNICODE(repunicode);
1518     repsize = PyUnicode_GET_SIZE(repunicode);
1519     requiredsize = *outpos;
1520     if (requiredsize > PY_SSIZE_T_MAX - repsize)
1521         goto overflow;
1522     requiredsize += repsize;
1523     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1524         goto overflow;
1525     requiredsize += insize - newpos;
1526     if (requiredsize > outsize) {
1527         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
1528             requiredsize = 2*outsize;
1529         if (_PyUnicode_Resize(output, requiredsize) < 0)
1530             goto onError;
1531         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1532     }
1533     *endinpos = newpos;
1534     *inptr = input + newpos;
1535     Py_UNICODE_COPY(*outptr, repptr, repsize);
1536     *outptr += repsize;
1537     *outpos += repsize;
1538     /* we made it! */
1539     res = 0;
1540 
1541   onError:
1542     Py_XDECREF(restuple);
1543     return res;
1544 
1545   overflow:
1546     PyErr_SetString(PyExc_OverflowError,
1547                     "decoded result is too long for a Python string");
1548     goto onError;
1549 }
1550 
1551 /* --- UTF-7 Codec -------------------------------------------------------- */
1552 
1553 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1554 
1555 /* Three simple macros defining base-64. */
1556 
/* True (1) if c is one of the 64 base-64 alphabet characters
   A-Z, a-z, 0-9, '+' or '/'; false (0) otherwise.
   The argument is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
1564 
/* Base-64 value (0..63) of c, given that c is a base-64 character:
   A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, and anything
   else (i.e. '/') -> 63.  The argument is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1572 
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
1577 
/* DECODE_DIRECT: true if a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which introduces a base64 section.  The argument is
 * evaluated more than once. */

#define DECODE_DIRECT(c)                                \
    (!((c) == '+' || (c) > 127))
1585 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by the ASCII
 * code 0..127, gives the category of each character:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is never modified, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1619 
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only 1..127 can ever be direct (the range
 *            test also keeps the utf7_category lookup in bounds)
 * directO  - non-zero to encode "Set O" characters (category 1) directly
 * directWS - non-zero to encode whitespace (category 2) directly
 *
 * "Set D" characters (category 0) are always direct.  All three
 * parameters are parenthesized in the expansion so that expression
 * arguments such as `a || b` keep their meaning; c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1631 
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)1632 PyObject *PyUnicode_DecodeUTF7(const char *s,
1633                                Py_ssize_t size,
1634                                const char *errors)
1635 {
1636     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1637 }
1638 
1639 /* The decoder.  The only state we preserve is our read position,
1640  * i.e. how many characters we have consumed.  So if we end in the
1641  * middle of a shift sequence we have to back off the read position
1642  * and the output to the beginning of the sequence, otherwise we lose
1643  * all the shift state (seen bits, number of bits seen, high
1644  * surrogate). */
1645 
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1646 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1647                                        Py_ssize_t size,
1648                                        const char *errors,
1649                                        Py_ssize_t *consumed)
1650 {
1651     const char *starts = s;
1652     Py_ssize_t startinpos;
1653     Py_ssize_t endinpos;
1654     Py_ssize_t outpos;
1655     const char *e;
1656     PyUnicodeObject *unicode;
1657     Py_UNICODE *p;
1658     const char *errmsg = "";
1659     int inShift = 0;
1660     Py_UNICODE *shiftOutStart;
1661     unsigned int base64bits = 0;
1662     unsigned long base64buffer = 0;
1663     Py_UNICODE surrogate = 0;
1664     PyObject *errorHandler = NULL;
1665     PyObject *exc = NULL;
1666 
1667     unicode = _PyUnicode_New(size);
1668     if (!unicode)
1669         return NULL;
1670     if (size == 0) {
1671         if (consumed)
1672             *consumed = 0;
1673         return (PyObject *)unicode;
1674     }
1675 
1676     p = unicode->str;
1677     shiftOutStart = p;
1678     e = s + size;
1679 
1680     while (s < e) {
1681         Py_UNICODE ch = (unsigned char) *s;
1682 
1683         if (inShift) { /* in a base-64 section */
1684             if (IS_BASE64(ch)) { /* consume a base-64 character */
1685                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1686                 base64bits += 6;
1687                 s++;
1688                 if (base64bits >= 16) {
1689                     /* we have enough bits for a UTF-16 value */
1690                     Py_UNICODE outCh = (Py_UNICODE)
1691                                        (base64buffer >> (base64bits-16));
1692                     base64bits -= 16;
1693                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1694                     assert(outCh <= 0xffff);
1695                     if (surrogate) {
1696                         /* expecting a second surrogate */
1697                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1698 #ifdef Py_UNICODE_WIDE
1699                             *p++ = (((surrogate & 0x3FF)<<10)
1700                                     | (outCh & 0x3FF)) + 0x10000;
1701 #else
1702                             *p++ = surrogate;
1703                             *p++ = outCh;
1704 #endif
1705                             surrogate = 0;
1706                             continue;
1707                         }
1708                         else {
1709                             *p++ = surrogate;
1710                             surrogate = 0;
1711                         }
1712                     }
1713                     if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1714                         /* first surrogate */
1715                         surrogate = outCh;
1716                     }
1717                     else {
1718                         *p++ = outCh;
1719                     }
1720                 }
1721             }
1722             else { /* now leaving a base-64 section */
1723                 inShift = 0;
1724                 if (base64bits > 0) { /* left-over bits */
1725                     if (base64bits >= 6) {
1726                         /* We've seen at least one base-64 character */
1727                         s++;
1728                         errmsg = "partial character in shift sequence";
1729                         goto utf7Error;
1730                     }
1731                     else {
1732                         /* Some bits remain; they should be zero */
1733                         if (base64buffer != 0) {
1734                             s++;
1735                             errmsg = "non-zero padding bits in shift sequence";
1736                             goto utf7Error;
1737                         }
1738                     }
1739                 }
1740                 if (surrogate && DECODE_DIRECT(ch))
1741                     *p++ = surrogate;
1742                 surrogate = 0;
1743                 if (ch == '-') {
1744                     /* '-' is absorbed; other terminating
1745                        characters are preserved */
1746                     s++;
1747                 }
1748             }
1749         }
1750         else if ( ch == '+' ) {
1751             startinpos = s-starts;
1752             s++; /* consume '+' */
1753             if (s < e && *s == '-') { /* '+-' encodes '+' */
1754                 s++;
1755                 *p++ = '+';
1756             }
1757             else { /* begin base64-encoded section */
1758                 inShift = 1;
1759                 surrogate = 0;
1760                 shiftOutStart = p;
1761                 base64bits = 0;
1762                 base64buffer = 0;
1763             }
1764         }
1765         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1766             *p++ = ch;
1767             s++;
1768         }
1769         else {
1770             startinpos = s-starts;
1771             s++;
1772             errmsg = "unexpected special character";
1773             goto utf7Error;
1774         }
1775         continue;
1776 utf7Error:
1777         outpos = p-PyUnicode_AS_UNICODE(unicode);
1778         endinpos = s-starts;
1779         if (unicode_decode_call_errorhandler(
1780                 errors, &errorHandler,
1781                 "utf7", errmsg,
1782                 starts, size, &startinpos, &endinpos, &exc, &s,
1783                 &unicode, &outpos, &p))
1784             goto onError;
1785     }
1786 
1787     /* end of string */
1788 
1789     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1790         /* if we're in an inconsistent state, that's an error */
1791         inShift = 0;
1792         if (surrogate ||
1793                 (base64bits >= 6) ||
1794                 (base64bits > 0 && base64buffer != 0)) {
1795             outpos = p-PyUnicode_AS_UNICODE(unicode);
1796             endinpos = size;
1797             if (unicode_decode_call_errorhandler(
1798                     errors, &errorHandler,
1799                     "utf7", "unterminated shift sequence",
1800                     starts, size, &startinpos, &endinpos, &exc, &s,
1801                     &unicode, &outpos, &p))
1802                 goto onError;
1803         }
1804     }
1805 
1806     /* return state */
1807     if (consumed) {
1808         if (inShift) {
1809             p = shiftOutStart; /* back off output */
1810             *consumed = startinpos;
1811         }
1812         else {
1813             *consumed = s-starts;
1814         }
1815     }
1816 
1817     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1818         goto onError;
1819 
1820     Py_XDECREF(errorHandler);
1821     Py_XDECREF(exc);
1822     return (PyObject *)unicode;
1823 
1824   onError:
1825     Py_XDECREF(errorHandler);
1826     Py_XDECREF(exc);
1827     Py_DECREF(unicode);
1828     return NULL;
1829 }
1830 
1831 
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)1832 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1833                                Py_ssize_t size,
1834                                int base64SetO,
1835                                int base64WhiteSpace,
1836                                const char *errors)
1837 {
1838     PyObject *v;
1839     /* It might be possible to tighten this worst case */
1840     Py_ssize_t allocated = 8 * size;
1841     int inShift = 0;
1842     Py_ssize_t i = 0;
1843     unsigned int base64bits = 0;
1844     unsigned long base64buffer = 0;
1845     char * out;
1846     char * start;
1847 
1848     if (allocated / 8 != size)
1849         return PyErr_NoMemory();
1850 
1851     if (size == 0)
1852         return PyString_FromStringAndSize(NULL, 0);
1853 
1854     v = PyString_FromStringAndSize(NULL, allocated);
1855     if (v == NULL)
1856         return NULL;
1857 
1858     start = out = PyString_AS_STRING(v);
1859     for (;i < size; ++i) {
1860         Py_UNICODE ch = s[i];
1861 
1862         if (inShift) {
1863             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1864                 /* shifting out */
1865                 if (base64bits) { /* output remaining bits */
1866                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1867                     base64buffer = 0;
1868                     base64bits = 0;
1869                 }
1870                 inShift = 0;
1871                 /* Characters not in the BASE64 set implicitly unshift the sequence
1872                    so no '-' is required, except if the character is itself a '-' */
1873                 if (IS_BASE64(ch) || ch == '-') {
1874                     *out++ = '-';
1875                 }
1876                 *out++ = (char) ch;
1877             }
1878             else {
1879                 goto encode_char;
1880             }
1881         }
1882         else { /* not in a shift sequence */
1883             if (ch == '+') {
1884                 *out++ = '+';
1885                         *out++ = '-';
1886             }
1887             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1888                 *out++ = (char) ch;
1889             }
1890             else {
1891                 *out++ = '+';
1892                 inShift = 1;
1893                 goto encode_char;
1894             }
1895         }
1896         continue;
1897 encode_char:
1898 #ifdef Py_UNICODE_WIDE
1899         if (ch >= 0x10000) {
1900             /* code first surrogate */
1901             base64bits += 16;
1902             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1903             while (base64bits >= 6) {
1904                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1905                 base64bits -= 6;
1906             }
1907             /* prepare second surrogate */
1908             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1909         }
1910 #endif
1911         base64bits += 16;
1912         base64buffer = (base64buffer << 16) | ch;
1913         while (base64bits >= 6) {
1914             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1915             base64bits -= 6;
1916         }
1917     }
1918     if (base64bits)
1919         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1920     if (inShift)
1921         *out++ = '-';
1922 
1923     if (_PyString_Resize(&v, out - start))
1924         return NULL;
1925     return v;
1926 }
1927 
1928 #undef IS_BASE64
1929 #undef FROM_BASE64
1930 #undef TO_BASE64
1931 #undef DECODE_DIRECT
1932 #undef ENCODE_DIRECT
1933 
1934 /* --- UTF-8 Codec -------------------------------------------------------- */
1935 
/* Map a UTF-8 encoded prefix byte to the length of the sequence it starts.
   Zero means illegal prefix.  See RFC 3629 for details.  The table is only
   ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1957 
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)1958 PyObject *PyUnicode_DecodeUTF8(const char *s,
1959                                Py_ssize_t size,
1960                                const char *errors)
1961 {
1962     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1963 }
1964 
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1965 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1966                                        Py_ssize_t size,
1967                                        const char *errors,
1968                                        Py_ssize_t *consumed)
1969 {
1970     const char *starts = s;
1971     int n;
1972     int k;
1973     Py_ssize_t startinpos;
1974     Py_ssize_t endinpos;
1975     Py_ssize_t outpos;
1976     const char *e;
1977     PyUnicodeObject *unicode;
1978     Py_UNICODE *p;
1979     const char *errmsg = "";
1980     PyObject *errorHandler = NULL;
1981     PyObject *exc = NULL;
1982 
1983     /* Note: size will always be longer than the resulting Unicode
1984        character count */
1985     unicode = _PyUnicode_New(size);
1986     if (!unicode)
1987         return NULL;
1988     if (size == 0) {
1989         if (consumed)
1990             *consumed = 0;
1991         return (PyObject *)unicode;
1992     }
1993 
1994     /* Unpack UTF-8 encoded data */
1995     p = unicode->str;
1996     e = s + size;
1997 
1998     while (s < e) {
1999         Py_UCS4 ch = (unsigned char)*s;
2000 
2001         if (ch < 0x80) {
2002             *p++ = (Py_UNICODE)ch;
2003             s++;
2004             continue;
2005         }
2006 
2007         n = utf8_code_length[ch];
2008 
2009         if (s + n > e) {
2010             if (consumed)
2011                 break;
2012             else {
2013                 errmsg = "unexpected end of data";
2014                 startinpos = s-starts;
2015                 endinpos = startinpos+1;
2016                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2017                     endinpos++;
2018                 goto utf8Error;
2019             }
2020         }
2021 
2022         switch (n) {
2023 
2024         case 0:
2025             errmsg = "invalid start byte";
2026             startinpos = s-starts;
2027             endinpos = startinpos+1;
2028             goto utf8Error;
2029 
2030         case 1:
2031             errmsg = "internal error";
2032             startinpos = s-starts;
2033             endinpos = startinpos+1;
2034             goto utf8Error;
2035 
2036         case 2:
2037             if ((s[1] & 0xc0) != 0x80) {
2038                 errmsg = "invalid continuation byte";
2039                 startinpos = s-starts;
2040                 endinpos = startinpos + 1;
2041                 goto utf8Error;
2042             }
2043             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2044             assert ((ch > 0x007F) && (ch <= 0x07FF));
2045             *p++ = (Py_UNICODE)ch;
2046             break;
2047 
2048         case 3:
2049             /* XXX: surrogates shouldn't be valid UTF-8!
2050                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2051                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2052                Uncomment the 2 lines below to make them invalid,
2053                code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
2054             if ((s[1] & 0xc0) != 0x80 ||
2055                 (s[2] & 0xc0) != 0x80 ||
2056                 ((unsigned char)s[0] == 0xE0 &&
2057                  (unsigned char)s[1] < 0xA0)/* ||
2058                 ((unsigned char)s[0] == 0xED &&
2059                  (unsigned char)s[1] > 0x9F)*/) {
2060                 errmsg = "invalid continuation byte";
2061                 startinpos = s-starts;
2062                 endinpos = startinpos + 1;
2063 
2064                 /* if s[1] first two bits are 1 and 0, then the invalid
2065                    continuation byte is s[2], so increment endinpos by 1,
2066                    if not, s[1] is invalid and endinpos doesn't need to
2067                    be incremented. */
2068                 if ((s[1] & 0xC0) == 0x80)
2069                     endinpos++;
2070                 goto utf8Error;
2071             }
2072             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2073             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2074             *p++ = (Py_UNICODE)ch;
2075             break;
2076 
2077         case 4:
2078             if ((s[1] & 0xc0) != 0x80 ||
2079                 (s[2] & 0xc0) != 0x80 ||
2080                 (s[3] & 0xc0) != 0x80 ||
2081                 ((unsigned char)s[0] == 0xF0 &&
2082                  (unsigned char)s[1] < 0x90) ||
2083                 ((unsigned char)s[0] == 0xF4 &&
2084                  (unsigned char)s[1] > 0x8F)) {
2085                 errmsg = "invalid continuation byte";
2086                 startinpos = s-starts;
2087                 endinpos = startinpos + 1;
2088                 if ((s[1] & 0xC0) == 0x80) {
2089                     endinpos++;
2090                     if ((s[2] & 0xC0) == 0x80)
2091                         endinpos++;
2092                 }
2093                 goto utf8Error;
2094             }
2095             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2096                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2097             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2098 
2099 #ifdef Py_UNICODE_WIDE
2100             *p++ = (Py_UNICODE)ch;
2101 #else
2102             /*  compute and append the two surrogates: */
2103 
2104             /*  translate from 10000..10FFFF to 0..FFFF */
2105             ch -= 0x10000;
2106 
2107             /*  high surrogate = top 10 bits added to D800 */
2108             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2109 
2110             /*  low surrogate = bottom 10 bits added to DC00 */
2111             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2112 #endif
2113             break;
2114         }
2115         s += n;
2116         continue;
2117 
2118       utf8Error:
2119         outpos = p-PyUnicode_AS_UNICODE(unicode);
2120         if (unicode_decode_call_errorhandler(
2121                 errors, &errorHandler,
2122                 "utf8", errmsg,
2123                 starts, size, &startinpos, &endinpos, &exc, &s,
2124                 &unicode, &outpos, &p))
2125             goto onError;
2126     }
2127     if (consumed)
2128         *consumed = s-starts;
2129 
2130     /* Adjust length */
2131     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2132         goto onError;
2133 
2134     Py_XDECREF(errorHandler);
2135     Py_XDECREF(exc);
2136     return (PyObject *)unicode;
2137 
2138   onError:
2139     Py_XDECREF(errorHandler);
2140     Py_XDECREF(exc);
2141     Py_DECREF(unicode);
2142     return NULL;
2143 }
2144 
2145 /* Allocation strategy:  if the string is short, convert into a stack buffer
2146    and allocate exactly as much space needed at the end.  Else allocate the
2147    maximum possible needed (4 result bytes per Unicode character), and return
2148    the excess memory at the end.
2149 */
2150 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)2151 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2152                      Py_ssize_t size,
2153                      const char *errors)
2154 {
2155 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2156 
2157     Py_ssize_t i;           /* index into s of next input byte */
2158     PyObject *v;        /* result string object */
2159     char *p;            /* next free byte in output buffer */
2160     Py_ssize_t nallocated;  /* number of result bytes allocated */
2161     Py_ssize_t nneeded;        /* number of result bytes needed */
2162     char stackbuf[MAX_SHORT_UNICHARS * 4];
2163 
2164     assert(s != NULL);
2165     assert(size >= 0);
2166 
2167     if (size <= MAX_SHORT_UNICHARS) {
2168         /* Write into the stack buffer; nallocated can't overflow.
2169          * At the end, we'll allocate exactly as much heap space as it
2170          * turns out we need.
2171          */
2172         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2173         v = NULL;   /* will allocate after we're done */
2174         p = stackbuf;
2175     }
2176     else {
2177         /* Overallocate on the heap, and give the excess back at the end. */
2178         nallocated = size * 4;
2179         if (nallocated / 4 != size)  /* overflow! */
2180             return PyErr_NoMemory();
2181         v = PyString_FromStringAndSize(NULL, nallocated);
2182         if (v == NULL)
2183             return NULL;
2184         p = PyString_AS_STRING(v);
2185     }
2186 
2187     for (i = 0; i < size;) {
2188         Py_UCS4 ch = s[i++];
2189 
2190         if (ch < 0x80)
2191             /* Encode ASCII */
2192             *p++ = (char) ch;
2193 
2194         else if (ch < 0x0800) {
2195             /* Encode Latin-1 */
2196             *p++ = (char)(0xc0 | (ch >> 6));
2197             *p++ = (char)(0x80 | (ch & 0x3f));
2198         }
2199         else {
2200             /* Encode UCS2 Unicode ordinals */
2201             if (ch < 0x10000) {
2202                 /* Special case: check for high surrogate */
2203                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2204                     Py_UCS4 ch2 = s[i];
2205                     /* Check for low surrogate and combine the two to
2206                        form a UCS4 value */
2207                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2208                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2209                         i++;
2210                         goto encodeUCS4;
2211                     }
2212                     /* Fall through: handles isolated high surrogates */
2213                 }
2214                 *p++ = (char)(0xe0 | (ch >> 12));
2215                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2216                 *p++ = (char)(0x80 | (ch & 0x3f));
2217                 continue;
2218             }
2219           encodeUCS4:
2220             /* Encode UCS4 Unicode ordinals */
2221             *p++ = (char)(0xf0 | (ch >> 18));
2222             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2223             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2224             *p++ = (char)(0x80 | (ch & 0x3f));
2225         }
2226     }
2227 
2228     if (v == NULL) {
2229         /* This was stack allocated. */
2230         nneeded = p - stackbuf;
2231         assert(nneeded <= nallocated);
2232         v = PyString_FromStringAndSize(stackbuf, nneeded);
2233     }
2234     else {
2235         /* Cut back to size actually needed. */
2236         nneeded = p - PyString_AS_STRING(v);
2237         assert(nneeded <= nallocated);
2238         if (_PyString_Resize(&v, nneeded))
2239             return NULL;
2240     }
2241     return v;
2242 
2243 #undef MAX_SHORT_UNICHARS
2244 }
2245 
/* Return the UTF-8 encoding of `unicode`, produced by
   PyUnicode_EncodeUTF8() with a NULL errors argument.  If the argument
   fails PyUnicode_Check(), PyErr_BadArgument() is called and NULL is
   returned. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    PyObject *encoded = NULL;

    if (PyUnicode_Check(unicode))
        encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       NULL);
    else
        PyErr_BadArgument();
    return encoded;
}
2256 
2257 /* --- UTF-32 Codec ------------------------------------------------------- */
2258 
2259 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2260 PyUnicode_DecodeUTF32(const char *s,
2261                       Py_ssize_t size,
2262                       const char *errors,
2263                       int *byteorder)
2264 {
2265     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2266 }
2267 
2268 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2269 PyUnicode_DecodeUTF32Stateful(const char *s,
2270                               Py_ssize_t size,
2271                               const char *errors,
2272                               int *byteorder,
2273                               Py_ssize_t *consumed)
2274 {
2275     const char *starts = s;
2276     Py_ssize_t startinpos;
2277     Py_ssize_t endinpos;
2278     Py_ssize_t outpos;
2279     PyUnicodeObject *unicode;
2280     Py_UNICODE *p;
2281 #ifndef Py_UNICODE_WIDE
2282     int pairs = 0;
2283     const unsigned char *qq;
2284 #else
2285     const int pairs = 0;
2286 #endif
2287     const unsigned char *q, *e;
2288     int bo = 0;       /* assume native ordering by default */
2289     const char *errmsg = "";
2290     /* Offsets from q for retrieving bytes in the right order. */
2291 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2292     int iorder[] = {0, 1, 2, 3};
2293 #else
2294     int iorder[] = {3, 2, 1, 0};
2295 #endif
2296     PyObject *errorHandler = NULL;
2297     PyObject *exc = NULL;
2298 
2299     q = (unsigned char *)s;
2300     e = q + size;
2301 
2302     if (byteorder)
2303         bo = *byteorder;
2304 
2305     /* Check for BOM marks (U+FEFF) in the input and adjust current
2306        byte order setting accordingly. In native mode, the leading BOM
2307        mark is skipped, in all other modes, it is copied to the output
2308        stream as-is (giving a ZWNBSP character). */
2309     if (bo == 0) {
2310         if (size >= 4) {
2311             const Py_UCS4 bom = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2312                 (q[iorder[1]] << 8) | q[iorder[0]];
2313 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2314             if (bom == 0x0000FEFF) {
2315                 q += 4;
2316                 bo = -1;
2317             }
2318             else if (bom == 0xFFFE0000) {
2319                 q += 4;
2320                 bo = 1;
2321             }
2322 #else
2323             if (bom == 0x0000FEFF) {
2324                 q += 4;
2325                 bo = 1;
2326             }
2327             else if (bom == 0xFFFE0000) {
2328                 q += 4;
2329                 bo = -1;
2330             }
2331 #endif
2332         }
2333     }
2334 
2335     if (bo == -1) {
2336         /* force LE */
2337         iorder[0] = 0;
2338         iorder[1] = 1;
2339         iorder[2] = 2;
2340         iorder[3] = 3;
2341     }
2342     else if (bo == 1) {
2343         /* force BE */
2344         iorder[0] = 3;
2345         iorder[1] = 2;
2346         iorder[2] = 1;
2347         iorder[3] = 0;
2348     }
2349 
2350     /* On narrow builds we split characters outside the BMP into two
2351        code points => count how much extra space we need. */
2352 #ifndef Py_UNICODE_WIDE
2353     for (qq = q; e - qq >= 4; qq += 4)
2354         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2355             pairs++;
2356 #endif
2357 
2358     /* This might be one to much, because of a BOM */
2359     unicode = _PyUnicode_New((size+3)/4+pairs);
2360     if (!unicode)
2361         return NULL;
2362     if (size == 0)
2363         return (PyObject *)unicode;
2364 
2365     /* Unpack UTF-32 encoded data */
2366     p = unicode->str;
2367 
2368     while (q < e) {
2369         Py_UCS4 ch;
2370         /* remaining bytes at the end? (size should be divisible by 4) */
2371         if (e-q<4) {
2372             if (consumed)
2373                 break;
2374             errmsg = "truncated data";
2375             startinpos = ((const char *)q)-starts;
2376             endinpos = ((const char *)e)-starts;
2377             goto utf32Error;
2378             /* The remaining input chars are ignored if the callback
2379                chooses to skip the input */
2380         }
2381         ch = ((unsigned int)q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2382             (q[iorder[1]] << 8) | q[iorder[0]];
2383 
2384         if (ch >= 0x110000)
2385         {
2386             errmsg = "code point not in range(0x110000)";
2387             startinpos = ((const char *)q)-starts;
2388             endinpos = startinpos+4;
2389             goto utf32Error;
2390         }
2391 #ifndef Py_UNICODE_WIDE
2392         if (ch >= 0x10000)
2393         {
2394             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2395             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2396         }
2397         else
2398 #endif
2399             *p++ = ch;
2400         q += 4;
2401         continue;
2402       utf32Error:
2403         outpos = p-PyUnicode_AS_UNICODE(unicode);
2404         if (unicode_decode_call_errorhandler(
2405                 errors, &errorHandler,
2406                 "utf32", errmsg,
2407                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2408                 &unicode, &outpos, &p))
2409             goto onError;
2410     }
2411 
2412     if (byteorder)
2413         *byteorder = bo;
2414 
2415     if (consumed)
2416         *consumed = (const char *)q-starts;
2417 
2418     /* Adjust length */
2419     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2420         goto onError;
2421 
2422     Py_XDECREF(errorHandler);
2423     Py_XDECREF(exc);
2424     return (PyObject *)unicode;
2425 
2426   onError:
2427     Py_DECREF(unicode);
2428     Py_XDECREF(errorHandler);
2429     Py_XDECREF(exc);
2430     return NULL;
2431 }
2432 
2433 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2434 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2435                       Py_ssize_t size,
2436                       const char *errors,
2437                       int byteorder)
2438 {
2439     PyObject *v;
2440     unsigned char *p;
2441     Py_ssize_t nsize, bytesize;
2442 #ifndef Py_UNICODE_WIDE
2443     Py_ssize_t i, pairs;
2444 #else
2445     const int pairs = 0;
2446 #endif
2447     /* Offsets from p for storing byte pairs in the right order. */
2448 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2449     int iorder[] = {0, 1, 2, 3};
2450 #else
2451     int iorder[] = {3, 2, 1, 0};
2452 #endif
2453 
2454 #define STORECHAR(CH)                           \
2455     do {                                        \
2456         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2457         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2458         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2459         p[iorder[0]] = (CH) & 0xff;             \
2460         p += 4;                                 \
2461     } while(0)
2462 
2463     /* In narrow builds we can output surrogate pairs as one code point,
2464        so we need less space. */
2465 #ifndef Py_UNICODE_WIDE
2466     for (i = pairs = 0; i < size-1; i++)
2467         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2468             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2469             pairs++;
2470 #endif
2471     nsize = (size - pairs + (byteorder == 0));
2472     bytesize = nsize * 4;
2473     if (bytesize / 4 != nsize)
2474         return PyErr_NoMemory();
2475     v = PyString_FromStringAndSize(NULL, bytesize);
2476     if (v == NULL)
2477         return NULL;
2478 
2479     p = (unsigned char *)PyString_AS_STRING(v);
2480     if (byteorder == 0)
2481         STORECHAR(0xFEFF);
2482     if (size == 0)
2483         return v;
2484 
2485     if (byteorder == -1) {
2486         /* force LE */
2487         iorder[0] = 0;
2488         iorder[1] = 1;
2489         iorder[2] = 2;
2490         iorder[3] = 3;
2491     }
2492     else if (byteorder == 1) {
2493         /* force BE */
2494         iorder[0] = 3;
2495         iorder[1] = 2;
2496         iorder[2] = 1;
2497         iorder[3] = 0;
2498     }
2499 
2500     while (size-- > 0) {
2501         Py_UCS4 ch = *s++;
2502 #ifndef Py_UNICODE_WIDE
2503         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2504             Py_UCS4 ch2 = *s;
2505             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2506                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2507                 s++;
2508                 size--;
2509             }
2510         }
2511 #endif
2512         STORECHAR(ch);
2513     }
2514     return v;
2515 #undef STORECHAR
2516 }
2517 
/* Return the UTF-32 encoding of `unicode`, produced by
   PyUnicode_EncodeUTF32() with a NULL errors argument and byteorder 0.
   If the argument fails PyUnicode_Check(), PyErr_BadArgument() is called
   and NULL is returned. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    PyObject *encoded = NULL;

    if (PyUnicode_Check(unicode))
        encoded = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                        PyUnicode_GET_SIZE(unicode),
                                        NULL,
                                        0);
    else
        PyErr_BadArgument();
    return encoded;
}
2529 
2530 /* --- UTF-16 Codec ------------------------------------------------------- */
2531 
2532 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2533 PyUnicode_DecodeUTF16(const char *s,
2534                       Py_ssize_t size,
2535                       const char *errors,
2536                       int *byteorder)
2537 {
2538     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2539 }
2540 
2541 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2542 PyUnicode_DecodeUTF16Stateful(const char *s,
2543                               Py_ssize_t size,
2544                               const char *errors,
2545                               int *byteorder,
2546                               Py_ssize_t *consumed)
2547 {
2548     const char *starts = s;
2549     Py_ssize_t startinpos;
2550     Py_ssize_t endinpos;
2551     Py_ssize_t outpos;
2552     PyUnicodeObject *unicode;
2553     Py_UNICODE *p;
2554     const unsigned char *q, *e;
2555     int bo = 0;       /* assume native ordering by default */
2556     const char *errmsg = "";
2557     /* Offsets from q for retrieving byte pairs in the right order. */
2558 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2559     int ihi = 1, ilo = 0;
2560 #else
2561     int ihi = 0, ilo = 1;
2562 #endif
2563     PyObject *errorHandler = NULL;
2564     PyObject *exc = NULL;
2565 
2566     /* Note: size will always be longer than the resulting Unicode
2567        character count */
2568     unicode = _PyUnicode_New(size);
2569     if (!unicode)
2570         return NULL;
2571     if (size == 0)
2572         return (PyObject *)unicode;
2573 
2574     /* Unpack UTF-16 encoded data */
2575     p = unicode->str;
2576     q = (unsigned char *)s;
2577     e = q + size;
2578 
2579     if (byteorder)
2580         bo = *byteorder;
2581 
2582     /* Check for BOM marks (U+FEFF) in the input and adjust current
2583        byte order setting accordingly. In native mode, the leading BOM
2584        mark is skipped, in all other modes, it is copied to the output
2585        stream as-is (giving a ZWNBSP character). */
2586     if (bo == 0) {
2587         if (size >= 2) {
2588             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2589 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2590             if (bom == 0xFEFF) {
2591                 q += 2;
2592                 bo = -1;
2593             }
2594             else if (bom == 0xFFFE) {
2595                 q += 2;
2596                 bo = 1;
2597             }
2598 #else
2599             if (bom == 0xFEFF) {
2600                 q += 2;
2601                 bo = 1;
2602             }
2603             else if (bom == 0xFFFE) {
2604                 q += 2;
2605                 bo = -1;
2606             }
2607 #endif
2608         }
2609     }
2610 
2611     if (bo == -1) {
2612         /* force LE */
2613         ihi = 1;
2614         ilo = 0;
2615     }
2616     else if (bo == 1) {
2617         /* force BE */
2618         ihi = 0;
2619         ilo = 1;
2620     }
2621 
2622     while (q < e) {
2623         Py_UNICODE ch;
2624         /* remaining bytes at the end? (size should be even) */
2625         if (e-q<2) {
2626             if (consumed)
2627                 break;
2628             errmsg = "truncated data";
2629             startinpos = ((const char *)q)-starts;
2630             endinpos = ((const char *)e)-starts;
2631             goto utf16Error;
2632             /* The remaining input chars are ignored if the callback
2633                chooses to skip the input */
2634         }
2635         ch = (q[ihi] << 8) | q[ilo];
2636 
2637         q += 2;
2638 
2639         if (ch < 0xD800 || ch > 0xDFFF) {
2640             *p++ = ch;
2641             continue;
2642         }
2643 
2644         /* UTF-16 code pair: */
2645         if (e - q < 2) {
2646             q -= 2;
2647             if (consumed)
2648                 break;
2649             errmsg = "unexpected end of data";
2650             startinpos = ((const char *)q)-starts;
2651             endinpos = ((const char *)e)-starts;
2652             goto utf16Error;
2653         }
2654         if (0xD800 <= ch && ch <= 0xDBFF) {
2655             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2656             q += 2;
2657             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2658 #ifndef Py_UNICODE_WIDE
2659                 *p++ = ch;
2660                 *p++ = ch2;
2661 #else
2662                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2663 #endif
2664                 continue;
2665             }
2666             else {
2667                 errmsg = "illegal UTF-16 surrogate";
2668                 startinpos = (((const char *)q)-4)-starts;
2669                 endinpos = startinpos+2;
2670                 goto utf16Error;
2671             }
2672 
2673         }
2674         errmsg = "illegal encoding";
2675         startinpos = (((const char *)q)-2)-starts;
2676         endinpos = startinpos+2;
2677         /* Fall through to report the error */
2678 
2679       utf16Error:
2680         outpos = p-PyUnicode_AS_UNICODE(unicode);
2681         if (unicode_decode_call_errorhandler(
2682                 errors, &errorHandler,
2683                 "utf16", errmsg,
2684                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2685                 &unicode, &outpos, &p))
2686             goto onError;
2687     }
2688 
2689     if (byteorder)
2690         *byteorder = bo;
2691 
2692     if (consumed)
2693         *consumed = (const char *)q-starts;
2694 
2695     /* Adjust length */
2696     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2697         goto onError;
2698 
2699     Py_XDECREF(errorHandler);
2700     Py_XDECREF(exc);
2701     return (PyObject *)unicode;
2702 
2703   onError:
2704     Py_DECREF(unicode);
2705     Py_XDECREF(errorHandler);
2706     Py_XDECREF(exc);
2707     return NULL;
2708 }
2709 
2710 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2711 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2712                       Py_ssize_t size,
2713                       const char *errors,
2714                       int byteorder)
2715 {
2716     PyObject *v;
2717     unsigned char *p;
2718     Py_ssize_t nsize, bytesize;
2719 #ifdef Py_UNICODE_WIDE
2720     Py_ssize_t i, pairs;
2721 #else
2722     const int pairs = 0;
2723 #endif
2724     /* Offsets from p for storing byte pairs in the right order. */
2725 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2726     int ihi = 1, ilo = 0;
2727 #else
2728     int ihi = 0, ilo = 1;
2729 #endif
2730 
2731 #define STORECHAR(CH)                           \
2732     do {                                        \
2733         p[ihi] = ((CH) >> 8) & 0xff;            \
2734         p[ilo] = (CH) & 0xff;                   \
2735         p += 2;                                 \
2736     } while(0)
2737 
2738 #ifdef Py_UNICODE_WIDE
2739     for (i = pairs = 0; i < size; i++)
2740         if (s[i] >= 0x10000)
2741             pairs++;
2742 #endif
2743     /* 2 * (size + pairs + (byteorder == 0)) */
2744     if (size > PY_SSIZE_T_MAX ||
2745         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2746         return PyErr_NoMemory();
2747     nsize = size + pairs + (byteorder == 0);
2748     bytesize = nsize * 2;
2749     if (bytesize / 2 != nsize)
2750         return PyErr_NoMemory();
2751     v = PyString_FromStringAndSize(NULL, bytesize);
2752     if (v == NULL)
2753         return NULL;
2754 
2755     p = (unsigned char *)PyString_AS_STRING(v);
2756     if (byteorder == 0)
2757         STORECHAR(0xFEFF);
2758     if (size == 0)
2759         return v;
2760 
2761     if (byteorder == -1) {
2762         /* force LE */
2763         ihi = 1;
2764         ilo = 0;
2765     }
2766     else if (byteorder == 1) {
2767         /* force BE */
2768         ihi = 0;
2769         ilo = 1;
2770     }
2771 
2772     while (size-- > 0) {
2773         Py_UNICODE ch = *s++;
2774         Py_UNICODE ch2 = 0;
2775 #ifdef Py_UNICODE_WIDE
2776         if (ch >= 0x10000) {
2777             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2778             ch  = 0xD800 | ((ch-0x10000) >> 10);
2779         }
2780 #endif
2781         STORECHAR(ch);
2782         if (ch2)
2783             STORECHAR(ch2);
2784     }
2785     return v;
2786 #undef STORECHAR
2787 }
2788 
/* Encode a unicode object as UTF-16 in native byte order with a leading
   BOM (byteorder 0).  Reports a bad-argument error and returns NULL when
   the argument is not a unicode object. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);
    }

    PyErr_BadArgument();
    return NULL;
}
2800 
2801 /* --- Unicode Escape Codec ----------------------------------------------- */
2802 
2803 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2804 
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)2805 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2806                                         Py_ssize_t size,
2807                                         const char *errors)
2808 {
2809     const char *starts = s;
2810     Py_ssize_t startinpos;
2811     Py_ssize_t endinpos;
2812     Py_ssize_t outpos;
2813     PyUnicodeObject *v;
2814     Py_UNICODE *p;
2815     const char *end;
2816     char* message;
2817     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2818     PyObject *errorHandler = NULL;
2819     PyObject *exc = NULL;
2820 
2821     /* Escaped strings will always be longer than the resulting
2822        Unicode string, so we start with size here and then reduce the
2823        length after conversion to the true value.
2824        (but if the error callback returns a long replacement string
2825        we'll have to allocate more space) */
2826     v = _PyUnicode_New(size);
2827     if (v == NULL)
2828         goto onError;
2829     if (size == 0)
2830         return (PyObject *)v;
2831 
2832     p = PyUnicode_AS_UNICODE(v);
2833     end = s + size;
2834 
2835     while (s < end) {
2836         unsigned char c;
2837         Py_UNICODE x;
2838         int digits;
2839 
2840         /* Non-escape characters are interpreted as Unicode ordinals */
2841         if (*s != '\\') {
2842             *p++ = (unsigned char) *s++;
2843             continue;
2844         }
2845 
2846         startinpos = s-starts;
2847         /* \ - Escapes */
2848         s++;
2849         c = *s++;
2850         if (s > end)
2851             c = '\0'; /* Invalid after \ */
2852         switch (c) {
2853 
2854             /* \x escapes */
2855         case '\n': break;
2856         case '\\': *p++ = '\\'; break;
2857         case '\'': *p++ = '\''; break;
2858         case '\"': *p++ = '\"'; break;
2859         case 'b': *p++ = '\b'; break;
2860         case 'f': *p++ = '\014'; break; /* FF */
2861         case 't': *p++ = '\t'; break;
2862         case 'n': *p++ = '\n'; break;
2863         case 'r': *p++ = '\r'; break;
2864         case 'v': *p++ = '\013'; break; /* VT */
2865         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2866 
2867             /* \OOO (octal) escapes */
2868         case '0': case '1': case '2': case '3':
2869         case '4': case '5': case '6': case '7':
2870             x = s[-1] - '0';
2871             if (s < end && '0' <= *s && *s <= '7') {
2872                 x = (x<<3) + *s++ - '0';
2873                 if (s < end && '0' <= *s && *s <= '7')
2874                     x = (x<<3) + *s++ - '0';
2875             }
2876             *p++ = x;
2877             break;
2878 
2879             /* hex escapes */
2880             /* \xXX */
2881         case 'x':
2882             digits = 2;
2883             message = "truncated \\xXX escape";
2884             goto hexescape;
2885 
2886             /* \uXXXX */
2887         case 'u':
2888             digits = 4;
2889             message = "truncated \\uXXXX escape";
2890             goto hexescape;
2891 
2892             /* \UXXXXXXXX */
2893         case 'U':
2894             digits = 8;
2895             message = "truncated \\UXXXXXXXX escape";
2896         hexescape:
2897             chr = 0;
2898             if (end - s < digits) {
2899                 /* count only hex digits */
2900                 for (; s < end; ++s) {
2901                     c = (unsigned char)*s;
2902                     if (!Py_ISXDIGIT(c))
2903                         goto error;
2904                 }
2905                 goto error;
2906             }
2907             for (; digits--; ++s) {
2908                 c = (unsigned char)*s;
2909                 if (!Py_ISXDIGIT(c))
2910                     goto error;
2911                 chr = (chr<<4) & ~0xF;
2912                 if (c >= '0' && c <= '9')
2913                     chr += c - '0';
2914                 else if (c >= 'a' && c <= 'f')
2915                     chr += 10 + c - 'a';
2916                 else
2917                     chr += 10 + c - 'A';
2918             }
2919             if (chr == 0xffffffff && PyErr_Occurred())
2920                 /* _decoding_error will have already written into the
2921                    target buffer. */
2922                 break;
2923         store:
2924             /* when we get here, chr is a 32-bit unicode character */
2925             if (chr <= 0xffff)
2926                 /* UCS-2 character */
2927                 *p++ = (Py_UNICODE) chr;
2928             else if (chr <= 0x10ffff) {
2929                 /* UCS-4 character. Either store directly, or as
2930                    surrogate pair. */
2931 #ifdef Py_UNICODE_WIDE
2932                 *p++ = chr;
2933 #else
2934                 chr -= 0x10000L;
2935                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2936                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2937 #endif
2938             } else {
2939                 message = "illegal Unicode character";
2940                 goto error;
2941             }
2942             break;
2943 
2944             /* \N{name} */
2945         case 'N':
2946             message = "malformed \\N character escape";
2947             if (ucnhash_CAPI == NULL) {
2948                 /* load the unicode data module */
2949                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2950                 if (ucnhash_CAPI == NULL)
2951                     goto ucnhashError;
2952             }
2953             if (*s == '{') {
2954                 const char *start = s+1;
2955                 /* look for the closing brace */
2956                 while (*s != '}' && s < end)
2957                     s++;
2958                 if (s > start && s < end && *s == '}') {
2959                     /* found a name.  look it up in the unicode database */
2960                     message = "unknown Unicode character name";
2961                     s++;
2962                     if (s - start - 1 <= INT_MAX &&
2963                         ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2964                         goto store;
2965                 }
2966             }
2967             goto error;
2968 
2969         default:
2970             if (s > end) {
2971                 message = "\\ at end of string";
2972                 s--;
2973                 goto error;
2974             }
2975             else {
2976                 *p++ = '\\';
2977                 *p++ = (unsigned char)s[-1];
2978             }
2979             break;
2980         }
2981         continue;
2982 
2983       error:
2984         endinpos = s-starts;
2985         outpos = p-PyUnicode_AS_UNICODE(v);
2986         if (unicode_decode_call_errorhandler(
2987                 errors, &errorHandler,
2988                 "unicodeescape", message,
2989                 starts, size, &startinpos, &endinpos, &exc, &s,
2990                 &v, &outpos, &p))
2991             goto onError;
2992         continue;
2993     }
2994     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2995         goto onError;
2996     Py_XDECREF(errorHandler);
2997     Py_XDECREF(exc);
2998     return (PyObject *)v;
2999 
3000   ucnhashError:
3001     PyErr_SetString(
3002         PyExc_UnicodeError,
3003         "\\N escapes not supported (can't load unicodedata module)"
3004         );
3005     Py_XDECREF(v);
3006     Py_XDECREF(errorHandler);
3007     Py_XDECREF(exc);
3008     return NULL;
3009 
3010   onError:
3011     Py_XDECREF(v);
3012     Py_XDECREF(errorHandler);
3013     Py_XDECREF(exc);
3014     return NULL;
3015 }
3016 
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of a Py_UNICODE buffer.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
3023 
findchar(const Py_UNICODE * s,Py_ssize_t size,Py_UNICODE ch)3024 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3025                                              Py_ssize_t size,
3026                                              Py_UNICODE ch)
3027 {
3028     /* like wcschr, but doesn't stop at NULL characters */
3029 
3030     while (size-- > 0) {
3031         if (*s == ch)
3032             return s;
3033         s++;
3034     }
3035 
3036     return NULL;
3037 }
3038 
3039 static
unicodeescape_string(const Py_UNICODE * s,Py_ssize_t size,int quotes)3040 PyObject *unicodeescape_string(const Py_UNICODE *s,
3041                                Py_ssize_t size,
3042                                int quotes)
3043 {
3044     PyObject *repr;
3045     char *p;
3046 
3047     static const char *hexdigit = "0123456789abcdef";
3048 #ifdef Py_UNICODE_WIDE
3049     const Py_ssize_t expandsize = 10;
3050 #else
3051     const Py_ssize_t expandsize = 6;
3052 #endif
3053 
3054     /* XXX(nnorwitz): rather than over-allocating, it would be
3055        better to choose a different scheme.  Perhaps scan the
3056        first N-chars of the string and allocate based on that size.
3057     */
3058     /* Initial allocation is based on the longest-possible unichr
3059        escape.
3060 
3061        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3062        unichr, so in this case it's the longest unichr escape. In
3063        narrow (UTF-16) builds this is five chars per source unichr
3064        since there are two unichrs in the surrogate pair, so in narrow
3065        (UTF-16) builds it's not the longest unichr escape.
3066 
3067        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3068        so in the narrow (UTF-16) build case it's the longest unichr
3069        escape.
3070     */
3071 
3072     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3073         return PyErr_NoMemory();
3074 
3075     repr = PyString_FromStringAndSize(NULL,
3076                                       2
3077                                       + expandsize*size
3078                                       + 1);
3079     if (repr == NULL)
3080         return NULL;
3081 
3082     p = PyString_AS_STRING(repr);
3083 
3084     if (quotes) {
3085         *p++ = 'u';
3086         *p++ = (findchar(s, size, '\'') &&
3087                 !findchar(s, size, '"')) ? '"' : '\'';
3088     }
3089     while (size-- > 0) {
3090         Py_UNICODE ch = *s++;
3091 
3092         /* Escape quotes and backslashes */
3093         if ((quotes &&
3094              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3095             *p++ = '\\';
3096             *p++ = (char) ch;
3097             continue;
3098         }
3099 
3100 #ifdef Py_UNICODE_WIDE
3101         /* Map 21-bit characters to '\U00xxxxxx' */
3102         else if (ch >= 0x10000) {
3103             *p++ = '\\';
3104             *p++ = 'U';
3105             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3106             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3107             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3108             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3109             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3110             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3111             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3112             *p++ = hexdigit[ch & 0x0000000F];
3113             continue;
3114         }
3115 #else
3116         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3117         else if (ch >= 0xD800 && ch < 0xDC00) {
3118             Py_UNICODE ch2;
3119             Py_UCS4 ucs;
3120 
3121             ch2 = *s++;
3122             size--;
3123             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3124                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3125                 *p++ = '\\';
3126                 *p++ = 'U';
3127                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3128                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3129                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3130                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3131                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3132                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3133                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3134                 *p++ = hexdigit[ucs & 0x0000000F];
3135                 continue;
3136             }
3137             /* Fall through: isolated surrogates are copied as-is */
3138             s--;
3139             size++;
3140         }
3141 #endif
3142 
3143         /* Map 16-bit characters to '\uxxxx' */
3144         if (ch >= 256) {
3145             *p++ = '\\';
3146             *p++ = 'u';
3147             *p++ = hexdigit[(ch >> 12) & 0x000F];
3148             *p++ = hexdigit[(ch >> 8) & 0x000F];
3149             *p++ = hexdigit[(ch >> 4) & 0x000F];
3150             *p++ = hexdigit[ch & 0x000F];
3151         }
3152 
3153         /* Map special whitespace to '\t', \n', '\r' */
3154         else if (ch == '\t') {
3155             *p++ = '\\';
3156             *p++ = 't';
3157         }
3158         else if (ch == '\n') {
3159             *p++ = '\\';
3160             *p++ = 'n';
3161         }
3162         else if (ch == '\r') {
3163             *p++ = '\\';
3164             *p++ = 'r';
3165         }
3166 
3167         /* Map non-printable US ASCII to '\xhh' */
3168         else if (ch < ' ' || ch >= 0x7F) {
3169             *p++ = '\\';
3170             *p++ = 'x';
3171             *p++ = hexdigit[(ch >> 4) & 0x000F];
3172             *p++ = hexdigit[ch & 0x000F];
3173         }
3174 
3175         /* Copy everything else as-is */
3176         else
3177             *p++ = (char) ch;
3178     }
3179     if (quotes)
3180         *p++ = PyString_AS_STRING(repr)[1];
3181 
3182     *p = '\0';
3183     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3184         return NULL;
3185     return repr;
3186 }
3187 
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3188 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3189                                         Py_ssize_t size)
3190 {
3191     return unicodeescape_string(s, size, 0);
3192 }
3193 
/* Encode a unicode object with the Unicode-Escape codec.  Reports a
   bad-argument error and returns NULL when the argument is not a unicode
   object. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                             PyUnicode_GET_SIZE(unicode));
    }

    PyErr_BadArgument();
    return NULL;
}
3203 
3204 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3205 
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)3206 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3207                                            Py_ssize_t size,
3208                                            const char *errors)
3209 {
3210     const char *starts = s;
3211     Py_ssize_t startinpos;
3212     Py_ssize_t endinpos;
3213     Py_ssize_t outpos;
3214     PyUnicodeObject *v;
3215     Py_UNICODE *p;
3216     const char *end;
3217     const char *bs;
3218     PyObject *errorHandler = NULL;
3219     PyObject *exc = NULL;
3220 
3221     /* Escaped strings will always be longer than the resulting
3222        Unicode string, so we start with size here and then reduce the
3223        length after conversion to the true value. (But decoding error
3224        handler might have to resize the string) */
3225     v = _PyUnicode_New(size);
3226     if (v == NULL)
3227         goto onError;
3228     if (size == 0)
3229         return (PyObject *)v;
3230     p = PyUnicode_AS_UNICODE(v);
3231     end = s + size;
3232     while (s < end) {
3233         unsigned char c;
3234         Py_UCS4 x;
3235         int i;
3236         int count;
3237 
3238         /* Non-escape characters are interpreted as Unicode ordinals */
3239         if (*s != '\\') {
3240             *p++ = (unsigned char)*s++;
3241             continue;
3242         }
3243         startinpos = s-starts;
3244 
3245         /* \u-escapes are only interpreted iff the number of leading
3246            backslashes if odd */
3247         bs = s;
3248         for (;s < end;) {
3249             if (*s != '\\')
3250                 break;
3251             *p++ = (unsigned char)*s++;
3252         }
3253         if (((s - bs) & 1) == 0 ||
3254             s >= end ||
3255             (*s != 'u' && *s != 'U')) {
3256             continue;
3257         }
3258         p--;
3259         count = *s=='u' ? 4 : 8;
3260         s++;
3261 
3262         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3263         outpos = p-PyUnicode_AS_UNICODE(v);
3264         for (x = 0, i = 0; i < count; ++i, ++s) {
3265             c = (unsigned char)*s;
3266             if (!isxdigit(c)) {
3267                 endinpos = s-starts;
3268                 if (unicode_decode_call_errorhandler(
3269                         errors, &errorHandler,
3270                         "rawunicodeescape", "truncated \\uXXXX",
3271                         starts, size, &startinpos, &endinpos, &exc, &s,
3272                         &v, &outpos, &p))
3273                     goto onError;
3274                 goto nextByte;
3275             }
3276             x = (x<<4) & ~0xF;
3277             if (c >= '0' && c <= '9')
3278                 x += c - '0';
3279             else if (c >= 'a' && c <= 'f')
3280                 x += 10 + c - 'a';
3281             else
3282                 x += 10 + c - 'A';
3283         }
3284         if (x <= 0xffff)
3285             /* UCS-2 character */
3286             *p++ = (Py_UNICODE) x;
3287         else if (x <= 0x10ffff) {
3288             /* UCS-4 character. Either store directly, or as
3289                surrogate pair. */
3290 #ifdef Py_UNICODE_WIDE
3291             *p++ = (Py_UNICODE) x;
3292 #else
3293             x -= 0x10000L;
3294             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3295             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3296 #endif
3297         } else {
3298             endinpos = s-starts;
3299             outpos = p-PyUnicode_AS_UNICODE(v);
3300             if (unicode_decode_call_errorhandler(
3301                     errors, &errorHandler,
3302                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3303                     starts, size, &startinpos, &endinpos, &exc, &s,
3304                     &v, &outpos, &p))
3305                 goto onError;
3306         }
3307       nextByte:
3308         ;
3309     }
3310     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3311         goto onError;
3312     Py_XDECREF(errorHandler);
3313     Py_XDECREF(exc);
3314     return (PyObject *)v;
3315 
3316   onError:
3317     Py_XDECREF(v);
3318     Py_XDECREF(errorHandler);
3319     Py_XDECREF(exc);
3320     return NULL;
3321 }
3322 
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3323 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3324                                            Py_ssize_t size)
3325 {
3326     PyObject *repr;
3327     char *p;
3328     char *q;
3329 
3330     static const char *hexdigit = "0123456789abcdef";
3331 #ifdef Py_UNICODE_WIDE
3332     const Py_ssize_t expandsize = 10;
3333 #else
3334     const Py_ssize_t expandsize = 6;
3335 #endif
3336 
3337     if (size > PY_SSIZE_T_MAX / expandsize)
3338         return PyErr_NoMemory();
3339 
3340     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3341     if (repr == NULL)
3342         return NULL;
3343     if (size == 0)
3344         return repr;
3345 
3346     p = q = PyString_AS_STRING(repr);
3347     while (size-- > 0) {
3348         Py_UNICODE ch = *s++;
3349 #ifdef Py_UNICODE_WIDE
3350         /* Map 32-bit characters to '\Uxxxxxxxx' */
3351         if (ch >= 0x10000) {
3352             *p++ = '\\';
3353             *p++ = 'U';
3354             *p++ = hexdigit[(ch >> 28) & 0xf];
3355             *p++ = hexdigit[(ch >> 24) & 0xf];
3356             *p++ = hexdigit[(ch >> 20) & 0xf];
3357             *p++ = hexdigit[(ch >> 16) & 0xf];
3358             *p++ = hexdigit[(ch >> 12) & 0xf];
3359             *p++ = hexdigit[(ch >> 8) & 0xf];
3360             *p++ = hexdigit[(ch >> 4) & 0xf];
3361             *p++ = hexdigit[ch & 15];
3362         }
3363         else
3364 #else
3365             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3366             if (ch >= 0xD800 && ch < 0xDC00) {
3367                 Py_UNICODE ch2;
3368                 Py_UCS4 ucs;
3369 
3370                 ch2 = *s++;
3371                 size--;
3372                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3373                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3374                     *p++ = '\\';
3375                     *p++ = 'U';
3376                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3377                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3378                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3379                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3380                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3381                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3382                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3383                     *p++ = hexdigit[ucs & 0xf];
3384                     continue;
3385                 }
3386                 /* Fall through: isolated surrogates are copied as-is */
3387                 s--;
3388                 size++;
3389             }
3390 #endif
3391         /* Map 16-bit characters to '\uxxxx' */
3392         if (ch >= 256) {
3393             *p++ = '\\';
3394             *p++ = 'u';
3395             *p++ = hexdigit[(ch >> 12) & 0xf];
3396             *p++ = hexdigit[(ch >> 8) & 0xf];
3397             *p++ = hexdigit[(ch >> 4) & 0xf];
3398             *p++ = hexdigit[ch & 15];
3399         }
3400         /* Copy everything else as-is */
3401         else
3402             *p++ = (char) ch;
3403     }
3404     *p = '\0';
3405     if (_PyString_Resize(&repr, p - q))
3406         return NULL;
3407     return repr;
3408 }
3409 
/* Encode a unicode object with the Raw-Unicode-Escape codec.  Reports a
   bad-argument error and returns NULL when the argument is not a unicode
   object. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                                PyUnicode_GET_SIZE(unicode));
    }

    PyErr_BadArgument();
    return NULL;
}
3419 
3420 /* --- Unicode Internal Codec ------------------------------------------- */
3421 
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)3422 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3423                                            Py_ssize_t size,
3424                                            const char *errors)
3425 {
3426     const char *starts = s;
3427     Py_ssize_t startinpos;
3428     Py_ssize_t endinpos;
3429     Py_ssize_t outpos;
3430     PyUnicodeObject *v;
3431     Py_UNICODE *p;
3432     const char *end;
3433     const char *reason;
3434     PyObject *errorHandler = NULL;
3435     PyObject *exc = NULL;
3436 
3437 #ifdef Py_UNICODE_WIDE
3438     Py_UNICODE unimax = PyUnicode_GetMax();
3439 #endif
3440 
3441     /* XXX overflow detection missing */
3442     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3443     if (v == NULL)
3444         goto onError;
3445     if (PyUnicode_GetSize((PyObject *)v) == 0)
3446         return (PyObject *)v;
3447     p = PyUnicode_AS_UNICODE(v);
3448     end = s + size;
3449 
3450     while (s < end) {
3451         if (end-s < Py_UNICODE_SIZE) {
3452             endinpos = end-starts;
3453             reason = "truncated input";
3454             goto error;
3455         }
3456         memcpy(p, s, sizeof(Py_UNICODE));
3457 #ifdef Py_UNICODE_WIDE
3458         /* We have to sanity check the raw data, otherwise doom looms for
3459            some malformed UCS-4 data. */
3460         if (*p > unimax || *p < 0) {
3461             endinpos = s - starts + Py_UNICODE_SIZE;
3462             reason = "illegal code point (> 0x10FFFF)";
3463             goto error;
3464         }
3465 #endif
3466         p++;
3467         s += Py_UNICODE_SIZE;
3468         continue;
3469 
3470   error:
3471         startinpos = s - starts;
3472         outpos = p - PyUnicode_AS_UNICODE(v);
3473         if (unicode_decode_call_errorhandler(
3474                 errors, &errorHandler,
3475                 "unicode_internal", reason,
3476                 starts, size, &startinpos, &endinpos, &exc, &s,
3477                 &v, &outpos, &p)) {
3478             goto onError;
3479         }
3480     }
3481 
3482     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3483         goto onError;
3484     Py_XDECREF(errorHandler);
3485     Py_XDECREF(exc);
3486     return (PyObject *)v;
3487 
3488   onError:
3489     Py_XDECREF(v);
3490     Py_XDECREF(errorHandler);
3491     Py_XDECREF(exc);
3492     return NULL;
3493 }
3494 
3495 /* --- Latin-1 Codec ------------------------------------------------------ */
3496 
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)3497 PyObject *PyUnicode_DecodeLatin1(const char *s,
3498                                  Py_ssize_t size,
3499                                  const char *errors)
3500 {
3501     PyUnicodeObject *v;
3502     Py_UNICODE *p;
3503 
3504     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3505     if (size == 1) {
3506         Py_UNICODE r = *(unsigned char*)s;
3507         return PyUnicode_FromUnicode(&r, 1);
3508     }
3509 
3510     v = _PyUnicode_New(size);
3511     if (v == NULL)
3512         goto onError;
3513     if (size == 0)
3514         return (PyObject *)v;
3515     p = PyUnicode_AS_UNICODE(v);
3516     while (size-- > 0)
3517         *p++ = (unsigned char)*s++;
3518     return (PyObject *)v;
3519 
3520   onError:
3521     Py_XDECREF(v);
3522     return NULL;
3523 }
3524 
3525 /* create or adjust a UnicodeEncodeError */
make_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3526 static void make_encode_exception(PyObject **exceptionObject,
3527                                   const char *encoding,
3528                                   const Py_UNICODE *unicode, Py_ssize_t size,
3529                                   Py_ssize_t startpos, Py_ssize_t endpos,
3530                                   const char *reason)
3531 {
3532     if (*exceptionObject == NULL) {
3533         *exceptionObject = PyUnicodeEncodeError_Create(
3534             encoding, unicode, size, startpos, endpos, reason);
3535     }
3536     else {
3537         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3538             goto onError;
3539         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3540             goto onError;
3541         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3542             goto onError;
3543         return;
3544       onError:
3545         Py_CLEAR(*exceptionObject);
3546     }
3547 }
3548 
3549 /* raises a UnicodeEncodeError */
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3550 static void raise_encode_exception(PyObject **exceptionObject,
3551                                    const char *encoding,
3552                                    const Py_UNICODE *unicode, Py_ssize_t size,
3553                                    Py_ssize_t startpos, Py_ssize_t endpos,
3554                                    const char *reason)
3555 {
3556     make_encode_exception(exceptionObject,
3557                           encoding, unicode, size, startpos, endpos, reason);
3558     if (*exceptionObject != NULL)
3559         PyCodec_StrictErrors(*exceptionObject);
3560 }
3561 
3562 /* error handling callback helper:
3563    build arguments, call the callback and check the arguments,
3564    put the result into newpos and return the replacement string, which
3565    has to be freed by the caller */
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)3566 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3567                                                   PyObject **errorHandler,
3568                                                   const char *encoding, const char *reason,
3569                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3570                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3571                                                   Py_ssize_t *newpos)
3572 {
3573     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3574 
3575     PyObject *restuple;
3576     PyObject *resunicode;
3577 
3578     if (*errorHandler == NULL) {
3579         *errorHandler = PyCodec_LookupError(errors);
3580         if (*errorHandler == NULL)
3581             return NULL;
3582     }
3583 
3584     make_encode_exception(exceptionObject,
3585                           encoding, unicode, size, startpos, endpos, reason);
3586     if (*exceptionObject == NULL)
3587         return NULL;
3588 
3589     restuple = PyObject_CallFunctionObjArgs(
3590         *errorHandler, *exceptionObject, NULL);
3591     if (restuple == NULL)
3592         return NULL;
3593     if (!PyTuple_Check(restuple)) {
3594         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3595         Py_DECREF(restuple);
3596         return NULL;
3597     }
3598     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3599                           &resunicode, newpos)) {
3600         Py_DECREF(restuple);
3601         return NULL;
3602     }
3603     if (*newpos<0)
3604         *newpos = size+*newpos;
3605     if (*newpos<0 || *newpos>size) {
3606         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3607         Py_DECREF(restuple);
3608         return NULL;
3609     }
3610     Py_INCREF(resunicode);
3611     Py_DECREF(restuple);
3612     return resunicode;
3613 }
3614 
unicode_encode_ucs1(const Py_UNICODE * p,Py_ssize_t size,const char * errors,int limit)3615 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3616                                      Py_ssize_t size,
3617                                      const char *errors,
3618                                      int limit)
3619 {
3620     /* output object */
3621     PyObject *res;
3622     /* pointers to the beginning and end+1 of input */
3623     const Py_UNICODE *startp = p;
3624     const Py_UNICODE *endp = p + size;
3625     /* pointer to the beginning of the unencodable characters */
3626     /* const Py_UNICODE *badp = NULL; */
3627     /* pointer into the output */
3628     char *str;
3629     /* current output position */
3630     Py_ssize_t respos = 0;
3631     Py_ssize_t ressize;
3632     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3633     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3634     PyObject *errorHandler = NULL;
3635     PyObject *exc = NULL;
3636     /* the following variable is used for caching string comparisons
3637      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3638     int known_errorHandler = -1;
3639 
3640     /* allocate enough for a simple encoding without
3641        replacements, if we need more, we'll resize */
3642     res = PyString_FromStringAndSize(NULL, size);
3643     if (res == NULL)
3644         goto onError;
3645     if (size == 0)
3646         return res;
3647     str = PyString_AS_STRING(res);
3648     ressize = size;
3649 
3650     while (p<endp) {
3651         Py_UNICODE c = *p;
3652 
3653         /* can we encode this? */
3654         if (c<limit) {
3655             /* no overflow check, because we know that the space is enough */
3656             *str++ = (char)c;
3657             ++p;
3658         }
3659         else {
3660             Py_ssize_t unicodepos = p-startp;
3661             Py_ssize_t requiredsize;
3662             PyObject *repunicode;
3663             Py_ssize_t repsize;
3664             Py_ssize_t newpos;
3665             Py_ssize_t respos;
3666             Py_UNICODE *uni2;
3667             /* startpos for collecting unencodable chars */
3668             const Py_UNICODE *collstart = p;
3669             const Py_UNICODE *collend = p;
3670             /* find all unecodable characters */
3671             while ((collend < endp) && ((*collend) >= limit))
3672                 ++collend;
3673             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3674             if (known_errorHandler==-1) {
3675                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3676                     known_errorHandler = 1;
3677                 else if (!strcmp(errors, "replace"))
3678                     known_errorHandler = 2;
3679                 else if (!strcmp(errors, "ignore"))
3680                     known_errorHandler = 3;
3681                 else if (!strcmp(errors, "xmlcharrefreplace"))
3682                     known_errorHandler = 4;
3683                 else
3684                     known_errorHandler = 0;
3685             }
3686             switch (known_errorHandler) {
3687             case 1: /* strict */
3688                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3689                 goto onError;
3690             case 2: /* replace */
3691                 while (collstart++ < collend)
3692                     *str++ = '?'; /* fall through */
3693             case 3: /* ignore */
3694                 p = collend;
3695                 break;
3696             case 4: /* xmlcharrefreplace */
3697                 respos = str - PyString_AS_STRING(res);
3698                 /* determine replacement size (temporarily (mis)uses p) */
3699                 requiredsize = respos;
3700                 for (p = collstart; p < collend;) {
3701                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3702                     Py_ssize_t incr;
3703                     if (ch < 10)
3704                         incr = 2+1+1;
3705                     else if (ch < 100)
3706                         incr = 2+2+1;
3707                     else if (ch < 1000)
3708                         incr = 2+3+1;
3709                     else if (ch < 10000)
3710                         incr = 2+4+1;
3711                     else if (ch < 100000)
3712                         incr = 2+5+1;
3713                     else if (ch < 1000000)
3714                         incr = 2+6+1;
3715                     else
3716                         incr = 2+7+1;
3717                     if (requiredsize > PY_SSIZE_T_MAX - incr)
3718                         goto overflow;
3719                     requiredsize += incr;
3720                 }
3721                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3722                     goto overflow;
3723                 requiredsize += endp - collend;
3724                 if (requiredsize > ressize) {
3725                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3726                         requiredsize = 2*ressize;
3727                     if (_PyString_Resize(&res, requiredsize))
3728                         goto onError;
3729                     str = PyString_AS_STRING(res) + respos;
3730                     ressize = requiredsize;
3731                 }
3732                 /* generate replacement (temporarily (mis)uses p) */
3733                 for (p = collstart; p < collend;) {
3734                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3735                     str += sprintf(str, "&#%d;", (int)ch);
3736                 }
3737                 p = collend;
3738                 break;
3739             default:
3740                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3741                                                               encoding, reason, startp, size, &exc,
3742                                                               collstart-startp, collend-startp, &newpos);
3743                 if (repunicode == NULL)
3744                     goto onError;
3745                 /* need more space? (at least enough for what we have+the
3746                    replacement+the rest of the string, so we won't have to
3747                    check space for encodable characters) */
3748                 respos = str - PyString_AS_STRING(res);
3749                 repsize = PyUnicode_GET_SIZE(repunicode);
3750                 if (respos > PY_SSIZE_T_MAX - repsize)
3751                     goto overflow;
3752                 requiredsize = respos + repsize;
3753                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3754                     goto overflow;
3755                 requiredsize += endp - collend;
3756                 if (requiredsize > ressize) {
3757                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3758                         requiredsize = 2*ressize;
3759                     if (_PyString_Resize(&res, requiredsize)) {
3760                         Py_DECREF(repunicode);
3761                         goto onError;
3762                     }
3763                     str = PyString_AS_STRING(res) + respos;
3764                     ressize = requiredsize;
3765                 }
3766                 /* check if there is anything unencodable in the replacement
3767                    and copy it to the output */
3768                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3769                     c = *uni2;
3770                     if (c >= limit) {
3771                         raise_encode_exception(&exc, encoding, startp, size,
3772                                                unicodepos, unicodepos+1, reason);
3773                         Py_DECREF(repunicode);
3774                         goto onError;
3775                     }
3776                     *str = (char)c;
3777                 }
3778                 p = startp + newpos;
3779                 Py_DECREF(repunicode);
3780             }
3781         }
3782     }
3783     /* Resize if we allocated to much */
3784     respos = str - PyString_AS_STRING(res);
3785     if (respos < ressize)
3786         /* If this falls res will be NULL */
3787         _PyString_Resize(&res, respos);
3788     Py_XDECREF(errorHandler);
3789     Py_XDECREF(exc);
3790     return res;
3791 
3792   overflow:
3793     PyErr_SetString(PyExc_OverflowError,
3794                     "encoded result is too long for a Python string");
3795 
3796   onError:
3797     Py_XDECREF(res);
3798     Py_XDECREF(errorHandler);
3799     Py_XDECREF(exc);
3800     return NULL;
3801 }
3802 
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3803 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3804                                  Py_ssize_t size,
3805                                  const char *errors)
3806 {
3807     return unicode_encode_ucs1(p, size, errors, 256);
3808 }
3809 
/* Encode a unicode object as Latin-1 with the default (NULL) error scheme.
   A non-unicode argument is reported via PyErr_BadArgument() and NULL is
   returned. */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode),
                                      NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3820 
3821 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3822 
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)3823 PyObject *PyUnicode_DecodeASCII(const char *s,
3824                                 Py_ssize_t size,
3825                                 const char *errors)
3826 {
3827     const char *starts = s;
3828     PyUnicodeObject *v;
3829     Py_UNICODE *p;
3830     Py_ssize_t startinpos;
3831     Py_ssize_t endinpos;
3832     Py_ssize_t outpos;
3833     const char *e;
3834     PyObject *errorHandler = NULL;
3835     PyObject *exc = NULL;
3836 
3837     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3838     if (size == 1 && *(unsigned char*)s < 128) {
3839         Py_UNICODE r = *(unsigned char*)s;
3840         return PyUnicode_FromUnicode(&r, 1);
3841     }
3842 
3843     v = _PyUnicode_New(size);
3844     if (v == NULL)
3845         goto onError;
3846     if (size == 0)
3847         return (PyObject *)v;
3848     p = PyUnicode_AS_UNICODE(v);
3849     e = s + size;
3850     while (s < e) {
3851         register unsigned char c = (unsigned char)*s;
3852         if (c < 128) {
3853             *p++ = c;
3854             ++s;
3855         }
3856         else {
3857             startinpos = s-starts;
3858             endinpos = startinpos + 1;
3859             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3860             if (unicode_decode_call_errorhandler(
3861                     errors, &errorHandler,
3862                     "ascii", "ordinal not in range(128)",
3863                     starts, size, &startinpos, &endinpos, &exc, &s,
3864                     &v, &outpos, &p))
3865                 goto onError;
3866         }
3867     }
3868     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3869         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3870             goto onError;
3871     Py_XDECREF(errorHandler);
3872     Py_XDECREF(exc);
3873     return (PyObject *)v;
3874 
3875   onError:
3876     Py_XDECREF(v);
3877     Py_XDECREF(errorHandler);
3878     Py_XDECREF(exc);
3879     return NULL;
3880 }
3881 
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3882 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3883                                 Py_ssize_t size,
3884                                 const char *errors)
3885 {
3886     return unicode_encode_ucs1(p, size, errors, 128);
3887 }
3888 
/* Encode a unicode object as ASCII with the default (NULL) error scheme.
   A non-unicode argument is reported via PyErr_BadArgument() and NULL is
   returned. */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3899 
3900 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3901 
3902 /* --- MBCS codecs for Windows -------------------------------------------- */
3903 
3904 #if SIZEOF_INT < SIZEOF_SIZE_T
3905 #define NEED_RETRY
3906 #endif
3907 
3908 /* XXX This code is limited to "true" double-byte encodings, as
3909    a) it assumes an incomplete character consists of a single byte, and
3910    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3911    encodings, see IsDBCSLeadByteEx documentation. */
3912 
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   IsDBCSLeadByte() alone is not enough, because the byte could also be the
   second half of the preceding character.  The byte is therefore accepted
   only if IsDBCSLeadByte() is true for it AND one of the following holds for
   the character CharPrev() finds in front of it:
     - there is none (CharPrev() returned the current position),
     - its first byte is not a lead byte,
     - it starts exactly two bytes earlier. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3923 
3924 /*
3925  * Decode MBCS string into unicode object. If 'final' is set, converts
3926  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3927  */
decode_mbcs(PyUnicodeObject ** v,const char * s,int size,int final)3928 static int decode_mbcs(PyUnicodeObject **v,
3929                        const char *s, /* MBCS string */
3930                        int size, /* sizeof MBCS string */
3931                        int final)
3932 {
3933     Py_UNICODE *p;
3934     Py_ssize_t n = 0;
3935     int usize = 0;
3936 
3937     assert(size >= 0);
3938 
3939     /* Skip trailing lead-byte unless 'final' is set */
3940     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3941         --size;
3942 
3943     /* First get the size of the result */
3944     if (size > 0) {
3945         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3946         if (usize == 0) {
3947             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3948             return -1;
3949         }
3950     }
3951 
3952     if (*v == NULL) {
3953         /* Create unicode object */
3954         *v = _PyUnicode_New(usize);
3955         if (*v == NULL)
3956             return -1;
3957     }
3958     else {
3959         /* Extend unicode object */
3960         n = PyUnicode_GET_SIZE(*v);
3961         if (_PyUnicode_Resize(v, n + usize) < 0)
3962             return -1;
3963     }
3964 
3965     /* Do the conversion */
3966     if (size > 0) {
3967         p = PyUnicode_AS_UNICODE(*v) + n;
3968         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3969             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3970             return -1;
3971         }
3972     }
3973 
3974     return size;
3975 }
3976 
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)3977 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3978                                        Py_ssize_t size,
3979                                        const char *errors,
3980                                        Py_ssize_t *consumed)
3981 {
3982     PyUnicodeObject *v = NULL;
3983     int done;
3984 
3985     if (consumed)
3986         *consumed = 0;
3987 
3988 #ifdef NEED_RETRY
3989   retry:
3990     if (size > INT_MAX)
3991         done = decode_mbcs(&v, s, INT_MAX, 0);
3992     else
3993 #endif
3994         done = decode_mbcs(&v, s, (int)size, !consumed);
3995 
3996     if (done < 0) {
3997         Py_XDECREF(v);
3998         return NULL;
3999     }
4000 
4001     if (consumed)
4002         *consumed += done;
4003 
4004 #ifdef NEED_RETRY
4005     if (size > INT_MAX) {
4006         s += done;
4007         size -= done;
4008         goto retry;
4009     }
4010 #endif
4011 
4012     return (PyObject *)v;
4013 }
4014 
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)4015 PyObject *PyUnicode_DecodeMBCS(const char *s,
4016                                Py_ssize_t size,
4017                                const char *errors)
4018 {
4019     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4020 }
4021 
4022 /*
4023  * Convert unicode into string object (MBCS).
4024  * Returns 0 if succeed, -1 otherwise.
4025  */
encode_mbcs(PyObject ** repr,const Py_UNICODE * p,int size)4026 static int encode_mbcs(PyObject **repr,
4027                        const Py_UNICODE *p, /* unicode */
4028                        int size) /* size of unicode */
4029 {
4030     int mbcssize = 0;
4031     Py_ssize_t n = 0;
4032 
4033     assert(size >= 0);
4034 
4035     /* First get the size of the result */
4036     if (size > 0) {
4037         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4038         if (mbcssize == 0) {
4039             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4040             return -1;
4041         }
4042     }
4043 
4044     if (*repr == NULL) {
4045         /* Create string object */
4046         *repr = PyString_FromStringAndSize(NULL, mbcssize);
4047         if (*repr == NULL)
4048             return -1;
4049     }
4050     else {
4051         /* Extend string object */
4052         n = PyString_Size(*repr);
4053         if (_PyString_Resize(repr, n + mbcssize) < 0)
4054             return -1;
4055     }
4056 
4057     /* Do the conversion */
4058     if (size > 0) {
4059         char *s = PyString_AS_STRING(*repr) + n;
4060         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4061             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4062             return -1;
4063         }
4064     }
4065 
4066     return 0;
4067 }
4068 
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)4069 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4070                                Py_ssize_t size,
4071                                const char *errors)
4072 {
4073     PyObject *repr = NULL;
4074     int ret;
4075 
4076 #ifdef NEED_RETRY
4077   retry:
4078     if (size > INT_MAX)
4079         ret = encode_mbcs(&repr, p, INT_MAX);
4080     else
4081 #endif
4082         ret = encode_mbcs(&repr, p, (int)size);
4083 
4084     if (ret < 0) {
4085         Py_XDECREF(repr);
4086         return NULL;
4087     }
4088 
4089 #ifdef NEED_RETRY
4090     if (size > INT_MAX) {
4091         p += INT_MAX;
4092         size -= INT_MAX;
4093         goto retry;
4094     }
4095 #endif
4096 
4097     return repr;
4098 }
4099 
/* Encode a unicode object as MBCS with a NULL error scheme.  A non-unicode
   argument is reported via PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
4110 
4111 #undef NEED_RETRY
4112 
4113 #endif /* MS_WINDOWS */
4114 
4115 /* --- Character Mapping Codec -------------------------------------------- */
4116 
/* Decode the `size` bytes at `s` through `mapping` and return a new unicode
   object, or NULL with an exception set.

   mapping == NULL       the input is decoded as Latin-1.
   exact unicode object  used as a lookup table indexed by byte value; a
                         byte beyond the table or mapped to 0xFFFE is
                         undefined.
   anything else         indexed with the byte value as an int via
                         PyObject_GetItem().  The item may be
                           - None, or the lookup raises LookupError:
                             undefined;
                           - an int: the code point (0xFFFE means undefined,
                             values outside range(0x110000) raise TypeError;
                             on narrow builds values above 0xFFFF are stored
                             as a surrogate pair);
                           - a unicode object: length 1 is stored directly
                             (0xFFFE means undefined), longer strings are
                             copied, an empty string drops the byte;
                           - any other type raises TypeError.

   Undefined bytes are passed to the error handler selected by `errors` with
   the reason "character maps to <undefined>".

   `extrachars` tracks how many output units have been allocated beyond the
   one-per-input-byte baseline, so that 1-n mappings only resize now and
   then. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: the mapping is a plain unicode lookup table. */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    goto Undefined;
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (x == Py_None)
                goto Undefined;
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value == 0xFFFE)
                    goto Undefined;
                if (value < 0 || value > 0x10FFFF) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(0x110000)");
                    Py_DECREF(x);
                    goto onError;
                }

#ifndef Py_UNICODE_WIDE
                if (value > 0xFFFF) {
                    /* see the code for 1-n mapping below */
                    if (extrachars < 2) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = 10 - extrachars;
                        /* refuse to grow past what Py_ssize_t can express */
                        if (needed > PY_SSIZE_T_MAX - PyUnicode_GET_SIZE(v)) {
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    /* split into a UTF-16 surrogate pair */
                    value -= 0x10000;
                    *p++ = 0xD800 | (Py_UNICODE)(value >> 10);
                    *p++ = 0xDC00 | (value & 0x3FF);
                    extrachars -= 2;
                }
                else
#endif
                *p++ = (Py_UNICODE)value;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1) {
                    /* 1-1 mapping */
                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
                    if (value == 0xFFFE)
                        goto Undefined;
                    *p++ = value;
                }
                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed;
                        /* extrachars never goes negative, so the growth
                           computed below is at most 5*targetsize.  Make sure
                           neither that product nor the new total length can
                           overflow before doing the arithmetic. */
                        if (targetsize >
                            (PY_SSIZE_T_MAX - PyUnicode_GET_SIZE(v)) / 5) {
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        needed = (targetsize - extrachars) +
                            (targetsize << 2);
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
            continue;
Undefined:
            /* undefined mapping; x is NULL here when the lookup failed */
            Py_XDECREF(x);
            outpos = p-PyUnicode_AS_UNICODE(v);
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p)) {
                goto onError;
            }
        }
    }
    /* Trim any unused tail of the output buffer. */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4305 
4306 /* Charmap encoding: the lookup table */
4307 
4308 struct encoding_map{
4309     PyObject_HEAD
4310     unsigned char level1[32];
4311     int count2, count3;
4312     unsigned char level23[1];
4313 };
4314 
4315 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)4316 encoding_map_size(PyObject *obj, PyObject* args)
4317 {
4318     struct encoding_map *map = (struct encoding_map*)obj;
4319     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4320                           128*map->count3);
4321 }
4322 
4323 static PyMethodDef encoding_map_methods[] = {
4324     {"size", encoding_map_size, METH_NOARGS,
4325      PyDoc_STR("Return the size (in bytes) of this object") },
4326     { 0 }
4327 };
4328 
4329 static void
encoding_map_dealloc(PyObject * o)4330 encoding_map_dealloc(PyObject* o)
4331 {
4332     PyObject_FREE(o);
4333 }
4334 
4335 static PyTypeObject EncodingMapType = {
4336     PyVarObject_HEAD_INIT(NULL, 0)
4337     "EncodingMap",          /*tp_name*/
4338     sizeof(struct encoding_map),   /*tp_basicsize*/
4339     0,                      /*tp_itemsize*/
4340     /* methods */
4341     encoding_map_dealloc,   /*tp_dealloc*/
4342     0,                      /*tp_print*/
4343     0,                      /*tp_getattr*/
4344     0,                      /*tp_setattr*/
4345     0,                      /*tp_compare*/
4346     0,                      /*tp_repr*/
4347     0,                      /*tp_as_number*/
4348     0,                      /*tp_as_sequence*/
4349     0,                      /*tp_as_mapping*/
4350     0,                      /*tp_hash*/
4351     0,                      /*tp_call*/
4352     0,                      /*tp_str*/
4353     0,                      /*tp_getattro*/
4354     0,                      /*tp_setattro*/
4355     0,                      /*tp_as_buffer*/
4356     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4357     0,                      /*tp_doc*/
4358     0,                      /*tp_traverse*/
4359     0,                      /*tp_clear*/
4360     0,                      /*tp_richcompare*/
4361     0,                      /*tp_weaklistoffset*/
4362     0,                      /*tp_iter*/
4363     0,                      /*tp_iternext*/
4364     encoding_map_methods,   /*tp_methods*/
4365     0,                      /*tp_members*/
4366     0,                      /*tp_getset*/
4367     0,                      /*tp_base*/
4368     0,                      /*tp_dict*/
4369     0,                      /*tp_descr_get*/
4370     0,                      /*tp_descr_set*/
4371     0,                      /*tp_dictoffset*/
4372     0,                      /*tp_init*/
4373     0,                      /*tp_alloc*/
4374     0,                      /*tp_new*/
4375     0,                      /*tp_free*/
4376     0,                      /*tp_is_gc*/
4377 };
4378 
4379 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)4380 PyUnicode_BuildEncodingMap(PyObject* string)
4381 {
4382     Py_UNICODE *decode;
4383     PyObject *result;
4384     struct encoding_map *mresult;
4385     int i;
4386     int need_dict = 0;
4387     unsigned char level1[32];
4388     unsigned char level2[512];
4389     unsigned char *mlevel1, *mlevel2, *mlevel3;
4390     int count2 = 0, count3 = 0;
4391 
4392     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4393         PyErr_BadArgument();
4394         return NULL;
4395     }
4396     decode = PyUnicode_AS_UNICODE(string);
4397     memset(level1, 0xFF, sizeof level1);
4398     memset(level2, 0xFF, sizeof level2);
4399 
4400     /* If there isn't a one-to-one mapping of NULL to \0,
4401        or if there are non-BMP characters, we need to use
4402        a mapping dictionary. */
4403     if (decode[0] != 0)
4404         need_dict = 1;
4405     for (i = 1; i < 256; i++) {
4406         int l1, l2;
4407         if (decode[i] == 0
4408 #ifdef Py_UNICODE_WIDE
4409             || decode[i] > 0xFFFF
4410 #endif
4411             ) {
4412             need_dict = 1;
4413             break;
4414         }
4415         if (decode[i] == 0xFFFE)
4416             /* unmapped character */
4417             continue;
4418         l1 = decode[i] >> 11;
4419         l2 = decode[i] >> 7;
4420         if (level1[l1] == 0xFF)
4421             level1[l1] = count2++;
4422         if (level2[l2] == 0xFF)
4423             level2[l2] = count3++;
4424     }
4425 
4426     if (count2 >= 0xFF || count3 >= 0xFF)
4427         need_dict = 1;
4428 
4429     if (need_dict) {
4430         PyObject *result = PyDict_New();
4431         PyObject *key, *value;
4432         if (!result)
4433             return NULL;
4434         for (i = 0; i < 256; i++) {
4435             value = NULL;
4436             key = PyInt_FromLong(decode[i]);
4437             value = PyInt_FromLong(i);
4438             if (!key || !value)
4439                 goto failed1;
4440             if (PyDict_SetItem(result, key, value) == -1)
4441                 goto failed1;
4442             Py_DECREF(key);
4443             Py_DECREF(value);
4444         }
4445         return result;
4446       failed1:
4447         Py_XDECREF(key);
4448         Py_XDECREF(value);
4449         Py_DECREF(result);
4450         return NULL;
4451     }
4452 
4453     /* Create a three-level trie */
4454     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4455                              16*count2 + 128*count3 - 1);
4456     if (!result)
4457         return PyErr_NoMemory();
4458     PyObject_Init(result, &EncodingMapType);
4459     mresult = (struct encoding_map*)result;
4460     mresult->count2 = count2;
4461     mresult->count3 = count3;
4462     mlevel1 = mresult->level1;
4463     mlevel2 = mresult->level23;
4464     mlevel3 = mresult->level23 + 16*count2;
4465     memcpy(mlevel1, level1, 32);
4466     memset(mlevel2, 0xFF, 16*count2);
4467     memset(mlevel3, 0, 128*count3);
4468     count3 = 0;
4469     for (i = 1; i < 256; i++) {
4470         int o1, o2, o3, i2, i3;
4471         if (decode[i] == 0xFFFE)
4472             /* unmapped character */
4473             continue;
4474         o1 = decode[i]>>11;
4475         o2 = (decode[i]>>7) & 0xF;
4476         i2 = 16*mlevel1[o1] + o2;
4477         if (mlevel2[i2] == 0xFF)
4478             mlevel2[i2] = count3++;
4479         o3 = decode[i] & 0x7F;
4480         i3 = 128*mlevel2[i2] + o3;
4481         mlevel3[i3] = i;
4482     }
4483     return result;
4484 }
4485 
4486 static int
encoding_map_lookup(Py_UNICODE c,PyObject * mapping)4487 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4488 {
4489     struct encoding_map *map = (struct encoding_map*)mapping;
4490     int l1 = c>>11;
4491     int l2 = (c>>7) & 0xF;
4492     int l3 = c & 0x7F;
4493     int i;
4494 
4495 #ifdef Py_UNICODE_WIDE
4496     if (c > 0xFFFF) {
4497         return -1;
4498     }
4499 #endif
4500     if (c == 0)
4501         return 0;
4502     /* level 1*/
4503     i = map->level1[l1];
4504     if (i == 0xFF) {
4505         return -1;
4506     }
4507     /* level 2*/
4508     i = map->level23[16*i+l2];
4509     if (i == 0xFF) {
4510         return -1;
4511     }
4512     /* level 3 */
4513     i = map->level23[16*map->count2 + 128*i + l3];
4514     if (i == 0) {
4515         return -1;
4516     }
4517     return i;
4518 }
4519 
4520 /* Lookup the character ch in the mapping. If the character
4521    can't be found, Py_None is returned (or NULL, if another
4522    error occurred). */
charmapencode_lookup(Py_UNICODE c,PyObject * mapping)4523 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4524 {
4525     PyObject *w = PyInt_FromLong((long)c);
4526     PyObject *x;
4527 
4528     if (w == NULL)
4529         return NULL;
4530     x = PyObject_GetItem(mapping, w);
4531     Py_DECREF(w);
4532     if (x == NULL) {
4533         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4534             /* No mapping found means: mapping is undefined. */
4535             PyErr_Clear();
4536             x = Py_None;
4537             Py_INCREF(x);
4538             return x;
4539         } else
4540             return NULL;
4541     }
4542     else if (x == Py_None)
4543         return x;
4544     else if (PyInt_Check(x)) {
4545         long value = PyInt_AS_LONG(x);
4546         if (value < 0 || value > 255) {
4547             PyErr_SetString(PyExc_TypeError,
4548                             "character mapping must be in range(256)");
4549             Py_DECREF(x);
4550             return NULL;
4551         }
4552         return x;
4553     }
4554     else if (PyString_Check(x))
4555         return x;
4556     else {
4557         /* wrong return value */
4558         PyErr_SetString(PyExc_TypeError,
4559                         "character mapping must return integer, None or str");
4560         Py_DECREF(x);
4561         return NULL;
4562     }
4563 }
4564 
4565 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)4566 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4567 {
4568     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4569     /* exponentially overallocate to minimize reallocations */
4570     if (requiredsize < 2*outsize)
4571         requiredsize = 2*outsize;
4572     if (_PyString_Resize(outobj, requiredsize)) {
4573         return 0;
4574     }
4575     return 1;
4576 }
4577 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or a resize of the output failed */
} charmapencode_result;
4581 /* lookup the character, put the result in the output string and adjust
4582    various state variables. Reallocate the output string if not enough
4583    space is available. Return a new reference to the object that
4584    was put in the output buffer, or Py_None, if the mapping was undefined
4585    (in which case no character was written) or NULL, if a
4586    reallocation error occurred. The caller must decref the result */
4587 static
charmapencode_output(Py_UNICODE c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)4588 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4589                                           PyObject **outobj, Py_ssize_t *outpos)
4590 {
4591     PyObject *rep;
4592     char *outstart;
4593     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4594 
4595     if (Py_TYPE(mapping) == &EncodingMapType) {
4596         int res = encoding_map_lookup(c, mapping);
4597         Py_ssize_t requiredsize = *outpos+1;
4598         if (res == -1)
4599             return enc_FAILED;
4600         if (outsize<requiredsize)
4601             if (!charmapencode_resize(outobj, outpos, requiredsize))
4602                 return enc_EXCEPTION;
4603         outstart = PyString_AS_STRING(*outobj);
4604         outstart[(*outpos)++] = (char)res;
4605         return enc_SUCCESS;
4606     }
4607 
4608     rep = charmapencode_lookup(c, mapping);
4609     if (rep==NULL)
4610         return enc_EXCEPTION;
4611     else if (rep==Py_None) {
4612         Py_DECREF(rep);
4613         return enc_FAILED;
4614     } else {
4615         if (PyInt_Check(rep)) {
4616             Py_ssize_t requiredsize = *outpos+1;
4617             if (outsize<requiredsize)
4618                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4619                     Py_DECREF(rep);
4620                     return enc_EXCEPTION;
4621                 }
4622             outstart = PyString_AS_STRING(*outobj);
4623             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4624         }
4625         else {
4626             const char *repchars = PyString_AS_STRING(rep);
4627             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4628             Py_ssize_t requiredsize = *outpos+repsize;
4629             if (outsize<requiredsize)
4630                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4631                     Py_DECREF(rep);
4632                     return enc_EXCEPTION;
4633                 }
4634             outstart = PyString_AS_STRING(*outobj);
4635             memcpy(outstart + *outpos, repchars, repsize);
4636             *outpos += repsize;
4637         }
4638     }
4639     Py_DECREF(rep);
4640     return enc_SUCCESS;
4641 }
4642 
4643 /* handle an error in PyUnicode_EncodeCharmap
4644    Return 0 on success, -1 on error */
4645 static
charmap_encoding_error(const Py_UNICODE * p,Py_ssize_t size,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,int * known_errorHandler,PyObject ** errorHandler,const char * errors,PyObject ** res,Py_ssize_t * respos)4646 int charmap_encoding_error(
4647     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4648     PyObject **exceptionObject,
4649     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4650     PyObject **res, Py_ssize_t *respos)
4651 {
4652     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4653     Py_ssize_t repsize;
4654     Py_ssize_t newpos;
4655     Py_UNICODE *uni2;
4656     /* startpos for collecting unencodable chars */
4657     Py_ssize_t collstartpos = *inpos;
4658     Py_ssize_t collendpos = *inpos+1;
4659     Py_ssize_t collpos;
4660     char *encoding = "charmap";
4661     char *reason = "character maps to <undefined>";
4662     charmapencode_result x;
4663 
4664     /* find all unencodable characters */
4665     while (collendpos < size) {
4666         PyObject *rep;
4667         if (Py_TYPE(mapping) == &EncodingMapType) {
4668             int res = encoding_map_lookup(p[collendpos], mapping);
4669             if (res != -1)
4670                 break;
4671             ++collendpos;
4672             continue;
4673         }
4674 
4675         rep = charmapencode_lookup(p[collendpos], mapping);
4676         if (rep==NULL)
4677             return -1;
4678         else if (rep!=Py_None) {
4679             Py_DECREF(rep);
4680             break;
4681         }
4682         Py_DECREF(rep);
4683         ++collendpos;
4684     }
4685     /* cache callback name lookup
4686      * (if not done yet, i.e. it's the first error) */
4687     if (*known_errorHandler==-1) {
4688         if ((errors==NULL) || (!strcmp(errors, "strict")))
4689             *known_errorHandler = 1;
4690         else if (!strcmp(errors, "replace"))
4691             *known_errorHandler = 2;
4692         else if (!strcmp(errors, "ignore"))
4693             *known_errorHandler = 3;
4694         else if (!strcmp(errors, "xmlcharrefreplace"))
4695             *known_errorHandler = 4;
4696         else
4697             *known_errorHandler = 0;
4698     }
4699     switch (*known_errorHandler) {
4700     case 1: /* strict */
4701         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4702         return -1;
4703     case 2: /* replace */
4704         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4705             x = charmapencode_output('?', mapping, res, respos);
4706             if (x==enc_EXCEPTION) {
4707                 return -1;
4708             }
4709             else if (x==enc_FAILED) {
4710                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4711                 return -1;
4712             }
4713         }
4714         /* fall through */
4715     case 3: /* ignore */
4716         *inpos = collendpos;
4717         break;
4718     case 4: /* xmlcharrefreplace */
4719         /* generate replacement */
4720         for (collpos = collstartpos; collpos < collendpos;) {
4721             char buffer[2+29+1+1];
4722             char *cp;
4723             Py_UCS4 ch = p[collpos++];
4724 #ifndef Py_UNICODE_WIDE
4725             if ((0xD800 <= ch && ch <= 0xDBFF) &&
4726                 (collpos < collendpos) &&
4727                 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4728                 ch = ((((ch & 0x03FF) << 10) |
4729                        ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4730             }
4731 #endif
4732             sprintf(buffer, "&#%d;", (int)ch);
4733             for (cp = buffer; *cp; ++cp) {
4734                 x = charmapencode_output(*cp, mapping, res, respos);
4735                 if (x==enc_EXCEPTION)
4736                     return -1;
4737                 else if (x==enc_FAILED) {
4738                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4739                     return -1;
4740                 }
4741             }
4742         }
4743         *inpos = collendpos;
4744         break;
4745     default:
4746         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4747                                                       encoding, reason, p, size, exceptionObject,
4748                                                       collstartpos, collendpos, &newpos);
4749         if (repunicode == NULL)
4750             return -1;
4751         /* generate replacement  */
4752         repsize = PyUnicode_GET_SIZE(repunicode);
4753         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4754             x = charmapencode_output(*uni2, mapping, res, respos);
4755             if (x==enc_EXCEPTION) {
4756                 return -1;
4757             }
4758             else if (x==enc_FAILED) {
4759                 Py_DECREF(repunicode);
4760                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4761                 return -1;
4762             }
4763         }
4764         *inpos = newpos;
4765         Py_DECREF(repunicode);
4766     }
4767     return 0;
4768 }
4769 
/* Encode the `size` characters at `p` to a byte string using `mapping`.

   A NULL mapping selects the Latin-1 encoder.  `errors` names the error
   handling scheme applied to characters the mapping cannot encode.
   Returns a new string object, or NULL on error. */
PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                        Py_ssize_t size,
                        PyObject *mapping,
                        const char *errors)
{
    PyObject *out = NULL;           /* output object */
    PyObject *handler = NULL;       /* error callback, looked up lazily */
    PyObject *excobj = NULL;        /* exception object, reused per error */
    Py_ssize_t srcpos = 0;          /* current input position */
    Py_ssize_t dstpos = 0;          /* current output position */
    /* cached result of comparing `errors` against the built-in names:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int handler_kind = -1;

    /* Default to Latin-1 */
    if (mapping == NULL) {
        return PyUnicode_EncodeLatin1(p, size, errors);
    }

    /* One byte per character is enough for a plain encoding without
       replacements; the helpers grow the string when more is needed. */
    out = PyString_FromStringAndSize(NULL, size);
    if (out == NULL) {
        return NULL;
    }
    if (size == 0) {
        return out;
    }

    while (srcpos < size) {
        switch (charmapencode_output(p[srcpos], mapping, &out, &dstpos)) {
        case enc_EXCEPTION:
            goto fail;
        case enc_FAILED:
            /* unencodable character: the error routine moves srcpos */
            if (charmap_encoding_error(p, size, &srcpos, mapping,
                                       &excobj,
                                       &handler_kind, &handler, errors,
                                       &out, &dstpos)) {
                goto fail;
            }
            break;
        default:
            /* character written: move on to the next one */
            ++srcpos;
            break;
        }
    }

    /* Trim the string if more was allocated than was used */
    if (dstpos < PyString_GET_SIZE(out) && _PyString_Resize(&out, dstpos)) {
        goto fail;
    }
    Py_XDECREF(excobj);
    Py_XDECREF(handler);
    return out;

  fail:
    Py_XDECREF(out);
    Py_XDECREF(excobj);
    Py_XDECREF(handler);
    return NULL;
}
4833 
/* Encode the unicode object `unicode` with `mapping`, passing NULL as
   the `errors` argument.  Calls PyErr_BadArgument() and returns NULL if
   `unicode` is not a unicode object or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       mapping,
                                       NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
4846 
4847 /* create or adjust a UnicodeTranslateError */
make_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4848 static void make_translate_exception(PyObject **exceptionObject,
4849                                      const Py_UNICODE *unicode, Py_ssize_t size,
4850                                      Py_ssize_t startpos, Py_ssize_t endpos,
4851                                      const char *reason)
4852 {
4853     if (*exceptionObject == NULL) {
4854         *exceptionObject = PyUnicodeTranslateError_Create(
4855             unicode, size, startpos, endpos, reason);
4856     }
4857     else {
4858         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4859             goto onError;
4860         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4861             goto onError;
4862         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4863             goto onError;
4864         return;
4865       onError:
4866         Py_CLEAR(*exceptionObject);
4867     }
4868 }
4869 
4870 /* raises a UnicodeTranslateError */
raise_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4871 static void raise_translate_exception(PyObject **exceptionObject,
4872                                       const Py_UNICODE *unicode, Py_ssize_t size,
4873                                       Py_ssize_t startpos, Py_ssize_t endpos,
4874                                       const char *reason)
4875 {
4876     make_translate_exception(exceptionObject,
4877                              unicode, size, startpos, endpos, reason);
4878     if (*exceptionObject != NULL)
4879         PyCodec_StrictErrors(*exceptionObject);
4880 }
4881 
4882 /* error handling callback helper:
4883    build arguments, call the callback and check the arguments,
4884    put the result into newpos and return the replacement string, which
4885    has to be freed by the caller */
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)4886 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4887                                                      PyObject **errorHandler,
4888                                                      const char *reason,
4889                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4890                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4891                                                      Py_ssize_t *newpos)
4892 {
4893     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4894 
4895     Py_ssize_t i_newpos;
4896     PyObject *restuple;
4897     PyObject *resunicode;
4898 
4899     if (*errorHandler == NULL) {
4900         *errorHandler = PyCodec_LookupError(errors);
4901         if (*errorHandler == NULL)
4902             return NULL;
4903     }
4904 
4905     make_translate_exception(exceptionObject,
4906                              unicode, size, startpos, endpos, reason);
4907     if (*exceptionObject == NULL)
4908         return NULL;
4909 
4910     restuple = PyObject_CallFunctionObjArgs(
4911         *errorHandler, *exceptionObject, NULL);
4912     if (restuple == NULL)
4913         return NULL;
4914     if (!PyTuple_Check(restuple)) {
4915         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4916         Py_DECREF(restuple);
4917         return NULL;
4918     }
4919     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4920                           &resunicode, &i_newpos)) {
4921         Py_DECREF(restuple);
4922         return NULL;
4923     }
4924     if (i_newpos<0)
4925         *newpos = size+i_newpos;
4926     else
4927         *newpos = i_newpos;
4928     if (*newpos<0 || *newpos>size) {
4929         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4930         Py_DECREF(restuple);
4931         return NULL;
4932     }
4933     Py_INCREF(resunicode);
4934     Py_DECREF(restuple);
4935     return resunicode;
4936 }
4937 
4938 /* Lookup the character ch in the mapping and put the result in result,
4939    which must be decrefed by the caller.
4940    Return 0 on success, -1 on error */
4941 static
charmaptranslate_lookup(Py_UNICODE c,PyObject * mapping,PyObject ** result)4942 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4943 {
4944     PyObject *w = PyInt_FromLong((long)c);
4945     PyObject *x;
4946 
4947     if (w == NULL)
4948         return -1;
4949     x = PyObject_GetItem(mapping, w);
4950     Py_DECREF(w);
4951     if (x == NULL) {
4952         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4953             /* No mapping found means: use 1:1 mapping. */
4954             PyErr_Clear();
4955             *result = NULL;
4956             return 0;
4957         } else
4958             return -1;
4959     }
4960     else if (x == Py_None) {
4961         *result = x;
4962         return 0;
4963     }
4964     else if (PyInt_Check(x)) {
4965         long value = PyInt_AS_LONG(x);
4966         long max = PyUnicode_GetMax();
4967         if (value < 0 || value > max) {
4968             PyErr_Format(PyExc_TypeError,
4969                          "character mapping must be in range(0x%lx)", max+1);
4970             Py_DECREF(x);
4971             return -1;
4972         }
4973         *result = x;
4974         return 0;
4975     }
4976     else if (PyUnicode_Check(x)) {
4977         *result = x;
4978         return 0;
4979     }
4980     else {
4981         /* wrong return value */
4982         PyErr_SetString(PyExc_TypeError,
4983                         "character mapping must return integer, None or unicode");
4984         Py_DECREF(x);
4985         return -1;
4986     }
4987 }
4988 /* ensure that *outobj is at least requiredsize characters long,
4989    if not reallocate and adjust various state variables.
4990    Return 0 on success, -1 on error */
4991 static
charmaptranslate_makespace(PyObject ** outobj,Py_UNICODE ** outp,Py_ssize_t requiredsize)4992 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4993                                Py_ssize_t requiredsize)
4994 {
4995     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4996     if (requiredsize > oldsize) {
4997         /* remember old output position */
4998         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4999         /* exponentially overallocate to minimize reallocations */
5000         if (requiredsize < 2 * oldsize)
5001             requiredsize = 2 * oldsize;
5002         if (PyUnicode_Resize(outobj, requiredsize) < 0)
5003             return -1;
5004         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5005     }
5006     return 0;
5007 }
5008 /* lookup the character, put the result in the output string and adjust
5009    various state variables. Return a new reference to the object that
5010    was put in the output buffer in *result, or Py_None, if the mapping was
5011    undefined (in which case no character was written).
5012    The called must decref result.
5013    Return 0 on success, -1 on error. */
5014 static
charmaptranslate_output(const Py_UNICODE * startinp,const Py_UNICODE * curinp,Py_ssize_t insize,PyObject * mapping,PyObject ** outobj,Py_UNICODE ** outp,PyObject ** res)5015 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5016                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5017                             PyObject **res)
5018 {
5019     if (charmaptranslate_lookup(*curinp, mapping, res))
5020         return -1;
5021     if (*res==NULL) {
5022         /* not found => default to 1:1 mapping */
5023         *(*outp)++ = *curinp;
5024     }
5025     else if (*res==Py_None)
5026         ;
5027     else if (PyInt_Check(*res)) {
5028         /* no overflow check, because we know that the space is enough */
5029         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5030     }
5031     else if (PyUnicode_Check(*res)) {
5032         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5033         if (repsize==1) {
5034             /* no overflow check, because we know that the space is enough */
5035             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5036         }
5037         else if (repsize!=0) {
5038             /* more than one character */
5039             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5040                 (insize - (curinp-startinp)) +
5041                 repsize - 1;
5042             if (charmaptranslate_makespace(outobj, outp, requiredsize))
5043                 return -1;
5044             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5045             *outp += repsize;
5046         }
5047     }
5048     else
5049         return -1;
5050     return 0;
5051 }
5052 
/* Translate the size code units starting at p through mapping and return
   the result as a new Unicode object.

   Every input character is handed to charmaptranslate_output().  When that
   helper leaves Py_None in x the character is treated as untranslatable;
   the run of consecutive untranslatable characters is then handled
   according to errors:

     NULL / "strict"      raise_translate_exception() is called and the
                          function fails
     "replace"            one '?' is written per untranslatable code unit
     "ignore"             the run is skipped
     "xmlcharrefreplace"  "&#N;" is written for each character produced by
                          _Py_UNICODE_NEXT() over the run
     anything else        unicode_translate_call_errorhandler() supplies
                          the replacement text and the position at which
                          to resume

   Returns NULL on failure.  A NULL mapping is rejected with
   PyErr_BadArgument(). */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    /* both are filled in lazily by the error-handling helpers and are
       released on the exit paths at the bottom of the function */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    /* nothing to translate; errorHandler and exc are still NULL here,
       so there is nothing else to release */
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared by identity below, never dereferenced, so
           the reference can be dropped first */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters: extend collend while
               the lookup keeps yielding Py_None */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend;) {
                    /* NOTE(review): presumably sized as "&#" + up to 29
                       digits + ';' + NUL -- confirm */
                    char buffer[2+29+1+1];
                    char *cp;
                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
                    sprintf(buffer, "&#%d;", (int)ch);
                    /* required size = output written so far + this
                       reference + input still to be processed */
                    if (charmaptranslate_makespace(&res, &str,
                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, startp, size, &exc,
                                                                 collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement  */
                repsize = PyUnicode_GET_SIZE(repunicode);
                /* required size = output written so far + replacement +
                   input still to be processed */
                if (charmaptranslate_makespace(&res, &str,
                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                /* resume at the position chosen by the error handler */
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
5195 
/* Coerce str to a Unicode object with PyUnicode_FromObject() and translate
   its contents through mapping using PyUnicode_TranslateCharmap().

   Returns whatever PyUnicode_TranslateCharmap() returns, or NULL when the
   coercion fails.  The temporary Unicode object is released before
   returning. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *text;
    PyObject *translated;

    text = PyUnicode_FromObject(str);
    if (text == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(text),
                                            PyUnicode_GET_SIZE(text),
                                            mapping,
                                            errors);
    Py_DECREF(text);
    return translated;
}
5216 
5217 /* --- Decimal Encoder ---------------------------------------------------- */
5218 
/* Encode the length code units starting at s into the char buffer output
   and terminate it with a 0 byte.

   Per input character:
     - characters for which Py_UNICODE_ISSPACE() is true become ' ';
     - characters with a decimal value (Py_UNICODE_TODECIMAL() >= 0)
       become the ASCII digit '0' + value;
     - other characters in the range 1..255 are copied as a single byte;
     - everything else (including code unit 0) is unencodable and the run
       of consecutive unencodable characters is handled according to
       errors: NULL/"strict", "replace", "ignore", "xmlcharrefreplace",
       or a registered error handler looked up through
       unicode_encode_call_errorhandler().

   output is written without any bounds checking: this function has no
   way to know its size, so the caller must supply a buffer that is large
   enough for the encoded text plus the terminating 0 byte.

   Returns 0 on success and -1 on failure.  A NULL output is rejected
   with PyErr_BadArgument(). */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                            Py_ssize_t length,
                            char *output,
                            const char *errors)
{
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable; collect the
           whole run up to the next encodable character */
        collstart = p;
        for (collend = p+1; collend < end; collend++) {
            if ((0 < *collend && *collend < 256) ||
                Py_UNICODE_ISSPACE(*collend) ||
                0 <= Py_UNICODE_TODECIMAL(*collend))
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                Py_UCS4 ucs = _Py_UNICODE_NEXT(p, collend);
                /* Py_UCS4 is unsigned; convert explicitly so the
                   argument matches the %d conversion */
                output += sprintf(output, "&#%d;", (int)ucs);
            }
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement: the replacement text is encoded with
               the same rules as the input, except that an unencodable
               replacement character is always an error */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE repch = *uni2;
                if (Py_UNICODE_ISSPACE(repch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(repch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < repch && repch < 256)
                        *output++ = (char)repch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            /* resume at the position chosen by the error handler */
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}
5348 
5349 /* --- Helpers ------------------------------------------------------------ */
5350 
5351 #include "stringlib/unicodedefs.h"
5352 #include "stringlib/fastsearch.h"
5353 
5354 #include "stringlib/count.h"
5355 #include "stringlib/find.h"
5356 #include "stringlib/partition.h"
5357 #include "stringlib/split.h"
5358 
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len when it is larger; a negative end has len added
   and is then clamped to 0.  A negative start has len added and is then
   clamped to 0; a start larger than len is left unchanged.

   start and end must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands
   to a single statement and can be used safely in an unbraced if/else.
   The #undef guards against an earlier definition of the same name,
   e.g. from one of the stringlib headers included above
   (NOTE(review): confirm whether stringlib/find.h defines it). */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5373 
/* Count occurrences of substr in the slice str[start:end].

   Both arguments are coerced with PyUnicode_FromObject(); start and end
   are normalised with ADJUST_INDICES against the length of str before
   the slice is handed to stringlib_count().

   Returns the value produced by stringlib_count(), or -1 when either
   coercion fails. */
Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
{
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;
    Py_ssize_t occurrences = -1;

    haystack = (PyUnicodeObject *)PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;

    needle = (PyUnicodeObject *)PyUnicode_FromObject(substr);
    if (needle != NULL) {
        ADJUST_INDICES(start, end, haystack->length);
        occurrences = stringlib_count(haystack->str + start, end - start,
                                      needle->str, needle->length,
                                      PY_SSIZE_T_MAX);
        Py_DECREF(needle);
    }

    Py_DECREF(haystack);
    return occurrences;
}
5403 
/* Search for sub in the slice str[start:end].

   Both arguments are coerced with PyUnicode_FromObject().  A positive
   direction searches with stringlib_find_slice(), any other value with
   stringlib_rfind_slice().

   Returns the value produced by the stringlib search, or -2 when either
   coercion fails. */
Py_ssize_t PyUnicode_Find(PyObject *str,
                          PyObject *sub,
                          Py_ssize_t start,
                          Py_ssize_t end,
                          int direction)
{
    PyObject *haystack;
    PyObject *needle;
    Py_ssize_t index;

    haystack = PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -2;

    needle = PyUnicode_FromObject(sub);
    if (needle == NULL) {
        Py_DECREF(haystack);
        return -2;
    }

    if (direction <= 0) {
        /* backward search */
        index = stringlib_rfind_slice(
            PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
            PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
            start, end);
    }
    else {
        /* forward search */
        index = stringlib_find_slice(
            PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
            PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
            start, end);
    }

    Py_DECREF(haystack);
    Py_DECREF(needle);
    return index;
}
5439 
5440 static
tailmatch(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)5441 int tailmatch(PyUnicodeObject *self,
5442               PyUnicodeObject *substring,
5443               Py_ssize_t start,
5444               Py_ssize_t end,
5445               int direction)
5446 {
5447     if (substring->length == 0)
5448         return 1;
5449 
5450     ADJUST_INDICES(start, end, self->length);
5451     end -= substring->length;
5452     if (end < start)
5453         return 0;
5454 
5455     if (direction > 0) {
5456         if (Py_UNICODE_MATCH(self, end, substring))
5457             return 1;
5458     } else {
5459         if (Py_UNICODE_MATCH(self, start, substring))
5460             return 1;
5461     }
5462 
5463     return 0;
5464 }
5465 
/* Coerce str and substr with PyUnicode_FromObject() and delegate to
   tailmatch().

   Returns the result of tailmatch(), or -1 when either coercion fails. */
Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
                               PyObject *substr,
                               Py_ssize_t start,
                               Py_ssize_t end,
                               int direction)
{
    PyObject *text;
    PyObject *affix;
    Py_ssize_t matched;

    text = PyUnicode_FromObject(str);
    if (text == NULL)
        return -1;

    affix = PyUnicode_FromObject(substr);
    if (affix == NULL) {
        Py_DECREF(text);
        return -1;
    }

    matched = tailmatch((PyUnicodeObject *)text,
                        (PyUnicodeObject *)affix,
                        start, end, direction);

    Py_DECREF(text);
    Py_DECREF(affix);
    return matched;
}
5490 
5491 /* Apply fixfct filter to the Unicode object self and return a
5492    reference to the modified object */
5493 
5494 static
fixup(PyUnicodeObject * self,int (* fixfct)(PyUnicodeObject * s))5495 PyObject *fixup(PyUnicodeObject *self,
5496                 int (*fixfct)(PyUnicodeObject *s))
5497 {
5498 
5499     PyUnicodeObject *u;
5500 
5501     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5502     if (u == NULL)
5503         return NULL;
5504 
5505     Py_UNICODE_COPY(u->str, self->str, self->length);
5506 
5507     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5508         /* fixfct should return TRUE if it modified the buffer. If
5509            FALSE, return a reference to the original buffer instead
5510            (to save space, not time) */
5511         Py_INCREF(self);
5512         Py_DECREF(u);
5513         return (PyObject*) self;
5514     }
5515     return (PyObject*) u;
5516 }
5517 
5518 static
fixupper(PyUnicodeObject * self)5519 int fixupper(PyUnicodeObject *self)
5520 {
5521     Py_ssize_t len = self->length;
5522     Py_UNICODE *s = self->str;
5523     int status = 0;
5524 
5525     while (len-- > 0) {
5526         register Py_UNICODE ch;
5527 
5528         ch = Py_UNICODE_TOUPPER(*s);
5529         if (ch != *s) {
5530             status = 1;
5531             *s = ch;
5532         }
5533         s++;
5534     }
5535 
5536     return status;
5537 }
5538 
5539 static
fixlower(PyUnicodeObject * self)5540 int fixlower(PyUnicodeObject *self)
5541 {
5542     Py_ssize_t len = self->length;
5543     Py_UNICODE *s = self->str;
5544     int status = 0;
5545 
5546     while (len-- > 0) {
5547         register Py_UNICODE ch;
5548 
5549         ch = Py_UNICODE_TOLOWER(*s);
5550         if (ch != *s) {
5551             status = 1;
5552             *s = ch;
5553         }
5554         s++;
5555     }
5556 
5557     return status;
5558 }
5559 
5560 static
fixswapcase(PyUnicodeObject * self)5561 int fixswapcase(PyUnicodeObject *self)
5562 {
5563     Py_ssize_t len = self->length;
5564     Py_UNICODE *s = self->str;
5565     int status = 0;
5566 
5567     while (len-- > 0) {
5568         if (Py_UNICODE_ISUPPER(*s)) {
5569             *s = Py_UNICODE_TOLOWER(*s);
5570             status = 1;
5571         } else if (Py_UNICODE_ISLOWER(*s)) {
5572             *s = Py_UNICODE_TOUPPER(*s);
5573             status = 1;
5574         }
5575         s++;
5576     }
5577 
5578     return status;
5579 }
5580 
5581 static
fixcapitalize(PyUnicodeObject * self)5582 int fixcapitalize(PyUnicodeObject *self)
5583 {
5584     Py_ssize_t len = self->length;
5585     Py_UNICODE *s = self->str;
5586     int status = 0;
5587 
5588     if (len == 0)
5589         return 0;
5590     if (!Py_UNICODE_ISUPPER(*s)) {
5591         *s = Py_UNICODE_TOUPPER(*s);
5592         status = 1;
5593     }
5594     s++;
5595     while (--len > 0) {
5596         if (!Py_UNICODE_ISLOWER(*s)) {
5597             *s = Py_UNICODE_TOLOWER(*s);
5598             status = 1;
5599         }
5600         s++;
5601     }
5602     return status;
5603 }
5604 
5605 static
fixtitle(PyUnicodeObject * self)5606 int fixtitle(PyUnicodeObject *self)
5607 {
5608     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5609     register Py_UNICODE *e;
5610     int previous_is_cased;
5611 
5612     /* Shortcut for single character strings */
5613     if (PyUnicode_GET_SIZE(self) == 1) {
5614         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5615         if (*p != ch) {
5616             *p = ch;
5617             return 1;
5618         }
5619         else
5620             return 0;
5621     }
5622 
5623     e = p + PyUnicode_GET_SIZE(self);
5624     previous_is_cased = 0;
5625     for (; p < e; p++) {
5626         register const Py_UNICODE ch = *p;
5627 
5628         if (previous_is_cased)
5629             *p = Py_UNICODE_TOLOWER(ch);
5630         else
5631             *p = Py_UNICODE_TOTITLE(ch);
5632 
5633         if (Py_UNICODE_ISLOWER(ch) ||
5634             Py_UNICODE_ISUPPER(ch) ||
5635             Py_UNICODE_ISTITLE(ch))
5636             previous_is_cased = 1;
5637         else
5638             previous_is_cased = 0;
5639     }
5640     return 1;
5641 }
5642 
/* Join the items of seq, inserting separator between consecutive items,
   and return the result as a Unicode object.

   seq is converted with PySequence_Fast().  A NULL separator means a
   single blank.  Every item must pass PyUnicode_Check() or
   PyString_Check(); otherwise TypeError is set.  Items and the separator
   are coerced with PyUnicode_FromObject().

   An empty sequence yields an empty Unicode object; a one-item sequence
   holding an exact Unicode object yields a new reference to that item.

   Returns NULL on failure; OverflowError is set when the result length
   would exceed PY_SSIZE_T_MAX. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *internal_separator = NULL;
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_ssize_t res_alloc = 100;  /* # Py_UNICODE units allocated for string in res */
    Py_ssize_t res_used;         /* # Py_UNICODE units used */
    Py_UNICODE *res_p;       /* pointer to next free unit in res's string area */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
    PyObject *item;
    Py_ssize_t i;

    fseq = PySequence_Fast(seq, "can only join an iterable");
    if (fseq == NULL) {
        return NULL;
    }

    /* Grrrr.  A codec may be invoked to convert str objects to
     * Unicode, and so it's possible to call back into Python code
     * during PyUnicode_FromObject(), and so it's possible for a sick
     * codec to change the size of fseq (if seq is a list).  Therefore
     * we have to keep refetching the size -- can't assume seqlen
     * is invariant.
     */
    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
        goto Done;
    }
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(fseq, 0);
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }

    /* At least two items to join, or one that isn't exact Unicode. */
    if (seqlen > 1) {
        /* Set up sep and seplen -- they're needed. */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            internal_separator = PyUnicode_FromObject(separator);
            if (internal_separator == NULL)
                goto onError;
            sep = PyUnicode_AS_UNICODE(internal_separator);
            seplen = PyUnicode_GET_SIZE(internal_separator);
            /* In case PyUnicode_FromObject() mutated seq. */
            seqlen = PySequence_Fast_GET_SIZE(fseq);
        }
    }

    /* Get space. */
    res = _PyUnicode_New(res_alloc);
    if (res == NULL)
        goto onError;
    res_p = PyUnicode_AS_UNICODE(res);
    res_used = 0;

    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        Py_ssize_t new_res_used;

        /* item is a borrowed reference at this point */
        item = PySequence_Fast_GET_ITEM(fseq, i);
        /* Convert item to Unicode. */
        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected string or Unicode,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        item = PyUnicode_FromObject(item);
        if (item == NULL)
            goto onError;
        /* We own a reference to item from here on. */

        /* In case PyUnicode_FromObject() mutated seq. */
        seqlen = PySequence_Fast_GET_SIZE(fseq);

        /* Make sure we have enough space for the separator and the item.
           The sums are checked against PY_SSIZE_T_MAX before they are
           computed so that they cannot overflow. */
        itemlen = PyUnicode_GET_SIZE(item);
        if (res_used > PY_SSIZE_T_MAX - itemlen)
            goto Overflow;
        new_res_used = res_used + itemlen;
        if (i < seqlen - 1) {
            if (new_res_used > PY_SSIZE_T_MAX - seplen)
                goto Overflow;
            new_res_used += seplen;
        }
        if (new_res_used > res_alloc) {
            /* double allocated size until it's big enough */
            do {
                if (res_alloc > PY_SSIZE_T_MAX / 2)
                    goto Overflow;
                res_alloc += res_alloc;
            } while (new_res_used > res_alloc);
            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
                Py_DECREF(item);
                goto onError;
            }
            /* the resize may have changed res, so recompute the write
               pointer from the new buffer */
            res_p = PyUnicode_AS_UNICODE(res) + res_used;
        }

        /* Copy item, and maybe the separator. */
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
        if (i < seqlen - 1) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_DECREF(item);
        res_used = new_res_used;
    }

    /* Shrink res to match the used area; this probably can't fail,
     * but it's cheap to check.
     */
    if (_PyUnicode_Resize(&res, res_used) < 0)
        goto onError;

  Done:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    return (PyObject *)res;

  Overflow:
    /* only reached while an owned reference to item is held */
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    Py_DECREF(item);
    /* fall through */

  onError:
    Py_XDECREF(internal_separator);
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}
5791 
5792 static
pad(PyUnicodeObject * self,Py_ssize_t left,Py_ssize_t right,Py_UNICODE fill)5793 PyUnicodeObject *pad(PyUnicodeObject *self,
5794                      Py_ssize_t left,
5795                      Py_ssize_t right,
5796                      Py_UNICODE fill)
5797 {
5798     PyUnicodeObject *u;
5799 
5800     if (left < 0)
5801         left = 0;
5802     if (right < 0)
5803         right = 0;
5804 
5805     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5806         Py_INCREF(self);
5807         return self;
5808     }
5809 
5810     if (left > PY_SSIZE_T_MAX - self->length ||
5811         right > PY_SSIZE_T_MAX - (left + self->length)) {
5812         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5813         return NULL;
5814     }
5815     u = _PyUnicode_New(left + self->length + right);
5816     if (u) {
5817         if (left)
5818             Py_UNICODE_FILL(u->str, fill, left);
5819         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5820         if (right)
5821             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5822     }
5823 
5824     return u;
5825 }
5826 
/* Coerce string with PyUnicode_FromObject() and split it with
   stringlib_splitlines(); keepends is passed through unchanged.

   Returns the result of stringlib_splitlines(), or NULL when the
   coercion fails. */
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *text;
    PyObject *lines;

    text = PyUnicode_FromObject(string);
    if (text == NULL)
        return NULL;

    lines = stringlib_splitlines(text,
                                 PyUnicode_AS_UNICODE(text),
                                 PyUnicode_GET_SIZE(text),
                                 keepends);
    Py_DECREF(text);
    return lines;
}
5842 
5843 static
split(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5844 PyObject *split(PyUnicodeObject *self,
5845                 PyUnicodeObject *substring,
5846                 Py_ssize_t maxcount)
5847 {
5848     if (maxcount < 0)
5849         maxcount = PY_SSIZE_T_MAX;
5850 
5851     if (substring == NULL)
5852         return stringlib_split_whitespace(
5853             (PyObject*) self,  self->str, self->length, maxcount
5854             );
5855 
5856     return stringlib_split(
5857         (PyObject*) self,  self->str, self->length,
5858         substring->str, substring->length,
5859         maxcount
5860         );
5861 }
5862 
5863 static
rsplit(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5864 PyObject *rsplit(PyUnicodeObject *self,
5865                  PyUnicodeObject *substring,
5866                  Py_ssize_t maxcount)
5867 {
5868     if (maxcount < 0)
5869         maxcount = PY_SSIZE_T_MAX;
5870 
5871     if (substring == NULL)
5872         return stringlib_rsplit_whitespace(
5873             (PyObject*) self,  self->str, self->length, maxcount
5874             );
5875 
5876     return stringlib_rsplit(
5877         (PyObject*) self,  self->str, self->length,
5878         substring->str, substring->length,
5879         maxcount
5880         );
5881 }
5882 
/* Return self with occurrences of str1 replaced by str2.

   maxcount limits the number of replacements; a negative value means
   "no limit" (PY_SSIZE_T_MAX).  With a non-negative maxcount, nothing is
   replaced when maxcount is 0 or self is empty.

   Two strategies are used:
     - str1 and str2 have the same length: self is copied and the matches
       are overwritten in the copy;
     - the lengths differ: the matches are counted first, a result of the
       exact final size is allocated and then filled piece by piece.  An
       empty str1 interleaves str2 with the characters of self.

   When there is nothing to replace, a new reference to self is returned
   if self is an exact Unicode object, otherwise a copy made with
   PyUnicode_FromUnicode().  Returns NULL on allocation failure, or with
   OverflowError set when the result would be too long. */
static
PyObject *replace(PyUnicodeObject *self,
                  PyUnicodeObject *str1,
                  PyUnicodeObject *str2,
                  Py_ssize_t maxcount)
{
    PyUnicodeObject *u;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || self->length == 0)
        goto nothing;

    if (str1->length == str2->length) {
        Py_ssize_t i;
        /* same length */
        if (str1->length == 0)
            goto nothing;
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            /* avoid the copy altogether if the character does not occur */
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    /* stop once maxcount replacements have been made */
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
        } else {
            /* locate the first match before copying anything */
            i = stringlib_find(
                self->str, self->length, str1->str, str1->length, 0
                );
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);

            /* change everything in-place, starting with this one */
            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
            i += str1->length;

            /* the first replacement above already counts against
               maxcount, hence the pre-decrement */
            while ( --maxcount > 0) {
                /* NOTE(review): the result is used as an index into the
                   whole string, so passing i as the last argument
                   presumably makes stringlib_find() return positions
                   relative to self->str -- confirm */
                i = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (i == -1)
                    break;
                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                i += str1->length;
            }
        }
    } else {

        Py_ssize_t n, i, j;
        Py_ssize_t new_size, delta;
        Py_UNICODE *p;

        /* replace strings */
        /* n = number of replacements that will be made (at most
           maxcount) */
        n = stringlib_count(self->str, self->length, str1->str, str1->length,
                            maxcount);
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
        delta = (str2->length - str1->length);
        if (delta == 0) {
            new_size = self->length;
        } else {
            assert(n > 0);
            /* reject the request if self->length + delta * n would
               exceed PY_SSIZE_T_MAX */
            if (delta > (PY_SSIZE_T_MAX - self->length) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + delta * n;
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        /* i is the read position in self, p the write position in u */
        i = 0;
        p = u->str;
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
                }
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave: str1 is empty, so str2 is inserted before each
               character of self until n insertions have been made */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
            }
            /* copy whatever is left of self after the last insertion */
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
        }
    }
    return (PyObject *) u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
}
6017 
6018 /* --- Unicode Object Methods --------------------------------------------- */
6019 
6020 PyDoc_STRVAR(title__doc__,
6021              "S.title() -> unicode\n\
6022 \n\
6023 Return a titlecased version of S, i.e. words start with title case\n\
6024 characters, all remaining cased characters have lower case.");
6025 
6026 static PyObject*
unicode_title(PyUnicodeObject * self)6027 unicode_title(PyUnicodeObject *self)
6028 {
6029     return fixup(self, fixtitle);
6030 }
6031 
6032 PyDoc_STRVAR(capitalize__doc__,
6033              "S.capitalize() -> unicode\n\
6034 \n\
6035 Return a capitalized version of S, i.e. make the first character\n\
6036 have upper case and the rest lower case.");
6037 
6038 static PyObject*
unicode_capitalize(PyUnicodeObject * self)6039 unicode_capitalize(PyUnicodeObject *self)
6040 {
6041     return fixup(self, fixcapitalize);
6042 }
6043 
6044 #if 0
6045 PyDoc_STRVAR(capwords__doc__,
6046              "S.capwords() -> unicode\n\
6047 \n\
6048 Apply .capitalize() to all words in S and return the result with\n\
6049 normalized whitespace (all whitespace strings are replaced by ' ').");
6050 
6051 static PyObject*
6052 unicode_capwords(PyUnicodeObject *self)
6053 {
6054     PyObject *list;
6055     PyObject *item;
6056     Py_ssize_t i;
6057 
6058     /* Split into words */
6059     list = split(self, NULL, -1);
6060     if (!list)
6061         return NULL;
6062 
6063     /* Capitalize each word */
6064     for (i = 0; i < PyList_GET_SIZE(list); i++) {
6065         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6066                      fixcapitalize);
6067         if (item == NULL)
6068             goto onError;
6069         Py_DECREF(PyList_GET_ITEM(list, i));
6070         PyList_SET_ITEM(list, i, item);
6071     }
6072 
6073     /* Join the words to form a new string */
6074     item = PyUnicode_Join(NULL, list);
6075 
6076   onError:
6077     Py_DECREF(list);
6078     return (PyObject *)item;
6079 }
6080 #endif
6081 
6082 /* Argument converter.  Coerces to a single unicode character */
6083 
6084 static int
convert_uc(PyObject * obj,void * addr)6085 convert_uc(PyObject *obj, void *addr)
6086 {
6087     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6088     PyObject *uniobj;
6089     Py_UNICODE *unistr;
6090 
6091     uniobj = PyUnicode_FromObject(obj);
6092     if (uniobj == NULL) {
6093         PyErr_SetString(PyExc_TypeError,
6094                         "The fill character cannot be converted to Unicode");
6095         return 0;
6096     }
6097     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6098         PyErr_SetString(PyExc_TypeError,
6099                         "The fill character must be exactly one character long");
6100         Py_DECREF(uniobj);
6101         return 0;
6102     }
6103     unistr = PyUnicode_AS_UNICODE(uniobj);
6104     *fillcharloc = unistr[0];
6105     Py_DECREF(uniobj);
6106     return 1;
6107 }
6108 
6109 PyDoc_STRVAR(center__doc__,
6110              "S.center(width[, fillchar]) -> unicode\n\
6111 \n\
6112 Return S centered in a Unicode string of length width. Padding is\n\
6113 done using the specified fill character (default is a space)");
6114 
6115 static PyObject *
unicode_center(PyUnicodeObject * self,PyObject * args)6116 unicode_center(PyUnicodeObject *self, PyObject *args)
6117 {
6118     Py_ssize_t marg, left;
6119     Py_ssize_t width;
6120     Py_UNICODE fillchar = ' ';
6121 
6122     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6123         return NULL;
6124 
6125     if (self->length >= width && PyUnicode_CheckExact(self)) {
6126         Py_INCREF(self);
6127         return (PyObject*) self;
6128     }
6129 
6130     marg = width - self->length;
6131     left = marg / 2 + (marg & width & 1);
6132 
6133     return (PyObject*) pad(self, left, marg - left, fillchar);
6134 }
6135 
6136 #if 0
6137 
6138 /* This code should go into some future Unicode collation support
6139    module. The basic comparison should compare ordinals on a naive
6140    basis (this is what Java does and thus Jython too). */
6141 
6142 /* speedy UTF-16 code point order comparison */
6143 /* gleaned from: */
6144 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6145 
/* Adjustment added to a code unit before comparison, indexed by the
   unit's value shifted right by 11 bits.  Index 27 (units 0xD800-0xDFFF)
   is moved up by 0x2000 and indices 28-31 (units 0xE000-0xFFFF) are
   moved down by 0x800; every other entry leaves the unit unchanged.
   NOTE(review): the table has only 32 entries, so this presumably
   assumes a 16-bit Py_UNICODE -- confirm before enabling this code. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled (#if 0) variant of unicode_compare().

   Walks both strings in parallel, remapping each code unit through
   utf16Fixup before comparing.  Returns -1, 0 or 1; when one string is
   a prefix of the other, the shorter one compares as smaller. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 == 0xD000: units at or below it have a zero
           table entry, so the lookup is skipped for them */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix exhausted: decide on the remaining lengths */
    return (len1 < len2) ? -1 : (len1 != len2);
}
6185 
6186 #else
6187 
6188 static int
unicode_compare(PyUnicodeObject * str1,PyUnicodeObject * str2)6189 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6190 {
6191     register Py_ssize_t len1, len2;
6192 
6193     Py_UNICODE *s1 = str1->str;
6194     Py_UNICODE *s2 = str2->str;
6195 
6196     len1 = str1->length;
6197     len2 = str2->length;
6198 
6199     while (len1 > 0 && len2 > 0) {
6200         Py_UNICODE c1, c2;
6201 
6202         c1 = *s1++;
6203         c2 = *s2++;
6204 
6205         if (c1 != c2)
6206             return (c1 < c2) ? -1 : 1;
6207 
6208         len1--; len2--;
6209     }
6210 
6211     return (len1 < len2) ? -1 : (len1 != len2);
6212 }
6213 
6214 #endif
6215 
/* Three-way comparison of two objects after coercing both to Unicode
   with PyUnicode_FromObject().

   Returns the unicode_compare() result (-1, 0 or 1), or 0 directly
   when both coercions yield the same object.  Returns -1 when either
   coercion fails; callers must check PyErr_Occurred() to tell this
   apart from a genuine "less than". */
int PyUnicode_Compare(PyObject *left,
                      PyObject *right)
{
    PyUnicodeObject *lhs;
    PyUnicodeObject *rhs;
    int outcome;

    lhs = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (lhs == NULL)
        return -1;

    rhs = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (rhs == NULL) {
        Py_DECREF(lhs);
        return -1;
    }

    /* identical objects (e.g. empty or interned) need no scan */
    if (rhs == lhs)
        outcome = 0;
    else
        outcome = unicode_compare(lhs, rhs);

    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return outcome;
}
6248 
/* Rich comparison built on PyUnicode_Compare().

   Success: the three-way result is turned into a bool according to op.

   Failure of PyUnicode_Compare():
     - TypeError: cleared, and Py_NotImplemented is returned so that
       the other operand gets a chance to handle the comparison.
     - UnicodeDecodeError with op Py_EQ / Py_NE: cleared and replaced
       by a UnicodeWarning; the operands are then reported as unequal.
     - anything else: NULL, with the exception left in place. */
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    int outcome;

    outcome = PyUnicode_Compare(left, right);

    if (!(outcome == -1 && PyErr_Occurred())) {
        /* map the three-way result to a truth value; an op outside
           the six known ones leaves the raw result untouched */
        if (op == Py_EQ)
            outcome = (outcome == 0);
        else if (op == Py_NE)
            outcome = (outcome != 0);
        else if (op == Py_LE)
            outcome = (outcome <= 0);
        else if (op == Py_GE)
            outcome = (outcome >= 0);
        else if (op == Py_LT)
            outcome = (outcome == -1);
        else if (op == Py_GT)
            outcome = (outcome == 1);
        return PyBool_FromLong(outcome);
    }

    /* A TypeError means one operand could not be converted to
       Unicode, i.e. this type cannot handle the request.  The other
       object may know how to, hence Py_NotImplemented. */
    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
        PyErr_Clear();
        Py_INCREF(Py_NotImplemented);
        return Py_NotImplemented;
    }

    /* Only (in)equality tests get the lenient treatment below. */
    if (op != Py_EQ && op != Py_NE)
        return NULL;
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
        return NULL;

    /* Silence the decode error and emit a UnicodeWarning instead. */
    PyErr_Clear();
    if (PyErr_Warn(PyExc_UnicodeWarning,
                   (op == Py_EQ) ?
                   "Unicode equal comparison "
                   "failed to convert both arguments to Unicode - "
                   "interpreting them as being unequal" :
                   "Unicode unequal comparison "
                   "failed to convert both arguments to Unicode - "
                   "interpreting them as being unequal"
            ) < 0)
        return NULL;

    /* unequal operands: == is false, != is true */
    return PyBool_FromLong(op == Py_NE);
}
6324 
/* Containment test: coerce element, then container, to Unicode and
   return the result of stringlib_contains_obj() on the pair.
   Returns -1 when either coercion fails. */
int PyUnicode_Contains(PyObject *container,
                       PyObject *element)
{
    PyObject *needle;
    PyObject *haystack;
    int found = -1;

    needle = PyUnicode_FromObject(element);
    if (needle == NULL)
        return -1;

    haystack = PyUnicode_FromObject(container);
    if (haystack != NULL) {
        found = stringlib_contains_obj(haystack, needle);
        Py_DECREF(haystack);
    }

    Py_DECREF(needle);
    return found;
}
6350 
6351 /* Concat to string or Unicode object giving a new Unicode object. */
6352 
/* Concatenate two objects after coercing both to Unicode.

   When one side coerces to the shared empty string, the other coerced
   object is returned directly.  Otherwise a fresh object holding both
   buffers back to back is returned.  Returns NULL on coercion failure,
   on allocation failure, or with OverflowError when the combined
   length would exceed PY_SSIZE_T_MAX. */
PyObject *PyUnicode_Concat(PyObject *left,
                           PyObject *right)
{
    PyUnicodeObject *head = NULL;
    PyUnicodeObject *tail = NULL;
    PyUnicodeObject *joined = NULL;

    head = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (head == NULL)
        goto done;
    tail = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (tail == NULL)
        goto done;

    /* nothing to append / nothing to prepend: reuse the other side */
    if (tail == unicode_empty) {
        Py_DECREF(tail);
        return (PyObject *)head;
    }
    if (head == unicode_empty) {
        Py_DECREF(head);
        return (PyObject *)tail;
    }

    if (head->length > PY_SSIZE_T_MAX - tail->length) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        goto done;
    }

    joined = _PyUnicode_New(head->length + tail->length);
    if (joined != NULL) {
        Py_UNICODE_COPY(joined->str, head->str, head->length);
        Py_UNICODE_COPY(joined->str + head->length, tail->str, tail->length);
    }

  done:
    /* joined is still NULL on every failure path */
    Py_XDECREF(head);
    Py_XDECREF(tail);
    return (PyObject *)joined;
}
6398 
6399 PyDoc_STRVAR(count__doc__,
6400              "S.count(sub[, start[, end]]) -> int\n\
6401 \n\
6402 Return the number of non-overlapping occurrences of substring sub in\n\
6403 Unicode string S[start:end].  Optional arguments start and end are\n\
6404 interpreted as in slice notation.");
6405 
6406 static PyObject *
unicode_count(PyUnicodeObject * self,PyObject * args)6407 unicode_count(PyUnicodeObject *self, PyObject *args)
6408 {
6409     PyUnicodeObject *substring;
6410     Py_ssize_t start = 0;
6411     Py_ssize_t end = PY_SSIZE_T_MAX;
6412     PyObject *result;
6413 
6414     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6415                                             &start, &end))
6416         return NULL;
6417 
6418     ADJUST_INDICES(start, end, self->length);
6419     result = PyInt_FromSsize_t(
6420         stringlib_count(self->str + start, end - start,
6421                         substring->str, substring->length,
6422                         PY_SSIZE_T_MAX)
6423         );
6424 
6425     Py_DECREF(substring);
6426 
6427     return result;
6428 }
6429 
6430 PyDoc_STRVAR(encode__doc__,
6431              "S.encode([encoding[,errors]]) -> string or unicode\n\
6432 \n\
6433 Encodes S using the codec registered for encoding. encoding defaults\n\
6434 to the default encoding. errors may be given to set a different error\n\
6435 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6436 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6437 'xmlcharrefreplace' as well as any other name registered with\n\
6438 codecs.register_error that can handle UnicodeEncodeErrors.");
6439 
6440 static PyObject *
unicode_encode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6441 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6442 {
6443     static char *kwlist[] = {"encoding", "errors", 0};
6444     char *encoding = NULL;
6445     char *errors = NULL;
6446     PyObject *v;
6447 
6448     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6449                                      kwlist, &encoding, &errors))
6450         return NULL;
6451     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6452     if (v == NULL)
6453         goto onError;
6454     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6455         PyErr_Format(PyExc_TypeError,
6456                      "encoder did not return a string/unicode object "
6457                      "(type=%.400s)",
6458                      Py_TYPE(v)->tp_name);
6459         Py_DECREF(v);
6460         return NULL;
6461     }
6462     return v;
6463 
6464   onError:
6465     return NULL;
6466 }
6467 
6468 PyDoc_STRVAR(decode__doc__,
6469              "S.decode([encoding[,errors]]) -> string or unicode\n\
6470 \n\
6471 Decodes S using the codec registered for encoding. encoding defaults\n\
6472 to the default encoding. errors may be given to set a different error\n\
6473 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6474 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6475 as well as any other name registered with codecs.register_error that is\n\
6476 able to handle UnicodeDecodeErrors.");
6477 
6478 static PyObject *
unicode_decode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6479 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6480 {
6481     static char *kwlist[] = {"encoding", "errors", 0};
6482     char *encoding = NULL;
6483     char *errors = NULL;
6484     PyObject *v;
6485 
6486     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6487                                      kwlist, &encoding, &errors))
6488         return NULL;
6489     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6490     if (v == NULL)
6491         goto onError;
6492     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6493         PyErr_Format(PyExc_TypeError,
6494                      "decoder did not return a string/unicode object "
6495                      "(type=%.400s)",
6496                      Py_TYPE(v)->tp_name);
6497         Py_DECREF(v);
6498         return NULL;
6499     }
6500     return v;
6501 
6502   onError:
6503     return NULL;
6504 }
6505 
6506 PyDoc_STRVAR(expandtabs__doc__,
6507              "S.expandtabs([tabsize]) -> unicode\n\
6508 \n\
6509 Return a copy of S where all tab characters are expanded using spaces.\n\
6510 If tabsize is not given, a tab size of 8 characters is assumed.");
6511 
6512 static PyObject*
unicode_expandtabs(PyUnicodeObject * self,PyObject * args)6513 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6514 {
6515     Py_UNICODE *e;
6516     Py_UNICODE *p;
6517     Py_UNICODE *q;
6518     Py_UNICODE *qe;
6519     Py_ssize_t i, j, incr;
6520     PyUnicodeObject *u;
6521     int tabsize = 8;
6522 
6523     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6524         return NULL;
6525 
6526     /* First pass: determine size of output string */
6527     i = 0; /* chars up to and including most recent \n or \r */
6528     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6529     e = self->str + self->length; /* end of input */
6530     for (p = self->str; p < e; p++)
6531         if (*p == '\t') {
6532             if (tabsize > 0) {
6533                 incr = tabsize - (j % tabsize); /* cannot overflow */
6534                 if (j > PY_SSIZE_T_MAX - incr)
6535                     goto overflow1;
6536                 j += incr;
6537             }
6538         }
6539         else {
6540             if (j > PY_SSIZE_T_MAX - 1)
6541                 goto overflow1;
6542             j++;
6543             if (*p == '\n' || *p == '\r') {
6544                 if (i > PY_SSIZE_T_MAX - j)
6545                     goto overflow1;
6546                 i += j;
6547                 j = 0;
6548             }
6549         }
6550 
6551     if (i > PY_SSIZE_T_MAX - j)
6552         goto overflow1;
6553 
6554     /* Second pass: create output string and fill it */
6555     u = _PyUnicode_New(i + j);
6556     if (!u)
6557         return NULL;
6558 
6559     j = 0; /* same as in first pass */
6560     q = u->str; /* next output char */
6561     qe = u->str + u->length; /* end of output */
6562 
6563     for (p = self->str; p < e; p++)
6564         if (*p == '\t') {
6565             if (tabsize > 0) {
6566                 i = tabsize - (j % tabsize);
6567                 j += i;
6568                 while (i--) {
6569                     if (q >= qe)
6570                         goto overflow2;
6571                     *q++ = ' ';
6572                 }
6573             }
6574         }
6575         else {
6576             if (q >= qe)
6577                 goto overflow2;
6578             *q++ = *p;
6579             j++;
6580             if (*p == '\n' || *p == '\r')
6581                 j = 0;
6582         }
6583 
6584     return (PyObject*) u;
6585 
6586   overflow2:
6587     Py_DECREF(u);
6588   overflow1:
6589     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6590     return NULL;
6591 }
6592 
6593 PyDoc_STRVAR(find__doc__,
6594              "S.find(sub [,start [,end]]) -> int\n\
6595 \n\
6596 Return the lowest index in S where substring sub is found,\n\
6597 such that sub is contained within S[start:end].  Optional\n\
6598 arguments start and end are interpreted as in slice notation.\n\
6599 \n\
6600 Return -1 on failure.");
6601 
6602 static PyObject *
unicode_find(PyUnicodeObject * self,PyObject * args)6603 unicode_find(PyUnicodeObject *self, PyObject *args)
6604 {
6605     PyUnicodeObject *substring;
6606     Py_ssize_t start;
6607     Py_ssize_t end;
6608     Py_ssize_t result;
6609 
6610     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6611                                             &start, &end))
6612         return NULL;
6613 
6614     result = stringlib_find_slice(
6615         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6616         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6617         start, end
6618         );
6619 
6620     Py_DECREF(substring);
6621 
6622     return PyInt_FromSsize_t(result);
6623 }
6624 
6625 static PyObject *
unicode_getitem(PyUnicodeObject * self,Py_ssize_t index)6626 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6627 {
6628     if (index < 0 || index >= self->length) {
6629         PyErr_SetString(PyExc_IndexError, "string index out of range");
6630         return NULL;
6631     }
6632 
6633     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6634 }
6635 
6636 static long
unicode_hash(PyUnicodeObject * self)6637 unicode_hash(PyUnicodeObject *self)
6638 {
6639     /* Since Unicode objects compare equal to their ASCII string
6640        counterparts, they should use the individual character values
6641        as basis for their hash value.  This is needed to assure that
6642        strings and Unicode objects behave in the same way as
6643        dictionary keys. */
6644 
6645     register Py_ssize_t len;
6646     register Py_UNICODE *p;
6647     register long x;
6648 
6649 #ifdef Py_DEBUG
6650     assert(_Py_HashSecret_Initialized);
6651 #endif
6652     if (self->hash != -1)
6653         return self->hash;
6654     len = PyUnicode_GET_SIZE(self);
6655     /*
6656       We make the hash of the empty string be 0, rather than using
6657       (prefix ^ suffix), since this slightly obfuscates the hash secret
6658     */
6659     if (len == 0) {
6660         self->hash = 0;
6661         return 0;
6662     }
6663     p = PyUnicode_AS_UNICODE(self);
6664     x = _Py_HashSecret.prefix;
6665     x ^= *p << 7;
6666     while (--len >= 0)
6667         x = (1000003*x) ^ *p++;
6668     x ^= PyUnicode_GET_SIZE(self);
6669     x ^= _Py_HashSecret.suffix;
6670     if (x == -1)
6671         x = -2;
6672     self->hash = x;
6673     return x;
6674 }
6675 
6676 PyDoc_STRVAR(index__doc__,
6677              "S.index(sub [,start [,end]]) -> int\n\
6678 \n\
6679 Like S.find() but raise ValueError when the substring is not found.");
6680 
6681 static PyObject *
unicode_index(PyUnicodeObject * self,PyObject * args)6682 unicode_index(PyUnicodeObject *self, PyObject *args)
6683 {
6684     Py_ssize_t result;
6685     PyUnicodeObject *substring;
6686     Py_ssize_t start;
6687     Py_ssize_t end;
6688 
6689     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6690                                             &start, &end))
6691         return NULL;
6692 
6693     result = stringlib_find_slice(
6694         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6695         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6696         start, end
6697         );
6698 
6699     Py_DECREF(substring);
6700 
6701     if (result < 0) {
6702         PyErr_SetString(PyExc_ValueError, "substring not found");
6703         return NULL;
6704     }
6705 
6706     return PyInt_FromSsize_t(result);
6707 }
6708 
6709 PyDoc_STRVAR(islower__doc__,
6710              "S.islower() -> bool\n\
6711 \n\
6712 Return True if all cased characters in S are lowercase and there is\n\
6713 at least one cased character in S, False otherwise.");
6714 
6715 static PyObject*
unicode_islower(PyUnicodeObject * self)6716 unicode_islower(PyUnicodeObject *self)
6717 {
6718     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719     register const Py_UNICODE *e;
6720     int cased;
6721 
6722     /* Shortcut for single character strings */
6723     if (PyUnicode_GET_SIZE(self) == 1)
6724         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6725 
6726     /* Special case for empty strings */
6727     if (PyUnicode_GET_SIZE(self) == 0)
6728         return PyBool_FromLong(0);
6729 
6730     e = p + PyUnicode_GET_SIZE(self);
6731     cased = 0;
6732     for (; p < e; p++) {
6733         register const Py_UNICODE ch = *p;
6734 
6735         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6736             return PyBool_FromLong(0);
6737         else if (!cased && Py_UNICODE_ISLOWER(ch))
6738             cased = 1;
6739     }
6740     return PyBool_FromLong(cased);
6741 }
6742 
6743 PyDoc_STRVAR(isupper__doc__,
6744              "S.isupper() -> bool\n\
6745 \n\
6746 Return True if all cased characters in S are uppercase and there is\n\
6747 at least one cased character in S, False otherwise.");
6748 
6749 static PyObject*
unicode_isupper(PyUnicodeObject * self)6750 unicode_isupper(PyUnicodeObject *self)
6751 {
6752     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6753     register const Py_UNICODE *e;
6754     int cased;
6755 
6756     /* Shortcut for single character strings */
6757     if (PyUnicode_GET_SIZE(self) == 1)
6758         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6759 
6760     /* Special case for empty strings */
6761     if (PyUnicode_GET_SIZE(self) == 0)
6762         return PyBool_FromLong(0);
6763 
6764     e = p + PyUnicode_GET_SIZE(self);
6765     cased = 0;
6766     for (; p < e; p++) {
6767         register const Py_UNICODE ch = *p;
6768 
6769         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6770             return PyBool_FromLong(0);
6771         else if (!cased && Py_UNICODE_ISUPPER(ch))
6772             cased = 1;
6773     }
6774     return PyBool_FromLong(cased);
6775 }
6776 
6777 PyDoc_STRVAR(istitle__doc__,
6778              "S.istitle() -> bool\n\
6779 \n\
6780 Return True if S is a titlecased string and there is at least one\n\
6781 character in S, i.e. upper- and titlecase characters may only\n\
6782 follow uncased characters and lowercase characters only cased ones.\n\
6783 Return False otherwise.");
6784 
6785 static PyObject*
unicode_istitle(PyUnicodeObject * self)6786 unicode_istitle(PyUnicodeObject *self)
6787 {
6788     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6789     register const Py_UNICODE *e;
6790     int cased, previous_is_cased;
6791 
6792     /* Shortcut for single character strings */
6793     if (PyUnicode_GET_SIZE(self) == 1)
6794         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6795                                (Py_UNICODE_ISUPPER(*p) != 0));
6796 
6797     /* Special case for empty strings */
6798     if (PyUnicode_GET_SIZE(self) == 0)
6799         return PyBool_FromLong(0);
6800 
6801     e = p + PyUnicode_GET_SIZE(self);
6802     cased = 0;
6803     previous_is_cased = 0;
6804     for (; p < e; p++) {
6805         register const Py_UNICODE ch = *p;
6806 
6807         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6808             if (previous_is_cased)
6809                 return PyBool_FromLong(0);
6810             previous_is_cased = 1;
6811             cased = 1;
6812         }
6813         else if (Py_UNICODE_ISLOWER(ch)) {
6814             if (!previous_is_cased)
6815                 return PyBool_FromLong(0);
6816             previous_is_cased = 1;
6817             cased = 1;
6818         }
6819         else
6820             previous_is_cased = 0;
6821     }
6822     return PyBool_FromLong(cased);
6823 }
6824 
6825 PyDoc_STRVAR(isspace__doc__,
6826              "S.isspace() -> bool\n\
6827 \n\
6828 Return True if all characters in S are whitespace\n\
6829 and there is at least one character in S, False otherwise.");
6830 
6831 static PyObject*
unicode_isspace(PyUnicodeObject * self)6832 unicode_isspace(PyUnicodeObject *self)
6833 {
6834     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835     register const Py_UNICODE *e;
6836 
6837     /* Shortcut for single character strings */
6838     if (PyUnicode_GET_SIZE(self) == 1 &&
6839         Py_UNICODE_ISSPACE(*p))
6840         return PyBool_FromLong(1);
6841 
6842     /* Special case for empty strings */
6843     if (PyUnicode_GET_SIZE(self) == 0)
6844         return PyBool_FromLong(0);
6845 
6846     e = p + PyUnicode_GET_SIZE(self);
6847     for (; p < e; p++) {
6848         if (!Py_UNICODE_ISSPACE(*p))
6849             return PyBool_FromLong(0);
6850     }
6851     return PyBool_FromLong(1);
6852 }
6853 
6854 PyDoc_STRVAR(isalpha__doc__,
6855              "S.isalpha() -> bool\n\
6856 \n\
6857 Return True if all characters in S are alphabetic\n\
6858 and there is at least one character in S, False otherwise.");
6859 
6860 static PyObject*
unicode_isalpha(PyUnicodeObject * self)6861 unicode_isalpha(PyUnicodeObject *self)
6862 {
6863     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864     register const Py_UNICODE *e;
6865 
6866     /* Shortcut for single character strings */
6867     if (PyUnicode_GET_SIZE(self) == 1 &&
6868         Py_UNICODE_ISALPHA(*p))
6869         return PyBool_FromLong(1);
6870 
6871     /* Special case for empty strings */
6872     if (PyUnicode_GET_SIZE(self) == 0)
6873         return PyBool_FromLong(0);
6874 
6875     e = p + PyUnicode_GET_SIZE(self);
6876     for (; p < e; p++) {
6877         if (!Py_UNICODE_ISALPHA(*p))
6878             return PyBool_FromLong(0);
6879     }
6880     return PyBool_FromLong(1);
6881 }
6882 
6883 PyDoc_STRVAR(isalnum__doc__,
6884              "S.isalnum() -> bool\n\
6885 \n\
6886 Return True if all characters in S are alphanumeric\n\
6887 and there is at least one character in S, False otherwise.");
6888 
6889 static PyObject*
unicode_isalnum(PyUnicodeObject * self)6890 unicode_isalnum(PyUnicodeObject *self)
6891 {
6892     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6893     register const Py_UNICODE *e;
6894 
6895     /* Shortcut for single character strings */
6896     if (PyUnicode_GET_SIZE(self) == 1 &&
6897         Py_UNICODE_ISALNUM(*p))
6898         return PyBool_FromLong(1);
6899 
6900     /* Special case for empty strings */
6901     if (PyUnicode_GET_SIZE(self) == 0)
6902         return PyBool_FromLong(0);
6903 
6904     e = p + PyUnicode_GET_SIZE(self);
6905     for (; p < e; p++) {
6906         if (!Py_UNICODE_ISALNUM(*p))
6907             return PyBool_FromLong(0);
6908     }
6909     return PyBool_FromLong(1);
6910 }
6911 
6912 PyDoc_STRVAR(isdecimal__doc__,
6913              "S.isdecimal() -> bool\n\
6914 \n\
6915 Return True if there are only decimal characters in S,\n\
6916 False otherwise.");
6917 
6918 static PyObject*
unicode_isdecimal(PyUnicodeObject * self)6919 unicode_isdecimal(PyUnicodeObject *self)
6920 {
6921     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6922     register const Py_UNICODE *e;
6923 
6924     /* Shortcut for single character strings */
6925     if (PyUnicode_GET_SIZE(self) == 1 &&
6926         Py_UNICODE_ISDECIMAL(*p))
6927         return PyBool_FromLong(1);
6928 
6929     /* Special case for empty strings */
6930     if (PyUnicode_GET_SIZE(self) == 0)
6931         return PyBool_FromLong(0);
6932 
6933     e = p + PyUnicode_GET_SIZE(self);
6934     for (; p < e; p++) {
6935         if (!Py_UNICODE_ISDECIMAL(*p))
6936             return PyBool_FromLong(0);
6937     }
6938     return PyBool_FromLong(1);
6939 }
6940 
6941 PyDoc_STRVAR(isdigit__doc__,
6942              "S.isdigit() -> bool\n\
6943 \n\
6944 Return True if all characters in S are digits\n\
6945 and there is at least one character in S, False otherwise.");
6946 
6947 static PyObject*
unicode_isdigit(PyUnicodeObject * self)6948 unicode_isdigit(PyUnicodeObject *self)
6949 {
6950     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6951     register const Py_UNICODE *e;
6952 
6953     /* Shortcut for single character strings */
6954     if (PyUnicode_GET_SIZE(self) == 1 &&
6955         Py_UNICODE_ISDIGIT(*p))
6956         return PyBool_FromLong(1);
6957 
6958     /* Special case for empty strings */
6959     if (PyUnicode_GET_SIZE(self) == 0)
6960         return PyBool_FromLong(0);
6961 
6962     e = p + PyUnicode_GET_SIZE(self);
6963     for (; p < e; p++) {
6964         if (!Py_UNICODE_ISDIGIT(*p))
6965             return PyBool_FromLong(0);
6966     }
6967     return PyBool_FromLong(1);
6968 }
6969 
6970 PyDoc_STRVAR(isnumeric__doc__,
6971              "S.isnumeric() -> bool\n\
6972 \n\
6973 Return True if there are only numeric characters in S,\n\
6974 False otherwise.");
6975 
6976 static PyObject*
unicode_isnumeric(PyUnicodeObject * self)6977 unicode_isnumeric(PyUnicodeObject *self)
6978 {
6979     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6980     register const Py_UNICODE *e;
6981 
6982     /* Shortcut for single character strings */
6983     if (PyUnicode_GET_SIZE(self) == 1 &&
6984         Py_UNICODE_ISNUMERIC(*p))
6985         return PyBool_FromLong(1);
6986 
6987     /* Special case for empty strings */
6988     if (PyUnicode_GET_SIZE(self) == 0)
6989         return PyBool_FromLong(0);
6990 
6991     e = p + PyUnicode_GET_SIZE(self);
6992     for (; p < e; p++) {
6993         if (!Py_UNICODE_ISNUMERIC(*p))
6994             return PyBool_FromLong(0);
6995     }
6996     return PyBool_FromLong(1);
6997 }
6998 
6999 PyDoc_STRVAR(join__doc__,
7000              "S.join(iterable) -> unicode\n\
7001 \n\
7002 Return a string which is the concatenation of the strings in the\n\
7003 iterable.  The separator between elements is S.");
7004 
7005 static PyObject*
unicode_join(PyObject * self,PyObject * data)7006 unicode_join(PyObject *self, PyObject *data)
7007 {
7008     return PyUnicode_Join(self, data);
7009 }
7010 
7011 static Py_ssize_t
unicode_length(PyUnicodeObject * self)7012 unicode_length(PyUnicodeObject *self)
7013 {
7014     return self->length;
7015 }
7016 
7017 PyDoc_STRVAR(ljust__doc__,
7018              "S.ljust(width[, fillchar]) -> int\n\
7019 \n\
7020 Return S left-justified in a Unicode string of length width. Padding is\n\
7021 done using the specified fill character (default is a space).");
7022 
7023 static PyObject *
unicode_ljust(PyUnicodeObject * self,PyObject * args)7024 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7025 {
7026     Py_ssize_t width;
7027     Py_UNICODE fillchar = ' ';
7028 
7029     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7030         return NULL;
7031 
7032     if (self->length >= width && PyUnicode_CheckExact(self)) {
7033         Py_INCREF(self);
7034         return (PyObject*) self;
7035     }
7036 
7037     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7038 }
7039 
7040 PyDoc_STRVAR(lower__doc__,
7041              "S.lower() -> unicode\n\
7042 \n\
7043 Return a copy of the string S converted to lowercase.");
7044 
7045 static PyObject*
unicode_lower(PyUnicodeObject * self)7046 unicode_lower(PyUnicodeObject *self)
7047 {
7048     return fixup(self, fixlower);
7049 }
7050 
/* Strip modes understood by the strip helpers below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
7059 
7060 /* externally visible for str.strip(unicode) */
7061 PyObject *
_PyUnicode_XStrip(PyUnicodeObject * self,int striptype,PyObject * sepobj)7062 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7063 {
7064     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7065     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7066     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7067     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7068     Py_ssize_t i, j;
7069 
7070     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7071 
7072     i = 0;
7073     if (striptype != RIGHTSTRIP) {
7074         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7075             i++;
7076         }
7077     }
7078 
7079     j = len;
7080     if (striptype != LEFTSTRIP) {
7081         do {
7082             j--;
7083         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7084         j++;
7085     }
7086 
7087     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7088         Py_INCREF(self);
7089         return (PyObject*)self;
7090     }
7091     else
7092         return PyUnicode_FromUnicode(s+i, j-i);
7093 }
7094 
7095 
7096 static PyObject *
do_strip(PyUnicodeObject * self,int striptype)7097 do_strip(PyUnicodeObject *self, int striptype)
7098 {
7099     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7100     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7101 
7102     i = 0;
7103     if (striptype != RIGHTSTRIP) {
7104         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7105             i++;
7106         }
7107     }
7108 
7109     j = len;
7110     if (striptype != LEFTSTRIP) {
7111         do {
7112             j--;
7113         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7114         j++;
7115     }
7116 
7117     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7118         Py_INCREF(self);
7119         return (PyObject*)self;
7120     }
7121     else
7122         return PyUnicode_FromUnicode(s+i, j-i);
7123 }
7124 
7125 
7126 static PyObject *
do_argstrip(PyUnicodeObject * self,int striptype,PyObject * args)7127 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7128 {
7129     PyObject *sep = NULL;
7130 
7131     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7132         return NULL;
7133 
7134     if (sep != NULL && sep != Py_None) {
7135         if (PyUnicode_Check(sep))
7136             return _PyUnicode_XStrip(self, striptype, sep);
7137         else if (PyString_Check(sep)) {
7138             PyObject *res;
7139             sep = PyUnicode_FromObject(sep);
7140             if (sep==NULL)
7141                 return NULL;
7142             res = _PyUnicode_XStrip(self, striptype, sep);
7143             Py_DECREF(sep);
7144             return res;
7145         }
7146         else {
7147             PyErr_Format(PyExc_TypeError,
7148                          "%s arg must be None, unicode or str",
7149                          STRIPNAME(striptype));
7150             return NULL;
7151         }
7152     }
7153 
7154     return do_strip(self, striptype);
7155 }
7156 
7157 
7158 PyDoc_STRVAR(strip__doc__,
7159              "S.strip([chars]) -> unicode\n\
7160 \n\
7161 Return a copy of the string S with leading and trailing\n\
7162 whitespace removed.\n\
7163 If chars is given and not None, remove characters in chars instead.\n\
7164 If chars is a str, it will be converted to unicode before stripping");
7165 
7166 static PyObject *
unicode_strip(PyUnicodeObject * self,PyObject * args)7167 unicode_strip(PyUnicodeObject *self, PyObject *args)
7168 {
7169     if (PyTuple_GET_SIZE(args) == 0)
7170         return do_strip(self, BOTHSTRIP); /* Common case */
7171     else
7172         return do_argstrip(self, BOTHSTRIP, args);
7173 }
7174 
7175 
7176 PyDoc_STRVAR(lstrip__doc__,
7177              "S.lstrip([chars]) -> unicode\n\
7178 \n\
7179 Return a copy of the string S with leading whitespace removed.\n\
7180 If chars is given and not None, remove characters in chars instead.\n\
7181 If chars is a str, it will be converted to unicode before stripping");
7182 
7183 static PyObject *
unicode_lstrip(PyUnicodeObject * self,PyObject * args)7184 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7185 {
7186     if (PyTuple_GET_SIZE(args) == 0)
7187         return do_strip(self, LEFTSTRIP); /* Common case */
7188     else
7189         return do_argstrip(self, LEFTSTRIP, args);
7190 }
7191 
7192 
7193 PyDoc_STRVAR(rstrip__doc__,
7194              "S.rstrip([chars]) -> unicode\n\
7195 \n\
7196 Return a copy of the string S with trailing whitespace removed.\n\
7197 If chars is given and not None, remove characters in chars instead.\n\
7198 If chars is a str, it will be converted to unicode before stripping");
7199 
7200 static PyObject *
unicode_rstrip(PyUnicodeObject * self,PyObject * args)7201 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7202 {
7203     if (PyTuple_GET_SIZE(args) == 0)
7204         return do_strip(self, RIGHTSTRIP); /* Common case */
7205     else
7206         return do_argstrip(self, RIGHTSTRIP, args);
7207 }
7208 
7209 
7210 static PyObject*
unicode_repeat(PyUnicodeObject * str,Py_ssize_t len)7211 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7212 {
7213     PyUnicodeObject *u;
7214     Py_UNICODE *p;
7215     Py_ssize_t nchars;
7216     size_t nbytes;
7217 
7218     if (len < 0)
7219         len = 0;
7220 
7221     if (len == 1 && PyUnicode_CheckExact(str)) {
7222         /* no repeat, return original string */
7223         Py_INCREF(str);
7224         return (PyObject*) str;
7225     }
7226 
7227     /* ensure # of chars needed doesn't overflow Py_ssize_t and # of bytes
7228      * needed doesn't overflow size_t
7229      */
7230     if (len && str->length > PY_SSIZE_T_MAX / len) {
7231         PyErr_SetString(PyExc_OverflowError,
7232                         "repeated string is too long");
7233         return NULL;
7234     }
7235     nchars = len * str->length;
7236     nbytes = ((size_t)nchars + 1u) * sizeof(Py_UNICODE);
7237     if (nbytes / sizeof(Py_UNICODE) != ((size_t)nchars + 1u)) {
7238         PyErr_SetString(PyExc_OverflowError,
7239                         "repeated string is too long");
7240         return NULL;
7241     }
7242     u = _PyUnicode_New(nchars);
7243     if (!u)
7244         return NULL;
7245 
7246     p = u->str;
7247 
7248     if (str->length == 1 && len > 0) {
7249         Py_UNICODE_FILL(p, str->str[0], len);
7250     } else {
7251         Py_ssize_t done = 0; /* number of characters copied this far */
7252         if (done < nchars) {
7253             Py_UNICODE_COPY(p, str->str, str->length);
7254             done = str->length;
7255         }
7256         while (done < nchars) {
7257             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7258             Py_UNICODE_COPY(p+done, p, n);
7259             done += n;
7260         }
7261     }
7262 
7263     return (PyObject*) u;
7264 }
7265 
/* Public API: replace occurrences of subobj in obj with replobj.

   All three arguments are first converted with PyUnicode_FromObject();
   the work is then delegated to replace() together with maxcount.
   Returns the result of replace(), or NULL if a conversion fails.  The
   temporary unicode references are released on every path. */
PyObject *PyUnicode_Replace(PyObject *obj,
                            PyObject *subobj,
                            PyObject *replobj,
                            Py_ssize_t maxcount)
{
    PyObject *text = NULL;
    PyObject *pattern = NULL;
    PyObject *replacement = NULL;
    PyObject *result = NULL;

    text = PyUnicode_FromObject(obj);
    if (text == NULL)
        goto done;
    pattern = PyUnicode_FromObject(subobj);
    if (pattern == NULL)
        goto done;
    replacement = PyUnicode_FromObject(replobj);
    if (replacement == NULL)
        goto done;

    result = replace((PyUnicodeObject *)text,
                     (PyUnicodeObject *)pattern,
                     (PyUnicodeObject *)replacement,
                     maxcount);

  done:
    Py_XDECREF(text);
    Py_XDECREF(pattern);
    Py_XDECREF(replacement);
    return result;
}
7299 
7300 PyDoc_STRVAR(replace__doc__,
7301              "S.replace(old, new[, count]) -> unicode\n\
7302 \n\
7303 Return a copy of S with all occurrences of substring\n\
7304 old replaced by new.  If the optional argument count is\n\
7305 given, only the first count occurrences are replaced.");
7306 
7307 static PyObject*
unicode_replace(PyUnicodeObject * self,PyObject * args)7308 unicode_replace(PyUnicodeObject *self, PyObject *args)
7309 {
7310     PyUnicodeObject *str1;
7311     PyUnicodeObject *str2;
7312     Py_ssize_t maxcount = -1;
7313     PyObject *result;
7314 
7315     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7316         return NULL;
7317     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7318     if (str1 == NULL)
7319         return NULL;
7320     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7321     if (str2 == NULL) {
7322         Py_DECREF(str1);
7323         return NULL;
7324     }
7325 
7326     result = replace(self, str1, str2, maxcount);
7327 
7328     Py_DECREF(str1);
7329     Py_DECREF(str2);
7330     return result;
7331 }
7332 
7333 static
unicode_repr(PyObject * unicode)7334 PyObject *unicode_repr(PyObject *unicode)
7335 {
7336     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7337                                 PyUnicode_GET_SIZE(unicode),
7338                                 1);
7339 }
7340 
7341 PyDoc_STRVAR(rfind__doc__,
7342              "S.rfind(sub [,start [,end]]) -> int\n\
7343 \n\
7344 Return the highest index in S where substring sub is found,\n\
7345 such that sub is contained within S[start:end].  Optional\n\
7346 arguments start and end are interpreted as in slice notation.\n\
7347 \n\
7348 Return -1 on failure.");
7349 
7350 static PyObject *
unicode_rfind(PyUnicodeObject * self,PyObject * args)7351 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7352 {
7353     PyUnicodeObject *substring;
7354     Py_ssize_t start;
7355     Py_ssize_t end;
7356     Py_ssize_t result;
7357 
7358     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7359                                             &start, &end))
7360         return NULL;
7361 
7362     result = stringlib_rfind_slice(
7363         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7364         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7365         start, end
7366         );
7367 
7368     Py_DECREF(substring);
7369 
7370     return PyInt_FromSsize_t(result);
7371 }
7372 
7373 PyDoc_STRVAR(rindex__doc__,
7374              "S.rindex(sub [,start [,end]]) -> int\n\
7375 \n\
7376 Like S.rfind() but raise ValueError when the substring is not found.");
7377 
7378 static PyObject *
unicode_rindex(PyUnicodeObject * self,PyObject * args)7379 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7380 {
7381     PyUnicodeObject *substring;
7382     Py_ssize_t start;
7383     Py_ssize_t end;
7384     Py_ssize_t result;
7385 
7386     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7387                                             &start, &end))
7388         return NULL;
7389 
7390     result = stringlib_rfind_slice(
7391         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7392         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7393         start, end
7394         );
7395 
7396     Py_DECREF(substring);
7397 
7398     if (result < 0) {
7399         PyErr_SetString(PyExc_ValueError, "substring not found");
7400         return NULL;
7401     }
7402     return PyInt_FromSsize_t(result);
7403 }
7404 
7405 PyDoc_STRVAR(rjust__doc__,
7406              "S.rjust(width[, fillchar]) -> unicode\n\
7407 \n\
7408 Return S right-justified in a Unicode string of length width. Padding is\n\
7409 done using the specified fill character (default is a space).");
7410 
7411 static PyObject *
unicode_rjust(PyUnicodeObject * self,PyObject * args)7412 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7413 {
7414     Py_ssize_t width;
7415     Py_UNICODE fillchar = ' ';
7416 
7417     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7418         return NULL;
7419 
7420     if (self->length >= width && PyUnicode_CheckExact(self)) {
7421         Py_INCREF(self);
7422         return (PyObject*) self;
7423     }
7424 
7425     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7426 }
7427 
7428 static PyObject*
unicode_slice(PyUnicodeObject * self,Py_ssize_t start,Py_ssize_t end)7429 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7430 {
7431     /* standard clamping */
7432     if (start < 0)
7433         start = 0;
7434     if (end < 0)
7435         end = 0;
7436     if (end > self->length)
7437         end = self->length;
7438     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7439         /* full slice, return original string */
7440         Py_INCREF(self);
7441         return (PyObject*) self;
7442     }
7443     if (start > end)
7444         start = end;
7445     /* copy slice */
7446     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7447                                              end - start);
7448 }
7449 
/* Public API: split s at sep, doing at most maxsplit splits.

   s is converted with PyUnicode_FromObject(); sep is converted the same
   way unless it is NULL, in which case NULL is forwarded to split().
   Returns the result of split(), or NULL if a conversion fails.  The
   temporary references are released before returning. */
PyObject *PyUnicode_Split(PyObject *s,
                          PyObject *sep,
                          Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *result;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    result = split((PyUnicodeObject *)text, (PyUnicodeObject *)delim, maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);
    return result;
}
7473 
7474 PyDoc_STRVAR(split__doc__,
7475              "S.split([sep [,maxsplit]]) -> list of strings\n\
7476 \n\
7477 Return a list of the words in S, using sep as the\n\
7478 delimiter string.  If maxsplit is given, at most maxsplit\n\
7479 splits are done. If sep is not specified or is None, any\n\
7480 whitespace string is a separator and empty strings are\n\
7481 removed from the result.");
7482 
7483 static PyObject*
unicode_split(PyUnicodeObject * self,PyObject * args)7484 unicode_split(PyUnicodeObject *self, PyObject *args)
7485 {
7486     PyObject *substring = Py_None;
7487     Py_ssize_t maxcount = -1;
7488 
7489     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7490         return NULL;
7491 
7492     if (substring == Py_None)
7493         return split(self, NULL, maxcount);
7494     else if (PyUnicode_Check(substring))
7495         return split(self, (PyUnicodeObject *)substring, maxcount);
7496     else
7497         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7498 }
7499 
7500 PyObject *
PyUnicode_Partition(PyObject * str_in,PyObject * sep_in)7501 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7502 {
7503     PyObject* str_obj;
7504     PyObject* sep_obj;
7505     PyObject* out;
7506 
7507     str_obj = PyUnicode_FromObject(str_in);
7508     if (!str_obj)
7509         return NULL;
7510     sep_obj = PyUnicode_FromObject(sep_in);
7511     if (!sep_obj) {
7512         Py_DECREF(str_obj);
7513         return NULL;
7514     }
7515 
7516     out = stringlib_partition(
7517         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7518         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7519         );
7520 
7521     Py_DECREF(sep_obj);
7522     Py_DECREF(str_obj);
7523 
7524     return out;
7525 }
7526 
7527 
7528 PyObject *
PyUnicode_RPartition(PyObject * str_in,PyObject * sep_in)7529 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7530 {
7531     PyObject* str_obj;
7532     PyObject* sep_obj;
7533     PyObject* out;
7534 
7535     str_obj = PyUnicode_FromObject(str_in);
7536     if (!str_obj)
7537         return NULL;
7538     sep_obj = PyUnicode_FromObject(sep_in);
7539     if (!sep_obj) {
7540         Py_DECREF(str_obj);
7541         return NULL;
7542     }
7543 
7544     out = stringlib_rpartition(
7545         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7546         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7547         );
7548 
7549     Py_DECREF(sep_obj);
7550     Py_DECREF(str_obj);
7551 
7552     return out;
7553 }
7554 
7555 PyDoc_STRVAR(partition__doc__,
7556              "S.partition(sep) -> (head, sep, tail)\n\
7557 \n\
7558 Search for the separator sep in S, and return the part before it,\n\
7559 the separator itself, and the part after it.  If the separator is not\n\
7560 found, return S and two empty strings.");
7561 
7562 static PyObject*
unicode_partition(PyUnicodeObject * self,PyObject * separator)7563 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7564 {
7565     return PyUnicode_Partition((PyObject *)self, separator);
7566 }
7567 
7568 PyDoc_STRVAR(rpartition__doc__,
7569              "S.rpartition(sep) -> (head, sep, tail)\n\
7570 \n\
7571 Search for the separator sep in S, starting at the end of S, and return\n\
7572 the part before it, the separator itself, and the part after it.  If the\n\
7573 separator is not found, return two empty strings and S.");
7574 
7575 static PyObject*
unicode_rpartition(PyUnicodeObject * self,PyObject * separator)7576 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7577 {
7578     return PyUnicode_RPartition((PyObject *)self, separator);
7579 }
7580 
/* Public API: split s at sep via rsplit(), doing at most maxsplit splits.

   s is converted with PyUnicode_FromObject(); sep is converted the same
   way unless it is NULL, in which case NULL is forwarded to rsplit().
   Returns the result of rsplit(), or NULL if a conversion fails.  The
   temporary references are released before returning. */
PyObject *PyUnicode_RSplit(PyObject *s,
                           PyObject *sep,
                           Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *result;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    result = rsplit((PyUnicodeObject *)text, (PyUnicodeObject *)delim, maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);
    return result;
}
7604 
7605 PyDoc_STRVAR(rsplit__doc__,
7606              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7607 \n\
7608 Return a list of the words in S, using sep as the\n\
7609 delimiter string, starting at the end of the string and\n\
7610 working to the front.  If maxsplit is given, at most maxsplit\n\
7611 splits are done. If sep is not specified, any whitespace string\n\
7612 is a separator.");
7613 
7614 static PyObject*
unicode_rsplit(PyUnicodeObject * self,PyObject * args)7615 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7616 {
7617     PyObject *substring = Py_None;
7618     Py_ssize_t maxcount = -1;
7619 
7620     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7621         return NULL;
7622 
7623     if (substring == Py_None)
7624         return rsplit(self, NULL, maxcount);
7625     else if (PyUnicode_Check(substring))
7626         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7627     else
7628         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7629 }
7630 
7631 PyDoc_STRVAR(splitlines__doc__,
7632              "S.splitlines(keepends=False) -> list of strings\n\
7633 \n\
7634 Return a list of the lines in S, breaking at line boundaries.\n\
7635 Line breaks are not included in the resulting list unless keepends\n\
7636 is given and true.");
7637 
7638 static PyObject*
unicode_splitlines(PyUnicodeObject * self,PyObject * args)7639 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7640 {
7641     int keepends = 0;
7642 
7643     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7644         return NULL;
7645 
7646     return PyUnicode_Splitlines((PyObject *)self, keepends);
7647 }
7648 
7649 static
unicode_str(PyUnicodeObject * self)7650 PyObject *unicode_str(PyUnicodeObject *self)
7651 {
7652     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7653 }
7654 
7655 PyDoc_STRVAR(swapcase__doc__,
7656              "S.swapcase() -> unicode\n\
7657 \n\
7658 Return a copy of S with uppercase characters converted to lowercase\n\
7659 and vice versa.");
7660 
7661 static PyObject*
unicode_swapcase(PyUnicodeObject * self)7662 unicode_swapcase(PyUnicodeObject *self)
7663 {
7664     return fixup(self, fixswapcase);
7665 }
7666 
7667 PyDoc_STRVAR(translate__doc__,
7668              "S.translate(table) -> unicode\n\
7669 \n\
7670 Return a copy of the string S, where all characters have been mapped\n\
7671 through the given translation table, which must be a mapping of\n\
7672 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7673 Unmapped characters are left untouched. Characters mapped to None\n\
7674 are deleted.");
7675 
7676 static PyObject*
unicode_translate(PyUnicodeObject * self,PyObject * table)7677 unicode_translate(PyUnicodeObject *self, PyObject *table)
7678 {
7679     return PyUnicode_TranslateCharmap(self->str,
7680                                       self->length,
7681                                       table,
7682                                       "ignore");
7683 }
7684 
7685 PyDoc_STRVAR(upper__doc__,
7686              "S.upper() -> unicode\n\
7687 \n\
7688 Return a copy of S converted to uppercase.");
7689 
7690 static PyObject*
unicode_upper(PyUnicodeObject * self)7691 unicode_upper(PyUnicodeObject *self)
7692 {
7693     return fixup(self, fixupper);
7694 }
7695 
7696 PyDoc_STRVAR(zfill__doc__,
7697              "S.zfill(width) -> unicode\n\
7698 \n\
7699 Pad a numeric string S with zeros on the left, to fill a field\n\
7700 of the specified width. The string S is never truncated.");
7701 
7702 static PyObject *
unicode_zfill(PyUnicodeObject * self,PyObject * args)7703 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7704 {
7705     Py_ssize_t fill;
7706     PyUnicodeObject *u;
7707 
7708     Py_ssize_t width;
7709     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7710         return NULL;
7711 
7712     if (self->length >= width) {
7713         if (PyUnicode_CheckExact(self)) {
7714             Py_INCREF(self);
7715             return (PyObject*) self;
7716         }
7717         else
7718             return PyUnicode_FromUnicode(
7719                 PyUnicode_AS_UNICODE(self),
7720                 PyUnicode_GET_SIZE(self)
7721                 );
7722     }
7723 
7724     fill = width - self->length;
7725 
7726     u = pad(self, fill, 0, '0');
7727 
7728     if (u == NULL)
7729         return NULL;
7730 
7731     if (u->str[fill] == '+' || u->str[fill] == '-') {
7732         /* move sign to beginning of string */
7733         u->str[0] = u->str[fill];
7734         u->str[fill] = '0';
7735     }
7736 
7737     return (PyObject*) u;
7738 }
7739 
#if 0
/* Disabled helper (compiled out by the surrounding #if 0): returns the
   value of numfree as a Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7747 
7748 PyDoc_STRVAR(startswith__doc__,
7749              "S.startswith(prefix[, start[, end]]) -> bool\n\
7750 \n\
7751 Return True if S starts with the specified prefix, False otherwise.\n\
7752 With optional start, test S beginning at that position.\n\
7753 With optional end, stop comparing S at that position.\n\
7754 prefix can also be a tuple of strings to try.");
7755 
7756 static PyObject *
unicode_startswith(PyUnicodeObject * self,PyObject * args)7757 unicode_startswith(PyUnicodeObject *self,
7758                    PyObject *args)
7759 {
7760     PyObject *subobj;
7761     PyUnicodeObject *substring;
7762     Py_ssize_t start = 0;
7763     Py_ssize_t end = PY_SSIZE_T_MAX;
7764     int result;
7765 
7766     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7767         return NULL;
7768     if (PyTuple_Check(subobj)) {
7769         Py_ssize_t i;
7770         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7771             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7772                 PyTuple_GET_ITEM(subobj, i));
7773             if (substring == NULL)
7774                 return NULL;
7775             result = tailmatch(self, substring, start, end, -1);
7776             Py_DECREF(substring);
7777             if (result) {
7778                 Py_RETURN_TRUE;
7779             }
7780         }
7781         /* nothing matched */
7782         Py_RETURN_FALSE;
7783     }
7784     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7785     if (substring == NULL) {
7786         if (PyErr_ExceptionMatches(PyExc_TypeError))
7787             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7788                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7789         return NULL;
7790     }
7791     result = tailmatch(self, substring, start, end, -1);
7792     Py_DECREF(substring);
7793     return PyBool_FromLong(result);
7794 }
7795 
7796 
7797 PyDoc_STRVAR(endswith__doc__,
7798              "S.endswith(suffix[, start[, end]]) -> bool\n\
7799 \n\
7800 Return True if S ends with the specified suffix, False otherwise.\n\
7801 With optional start, test S beginning at that position.\n\
7802 With optional end, stop comparing S at that position.\n\
7803 suffix can also be a tuple of strings to try.");
7804 
7805 static PyObject *
unicode_endswith(PyUnicodeObject * self,PyObject * args)7806 unicode_endswith(PyUnicodeObject *self,
7807                  PyObject *args)
7808 {
7809     PyObject *subobj;
7810     PyUnicodeObject *substring;
7811     Py_ssize_t start = 0;
7812     Py_ssize_t end = PY_SSIZE_T_MAX;
7813     int result;
7814 
7815     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7816         return NULL;
7817     if (PyTuple_Check(subobj)) {
7818         Py_ssize_t i;
7819         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7820             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7821                 PyTuple_GET_ITEM(subobj, i));
7822             if (substring == NULL)
7823                 return NULL;
7824             result = tailmatch(self, substring, start, end, +1);
7825             Py_DECREF(substring);
7826             if (result) {
7827                 Py_RETURN_TRUE;
7828             }
7829         }
7830         Py_RETURN_FALSE;
7831     }
7832     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7833     if (substring == NULL) {
7834         if (PyErr_ExceptionMatches(PyExc_TypeError))
7835             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7836                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7837         return NULL;
7838     }
7839     result = tailmatch(self, substring, start, end, +1);
7840     Py_DECREF(substring);
7841     return PyBool_FromLong(result);
7842 }
7843 
7844 
7845 /* Implements do_string_format, which is unicode because of stringlib */
7846 #include "stringlib/string_format.h"
7847 
7848 PyDoc_STRVAR(format__doc__,
7849              "S.format(*args, **kwargs) -> unicode\n\
7850 \n\
7851 Return a formatted version of S, using substitutions from args and kwargs.\n\
7852 The substitutions are identified by braces ('{' and '}').");
7853 
7854 static PyObject *
unicode__format__(PyObject * self,PyObject * args)7855 unicode__format__(PyObject *self, PyObject *args)
7856 {
7857     PyObject *format_spec;
7858     PyObject *result = NULL;
7859     PyObject *tmp = NULL;
7860 
7861     /* If 2.x, convert format_spec to the same type as value */
7862     /* This is to allow things like u''.format('') */
7863     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7864         goto done;
7865     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7866         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7867                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7868         goto done;
7869     }
7870     tmp = PyObject_Unicode(format_spec);
7871     if (tmp == NULL)
7872         goto done;
7873     format_spec = tmp;
7874 
7875     result = _PyUnicode_FormatAdvanced(self,
7876                                        PyUnicode_AS_UNICODE(format_spec),
7877                                        PyUnicode_GET_SIZE(format_spec));
7878   done:
7879     Py_XDECREF(tmp);
7880     return result;
7881 }
7882 
7883 PyDoc_STRVAR(p_format__doc__,
7884              "S.__format__(format_spec) -> unicode\n\
7885 \n\
7886 Return a formatted version of S as described by format_spec.");
7887 
7888 static PyObject *
unicode__sizeof__(PyUnicodeObject * v)7889 unicode__sizeof__(PyUnicodeObject *v)
7890 {
7891     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7892                              sizeof(Py_UNICODE) * (v->length + 1));
7893 }
7894 
7895 PyDoc_STRVAR(sizeof__doc__,
7896              "S.__sizeof__() -> size of S in memory, in bytes\n\
7897 \n\
7898 ");
7899 
7900 static PyObject *
unicode_getnewargs(PyUnicodeObject * v)7901 unicode_getnewargs(PyUnicodeObject *v)
7902 {
7903     return Py_BuildValue("(u#)", v->str, v->length);
7904 }
7905 
7906 
7907 static PyMethodDef unicode_methods[] = {
7908     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7909     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7910     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7911     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7912     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7913     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7914     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7915     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7916     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7917     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7918     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7919     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7920     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7921     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7922     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7923     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7924     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7925 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7926     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7927     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7928     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7929     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7930     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7931     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7932     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7933     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7934     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7935     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7936     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7937     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7938     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7939     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7940     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7941     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7942     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7943     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7944     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7945     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7946     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7947     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7948     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7949     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7950     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7951     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7952     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7953 #if 0
7954     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7955 #endif
7956 
7957 #if 0
7958     /* This one is just used for debugging the implementation. */
7959     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7960 #endif
7961 
7962     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
7963     {NULL, NULL}
7964 };
7965 
7966 static PyObject *
unicode_mod(PyObject * v,PyObject * w)7967 unicode_mod(PyObject *v, PyObject *w)
7968 {
7969     if (!PyUnicode_Check(v)) {
7970         Py_INCREF(Py_NotImplemented);
7971         return Py_NotImplemented;
7972     }
7973     return PyUnicode_Format(v, w);
7974 }
7975 
7976 static PyNumberMethods unicode_as_number = {
7977     0,              /*nb_add*/
7978     0,              /*nb_subtract*/
7979     0,              /*nb_multiply*/
7980     0,              /*nb_divide*/
7981     unicode_mod,            /*nb_remainder*/
7982 };
7983 
7984 static PySequenceMethods unicode_as_sequence = {
7985     (lenfunc) unicode_length,       /* sq_length */
7986     PyUnicode_Concat,           /* sq_concat */
7987     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
7988     (ssizeargfunc) unicode_getitem,     /* sq_item */
7989     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
7990     0,                  /* sq_ass_item */
7991     0,                  /* sq_ass_slice */
7992     PyUnicode_Contains,         /* sq_contains */
7993 };
7994 
7995 static PyObject*
unicode_subscript(PyUnicodeObject * self,PyObject * item)7996 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7997 {
7998     if (PyIndex_Check(item)) {
7999         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8000         if (i == -1 && PyErr_Occurred())
8001             return NULL;
8002         if (i < 0)
8003             i += PyUnicode_GET_SIZE(self);
8004         return unicode_getitem(self, i);
8005     } else if (PySlice_Check(item)) {
8006         Py_ssize_t start, stop, step, slicelength, cur, i;
8007         Py_UNICODE* source_buf;
8008         Py_UNICODE* result_buf;
8009         PyObject* result;
8010 
8011         if (_PySlice_Unpack(item, &start, &stop, &step) < 0) {
8012             return NULL;
8013         }
8014         slicelength = _PySlice_AdjustIndices(PyUnicode_GET_SIZE(self), &start,
8015                                             &stop, step);
8016 
8017         if (slicelength <= 0) {
8018             return PyUnicode_FromUnicode(NULL, 0);
8019         } else if (start == 0 && step == 1 && slicelength == self->length &&
8020                    PyUnicode_CheckExact(self)) {
8021             Py_INCREF(self);
8022             return (PyObject *)self;
8023         } else if (step == 1) {
8024             return PyUnicode_FromUnicode(self->str + start, slicelength);
8025         } else {
8026             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8027             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8028                                                        sizeof(Py_UNICODE));
8029 
8030             if (result_buf == NULL)
8031                 return PyErr_NoMemory();
8032 
8033             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8034                 result_buf[i] = source_buf[cur];
8035             }
8036 
8037             result = PyUnicode_FromUnicode(result_buf, slicelength);
8038             PyObject_FREE(result_buf);
8039             return result;
8040         }
8041     } else {
8042         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8043         return NULL;
8044     }
8045 }
8046 
8047 static PyMappingMethods unicode_as_mapping = {
8048     (lenfunc)unicode_length,        /* mp_length */
8049     (binaryfunc)unicode_subscript,  /* mp_subscript */
8050     (objobjargproc)0,           /* mp_ass_subscript */
8051 };
8052 
8053 static Py_ssize_t
unicode_buffer_getreadbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8054 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8055                           Py_ssize_t index,
8056                           const void **ptr)
8057 {
8058     if (index != 0) {
8059         PyErr_SetString(PyExc_SystemError,
8060                         "accessing non-existent unicode segment");
8061         return -1;
8062     }
8063     *ptr = (void *) self->str;
8064     return PyUnicode_GET_DATA_SIZE(self);
8065 }
8066 
8067 static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8068 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8069                            const void **ptr)
8070 {
8071     PyErr_SetString(PyExc_TypeError,
8072                     "cannot use unicode as modifiable buffer");
8073     return -1;
8074 }
8075 
8076 static int
unicode_buffer_getsegcount(PyUnicodeObject * self,Py_ssize_t * lenp)8077 unicode_buffer_getsegcount(PyUnicodeObject *self,
8078                            Py_ssize_t *lenp)
8079 {
8080     if (lenp)
8081         *lenp = PyUnicode_GET_DATA_SIZE(self);
8082     return 1;
8083 }
8084 
8085 static Py_ssize_t
unicode_buffer_getcharbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8086 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8087                           Py_ssize_t index,
8088                           const void **ptr)
8089 {
8090     PyObject *str;
8091 
8092     if (index != 0) {
8093         PyErr_SetString(PyExc_SystemError,
8094                         "accessing non-existent unicode segment");
8095         return -1;
8096     }
8097     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8098     if (str == NULL)
8099         return -1;
8100     *ptr = (void *) PyString_AS_STRING(str);
8101     return PyString_GET_SIZE(str);
8102 }
8103 
8104 /* Helpers for PyUnicode_Format() */
8105 
8106 static PyObject *
getnextarg(PyObject * args,Py_ssize_t arglen,Py_ssize_t * p_argidx)8107 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8108 {
8109     Py_ssize_t argidx = *p_argidx;
8110     if (argidx < arglen) {
8111         (*p_argidx)++;
8112         if (arglen < 0)
8113             return args;
8114         else
8115             return PyTuple_GetItem(args, argidx);
8116     }
8117     PyErr_SetString(PyExc_TypeError,
8118                     "not enough arguments for format string");
8119     return NULL;
8120 }
8121 
/* Bit flags recorded while parsing the flag characters of a % format
   specifier; each one occupies its own bit. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
8127 
8128 static Py_ssize_t
strtounicode(Py_UNICODE * buffer,const char * charbuffer)8129 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8130 {
8131     register Py_ssize_t i;
8132     Py_ssize_t len = strlen(charbuffer);
8133     for (i = len - 1; i >= 0; i--)
8134         buffer[i] = (Py_UNICODE) charbuffer[i];
8135 
8136     return len;
8137 }
8138 
8139 static int
longtounicode(Py_UNICODE * buffer,size_t len,const char * format,long x)8140 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8141 {
8142     Py_ssize_t result;
8143 
8144     PyOS_snprintf((char *)buffer, len, format, x);
8145     result = strtounicode(buffer, (char *)buffer);
8146     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8147 }
8148 
8149 /* XXX To save some code duplication, formatfloat/long/int could have been
8150    shared with stringobject.c, converting from 8-bit to Unicode after the
8151    formatting is done. */
8152 
8153 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8154 
8155 static PyObject *
formatfloat(PyObject * v,int flags,int prec,int type)8156 formatfloat(PyObject *v, int flags, int prec, int type)
8157 {
8158     char *p;
8159     PyObject *result;
8160     double x;
8161 
8162     x = PyFloat_AsDouble(v);
8163     if (x == -1.0 && PyErr_Occurred())
8164         return NULL;
8165 
8166     if (prec < 0)
8167         prec = 6;
8168 
8169     p = PyOS_double_to_string(x, type, prec,
8170                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8171     if (p == NULL)
8172         return NULL;
8173     result = PyUnicode_FromStringAndSize(p, strlen(p));
8174     PyMem_Free(p);
8175     return result;
8176 }
8177 
8178 static PyObject*
formatlong(PyObject * val,int flags,int prec,int type)8179 formatlong(PyObject *val, int flags, int prec, int type)
8180 {
8181     char *buf;
8182     int i, len;
8183     PyObject *str; /* temporary string object. */
8184     PyUnicodeObject *result;
8185 
8186     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8187     if (!str)
8188         return NULL;
8189     result = _PyUnicode_New(len);
8190     if (!result) {
8191         Py_DECREF(str);
8192         return NULL;
8193     }
8194     for (i = 0; i < len; i++)
8195         result->str[i] = buf[i];
8196     result->str[len] = 0;
8197     Py_DECREF(str);
8198     return (PyObject*)result;
8199 }
8200 
8201 static int
formatint(Py_UNICODE * buf,size_t buflen,int flags,int prec,int type,PyObject * v)8202 formatint(Py_UNICODE *buf,
8203           size_t buflen,
8204           int flags,
8205           int prec,
8206           int type,
8207           PyObject *v)
8208 {
8209     /* fmt = '%#.' + `prec` + 'l' + `type`
8210      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8211      *                     + 1 + 1
8212      *                   = 24
8213      */
8214     char fmt[64]; /* plenty big enough! */
8215     char *sign;
8216     long x;
8217 
8218     x = PyInt_AsLong(v);
8219     if (x == -1 && PyErr_Occurred())
8220         return -1;
8221     if (x < 0 && type == 'u') {
8222         type = 'd';
8223     }
8224     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8225         sign = "-";
8226     else
8227         sign = "";
8228     if (prec < 0)
8229         prec = 1;
8230 
8231     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8232      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8233      */
8234     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8235         PyErr_SetString(PyExc_OverflowError,
8236                         "formatted integer is too long (precision too large?)");
8237         return -1;
8238     }
8239 
8240     if ((flags & F_ALT) &&
8241         (type == 'x' || type == 'X')) {
8242         /* When converting under %#x or %#X, there are a number
8243          * of issues that cause pain:
8244          * - when 0 is being converted, the C standard leaves off
8245          *   the '0x' or '0X', which is inconsistent with other
8246          *   %#x/%#X conversions and inconsistent with Python's
8247          *   hex() function
8248          * - there are platforms that violate the standard and
8249          *   convert 0 with the '0x' or '0X'
8250          *   (Metrowerks, Compaq Tru64)
8251          * - there are platforms that give '0x' when converting
8252          *   under %#X, but convert 0 in accordance with the
8253          *   standard (OS/2 EMX)
8254          *
8255          * We can achieve the desired consistency by inserting our
8256          * own '0x' or '0X' prefix, and substituting %x/%X in place
8257          * of %#x/%#X.
8258          *
8259          * Note that this is the same approach as used in
8260          * formatint() in stringobject.c
8261          */
8262         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8263                       sign, type, prec, type);
8264     }
8265     else {
8266         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8267                       sign, (flags&F_ALT) ? "#" : "",
8268                       prec, type);
8269     }
8270     if (sign[0])
8271         return longtounicode(buf, buflen, fmt, -x);
8272     else
8273         return longtounicode(buf, buflen, fmt, x);
8274 }
8275 
8276 static int
formatchar(Py_UNICODE * buf,size_t buflen,PyObject * v)8277 formatchar(Py_UNICODE *buf,
8278            size_t buflen,
8279            PyObject *v)
8280 {
8281     PyObject *unistr;
8282     char *str;
8283     /* presume that the buffer is at least 2 characters long */
8284     if (PyUnicode_Check(v)) {
8285         if (PyUnicode_GET_SIZE(v) != 1)
8286             goto onError;
8287         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8288     }
8289 
8290     else if (PyString_Check(v)) {
8291         if (PyString_GET_SIZE(v) != 1)
8292             goto onError;
8293         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8294            with a UnicodeDecodeError if 'char' is not decodable with the
8295            default encoding (usually ASCII, but it might be something else) */
8296         str = PyString_AS_STRING(v);
8297         if ((unsigned char)str[0] > 0x7F) {
8298             /* the char is not ASCII; try to decode the string using the
8299                default encoding and return -1 to let the UnicodeDecodeError
8300                be raised if the string can't be decoded */
8301             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8302             if (unistr == NULL)
8303                 return -1;
8304             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8305             Py_DECREF(unistr);
8306         }
8307         else
8308             buf[0] = (Py_UNICODE)str[0];
8309     }
8310 
8311     else {
8312         /* Integer input truncated to a character */
8313         long x;
8314         x = PyInt_AsLong(v);
8315         if (x == -1 && PyErr_Occurred())
8316             goto onError;
8317 #ifdef Py_UNICODE_WIDE
8318         if (x < 0 || x > 0x10ffff) {
8319             PyErr_SetString(PyExc_OverflowError,
8320                             "%c arg not in range(0x110000) "
8321                             "(wide Python build)");
8322             return -1;
8323         }
8324 #else
8325         if (x < 0 || x > 0xffff) {
8326             PyErr_SetString(PyExc_OverflowError,
8327                             "%c arg not in range(0x10000) "
8328                             "(narrow Python build)");
8329             return -1;
8330         }
8331 #endif
8332         buf[0] = (Py_UNICODE) x;
8333     }
8334     buf[1] = '\0';
8335     return 1;
8336 
8337   onError:
8338     PyErr_SetString(PyExc_TypeError,
8339                     "%c requires int or char");
8340     return -1;
8341 }
8342 
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast stays attached
   to the constant wherever the macro is used (for example
   "sizeof FORMATBUFLEN" would not parse otherwise).
*/
#define FORMATBUFLEN ((size_t)120)
8352 
PyUnicode_Format(PyObject * format,PyObject * args)8353 PyObject *PyUnicode_Format(PyObject *format,
8354                            PyObject *args)
8355 {
8356     Py_UNICODE *fmt, *res;
8357     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8358     int args_owned = 0;
8359     PyUnicodeObject *result = NULL;
8360     PyObject *dict = NULL;
8361     PyObject *uformat;
8362 
8363     if (format == NULL || args == NULL) {
8364         PyErr_BadInternalCall();
8365         return NULL;
8366     }
8367     uformat = PyUnicode_FromObject(format);
8368     if (uformat == NULL)
8369         return NULL;
8370     fmt = PyUnicode_AS_UNICODE(uformat);
8371     fmtcnt = PyUnicode_GET_SIZE(uformat);
8372 
8373     reslen = rescnt = fmtcnt + 100;
8374     result = _PyUnicode_New(reslen);
8375     if (result == NULL)
8376         goto onError;
8377     res = PyUnicode_AS_UNICODE(result);
8378 
8379     if (PyTuple_Check(args)) {
8380         arglen = PyTuple_Size(args);
8381         argidx = 0;
8382     }
8383     else {
8384         arglen = -1;
8385         argidx = -2;
8386     }
8387     if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8388         !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
8389         dict = args;
8390 
8391     while (--fmtcnt >= 0) {
8392         if (*fmt != '%') {
8393             if (--rescnt < 0) {
8394                 rescnt = fmtcnt + 100;
8395                 reslen += rescnt;
8396                 if (_PyUnicode_Resize(&result, reslen) < 0)
8397                     goto onError;
8398                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8399                 --rescnt;
8400             }
8401             *res++ = *fmt++;
8402         }
8403         else {
8404             /* Got a format specifier */
8405             int flags = 0;
8406             Py_ssize_t width = -1;
8407             int prec = -1;
8408             Py_UNICODE c = '\0';
8409             Py_UNICODE fill;
8410             int isnumok;
8411             PyObject *v = NULL;
8412             PyObject *temp = NULL;
8413             Py_UNICODE *pbuf;
8414             Py_UNICODE sign;
8415             Py_ssize_t len;
8416             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8417 
8418             fmt++;
8419             if (*fmt == '(') {
8420                 Py_UNICODE *keystart;
8421                 Py_ssize_t keylen;
8422                 PyObject *key;
8423                 int pcount = 1;
8424 
8425                 if (dict == NULL) {
8426                     PyErr_SetString(PyExc_TypeError,
8427                                     "format requires a mapping");
8428                     goto onError;
8429                 }
8430                 ++fmt;
8431                 --fmtcnt;
8432                 keystart = fmt;
8433                 /* Skip over balanced parentheses */
8434                 while (pcount > 0 && --fmtcnt >= 0) {
8435                     if (*fmt == ')')
8436                         --pcount;
8437                     else if (*fmt == '(')
8438                         ++pcount;
8439                     fmt++;
8440                 }
8441                 keylen = fmt - keystart - 1;
8442                 if (fmtcnt < 0 || pcount > 0) {
8443                     PyErr_SetString(PyExc_ValueError,
8444                                     "incomplete format key");
8445                     goto onError;
8446                 }
8447 #if 0
8448                 /* keys are converted to strings using UTF-8 and
8449                    then looked up since Python uses strings to hold
8450                    variables names etc. in its namespaces and we
8451                    wouldn't want to break common idioms. */
8452                 key = PyUnicode_EncodeUTF8(keystart,
8453                                            keylen,
8454                                            NULL);
8455 #else
8456                 key = PyUnicode_FromUnicode(keystart, keylen);
8457 #endif
8458                 if (key == NULL)
8459                     goto onError;
8460                 if (args_owned) {
8461                     Py_DECREF(args);
8462                     args_owned = 0;
8463                 }
8464                 args = PyObject_GetItem(dict, key);
8465                 Py_DECREF(key);
8466                 if (args == NULL) {
8467                     goto onError;
8468                 }
8469                 args_owned = 1;
8470                 arglen = -1;
8471                 argidx = -2;
8472             }
8473             while (--fmtcnt >= 0) {
8474                 switch (c = *fmt++) {
8475                 case '-': flags |= F_LJUST; continue;
8476                 case '+': flags |= F_SIGN; continue;
8477                 case ' ': flags |= F_BLANK; continue;
8478                 case '#': flags |= F_ALT; continue;
8479                 case '0': flags |= F_ZERO; continue;
8480                 }
8481                 break;
8482             }
8483             if (c == '*') {
8484                 v = getnextarg(args, arglen, &argidx);
8485                 if (v == NULL)
8486                     goto onError;
8487                 if (!PyInt_Check(v)) {
8488                     PyErr_SetString(PyExc_TypeError,
8489                                     "* wants int");
8490                     goto onError;
8491                 }
8492                 width = PyInt_AsSsize_t(v);
8493                 if (width == -1 && PyErr_Occurred())
8494                     goto onError;
8495                 if (width < 0) {
8496                     flags |= F_LJUST;
8497                     width = -width;
8498                 }
8499                 if (--fmtcnt >= 0)
8500                     c = *fmt++;
8501             }
8502             else if (c >= '0' && c <= '9') {
8503                 width = c - '0';
8504                 while (--fmtcnt >= 0) {
8505                     c = *fmt++;
8506                     if (c < '0' || c > '9')
8507                         break;
8508                     if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
8509                         PyErr_SetString(PyExc_ValueError,
8510                                         "width too big");
8511                         goto onError;
8512                     }
8513                     width = width*10 + (c - '0');
8514                 }
8515             }
8516             if (c == '.') {
8517                 prec = 0;
8518                 if (--fmtcnt >= 0)
8519                     c = *fmt++;
8520                 if (c == '*') {
8521                     v = getnextarg(args, arglen, &argidx);
8522                     if (v == NULL)
8523                         goto onError;
8524                     if (!PyInt_Check(v)) {
8525                         PyErr_SetString(PyExc_TypeError,
8526                                         "* wants int");
8527                         goto onError;
8528                     }
8529                     prec = _PyInt_AsInt(v);
8530                     if (prec == -1 && PyErr_Occurred())
8531                         goto onError;
8532                     if (prec < 0)
8533                         prec = 0;
8534                     if (--fmtcnt >= 0)
8535                         c = *fmt++;
8536                 }
8537                 else if (c >= '0' && c <= '9') {
8538                     prec = c - '0';
8539                     while (--fmtcnt >= 0) {
8540                         c = *fmt++;
8541                         if (c < '0' || c > '9')
8542                             break;
8543                         if (prec > (INT_MAX - ((int)c - '0')) / 10) {
8544                             PyErr_SetString(PyExc_ValueError,
8545                                             "prec too big");
8546                             goto onError;
8547                         }
8548                         prec = prec*10 + (c - '0');
8549                     }
8550                 }
8551             } /* prec */
8552             if (fmtcnt >= 0) {
8553                 if (c == 'h' || c == 'l' || c == 'L') {
8554                     if (--fmtcnt >= 0)
8555                         c = *fmt++;
8556                 }
8557             }
8558             if (fmtcnt < 0) {
8559                 PyErr_SetString(PyExc_ValueError,
8560                                 "incomplete format");
8561                 goto onError;
8562             }
8563             if (c != '%') {
8564                 v = getnextarg(args, arglen, &argidx);
8565                 if (v == NULL)
8566                     goto onError;
8567             }
8568             sign = 0;
8569             fill = ' ';
8570             switch (c) {
8571 
8572             case '%':
8573                 pbuf = formatbuf;
8574                 /* presume that buffer length is at least 1 */
8575                 pbuf[0] = '%';
8576                 len = 1;
8577                 break;
8578 
8579             case 's':
8580             case 'r':
8581                 if (PyUnicode_CheckExact(v) && c == 's') {
8582                     temp = v;
8583                     Py_INCREF(temp);
8584                 }
8585                 else {
8586                     PyObject *unicode;
8587                     if (c == 's')
8588                         temp = PyObject_Unicode(v);
8589                     else
8590                         temp = PyObject_Repr(v);
8591                     if (temp == NULL)
8592                         goto onError;
8593                     if (PyUnicode_Check(temp))
8594                         /* nothing to do */;
8595                     else if (PyString_Check(temp)) {
8596                         /* convert to string to Unicode */
8597                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8598                                                    PyString_GET_SIZE(temp),
8599                                                    NULL,
8600                                                    "strict");
8601                         Py_DECREF(temp);
8602                         temp = unicode;
8603                         if (temp == NULL)
8604                             goto onError;
8605                     }
8606                     else {
8607                         Py_DECREF(temp);
8608                         PyErr_SetString(PyExc_TypeError,
8609                                         "%s argument has non-string str()");
8610                         goto onError;
8611                     }
8612                 }
8613                 pbuf = PyUnicode_AS_UNICODE(temp);
8614                 len = PyUnicode_GET_SIZE(temp);
8615                 if (prec >= 0 && len > prec)
8616                     len = prec;
8617                 break;
8618 
8619             case 'i':
8620             case 'd':
8621             case 'u':
8622             case 'o':
8623             case 'x':
8624             case 'X':
8625                 if (c == 'i')
8626                     c = 'd';
8627                 isnumok = 0;
8628                 if (PyNumber_Check(v)) {
8629                     PyObject *iobj=NULL;
8630 
8631                     if (_PyAnyInt_Check(v)) {
8632                         iobj = v;
8633                         Py_INCREF(iobj);
8634                     }
8635                     else {
8636                         iobj = PyNumber_Int(v);
8637                         if (iobj==NULL) {
8638                             PyErr_Clear();
8639                             iobj = PyNumber_Long(v);
8640                         }
8641                     }
8642                     if (iobj!=NULL) {
8643                         if (PyInt_Check(iobj)) {
8644                             isnumok = 1;
8645                             pbuf = formatbuf;
8646                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8647                                             flags, prec, c, iobj);
8648                             Py_DECREF(iobj);
8649                             if (len < 0)
8650                                 goto onError;
8651                             sign = 1;
8652                         }
8653                         else if (PyLong_Check(iobj)) {
8654                             isnumok = 1;
8655                             temp = formatlong(iobj, flags, prec, c);
8656                             Py_DECREF(iobj);
8657                             if (!temp)
8658                                 goto onError;
8659                             pbuf = PyUnicode_AS_UNICODE(temp);
8660                             len = PyUnicode_GET_SIZE(temp);
8661                             sign = 1;
8662                         }
8663                         else {
8664                             Py_DECREF(iobj);
8665                         }
8666                     }
8667                 }
8668                 if (!isnumok) {
8669                     PyErr_Format(PyExc_TypeError,
8670                                  "%%%c format: a number is required, "
8671                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8672                     goto onError;
8673                 }
8674                 if (flags & F_ZERO)
8675                     fill = '0';
8676                 break;
8677 
8678             case 'e':
8679             case 'E':
8680             case 'f':
8681             case 'F':
8682             case 'g':
8683             case 'G':
8684                 temp = formatfloat(v, flags, prec, c);
8685                 if (temp == NULL)
8686                     goto onError;
8687                 pbuf = PyUnicode_AS_UNICODE(temp);
8688                 len = PyUnicode_GET_SIZE(temp);
8689                 sign = 1;
8690                 if (flags & F_ZERO)
8691                     fill = '0';
8692                 break;
8693 
8694             case 'c':
8695                 pbuf = formatbuf;
8696                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8697                 if (len < 0)
8698                     goto onError;
8699                 break;
8700 
8701             default:
8702                 PyErr_Format(PyExc_ValueError,
8703                              "unsupported format character '%c' (0x%x) "
8704                              "at index %zd",
8705                              (31<=c && c<=126) ? (char)c : '?',
8706                              (int)c,
8707                              (Py_ssize_t)(fmt - 1 -
8708                                           PyUnicode_AS_UNICODE(uformat)));
8709                 goto onError;
8710             }
8711             if (sign) {
8712                 if (*pbuf == '-' || *pbuf == '+') {
8713                     sign = *pbuf++;
8714                     len--;
8715                 }
8716                 else if (flags & F_SIGN)
8717                     sign = '+';
8718                 else if (flags & F_BLANK)
8719                     sign = ' ';
8720                 else
8721                     sign = 0;
8722             }
8723             if (width < len)
8724                 width = len;
8725             if (rescnt - (sign != 0) < width) {
8726                 reslen -= rescnt;
8727                 rescnt = width + fmtcnt + 100;
8728                 reslen += rescnt;
8729                 if (reslen < 0) {
8730                     Py_XDECREF(temp);
8731                     PyErr_NoMemory();
8732                     goto onError;
8733                 }
8734                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8735                     Py_XDECREF(temp);
8736                     goto onError;
8737                 }
8738                 res = PyUnicode_AS_UNICODE(result)
8739                     + reslen - rescnt;
8740             }
8741             if (sign) {
8742                 if (fill != ' ')
8743                     *res++ = sign;
8744                 rescnt--;
8745                 if (width > len)
8746                     width--;
8747             }
8748             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8749                 assert(pbuf[0] == '0');
8750                 assert(pbuf[1] == c);
8751                 if (fill != ' ') {
8752                     *res++ = *pbuf++;
8753                     *res++ = *pbuf++;
8754                 }
8755                 rescnt -= 2;
8756                 width -= 2;
8757                 if (width < 0)
8758                     width = 0;
8759                 len -= 2;
8760             }
8761             if (width > len && !(flags & F_LJUST)) {
8762                 do {
8763                     --rescnt;
8764                     *res++ = fill;
8765                 } while (--width > len);
8766             }
8767             if (fill == ' ') {
8768                 if (sign)
8769                     *res++ = sign;
8770                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8771                     assert(pbuf[0] == '0');
8772                     assert(pbuf[1] == c);
8773                     *res++ = *pbuf++;
8774                     *res++ = *pbuf++;
8775                 }
8776             }
8777             Py_UNICODE_COPY(res, pbuf, len);
8778             res += len;
8779             rescnt -= len;
8780             while (--width >= len) {
8781                 --rescnt;
8782                 *res++ = ' ';
8783             }
8784             if (dict && (argidx < arglen) && c != '%') {
8785                 PyErr_SetString(PyExc_TypeError,
8786                                 "not all arguments converted during string formatting");
8787                 Py_XDECREF(temp);
8788                 goto onError;
8789             }
8790             Py_XDECREF(temp);
8791         } /* '%' */
8792     } /* until end */
8793     if (argidx < arglen && !dict) {
8794         PyErr_SetString(PyExc_TypeError,
8795                         "not all arguments converted during string formatting");
8796         goto onError;
8797     }
8798 
8799     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8800         goto onError;
8801     if (args_owned) {
8802         Py_DECREF(args);
8803     }
8804     Py_DECREF(uformat);
8805     return (PyObject *)result;
8806 
8807   onError:
8808     Py_XDECREF(result);
8809     Py_DECREF(uformat);
8810     if (args_owned) {
8811         Py_DECREF(args);
8812     }
8813     return NULL;
8814 }
8815 
8816 static PyBufferProcs unicode_as_buffer = {
8817     (readbufferproc) unicode_buffer_getreadbuf,
8818     (writebufferproc) unicode_buffer_getwritebuf,
8819     (segcountproc) unicode_buffer_getsegcount,
8820     (charbufferproc) unicode_buffer_getcharbuf,
8821 };
8822 
8823 static PyObject *
8824 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8825 
8826 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8827 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8828 {
8829     PyObject *x = NULL;
8830     static char *kwlist[] = {"string", "encoding", "errors", 0};
8831     char *encoding = NULL;
8832     char *errors = NULL;
8833 
8834     if (type != &PyUnicode_Type)
8835         return unicode_subtype_new(type, args, kwds);
8836     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8837                                      kwlist, &x, &encoding, &errors))
8838         return NULL;
8839     if (x == NULL)
8840         return (PyObject *)_PyUnicode_New(0);
8841     if (encoding == NULL && errors == NULL)
8842         return PyObject_Unicode(x);
8843     else
8844         return PyUnicode_FromEncodedObject(x, encoding, errors);
8845 }
8846 
8847 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8848 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8849 {
8850     PyUnicodeObject *tmp, *pnew;
8851     Py_ssize_t n;
8852 
8853     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8854     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8855     if (tmp == NULL)
8856         return NULL;
8857     assert(PyUnicode_Check(tmp));
8858     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8859     if (pnew == NULL) {
8860         Py_DECREF(tmp);
8861         return NULL;
8862     }
8863     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8864     if (pnew->str == NULL) {
8865         _Py_ForgetReference((PyObject *)pnew);
8866         PyObject_Del(pnew);
8867         Py_DECREF(tmp);
8868         return PyErr_NoMemory();
8869     }
8870     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8871     pnew->length = n;
8872     pnew->hash = tmp->hash;
8873     Py_DECREF(tmp);
8874     return (PyObject *)pnew;
8875 }
8876 
8877 PyDoc_STRVAR(unicode_doc,
8878              "unicode(object='') -> unicode object\n\
8879 unicode(string[, encoding[, errors]]) -> unicode object\n\
8880 \n\
8881 Create a new Unicode object from the given encoded string.\n\
8882 encoding defaults to the current default string encoding.\n\
8883 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8884 
8885 PyTypeObject PyUnicode_Type = {
8886     PyVarObject_HEAD_INIT(&PyType_Type, 0)
8887     "unicode",              /* tp_name */
8888     sizeof(PyUnicodeObject),        /* tp_size */
8889     0,                  /* tp_itemsize */
8890     /* Slots */
8891     (destructor)unicode_dealloc,    /* tp_dealloc */
8892     0,                  /* tp_print */
8893     0,                  /* tp_getattr */
8894     0,                  /* tp_setattr */
8895     0,                  /* tp_compare */
8896     unicode_repr,           /* tp_repr */
8897     &unicode_as_number,         /* tp_as_number */
8898     &unicode_as_sequence,       /* tp_as_sequence */
8899     &unicode_as_mapping,        /* tp_as_mapping */
8900     (hashfunc) unicode_hash,        /* tp_hash*/
8901     0,                  /* tp_call*/
8902     (reprfunc) unicode_str,     /* tp_str */
8903     PyObject_GenericGetAttr,        /* tp_getattro */
8904     0,                  /* tp_setattro */
8905     &unicode_as_buffer,         /* tp_as_buffer */
8906     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8907     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
8908     unicode_doc,            /* tp_doc */
8909     0,                  /* tp_traverse */
8910     0,                  /* tp_clear */
8911     PyUnicode_RichCompare,      /* tp_richcompare */
8912     0,                  /* tp_weaklistoffset */
8913     0,                  /* tp_iter */
8914     0,                  /* tp_iternext */
8915     unicode_methods,            /* tp_methods */
8916     0,                  /* tp_members */
8917     0,                  /* tp_getset */
8918     &PyBaseString_Type,         /* tp_base */
8919     0,                  /* tp_dict */
8920     0,                  /* tp_descr_get */
8921     0,                  /* tp_descr_set */
8922     0,                  /* tp_dictoffset */
8923     0,                  /* tp_init */
8924     0,                  /* tp_alloc */
8925     unicode_new,            /* tp_new */
8926     PyObject_Del,           /* tp_free */
8927 };
8928 
8929 /* Initialize the Unicode implementation */
8930 
_PyUnicode_Init(void)8931 void _PyUnicode_Init(void)
8932 {
8933     /* XXX - move this array to unicodectype.c ? */
8934     Py_UNICODE linebreak[] = {
8935         0x000A, /* LINE FEED */
8936         0x000D, /* CARRIAGE RETURN */
8937         0x001C, /* FILE SEPARATOR */
8938         0x001D, /* GROUP SEPARATOR */
8939         0x001E, /* RECORD SEPARATOR */
8940         0x0085, /* NEXT LINE */
8941         0x2028, /* LINE SEPARATOR */
8942         0x2029, /* PARAGRAPH SEPARATOR */
8943     };
8944 
8945     /* Init the implementation */
8946     if (!unicode_empty) {
8947         unicode_empty = _PyUnicode_New(0);
8948         if (!unicode_empty)
8949             return;
8950     }
8951 
8952     if (PyType_Ready(&PyUnicode_Type) < 0)
8953         Py_FatalError("Can't initialize 'unicode'");
8954 
8955     /* initialize the linebreak bloom filter */
8956     bloom_linebreak = make_bloom_mask(
8957         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8958         );
8959 
8960     PyType_Ready(&EncodingMapType);
8961 
8962     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8963         Py_FatalError("Can't initialize field name iterator type");
8964 
8965     if (PyType_Ready(&PyFormatterIter_Type) < 0)
8966         Py_FatalError("Can't initialize formatter iter type");
8967 }
8968 
8969 /* Finalize the Unicode implementation */
8970 
8971 int
PyUnicode_ClearFreeList(void)8972 PyUnicode_ClearFreeList(void)
8973 {
8974     int freelist_size = numfree;
8975     PyUnicodeObject *u;
8976 
8977     for (u = free_list; u != NULL;) {
8978         PyUnicodeObject *v = u;
8979         u = *(PyUnicodeObject **)u;
8980         if (v->str)
8981             PyObject_DEL(v->str);
8982         Py_XDECREF(v->defenc);
8983         PyObject_Del(v);
8984         numfree--;
8985     }
8986     free_list = NULL;
8987     assert(numfree == 0);
8988     return freelist_size;
8989 }
8990 
8991 void
_PyUnicode_Fini(void)8992 _PyUnicode_Fini(void)
8993 {
8994     int i;
8995 
8996     Py_CLEAR(unicode_empty);
8997 
8998     for (i = 0; i < 256; i++)
8999         Py_CLEAR(unicode_latin1[i]);
9000 
9001     (void)PyUnicode_ClearFreeList();
9002 }
9003 
9004 #ifdef __cplusplus
9005 }
9006 #endif
9007