1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 
10 Copyright (c) Corporation for National Research Initiatives.
11 
12 --------------------------------------------------------------------
13 The original string type implementation is:
14 
15   Copyright (c) 1999 by Secret Labs AB
16   Copyright (c) 1999 by Fredrik Lundh
17 
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21 
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30 
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39 
40 */
41 
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44 
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47 
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51 
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once numfree reaches this value. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   The limit is compared against the object's length (a count of
   Py_UNICODE code units, not bytes) in unicode_dealloc().

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, depending on WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82 
83 /* --- Globals ------------------------------------------------------------
84 
85 NOTE: In the interpreter's initialization phase, some globals are currently
86       initialized dynamically as needed. In the process Unicode objects may
87       be created before the Unicode type is ready.
88 
89 */
90 
91 
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95 
/* Free list for Unicode objects.

   free_list is the head of a singly linked list of recycled
   PyUnicodeObject structs; unicode_dealloc() and _PyUnicode_New() keep
   the link to the next entry in the first pointer-sized slot of each
   struct.  numfree is the number of entries currently on the list. */
static PyUnicodeObject *free_list = NULL;
static int numfree = 0;

/* The empty Unicode object is shared to improve performance.
   The pointer starts out NULL; _Py_RETURN_UNICODE_EMPTY() creates the
   object on first use if it has not been set up yet. */
static PyUnicodeObject *unicode_empty = NULL;
102 
/* Return a new reference to the shared empty string from the enclosing
   function, creating the singleton first if it does not exist yet.
   If the creation fails, NULL is returned (the error is left as set by
   _PyUnicode_New()).  On first creation the object ends up with two
   references: one held by the unicode_empty global, one returned. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL)                      \
            unicode_empty = _PyUnicode_New(0);          \
        Py_XINCREF(unicode_empty);                      \
        return (PyObject *)unicode_empty;               \
    } while (0)
114 
/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point; entries start
   out NULL and are filled in on demand by PyUnicode_FromUnicode() and
   PyUnicode_FromStringAndSize(). */
static PyUnicodeObject *unicode_latin1[256] = {NULL};

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

   The buffer holds an encoding name of up to 100 characters plus the
   terminating NUL.

*/
static char unicode_default_encoding[100 + 1] = "ascii";
127 
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the character counts as whitespace and 0 otherwise:

     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158 
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the character is treated as a line break and 0 otherwise:

     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR

   The table is only ever read (see BLOOM_LINEBREAK), so it is declared
   const to keep it in read-only storage and to catch accidental writes.
*/
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186 
187 
188 Py_UNICODE
PyUnicode_GetMax(void)189 PyUnicode_GetMax(void)
190 {
191 #ifdef Py_UNICODE_WIDE
192     return 0x10FFFF;
193 #else
194     /* This is actually an illegal character, so it should
195        not be passed to unichr. */
196     return 0xFFFF;
197 #endif
198 }
199 
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits actually used: the largest of
   128, 64 or 32 that does not exceed LONG_BIT.  Platforms whose long
   has fewer than 32 bits are rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
217 
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK.  It starts out with every bit set,
   so until it is narrowed down the filter never rejects a character and
   Py_UNICODE_ISLINEBREAK() alone decides. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch):     non-zero if the bit selected by ch is set in mask
                        (i.e. ch *may* be in the set; zero means it is
                        definitely not).
   Both arguments are fully parenthesized so that arbitrary expressions
   (e.g. "a | b") can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if ch is a line break: ASCII characters are looked up in the
   ascii_linebreak table, everything else goes through the bloom filter
   first and is confirmed with Py_UNICODE_ISLINEBREAK().
   Note: ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228 
make_bloom_mask(Py_UNICODE * ptr,Py_ssize_t len)229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 {
231     /* calculate simple bloom-style bitmask for a given unicode string */
232 
233     BLOOM_MASK mask;
234     Py_ssize_t i;
235 
236     mask = 0;
237     for (i = 0; i < len; i++)
238         BLOOM_ADD(mask, ptr[i]);
239 
240     return mask;
241 }
242 
unicode_member(Py_UNICODE chr,Py_UNICODE * set,Py_ssize_t setlen)243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 {
245     Py_ssize_t i;
246 
247     for (i = 0; i < setlen; i++)
248         if (set[i] == chr)
249             return 1;
250 
251     return 0;
252 }
253 
/* Non-zero if chr is a member of set (setlen characters long).  The bloom
   mask is consulted first as a cheap pre-filter; only possible hits are
   confirmed with the linear search in unicode_member().
   The whole expansion is parenthesized so the macro behaves as a single
   operand (e.g. under '!' or next to '||').  chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256 
257 /* --- Unicode Object ----------------------------------------------------- */
258 
259 static
unicode_resize(register PyUnicodeObject * unicode,Py_ssize_t length)260 int unicode_resize(register PyUnicodeObject *unicode,
261                    Py_ssize_t length)
262 {
263     void *oldstr;
264 
265     /* Shortcut if there's nothing much to do. */
266     if (unicode->length == length)
267         goto reset;
268 
269     /* Resizing shared object (unicode_empty or single character
270        objects) in-place is not allowed. Use PyUnicode_Resize()
271        instead ! */
272 
273     if (unicode == unicode_empty ||
274         (unicode->length == 1 &&
275          unicode->str[0] < 256U &&
276          unicode_latin1[unicode->str[0]] == unicode)) {
277         PyErr_SetString(PyExc_SystemError,
278                         "can't resize shared unicode objects");
279         return -1;
280     }
281 
282     /* We allocate one more byte to make sure the string is Ux0000 terminated.
283        The overallocation is also used by fastsearch, which assumes that it's
284        safe to look at str[length] (without making any assumptions about what
285        it contains). */
286 
287     oldstr = unicode->str;
288     unicode->str = PyObject_REALLOC(unicode->str,
289                                     sizeof(Py_UNICODE) * (length + 1));
290     if (!unicode->str) {
291         unicode->str = (Py_UNICODE *)oldstr;
292         PyErr_NoMemory();
293         return -1;
294     }
295     unicode->str[length] = 0;
296     unicode->length = length;
297 
298   reset:
299     /* Reset the object caches */
300     if (unicode->defenc) {
301         Py_CLEAR(unicode->defenc);
302     }
303     unicode->hash = -1;
304 
305     return 0;
306 }
307 
308 /* We allocate one more byte to make sure the string is
309    Ux0000 terminated; some code relies on that.
310 
311    XXX This allocator could further be enhanced by assuring that the
312    free list never reduces its size below 1.
313 
314 */
315 
316 static
_PyUnicode_New(Py_ssize_t length)317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
318 {
319     register PyUnicodeObject *unicode;
320 
321     /* Optimization for empty strings */
322     if (length == 0 && unicode_empty != NULL) {
323         Py_INCREF(unicode_empty);
324         return unicode_empty;
325     }
326 
327     /* Ensure we won't overflow the size. */
328     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329         return (PyUnicodeObject *)PyErr_NoMemory();
330     }
331 
332     /* Unicode freelist & memory allocation */
333     if (free_list) {
334         unicode = free_list;
335         free_list = *(PyUnicodeObject **)unicode;
336         numfree--;
337         if (unicode->str) {
338             /* Keep-Alive optimization: we only upsize the buffer,
339                never downsize it. */
340             if ((unicode->length < length) &&
341                 unicode_resize(unicode, length) < 0) {
342                 PyObject_DEL(unicode->str);
343                 unicode->str = NULL;
344             }
345         }
346         else {
347             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349         }
350         PyObject_INIT(unicode, &PyUnicode_Type);
351     }
352     else {
353         size_t new_size;
354         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
355         if (unicode == NULL)
356             return NULL;
357         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
359     }
360 
361     if (!unicode->str) {
362         PyErr_NoMemory();
363         goto onError;
364     }
365     /* Initialize the first element to guard against cases where
366      * the caller fails before initializing str -- unicode_resize()
367      * reads str[0], and the Keep-Alive optimization can keep memory
368      * allocated for str alive across a call to unicode_dealloc(unicode).
369      * We don't want unicode_resize to read uninitialized memory in
370      * that case.
371      */
372     unicode->str[0] = 0;
373     unicode->str[length] = 0;
374     unicode->length = length;
375     unicode->hash = -1;
376     unicode->defenc = NULL;
377     return unicode;
378 
379   onError:
380     /* XXX UNREF/NEWREF interface should be more symmetrical */
381     _Py_DEC_REFTOTAL;
382     _Py_ForgetReference((PyObject *)unicode);
383     PyObject_Del(unicode);
384     return NULL;
385 }
386 
387 static
unicode_dealloc(register PyUnicodeObject * unicode)388 void unicode_dealloc(register PyUnicodeObject *unicode)
389 {
390     if (PyUnicode_CheckExact(unicode) &&
391         numfree < PyUnicode_MAXFREELIST) {
392         /* Keep-Alive optimization */
393         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394             PyObject_DEL(unicode->str);
395             unicode->str = NULL;
396             unicode->length = 0;
397         }
398         if (unicode->defenc) {
399             Py_CLEAR(unicode->defenc);
400         }
401         /* Add to free list */
402         *(PyUnicodeObject **)unicode = free_list;
403         free_list = unicode;
404         numfree++;
405     }
406     else {
407         PyObject_DEL(unicode->str);
408         Py_XDECREF(unicode->defenc);
409         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410     }
411 }
412 
413 static
_PyUnicode_Resize(PyUnicodeObject ** unicode,Py_ssize_t length)414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415 {
416     register PyUnicodeObject *v;
417 
418     /* Argument checks */
419     if (unicode == NULL) {
420         PyErr_BadInternalCall();
421         return -1;
422     }
423     v = *unicode;
424     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425         PyErr_BadInternalCall();
426         return -1;
427     }
428 
429     /* Resizing unicode_empty and single character objects is not
430        possible since these are being shared. We simply return a fresh
431        copy with the same Unicode content. */
432     if (v->length != length &&
433         (v == unicode_empty || v->length == 1)) {
434         PyUnicodeObject *w = _PyUnicode_New(length);
435         if (w == NULL)
436             return -1;
437         Py_UNICODE_COPY(w->str, v->str,
438                         length < v->length ? length : v->length);
439         Py_DECREF(*unicode);
440         *unicode = w;
441         return 0;
442     }
443 
444     /* Note that we don't have to modify *unicode for unshared Unicode
445        objects, since we can modify them in-place. */
446     return unicode_resize(v, length);
447 }
448 
/* Public entry point for resizing: forwards to _PyUnicode_Resize(),
   which performs all argument checking.  Returns 0 on success, -1 on
   failure. */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    PyUnicodeObject **target = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(target, length);
}
453 
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)454 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
455                                 Py_ssize_t size)
456 {
457     PyUnicodeObject *unicode;
458 
459     /* If the Unicode data is known at construction time, we can apply
460        some optimizations which share commonly used objects. */
461     if (u != NULL) {
462 
463         /* Optimization for empty strings */
464         if (size == 0)
465             _Py_RETURN_UNICODE_EMPTY();
466 
467         /* Single character Unicode objects in the Latin-1 range are
468            shared when using this constructor */
469         if (size == 1 && *u < 256) {
470             unicode = unicode_latin1[*u];
471             if (!unicode) {
472                 unicode = _PyUnicode_New(1);
473                 if (!unicode)
474                     return NULL;
475                 unicode->str[0] = *u;
476                 unicode_latin1[*u] = unicode;
477             }
478             Py_INCREF(unicode);
479             return (PyObject *)unicode;
480         }
481     }
482 
483     unicode = _PyUnicode_New(size);
484     if (!unicode)
485         return NULL;
486 
487     /* Copy the Unicode data into the new object */
488     if (u != NULL)
489         Py_UNICODE_COPY(unicode->str, u, size);
490 
491     return (PyObject *)unicode;
492 }
493 
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)494 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495 {
496     PyUnicodeObject *unicode;
497 
498     if (size < 0) {
499         PyErr_SetString(PyExc_SystemError,
500                         "Negative size passed to PyUnicode_FromStringAndSize");
501         return NULL;
502     }
503 
504     /* If the Unicode data is known at construction time, we can apply
505        some optimizations which share commonly used objects.
506        Also, this means the input must be UTF-8, so fall back to the
507        UTF-8 decoder at the end. */
508     if (u != NULL) {
509 
510         /* Optimization for empty strings */
511         if (size == 0)
512             _Py_RETURN_UNICODE_EMPTY();
513 
514         /* Single characters are shared when using this constructor.
515            Restrict to ASCII, since the input must be UTF-8. */
516         if (size == 1 && Py_CHARMASK(*u) < 128) {
517             unicode = unicode_latin1[Py_CHARMASK(*u)];
518             if (!unicode) {
519                 unicode = _PyUnicode_New(1);
520                 if (!unicode)
521                     return NULL;
522                 unicode->str[0] = Py_CHARMASK(*u);
523                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524             }
525             Py_INCREF(unicode);
526             return (PyObject *)unicode;
527         }
528 
529         return PyUnicode_DecodeUTF8(u, size, NULL);
530     }
531 
532     unicode = _PyUnicode_New(size);
533     if (!unicode)
534         return NULL;
535 
536     return (PyObject *)unicode;
537 }
538 
PyUnicode_FromString(const char * u)539 PyObject *PyUnicode_FromString(const char *u)
540 {
541     size_t size = strlen(u);
542     if (size > PY_SSIZE_T_MAX) {
543         PyErr_SetString(PyExc_OverflowError, "input too long");
544         return NULL;
545     }
546 
547     return PyUnicode_FromStringAndSize(u, size);
548 }
549 
550 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
551  * by 'ptr', possibly combining surrogate pairs on narrow builds.
552  * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
553  * that should be returned and 'end' pointing to the end of the buffer.
554  * ('end' is used on narrow builds to detect a lone surrogate at the
555  * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
557  * The type of the returned char is always Py_UCS4.
558  *
559  * Note: the macro advances ptr to next char, so it might have side-effects
560  *       (especially if used with other macros).
561  */
562 
/* helper macros used by _Py_UNICODE_NEXT */

/* True if ch is a UTF-16 high (lead) / low (trail) surrogate code unit.
   The argument is parenthesized so that expressions can be passed
   safely; note that it is evaluated twice. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) \
    (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) \
    (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

/* _Py_UNICODE_NEXT(ptr, end): return the character at ptr as a Py_UCS4
   and advance ptr past it.  ptr is evaluated several times and must be
   an lvalue. */
#ifdef Py_UNICODE_WIDE
/* Wide build: every code unit is a full character. */
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* Narrow build: a high surrogate is combined with the following low
   surrogate, but only when that low surrogate still lies inside the
   buffer ((ptr) + 1 < (end)); a lone surrogate, including a high
   surrogate in the very last position, is returned unchanged. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       (ptr) + 1 < (end) &&                                             \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
580 
581 #ifdef HAVE_WCHAR_H
582 
583 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584 # define CONVERT_WCHAR_TO_SURROGATES
585 #endif
586 
587 #ifdef CONVERT_WCHAR_TO_SURROGATES
588 
589 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590    to convert from UTF32 to UTF16. */
591 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)592 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593                                  Py_ssize_t size)
594 {
595     PyUnicodeObject *unicode;
596     register Py_ssize_t i;
597     Py_ssize_t alloc;
598     const wchar_t *orig_w;
599 
600     if (w == NULL) {
601         PyErr_BadInternalCall();
602         return NULL;
603     }
604 
605     alloc = size;
606     orig_w = w;
607     for (i = size; i > 0; i--) {
608         if (*w > 0xFFFF)
609             alloc++;
610         w++;
611     }
612     w = orig_w;
613     unicode = _PyUnicode_New(alloc);
614     if (!unicode)
615         return NULL;
616 
617     /* Copy the wchar_t data into the new object */
618     {
619         register Py_UNICODE *u;
620         u = PyUnicode_AS_UNICODE(unicode);
621         for (i = size; i > 0; i--) {
622             if (*w > 0xFFFF) {
623                 wchar_t ordinal = *w++;
624                 ordinal -= 0x10000;
625                 *u++ = 0xD800 | (ordinal >> 10);
626                 *u++ = 0xDC00 | (ordinal & 0x3FF);
627             }
628             else
629                 *u++ = *w++;
630         }
631     }
632     return (PyObject *)unicode;
633 }
634 
635 #else
636 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)637 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
638                                  Py_ssize_t size)
639 {
640     PyUnicodeObject *unicode;
641 
642     if (w == NULL) {
643         PyErr_BadInternalCall();
644         return NULL;
645     }
646 
647     unicode = _PyUnicode_New(size);
648     if (!unicode)
649         return NULL;
650 
651     /* Copy the wchar_t data into the new object */
652 #ifdef HAVE_USABLE_WCHAR_T
653     memcpy(unicode->str, w, size * sizeof(wchar_t));
654 #else
655     {
656         register Py_UNICODE *u;
657         register Py_ssize_t i;
658         u = PyUnicode_AS_UNICODE(unicode);
659         for (i = size; i > 0; i--)
660             *u++ = *w++;
661     }
662 #endif
663 
664     return (PyObject *)unicode;
665 }
666 
667 #endif /* CONVERT_WCHAR_TO_SURROGATES */
668 
669 #undef CONVERT_WCHAR_TO_SURROGATES
670 
671 static void
makefmt(char * fmt,int longflag,int size_tflag,int zeropad,int width,int precision,char c)672 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673 {
674     *fmt++ = '%';
675     if (width) {
676         if (zeropad)
677             *fmt++ = '0';
678         fmt += sprintf(fmt, "%d", width);
679     }
680     if (precision)
681         fmt += sprintf(fmt, ".%d", precision);
682     if (longflag)
683         *fmt++ = 'l';
684     else if (size_tflag) {
685         char *f = PY_FORMAT_SIZE_T;
686         while (*f)
687             *fmt++ = *f++;
688     }
689     *fmt++ = c;
690     *fmt = '\0';
691 }
692 
/* Append the NUL-terminated byte string 'string' to the output, one
   character per byte (each byte is widened as an unsigned char).
   Relies on two variables of the enclosing scope: 'copy' (a scratch
   const char pointer) and 's' (the output cursor, advanced past the
   appended characters). */
#define appendstring(string) \
    do { \
        copy = (string); \
        while (*copy) { \
            *s++ = (unsigned char)*copy; \
            copy++; \
        } \
    } while (0)
699 
700 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)701 PyUnicode_FromFormatV(const char *format, va_list vargs)
702 {
703     va_list count;
704     Py_ssize_t callcount = 0;
705     PyObject **callresults = NULL;
706     PyObject **callresult = NULL;
707     Py_ssize_t n = 0;
708     int width = 0;
709     int precision = 0;
710     int zeropad;
711     const char* f;
712     Py_UNICODE *s;
713     PyObject *string;
714     /* used by sprintf */
715     char buffer[21];
716     /* use abuffer instead of buffer, if we need more space
717      * (which can happen if there's a format specifier with width). */
718     char *abuffer = NULL;
719     char *realbuffer;
720     Py_ssize_t abuffersize = 0;
721     char fmt[60]; /* should be enough for %0width.precisionld */
722     const char *copy;
723 
724 #ifdef VA_LIST_IS_ARRAY
725     Py_MEMCPY(count, vargs, sizeof(va_list));
726 #else
727 #ifdef  __va_copy
728     __va_copy(count, vargs);
729 #else
730     count = vargs;
731 #endif
732 #endif
733      /* step 1: count the number of %S/%R/%s format specifications
734       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735       * objects once during step 3 and put the result in an array) */
736     for (f = format; *f; f++) {
737          if (*f == '%') {
738              f++;
739              while (*f && *f != '%' && !isalpha((unsigned)*f))
740                  f++;
741              if (!*f)
742                  break;
743              if (*f == 's' || *f=='S' || *f=='R')
744                  ++callcount;
745          }
746     }
747     /* step 2: allocate memory for the results of
748      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
749     if (callcount) {
750         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751         if (!callresults) {
752             PyErr_NoMemory();
753             return NULL;
754         }
755         callresult = callresults;
756     }
757     /* step 3: figure out how large a buffer we need */
758     for (f = format; *f; f++) {
759         if (*f == '%') {
760             const char* p = f++;
761             width = 0;
762             while (isdigit((unsigned)*f))
763                 width = (width*10) + *f++ - '0';
764             precision = 0;
765             if (*f == '.') {
766                 f++;
767                 while (isdigit((unsigned)*f))
768                     precision = (precision*10) + *f++ - '0';
769             }
770 
771             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772              * they don't affect the amount of space we reserve.
773              */
774             if ((*f == 'l' || *f == 'z') &&
775                 (f[1] == 'd' || f[1] == 'u'))
776                 ++f;
777 
778             switch (*f) {
779             case 'c':
780             {
781                 int ordinal = va_arg(count, int);
782 #ifdef Py_UNICODE_WIDE
783                 if (ordinal < 0 || ordinal > 0x10ffff) {
784                     PyErr_SetString(PyExc_OverflowError,
785                                     "%c arg not in range(0x110000) "
786                                     "(wide Python build)");
787                     goto fail;
788                 }
789 #else
790                 if (ordinal < 0 || ordinal > 0xffff) {
791                     PyErr_SetString(PyExc_OverflowError,
792                                     "%c arg not in range(0x10000) "
793                                     "(narrow Python build)");
794                     goto fail;
795                 }
796 #endif
797                 /* fall through... */
798             }
799             case '%':
800                 n++;
801                 break;
802             case 'd': case 'u': case 'i': case 'x':
803                 (void) va_arg(count, int);
804                 if (width < precision)
805                     width = precision;
806                 /* 20 bytes is enough to hold a 64-bit
807                    integer.  Decimal takes the most space.
808                    This isn't enough for octal.
809                    If a width is specified we need more
810                    (which we allocate later). */
811                 if (width < 20)
812                     width = 20;
813                 n += width;
814                 if (abuffersize < width)
815                     abuffersize = width;
816                 break;
817             case 's':
818             {
819                 /* UTF-8 */
820                 const char *s = va_arg(count, const char*);
821                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822                 if (!str)
823                     goto fail;
824                 n += PyUnicode_GET_SIZE(str);
825                 /* Remember the str and switch to the next slot */
826                 *callresult++ = str;
827                 break;
828             }
829             case 'U':
830             {
831                 PyObject *obj = va_arg(count, PyObject *);
832                 assert(obj && PyUnicode_Check(obj));
833                 n += PyUnicode_GET_SIZE(obj);
834                 break;
835             }
836             case 'V':
837             {
838                 PyObject *obj = va_arg(count, PyObject *);
839                 const char *str = va_arg(count, const char *);
840                 assert(obj || str);
841                 assert(!obj || PyUnicode_Check(obj));
842                 if (obj)
843                     n += PyUnicode_GET_SIZE(obj);
844                 else
845                     n += strlen(str);
846                 break;
847             }
848             case 'S':
849             {
850                 PyObject *obj = va_arg(count, PyObject *);
851                 PyObject *str;
852                 assert(obj);
853                 str = PyObject_Str(obj);
854                 if (!str)
855                     goto fail;
856                 n += PyString_GET_SIZE(str);
857                 /* Remember the str and switch to the next slot */
858                 *callresult++ = str;
859                 break;
860             }
861             case 'R':
862             {
863                 PyObject *obj = va_arg(count, PyObject *);
864                 PyObject *repr;
865                 assert(obj);
866                 repr = PyObject_Repr(obj);
867                 if (!repr)
868                     goto fail;
869                 n += PyUnicode_GET_SIZE(repr);
870                 /* Remember the repr and switch to the next slot */
871                 *callresult++ = repr;
872                 break;
873             }
874             case 'p':
875                 (void) va_arg(count, int);
876                 /* maximum 64-bit pointer representation:
877                  * 0xffffffffffffffff
878                  * so 19 characters is enough.
879                  * XXX I count 18 -- what's the extra for?
880                  */
881                 n += 19;
882                 break;
883             default:
884                 /* if we stumble upon an unknown
885                    formatting code, copy the rest of
886                    the format string to the output
887                    string. (we cannot just skip the
888                    code, since there's no way to know
889                    what's in the argument list) */
890                 n += strlen(p);
891                 goto expand;
892             }
893         } else
894             n++;
895     }
896   expand:
897     if (abuffersize > 20) {
898         /* add 1 for sprintf's trailing null byte */
899         abuffer = PyObject_Malloc(abuffersize + 1);
900         if (!abuffer) {
901             PyErr_NoMemory();
902             goto fail;
903         }
904         realbuffer = abuffer;
905     }
906     else
907         realbuffer = buffer;
908     /* step 4: fill the buffer */
909     /* Since we've analyzed how much space we need for the worst case,
910        we don't have to resize the string.
911        There can be no errors beyond this point. */
912     string = PyUnicode_FromUnicode(NULL, n);
913     if (!string)
914         goto fail;
915 
916     s = PyUnicode_AS_UNICODE(string);
917     callresult = callresults;
918 
919     for (f = format; *f; f++) {
920         if (*f == '%') {
921             const char* p = f++;
922             int longflag = 0;
923             int size_tflag = 0;
924             zeropad = (*f == '0');
925             /* parse the width.precision part */
926             width = 0;
927             while (isdigit((unsigned)*f))
928                 width = (width*10) + *f++ - '0';
929             precision = 0;
930             if (*f == '.') {
931                 f++;
932                 while (isdigit((unsigned)*f))
933                     precision = (precision*10) + *f++ - '0';
934             }
935             /* handle the long flag, but only for %ld and %lu.
936                others can be added when necessary. */
937             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938                 longflag = 1;
939                 ++f;
940             }
941             /* handle the size_t flag. */
942             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943                 size_tflag = 1;
944                 ++f;
945             }
946 
947             switch (*f) {
948             case 'c':
949                 *s++ = va_arg(vargs, int);
950                 break;
951             case 'd':
952                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953                 if (longflag)
954                     sprintf(realbuffer, fmt, va_arg(vargs, long));
955                 else if (size_tflag)
956                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957                 else
958                     sprintf(realbuffer, fmt, va_arg(vargs, int));
959                 appendstring(realbuffer);
960                 break;
961             case 'u':
962                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963                 if (longflag)
964                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965                 else if (size_tflag)
966                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967                 else
968                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969                 appendstring(realbuffer);
970                 break;
971             case 'i':
972                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973                 sprintf(realbuffer, fmt, va_arg(vargs, int));
974                 appendstring(realbuffer);
975                 break;
976             case 'x':
977                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978                 sprintf(realbuffer, fmt, va_arg(vargs, int));
979                 appendstring(realbuffer);
980                 break;
981             case 's':
982             {
983                 /* unused, since we already have the result */
984                 (void) va_arg(vargs, char *);
985                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986                                 PyUnicode_GET_SIZE(*callresult));
987                 s += PyUnicode_GET_SIZE(*callresult);
988                 /* We're done with the unicode()/repr() => forget it */
989                 Py_DECREF(*callresult);
990                 /* switch to next unicode()/repr() result */
991                 ++callresult;
992                 break;
993             }
994             case 'U':
995             {
996                 PyObject *obj = va_arg(vargs, PyObject *);
997                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999                 s += size;
1000                 break;
1001             }
1002             case 'V':
1003             {
1004                 PyObject *obj = va_arg(vargs, PyObject *);
1005                 const char *str = va_arg(vargs, const char *);
1006                 if (obj) {
1007                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009                     s += size;
1010                 } else {
1011                     appendstring(str);
1012                 }
1013                 break;
1014             }
1015             case 'S':
1016             case 'R':
1017             {
1018                 const char *str = PyString_AS_STRING(*callresult);
1019                 /* unused, since we already have the result */
1020                 (void) va_arg(vargs, PyObject *);
1021                 appendstring(str);
1022                 /* We're done with the unicode()/repr() => forget it */
1023                 Py_DECREF(*callresult);
1024                 /* switch to next unicode()/repr() result */
1025                 ++callresult;
1026                 break;
1027             }
1028             case 'p':
1029                 sprintf(buffer, "%p", va_arg(vargs, void*));
1030                 /* %p is ill-defined:  ensure leading 0x. */
1031                 if (buffer[1] == 'X')
1032                     buffer[1] = 'x';
1033                 else if (buffer[1] != 'x') {
1034                     memmove(buffer+2, buffer, strlen(buffer)+1);
1035                     buffer[0] = '0';
1036                     buffer[1] = 'x';
1037                 }
1038                 appendstring(buffer);
1039                 break;
1040             case '%':
1041                 *s++ = '%';
1042                 break;
1043             default:
1044                 appendstring(p);
1045                 goto end;
1046             }
1047         } else
1048             *s++ = *f;
1049     }
1050 
1051   end:
1052     if (callresults)
1053         PyObject_Free(callresults);
1054     if (abuffer)
1055         PyObject_Free(abuffer);
1056     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057     return string;
1058   fail:
1059     if (callresults) {
1060         PyObject **callresult2 = callresults;
1061         while (callresult2 < callresult) {
1062             Py_DECREF(*callresult2);
1063             ++callresult2;
1064         }
1065         PyObject_Free(callresults);
1066     }
1067     if (abuffer)
1068         PyObject_Free(abuffer);
1069     return NULL;
1070 }
1071 
1072 #undef appendstring
1073 
1074 PyObject *
PyUnicode_FromFormat(const char * format,...)1075 PyUnicode_FromFormat(const char *format, ...)
1076 {
1077     PyObject* ret;
1078     va_list vargs;
1079 
1080 #ifdef HAVE_STDARG_PROTOTYPES
1081     va_start(vargs, format);
1082 #else
1083     va_start(vargs);
1084 #endif
1085     ret = PyUnicode_FromFormatV(format, vargs);
1086     va_end(vargs);
1087     return ret;
1088 }
1089 
/* Copy the contents of `unicode` into the wchar_t buffer `w`, writing at
   most `size` elements.  When `size` is larger than the string length,
   one extra trailing element (the 0-termination) is copied as well.

   Returns the number of elements copied, not counting that extra trailing
   element, or -1 with an exception set when `unicode` is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    Py_ssize_t length;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    length = PyUnicode_GET_SIZE(unicode);

    /* Room to spare: take the string plus its trailing terminator */
    if (size > length)
        size = length + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        const Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t i;

        /* Element-wise copy; Py_UNICODE and wchar_t differ here */
        for (i = 0; i < size; i++)
            w[i] = src[i];
    }
#endif

    /* The terminator, if copied, is not part of the reported count */
    return (size > length) ? length : size;
}
1120 
1121 #endif
1122 
PyUnicode_FromOrdinal(int ordinal)1123 PyObject *PyUnicode_FromOrdinal(int ordinal)
1124 {
1125     Py_UNICODE s[1];
1126 
1127 #ifdef Py_UNICODE_WIDE
1128     if (ordinal < 0 || ordinal > 0x10ffff) {
1129         PyErr_SetString(PyExc_ValueError,
1130                         "unichr() arg not in range(0x110000) "
1131                         "(wide Python build)");
1132         return NULL;
1133     }
1134 #else
1135     if (ordinal < 0 || ordinal > 0xffff) {
1136         PyErr_SetString(PyExc_ValueError,
1137                         "unichr() arg not in range(0x10000) "
1138                         "(narrow Python build)");
1139         return NULL;
1140     }
1141 #endif
1142 
1143     s[0] = (Py_UNICODE)ordinal;
1144     return PyUnicode_FromUnicode(s, 1);
1145 }
1146 
PyUnicode_FromObject(register PyObject * obj)1147 PyObject *PyUnicode_FromObject(register PyObject *obj)
1148 {
1149     /* XXX Perhaps we should make this API an alias of
1150        PyObject_Unicode() instead ?! */
1151     if (PyUnicode_CheckExact(obj)) {
1152         Py_INCREF(obj);
1153         return obj;
1154     }
1155     if (PyUnicode_Check(obj)) {
1156         /* For a Unicode subtype that's not a Unicode object,
1157            return a true Unicode object with the same data. */
1158         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159                                      PyUnicode_GET_SIZE(obj));
1160     }
1161     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162 }
1163 
PyUnicode_FromEncodedObject(register PyObject * obj,const char * encoding,const char * errors)1164 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1165                                       const char *encoding,
1166                                       const char *errors)
1167 {
1168     const char *s = NULL;
1169     Py_ssize_t len;
1170     PyObject *v;
1171 
1172     if (obj == NULL) {
1173         PyErr_BadInternalCall();
1174         return NULL;
1175     }
1176 
1177 #if 0
1178     /* For b/w compatibility we also accept Unicode objects provided
1179        that no encodings is given and then redirect to
1180        PyObject_Unicode() which then applies the additional logic for
1181        Unicode subclasses.
1182 
1183        NOTE: This API should really only be used for object which
1184        represent *encoded* Unicode !
1185 
1186     */
1187     if (PyUnicode_Check(obj)) {
1188         if (encoding) {
1189             PyErr_SetString(PyExc_TypeError,
1190                             "decoding Unicode is not supported");
1191             return NULL;
1192         }
1193         return PyObject_Unicode(obj);
1194     }
1195 #else
1196     if (PyUnicode_Check(obj)) {
1197         PyErr_SetString(PyExc_TypeError,
1198                         "decoding Unicode is not supported");
1199         return NULL;
1200     }
1201 #endif
1202 
1203     /* Coerce object */
1204     if (PyString_Check(obj)) {
1205         s = PyString_AS_STRING(obj);
1206         len = PyString_GET_SIZE(obj);
1207     }
1208     else if (PyByteArray_Check(obj)) {
1209         /* Python 2.x specific */
1210         PyErr_Format(PyExc_TypeError,
1211                      "decoding bytearray is not supported");
1212         return NULL;
1213     }
1214     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1215         /* Overwrite the error message with something more useful in
1216            case of a TypeError. */
1217         if (PyErr_ExceptionMatches(PyExc_TypeError))
1218             PyErr_Format(PyExc_TypeError,
1219                          "coercing to Unicode: need string or buffer, "
1220                          "%.80s found",
1221                          Py_TYPE(obj)->tp_name);
1222         goto onError;
1223     }
1224 
1225     /* Convert to Unicode */
1226     if (len == 0)
1227         _Py_RETURN_UNICODE_EMPTY();
1228 
1229     v = PyUnicode_Decode(s, len, encoding, errors);
1230     return v;
1231 
1232   onError:
1233     return NULL;
1234 }
1235 
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)1236 PyObject *PyUnicode_Decode(const char *s,
1237                            Py_ssize_t size,
1238                            const char *encoding,
1239                            const char *errors)
1240 {
1241     PyObject *buffer = NULL, *unicode;
1242 
1243     if (encoding == NULL)
1244         encoding = PyUnicode_GetDefaultEncoding();
1245 
1246     /* Shortcuts for common default encodings */
1247     if (strcmp(encoding, "utf-8") == 0)
1248         return PyUnicode_DecodeUTF8(s, size, errors);
1249     else if (strcmp(encoding, "latin-1") == 0)
1250         return PyUnicode_DecodeLatin1(s, size, errors);
1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252     else if (strcmp(encoding, "mbcs") == 0)
1253         return PyUnicode_DecodeMBCS(s, size, errors);
1254 #endif
1255     else if (strcmp(encoding, "ascii") == 0)
1256         return PyUnicode_DecodeASCII(s, size, errors);
1257 
1258     /* Decode via the codec registry */
1259     buffer = PyBuffer_FromMemory((void *)s, size);
1260     if (buffer == NULL)
1261         goto onError;
1262     unicode = PyCodec_Decode(buffer, encoding, errors);
1263     if (unicode == NULL)
1264         goto onError;
1265     if (!PyUnicode_Check(unicode)) {
1266         PyErr_Format(PyExc_TypeError,
1267                      "decoder did not return an unicode object (type=%.400s)",
1268                      Py_TYPE(unicode)->tp_name);
1269         Py_DECREF(unicode);
1270         goto onError;
1271     }
1272     Py_DECREF(buffer);
1273     return unicode;
1274 
1275   onError:
1276     Py_XDECREF(buffer);
1277     return NULL;
1278 }
1279 
/* Run the unicode object `unicode` through the decoder registered for
   `encoding` (the default encoding when NULL) and return whatever object
   the codec produces.  Sets an exception and returns NULL when `unicode`
   is not a unicode object or the codec call fails. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(
        unicode,
        encoding != NULL ? encoding : PyUnicode_GetDefaultEncoding(),
        errors);
}
1303 
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)1304 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1305                            Py_ssize_t size,
1306                            const char *encoding,
1307                            const char *errors)
1308 {
1309     PyObject *v, *unicode;
1310 
1311     unicode = PyUnicode_FromUnicode(s, size);
1312     if (unicode == NULL)
1313         return NULL;
1314     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1315     Py_DECREF(unicode);
1316     return v;
1317 }
1318 
/* Run the unicode object `unicode` through the encoder registered for
   `encoding` (the default encoding when NULL) and return whatever object
   the codec produces; the result type is not checked here.  Sets an
   exception and returns NULL when `unicode` is not a unicode object or
   the codec call fails. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* Encode via the codec registry */
    return PyCodec_Encode(
        unicode,
        encoding != NULL ? encoding : PyUnicode_GetDefaultEncoding(),
        errors);
}
1342 
/* Encode `unicode` into a byte string using `encoding` (the default
   encoding when NULL) and the `errors` handler name.

   When `errors` is NULL, utf-8, latin-1, ascii (and mbcs where available)
   go straight to their dedicated encoders; otherwise the codec registry
   is used and its result must be a str.  Returns NULL with an exception
   set on failure. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings; only taken with the
       default error handling */
    if (errors == NULL) {
        if (!strcmp(encoding, "utf-8"))
            return PyUnicode_AsUTF8String(unicode);
        if (!strcmp(encoding, "latin-1"))
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (!strcmp(encoding, "mbcs"))
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (!strcmp(encoding, "ascii"))
            return PyUnicode_AsASCIIString(unicode);
    }

    /* Encode via the codec registry and insist on a str result */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded != NULL && !PyString_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder did not return a string object (type=%.400s)",
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        encoded = NULL;
    }
    return encoded;
}
1387 
/* Return `unicode` encoded with the default encoding.

   A previously cached result (the object's `defenc` slot) is returned
   directly.  Otherwise the string is encoded now, and the result is
   stored in `defenc` only when `errors` is NULL.  Returns NULL if the
   encoding step fails. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded;

    if (self->defenc)
        return self->defenc;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;
    return encoded;
}
1400 
PyUnicode_AsUnicode(PyObject * unicode)1401 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1402 {
1403     if (!PyUnicode_Check(unicode)) {
1404         PyErr_BadArgument();
1405         goto onError;
1406     }
1407     return PyUnicode_AS_UNICODE(unicode);
1408 
1409   onError:
1410     return NULL;
1411 }
1412 
PyUnicode_GetSize(PyObject * unicode)1413 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1414 {
1415     if (!PyUnicode_Check(unicode)) {
1416         PyErr_BadArgument();
1417         goto onError;
1418     }
1419     return PyUnicode_GET_SIZE(unicode);
1420 
1421   onError:
1422     return -1;
1423 }
1424 
PyUnicode_GetDefaultEncoding(void)1425 const char *PyUnicode_GetDefaultEncoding(void)
1426 {
1427     return unicode_default_encoding;
1428 }
1429 
PyUnicode_SetDefaultEncoding(const char * encoding)1430 int PyUnicode_SetDefaultEncoding(const char *encoding)
1431 {
1432     PyObject *v;
1433 
1434     /* Make sure the encoding is valid. As side effect, this also
1435        loads the encoding into the codec registry cache. */
1436     v = _PyCodec_Lookup(encoding);
1437     if (v == NULL)
1438         goto onError;
1439     Py_DECREF(v);
1440     strncpy(unicode_default_encoding,
1441             encoding,
1442             sizeof(unicode_default_encoding) - 1);
1443     return 0;
1444 
1445   onError:
1446     return -1;
1447 }
1448 
/* Error handling callback helper for the decoders.

   Builds (or updates) the UnicodeDecodeError object, calls the error
   handler named by `errors`, validates the (unicode, int) tuple it
   returns, makes sure the output has room for the replacement plus the
   rest of the input, copies the replacement text to the output and
   adjusts the caller's state.

   In/out parameters:
     errorHandler     cached handler; looked up from `errors` when NULL
     exceptionObject  cached exception; created when NULL, else updated
     startinpos,
     endinpos         span of the offending input; *endinpos is set to
                      the position at which decoding resumes
     inptr            set to input + resume position
     output           result object; may be resized
     outpos, outptr   write position in the output, advanced past the
                      replacement (outptr is recomputed after a resize)

   Returns 0 on success, -1 on error (with an exception set).
*/

static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
                                     const char *encoding, const char *reason,
                                     const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
{
    /* PyArg_ParseTuple format; the text after the ';' doubles as the
       error message for a non-tuple result (see &argparse[4] below). */
    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    Py_UNICODE *repptr;
    Py_ssize_t repsize;
    int res = -1;

    /* Look the handler up only once; the caller keeps it for later calls */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* Create the exception on first use, otherwise refresh its fields */
    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeDecodeError_Create(
            encoding, input, insize, *startinpos, *endinpos, reason);
        if (*exceptionObject == NULL)
            goto onError;
    }
    else {
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
            goto onError;
    }

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* skip the "O!n;" prefix to reuse the message part of argparse */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    /* NOTE(review): repunicode is presumably a borrowed reference owned by
       restuple ("O!" format); it is only used before restuple is released. */
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;
    /* a negative resume position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    repptr = PyUnicode_AS_UNICODE(repunicode);
    repsize = PyUnicode_GET_SIZE(repunicode);
    requiredsize = *outpos;
    /* each addition is checked against PY_SSIZE_T_MAX before it is made */
    if (requiredsize > PY_SSIZE_T_MAX - repsize)
        goto overflow;
    requiredsize += repsize;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow to at least twice the current size when that cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (_PyUnicode_Resize(output, requiredsize) < 0)
            goto onError;
        /* recompute the write pointer from the (possibly new) output object */
        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
    }
    *endinpos = newpos;
    *inptr = input + newpos;
    Py_UNICODE_COPY(*outptr, repptr, repsize);
    *outptr += repsize;
    *outpos += repsize;
    /* we made it! */
    res = 0;

    /* shared exit: reached on success (res == 0) and on failure (res == -1) */
  onError:
    Py_XDECREF(restuple);
    return res;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    goto onError;
}
1548 
1549 /* --- UTF-7 Codec -------------------------------------------------------- */
1550 
1551 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1552 
1553 /* Three simple macros defining base-64. */
1554 
/* Is c a base-64 character?
 *
 * The test uses explicit ASCII ranges instead of isalnum(): isalnum() is
 * locale-dependent (it may accept bytes >= 0x80 in some locales) and has
 * undefined behaviour for values that are neither representable as
 * unsigned char nor equal to EOF.  Only A-Z, a-z, 0-9, '+' and '/' are
 * accepted; note that c is evaluated more than once. */

#define IS_BASE64(c)                                    \
    (((c) >= 'A' && (c) <= 'Z') ||                      \
     ((c) >= 'a' && (c) <= 'z') ||                      \
     ((c) >= '0' && (c) <= '9') ||                      \
     (c) == '+' || (c) == '/')
1559 
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62.  Every other value
   yields 63, which is correct for '/' -- callers are expected to pass
   base-64 characters only.  c is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1567 
/* Base-64 character for the low 6 bits of n; higher bits are masked
   off before the alphabet lookup. */

#define TO_BASE64(n)                                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                      \
      "abcdefghijklmnopqrstuvwxyz"                      \
      "0123456789+/")[(n) & 0x3f])
1572 
/* DECODE_DIRECT: true when a byte met in UTF-7 input stands for itself.
 * Decoding is permissive: every value up to 127 qualifies except '+',
 * which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1580 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0-127) and is read-only, hence
 * declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1614 
/* ENCODE_DIRECT: should character c be written to UTF-7 output as
 * itself?  Only values 1..127 are candidates.  Category 0 characters
 * (Set D) always are; category 1 (Set O) only when directO is set and
 * category 2 (whitespace) only when directWS is set, since RFC2152
 * leaves those two choices to the application.  c is evaluated more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
1626 
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)1627 PyObject *PyUnicode_DecodeUTF7(const char *s,
1628                                Py_ssize_t size,
1629                                const char *errors)
1630 {
1631     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1632 }
1633 
1634 /* The decoder.  The only state we preserve is our read position,
1635  * i.e. how many characters we have consumed.  So if we end in the
1636  * middle of a shift sequence we have to back off the read position
1637  * and the output to the beginning of the sequence, otherwise we lose
1638  * all the shift state (seen bits, number of bits seen, high
1639  * surrogate). */
1640 
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1641 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1642                                        Py_ssize_t size,
1643                                        const char *errors,
1644                                        Py_ssize_t *consumed)
1645 {
1646     const char *starts = s;
1647     Py_ssize_t startinpos;
1648     Py_ssize_t endinpos;
1649     Py_ssize_t outpos;
1650     const char *e;
1651     PyUnicodeObject *unicode;
1652     Py_UNICODE *p;
1653     const char *errmsg = "";
1654     int inShift = 0;
1655     Py_UNICODE *shiftOutStart;
1656     unsigned int base64bits = 0;
1657     unsigned long base64buffer = 0;
1658     Py_UNICODE surrogate = 0;
1659     PyObject *errorHandler = NULL;
1660     PyObject *exc = NULL;
1661 
1662     unicode = _PyUnicode_New(size);
1663     if (!unicode)
1664         return NULL;
1665     if (size == 0) {
1666         if (consumed)
1667             *consumed = 0;
1668         return (PyObject *)unicode;
1669     }
1670 
1671     p = unicode->str;
1672     shiftOutStart = p;
1673     e = s + size;
1674 
1675     while (s < e) {
1676         Py_UNICODE ch = (unsigned char) *s;
1677 
1678         if (inShift) { /* in a base-64 section */
1679             if (IS_BASE64(ch)) { /* consume a base-64 character */
1680                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1681                 base64bits += 6;
1682                 s++;
1683                 if (base64bits >= 16) {
1684                     /* we have enough bits for a UTF-16 value */
1685                     Py_UNICODE outCh = (Py_UNICODE)
1686                                        (base64buffer >> (base64bits-16));
1687                     base64bits -= 16;
1688                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1689                     assert(outCh <= 0xffff);
1690                     if (surrogate) {
1691                         /* expecting a second surrogate */
1692                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1693 #ifdef Py_UNICODE_WIDE
1694                             *p++ = (((surrogate & 0x3FF)<<10)
1695                                     | (outCh & 0x3FF)) + 0x10000;
1696 #else
1697                             *p++ = surrogate;
1698                             *p++ = outCh;
1699 #endif
1700                             surrogate = 0;
1701                             continue;
1702                         }
1703                         else {
1704                             *p++ = surrogate;
1705                             surrogate = 0;
1706                         }
1707                     }
1708                     if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1709                         /* first surrogate */
1710                         surrogate = outCh;
1711                     }
1712                     else {
1713                         *p++ = outCh;
1714                     }
1715                 }
1716             }
1717             else { /* now leaving a base-64 section */
1718                 inShift = 0;
1719                 s++;
1720                 if (surrogate) {
1721                     *p++ = surrogate;
1722                     surrogate = 0;
1723                 }
1724                 if (base64bits > 0) { /* left-over bits */
1725                     if (base64bits >= 6) {
1726                         /* We've seen at least one base-64 character */
1727                         errmsg = "partial character in shift sequence";
1728                         goto utf7Error;
1729                     }
1730                     else {
1731                         /* Some bits remain; they should be zero */
1732                         if (base64buffer != 0) {
1733                             errmsg = "non-zero padding bits in shift sequence";
1734                             goto utf7Error;
1735                         }
1736                     }
1737                 }
1738                 if (ch != '-') {
1739                     /* '-' is absorbed; other terminating
1740                        characters are preserved */
1741                     *p++ = ch;
1742                 }
1743             }
1744         }
1745         else if ( ch == '+' ) {
1746             startinpos = s-starts;
1747             s++; /* consume '+' */
1748             if (s < e && *s == '-') { /* '+-' encodes '+' */
1749                 s++;
1750                 *p++ = '+';
1751             }
1752             else { /* begin base64-encoded section */
1753                 inShift = 1;
1754                 shiftOutStart = p;
1755                 base64bits = 0;
1756                 base64buffer = 0;
1757             }
1758         }
1759         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1760             *p++ = ch;
1761             s++;
1762         }
1763         else {
1764             startinpos = s-starts;
1765             s++;
1766             errmsg = "unexpected special character";
1767             goto utf7Error;
1768         }
1769         continue;
1770 utf7Error:
1771         outpos = p-PyUnicode_AS_UNICODE(unicode);
1772         endinpos = s-starts;
1773         if (unicode_decode_call_errorhandler(
1774                 errors, &errorHandler,
1775                 "utf7", errmsg,
1776                 starts, size, &startinpos, &endinpos, &exc, &s,
1777                 &unicode, &outpos, &p))
1778             goto onError;
1779     }
1780 
1781     /* end of string */
1782 
1783     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1784         /* if we're in an inconsistent state, that's an error */
1785         if (surrogate ||
1786                 (base64bits >= 6) ||
1787                 (base64bits > 0 && base64buffer != 0)) {
1788             outpos = p-PyUnicode_AS_UNICODE(unicode);
1789             endinpos = size;
1790             if (unicode_decode_call_errorhandler(
1791                     errors, &errorHandler,
1792                     "utf7", "unterminated shift sequence",
1793                     starts, size, &startinpos, &endinpos, &exc, &s,
1794                     &unicode, &outpos, &p))
1795                 goto onError;
1796         }
1797     }
1798 
1799     /* return state */
1800     if (consumed) {
1801         if (inShift) {
1802             p = shiftOutStart; /* back off output */
1803             *consumed = startinpos;
1804         }
1805         else {
1806             *consumed = s-starts;
1807         }
1808     }
1809 
1810     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1811         goto onError;
1812 
1813     Py_XDECREF(errorHandler);
1814     Py_XDECREF(exc);
1815     return (PyObject *)unicode;
1816 
1817   onError:
1818     Py_XDECREF(errorHandler);
1819     Py_XDECREF(exc);
1820     Py_DECREF(unicode);
1821     return NULL;
1822 }
1823 
1824 
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)1825 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1826                                Py_ssize_t size,
1827                                int base64SetO,
1828                                int base64WhiteSpace,
1829                                const char *errors)
1830 {
1831     PyObject *v;
1832     /* It might be possible to tighten this worst case */
1833     Py_ssize_t allocated = 8 * size;
1834     int inShift = 0;
1835     Py_ssize_t i = 0;
1836     unsigned int base64bits = 0;
1837     unsigned long base64buffer = 0;
1838     char * out;
1839     char * start;
1840 
1841     if (allocated / 8 != size)
1842         return PyErr_NoMemory();
1843 
1844     if (size == 0)
1845         return PyString_FromStringAndSize(NULL, 0);
1846 
1847     v = PyString_FromStringAndSize(NULL, allocated);
1848     if (v == NULL)
1849         return NULL;
1850 
1851     start = out = PyString_AS_STRING(v);
1852     for (;i < size; ++i) {
1853         Py_UNICODE ch = s[i];
1854 
1855         if (inShift) {
1856             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1857                 /* shifting out */
1858                 if (base64bits) { /* output remaining bits */
1859                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1860                     base64buffer = 0;
1861                     base64bits = 0;
1862                 }
1863                 inShift = 0;
1864                 /* Characters not in the BASE64 set implicitly unshift the sequence
1865                    so no '-' is required, except if the character is itself a '-' */
1866                 if (IS_BASE64(ch) || ch == '-') {
1867                     *out++ = '-';
1868                 }
1869                 *out++ = (char) ch;
1870             }
1871             else {
1872                 goto encode_char;
1873             }
1874         }
1875         else { /* not in a shift sequence */
1876             if (ch == '+') {
1877                 *out++ = '+';
1878                         *out++ = '-';
1879             }
1880             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1881                 *out++ = (char) ch;
1882             }
1883             else {
1884                 *out++ = '+';
1885                 inShift = 1;
1886                 goto encode_char;
1887             }
1888         }
1889         continue;
1890 encode_char:
1891 #ifdef Py_UNICODE_WIDE
1892         if (ch >= 0x10000) {
1893             /* code first surrogate */
1894             base64bits += 16;
1895             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1896             while (base64bits >= 6) {
1897                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1898                 base64bits -= 6;
1899             }
1900             /* prepare second surrogate */
1901             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1902         }
1903 #endif
1904         base64bits += 16;
1905         base64buffer = (base64buffer << 16) | ch;
1906         while (base64bits >= 6) {
1907             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1908             base64bits -= 6;
1909         }
1910     }
1911     if (base64bits)
1912         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1913     if (inShift)
1914         *out++ = '-';
1915 
1916     if (_PyString_Resize(&v, out - start))
1917         return NULL;
1918     return v;
1919 }
1920 
1921 #undef IS_BASE64
1922 #undef FROM_BASE64
1923 #undef TO_BASE64
1924 #undef DECODE_DIRECT
1925 #undef ENCODE_DIRECT
1926 
1927 /* --- UTF-8 Codec -------------------------------------------------------- */
1928 
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total number of bytes in the sequence that byte introduces. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 (illegal) + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF (illegal) */
};
1950 
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)1951 PyObject *PyUnicode_DecodeUTF8(const char *s,
1952                                Py_ssize_t size,
1953                                const char *errors)
1954 {
1955     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1956 }
1957 
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1958 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1959                                        Py_ssize_t size,
1960                                        const char *errors,
1961                                        Py_ssize_t *consumed)
1962 {
1963     const char *starts = s;
1964     int n;
1965     int k;
1966     Py_ssize_t startinpos;
1967     Py_ssize_t endinpos;
1968     Py_ssize_t outpos;
1969     const char *e;
1970     PyUnicodeObject *unicode;
1971     Py_UNICODE *p;
1972     const char *errmsg = "";
1973     PyObject *errorHandler = NULL;
1974     PyObject *exc = NULL;
1975 
1976     /* Note: size will always be longer than the resulting Unicode
1977        character count */
1978     unicode = _PyUnicode_New(size);
1979     if (!unicode)
1980         return NULL;
1981     if (size == 0) {
1982         if (consumed)
1983             *consumed = 0;
1984         return (PyObject *)unicode;
1985     }
1986 
1987     /* Unpack UTF-8 encoded data */
1988     p = unicode->str;
1989     e = s + size;
1990 
1991     while (s < e) {
1992         Py_UCS4 ch = (unsigned char)*s;
1993 
1994         if (ch < 0x80) {
1995             *p++ = (Py_UNICODE)ch;
1996             s++;
1997             continue;
1998         }
1999 
2000         n = utf8_code_length[ch];
2001 
2002         if (s + n > e) {
2003             if (consumed)
2004                 break;
2005             else {
2006                 errmsg = "unexpected end of data";
2007                 startinpos = s-starts;
2008                 endinpos = startinpos+1;
2009                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2010                     endinpos++;
2011                 goto utf8Error;
2012             }
2013         }
2014 
2015         switch (n) {
2016 
2017         case 0:
2018             errmsg = "invalid start byte";
2019             startinpos = s-starts;
2020             endinpos = startinpos+1;
2021             goto utf8Error;
2022 
2023         case 1:
2024             errmsg = "internal error";
2025             startinpos = s-starts;
2026             endinpos = startinpos+1;
2027             goto utf8Error;
2028 
2029         case 2:
2030             if ((s[1] & 0xc0) != 0x80) {
2031                 errmsg = "invalid continuation byte";
2032                 startinpos = s-starts;
2033                 endinpos = startinpos + 1;
2034                 goto utf8Error;
2035             }
2036             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2037             assert ((ch > 0x007F) && (ch <= 0x07FF));
2038             *p++ = (Py_UNICODE)ch;
2039             break;
2040 
2041         case 3:
2042             /* XXX: surrogates shouldn't be valid UTF-8!
2043                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2044                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2045                Uncomment the 2 lines below to make them invalid,
2046                code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
2047             if ((s[1] & 0xc0) != 0x80 ||
2048                 (s[2] & 0xc0) != 0x80 ||
2049                 ((unsigned char)s[0] == 0xE0 &&
2050                  (unsigned char)s[1] < 0xA0)/* ||
2051                 ((unsigned char)s[0] == 0xED &&
2052                  (unsigned char)s[1] > 0x9F)*/) {
2053                 errmsg = "invalid continuation byte";
2054                 startinpos = s-starts;
2055                 endinpos = startinpos + 1;
2056 
2057                 /* if s[1] first two bits are 1 and 0, then the invalid
2058                    continuation byte is s[2], so increment endinpos by 1,
2059                    if not, s[1] is invalid and endinpos doesn't need to
2060                    be incremented. */
2061                 if ((s[1] & 0xC0) == 0x80)
2062                     endinpos++;
2063                 goto utf8Error;
2064             }
2065             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2066             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2067             *p++ = (Py_UNICODE)ch;
2068             break;
2069 
2070         case 4:
2071             if ((s[1] & 0xc0) != 0x80 ||
2072                 (s[2] & 0xc0) != 0x80 ||
2073                 (s[3] & 0xc0) != 0x80 ||
2074                 ((unsigned char)s[0] == 0xF0 &&
2075                  (unsigned char)s[1] < 0x90) ||
2076                 ((unsigned char)s[0] == 0xF4 &&
2077                  (unsigned char)s[1] > 0x8F)) {
2078                 errmsg = "invalid continuation byte";
2079                 startinpos = s-starts;
2080                 endinpos = startinpos + 1;
2081                 if ((s[1] & 0xC0) == 0x80) {
2082                     endinpos++;
2083                     if ((s[2] & 0xC0) == 0x80)
2084                         endinpos++;
2085                 }
2086                 goto utf8Error;
2087             }
2088             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2089                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2090             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2091 
2092 #ifdef Py_UNICODE_WIDE
2093             *p++ = (Py_UNICODE)ch;
2094 #else
2095             /*  compute and append the two surrogates: */
2096 
2097             /*  translate from 10000..10FFFF to 0..FFFF */
2098             ch -= 0x10000;
2099 
2100             /*  high surrogate = top 10 bits added to D800 */
2101             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2102 
2103             /*  low surrogate = bottom 10 bits added to DC00 */
2104             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2105 #endif
2106             break;
2107         }
2108         s += n;
2109         continue;
2110 
2111       utf8Error:
2112         outpos = p-PyUnicode_AS_UNICODE(unicode);
2113         if (unicode_decode_call_errorhandler(
2114                 errors, &errorHandler,
2115                 "utf8", errmsg,
2116                 starts, size, &startinpos, &endinpos, &exc, &s,
2117                 &unicode, &outpos, &p))
2118             goto onError;
2119     }
2120     if (consumed)
2121         *consumed = s-starts;
2122 
2123     /* Adjust length */
2124     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2125         goto onError;
2126 
2127     Py_XDECREF(errorHandler);
2128     Py_XDECREF(exc);
2129     return (PyObject *)unicode;
2130 
2131   onError:
2132     Py_XDECREF(errorHandler);
2133     Py_XDECREF(exc);
2134     Py_DECREF(unicode);
2135     return NULL;
2136 }
2137 
2138 /* Allocation strategy:  if the string is short, convert into a stack buffer
2139    and allocate exactly as much space needed at the end.  Else allocate the
2140    maximum possible needed (4 result bytes per Unicode character), and return
2141    the excess memory at the end.
2142 */
2143 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)2144 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2145                      Py_ssize_t size,
2146                      const char *errors)
2147 {
2148 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2149 
2150     Py_ssize_t i;           /* index into s of next input byte */
2151     PyObject *v;        /* result string object */
2152     char *p;            /* next free byte in output buffer */
2153     Py_ssize_t nallocated;  /* number of result bytes allocated */
2154     Py_ssize_t nneeded;        /* number of result bytes needed */
2155     char stackbuf[MAX_SHORT_UNICHARS * 4];
2156 
2157     assert(s != NULL);
2158     assert(size >= 0);
2159 
2160     if (size <= MAX_SHORT_UNICHARS) {
2161         /* Write into the stack buffer; nallocated can't overflow.
2162          * At the end, we'll allocate exactly as much heap space as it
2163          * turns out we need.
2164          */
2165         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2166         v = NULL;   /* will allocate after we're done */
2167         p = stackbuf;
2168     }
2169     else {
2170         /* Overallocate on the heap, and give the excess back at the end. */
2171         nallocated = size * 4;
2172         if (nallocated / 4 != size)  /* overflow! */
2173             return PyErr_NoMemory();
2174         v = PyString_FromStringAndSize(NULL, nallocated);
2175         if (v == NULL)
2176             return NULL;
2177         p = PyString_AS_STRING(v);
2178     }
2179 
2180     for (i = 0; i < size;) {
2181         Py_UCS4 ch = s[i++];
2182 
2183         if (ch < 0x80)
2184             /* Encode ASCII */
2185             *p++ = (char) ch;
2186 
2187         else if (ch < 0x0800) {
2188             /* Encode Latin-1 */
2189             *p++ = (char)(0xc0 | (ch >> 6));
2190             *p++ = (char)(0x80 | (ch & 0x3f));
2191         }
2192         else {
2193             /* Encode UCS2 Unicode ordinals */
2194             if (ch < 0x10000) {
2195                 /* Special case: check for high surrogate */
2196                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2197                     Py_UCS4 ch2 = s[i];
2198                     /* Check for low surrogate and combine the two to
2199                        form a UCS4 value */
2200                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2201                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2202                         i++;
2203                         goto encodeUCS4;
2204                     }
2205                     /* Fall through: handles isolated high surrogates */
2206                 }
2207                 *p++ = (char)(0xe0 | (ch >> 12));
2208                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2209                 *p++ = (char)(0x80 | (ch & 0x3f));
2210                 continue;
2211             }
2212           encodeUCS4:
2213             /* Encode UCS4 Unicode ordinals */
2214             *p++ = (char)(0xf0 | (ch >> 18));
2215             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2216             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2217             *p++ = (char)(0x80 | (ch & 0x3f));
2218         }
2219     }
2220 
2221     if (v == NULL) {
2222         /* This was stack allocated. */
2223         nneeded = p - stackbuf;
2224         assert(nneeded <= nallocated);
2225         v = PyString_FromStringAndSize(stackbuf, nneeded);
2226     }
2227     else {
2228         /* Cut back to size actually needed. */
2229         nneeded = p - PyString_AS_STRING(v);
2230         assert(nneeded <= nallocated);
2231         if (_PyString_Resize(&v, nneeded))
2232             return NULL;
2233     }
2234     return v;
2235 
2236 #undef MAX_SHORT_UNICHARS
2237 }
2238 
/* Return the UTF-8 encoding of `unicode` via PyUnicode_EncodeUTF8() with a
   NULL `errors` argument.  If PyUnicode_Check() rejects the argument,
   PyErr_BadArgument() is called and NULL is returned. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);
    }

    /* argument failed PyUnicode_Check() */
    PyErr_BadArgument();
    return NULL;
}
2249 
2250 /* --- UTF-32 Codec ------------------------------------------------------- */
2251 
2252 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2253 PyUnicode_DecodeUTF32(const char *s,
2254                       Py_ssize_t size,
2255                       const char *errors,
2256                       int *byteorder)
2257 {
2258     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2259 }
2260 
2261 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2262 PyUnicode_DecodeUTF32Stateful(const char *s,
2263                               Py_ssize_t size,
2264                               const char *errors,
2265                               int *byteorder,
2266                               Py_ssize_t *consumed)
2267 {
2268     const char *starts = s;
2269     Py_ssize_t startinpos;
2270     Py_ssize_t endinpos;
2271     Py_ssize_t outpos;
2272     PyUnicodeObject *unicode;
2273     Py_UNICODE *p;
2274 #ifndef Py_UNICODE_WIDE
2275     int pairs = 0;
2276     const unsigned char *qq;
2277 #else
2278     const int pairs = 0;
2279 #endif
2280     const unsigned char *q, *e;
2281     int bo = 0;       /* assume native ordering by default */
2282     const char *errmsg = "";
2283     /* Offsets from q for retrieving bytes in the right order. */
2284 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2285     int iorder[] = {0, 1, 2, 3};
2286 #else
2287     int iorder[] = {3, 2, 1, 0};
2288 #endif
2289     PyObject *errorHandler = NULL;
2290     PyObject *exc = NULL;
2291 
2292     q = (unsigned char *)s;
2293     e = q + size;
2294 
2295     if (byteorder)
2296         bo = *byteorder;
2297 
2298     /* Check for BOM marks (U+FEFF) in the input and adjust current
2299        byte order setting accordingly. In native mode, the leading BOM
2300        mark is skipped, in all other modes, it is copied to the output
2301        stream as-is (giving a ZWNBSP character). */
2302     if (bo == 0) {
2303         if (size >= 4) {
2304             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2305                 (q[iorder[1]] << 8) | q[iorder[0]];
2306 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2307             if (bom == 0x0000FEFF) {
2308                 q += 4;
2309                 bo = -1;
2310             }
2311             else if (bom == 0xFFFE0000) {
2312                 q += 4;
2313                 bo = 1;
2314             }
2315 #else
2316             if (bom == 0x0000FEFF) {
2317                 q += 4;
2318                 bo = 1;
2319             }
2320             else if (bom == 0xFFFE0000) {
2321                 q += 4;
2322                 bo = -1;
2323             }
2324 #endif
2325         }
2326     }
2327 
2328     if (bo == -1) {
2329         /* force LE */
2330         iorder[0] = 0;
2331         iorder[1] = 1;
2332         iorder[2] = 2;
2333         iorder[3] = 3;
2334     }
2335     else if (bo == 1) {
2336         /* force BE */
2337         iorder[0] = 3;
2338         iorder[1] = 2;
2339         iorder[2] = 1;
2340         iorder[3] = 0;
2341     }
2342 
2343     /* On narrow builds we split characters outside the BMP into two
2344        code points => count how much extra space we need. */
2345 #ifndef Py_UNICODE_WIDE
2346     for (qq = q; e - qq >= 4; qq += 4)
2347         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2348             pairs++;
2349 #endif
2350 
2351     /* This might be one to much, because of a BOM */
2352     unicode = _PyUnicode_New((size+3)/4+pairs);
2353     if (!unicode)
2354         return NULL;
2355     if (size == 0)
2356         return (PyObject *)unicode;
2357 
2358     /* Unpack UTF-32 encoded data */
2359     p = unicode->str;
2360 
2361     while (q < e) {
2362         Py_UCS4 ch;
2363         /* remaining bytes at the end? (size should be divisible by 4) */
2364         if (e-q<4) {
2365             if (consumed)
2366                 break;
2367             errmsg = "truncated data";
2368             startinpos = ((const char *)q)-starts;
2369             endinpos = ((const char *)e)-starts;
2370             goto utf32Error;
2371             /* The remaining input chars are ignored if the callback
2372                chooses to skip the input */
2373         }
2374         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2375             (q[iorder[1]] << 8) | q[iorder[0]];
2376 
2377         if (ch >= 0x110000)
2378         {
2379             errmsg = "code point not in range(0x110000)";
2380             startinpos = ((const char *)q)-starts;
2381             endinpos = startinpos+4;
2382             goto utf32Error;
2383         }
2384 #ifndef Py_UNICODE_WIDE
2385         if (ch >= 0x10000)
2386         {
2387             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2388             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2389         }
2390         else
2391 #endif
2392             *p++ = ch;
2393         q += 4;
2394         continue;
2395       utf32Error:
2396         outpos = p-PyUnicode_AS_UNICODE(unicode);
2397         if (unicode_decode_call_errorhandler(
2398                 errors, &errorHandler,
2399                 "utf32", errmsg,
2400                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2401                 &unicode, &outpos, &p))
2402             goto onError;
2403     }
2404 
2405     if (byteorder)
2406         *byteorder = bo;
2407 
2408     if (consumed)
2409         *consumed = (const char *)q-starts;
2410 
2411     /* Adjust length */
2412     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2413         goto onError;
2414 
2415     Py_XDECREF(errorHandler);
2416     Py_XDECREF(exc);
2417     return (PyObject *)unicode;
2418 
2419   onError:
2420     Py_DECREF(unicode);
2421     Py_XDECREF(errorHandler);
2422     Py_XDECREF(exc);
2423     return NULL;
2424 }
2425 
2426 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2427 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2428                       Py_ssize_t size,
2429                       const char *errors,
2430                       int byteorder)
2431 {
2432     PyObject *v;
2433     unsigned char *p;
2434     Py_ssize_t nsize, bytesize;
2435 #ifndef Py_UNICODE_WIDE
2436     Py_ssize_t i, pairs;
2437 #else
2438     const int pairs = 0;
2439 #endif
2440     /* Offsets from p for storing byte pairs in the right order. */
2441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2442     int iorder[] = {0, 1, 2, 3};
2443 #else
2444     int iorder[] = {3, 2, 1, 0};
2445 #endif
2446 
2447 #define STORECHAR(CH)                           \
2448     do {                                        \
2449         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2450         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2451         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2452         p[iorder[0]] = (CH) & 0xff;             \
2453         p += 4;                                 \
2454     } while(0)
2455 
2456     /* In narrow builds we can output surrogate pairs as one code point,
2457        so we need less space. */
2458 #ifndef Py_UNICODE_WIDE
2459     for (i = pairs = 0; i < size-1; i++)
2460         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2461             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2462             pairs++;
2463 #endif
2464     nsize = (size - pairs + (byteorder == 0));
2465     bytesize = nsize * 4;
2466     if (bytesize / 4 != nsize)
2467         return PyErr_NoMemory();
2468     v = PyString_FromStringAndSize(NULL, bytesize);
2469     if (v == NULL)
2470         return NULL;
2471 
2472     p = (unsigned char *)PyString_AS_STRING(v);
2473     if (byteorder == 0)
2474         STORECHAR(0xFEFF);
2475     if (size == 0)
2476         return v;
2477 
2478     if (byteorder == -1) {
2479         /* force LE */
2480         iorder[0] = 0;
2481         iorder[1] = 1;
2482         iorder[2] = 2;
2483         iorder[3] = 3;
2484     }
2485     else if (byteorder == 1) {
2486         /* force BE */
2487         iorder[0] = 3;
2488         iorder[1] = 2;
2489         iorder[2] = 1;
2490         iorder[3] = 0;
2491     }
2492 
2493     while (size-- > 0) {
2494         Py_UCS4 ch = *s++;
2495 #ifndef Py_UNICODE_WIDE
2496         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2497             Py_UCS4 ch2 = *s;
2498             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2499                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2500                 s++;
2501                 size--;
2502             }
2503         }
2504 #endif
2505         STORECHAR(ch);
2506     }
2507     return v;
2508 #undef STORECHAR
2509 }
2510 
/* Return the UTF-32 encoding of `unicode` via PyUnicode_EncodeUTF32() with
   a NULL `errors` argument and byteorder 0.  If PyUnicode_Check() rejects
   the argument, PyErr_BadArgument() is called and NULL is returned. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);
    }

    /* argument failed PyUnicode_Check() */
    PyErr_BadArgument();
    return NULL;
}
2522 
2523 /* --- UTF-16 Codec ------------------------------------------------------- */
2524 
2525 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2526 PyUnicode_DecodeUTF16(const char *s,
2527                       Py_ssize_t size,
2528                       const char *errors,
2529                       int *byteorder)
2530 {
2531     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2532 }
2533 
2534 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2535 PyUnicode_DecodeUTF16Stateful(const char *s,
2536                               Py_ssize_t size,
2537                               const char *errors,
2538                               int *byteorder,
2539                               Py_ssize_t *consumed)
2540 {
2541     const char *starts = s;
2542     Py_ssize_t startinpos;
2543     Py_ssize_t endinpos;
2544     Py_ssize_t outpos;
2545     PyUnicodeObject *unicode;
2546     Py_UNICODE *p;
2547     const unsigned char *q, *e;
2548     int bo = 0;       /* assume native ordering by default */
2549     const char *errmsg = "";
2550     /* Offsets from q for retrieving byte pairs in the right order. */
2551 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2552     int ihi = 1, ilo = 0;
2553 #else
2554     int ihi = 0, ilo = 1;
2555 #endif
2556     PyObject *errorHandler = NULL;
2557     PyObject *exc = NULL;
2558 
2559     /* Note: size will always be longer than the resulting Unicode
2560        character count */
2561     unicode = _PyUnicode_New(size);
2562     if (!unicode)
2563         return NULL;
2564     if (size == 0)
2565         return (PyObject *)unicode;
2566 
2567     /* Unpack UTF-16 encoded data */
2568     p = unicode->str;
2569     q = (unsigned char *)s;
2570     e = q + size;
2571 
2572     if (byteorder)
2573         bo = *byteorder;
2574 
2575     /* Check for BOM marks (U+FEFF) in the input and adjust current
2576        byte order setting accordingly. In native mode, the leading BOM
2577        mark is skipped, in all other modes, it is copied to the output
2578        stream as-is (giving a ZWNBSP character). */
2579     if (bo == 0) {
2580         if (size >= 2) {
2581             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2582 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2583             if (bom == 0xFEFF) {
2584                 q += 2;
2585                 bo = -1;
2586             }
2587             else if (bom == 0xFFFE) {
2588                 q += 2;
2589                 bo = 1;
2590             }
2591 #else
2592             if (bom == 0xFEFF) {
2593                 q += 2;
2594                 bo = 1;
2595             }
2596             else if (bom == 0xFFFE) {
2597                 q += 2;
2598                 bo = -1;
2599             }
2600 #endif
2601         }
2602     }
2603 
2604     if (bo == -1) {
2605         /* force LE */
2606         ihi = 1;
2607         ilo = 0;
2608     }
2609     else if (bo == 1) {
2610         /* force BE */
2611         ihi = 0;
2612         ilo = 1;
2613     }
2614 
2615     while (q < e) {
2616         Py_UNICODE ch;
2617         /* remaining bytes at the end? (size should be even) */
2618         if (e-q<2) {
2619             if (consumed)
2620                 break;
2621             errmsg = "truncated data";
2622             startinpos = ((const char *)q)-starts;
2623             endinpos = ((const char *)e)-starts;
2624             goto utf16Error;
2625             /* The remaining input chars are ignored if the callback
2626                chooses to skip the input */
2627         }
2628         ch = (q[ihi] << 8) | q[ilo];
2629 
2630         q += 2;
2631 
2632         if (ch < 0xD800 || ch > 0xDFFF) {
2633             *p++ = ch;
2634             continue;
2635         }
2636 
2637         /* UTF-16 code pair: */
2638         if (e - q < 2) {
2639             q -= 2;
2640             if (consumed)
2641                 break;
2642             errmsg = "unexpected end of data";
2643             startinpos = ((const char *)q)-starts;
2644             endinpos = ((const char *)e)-starts;
2645             goto utf16Error;
2646         }
2647         if (0xD800 <= ch && ch <= 0xDBFF) {
2648             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2649             q += 2;
2650             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2651 #ifndef Py_UNICODE_WIDE
2652                 *p++ = ch;
2653                 *p++ = ch2;
2654 #else
2655                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2656 #endif
2657                 continue;
2658             }
2659             else {
2660                 errmsg = "illegal UTF-16 surrogate";
2661                 startinpos = (((const char *)q)-4)-starts;
2662                 endinpos = startinpos+2;
2663                 goto utf16Error;
2664             }
2665 
2666         }
2667         errmsg = "illegal encoding";
2668         startinpos = (((const char *)q)-2)-starts;
2669         endinpos = startinpos+2;
2670         /* Fall through to report the error */
2671 
2672       utf16Error:
2673         outpos = p-PyUnicode_AS_UNICODE(unicode);
2674         if (unicode_decode_call_errorhandler(
2675                 errors, &errorHandler,
2676                 "utf16", errmsg,
2677                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2678                 &unicode, &outpos, &p))
2679             goto onError;
2680     }
2681 
2682     if (byteorder)
2683         *byteorder = bo;
2684 
2685     if (consumed)
2686         *consumed = (const char *)q-starts;
2687 
2688     /* Adjust length */
2689     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2690         goto onError;
2691 
2692     Py_XDECREF(errorHandler);
2693     Py_XDECREF(exc);
2694     return (PyObject *)unicode;
2695 
2696   onError:
2697     Py_DECREF(unicode);
2698     Py_XDECREF(errorHandler);
2699     Py_XDECREF(exc);
2700     return NULL;
2701 }
2702 
2703 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2704 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2705                       Py_ssize_t size,
2706                       const char *errors,
2707                       int byteorder)
2708 {
2709     PyObject *v;
2710     unsigned char *p;
2711     Py_ssize_t nsize, bytesize;
2712 #ifdef Py_UNICODE_WIDE
2713     Py_ssize_t i, pairs;
2714 #else
2715     const int pairs = 0;
2716 #endif
2717     /* Offsets from p for storing byte pairs in the right order. */
2718 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2719     int ihi = 1, ilo = 0;
2720 #else
2721     int ihi = 0, ilo = 1;
2722 #endif
2723 
2724 #define STORECHAR(CH)                           \
2725     do {                                        \
2726         p[ihi] = ((CH) >> 8) & 0xff;            \
2727         p[ilo] = (CH) & 0xff;                   \
2728         p += 2;                                 \
2729     } while(0)
2730 
2731 #ifdef Py_UNICODE_WIDE
2732     for (i = pairs = 0; i < size; i++)
2733         if (s[i] >= 0x10000)
2734             pairs++;
2735 #endif
2736     /* 2 * (size + pairs + (byteorder == 0)) */
2737     if (size > PY_SSIZE_T_MAX ||
2738         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2739         return PyErr_NoMemory();
2740     nsize = size + pairs + (byteorder == 0);
2741     bytesize = nsize * 2;
2742     if (bytesize / 2 != nsize)
2743         return PyErr_NoMemory();
2744     v = PyString_FromStringAndSize(NULL, bytesize);
2745     if (v == NULL)
2746         return NULL;
2747 
2748     p = (unsigned char *)PyString_AS_STRING(v);
2749     if (byteorder == 0)
2750         STORECHAR(0xFEFF);
2751     if (size == 0)
2752         return v;
2753 
2754     if (byteorder == -1) {
2755         /* force LE */
2756         ihi = 1;
2757         ilo = 0;
2758     }
2759     else if (byteorder == 1) {
2760         /* force BE */
2761         ihi = 0;
2762         ilo = 1;
2763     }
2764 
2765     while (size-- > 0) {
2766         Py_UNICODE ch = *s++;
2767         Py_UNICODE ch2 = 0;
2768 #ifdef Py_UNICODE_WIDE
2769         if (ch >= 0x10000) {
2770             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2771             ch  = 0xD800 | ((ch-0x10000) >> 10);
2772         }
2773 #endif
2774         STORECHAR(ch);
2775         if (ch2)
2776             STORECHAR(ch2);
2777     }
2778     return v;
2779 #undef STORECHAR
2780 }
2781 
/* Encode a Unicode object as UTF-16 via PyUnicode_EncodeUTF16(), passing
   NULL for errors and 0 for byteorder.  A non-Unicode argument is reported
   through PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        const Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeUTF16(data, length, NULL, 0);
    }

    PyErr_BadArgument();
    return NULL;
}
2793 
2794 /* --- Unicode Escape Codec ----------------------------------------------- */
2795 
2796 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2797 
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)2798 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2799                                         Py_ssize_t size,
2800                                         const char *errors)
2801 {
2802     const char *starts = s;
2803     Py_ssize_t startinpos;
2804     Py_ssize_t endinpos;
2805     Py_ssize_t outpos;
2806     PyUnicodeObject *v;
2807     Py_UNICODE *p;
2808     const char *end;
2809     char* message;
2810     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2811     PyObject *errorHandler = NULL;
2812     PyObject *exc = NULL;
2813 
2814     /* Escaped strings will always be longer than the resulting
2815        Unicode string, so we start with size here and then reduce the
2816        length after conversion to the true value.
2817        (but if the error callback returns a long replacement string
2818        we'll have to allocate more space) */
2819     v = _PyUnicode_New(size);
2820     if (v == NULL)
2821         goto onError;
2822     if (size == 0)
2823         return (PyObject *)v;
2824 
2825     p = PyUnicode_AS_UNICODE(v);
2826     end = s + size;
2827 
2828     while (s < end) {
2829         unsigned char c;
2830         Py_UNICODE x;
2831         int digits;
2832 
2833         /* Non-escape characters are interpreted as Unicode ordinals */
2834         if (*s != '\\') {
2835             *p++ = (unsigned char) *s++;
2836             continue;
2837         }
2838 
2839         startinpos = s-starts;
2840         /* \ - Escapes */
2841         s++;
2842         c = *s++;
2843         if (s > end)
2844             c = '\0'; /* Invalid after \ */
2845         switch (c) {
2846 
2847             /* \x escapes */
2848         case '\n': break;
2849         case '\\': *p++ = '\\'; break;
2850         case '\'': *p++ = '\''; break;
2851         case '\"': *p++ = '\"'; break;
2852         case 'b': *p++ = '\b'; break;
2853         case 'f': *p++ = '\014'; break; /* FF */
2854         case 't': *p++ = '\t'; break;
2855         case 'n': *p++ = '\n'; break;
2856         case 'r': *p++ = '\r'; break;
2857         case 'v': *p++ = '\013'; break; /* VT */
2858         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2859 
2860             /* \OOO (octal) escapes */
2861         case '0': case '1': case '2': case '3':
2862         case '4': case '5': case '6': case '7':
2863             x = s[-1] - '0';
2864             if (s < end && '0' <= *s && *s <= '7') {
2865                 x = (x<<3) + *s++ - '0';
2866                 if (s < end && '0' <= *s && *s <= '7')
2867                     x = (x<<3) + *s++ - '0';
2868             }
2869             *p++ = x;
2870             break;
2871 
2872             /* hex escapes */
2873             /* \xXX */
2874         case 'x':
2875             digits = 2;
2876             message = "truncated \\xXX escape";
2877             goto hexescape;
2878 
2879             /* \uXXXX */
2880         case 'u':
2881             digits = 4;
2882             message = "truncated \\uXXXX escape";
2883             goto hexescape;
2884 
2885             /* \UXXXXXXXX */
2886         case 'U':
2887             digits = 8;
2888             message = "truncated \\UXXXXXXXX escape";
2889         hexescape:
2890             chr = 0;
2891             if (end - s < digits) {
2892                 /* count only hex digits */
2893                 for (; s < end; ++s) {
2894                     c = (unsigned char)*s;
2895                     if (!Py_ISXDIGIT(c))
2896                         goto error;
2897                 }
2898                 goto error;
2899             }
2900             for (; digits--; ++s) {
2901                 c = (unsigned char)*s;
2902                 if (!Py_ISXDIGIT(c))
2903                     goto error;
2904                 chr = (chr<<4) & ~0xF;
2905                 if (c >= '0' && c <= '9')
2906                     chr += c - '0';
2907                 else if (c >= 'a' && c <= 'f')
2908                     chr += 10 + c - 'a';
2909                 else
2910                     chr += 10 + c - 'A';
2911             }
2912             if (chr == 0xffffffff && PyErr_Occurred())
2913                 /* _decoding_error will have already written into the
2914                    target buffer. */
2915                 break;
2916         store:
2917             /* when we get here, chr is a 32-bit unicode character */
2918             if (chr <= 0xffff)
2919                 /* UCS-2 character */
2920                 *p++ = (Py_UNICODE) chr;
2921             else if (chr <= 0x10ffff) {
2922                 /* UCS-4 character. Either store directly, or as
2923                    surrogate pair. */
2924 #ifdef Py_UNICODE_WIDE
2925                 *p++ = chr;
2926 #else
2927                 chr -= 0x10000L;
2928                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2929                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2930 #endif
2931             } else {
2932                 message = "illegal Unicode character";
2933                 goto error;
2934             }
2935             break;
2936 
2937             /* \N{name} */
2938         case 'N':
2939             message = "malformed \\N character escape";
2940             if (ucnhash_CAPI == NULL) {
2941                 /* load the unicode data module */
2942                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2943                 if (ucnhash_CAPI == NULL)
2944                     goto ucnhashError;
2945             }
2946             if (*s == '{') {
2947                 const char *start = s+1;
2948                 /* look for the closing brace */
2949                 while (*s != '}' && s < end)
2950                     s++;
2951                 if (s > start && s < end && *s == '}') {
2952                     /* found a name.  look it up in the unicode database */
2953                     message = "unknown Unicode character name";
2954                     s++;
2955                     if (s - start - 1 <= INT_MAX &&
2956                         ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2957                         goto store;
2958                 }
2959             }
2960             goto error;
2961 
2962         default:
2963             if (s > end) {
2964                 message = "\\ at end of string";
2965                 s--;
2966                 goto error;
2967             }
2968             else {
2969                 *p++ = '\\';
2970                 *p++ = (unsigned char)s[-1];
2971             }
2972             break;
2973         }
2974         continue;
2975 
2976       error:
2977         endinpos = s-starts;
2978         outpos = p-PyUnicode_AS_UNICODE(v);
2979         if (unicode_decode_call_errorhandler(
2980                 errors, &errorHandler,
2981                 "unicodeescape", message,
2982                 starts, size, &startinpos, &endinpos, &exc, &s,
2983                 &v, &outpos, &p))
2984             goto onError;
2985         continue;
2986     }
2987     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2988         goto onError;
2989     Py_XDECREF(errorHandler);
2990     Py_XDECREF(exc);
2991     return (PyObject *)v;
2992 
2993   ucnhashError:
2994     PyErr_SetString(
2995         PyExc_UnicodeError,
2996         "\\N escapes not supported (can't load unicodedata module)"
2997         );
2998     Py_XDECREF(v);
2999     Py_XDECREF(errorHandler);
3000     Py_XDECREF(exc);
3001     return NULL;
3002 
3003   onError:
3004     Py_XDECREF(v);
3005     Py_XDECREF(errorHandler);
3006     Py_XDECREF(exc);
3007     return NULL;
3008 }
3009 
3010 /* Return a Unicode-Escape string version of the Unicode object.
3011 
3012    If quotes is true, the string is enclosed in u"" or u'' quotes as
3013    appropriate.
3014 
3015 */
3016 
findchar(const Py_UNICODE * s,Py_ssize_t size,Py_UNICODE ch)3017 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3018                                              Py_ssize_t size,
3019                                              Py_UNICODE ch)
3020 {
3021     /* like wcschr, but doesn't stop at NULL characters */
3022 
3023     while (size-- > 0) {
3024         if (*s == ch)
3025             return s;
3026         s++;
3027     }
3028 
3029     return NULL;
3030 }
3031 
3032 static
unicodeescape_string(const Py_UNICODE * s,Py_ssize_t size,int quotes)3033 PyObject *unicodeescape_string(const Py_UNICODE *s,
3034                                Py_ssize_t size,
3035                                int quotes)
3036 {
3037     PyObject *repr;
3038     char *p;
3039 
3040     static const char *hexdigit = "0123456789abcdef";
3041 #ifdef Py_UNICODE_WIDE
3042     const Py_ssize_t expandsize = 10;
3043 #else
3044     const Py_ssize_t expandsize = 6;
3045 #endif
3046 
3047     /* XXX(nnorwitz): rather than over-allocating, it would be
3048        better to choose a different scheme.  Perhaps scan the
3049        first N-chars of the string and allocate based on that size.
3050     */
3051     /* Initial allocation is based on the longest-possible unichr
3052        escape.
3053 
3054        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3055        unichr, so in this case it's the longest unichr escape. In
3056        narrow (UTF-16) builds this is five chars per source unichr
3057        since there are two unichrs in the surrogate pair, so in narrow
3058        (UTF-16) builds it's not the longest unichr escape.
3059 
3060        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3061        so in the narrow (UTF-16) build case it's the longest unichr
3062        escape.
3063     */
3064 
3065     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3066         return PyErr_NoMemory();
3067 
3068     repr = PyString_FromStringAndSize(NULL,
3069                                       2
3070                                       + expandsize*size
3071                                       + 1);
3072     if (repr == NULL)
3073         return NULL;
3074 
3075     p = PyString_AS_STRING(repr);
3076 
3077     if (quotes) {
3078         *p++ = 'u';
3079         *p++ = (findchar(s, size, '\'') &&
3080                 !findchar(s, size, '"')) ? '"' : '\'';
3081     }
3082     while (size-- > 0) {
3083         Py_UNICODE ch = *s++;
3084 
3085         /* Escape quotes and backslashes */
3086         if ((quotes &&
3087              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3088             *p++ = '\\';
3089             *p++ = (char) ch;
3090             continue;
3091         }
3092 
3093 #ifdef Py_UNICODE_WIDE
3094         /* Map 21-bit characters to '\U00xxxxxx' */
3095         else if (ch >= 0x10000) {
3096             *p++ = '\\';
3097             *p++ = 'U';
3098             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3099             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3100             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3101             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3102             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3103             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3104             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3105             *p++ = hexdigit[ch & 0x0000000F];
3106             continue;
3107         }
3108 #else
3109         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3110         else if (ch >= 0xD800 && ch < 0xDC00) {
3111             Py_UNICODE ch2;
3112             Py_UCS4 ucs;
3113 
3114             ch2 = *s++;
3115             size--;
3116             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3117                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3118                 *p++ = '\\';
3119                 *p++ = 'U';
3120                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3121                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3122                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3123                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3124                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3125                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3126                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3127                 *p++ = hexdigit[ucs & 0x0000000F];
3128                 continue;
3129             }
3130             /* Fall through: isolated surrogates are copied as-is */
3131             s--;
3132             size++;
3133         }
3134 #endif
3135 
3136         /* Map 16-bit characters to '\uxxxx' */
3137         if (ch >= 256) {
3138             *p++ = '\\';
3139             *p++ = 'u';
3140             *p++ = hexdigit[(ch >> 12) & 0x000F];
3141             *p++ = hexdigit[(ch >> 8) & 0x000F];
3142             *p++ = hexdigit[(ch >> 4) & 0x000F];
3143             *p++ = hexdigit[ch & 0x000F];
3144         }
3145 
3146         /* Map special whitespace to '\t', \n', '\r' */
3147         else if (ch == '\t') {
3148             *p++ = '\\';
3149             *p++ = 't';
3150         }
3151         else if (ch == '\n') {
3152             *p++ = '\\';
3153             *p++ = 'n';
3154         }
3155         else if (ch == '\r') {
3156             *p++ = '\\';
3157             *p++ = 'r';
3158         }
3159 
3160         /* Map non-printable US ASCII to '\xhh' */
3161         else if (ch < ' ' || ch >= 0x7F) {
3162             *p++ = '\\';
3163             *p++ = 'x';
3164             *p++ = hexdigit[(ch >> 4) & 0x000F];
3165             *p++ = hexdigit[ch & 0x000F];
3166         }
3167 
3168         /* Copy everything else as-is */
3169         else
3170             *p++ = (char) ch;
3171     }
3172     if (quotes)
3173         *p++ = PyString_AS_STRING(repr)[1];
3174 
3175     *p = '\0';
3176     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3177         return NULL;
3178     return repr;
3179 }
3180 
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3181 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3182                                         Py_ssize_t size)
3183 {
3184     return unicodeescape_string(s, size, 0);
3185 }
3186 
/* Unicode-Escape encode a Unicode object via PyUnicode_EncodeUnicodeEscape().
   A non-Unicode argument is reported through PyErr_BadArgument() and NULL
   is returned. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        const Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeUnicodeEscape(data, length);
    }

    PyErr_BadArgument();
    return NULL;
}
3196 
3197 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3198 
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)3199 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3200                                            Py_ssize_t size,
3201                                            const char *errors)
3202 {
3203     const char *starts = s;
3204     Py_ssize_t startinpos;
3205     Py_ssize_t endinpos;
3206     Py_ssize_t outpos;
3207     PyUnicodeObject *v;
3208     Py_UNICODE *p;
3209     const char *end;
3210     const char *bs;
3211     PyObject *errorHandler = NULL;
3212     PyObject *exc = NULL;
3213 
3214     /* Escaped strings will always be longer than the resulting
3215        Unicode string, so we start with size here and then reduce the
3216        length after conversion to the true value. (But decoding error
3217        handler might have to resize the string) */
3218     v = _PyUnicode_New(size);
3219     if (v == NULL)
3220         goto onError;
3221     if (size == 0)
3222         return (PyObject *)v;
3223     p = PyUnicode_AS_UNICODE(v);
3224     end = s + size;
3225     while (s < end) {
3226         unsigned char c;
3227         Py_UCS4 x;
3228         int i;
3229         int count;
3230 
3231         /* Non-escape characters are interpreted as Unicode ordinals */
3232         if (*s != '\\') {
3233             *p++ = (unsigned char)*s++;
3234             continue;
3235         }
3236         startinpos = s-starts;
3237 
3238         /* \u-escapes are only interpreted iff the number of leading
3239            backslashes if odd */
3240         bs = s;
3241         for (;s < end;) {
3242             if (*s != '\\')
3243                 break;
3244             *p++ = (unsigned char)*s++;
3245         }
3246         if (((s - bs) & 1) == 0 ||
3247             s >= end ||
3248             (*s != 'u' && *s != 'U')) {
3249             continue;
3250         }
3251         p--;
3252         count = *s=='u' ? 4 : 8;
3253         s++;
3254 
3255         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3256         outpos = p-PyUnicode_AS_UNICODE(v);
3257         for (x = 0, i = 0; i < count; ++i, ++s) {
3258             c = (unsigned char)*s;
3259             if (!isxdigit(c)) {
3260                 endinpos = s-starts;
3261                 if (unicode_decode_call_errorhandler(
3262                         errors, &errorHandler,
3263                         "rawunicodeescape", "truncated \\uXXXX",
3264                         starts, size, &startinpos, &endinpos, &exc, &s,
3265                         &v, &outpos, &p))
3266                     goto onError;
3267                 goto nextByte;
3268             }
3269             x = (x<<4) & ~0xF;
3270             if (c >= '0' && c <= '9')
3271                 x += c - '0';
3272             else if (c >= 'a' && c <= 'f')
3273                 x += 10 + c - 'a';
3274             else
3275                 x += 10 + c - 'A';
3276         }
3277         if (x <= 0xffff)
3278             /* UCS-2 character */
3279             *p++ = (Py_UNICODE) x;
3280         else if (x <= 0x10ffff) {
3281             /* UCS-4 character. Either store directly, or as
3282                surrogate pair. */
3283 #ifdef Py_UNICODE_WIDE
3284             *p++ = (Py_UNICODE) x;
3285 #else
3286             x -= 0x10000L;
3287             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3288             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3289 #endif
3290         } else {
3291             endinpos = s-starts;
3292             outpos = p-PyUnicode_AS_UNICODE(v);
3293             if (unicode_decode_call_errorhandler(
3294                     errors, &errorHandler,
3295                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3296                     starts, size, &startinpos, &endinpos, &exc, &s,
3297                     &v, &outpos, &p))
3298                 goto onError;
3299         }
3300       nextByte:
3301         ;
3302     }
3303     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3304         goto onError;
3305     Py_XDECREF(errorHandler);
3306     Py_XDECREF(exc);
3307     return (PyObject *)v;
3308 
3309   onError:
3310     Py_XDECREF(v);
3311     Py_XDECREF(errorHandler);
3312     Py_XDECREF(exc);
3313     return NULL;
3314 }
3315 
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3316 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3317                                            Py_ssize_t size)
3318 {
3319     PyObject *repr;
3320     char *p;
3321     char *q;
3322 
3323     static const char *hexdigit = "0123456789abcdef";
3324 #ifdef Py_UNICODE_WIDE
3325     const Py_ssize_t expandsize = 10;
3326 #else
3327     const Py_ssize_t expandsize = 6;
3328 #endif
3329 
3330     if (size > PY_SSIZE_T_MAX / expandsize)
3331         return PyErr_NoMemory();
3332 
3333     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3334     if (repr == NULL)
3335         return NULL;
3336     if (size == 0)
3337         return repr;
3338 
3339     p = q = PyString_AS_STRING(repr);
3340     while (size-- > 0) {
3341         Py_UNICODE ch = *s++;
3342 #ifdef Py_UNICODE_WIDE
3343         /* Map 32-bit characters to '\Uxxxxxxxx' */
3344         if (ch >= 0x10000) {
3345             *p++ = '\\';
3346             *p++ = 'U';
3347             *p++ = hexdigit[(ch >> 28) & 0xf];
3348             *p++ = hexdigit[(ch >> 24) & 0xf];
3349             *p++ = hexdigit[(ch >> 20) & 0xf];
3350             *p++ = hexdigit[(ch >> 16) & 0xf];
3351             *p++ = hexdigit[(ch >> 12) & 0xf];
3352             *p++ = hexdigit[(ch >> 8) & 0xf];
3353             *p++ = hexdigit[(ch >> 4) & 0xf];
3354             *p++ = hexdigit[ch & 15];
3355         }
3356         else
3357 #else
3358             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3359             if (ch >= 0xD800 && ch < 0xDC00) {
3360                 Py_UNICODE ch2;
3361                 Py_UCS4 ucs;
3362 
3363                 ch2 = *s++;
3364                 size--;
3365                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3366                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3367                     *p++ = '\\';
3368                     *p++ = 'U';
3369                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3370                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3371                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3372                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3373                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3374                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3375                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3376                     *p++ = hexdigit[ucs & 0xf];
3377                     continue;
3378                 }
3379                 /* Fall through: isolated surrogates are copied as-is */
3380                 s--;
3381                 size++;
3382             }
3383 #endif
3384         /* Map 16-bit characters to '\uxxxx' */
3385         if (ch >= 256) {
3386             *p++ = '\\';
3387             *p++ = 'u';
3388             *p++ = hexdigit[(ch >> 12) & 0xf];
3389             *p++ = hexdigit[(ch >> 8) & 0xf];
3390             *p++ = hexdigit[(ch >> 4) & 0xf];
3391             *p++ = hexdigit[ch & 15];
3392         }
3393         /* Copy everything else as-is */
3394         else
3395             *p++ = (char) ch;
3396     }
3397     *p = '\0';
3398     if (_PyString_Resize(&repr, p - q))
3399         return NULL;
3400     return repr;
3401 }
3402 
/* Raw-Unicode-Escape encode a Unicode object via
   PyUnicode_EncodeRawUnicodeEscape().  A non-Unicode argument is reported
   through PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
        const Py_ssize_t length = PyUnicode_GET_SIZE(unicode);

        return PyUnicode_EncodeRawUnicodeEscape(data, length);
    }

    PyErr_BadArgument();
    return NULL;
}
3412 
3413 /* --- Unicode Internal Codec ------------------------------------------- */
3414 
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)3415 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3416                                            Py_ssize_t size,
3417                                            const char *errors)
3418 {
3419     const char *starts = s;
3420     Py_ssize_t startinpos;
3421     Py_ssize_t endinpos;
3422     Py_ssize_t outpos;
3423     PyUnicodeObject *v;
3424     Py_UNICODE *p;
3425     const char *end;
3426     const char *reason;
3427     PyObject *errorHandler = NULL;
3428     PyObject *exc = NULL;
3429 
3430 #ifdef Py_UNICODE_WIDE
3431     Py_UNICODE unimax = PyUnicode_GetMax();
3432 #endif
3433 
3434     /* XXX overflow detection missing */
3435     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3436     if (v == NULL)
3437         goto onError;
3438     if (PyUnicode_GetSize((PyObject *)v) == 0)
3439         return (PyObject *)v;
3440     p = PyUnicode_AS_UNICODE(v);
3441     end = s + size;
3442 
3443     while (s < end) {
3444         if (end-s < Py_UNICODE_SIZE) {
3445             endinpos = end-starts;
3446             reason = "truncated input";
3447             goto error;
3448         }
3449         memcpy(p, s, sizeof(Py_UNICODE));
3450 #ifdef Py_UNICODE_WIDE
3451         /* We have to sanity check the raw data, otherwise doom looms for
3452            some malformed UCS-4 data. */
3453         if (*p > unimax || *p < 0) {
3454             endinpos = s - starts + Py_UNICODE_SIZE;
3455             reason = "illegal code point (> 0x10FFFF)";
3456             goto error;
3457         }
3458 #endif
3459         p++;
3460         s += Py_UNICODE_SIZE;
3461         continue;
3462 
3463   error:
3464         startinpos = s - starts;
3465         outpos = p - PyUnicode_AS_UNICODE(v);
3466         if (unicode_decode_call_errorhandler(
3467                 errors, &errorHandler,
3468                 "unicode_internal", reason,
3469                 starts, size, &startinpos, &endinpos, &exc, &s,
3470                 &v, &outpos, &p)) {
3471             goto onError;
3472         }
3473     }
3474 
3475     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3476         goto onError;
3477     Py_XDECREF(errorHandler);
3478     Py_XDECREF(exc);
3479     return (PyObject *)v;
3480 
3481   onError:
3482     Py_XDECREF(v);
3483     Py_XDECREF(errorHandler);
3484     Py_XDECREF(exc);
3485     return NULL;
3486 }
3487 
3488 /* --- Latin-1 Codec ------------------------------------------------------ */
3489 
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)3490 PyObject *PyUnicode_DecodeLatin1(const char *s,
3491                                  Py_ssize_t size,
3492                                  const char *errors)
3493 {
3494     PyUnicodeObject *v;
3495     Py_UNICODE *p;
3496 
3497     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3498     if (size == 1) {
3499         Py_UNICODE r = *(unsigned char*)s;
3500         return PyUnicode_FromUnicode(&r, 1);
3501     }
3502 
3503     v = _PyUnicode_New(size);
3504     if (v == NULL)
3505         goto onError;
3506     if (size == 0)
3507         return (PyObject *)v;
3508     p = PyUnicode_AS_UNICODE(v);
3509     while (size-- > 0)
3510         *p++ = (unsigned char)*s++;
3511     return (PyObject *)v;
3512 
3513   onError:
3514     Py_XDECREF(v);
3515     return NULL;
3516 }
3517 
3518 /* create or adjust a UnicodeEncodeError */
make_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3519 static void make_encode_exception(PyObject **exceptionObject,
3520                                   const char *encoding,
3521                                   const Py_UNICODE *unicode, Py_ssize_t size,
3522                                   Py_ssize_t startpos, Py_ssize_t endpos,
3523                                   const char *reason)
3524 {
3525     if (*exceptionObject == NULL) {
3526         *exceptionObject = PyUnicodeEncodeError_Create(
3527             encoding, unicode, size, startpos, endpos, reason);
3528     }
3529     else {
3530         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3531             goto onError;
3532         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3533             goto onError;
3534         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3535             goto onError;
3536         return;
3537       onError:
3538         Py_CLEAR(*exceptionObject);
3539     }
3540 }
3541 
3542 /* raises a UnicodeEncodeError */
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3543 static void raise_encode_exception(PyObject **exceptionObject,
3544                                    const char *encoding,
3545                                    const Py_UNICODE *unicode, Py_ssize_t size,
3546                                    Py_ssize_t startpos, Py_ssize_t endpos,
3547                                    const char *reason)
3548 {
3549     make_encode_exception(exceptionObject,
3550                           encoding, unicode, size, startpos, endpos, reason);
3551     if (*exceptionObject != NULL)
3552         PyCodec_StrictErrors(*exceptionObject);
3553 }
3554 
3555 /* error handling callback helper:
3556    build arguments, call the callback and check the arguments,
3557    put the result into newpos and return the replacement string, which
3558    has to be freed by the caller */
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)3559 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3560                                                   PyObject **errorHandler,
3561                                                   const char *encoding, const char *reason,
3562                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3563                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3564                                                   Py_ssize_t *newpos)
3565 {
3566     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3567 
3568     PyObject *restuple;
3569     PyObject *resunicode;
3570 
3571     if (*errorHandler == NULL) {
3572         *errorHandler = PyCodec_LookupError(errors);
3573         if (*errorHandler == NULL)
3574             return NULL;
3575     }
3576 
3577     make_encode_exception(exceptionObject,
3578                           encoding, unicode, size, startpos, endpos, reason);
3579     if (*exceptionObject == NULL)
3580         return NULL;
3581 
3582     restuple = PyObject_CallFunctionObjArgs(
3583         *errorHandler, *exceptionObject, NULL);
3584     if (restuple == NULL)
3585         return NULL;
3586     if (!PyTuple_Check(restuple)) {
3587         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3588         Py_DECREF(restuple);
3589         return NULL;
3590     }
3591     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3592                           &resunicode, newpos)) {
3593         Py_DECREF(restuple);
3594         return NULL;
3595     }
3596     if (*newpos<0)
3597         *newpos = size+*newpos;
3598     if (*newpos<0 || *newpos>size) {
3599         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3600         Py_DECREF(restuple);
3601         return NULL;
3602     }
3603     Py_INCREF(resunicode);
3604     Py_DECREF(restuple);
3605     return resunicode;
3606 }
3607 
unicode_encode_ucs1(const Py_UNICODE * p,Py_ssize_t size,const char * errors,int limit)3608 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3609                                      Py_ssize_t size,
3610                                      const char *errors,
3611                                      int limit)
3612 {
3613     /* output object */
3614     PyObject *res;
3615     /* pointers to the beginning and end+1 of input */
3616     const Py_UNICODE *startp = p;
3617     const Py_UNICODE *endp = p + size;
3618     /* pointer to the beginning of the unencodable characters */
3619     /* const Py_UNICODE *badp = NULL; */
3620     /* pointer into the output */
3621     char *str;
3622     /* current output position */
3623     Py_ssize_t respos = 0;
3624     Py_ssize_t ressize;
3625     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3626     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3627     PyObject *errorHandler = NULL;
3628     PyObject *exc = NULL;
3629     /* the following variable is used for caching string comparisons
3630      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3631     int known_errorHandler = -1;
3632 
3633     /* allocate enough for a simple encoding without
3634        replacements, if we need more, we'll resize */
3635     res = PyString_FromStringAndSize(NULL, size);
3636     if (res == NULL)
3637         goto onError;
3638     if (size == 0)
3639         return res;
3640     str = PyString_AS_STRING(res);
3641     ressize = size;
3642 
3643     while (p<endp) {
3644         Py_UNICODE c = *p;
3645 
3646         /* can we encode this? */
3647         if (c<limit) {
3648             /* no overflow check, because we know that the space is enough */
3649             *str++ = (char)c;
3650             ++p;
3651         }
3652         else {
3653             Py_ssize_t unicodepos = p-startp;
3654             Py_ssize_t requiredsize;
3655             PyObject *repunicode;
3656             Py_ssize_t repsize;
3657             Py_ssize_t newpos;
3658             Py_ssize_t respos;
3659             Py_UNICODE *uni2;
3660             /* startpos for collecting unencodable chars */
3661             const Py_UNICODE *collstart = p;
3662             const Py_UNICODE *collend = p;
3663             /* find all unecodable characters */
3664             while ((collend < endp) && ((*collend) >= limit))
3665                 ++collend;
3666             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3667             if (known_errorHandler==-1) {
3668                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3669                     known_errorHandler = 1;
3670                 else if (!strcmp(errors, "replace"))
3671                     known_errorHandler = 2;
3672                 else if (!strcmp(errors, "ignore"))
3673                     known_errorHandler = 3;
3674                 else if (!strcmp(errors, "xmlcharrefreplace"))
3675                     known_errorHandler = 4;
3676                 else
3677                     known_errorHandler = 0;
3678             }
3679             switch (known_errorHandler) {
3680             case 1: /* strict */
3681                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3682                 goto onError;
3683             case 2: /* replace */
3684                 while (collstart++ < collend)
3685                     *str++ = '?'; /* fall through */
3686             case 3: /* ignore */
3687                 p = collend;
3688                 break;
3689             case 4: /* xmlcharrefreplace */
3690                 respos = str - PyString_AS_STRING(res);
3691                 /* determine replacement size (temporarily (mis)uses p) */
3692                 requiredsize = respos;
3693                 for (p = collstart; p < collend;) {
3694                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3695                     Py_ssize_t incr;
3696                     if (ch < 10)
3697                         incr = 2+1+1;
3698                     else if (ch < 100)
3699                         incr = 2+2+1;
3700                     else if (ch < 1000)
3701                         incr = 2+3+1;
3702                     else if (ch < 10000)
3703                         incr = 2+4+1;
3704                     else if (ch < 100000)
3705                         incr = 2+5+1;
3706                     else if (ch < 1000000)
3707                         incr = 2+6+1;
3708                     else
3709                         incr = 2+7+1;
3710                     if (requiredsize > PY_SSIZE_T_MAX - incr)
3711                         goto overflow;
3712                     requiredsize += incr;
3713                 }
3714                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3715                     goto overflow;
3716                 requiredsize += endp - collend;
3717                 if (requiredsize > ressize) {
3718                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3719                         requiredsize = 2*ressize;
3720                     if (_PyString_Resize(&res, requiredsize))
3721                         goto onError;
3722                     str = PyString_AS_STRING(res) + respos;
3723                     ressize = requiredsize;
3724                 }
3725                 /* generate replacement (temporarily (mis)uses p) */
3726                 for (p = collstart; p < collend;) {
3727                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3728                     str += sprintf(str, "&#%d;", (int)ch);
3729                 }
3730                 p = collend;
3731                 break;
3732             default:
3733                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3734                                                               encoding, reason, startp, size, &exc,
3735                                                               collstart-startp, collend-startp, &newpos);
3736                 if (repunicode == NULL)
3737                     goto onError;
3738                 /* need more space? (at least enough for what we have+the
3739                    replacement+the rest of the string, so we won't have to
3740                    check space for encodable characters) */
3741                 respos = str - PyString_AS_STRING(res);
3742                 repsize = PyUnicode_GET_SIZE(repunicode);
3743                 if (respos > PY_SSIZE_T_MAX - repsize)
3744                     goto overflow;
3745                 requiredsize = respos + repsize;
3746                 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3747                     goto overflow;
3748                 requiredsize += endp - collend;
3749                 if (requiredsize > ressize) {
3750                     if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3751                         requiredsize = 2*ressize;
3752                     if (_PyString_Resize(&res, requiredsize)) {
3753                         Py_DECREF(repunicode);
3754                         goto onError;
3755                     }
3756                     str = PyString_AS_STRING(res) + respos;
3757                     ressize = requiredsize;
3758                 }
3759                 /* check if there is anything unencodable in the replacement
3760                    and copy it to the output */
3761                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3762                     c = *uni2;
3763                     if (c >= limit) {
3764                         raise_encode_exception(&exc, encoding, startp, size,
3765                                                unicodepos, unicodepos+1, reason);
3766                         Py_DECREF(repunicode);
3767                         goto onError;
3768                     }
3769                     *str = (char)c;
3770                 }
3771                 p = startp + newpos;
3772                 Py_DECREF(repunicode);
3773             }
3774         }
3775     }
3776     /* Resize if we allocated to much */
3777     respos = str - PyString_AS_STRING(res);
3778     if (respos < ressize)
3779         /* If this falls res will be NULL */
3780         _PyString_Resize(&res, respos);
3781     Py_XDECREF(errorHandler);
3782     Py_XDECREF(exc);
3783     return res;
3784 
3785   overflow:
3786     PyErr_SetString(PyExc_OverflowError,
3787                     "encoded result is too long for a Python string");
3788 
3789   onError:
3790     Py_XDECREF(res);
3791     Py_XDECREF(errorHandler);
3792     Py_XDECREF(exc);
3793     return NULL;
3794 }
3795 
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3796 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3797                                  Py_ssize_t size,
3798                                  const char *errors)
3799 {
3800     return unicode_encode_ucs1(p, size, errors, 256);
3801 }
3802 
/* Encode a unicode object as Latin-1 with the default error handling
   (errors == NULL).  Reports a bad-argument error and returns NULL when
   `unicode` is not a unicode object. */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeLatin1(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3813 
3814 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3815 
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)3816 PyObject *PyUnicode_DecodeASCII(const char *s,
3817                                 Py_ssize_t size,
3818                                 const char *errors)
3819 {
3820     const char *starts = s;
3821     PyUnicodeObject *v;
3822     Py_UNICODE *p;
3823     Py_ssize_t startinpos;
3824     Py_ssize_t endinpos;
3825     Py_ssize_t outpos;
3826     const char *e;
3827     PyObject *errorHandler = NULL;
3828     PyObject *exc = NULL;
3829 
3830     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3831     if (size == 1 && *(unsigned char*)s < 128) {
3832         Py_UNICODE r = *(unsigned char*)s;
3833         return PyUnicode_FromUnicode(&r, 1);
3834     }
3835 
3836     v = _PyUnicode_New(size);
3837     if (v == NULL)
3838         goto onError;
3839     if (size == 0)
3840         return (PyObject *)v;
3841     p = PyUnicode_AS_UNICODE(v);
3842     e = s + size;
3843     while (s < e) {
3844         register unsigned char c = (unsigned char)*s;
3845         if (c < 128) {
3846             *p++ = c;
3847             ++s;
3848         }
3849         else {
3850             startinpos = s-starts;
3851             endinpos = startinpos + 1;
3852             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3853             if (unicode_decode_call_errorhandler(
3854                     errors, &errorHandler,
3855                     "ascii", "ordinal not in range(128)",
3856                     starts, size, &startinpos, &endinpos, &exc, &s,
3857                     &v, &outpos, &p))
3858                 goto onError;
3859         }
3860     }
3861     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3862         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3863             goto onError;
3864     Py_XDECREF(errorHandler);
3865     Py_XDECREF(exc);
3866     return (PyObject *)v;
3867 
3868   onError:
3869     Py_XDECREF(v);
3870     Py_XDECREF(errorHandler);
3871     Py_XDECREF(exc);
3872     return NULL;
3873 }
3874 
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3875 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3876                                 Py_ssize_t size,
3877                                 const char *errors)
3878 {
3879     return unicode_encode_ucs1(p, size, errors, 128);
3880 }
3881 
/* Encode a unicode object as ASCII with the default error handling
   (errors == NULL).  Reports a bad-argument error and returns NULL when
   `unicode` is not a unicode object. */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeASCII(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3892 
3893 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3894 
3895 /* --- MBCS codecs for Windows -------------------------------------------- */
3896 
3897 #if SIZEOF_INT < SIZEOF_SIZE_T
3898 #define NEED_RETRY
3899 #endif
3900 
3901 /* XXX This code is limited to "true" double-byte encodings, as
3902    a) it assumes an incomplete character consists of a single byte, and
3903    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3904    encodings, see IsDBCSLeadByteEx documentation. */
3905 
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte in
   its own right, 0 otherwise.

   IsDBCSLeadByte() alone only classifies the byte value, so the character
   preceding it (as reported by CharPrev()) is inspected as well: the byte
   is accepted when there is no preceding character, when the preceding
   character does not start with a lead byte, or when the preceding
   character is two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3916 
3917 /*
3918  * Decode MBCS string into unicode object. If 'final' is set, converts
3919  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3920  */
decode_mbcs(PyUnicodeObject ** v,const char * s,int size,int final)3921 static int decode_mbcs(PyUnicodeObject **v,
3922                        const char *s, /* MBCS string */
3923                        int size, /* sizeof MBCS string */
3924                        int final)
3925 {
3926     Py_UNICODE *p;
3927     Py_ssize_t n = 0;
3928     int usize = 0;
3929 
3930     assert(size >= 0);
3931 
3932     /* Skip trailing lead-byte unless 'final' is set */
3933     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3934         --size;
3935 
3936     /* First get the size of the result */
3937     if (size > 0) {
3938         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3939         if (usize == 0) {
3940             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3941             return -1;
3942         }
3943     }
3944 
3945     if (*v == NULL) {
3946         /* Create unicode object */
3947         *v = _PyUnicode_New(usize);
3948         if (*v == NULL)
3949             return -1;
3950     }
3951     else {
3952         /* Extend unicode object */
3953         n = PyUnicode_GET_SIZE(*v);
3954         if (_PyUnicode_Resize(v, n + usize) < 0)
3955             return -1;
3956     }
3957 
3958     /* Do the conversion */
3959     if (size > 0) {
3960         p = PyUnicode_AS_UNICODE(*v) + n;
3961         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3962             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3963             return -1;
3964         }
3965     }
3966 
3967     return size;
3968 }
3969 
/* Decode `size` bytes at `s` from MBCS and return a new unicode object,
   or NULL if decode_mbcs() reports an error.

   If `consumed` is non-NULL it receives the number of input bytes that
   were actually decoded, and a trailing lead byte is left undecoded; if it
   is NULL the whole input is treated as final.  `errors` is not used by
   this function. */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed)
        *consumed = 0;

    /* decode_mbcs() takes an int length.  When NEED_RETRY is defined,
       inputs longer than INT_MAX are fed to it in INT_MAX-sized chunks;
       every chunk but the last is decoded as non-final. */
#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        done = decode_mbcs(&v, s, INT_MAX, 0);
    else
#endif
        /* Last (or only) chunk: final exactly when the caller did not ask
           for a consumed count. */
        done = decode_mbcs(&v, s, (int)size, !consumed);

    if (done < 0) {
        Py_XDECREF(v);
        return NULL;
    }

    if (consumed)
        *consumed += done;

#ifdef NEED_RETRY
    /* More input left: advance by what was consumed and go again;
       decode_mbcs() appends to v on subsequent calls. */
    if (size > INT_MAX) {
        s += done;
        size -= done;
        goto retry;
    }
#endif

    return (PyObject *)v;
}
4007 
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)4008 PyObject *PyUnicode_DecodeMBCS(const char *s,
4009                                Py_ssize_t size,
4010                                const char *errors)
4011 {
4012     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4013 }
4014 
4015 /*
4016  * Convert unicode into string object (MBCS).
4017  * Returns 0 if succeed, -1 otherwise.
4018  */
encode_mbcs(PyObject ** repr,const Py_UNICODE * p,int size)4019 static int encode_mbcs(PyObject **repr,
4020                        const Py_UNICODE *p, /* unicode */
4021                        int size) /* size of unicode */
4022 {
4023     int mbcssize = 0;
4024     Py_ssize_t n = 0;
4025 
4026     assert(size >= 0);
4027 
4028     /* First get the size of the result */
4029     if (size > 0) {
4030         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4031         if (mbcssize == 0) {
4032             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033             return -1;
4034         }
4035     }
4036 
4037     if (*repr == NULL) {
4038         /* Create string object */
4039         *repr = PyString_FromStringAndSize(NULL, mbcssize);
4040         if (*repr == NULL)
4041             return -1;
4042     }
4043     else {
4044         /* Extend string object */
4045         n = PyString_Size(*repr);
4046         if (_PyString_Resize(repr, n + mbcssize) < 0)
4047             return -1;
4048     }
4049 
4050     /* Do the conversion */
4051     if (size > 0) {
4052         char *s = PyString_AS_STRING(*repr) + n;
4053         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4054             PyErr_SetFromWindowsErrWithFilename(0, NULL);
4055             return -1;
4056         }
4057     }
4058 
4059     return 0;
4060 }
4061 
/* Encode `size` code units at `p` to MBCS and return a new string object,
   or NULL if encode_mbcs() reports an error.  `errors` is not used by
   this function. */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                               Py_ssize_t size,
                               const char *errors)
{
    PyObject *repr = NULL;
    int ret;

    /* encode_mbcs() takes an int length.  When NEED_RETRY is defined,
       inputs longer than INT_MAX are fed to it in INT_MAX-sized chunks. */
#ifdef NEED_RETRY
  retry:
    if (size > INT_MAX)
        ret = encode_mbcs(&repr, p, INT_MAX);
    else
#endif
        /* Last (or only) chunk. */
        ret = encode_mbcs(&repr, p, (int)size);

    if (ret < 0) {
        Py_XDECREF(repr);
        return NULL;
    }

#ifdef NEED_RETRY
    /* More input left: step past the chunk just encoded and go again;
       encode_mbcs() appends to repr on subsequent calls. */
    if (size > INT_MAX) {
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
    }
#endif

    return repr;
}
4092 
/* Encode a unicode object to MBCS with errors == NULL.  Reports a
   bad-argument error and returns NULL when `unicode` is not a unicode
   object. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeMBCS(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
4103 
4104 #undef NEED_RETRY
4105 
4106 #endif /* MS_WINDOWS */
4107 
4108 /* --- Character Mapping Codec -------------------------------------------- */
4109 
/* Decode `size` bytes at `s` through a character mapping and return a new
   unicode object, or NULL with an exception set.

   mapping == NULL      : the input is decoded as Latin-1.
   exact unicode object : byte b maps to mapping[b]; bytes beyond the end
                          of the string, and entries equal to U+FFFE, are
                          undefined.
   any other object     : mapping[b] is looked up for each byte.  The
                          result may be an integer in range(0x110000), a
                          unicode string of any length (empty means "drop
                          the byte"), or None.  None, 0xFFFE and a
                          LookupError all mean "undefined"; any other type
                          raises TypeError.

   Undefined bytes are reported to the error handling scheme named by
   `errors`.

   The output is allocated with one slot per input byte.  `extrachars`
   counts how many slots exist beyond "one per remaining input byte";
   mappings that produce more than one code unit draw on that surplus and
   grow the buffer when it runs out. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: index straight into the mapping string. */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    goto Undefined;
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (x == Py_None)
                goto Undefined;
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value == 0xFFFE)
                    goto Undefined;
                if (value < 0 || value > 0x10FFFF) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(0x110000)");
                    Py_DECREF(x);
                    goto onError;
                }

#ifndef Py_UNICODE_WIDE
                if (value > 0xFFFF) {
                    /* Needs a surrogate pair, i.e. two output slots for
                       one input byte; see the code for 1-n mapping below */
                    if (extrachars < 2) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = 10 - extrachars;
                        if (needed > PY_SSIZE_T_MAX - PyUnicode_GET_SIZE(v)) {
                            Py_DECREF(x);
                            goto overflow;
                        }
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    value -= 0x10000;
                    *p++ = 0xD800 | (value >> 10);
                    *p++ = 0xDC00 | (value & 0x3FF);
                    extrachars -= 2;
                }
                else
#endif
                *p++ = (Py_UNICODE)value;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1) {
                    /* 1-1 mapping */
                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
                    if (value == 0xFFFE)
                        goto Undefined;
                    *p++ = value;
                }
                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t cursize = PyUnicode_GET_SIZE(v);
                        Py_ssize_t needed;
                        /* The growth below is at most 5*targetsize, so
                           this bound keeps both `needed` and the new
                           total length within Py_ssize_t. */
                        if (targetsize > (PY_SSIZE_T_MAX - cursize) / 5) {
                            Py_DECREF(x);
                            goto overflow;
                        }
                        /* cover the shortfall and over-allocate by
                           4*targetsize to amortise later 1-n mappings */
                        needed = (targetsize - extrachars) +
                            (targetsize << 2);
                        extrachars += needed;
                        if (_PyUnicode_Resize(&v, cursize + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
            continue;
Undefined:
            /* undefined mapping (x may be NULL here after a LookupError) */
            Py_XDECREF(x);
            outpos = p-PyUnicode_AS_UNICODE(v);
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p)) {
                goto onError;
            }
            /* The handler is handed v, p and s and may have written a
               replacement of any length, moved the input position and/or
               resized the buffer, so the cached surplus can no longer be
               trusted.  Recompute it from the real state: free output
               slots minus remaining input bytes. */
            extrachars = PyUnicode_GET_SIZE(v)
                - (p - PyUnicode_AS_UNICODE(v)) - (e - s);
            if (extrachars < 0)
                extrachars = 0;
        }
    }
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4298 
4299 /* Charmap encoding: the lookup table */
4300 
4301 struct encoding_map{
4302     PyObject_HEAD
4303     unsigned char level1[32];
4304     int count2, count3;
4305     unsigned char level23[1];
4306 };
4307 
4308 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)4309 encoding_map_size(PyObject *obj, PyObject* args)
4310 {
4311     struct encoding_map *map = (struct encoding_map*)obj;
4312     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4313                           128*map->count3);
4314 }
4315 
4316 static PyMethodDef encoding_map_methods[] = {
4317     {"size", encoding_map_size, METH_NOARGS,
4318      PyDoc_STR("Return the size (in bytes) of this object") },
4319     { 0 }
4320 };
4321 
4322 static void
encoding_map_dealloc(PyObject * o)4323 encoding_map_dealloc(PyObject* o)
4324 {
4325     PyObject_FREE(o);
4326 }
4327 
4328 static PyTypeObject EncodingMapType = {
4329     PyVarObject_HEAD_INIT(NULL, 0)
4330     "EncodingMap",          /*tp_name*/
4331     sizeof(struct encoding_map),   /*tp_basicsize*/
4332     0,                      /*tp_itemsize*/
4333     /* methods */
4334     encoding_map_dealloc,   /*tp_dealloc*/
4335     0,                      /*tp_print*/
4336     0,                      /*tp_getattr*/
4337     0,                      /*tp_setattr*/
4338     0,                      /*tp_compare*/
4339     0,                      /*tp_repr*/
4340     0,                      /*tp_as_number*/
4341     0,                      /*tp_as_sequence*/
4342     0,                      /*tp_as_mapping*/
4343     0,                      /*tp_hash*/
4344     0,                      /*tp_call*/
4345     0,                      /*tp_str*/
4346     0,                      /*tp_getattro*/
4347     0,                      /*tp_setattro*/
4348     0,                      /*tp_as_buffer*/
4349     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
4350     0,                      /*tp_doc*/
4351     0,                      /*tp_traverse*/
4352     0,                      /*tp_clear*/
4353     0,                      /*tp_richcompare*/
4354     0,                      /*tp_weaklistoffset*/
4355     0,                      /*tp_iter*/
4356     0,                      /*tp_iternext*/
4357     encoding_map_methods,   /*tp_methods*/
4358     0,                      /*tp_members*/
4359     0,                      /*tp_getset*/
4360     0,                      /*tp_base*/
4361     0,                      /*tp_dict*/
4362     0,                      /*tp_descr_get*/
4363     0,                      /*tp_descr_set*/
4364     0,                      /*tp_dictoffset*/
4365     0,                      /*tp_init*/
4366     0,                      /*tp_alloc*/
4367     0,                      /*tp_new*/
4368     0,                      /*tp_free*/
4369     0,                      /*tp_is_gc*/
4370 };
4371 
4372 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)4373 PyUnicode_BuildEncodingMap(PyObject* string)
4374 {
4375     Py_UNICODE *decode;
4376     PyObject *result;
4377     struct encoding_map *mresult;
4378     int i;
4379     int need_dict = 0;
4380     unsigned char level1[32];
4381     unsigned char level2[512];
4382     unsigned char *mlevel1, *mlevel2, *mlevel3;
4383     int count2 = 0, count3 = 0;
4384 
4385     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4386         PyErr_BadArgument();
4387         return NULL;
4388     }
4389     decode = PyUnicode_AS_UNICODE(string);
4390     memset(level1, 0xFF, sizeof level1);
4391     memset(level2, 0xFF, sizeof level2);
4392 
4393     /* If there isn't a one-to-one mapping of NULL to \0,
4394        or if there are non-BMP characters, we need to use
4395        a mapping dictionary. */
4396     if (decode[0] != 0)
4397         need_dict = 1;
4398     for (i = 1; i < 256; i++) {
4399         int l1, l2;
4400         if (decode[i] == 0
4401 #ifdef Py_UNICODE_WIDE
4402             || decode[i] > 0xFFFF
4403 #endif
4404             ) {
4405             need_dict = 1;
4406             break;
4407         }
4408         if (decode[i] == 0xFFFE)
4409             /* unmapped character */
4410             continue;
4411         l1 = decode[i] >> 11;
4412         l2 = decode[i] >> 7;
4413         if (level1[l1] == 0xFF)
4414             level1[l1] = count2++;
4415         if (level2[l2] == 0xFF)
4416             level2[l2] = count3++;
4417     }
4418 
4419     if (count2 >= 0xFF || count3 >= 0xFF)
4420         need_dict = 1;
4421 
4422     if (need_dict) {
4423         PyObject *result = PyDict_New();
4424         PyObject *key, *value;
4425         if (!result)
4426             return NULL;
4427         for (i = 0; i < 256; i++) {
4428             value = NULL;
4429             key = PyInt_FromLong(decode[i]);
4430             value = PyInt_FromLong(i);
4431             if (!key || !value)
4432                 goto failed1;
4433             if (PyDict_SetItem(result, key, value) == -1)
4434                 goto failed1;
4435             Py_DECREF(key);
4436             Py_DECREF(value);
4437         }
4438         return result;
4439       failed1:
4440         Py_XDECREF(key);
4441         Py_XDECREF(value);
4442         Py_DECREF(result);
4443         return NULL;
4444     }
4445 
4446     /* Create a three-level trie */
4447     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4448                              16*count2 + 128*count3 - 1);
4449     if (!result)
4450         return PyErr_NoMemory();
4451     PyObject_Init(result, &EncodingMapType);
4452     mresult = (struct encoding_map*)result;
4453     mresult->count2 = count2;
4454     mresult->count3 = count3;
4455     mlevel1 = mresult->level1;
4456     mlevel2 = mresult->level23;
4457     mlevel3 = mresult->level23 + 16*count2;
4458     memcpy(mlevel1, level1, 32);
4459     memset(mlevel2, 0xFF, 16*count2);
4460     memset(mlevel3, 0, 128*count3);
4461     count3 = 0;
4462     for (i = 1; i < 256; i++) {
4463         int o1, o2, o3, i2, i3;
4464         if (decode[i] == 0xFFFE)
4465             /* unmapped character */
4466             continue;
4467         o1 = decode[i]>>11;
4468         o2 = (decode[i]>>7) & 0xF;
4469         i2 = 16*mlevel1[o1] + o2;
4470         if (mlevel2[i2] == 0xFF)
4471             mlevel2[i2] = count3++;
4472         o3 = decode[i] & 0x7F;
4473         i3 = 128*mlevel2[i2] + o3;
4474         mlevel3[i3] = i;
4475     }
4476     return result;
4477 }
4478 
4479 static int
encoding_map_lookup(Py_UNICODE c,PyObject * mapping)4480 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4481 {
4482     struct encoding_map *map = (struct encoding_map*)mapping;
4483     int l1 = c>>11;
4484     int l2 = (c>>7) & 0xF;
4485     int l3 = c & 0x7F;
4486     int i;
4487 
4488 #ifdef Py_UNICODE_WIDE
4489     if (c > 0xFFFF) {
4490         return -1;
4491     }
4492 #endif
4493     if (c == 0)
4494         return 0;
4495     /* level 1*/
4496     i = map->level1[l1];
4497     if (i == 0xFF) {
4498         return -1;
4499     }
4500     /* level 2*/
4501     i = map->level23[16*i+l2];
4502     if (i == 0xFF) {
4503         return -1;
4504     }
4505     /* level 3 */
4506     i = map->level23[16*map->count2 + 128*i + l3];
4507     if (i == 0) {
4508         return -1;
4509     }
4510     return i;
4511 }
4512 
4513 /* Lookup the character ch in the mapping. If the character
4514    can't be found, Py_None is returned (or NULL, if another
4515    error occurred). */
charmapencode_lookup(Py_UNICODE c,PyObject * mapping)4516 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4517 {
4518     PyObject *w = PyInt_FromLong((long)c);
4519     PyObject *x;
4520 
4521     if (w == NULL)
4522         return NULL;
4523     x = PyObject_GetItem(mapping, w);
4524     Py_DECREF(w);
4525     if (x == NULL) {
4526         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4527             /* No mapping found means: mapping is undefined. */
4528             PyErr_Clear();
4529             x = Py_None;
4530             Py_INCREF(x);
4531             return x;
4532         } else
4533             return NULL;
4534     }
4535     else if (x == Py_None)
4536         return x;
4537     else if (PyInt_Check(x)) {
4538         long value = PyInt_AS_LONG(x);
4539         if (value < 0 || value > 255) {
4540             PyErr_SetString(PyExc_TypeError,
4541                             "character mapping must be in range(256)");
4542             Py_DECREF(x);
4543             return NULL;
4544         }
4545         return x;
4546     }
4547     else if (PyString_Check(x))
4548         return x;
4549     else {
4550         /* wrong return value */
4551         PyErr_SetString(PyExc_TypeError,
4552                         "character mapping must return integer, None or str");
4553         Py_DECREF(x);
4554         return NULL;
4555     }
4556 }
4557 
4558 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)4559 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4560 {
4561     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4562     /* exponentially overallocate to minimize reallocations */
4563     if (requiredsize < 2*outsize)
4564         requiredsize = 2*outsize;
4565     if (_PyString_Resize(outobj, requiredsize)) {
4566         return 0;
4567     }
4568     return 1;
4569 }
4570 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
4574 /* lookup the character, put the result in the output string and adjust
4575    various state variables. Reallocate the output string if not enough
4576    space is available. Return a new reference to the object that
4577    was put in the output buffer, or Py_None, if the mapping was undefined
4578    (in which case no character was written) or NULL, if a
4579    reallocation error occurred. The caller must decref the result */
4580 static
charmapencode_output(Py_UNICODE c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)4581 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4582                                           PyObject **outobj, Py_ssize_t *outpos)
4583 {
4584     PyObject *rep;
4585     char *outstart;
4586     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4587 
4588     if (Py_TYPE(mapping) == &EncodingMapType) {
4589         int res = encoding_map_lookup(c, mapping);
4590         Py_ssize_t requiredsize = *outpos+1;
4591         if (res == -1)
4592             return enc_FAILED;
4593         if (outsize<requiredsize)
4594             if (!charmapencode_resize(outobj, outpos, requiredsize))
4595                 return enc_EXCEPTION;
4596         outstart = PyString_AS_STRING(*outobj);
4597         outstart[(*outpos)++] = (char)res;
4598         return enc_SUCCESS;
4599     }
4600 
4601     rep = charmapencode_lookup(c, mapping);
4602     if (rep==NULL)
4603         return enc_EXCEPTION;
4604     else if (rep==Py_None) {
4605         Py_DECREF(rep);
4606         return enc_FAILED;
4607     } else {
4608         if (PyInt_Check(rep)) {
4609             Py_ssize_t requiredsize = *outpos+1;
4610             if (outsize<requiredsize)
4611                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4612                     Py_DECREF(rep);
4613                     return enc_EXCEPTION;
4614                 }
4615             outstart = PyString_AS_STRING(*outobj);
4616             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4617         }
4618         else {
4619             const char *repchars = PyString_AS_STRING(rep);
4620             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4621             Py_ssize_t requiredsize = *outpos+repsize;
4622             if (outsize<requiredsize)
4623                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4624                     Py_DECREF(rep);
4625                     return enc_EXCEPTION;
4626                 }
4627             outstart = PyString_AS_STRING(*outobj);
4628             memcpy(outstart + *outpos, repchars, repsize);
4629             *outpos += repsize;
4630         }
4631     }
4632     Py_DECREF(rep);
4633     return enc_SUCCESS;
4634 }
4635 
4636 /* handle an error in PyUnicode_EncodeCharmap
4637    Return 0 on success, -1 on error */
4638 static
charmap_encoding_error(const Py_UNICODE * p,Py_ssize_t size,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,int * known_errorHandler,PyObject ** errorHandler,const char * errors,PyObject ** res,Py_ssize_t * respos)4639 int charmap_encoding_error(
4640     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4641     PyObject **exceptionObject,
4642     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4643     PyObject **res, Py_ssize_t *respos)
4644 {
4645     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4646     Py_ssize_t repsize;
4647     Py_ssize_t newpos;
4648     Py_UNICODE *uni2;
4649     /* startpos for collecting unencodable chars */
4650     Py_ssize_t collstartpos = *inpos;
4651     Py_ssize_t collendpos = *inpos+1;
4652     Py_ssize_t collpos;
4653     char *encoding = "charmap";
4654     char *reason = "character maps to <undefined>";
4655     charmapencode_result x;
4656 
4657     /* find all unencodable characters */
4658     while (collendpos < size) {
4659         PyObject *rep;
4660         if (Py_TYPE(mapping) == &EncodingMapType) {
4661             int res = encoding_map_lookup(p[collendpos], mapping);
4662             if (res != -1)
4663                 break;
4664             ++collendpos;
4665             continue;
4666         }
4667 
4668         rep = charmapencode_lookup(p[collendpos], mapping);
4669         if (rep==NULL)
4670             return -1;
4671         else if (rep!=Py_None) {
4672             Py_DECREF(rep);
4673             break;
4674         }
4675         Py_DECREF(rep);
4676         ++collendpos;
4677     }
4678     /* cache callback name lookup
4679      * (if not done yet, i.e. it's the first error) */
4680     if (*known_errorHandler==-1) {
4681         if ((errors==NULL) || (!strcmp(errors, "strict")))
4682             *known_errorHandler = 1;
4683         else if (!strcmp(errors, "replace"))
4684             *known_errorHandler = 2;
4685         else if (!strcmp(errors, "ignore"))
4686             *known_errorHandler = 3;
4687         else if (!strcmp(errors, "xmlcharrefreplace"))
4688             *known_errorHandler = 4;
4689         else
4690             *known_errorHandler = 0;
4691     }
4692     switch (*known_errorHandler) {
4693     case 1: /* strict */
4694         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4695         return -1;
4696     case 2: /* replace */
4697         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4698             x = charmapencode_output('?', mapping, res, respos);
4699             if (x==enc_EXCEPTION) {
4700                 return -1;
4701             }
4702             else if (x==enc_FAILED) {
4703                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4704                 return -1;
4705             }
4706         }
4707         /* fall through */
4708     case 3: /* ignore */
4709         *inpos = collendpos;
4710         break;
4711     case 4: /* xmlcharrefreplace */
4712         /* generate replacement */
4713         for (collpos = collstartpos; collpos < collendpos;) {
4714             char buffer[2+29+1+1];
4715             char *cp;
4716             Py_UCS4 ch = p[collpos++];
4717 #ifndef Py_UNICODE_WIDE
4718             if ((0xD800 <= ch && ch <= 0xDBFF) &&
4719                 (collpos < collendpos) &&
4720                 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4721                 ch = ((((ch & 0x03FF) << 10) |
4722                        ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4723             }
4724 #endif
4725             sprintf(buffer, "&#%d;", (int)ch);
4726             for (cp = buffer; *cp; ++cp) {
4727                 x = charmapencode_output(*cp, mapping, res, respos);
4728                 if (x==enc_EXCEPTION)
4729                     return -1;
4730                 else if (x==enc_FAILED) {
4731                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4732                     return -1;
4733                 }
4734             }
4735         }
4736         *inpos = collendpos;
4737         break;
4738     default:
4739         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4740                                                       encoding, reason, p, size, exceptionObject,
4741                                                       collstartpos, collendpos, &newpos);
4742         if (repunicode == NULL)
4743             return -1;
4744         /* generate replacement  */
4745         repsize = PyUnicode_GET_SIZE(repunicode);
4746         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4747             x = charmapencode_output(*uni2, mapping, res, respos);
4748             if (x==enc_EXCEPTION) {
4749                 return -1;
4750             }
4751             else if (x==enc_FAILED) {
4752                 Py_DECREF(repunicode);
4753                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4754                 return -1;
4755             }
4756         }
4757         *inpos = newpos;
4758         Py_DECREF(repunicode);
4759     }
4760     return 0;
4761 }
4762 
/* Encode the size characters at p into a str using mapping.

   mapping is an EncodingMap object or any object supporting item
   lookup; a NULL mapping selects Latin-1 encoding.  errors names the
   error handling scheme (NULL means "strict").  Returns a new str
   object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                        Py_ssize_t size,
                        PyObject *mapping,
                        const char *errors)
{
    PyObject *res = NULL;           /* output object */
    Py_ssize_t inpos = 0;           /* current input position */
    Py_ssize_t respos = 0;          /* current output position */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* cache for the string comparison of errors:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_EncodeLatin1(p, size, errors);

    /* Start with one byte per character; the output helpers grow the
       string when a replacement needs more room. */
    res = PyString_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    if (size == 0)
        return res;

    while (inpos < size) {
        switch (charmapencode_output(p[inpos], mapping, &res, &respos)) {
        case enc_EXCEPTION:
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(p, size, &inpos, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &res, &respos)) {
                goto onError;
            }
            break;
        default:
            /* done with this character => adjust input position */
            ++inpos;
            break;
        }
    }

    /* Trim the string if more was allocated than was written */
    if (respos < PyString_GET_SIZE(res) && _PyString_Resize(&res, respos))
        goto onError;

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
4826 
/* Encode a unicode object with mapping using the default (NULL) error
   handling.  Calls PyErr_BadArgument() and returns NULL if unicode is
   not a unicode object or mapping is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode, PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       mapping,
                                       NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
4839 
4840 /* create or adjust a UnicodeTranslateError */
make_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4841 static void make_translate_exception(PyObject **exceptionObject,
4842                                      const Py_UNICODE *unicode, Py_ssize_t size,
4843                                      Py_ssize_t startpos, Py_ssize_t endpos,
4844                                      const char *reason)
4845 {
4846     if (*exceptionObject == NULL) {
4847         *exceptionObject = PyUnicodeTranslateError_Create(
4848             unicode, size, startpos, endpos, reason);
4849     }
4850     else {
4851         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4852             goto onError;
4853         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4854             goto onError;
4855         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4856             goto onError;
4857         return;
4858       onError:
4859         Py_CLEAR(*exceptionObject);
4860     }
4861 }
4862 
4863 /* raises a UnicodeTranslateError */
raise_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4864 static void raise_translate_exception(PyObject **exceptionObject,
4865                                       const Py_UNICODE *unicode, Py_ssize_t size,
4866                                       Py_ssize_t startpos, Py_ssize_t endpos,
4867                                       const char *reason)
4868 {
4869     make_translate_exception(exceptionObject,
4870                              unicode, size, startpos, endpos, reason);
4871     if (*exceptionObject != NULL)
4872         PyCodec_StrictErrors(*exceptionObject);
4873 }
4874 
4875 /* error handling callback helper:
4876    build arguments, call the callback and check the arguments,
4877    put the result into newpos and return the replacement string, which
4878    has to be freed by the caller */
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)4879 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4880                                                      PyObject **errorHandler,
4881                                                      const char *reason,
4882                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4883                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4884                                                      Py_ssize_t *newpos)
4885 {
4886     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4887 
4888     Py_ssize_t i_newpos;
4889     PyObject *restuple;
4890     PyObject *resunicode;
4891 
4892     if (*errorHandler == NULL) {
4893         *errorHandler = PyCodec_LookupError(errors);
4894         if (*errorHandler == NULL)
4895             return NULL;
4896     }
4897 
4898     make_translate_exception(exceptionObject,
4899                              unicode, size, startpos, endpos, reason);
4900     if (*exceptionObject == NULL)
4901         return NULL;
4902 
4903     restuple = PyObject_CallFunctionObjArgs(
4904         *errorHandler, *exceptionObject, NULL);
4905     if (restuple == NULL)
4906         return NULL;
4907     if (!PyTuple_Check(restuple)) {
4908         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4909         Py_DECREF(restuple);
4910         return NULL;
4911     }
4912     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4913                           &resunicode, &i_newpos)) {
4914         Py_DECREF(restuple);
4915         return NULL;
4916     }
4917     if (i_newpos<0)
4918         *newpos = size+i_newpos;
4919     else
4920         *newpos = i_newpos;
4921     if (*newpos<0 || *newpos>size) {
4922         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4923         Py_DECREF(restuple);
4924         return NULL;
4925     }
4926     Py_INCREF(resunicode);
4927     Py_DECREF(restuple);
4928     return resunicode;
4929 }
4930 
4931 /* Lookup the character ch in the mapping and put the result in result,
4932    which must be decrefed by the caller.
4933    Return 0 on success, -1 on error */
4934 static
charmaptranslate_lookup(Py_UNICODE c,PyObject * mapping,PyObject ** result)4935 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4936 {
4937     PyObject *w = PyInt_FromLong((long)c);
4938     PyObject *x;
4939 
4940     if (w == NULL)
4941         return -1;
4942     x = PyObject_GetItem(mapping, w);
4943     Py_DECREF(w);
4944     if (x == NULL) {
4945         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4946             /* No mapping found means: use 1:1 mapping. */
4947             PyErr_Clear();
4948             *result = NULL;
4949             return 0;
4950         } else
4951             return -1;
4952     }
4953     else if (x == Py_None) {
4954         *result = x;
4955         return 0;
4956     }
4957     else if (PyInt_Check(x)) {
4958         long value = PyInt_AS_LONG(x);
4959         long max = PyUnicode_GetMax();
4960         if (value < 0 || value > max) {
4961             PyErr_Format(PyExc_TypeError,
4962                          "character mapping must be in range(0x%lx)", max+1);
4963             Py_DECREF(x);
4964             return -1;
4965         }
4966         *result = x;
4967         return 0;
4968     }
4969     else if (PyUnicode_Check(x)) {
4970         *result = x;
4971         return 0;
4972     }
4973     else {
4974         /* wrong return value */
4975         PyErr_SetString(PyExc_TypeError,
4976                         "character mapping must return integer, None or unicode");
4977         Py_DECREF(x);
4978         return -1;
4979     }
4980 }
4981 /* ensure that *outobj is at least requiredsize characters long,
4982    if not reallocate and adjust various state variables.
4983    Return 0 on success, -1 on error */
4984 static
charmaptranslate_makespace(PyObject ** outobj,Py_UNICODE ** outp,Py_ssize_t requiredsize)4985 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4986                                Py_ssize_t requiredsize)
4987 {
4988     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4989     if (requiredsize > oldsize) {
4990         /* remember old output position */
4991         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4992         /* exponentially overallocate to minimize reallocations */
4993         if (requiredsize < 2 * oldsize)
4994             requiredsize = 2 * oldsize;
4995         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4996             return -1;
4997         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4998     }
4999     return 0;
5000 }
5001 /* lookup the character, put the result in the output string and adjust
5002    various state variables. Return a new reference to the object that
5003    was put in the output buffer in *result, or Py_None, if the mapping was
5004    undefined (in which case no character was written).
5005    The called must decref result.
5006    Return 0 on success, -1 on error. */
5007 static
charmaptranslate_output(const Py_UNICODE * startinp,const Py_UNICODE * curinp,Py_ssize_t insize,PyObject * mapping,PyObject ** outobj,Py_UNICODE ** outp,PyObject ** res)5008 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5009                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5010                             PyObject **res)
5011 {
5012     if (charmaptranslate_lookup(*curinp, mapping, res))
5013         return -1;
5014     if (*res==NULL) {
5015         /* not found => default to 1:1 mapping */
5016         *(*outp)++ = *curinp;
5017     }
5018     else if (*res==Py_None)
5019         ;
5020     else if (PyInt_Check(*res)) {
5021         /* no overflow check, because we know that the space is enough */
5022         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5023     }
5024     else if (PyUnicode_Check(*res)) {
5025         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5026         if (repsize==1) {
5027             /* no overflow check, because we know that the space is enough */
5028             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5029         }
5030         else if (repsize!=0) {
5031             /* more than one character */
5032             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5033                 (insize - (curinp-startinp)) +
5034                 repsize - 1;
5035             if (charmaptranslate_makespace(outobj, outp, requiredsize))
5036                 return -1;
5037             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5038             *outp += repsize;
5039         }
5040     }
5041     else
5042         return -1;
5043     return 0;
5044 }
5045 
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)5046 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5047                                      Py_ssize_t size,
5048                                      PyObject *mapping,
5049                                      const char *errors)
5050 {
5051     /* output object */
5052     PyObject *res = NULL;
5053     /* pointers to the beginning and end+1 of input */
5054     const Py_UNICODE *startp = p;
5055     const Py_UNICODE *endp = p + size;
5056     /* pointer into the output */
5057     Py_UNICODE *str;
5058     /* current output position */
5059     Py_ssize_t respos = 0;
5060     char *reason = "character maps to <undefined>";
5061     PyObject *errorHandler = NULL;
5062     PyObject *exc = NULL;
5063     /* the following variable is used for caching string comparisons
5064      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5065      * 3=ignore, 4=xmlcharrefreplace */
5066     int known_errorHandler = -1;
5067 
5068     if (mapping == NULL) {
5069         PyErr_BadArgument();
5070         return NULL;
5071     }
5072 
5073     /* allocate enough for a simple 1:1 translation without
5074        replacements, if we need more, we'll resize */
5075     res = PyUnicode_FromUnicode(NULL, size);
5076     if (res == NULL)
5077         goto onError;
5078     if (size == 0)
5079         return res;
5080     str = PyUnicode_AS_UNICODE(res);
5081 
5082     while (p<endp) {
5083         /* try to encode it */
5084         PyObject *x = NULL;
5085         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5086             Py_XDECREF(x);
5087             goto onError;
5088         }
5089         Py_XDECREF(x);
5090         if (x!=Py_None) /* it worked => adjust input pointer */
5091             ++p;
5092         else { /* untranslatable character */
5093             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5094             Py_ssize_t repsize;
5095             Py_ssize_t newpos;
5096             Py_UNICODE *uni2;
5097             /* startpos for collecting untranslatable chars */
5098             const Py_UNICODE *collstart = p;
5099             const Py_UNICODE *collend = p+1;
5100             const Py_UNICODE *coll;
5101 
5102             /* find all untranslatable characters */
5103             while (collend < endp) {
5104                 if (charmaptranslate_lookup(*collend, mapping, &x))
5105                     goto onError;
5106                 Py_XDECREF(x);
5107                 if (x!=Py_None)
5108                     break;
5109                 ++collend;
5110             }
5111             /* cache callback name lookup
5112              * (if not done yet, i.e. it's the first error) */
5113             if (known_errorHandler==-1) {
5114                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5115                     known_errorHandler = 1;
5116                 else if (!strcmp(errors, "replace"))
5117                     known_errorHandler = 2;
5118                 else if (!strcmp(errors, "ignore"))
5119                     known_errorHandler = 3;
5120                 else if (!strcmp(errors, "xmlcharrefreplace"))
5121                     known_errorHandler = 4;
5122                 else
5123                     known_errorHandler = 0;
5124             }
5125             switch (known_errorHandler) {
5126             case 1: /* strict */
5127                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5128                 goto onError;
5129             case 2: /* replace */
5130                 /* No need to check for space, this is a 1:1 replacement */
5131                 for (coll = collstart; coll<collend; ++coll)
5132                     *str++ = '?';
5133                 /* fall through */
5134             case 3: /* ignore */
5135                 p = collend;
5136                 break;
5137             case 4: /* xmlcharrefreplace */
5138                 /* generate replacement (temporarily (mis)uses p) */
5139                 for (p = collstart; p < collend;) {
5140                     char buffer[2+29+1+1];
5141                     char *cp;
5142                     Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5143                     sprintf(buffer, "&#%d;", (int)ch);
5144                     if (charmaptranslate_makespace(&res, &str,
5145                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5146                         goto onError;
5147                     for (cp = buffer; *cp; ++cp)
5148                         *str++ = *cp;
5149                 }
5150                 p = collend;
5151                 break;
5152             default:
5153                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5154                                                                  reason, startp, size, &exc,
5155                                                                  collstart-startp, collend-startp, &newpos);
5156                 if (repunicode == NULL)
5157                     goto onError;
5158                 /* generate replacement  */
5159                 repsize = PyUnicode_GET_SIZE(repunicode);
5160                 if (charmaptranslate_makespace(&res, &str,
5161                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5162                     Py_DECREF(repunicode);
5163                     goto onError;
5164                 }
5165                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5166                     *str++ = *uni2;
5167                 p = startp + newpos;
5168                 Py_DECREF(repunicode);
5169             }
5170         }
5171     }
5172     /* Resize if we allocated to much */
5173     respos = str-PyUnicode_AS_UNICODE(res);
5174     if (respos<PyUnicode_GET_SIZE(res)) {
5175         if (PyUnicode_Resize(&res, respos) < 0)
5176             goto onError;
5177     }
5178     Py_XDECREF(exc);
5179     Py_XDECREF(errorHandler);
5180     return res;
5181 
5182   onError:
5183     Py_XDECREF(res);
5184     Py_XDECREF(exc);
5185     Py_XDECREF(errorHandler);
5186     return NULL;
5187 }
5188 
/* Translate `str` through `mapping`.

   `str` is first coerced with PyUnicode_FromObject(); its buffer and size
   are then handed to PyUnicode_TranslateCharmap() together with `mapping`
   and `errors`.  The value returned is whatever that call produced, or
   NULL if the coercion itself failed. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *translated;
    PyObject *ustr = PyUnicode_FromObject(str);

    if (ustr == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(ustr),
                                            PyUnicode_GET_SIZE(ustr),
                                            mapping,
                                            errors);
    /* The coerced object is only needed while the call is running. */
    Py_DECREF(ustr);
    return translated;
}
5209 
5210 /* --- Decimal Encoder ---------------------------------------------------- */
5211 
/* Encode the `length` Py_UNICODE characters starting at `s` into the
   caller-supplied char buffer `output`, followed by a terminating '\0'.

   Per character:
     - whitespace (Py_UNICODE_ISSPACE) becomes ' ';
     - characters with a decimal value (Py_UNICODE_TODECIMAL >= 0) become
       the corresponding ASCII digit;
     - any other character in the range 1..255 is copied as is;
     - everything else is unencodable and is handed to the error handler
       selected by `errors` (NULL or "strict", "replace", "ignore",
       "xmlcharrefreplace", or a registered callback name).

   `output` carries no size: nothing here checks for available room, so
   the caller must supply a buffer large enough for the encoded text,
   including any expansion caused by the error handler.

   Returns 0 on success and -1 with an exception set on failure. */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                            Py_ssize_t length,
                            char *output,
                            const char *errors)
{
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable; collect the
           whole run [collstart, collend) of them so the error handler is
           invoked once per run rather than once per character. */
        collstart = p;
        for (collend = p+1; collend < end; collend++) {
            if ((0 < *collend && *collend < 256) ||
                Py_UNICODE_ISSPACE(*collend) ||
                0 <= Py_UNICODE_TODECIMAL(*collend))
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend;) {
                Py_UCS4 ucs4 = _Py_UNICODE_NEXT(p, collend);
                /* %d expects an int; cast explicitly so the argument
                   type matches the conversion specifier. */
                output += sprintf(output, "&#%d;", (int)ucs4);
            }
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement: the replacement text is itself run
               through the same space/decimal/Latin-1 rules, and anything
               it contains outside them is a hard error. */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE rch = *uni2;
                if (Py_UNICODE_ISSPACE(rch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(rch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < rch && rch < 256)
                        *output++ = (char)rch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            /* resume where the error handler told us to */
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}
5341 
5342 /* --- Helpers ------------------------------------------------------------ */
5343 
5344 #include "stringlib/unicodedefs.h"
5345 #include "stringlib/fastsearch.h"
5346 
5347 #include "stringlib/count.h"
5348 #include "stringlib/find.h"
5349 #include "stringlib/partition.h"
5350 #include "stringlib/split.h"
5351 
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken
   relative to `len` and, if still negative, clamped to 0.  `start` is not
   clamped against `len` or `end`, so callers must cope with start > end.

   `start` and `end` must be modifiable lvalues and are evaluated more
   than once.  The body is wrapped in do { } while (0) so the macro acts
   as a single statement (safe under an unbraced if/else); invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5366 
/* Count occurrences of `substr` in the slice str[start:end].

   Both arguments are coerced with PyUnicode_FromObject(); the slice
   bounds are normalised with ADJUST_INDICES against the length of the
   coerced string, and the counting itself is done by stringlib_count()
   with no upper limit (PY_SSIZE_T_MAX).

   Returns the value produced by stringlib_count(), or -1 if either
   coercion fails. */
Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
{
    Py_ssize_t count = -1;
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;

    haystack = (PyUnicodeObject *) PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;

    needle = (PyUnicodeObject *) PyUnicode_FromObject(substr);
    if (needle != NULL) {
        ADJUST_INDICES(start, end, haystack->length);
        count = stringlib_count(haystack->str + start, end - start,
                                needle->str, needle->length,
                                PY_SSIZE_T_MAX);
        Py_DECREF(needle);
    }

    Py_DECREF(haystack);
    return count;
}
5396 
/* Search for `sub` inside the slice str[start:end].

   Both objects are coerced with PyUnicode_FromObject().  A positive
   `direction` dispatches to stringlib_find_slice(), anything else to
   stringlib_rfind_slice(); the slice bounds are passed through unchanged.

   Returns the stringlib result, or -2 if either coercion fails. */
Py_ssize_t PyUnicode_Find(PyObject *str,
                          PyObject *sub,
                          Py_ssize_t start,
                          Py_ssize_t end,
                          int direction)
{
    Py_ssize_t pos;
    PyObject *ustr;
    PyObject *usub;

    ustr = PyUnicode_FromObject(str);
    if (ustr == NULL)
        return -2;

    usub = PyUnicode_FromObject(sub);
    if (usub == NULL) {
        Py_DECREF(ustr);
        return -2;
    }

    if (direction <= 0) {
        /* backward search */
        pos = stringlib_rfind_slice(
            PyUnicode_AS_UNICODE(ustr), PyUnicode_GET_SIZE(ustr),
            PyUnicode_AS_UNICODE(usub), PyUnicode_GET_SIZE(usub),
            start, end
            );
    }
    else {
        /* forward search */
        pos = stringlib_find_slice(
            PyUnicode_AS_UNICODE(ustr), PyUnicode_GET_SIZE(ustr),
            PyUnicode_AS_UNICODE(usub), PyUnicode_GET_SIZE(usub),
            start, end
            );
    }

    Py_DECREF(ustr);
    Py_DECREF(usub);
    return pos;
}
5432 
5433 static
tailmatch(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)5434 int tailmatch(PyUnicodeObject *self,
5435               PyUnicodeObject *substring,
5436               Py_ssize_t start,
5437               Py_ssize_t end,
5438               int direction)
5439 {
5440     if (substring->length == 0)
5441         return 1;
5442 
5443     ADJUST_INDICES(start, end, self->length);
5444     end -= substring->length;
5445     if (end < start)
5446         return 0;
5447 
5448     if (direction > 0) {
5449         if (Py_UNICODE_MATCH(self, end, substring))
5450             return 1;
5451     } else {
5452         if (Py_UNICODE_MATCH(self, start, substring))
5453             return 1;
5454     }
5455 
5456     return 0;
5457 }
5458 
/* Public wrapper around tailmatch(): coerce `str` and `substr` with
   PyUnicode_FromObject() and report whether `substr` matches at the edge
   of str[start:end] selected by `direction`.

   Returns the tailmatch() result, or -1 if either coercion fails. */
Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
                               PyObject *substr,
                               Py_ssize_t start,
                               Py_ssize_t end,
                               int direction)
{
    Py_ssize_t matched;
    PyObject *ustr;
    PyObject *usub;

    ustr = PyUnicode_FromObject(str);
    if (ustr == NULL)
        return -1;

    usub = PyUnicode_FromObject(substr);
    if (usub == NULL) {
        Py_DECREF(ustr);
        return -1;
    }

    matched = tailmatch((PyUnicodeObject *)ustr,
                        (PyUnicodeObject *)usub,
                        start, end, direction);

    Py_DECREF(ustr);
    Py_DECREF(usub);
    return matched;
}
5483 
5484 /* Apply fixfct filter to the Unicode object self and return a
5485    reference to the modified object */
5486 
5487 static
fixup(PyUnicodeObject * self,int (* fixfct)(PyUnicodeObject * s))5488 PyObject *fixup(PyUnicodeObject *self,
5489                 int (*fixfct)(PyUnicodeObject *s))
5490 {
5491 
5492     PyUnicodeObject *u;
5493 
5494     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5495     if (u == NULL)
5496         return NULL;
5497 
5498     Py_UNICODE_COPY(u->str, self->str, self->length);
5499 
5500     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5501         /* fixfct should return TRUE if it modified the buffer. If
5502            FALSE, return a reference to the original buffer instead
5503            (to save space, not time) */
5504         Py_INCREF(self);
5505         Py_DECREF(u);
5506         return (PyObject*) self;
5507     }
5508     return (PyObject*) u;
5509 }
5510 
5511 static
fixupper(PyUnicodeObject * self)5512 int fixupper(PyUnicodeObject *self)
5513 {
5514     Py_ssize_t len = self->length;
5515     Py_UNICODE *s = self->str;
5516     int status = 0;
5517 
5518     while (len-- > 0) {
5519         register Py_UNICODE ch;
5520 
5521         ch = Py_UNICODE_TOUPPER(*s);
5522         if (ch != *s) {
5523             status = 1;
5524             *s = ch;
5525         }
5526         s++;
5527     }
5528 
5529     return status;
5530 }
5531 
5532 static
fixlower(PyUnicodeObject * self)5533 int fixlower(PyUnicodeObject *self)
5534 {
5535     Py_ssize_t len = self->length;
5536     Py_UNICODE *s = self->str;
5537     int status = 0;
5538 
5539     while (len-- > 0) {
5540         register Py_UNICODE ch;
5541 
5542         ch = Py_UNICODE_TOLOWER(*s);
5543         if (ch != *s) {
5544             status = 1;
5545             *s = ch;
5546         }
5547         s++;
5548     }
5549 
5550     return status;
5551 }
5552 
5553 static
fixswapcase(PyUnicodeObject * self)5554 int fixswapcase(PyUnicodeObject *self)
5555 {
5556     Py_ssize_t len = self->length;
5557     Py_UNICODE *s = self->str;
5558     int status = 0;
5559 
5560     while (len-- > 0) {
5561         if (Py_UNICODE_ISUPPER(*s)) {
5562             *s = Py_UNICODE_TOLOWER(*s);
5563             status = 1;
5564         } else if (Py_UNICODE_ISLOWER(*s)) {
5565             *s = Py_UNICODE_TOUPPER(*s);
5566             status = 1;
5567         }
5568         s++;
5569     }
5570 
5571     return status;
5572 }
5573 
5574 static
fixcapitalize(PyUnicodeObject * self)5575 int fixcapitalize(PyUnicodeObject *self)
5576 {
5577     Py_ssize_t len = self->length;
5578     Py_UNICODE *s = self->str;
5579     int status = 0;
5580 
5581     if (len == 0)
5582         return 0;
5583     if (!Py_UNICODE_ISUPPER(*s)) {
5584         *s = Py_UNICODE_TOUPPER(*s);
5585         status = 1;
5586     }
5587     s++;
5588     while (--len > 0) {
5589         if (!Py_UNICODE_ISLOWER(*s)) {
5590             *s = Py_UNICODE_TOLOWER(*s);
5591             status = 1;
5592         }
5593         s++;
5594     }
5595     return status;
5596 }
5597 
5598 static
fixtitle(PyUnicodeObject * self)5599 int fixtitle(PyUnicodeObject *self)
5600 {
5601     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5602     register Py_UNICODE *e;
5603     int previous_is_cased;
5604 
5605     /* Shortcut for single character strings */
5606     if (PyUnicode_GET_SIZE(self) == 1) {
5607         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5608         if (*p != ch) {
5609             *p = ch;
5610             return 1;
5611         }
5612         else
5613             return 0;
5614     }
5615 
5616     e = p + PyUnicode_GET_SIZE(self);
5617     previous_is_cased = 0;
5618     for (; p < e; p++) {
5619         register const Py_UNICODE ch = *p;
5620 
5621         if (previous_is_cased)
5622             *p = Py_UNICODE_TOLOWER(ch);
5623         else
5624             *p = Py_UNICODE_TOTITLE(ch);
5625 
5626         if (Py_UNICODE_ISLOWER(ch) ||
5627             Py_UNICODE_ISUPPER(ch) ||
5628             Py_UNICODE_ISTITLE(ch))
5629             previous_is_cased = 1;
5630         else
5631             previous_is_cased = 0;
5632     }
5633     return 1;
5634 }
5635 
5636 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)5637 PyUnicode_Join(PyObject *separator, PyObject *seq)
5638 {
5639     PyObject *internal_separator = NULL;
5640     const Py_UNICODE blank = ' ';
5641     const Py_UNICODE *sep = &blank;
5642     Py_ssize_t seplen = 1;
5643     PyUnicodeObject *res = NULL; /* the result */
5644     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5645     Py_ssize_t res_used;         /* # used bytes */
5646     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5647     PyObject *fseq;          /* PySequence_Fast(seq) */
5648     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5649     PyObject *item;
5650     Py_ssize_t i;
5651 
5652     fseq = PySequence_Fast(seq, "can only join an iterable");
5653     if (fseq == NULL) {
5654         return NULL;
5655     }
5656 
5657     /* Grrrr.  A codec may be invoked to convert str objects to
5658      * Unicode, and so it's possible to call back into Python code
5659      * during PyUnicode_FromObject(), and so it's possible for a sick
5660      * codec to change the size of fseq (if seq is a list).  Therefore
5661      * we have to keep refetching the size -- can't assume seqlen
5662      * is invariant.
5663      */
5664     seqlen = PySequence_Fast_GET_SIZE(fseq);
5665     /* If empty sequence, return u"". */
5666     if (seqlen == 0) {
5667         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5668         goto Done;
5669     }
5670     /* If singleton sequence with an exact Unicode, return that. */
5671     if (seqlen == 1) {
5672         item = PySequence_Fast_GET_ITEM(fseq, 0);
5673         if (PyUnicode_CheckExact(item)) {
5674             Py_INCREF(item);
5675             res = (PyUnicodeObject *)item;
5676             goto Done;
5677         }
5678     }
5679 
5680     /* At least two items to join, or one that isn't exact Unicode. */
5681     if (seqlen > 1) {
5682         /* Set up sep and seplen -- they're needed. */
5683         if (separator == NULL) {
5684             sep = &blank;
5685             seplen = 1;
5686         }
5687         else {
5688             internal_separator = PyUnicode_FromObject(separator);
5689             if (internal_separator == NULL)
5690                 goto onError;
5691             sep = PyUnicode_AS_UNICODE(internal_separator);
5692             seplen = PyUnicode_GET_SIZE(internal_separator);
5693             /* In case PyUnicode_FromObject() mutated seq. */
5694             seqlen = PySequence_Fast_GET_SIZE(fseq);
5695         }
5696     }
5697 
5698     /* Get space. */
5699     res = _PyUnicode_New(res_alloc);
5700     if (res == NULL)
5701         goto onError;
5702     res_p = PyUnicode_AS_UNICODE(res);
5703     res_used = 0;
5704 
5705     for (i = 0; i < seqlen; ++i) {
5706         Py_ssize_t itemlen;
5707         Py_ssize_t new_res_used;
5708 
5709         item = PySequence_Fast_GET_ITEM(fseq, i);
5710         /* Convert item to Unicode. */
5711         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5712             PyErr_Format(PyExc_TypeError,
5713                          "sequence item %zd: expected string or Unicode,"
5714                          " %.80s found",
5715                          i, Py_TYPE(item)->tp_name);
5716             goto onError;
5717         }
5718         item = PyUnicode_FromObject(item);
5719         if (item == NULL)
5720             goto onError;
5721         /* We own a reference to item from here on. */
5722 
5723         /* In case PyUnicode_FromObject() mutated seq. */
5724         seqlen = PySequence_Fast_GET_SIZE(fseq);
5725 
5726         /* Make sure we have enough space for the separator and the item. */
5727         itemlen = PyUnicode_GET_SIZE(item);
5728         new_res_used = res_used + itemlen;
5729         if (new_res_used < 0)
5730             goto Overflow;
5731         if (i < seqlen - 1) {
5732             new_res_used += seplen;
5733             if (new_res_used < 0)
5734                 goto Overflow;
5735         }
5736         if (new_res_used > res_alloc) {
5737             /* double allocated size until it's big enough */
5738             do {
5739                 res_alloc += res_alloc;
5740                 if (res_alloc <= 0)
5741                     goto Overflow;
5742             } while (new_res_used > res_alloc);
5743             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5744                 Py_DECREF(item);
5745                 goto onError;
5746             }
5747             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5748         }
5749 
5750         /* Copy item, and maybe the separator. */
5751         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5752         res_p += itemlen;
5753         if (i < seqlen - 1) {
5754             Py_UNICODE_COPY(res_p, sep, seplen);
5755             res_p += seplen;
5756         }
5757         Py_DECREF(item);
5758         res_used = new_res_used;
5759     }
5760 
5761     /* Shrink res to match the used area; this probably can't fail,
5762      * but it's cheap to check.
5763      */
5764     if (_PyUnicode_Resize(&res, res_used) < 0)
5765         goto onError;
5766 
5767   Done:
5768     Py_XDECREF(internal_separator);
5769     Py_DECREF(fseq);
5770     return (PyObject *)res;
5771 
5772   Overflow:
5773     PyErr_SetString(PyExc_OverflowError,
5774                     "join() result is too long for a Python string");
5775     Py_DECREF(item);
5776     /* fall through */
5777 
5778   onError:
5779     Py_XDECREF(internal_separator);
5780     Py_DECREF(fseq);
5781     Py_XDECREF(res);
5782     return NULL;
5783 }
5784 
5785 static
pad(PyUnicodeObject * self,Py_ssize_t left,Py_ssize_t right,Py_UNICODE fill)5786 PyUnicodeObject *pad(PyUnicodeObject *self,
5787                      Py_ssize_t left,
5788                      Py_ssize_t right,
5789                      Py_UNICODE fill)
5790 {
5791     PyUnicodeObject *u;
5792 
5793     if (left < 0)
5794         left = 0;
5795     if (right < 0)
5796         right = 0;
5797 
5798     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5799         Py_INCREF(self);
5800         return self;
5801     }
5802 
5803     if (left > PY_SSIZE_T_MAX - self->length ||
5804         right > PY_SSIZE_T_MAX - (left + self->length)) {
5805         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5806         return NULL;
5807     }
5808     u = _PyUnicode_New(left + self->length + right);
5809     if (u) {
5810         if (left)
5811             Py_UNICODE_FILL(u->str, fill, left);
5812         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5813         if (right)
5814             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5815     }
5816 
5817     return u;
5818 }
5819 
/* Split `string` into lines.

   `string` is coerced with PyUnicode_FromObject() and then passed, with
   its buffer, its size and `keepends`, to stringlib_splitlines().
   Returns that call's result, or NULL if the coercion fails. */
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *lines;
    PyObject *ustring = PyUnicode_FromObject(string);

    if (ustring == NULL)
        return NULL;

    lines = stringlib_splitlines(
        ustring, PyUnicode_AS_UNICODE(ustring),
        PyUnicode_GET_SIZE(ustring), keepends);

    Py_DECREF(ustring);
    return lines;
}
5835 
5836 static
split(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5837 PyObject *split(PyUnicodeObject *self,
5838                 PyUnicodeObject *substring,
5839                 Py_ssize_t maxcount)
5840 {
5841     if (maxcount < 0)
5842         maxcount = PY_SSIZE_T_MAX;
5843 
5844     if (substring == NULL)
5845         return stringlib_split_whitespace(
5846             (PyObject*) self,  self->str, self->length, maxcount
5847             );
5848 
5849     return stringlib_split(
5850         (PyObject*) self,  self->str, self->length,
5851         substring->str, substring->length,
5852         maxcount
5853         );
5854 }
5855 
5856 static
rsplit(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5857 PyObject *rsplit(PyUnicodeObject *self,
5858                  PyUnicodeObject *substring,
5859                  Py_ssize_t maxcount)
5860 {
5861     if (maxcount < 0)
5862         maxcount = PY_SSIZE_T_MAX;
5863 
5864     if (substring == NULL)
5865         return stringlib_rsplit_whitespace(
5866             (PyObject*) self,  self->str, self->length, maxcount
5867             );
5868 
5869     return stringlib_rsplit(
5870         (PyObject*) self,  self->str, self->length,
5871         substring->str, substring->length,
5872         maxcount
5873         );
5874 }
5875 
5876 static
replace(PyUnicodeObject * self,PyUnicodeObject * str1,PyUnicodeObject * str2,Py_ssize_t maxcount)5877 PyObject *replace(PyUnicodeObject *self,
5878                   PyUnicodeObject *str1,
5879                   PyUnicodeObject *str2,
5880                   Py_ssize_t maxcount)
5881 {
5882     PyUnicodeObject *u;
5883 
5884     if (maxcount < 0)
5885         maxcount = PY_SSIZE_T_MAX;
5886     else if (maxcount == 0 || self->length == 0)
5887         goto nothing;
5888 
5889     if (str1->length == str2->length) {
5890         Py_ssize_t i;
5891         /* same length */
5892         if (str1->length == 0)
5893             goto nothing;
5894         if (str1->length == 1) {
5895             /* replace characters */
5896             Py_UNICODE u1, u2;
5897             if (!findchar(self->str, self->length, str1->str[0]))
5898                 goto nothing;
5899             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5900             if (!u)
5901                 return NULL;
5902             Py_UNICODE_COPY(u->str, self->str, self->length);
5903             u1 = str1->str[0];
5904             u2 = str2->str[0];
5905             for (i = 0; i < u->length; i++)
5906                 if (u->str[i] == u1) {
5907                     if (--maxcount < 0)
5908                         break;
5909                     u->str[i] = u2;
5910                 }
5911         } else {
5912             i = stringlib_find(
5913                 self->str, self->length, str1->str, str1->length, 0
5914                 );
5915             if (i < 0)
5916                 goto nothing;
5917             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5918             if (!u)
5919                 return NULL;
5920             Py_UNICODE_COPY(u->str, self->str, self->length);
5921 
5922             /* change everything in-place, starting with this one */
5923             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5924             i += str1->length;
5925 
5926             while ( --maxcount > 0) {
5927                 i = stringlib_find(self->str+i, self->length-i,
5928                                    str1->str, str1->length,
5929                                    i);
5930                 if (i == -1)
5931                     break;
5932                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5933                 i += str1->length;
5934             }
5935         }
5936     } else {
5937 
5938         Py_ssize_t n, i, j;
5939         Py_ssize_t product, new_size, delta;
5940         Py_UNICODE *p;
5941 
5942         /* replace strings */
5943         n = stringlib_count(self->str, self->length, str1->str, str1->length,
5944                             maxcount);
5945         if (n == 0)
5946             goto nothing;
5947         /* new_size = self->length + n * (str2->length - str1->length)); */
5948         delta = (str2->length - str1->length);
5949         if (delta == 0) {
5950             new_size = self->length;
5951         } else {
5952             product = n * (str2->length - str1->length);
5953             if ((product / (str2->length - str1->length)) != n) {
5954                 PyErr_SetString(PyExc_OverflowError,
5955                                 "replace string is too long");
5956                 return NULL;
5957             }
5958             new_size = self->length + product;
5959             if (new_size < 0) {
5960                 PyErr_SetString(PyExc_OverflowError,
5961                                 "replace string is too long");
5962                 return NULL;
5963             }
5964         }
5965         u = _PyUnicode_New(new_size);
5966         if (!u)
5967             return NULL;
5968         i = 0;
5969         p = u->str;
5970         if (str1->length > 0) {
5971             while (n-- > 0) {
5972                 /* look for next match */
5973                 j = stringlib_find(self->str+i, self->length-i,
5974                                    str1->str, str1->length,
5975                                    i);
5976                 if (j == -1)
5977                     break;
5978                 else if (j > i) {
5979                     /* copy unchanged part [i:j] */
5980                     Py_UNICODE_COPY(p, self->str+i, j-i);
5981                     p += j - i;
5982                 }
5983                 /* copy substitution string */
5984                 if (str2->length > 0) {
5985                     Py_UNICODE_COPY(p, str2->str, str2->length);
5986                     p += str2->length;
5987                 }
5988                 i = j + str1->length;
5989             }
5990             if (i < self->length)
5991                 /* copy tail [i:] */
5992                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5993         } else {
5994             /* interleave */
5995             while (n > 0) {
5996                 Py_UNICODE_COPY(p, str2->str, str2->length);
5997                 p += str2->length;
5998                 if (--n <= 0)
5999                     break;
6000                 *p++ = self->str[i++];
6001             }
6002             Py_UNICODE_COPY(p, self->str+i, self->length-i);
6003         }
6004     }
6005     return (PyObject *) u;
6006 
6007   nothing:
6008     /* nothing to replace; return original string (when possible) */
6009     if (PyUnicode_CheckExact(self)) {
6010         Py_INCREF(self);
6011         return (PyObject *) self;
6012     }
6013     return PyUnicode_FromUnicode(self->str, self->length);
6014 }
6015 
6016 /* --- Unicode Object Methods --------------------------------------------- */
6017 
6018 PyDoc_STRVAR(title__doc__,
6019              "S.title() -> unicode\n\
6020 \n\
6021 Return a titlecased version of S, i.e. words start with title case\n\
6022 characters, all remaining cased characters have lower case.");
6023 
6024 static PyObject*
unicode_title(PyUnicodeObject * self)6025 unicode_title(PyUnicodeObject *self)
6026 {
6027     return fixup(self, fixtitle);
6028 }
6029 
6030 PyDoc_STRVAR(capitalize__doc__,
6031              "S.capitalize() -> unicode\n\
6032 \n\
6033 Return a capitalized version of S, i.e. make the first character\n\
6034 have upper case and the rest lower case.");
6035 
6036 static PyObject*
unicode_capitalize(PyUnicodeObject * self)6037 unicode_capitalize(PyUnicodeObject *self)
6038 {
6039     return fixup(self, fixcapitalize);
6040 }
6041 
6042 #if 0
6043 PyDoc_STRVAR(capwords__doc__,
6044              "S.capwords() -> unicode\n\
6045 \n\
6046 Apply .capitalize() to all words in S and return the result with\n\
6047 normalized whitespace (all whitespace strings are replaced by ' ').");
6048 
6049 static PyObject*
6050 unicode_capwords(PyUnicodeObject *self)
6051 {
6052     PyObject *list;
6053     PyObject *item;
6054     Py_ssize_t i;
6055 
6056     /* Split into words */
6057     list = split(self, NULL, -1);
6058     if (!list)
6059         return NULL;
6060 
6061     /* Capitalize each word */
6062     for (i = 0; i < PyList_GET_SIZE(list); i++) {
6063         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
6064                      fixcapitalize);
6065         if (item == NULL)
6066             goto onError;
6067         Py_DECREF(PyList_GET_ITEM(list, i));
6068         PyList_SET_ITEM(list, i, item);
6069     }
6070 
6071     /* Join the words to form a new string */
6072     item = PyUnicode_Join(NULL, list);
6073 
6074   onError:
6075     Py_DECREF(list);
6076     return (PyObject *)item;
6077 }
6078 #endif
6079 
6080 /* Argument converter.  Coerces to a single unicode character */
6081 
6082 static int
convert_uc(PyObject * obj,void * addr)6083 convert_uc(PyObject *obj, void *addr)
6084 {
6085     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6086     PyObject *uniobj;
6087     Py_UNICODE *unistr;
6088 
6089     uniobj = PyUnicode_FromObject(obj);
6090     if (uniobj == NULL) {
6091         PyErr_SetString(PyExc_TypeError,
6092                         "The fill character cannot be converted to Unicode");
6093         return 0;
6094     }
6095     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6096         PyErr_SetString(PyExc_TypeError,
6097                         "The fill character must be exactly one character long");
6098         Py_DECREF(uniobj);
6099         return 0;
6100     }
6101     unistr = PyUnicode_AS_UNICODE(uniobj);
6102     *fillcharloc = unistr[0];
6103     Py_DECREF(uniobj);
6104     return 1;
6105 }
6106 
6107 PyDoc_STRVAR(center__doc__,
6108              "S.center(width[, fillchar]) -> unicode\n\
6109 \n\
6110 Return S centered in a Unicode string of length width. Padding is\n\
6111 done using the specified fill character (default is a space)");
6112 
6113 static PyObject *
unicode_center(PyUnicodeObject * self,PyObject * args)6114 unicode_center(PyUnicodeObject *self, PyObject *args)
6115 {
6116     Py_ssize_t marg, left;
6117     Py_ssize_t width;
6118     Py_UNICODE fillchar = ' ';
6119 
6120     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6121         return NULL;
6122 
6123     if (self->length >= width && PyUnicode_CheckExact(self)) {
6124         Py_INCREF(self);
6125         return (PyObject*) self;
6126     }
6127 
6128     marg = width - self->length;
6129     left = marg / 2 + (marg & width & 1);
6130 
6131     return (PyObject*) pad(self, left, marg - left, fillchar);
6132 }
6133 
6134 #if 0
6135 
6136 /* This code should go into some future Unicode collation support
6137    module. The basic comparison should compare ordinals on a naive
6138    basis (this is what Java does and thus Jython too). */
6139 
6140 /* speedy UTF-16 code point order comparison */
6141 /* gleaned from: */
6142 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6143 
6144 static short utf16Fixup[32] =
6145 {
6146     0, 0, 0, 0, 0, 0, 0, 0,
6147     0, 0, 0, 0, 0, 0, 0, 0,
6148     0, 0, 0, 0, 0, 0, 0, 0,
6149     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6150 };
6151 
6152 static int
6153 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6154 {
6155     Py_ssize_t len1, len2;
6156 
6157     Py_UNICODE *s1 = str1->str;
6158     Py_UNICODE *s2 = str2->str;
6159 
6160     len1 = str1->length;
6161     len2 = str2->length;
6162 
6163     while (len1 > 0 && len2 > 0) {
6164         Py_UNICODE c1, c2;
6165 
6166         c1 = *s1++;
6167         c2 = *s2++;
6168 
6169         if (c1 > (1<<11) * 26)
6170             c1 += utf16Fixup[c1>>11];
6171         if (c2 > (1<<11) * 26)
6172             c2 += utf16Fixup[c2>>11];
6173         /* now c1 and c2 are in UTF-32-compatible order */
6174 
6175         if (c1 != c2)
6176             return (c1 < c2) ? -1 : 1;
6177 
6178         len1--; len2--;
6179     }
6180 
6181     return (len1 < len2) ? -1 : (len1 != len2);
6182 }
6183 
6184 #else
6185 
6186 static int
unicode_compare(PyUnicodeObject * str1,PyUnicodeObject * str2)6187 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6188 {
6189     register Py_ssize_t len1, len2;
6190 
6191     Py_UNICODE *s1 = str1->str;
6192     Py_UNICODE *s2 = str2->str;
6193 
6194     len1 = str1->length;
6195     len2 = str2->length;
6196 
6197     while (len1 > 0 && len2 > 0) {
6198         Py_UNICODE c1, c2;
6199 
6200         c1 = *s1++;
6201         c2 = *s2++;
6202 
6203         if (c1 != c2)
6204             return (c1 < c2) ? -1 : 1;
6205 
6206         len1--; len2--;
6207     }
6208 
6209     return (len1 < len2) ? -1 : (len1 != len2);
6210 }
6211 
6212 #endif
6213 
/* Coerces both arguments with PyUnicode_FromObject() and compares the
   results with unicode_compare().  Returns -1 if either coercion fails;
   when both coercions yield the same object the result is 0 without
   comparing contents. */
int PyUnicode_Compare(PyObject *left,
                      PyObject *right)
{
    PyUnicodeObject *lhs, *rhs;
    int outcome;

    lhs = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (lhs == NULL)
        return -1;

    rhs = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (rhs == NULL) {
        Py_DECREF(lhs);
        return -1;
    }

    /* Shortcut for empty or interned objects */
    outcome = (lhs == rhs) ? 0 : unicode_compare(lhs, rhs);

    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return outcome;
}
6246 
/* Rich comparison built on PyUnicode_Compare().

   On success the three-way result is mapped to a bool according to op.
   On failure:
     - a TypeError is cleared and Py_NotImplemented is returned, so the
       other operand gets a chance to handle the comparison;
     - for Py_EQ / Py_NE a UnicodeDecodeError is cleared and replaced by
       a UnicodeWarning, and the operands are treated as unequal;
     - any other error is propagated by returning NULL. */
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    const char *message;
    int cmp = PyUnicode_Compare(left, right);

    if (!(cmp == -1 && PyErr_Occurred())) {
        /* Convert the return value to a Boolean.  An op value not
           listed here leaves the raw comparison result in place. */
        int truth = cmp;

        switch (op) {
        case Py_LT:
            truth = (cmp == -1);
            break;
        case Py_LE:
            truth = (cmp <= 0);
            break;
        case Py_EQ:
            truth = (cmp == 0);
            break;
        case Py_NE:
            truth = (cmp != 0);
            break;
        case Py_GT:
            truth = (cmp == 1);
            break;
        case Py_GE:
            truth = (cmp >= 0);
            break;
        }
        return PyBool_FromLong(truth);
    }

    /* Type errors mean that PyUnicode_FromObject() could not convert
       one of the arguments (usually the right hand side) to Unicode,
       so this type cannot handle the request; let the other object
       try. */
    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
        PyErr_Clear();
        Py_INCREF(Py_NotImplemented);
        return Py_NotImplemented;
    }

    /* Only equality tests may silence a UnicodeDecodeError; every other
       combination propagates the pending exception. */
    if ((op != Py_EQ && op != Py_NE) ||
        !PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
        return NULL;

    PyErr_Clear();
    if (op == Py_EQ)
        message = "Unicode equal comparison "
                  "failed to convert both arguments to Unicode - "
                  "interpreting them as being unequal";
    else
        message = "Unicode unequal comparison "
                  "failed to convert both arguments to Unicode - "
                  "interpreting them as being unequal";
    if (PyErr_Warn(PyExc_UnicodeWarning, message) < 0)
        return NULL;

    return PyBool_FromLong(op == Py_NE);
}
6322 
/* Coerces element first and container second with
   PyUnicode_FromObject(), then returns the result of
   stringlib_contains_obj() on the coerced pair.  Returns -1 if either
   coercion fails. */
int PyUnicode_Contains(PyObject *container,
                       PyObject *element)
{
    PyObject *needle, *haystack;
    int found = -1;

    needle = PyUnicode_FromObject(element);
    if (needle == NULL)
        return -1;

    haystack = PyUnicode_FromObject(container);
    if (haystack != NULL) {
        found = stringlib_contains_obj(haystack, needle);
        Py_DECREF(haystack);
    }

    Py_DECREF(needle);
    return found;
}
6348 
6349 /* Concat to string or Unicode object giving a new Unicode object. */
6350 
PyUnicode_Concat(PyObject * left,PyObject * right)6351 PyObject *PyUnicode_Concat(PyObject *left,
6352                            PyObject *right)
6353 {
6354     PyUnicodeObject *u = NULL, *v = NULL, *w;
6355 
6356     /* Coerce the two arguments */
6357     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6358     if (u == NULL)
6359         goto onError;
6360     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6361     if (v == NULL)
6362         goto onError;
6363 
6364     /* Shortcuts */
6365     if (v == unicode_empty) {
6366         Py_DECREF(v);
6367         return (PyObject *)u;
6368     }
6369     if (u == unicode_empty) {
6370         Py_DECREF(u);
6371         return (PyObject *)v;
6372     }
6373 
6374     /* Concat the two Unicode strings */
6375     w = _PyUnicode_New(u->length + v->length);
6376     if (w == NULL)
6377         goto onError;
6378     Py_UNICODE_COPY(w->str, u->str, u->length);
6379     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6380 
6381     Py_DECREF(u);
6382     Py_DECREF(v);
6383     return (PyObject *)w;
6384 
6385   onError:
6386     Py_XDECREF(u);
6387     Py_XDECREF(v);
6388     return NULL;
6389 }
6390 
6391 PyDoc_STRVAR(count__doc__,
6392              "S.count(sub[, start[, end]]) -> int\n\
6393 \n\
6394 Return the number of non-overlapping occurrences of substring sub in\n\
6395 Unicode string S[start:end].  Optional arguments start and end are\n\
6396 interpreted as in slice notation.");
6397 
6398 static PyObject *
unicode_count(PyUnicodeObject * self,PyObject * args)6399 unicode_count(PyUnicodeObject *self, PyObject *args)
6400 {
6401     PyUnicodeObject *substring;
6402     Py_ssize_t start = 0;
6403     Py_ssize_t end = PY_SSIZE_T_MAX;
6404     PyObject *result;
6405 
6406     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6407                                             &start, &end))
6408         return NULL;
6409 
6410     ADJUST_INDICES(start, end, self->length);
6411     result = PyInt_FromSsize_t(
6412         stringlib_count(self->str + start, end - start,
6413                         substring->str, substring->length,
6414                         PY_SSIZE_T_MAX)
6415         );
6416 
6417     Py_DECREF(substring);
6418 
6419     return result;
6420 }
6421 
6422 PyDoc_STRVAR(encode__doc__,
6423              "S.encode([encoding[,errors]]) -> string or unicode\n\
6424 \n\
6425 Encodes S using the codec registered for encoding. encoding defaults\n\
6426 to the default encoding. errors may be given to set a different error\n\
6427 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6428 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6429 'xmlcharrefreplace' as well as any other name registered with\n\
6430 codecs.register_error that can handle UnicodeEncodeErrors.");
6431 
6432 static PyObject *
unicode_encode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6433 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6434 {
6435     static char *kwlist[] = {"encoding", "errors", 0};
6436     char *encoding = NULL;
6437     char *errors = NULL;
6438     PyObject *v;
6439 
6440     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6441                                      kwlist, &encoding, &errors))
6442         return NULL;
6443     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6444     if (v == NULL)
6445         goto onError;
6446     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6447         PyErr_Format(PyExc_TypeError,
6448                      "encoder did not return a string/unicode object "
6449                      "(type=%.400s)",
6450                      Py_TYPE(v)->tp_name);
6451         Py_DECREF(v);
6452         return NULL;
6453     }
6454     return v;
6455 
6456   onError:
6457     return NULL;
6458 }
6459 
6460 PyDoc_STRVAR(decode__doc__,
6461              "S.decode([encoding[,errors]]) -> string or unicode\n\
6462 \n\
6463 Decodes S using the codec registered for encoding. encoding defaults\n\
6464 to the default encoding. errors may be given to set a different error\n\
6465 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6466 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6467 as well as any other name registered with codecs.register_error that is\n\
6468 able to handle UnicodeDecodeErrors.");
6469 
6470 static PyObject *
unicode_decode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6471 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6472 {
6473     static char *kwlist[] = {"encoding", "errors", 0};
6474     char *encoding = NULL;
6475     char *errors = NULL;
6476     PyObject *v;
6477 
6478     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6479                                      kwlist, &encoding, &errors))
6480         return NULL;
6481     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6482     if (v == NULL)
6483         goto onError;
6484     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6485         PyErr_Format(PyExc_TypeError,
6486                      "decoder did not return a string/unicode object "
6487                      "(type=%.400s)",
6488                      Py_TYPE(v)->tp_name);
6489         Py_DECREF(v);
6490         return NULL;
6491     }
6492     return v;
6493 
6494   onError:
6495     return NULL;
6496 }
6497 
6498 PyDoc_STRVAR(expandtabs__doc__,
6499              "S.expandtabs([tabsize]) -> unicode\n\
6500 \n\
6501 Return a copy of S where all tab characters are expanded using spaces.\n\
6502 If tabsize is not given, a tab size of 8 characters is assumed.");
6503 
6504 static PyObject*
unicode_expandtabs(PyUnicodeObject * self,PyObject * args)6505 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6506 {
6507     Py_UNICODE *e;
6508     Py_UNICODE *p;
6509     Py_UNICODE *q;
6510     Py_UNICODE *qe;
6511     Py_ssize_t i, j, incr;
6512     PyUnicodeObject *u;
6513     int tabsize = 8;
6514 
6515     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6516         return NULL;
6517 
6518     /* First pass: determine size of output string */
6519     i = 0; /* chars up to and including most recent \n or \r */
6520     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6521     e = self->str + self->length; /* end of input */
6522     for (p = self->str; p < e; p++)
6523         if (*p == '\t') {
6524             if (tabsize > 0) {
6525                 incr = tabsize - (j % tabsize); /* cannot overflow */
6526                 if (j > PY_SSIZE_T_MAX - incr)
6527                     goto overflow1;
6528                 j += incr;
6529             }
6530         }
6531         else {
6532             if (j > PY_SSIZE_T_MAX - 1)
6533                 goto overflow1;
6534             j++;
6535             if (*p == '\n' || *p == '\r') {
6536                 if (i > PY_SSIZE_T_MAX - j)
6537                     goto overflow1;
6538                 i += j;
6539                 j = 0;
6540             }
6541         }
6542 
6543     if (i > PY_SSIZE_T_MAX - j)
6544         goto overflow1;
6545 
6546     /* Second pass: create output string and fill it */
6547     u = _PyUnicode_New(i + j);
6548     if (!u)
6549         return NULL;
6550 
6551     j = 0; /* same as in first pass */
6552     q = u->str; /* next output char */
6553     qe = u->str + u->length; /* end of output */
6554 
6555     for (p = self->str; p < e; p++)
6556         if (*p == '\t') {
6557             if (tabsize > 0) {
6558                 i = tabsize - (j % tabsize);
6559                 j += i;
6560                 while (i--) {
6561                     if (q >= qe)
6562                         goto overflow2;
6563                     *q++ = ' ';
6564                 }
6565             }
6566         }
6567         else {
6568             if (q >= qe)
6569                 goto overflow2;
6570             *q++ = *p;
6571             j++;
6572             if (*p == '\n' || *p == '\r')
6573                 j = 0;
6574         }
6575 
6576     return (PyObject*) u;
6577 
6578   overflow2:
6579     Py_DECREF(u);
6580   overflow1:
6581     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6582     return NULL;
6583 }
6584 
6585 PyDoc_STRVAR(find__doc__,
6586              "S.find(sub [,start [,end]]) -> int\n\
6587 \n\
6588 Return the lowest index in S where substring sub is found,\n\
6589 such that sub is contained within S[start:end].  Optional\n\
6590 arguments start and end are interpreted as in slice notation.\n\
6591 \n\
6592 Return -1 on failure.");
6593 
6594 static PyObject *
unicode_find(PyUnicodeObject * self,PyObject * args)6595 unicode_find(PyUnicodeObject *self, PyObject *args)
6596 {
6597     PyUnicodeObject *substring;
6598     Py_ssize_t start;
6599     Py_ssize_t end;
6600     Py_ssize_t result;
6601 
6602     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6603                                             &start, &end))
6604         return NULL;
6605 
6606     result = stringlib_find_slice(
6607         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6608         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6609         start, end
6610         );
6611 
6612     Py_DECREF(substring);
6613 
6614     return PyInt_FromSsize_t(result);
6615 }
6616 
6617 static PyObject *
unicode_getitem(PyUnicodeObject * self,Py_ssize_t index)6618 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6619 {
6620     if (index < 0 || index >= self->length) {
6621         PyErr_SetString(PyExc_IndexError, "string index out of range");
6622         return NULL;
6623     }
6624 
6625     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6626 }
6627 
6628 static long
unicode_hash(PyUnicodeObject * self)6629 unicode_hash(PyUnicodeObject *self)
6630 {
6631     /* Since Unicode objects compare equal to their ASCII string
6632        counterparts, they should use the individual character values
6633        as basis for their hash value.  This is needed to assure that
6634        strings and Unicode objects behave in the same way as
6635        dictionary keys. */
6636 
6637     register Py_ssize_t len;
6638     register Py_UNICODE *p;
6639     register long x;
6640 
6641 #ifdef Py_DEBUG
6642     assert(_Py_HashSecret_Initialized);
6643 #endif
6644     if (self->hash != -1)
6645         return self->hash;
6646     len = PyUnicode_GET_SIZE(self);
6647     /*
6648       We make the hash of the empty string be 0, rather than using
6649       (prefix ^ suffix), since this slightly obfuscates the hash secret
6650     */
6651     if (len == 0) {
6652         self->hash = 0;
6653         return 0;
6654     }
6655     p = PyUnicode_AS_UNICODE(self);
6656     x = _Py_HashSecret.prefix;
6657     x ^= *p << 7;
6658     while (--len >= 0)
6659         x = (1000003*x) ^ *p++;
6660     x ^= PyUnicode_GET_SIZE(self);
6661     x ^= _Py_HashSecret.suffix;
6662     if (x == -1)
6663         x = -2;
6664     self->hash = x;
6665     return x;
6666 }
6667 
6668 PyDoc_STRVAR(index__doc__,
6669              "S.index(sub [,start [,end]]) -> int\n\
6670 \n\
6671 Like S.find() but raise ValueError when the substring is not found.");
6672 
6673 static PyObject *
unicode_index(PyUnicodeObject * self,PyObject * args)6674 unicode_index(PyUnicodeObject *self, PyObject *args)
6675 {
6676     Py_ssize_t result;
6677     PyUnicodeObject *substring;
6678     Py_ssize_t start;
6679     Py_ssize_t end;
6680 
6681     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6682                                             &start, &end))
6683         return NULL;
6684 
6685     result = stringlib_find_slice(
6686         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6687         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6688         start, end
6689         );
6690 
6691     Py_DECREF(substring);
6692 
6693     if (result < 0) {
6694         PyErr_SetString(PyExc_ValueError, "substring not found");
6695         return NULL;
6696     }
6697 
6698     return PyInt_FromSsize_t(result);
6699 }
6700 
6701 PyDoc_STRVAR(islower__doc__,
6702              "S.islower() -> bool\n\
6703 \n\
6704 Return True if all cased characters in S are lowercase and there is\n\
6705 at least one cased character in S, False otherwise.");
6706 
6707 static PyObject*
unicode_islower(PyUnicodeObject * self)6708 unicode_islower(PyUnicodeObject *self)
6709 {
6710     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6711     register const Py_UNICODE *e;
6712     int cased;
6713 
6714     /* Shortcut for single character strings */
6715     if (PyUnicode_GET_SIZE(self) == 1)
6716         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6717 
6718     /* Special case for empty strings */
6719     if (PyUnicode_GET_SIZE(self) == 0)
6720         return PyBool_FromLong(0);
6721 
6722     e = p + PyUnicode_GET_SIZE(self);
6723     cased = 0;
6724     for (; p < e; p++) {
6725         register const Py_UNICODE ch = *p;
6726 
6727         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6728             return PyBool_FromLong(0);
6729         else if (!cased && Py_UNICODE_ISLOWER(ch))
6730             cased = 1;
6731     }
6732     return PyBool_FromLong(cased);
6733 }
6734 
6735 PyDoc_STRVAR(isupper__doc__,
6736              "S.isupper() -> bool\n\
6737 \n\
6738 Return True if all cased characters in S are uppercase and there is\n\
6739 at least one cased character in S, False otherwise.");
6740 
6741 static PyObject*
unicode_isupper(PyUnicodeObject * self)6742 unicode_isupper(PyUnicodeObject *self)
6743 {
6744     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6745     register const Py_UNICODE *e;
6746     int cased;
6747 
6748     /* Shortcut for single character strings */
6749     if (PyUnicode_GET_SIZE(self) == 1)
6750         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6751 
6752     /* Special case for empty strings */
6753     if (PyUnicode_GET_SIZE(self) == 0)
6754         return PyBool_FromLong(0);
6755 
6756     e = p + PyUnicode_GET_SIZE(self);
6757     cased = 0;
6758     for (; p < e; p++) {
6759         register const Py_UNICODE ch = *p;
6760 
6761         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6762             return PyBool_FromLong(0);
6763         else if (!cased && Py_UNICODE_ISUPPER(ch))
6764             cased = 1;
6765     }
6766     return PyBool_FromLong(cased);
6767 }
6768 
6769 PyDoc_STRVAR(istitle__doc__,
6770              "S.istitle() -> bool\n\
6771 \n\
6772 Return True if S is a titlecased string and there is at least one\n\
6773 character in S, i.e. upper- and titlecase characters may only\n\
6774 follow uncased characters and lowercase characters only cased ones.\n\
6775 Return False otherwise.");
6776 
6777 static PyObject*
unicode_istitle(PyUnicodeObject * self)6778 unicode_istitle(PyUnicodeObject *self)
6779 {
6780     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6781     register const Py_UNICODE *e;
6782     int cased, previous_is_cased;
6783 
6784     /* Shortcut for single character strings */
6785     if (PyUnicode_GET_SIZE(self) == 1)
6786         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6787                                (Py_UNICODE_ISUPPER(*p) != 0));
6788 
6789     /* Special case for empty strings */
6790     if (PyUnicode_GET_SIZE(self) == 0)
6791         return PyBool_FromLong(0);
6792 
6793     e = p + PyUnicode_GET_SIZE(self);
6794     cased = 0;
6795     previous_is_cased = 0;
6796     for (; p < e; p++) {
6797         register const Py_UNICODE ch = *p;
6798 
6799         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6800             if (previous_is_cased)
6801                 return PyBool_FromLong(0);
6802             previous_is_cased = 1;
6803             cased = 1;
6804         }
6805         else if (Py_UNICODE_ISLOWER(ch)) {
6806             if (!previous_is_cased)
6807                 return PyBool_FromLong(0);
6808             previous_is_cased = 1;
6809             cased = 1;
6810         }
6811         else
6812             previous_is_cased = 0;
6813     }
6814     return PyBool_FromLong(cased);
6815 }
6816 
6817 PyDoc_STRVAR(isspace__doc__,
6818              "S.isspace() -> bool\n\
6819 \n\
6820 Return True if all characters in S are whitespace\n\
6821 and there is at least one character in S, False otherwise.");
6822 
6823 static PyObject*
unicode_isspace(PyUnicodeObject * self)6824 unicode_isspace(PyUnicodeObject *self)
6825 {
6826     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6827     register const Py_UNICODE *e;
6828 
6829     /* Shortcut for single character strings */
6830     if (PyUnicode_GET_SIZE(self) == 1 &&
6831         Py_UNICODE_ISSPACE(*p))
6832         return PyBool_FromLong(1);
6833 
6834     /* Special case for empty strings */
6835     if (PyUnicode_GET_SIZE(self) == 0)
6836         return PyBool_FromLong(0);
6837 
6838     e = p + PyUnicode_GET_SIZE(self);
6839     for (; p < e; p++) {
6840         if (!Py_UNICODE_ISSPACE(*p))
6841             return PyBool_FromLong(0);
6842     }
6843     return PyBool_FromLong(1);
6844 }
6845 
6846 PyDoc_STRVAR(isalpha__doc__,
6847              "S.isalpha() -> bool\n\
6848 \n\
6849 Return True if all characters in S are alphabetic\n\
6850 and there is at least one character in S, False otherwise.");
6851 
6852 static PyObject*
unicode_isalpha(PyUnicodeObject * self)6853 unicode_isalpha(PyUnicodeObject *self)
6854 {
6855     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856     register const Py_UNICODE *e;
6857 
6858     /* Shortcut for single character strings */
6859     if (PyUnicode_GET_SIZE(self) == 1 &&
6860         Py_UNICODE_ISALPHA(*p))
6861         return PyBool_FromLong(1);
6862 
6863     /* Special case for empty strings */
6864     if (PyUnicode_GET_SIZE(self) == 0)
6865         return PyBool_FromLong(0);
6866 
6867     e = p + PyUnicode_GET_SIZE(self);
6868     for (; p < e; p++) {
6869         if (!Py_UNICODE_ISALPHA(*p))
6870             return PyBool_FromLong(0);
6871     }
6872     return PyBool_FromLong(1);
6873 }
6874 
6875 PyDoc_STRVAR(isalnum__doc__,
6876              "S.isalnum() -> bool\n\
6877 \n\
6878 Return True if all characters in S are alphanumeric\n\
6879 and there is at least one character in S, False otherwise.");
6880 
6881 static PyObject*
unicode_isalnum(PyUnicodeObject * self)6882 unicode_isalnum(PyUnicodeObject *self)
6883 {
6884     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885     register const Py_UNICODE *e;
6886 
6887     /* Shortcut for single character strings */
6888     if (PyUnicode_GET_SIZE(self) == 1 &&
6889         Py_UNICODE_ISALNUM(*p))
6890         return PyBool_FromLong(1);
6891 
6892     /* Special case for empty strings */
6893     if (PyUnicode_GET_SIZE(self) == 0)
6894         return PyBool_FromLong(0);
6895 
6896     e = p + PyUnicode_GET_SIZE(self);
6897     for (; p < e; p++) {
6898         if (!Py_UNICODE_ISALNUM(*p))
6899             return PyBool_FromLong(0);
6900     }
6901     return PyBool_FromLong(1);
6902 }
6903 
6904 PyDoc_STRVAR(isdecimal__doc__,
6905              "S.isdecimal() -> bool\n\
6906 \n\
6907 Return True if there are only decimal characters in S,\n\
6908 False otherwise.");
6909 
6910 static PyObject*
unicode_isdecimal(PyUnicodeObject * self)6911 unicode_isdecimal(PyUnicodeObject *self)
6912 {
6913     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6914     register const Py_UNICODE *e;
6915 
6916     /* Shortcut for single character strings */
6917     if (PyUnicode_GET_SIZE(self) == 1 &&
6918         Py_UNICODE_ISDECIMAL(*p))
6919         return PyBool_FromLong(1);
6920 
6921     /* Special case for empty strings */
6922     if (PyUnicode_GET_SIZE(self) == 0)
6923         return PyBool_FromLong(0);
6924 
6925     e = p + PyUnicode_GET_SIZE(self);
6926     for (; p < e; p++) {
6927         if (!Py_UNICODE_ISDECIMAL(*p))
6928             return PyBool_FromLong(0);
6929     }
6930     return PyBool_FromLong(1);
6931 }
6932 
6933 PyDoc_STRVAR(isdigit__doc__,
6934              "S.isdigit() -> bool\n\
6935 \n\
6936 Return True if all characters in S are digits\n\
6937 and there is at least one character in S, False otherwise.");
6938 
6939 static PyObject*
unicode_isdigit(PyUnicodeObject * self)6940 unicode_isdigit(PyUnicodeObject *self)
6941 {
6942     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6943     register const Py_UNICODE *e;
6944 
6945     /* Shortcut for single character strings */
6946     if (PyUnicode_GET_SIZE(self) == 1 &&
6947         Py_UNICODE_ISDIGIT(*p))
6948         return PyBool_FromLong(1);
6949 
6950     /* Special case for empty strings */
6951     if (PyUnicode_GET_SIZE(self) == 0)
6952         return PyBool_FromLong(0);
6953 
6954     e = p + PyUnicode_GET_SIZE(self);
6955     for (; p < e; p++) {
6956         if (!Py_UNICODE_ISDIGIT(*p))
6957             return PyBool_FromLong(0);
6958     }
6959     return PyBool_FromLong(1);
6960 }
6961 
6962 PyDoc_STRVAR(isnumeric__doc__,
6963              "S.isnumeric() -> bool\n\
6964 \n\
6965 Return True if there are only numeric characters in S,\n\
6966 False otherwise.");
6967 
6968 static PyObject*
unicode_isnumeric(PyUnicodeObject * self)6969 unicode_isnumeric(PyUnicodeObject *self)
6970 {
6971     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6972     register const Py_UNICODE *e;
6973 
6974     /* Shortcut for single character strings */
6975     if (PyUnicode_GET_SIZE(self) == 1 &&
6976         Py_UNICODE_ISNUMERIC(*p))
6977         return PyBool_FromLong(1);
6978 
6979     /* Special case for empty strings */
6980     if (PyUnicode_GET_SIZE(self) == 0)
6981         return PyBool_FromLong(0);
6982 
6983     e = p + PyUnicode_GET_SIZE(self);
6984     for (; p < e; p++) {
6985         if (!Py_UNICODE_ISNUMERIC(*p))
6986             return PyBool_FromLong(0);
6987     }
6988     return PyBool_FromLong(1);
6989 }
6990 
6991 PyDoc_STRVAR(join__doc__,
6992              "S.join(iterable) -> unicode\n\
6993 \n\
6994 Return a string which is the concatenation of the strings in the\n\
6995 iterable.  The separator between elements is S.");
6996 
6997 static PyObject*
unicode_join(PyObject * self,PyObject * data)6998 unicode_join(PyObject *self, PyObject *data)
6999 {
7000     return PyUnicode_Join(self, data);
7001 }
7002 
7003 static Py_ssize_t
unicode_length(PyUnicodeObject * self)7004 unicode_length(PyUnicodeObject *self)
7005 {
7006     return self->length;
7007 }
7008 
7009 PyDoc_STRVAR(ljust__doc__,
7010              "S.ljust(width[, fillchar]) -> int\n\
7011 \n\
7012 Return S left-justified in a Unicode string of length width. Padding is\n\
7013 done using the specified fill character (default is a space).");
7014 
7015 static PyObject *
unicode_ljust(PyUnicodeObject * self,PyObject * args)7016 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7017 {
7018     Py_ssize_t width;
7019     Py_UNICODE fillchar = ' ';
7020 
7021     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7022         return NULL;
7023 
7024     if (self->length >= width && PyUnicode_CheckExact(self)) {
7025         Py_INCREF(self);
7026         return (PyObject*) self;
7027     }
7028 
7029     return (PyObject*) pad(self, 0, width - self->length, fillchar);
7030 }
7031 
7032 PyDoc_STRVAR(lower__doc__,
7033              "S.lower() -> unicode\n\
7034 \n\
7035 Return a copy of the string S converted to lowercase.");
7036 
7037 static PyObject*
unicode_lower(PyUnicodeObject * self)7038 unicode_lower(PyUnicodeObject *self)
7039 {
7040     return fixup(self, fixlower);
7041 }
7042 
/* Strip-type selectors.  The strip helpers below skip the leading scan
   when given RIGHTSTRIP and skip the trailing scan when given
   LEFTSTRIP; BOTHSTRIP runs both scans. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip type; the text after
   the ':' is the method name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the three-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
7051 
7052 /* externally visible for str.strip(unicode) */
7053 PyObject *
_PyUnicode_XStrip(PyUnicodeObject * self,int striptype,PyObject * sepobj)7054 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7055 {
7056     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7057     Py_ssize_t len = PyUnicode_GET_SIZE(self);
7058     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7059     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7060     Py_ssize_t i, j;
7061 
7062     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7063 
7064     i = 0;
7065     if (striptype != RIGHTSTRIP) {
7066         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7067             i++;
7068         }
7069     }
7070 
7071     j = len;
7072     if (striptype != LEFTSTRIP) {
7073         do {
7074             j--;
7075         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7076         j++;
7077     }
7078 
7079     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7080         Py_INCREF(self);
7081         return (PyObject*)self;
7082     }
7083     else
7084         return PyUnicode_FromUnicode(s+i, j-i);
7085 }
7086 
7087 
7088 static PyObject *
do_strip(PyUnicodeObject * self,int striptype)7089 do_strip(PyUnicodeObject *self, int striptype)
7090 {
7091     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7092     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7093 
7094     i = 0;
7095     if (striptype != RIGHTSTRIP) {
7096         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7097             i++;
7098         }
7099     }
7100 
7101     j = len;
7102     if (striptype != LEFTSTRIP) {
7103         do {
7104             j--;
7105         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7106         j++;
7107     }
7108 
7109     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7110         Py_INCREF(self);
7111         return (PyObject*)self;
7112     }
7113     else
7114         return PyUnicode_FromUnicode(s+i, j-i);
7115 }
7116 
7117 
7118 static PyObject *
do_argstrip(PyUnicodeObject * self,int striptype,PyObject * args)7119 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7120 {
7121     PyObject *sep = NULL;
7122 
7123     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7124         return NULL;
7125 
7126     if (sep != NULL && sep != Py_None) {
7127         if (PyUnicode_Check(sep))
7128             return _PyUnicode_XStrip(self, striptype, sep);
7129         else if (PyString_Check(sep)) {
7130             PyObject *res;
7131             sep = PyUnicode_FromObject(sep);
7132             if (sep==NULL)
7133                 return NULL;
7134             res = _PyUnicode_XStrip(self, striptype, sep);
7135             Py_DECREF(sep);
7136             return res;
7137         }
7138         else {
7139             PyErr_Format(PyExc_TypeError,
7140                          "%s arg must be None, unicode or str",
7141                          STRIPNAME(striptype));
7142             return NULL;
7143         }
7144     }
7145 
7146     return do_strip(self, striptype);
7147 }
7148 
7149 
7150 PyDoc_STRVAR(strip__doc__,
7151              "S.strip([chars]) -> unicode\n\
7152 \n\
7153 Return a copy of the string S with leading and trailing\n\
7154 whitespace removed.\n\
7155 If chars is given and not None, remove characters in chars instead.\n\
7156 If chars is a str, it will be converted to unicode before stripping");
7157 
7158 static PyObject *
unicode_strip(PyUnicodeObject * self,PyObject * args)7159 unicode_strip(PyUnicodeObject *self, PyObject *args)
7160 {
7161     if (PyTuple_GET_SIZE(args) == 0)
7162         return do_strip(self, BOTHSTRIP); /* Common case */
7163     else
7164         return do_argstrip(self, BOTHSTRIP, args);
7165 }
7166 
7167 
7168 PyDoc_STRVAR(lstrip__doc__,
7169              "S.lstrip([chars]) -> unicode\n\
7170 \n\
7171 Return a copy of the string S with leading whitespace removed.\n\
7172 If chars is given and not None, remove characters in chars instead.\n\
7173 If chars is a str, it will be converted to unicode before stripping");
7174 
7175 static PyObject *
unicode_lstrip(PyUnicodeObject * self,PyObject * args)7176 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7177 {
7178     if (PyTuple_GET_SIZE(args) == 0)
7179         return do_strip(self, LEFTSTRIP); /* Common case */
7180     else
7181         return do_argstrip(self, LEFTSTRIP, args);
7182 }
7183 
7184 
7185 PyDoc_STRVAR(rstrip__doc__,
7186              "S.rstrip([chars]) -> unicode\n\
7187 \n\
7188 Return a copy of the string S with trailing whitespace removed.\n\
7189 If chars is given and not None, remove characters in chars instead.\n\
7190 If chars is a str, it will be converted to unicode before stripping");
7191 
7192 static PyObject *
unicode_rstrip(PyUnicodeObject * self,PyObject * args)7193 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7194 {
7195     if (PyTuple_GET_SIZE(args) == 0)
7196         return do_strip(self, RIGHTSTRIP); /* Common case */
7197     else
7198         return do_argstrip(self, RIGHTSTRIP, args);
7199 }
7200 
7201 
7202 static PyObject*
unicode_repeat(PyUnicodeObject * str,Py_ssize_t len)7203 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7204 {
7205     PyUnicodeObject *u;
7206     Py_UNICODE *p;
7207     Py_ssize_t nchars;
7208     size_t nbytes;
7209 
7210     if (len < 0)
7211         len = 0;
7212 
7213     if (len == 1 && PyUnicode_CheckExact(str)) {
7214         /* no repeat, return original string */
7215         Py_INCREF(str);
7216         return (PyObject*) str;
7217     }
7218 
7219     /* ensure # of chars needed doesn't overflow int and # of bytes
7220      * needed doesn't overflow size_t
7221      */
7222     nchars = len * str->length;
7223     if (len && nchars / len != str->length) {
7224         PyErr_SetString(PyExc_OverflowError,
7225                         "repeated string is too long");
7226         return NULL;
7227     }
7228     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7229     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7230         PyErr_SetString(PyExc_OverflowError,
7231                         "repeated string is too long");
7232         return NULL;
7233     }
7234     u = _PyUnicode_New(nchars);
7235     if (!u)
7236         return NULL;
7237 
7238     p = u->str;
7239 
7240     if (str->length == 1 && len > 0) {
7241         Py_UNICODE_FILL(p, str->str[0], len);
7242     } else {
7243         Py_ssize_t done = 0; /* number of characters copied this far */
7244         if (done < nchars) {
7245             Py_UNICODE_COPY(p, str->str, str->length);
7246             done = str->length;
7247         }
7248         while (done < nchars) {
7249             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7250             Py_UNICODE_COPY(p+done, p, n);
7251             done += n;
7252         }
7253     }
7254 
7255     return (PyObject*) u;
7256 }
7257 
/* Public entry point for replacement on arbitrary objects.

   obj, subobj and replobj are each converted with
   PyUnicode_FromObject(); the converted objects and maxcount are then
   passed to replace().  Returns whatever replace() returns, or NULL
   when any conversion fails.  All temporaries are released before
   returning. */
PyObject *PyUnicode_Replace(PyObject *obj,
                            PyObject *subobj,
                            PyObject *replobj,
                            Py_ssize_t maxcount)
{
    PyObject *text;
    PyObject *old = NULL;
    PyObject *fresh = NULL;
    PyObject *result = NULL;

    text = PyUnicode_FromObject(obj);
    if (text == NULL)
        return NULL;

    old = PyUnicode_FromObject(subobj);
    if (old == NULL)
        goto cleanup;

    fresh = PyUnicode_FromObject(replobj);
    if (fresh == NULL)
        goto cleanup;

    result = replace((PyUnicodeObject *)text,
                     (PyUnicodeObject *)old,
                     (PyUnicodeObject *)fresh,
                     maxcount);

  cleanup:
    /* old and fresh may still be NULL on the error paths */
    Py_DECREF(text);
    Py_XDECREF(old);
    Py_XDECREF(fresh);
    return result;
}
7291 
7292 PyDoc_STRVAR(replace__doc__,
7293              "S.replace(old, new[, count]) -> unicode\n\
7294 \n\
7295 Return a copy of S with all occurrences of substring\n\
7296 old replaced by new.  If the optional argument count is\n\
7297 given, only the first count occurrences are replaced.");
7298 
7299 static PyObject*
unicode_replace(PyUnicodeObject * self,PyObject * args)7300 unicode_replace(PyUnicodeObject *self, PyObject *args)
7301 {
7302     PyUnicodeObject *str1;
7303     PyUnicodeObject *str2;
7304     Py_ssize_t maxcount = -1;
7305     PyObject *result;
7306 
7307     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7308         return NULL;
7309     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7310     if (str1 == NULL)
7311         return NULL;
7312     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7313     if (str2 == NULL) {
7314         Py_DECREF(str1);
7315         return NULL;
7316     }
7317 
7318     result = replace(self, str1, str2, maxcount);
7319 
7320     Py_DECREF(str1);
7321     Py_DECREF(str2);
7322     return result;
7323 }
7324 
7325 static
unicode_repr(PyObject * unicode)7326 PyObject *unicode_repr(PyObject *unicode)
7327 {
7328     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7329                                 PyUnicode_GET_SIZE(unicode),
7330                                 1);
7331 }
7332 
7333 PyDoc_STRVAR(rfind__doc__,
7334              "S.rfind(sub [,start [,end]]) -> int\n\
7335 \n\
7336 Return the highest index in S where substring sub is found,\n\
7337 such that sub is contained within S[start:end].  Optional\n\
7338 arguments start and end are interpreted as in slice notation.\n\
7339 \n\
7340 Return -1 on failure.");
7341 
7342 static PyObject *
unicode_rfind(PyUnicodeObject * self,PyObject * args)7343 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7344 {
7345     PyUnicodeObject *substring;
7346     Py_ssize_t start;
7347     Py_ssize_t end;
7348     Py_ssize_t result;
7349 
7350     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7351                                             &start, &end))
7352         return NULL;
7353 
7354     result = stringlib_rfind_slice(
7355         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7356         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7357         start, end
7358         );
7359 
7360     Py_DECREF(substring);
7361 
7362     return PyInt_FromSsize_t(result);
7363 }
7364 
7365 PyDoc_STRVAR(rindex__doc__,
7366              "S.rindex(sub [,start [,end]]) -> int\n\
7367 \n\
7368 Like S.rfind() but raise ValueError when the substring is not found.");
7369 
7370 static PyObject *
unicode_rindex(PyUnicodeObject * self,PyObject * args)7371 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7372 {
7373     PyUnicodeObject *substring;
7374     Py_ssize_t start;
7375     Py_ssize_t end;
7376     Py_ssize_t result;
7377 
7378     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7379                                             &start, &end))
7380         return NULL;
7381 
7382     result = stringlib_rfind_slice(
7383         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7384         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7385         start, end
7386         );
7387 
7388     Py_DECREF(substring);
7389 
7390     if (result < 0) {
7391         PyErr_SetString(PyExc_ValueError, "substring not found");
7392         return NULL;
7393     }
7394     return PyInt_FromSsize_t(result);
7395 }
7396 
7397 PyDoc_STRVAR(rjust__doc__,
7398              "S.rjust(width[, fillchar]) -> unicode\n\
7399 \n\
7400 Return S right-justified in a Unicode string of length width. Padding is\n\
7401 done using the specified fill character (default is a space).");
7402 
7403 static PyObject *
unicode_rjust(PyUnicodeObject * self,PyObject * args)7404 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7405 {
7406     Py_ssize_t width;
7407     Py_UNICODE fillchar = ' ';
7408 
7409     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7410         return NULL;
7411 
7412     if (self->length >= width && PyUnicode_CheckExact(self)) {
7413         Py_INCREF(self);
7414         return (PyObject*) self;
7415     }
7416 
7417     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7418 }
7419 
7420 static PyObject*
unicode_slice(PyUnicodeObject * self,Py_ssize_t start,Py_ssize_t end)7421 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7422 {
7423     /* standard clamping */
7424     if (start < 0)
7425         start = 0;
7426     if (end < 0)
7427         end = 0;
7428     if (end > self->length)
7429         end = self->length;
7430     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7431         /* full slice, return original string */
7432         Py_INCREF(self);
7433         return (PyObject*) self;
7434     }
7435     if (start > end)
7436         start = end;
7437     /* copy slice */
7438     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7439                                              end - start);
7440 }
7441 
/* Public entry point for splitting arbitrary objects.

   s is converted with PyUnicode_FromObject(); sep is converted the same
   way unless it is NULL, in which case NULL is passed on to split().
   Returns whatever split() returns, or NULL when a conversion fails.
   The converted temporaries are released before returning. */
PyObject *PyUnicode_Split(PyObject *s,
                          PyObject *sep,
                          Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *pieces;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    pieces = split((PyUnicodeObject *)text, (PyUnicodeObject *)delim,
                   maxsplit);

    Py_DECREF(text);
    /* delim stays NULL when no separator was supplied */
    Py_XDECREF(delim);
    return pieces;
}
7465 
7466 PyDoc_STRVAR(split__doc__,
7467              "S.split([sep [,maxsplit]]) -> list of strings\n\
7468 \n\
7469 Return a list of the words in S, using sep as the\n\
7470 delimiter string.  If maxsplit is given, at most maxsplit\n\
7471 splits are done. If sep is not specified or is None, any\n\
7472 whitespace string is a separator and empty strings are\n\
7473 removed from the result.");
7474 
7475 static PyObject*
unicode_split(PyUnicodeObject * self,PyObject * args)7476 unicode_split(PyUnicodeObject *self, PyObject *args)
7477 {
7478     PyObject *substring = Py_None;
7479     Py_ssize_t maxcount = -1;
7480 
7481     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7482         return NULL;
7483 
7484     if (substring == Py_None)
7485         return split(self, NULL, maxcount);
7486     else if (PyUnicode_Check(substring))
7487         return split(self, (PyUnicodeObject *)substring, maxcount);
7488     else
7489         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7490 }
7491 
7492 PyObject *
PyUnicode_Partition(PyObject * str_in,PyObject * sep_in)7493 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7494 {
7495     PyObject* str_obj;
7496     PyObject* sep_obj;
7497     PyObject* out;
7498 
7499     str_obj = PyUnicode_FromObject(str_in);
7500     if (!str_obj)
7501         return NULL;
7502     sep_obj = PyUnicode_FromObject(sep_in);
7503     if (!sep_obj) {
7504         Py_DECREF(str_obj);
7505         return NULL;
7506     }
7507 
7508     out = stringlib_partition(
7509         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7510         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7511         );
7512 
7513     Py_DECREF(sep_obj);
7514     Py_DECREF(str_obj);
7515 
7516     return out;
7517 }
7518 
7519 
7520 PyObject *
PyUnicode_RPartition(PyObject * str_in,PyObject * sep_in)7521 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7522 {
7523     PyObject* str_obj;
7524     PyObject* sep_obj;
7525     PyObject* out;
7526 
7527     str_obj = PyUnicode_FromObject(str_in);
7528     if (!str_obj)
7529         return NULL;
7530     sep_obj = PyUnicode_FromObject(sep_in);
7531     if (!sep_obj) {
7532         Py_DECREF(str_obj);
7533         return NULL;
7534     }
7535 
7536     out = stringlib_rpartition(
7537         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7538         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7539         );
7540 
7541     Py_DECREF(sep_obj);
7542     Py_DECREF(str_obj);
7543 
7544     return out;
7545 }
7546 
7547 PyDoc_STRVAR(partition__doc__,
7548              "S.partition(sep) -> (head, sep, tail)\n\
7549 \n\
7550 Search for the separator sep in S, and return the part before it,\n\
7551 the separator itself, and the part after it.  If the separator is not\n\
7552 found, return S and two empty strings.");
7553 
7554 static PyObject*
unicode_partition(PyUnicodeObject * self,PyObject * separator)7555 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7556 {
7557     return PyUnicode_Partition((PyObject *)self, separator);
7558 }
7559 
7560 PyDoc_STRVAR(rpartition__doc__,
7561              "S.rpartition(sep) -> (head, sep, tail)\n\
7562 \n\
7563 Search for the separator sep in S, starting at the end of S, and return\n\
7564 the part before it, the separator itself, and the part after it.  If the\n\
7565 separator is not found, return two empty strings and S.");
7566 
7567 static PyObject*
unicode_rpartition(PyUnicodeObject * self,PyObject * separator)7568 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7569 {
7570     return PyUnicode_RPartition((PyObject *)self, separator);
7571 }
7572 
/* Public entry point for right-splitting arbitrary objects.

   s is converted with PyUnicode_FromObject(); sep is converted the same
   way unless it is NULL, in which case NULL is passed on to rsplit().
   Returns whatever rsplit() returns, or NULL when a conversion fails.
   The converted temporaries are released before returning. */
PyObject *PyUnicode_RSplit(PyObject *s,
                           PyObject *sep,
                           Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *pieces;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    pieces = rsplit((PyUnicodeObject *)text, (PyUnicodeObject *)delim,
                    maxsplit);

    Py_DECREF(text);
    /* delim stays NULL when no separator was supplied */
    Py_XDECREF(delim);
    return pieces;
}
7596 
7597 PyDoc_STRVAR(rsplit__doc__,
7598              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7599 \n\
7600 Return a list of the words in S, using sep as the\n\
7601 delimiter string, starting at the end of the string and\n\
7602 working to the front.  If maxsplit is given, at most maxsplit\n\
7603 splits are done. If sep is not specified, any whitespace string\n\
7604 is a separator.");
7605 
7606 static PyObject*
unicode_rsplit(PyUnicodeObject * self,PyObject * args)7607 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7608 {
7609     PyObject *substring = Py_None;
7610     Py_ssize_t maxcount = -1;
7611 
7612     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7613         return NULL;
7614 
7615     if (substring == Py_None)
7616         return rsplit(self, NULL, maxcount);
7617     else if (PyUnicode_Check(substring))
7618         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7619     else
7620         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7621 }
7622 
7623 PyDoc_STRVAR(splitlines__doc__,
7624              "S.splitlines(keepends=False) -> list of strings\n\
7625 \n\
7626 Return a list of the lines in S, breaking at line boundaries.\n\
7627 Line breaks are not included in the resulting list unless keepends\n\
7628 is given and true.");
7629 
7630 static PyObject*
unicode_splitlines(PyUnicodeObject * self,PyObject * args)7631 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7632 {
7633     int keepends = 0;
7634 
7635     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7636         return NULL;
7637 
7638     return PyUnicode_Splitlines((PyObject *)self, keepends);
7639 }
7640 
7641 static
unicode_str(PyUnicodeObject * self)7642 PyObject *unicode_str(PyUnicodeObject *self)
7643 {
7644     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7645 }
7646 
7647 PyDoc_STRVAR(swapcase__doc__,
7648              "S.swapcase() -> unicode\n\
7649 \n\
7650 Return a copy of S with uppercase characters converted to lowercase\n\
7651 and vice versa.");
7652 
7653 static PyObject*
unicode_swapcase(PyUnicodeObject * self)7654 unicode_swapcase(PyUnicodeObject *self)
7655 {
7656     return fixup(self, fixswapcase);
7657 }
7658 
7659 PyDoc_STRVAR(translate__doc__,
7660              "S.translate(table) -> unicode\n\
7661 \n\
7662 Return a copy of the string S, where all characters have been mapped\n\
7663 through the given translation table, which must be a mapping of\n\
7664 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7665 Unmapped characters are left untouched. Characters mapped to None\n\
7666 are deleted.");
7667 
7668 static PyObject*
unicode_translate(PyUnicodeObject * self,PyObject * table)7669 unicode_translate(PyUnicodeObject *self, PyObject *table)
7670 {
7671     return PyUnicode_TranslateCharmap(self->str,
7672                                       self->length,
7673                                       table,
7674                                       "ignore");
7675 }
7676 
7677 PyDoc_STRVAR(upper__doc__,
7678              "S.upper() -> unicode\n\
7679 \n\
7680 Return a copy of S converted to uppercase.");
7681 
7682 static PyObject*
unicode_upper(PyUnicodeObject * self)7683 unicode_upper(PyUnicodeObject *self)
7684 {
7685     return fixup(self, fixupper);
7686 }
7687 
7688 PyDoc_STRVAR(zfill__doc__,
7689              "S.zfill(width) -> unicode\n\
7690 \n\
7691 Pad a numeric string S with zeros on the left, to fill a field\n\
7692 of the specified width. The string S is never truncated.");
7693 
7694 static PyObject *
unicode_zfill(PyUnicodeObject * self,PyObject * args)7695 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7696 {
7697     Py_ssize_t fill;
7698     PyUnicodeObject *u;
7699 
7700     Py_ssize_t width;
7701     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7702         return NULL;
7703 
7704     if (self->length >= width) {
7705         if (PyUnicode_CheckExact(self)) {
7706             Py_INCREF(self);
7707             return (PyObject*) self;
7708         }
7709         else
7710             return PyUnicode_FromUnicode(
7711                 PyUnicode_AS_UNICODE(self),
7712                 PyUnicode_GET_SIZE(self)
7713                 );
7714     }
7715 
7716     fill = width - self->length;
7717 
7718     u = pad(self, fill, 0, '0');
7719 
7720     if (u == NULL)
7721         return NULL;
7722 
7723     if (u->str[fill] == '+' || u->str[fill] == '-') {
7724         /* move sign to beginning of string */
7725         u->str[0] = u->str[fill];
7726         u->str[fill] = '0';
7727     }
7728 
7729     return (PyObject*) u;
7730 }
7731 
#if 0
/* Compiled out by the enclosing "#if 0".  If enabled, it would return
   the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7739 
7740 PyDoc_STRVAR(startswith__doc__,
7741              "S.startswith(prefix[, start[, end]]) -> bool\n\
7742 \n\
7743 Return True if S starts with the specified prefix, False otherwise.\n\
7744 With optional start, test S beginning at that position.\n\
7745 With optional end, stop comparing S at that position.\n\
7746 prefix can also be a tuple of strings to try.");
7747 
7748 static PyObject *
unicode_startswith(PyUnicodeObject * self,PyObject * args)7749 unicode_startswith(PyUnicodeObject *self,
7750                    PyObject *args)
7751 {
7752     PyObject *subobj;
7753     PyUnicodeObject *substring;
7754     Py_ssize_t start = 0;
7755     Py_ssize_t end = PY_SSIZE_T_MAX;
7756     int result;
7757 
7758     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7759         return NULL;
7760     if (PyTuple_Check(subobj)) {
7761         Py_ssize_t i;
7762         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7763             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7764                 PyTuple_GET_ITEM(subobj, i));
7765             if (substring == NULL)
7766                 return NULL;
7767             result = tailmatch(self, substring, start, end, -1);
7768             Py_DECREF(substring);
7769             if (result) {
7770                 Py_RETURN_TRUE;
7771             }
7772         }
7773         /* nothing matched */
7774         Py_RETURN_FALSE;
7775     }
7776     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7777     if (substring == NULL) {
7778         if (PyErr_ExceptionMatches(PyExc_TypeError))
7779             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7780                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7781         return NULL;
7782     }
7783     result = tailmatch(self, substring, start, end, -1);
7784     Py_DECREF(substring);
7785     return PyBool_FromLong(result);
7786 }
7787 
7788 
7789 PyDoc_STRVAR(endswith__doc__,
7790              "S.endswith(suffix[, start[, end]]) -> bool\n\
7791 \n\
7792 Return True if S ends with the specified suffix, False otherwise.\n\
7793 With optional start, test S beginning at that position.\n\
7794 With optional end, stop comparing S at that position.\n\
7795 suffix can also be a tuple of strings to try.");
7796 
7797 static PyObject *
unicode_endswith(PyUnicodeObject * self,PyObject * args)7798 unicode_endswith(PyUnicodeObject *self,
7799                  PyObject *args)
7800 {
7801     PyObject *subobj;
7802     PyUnicodeObject *substring;
7803     Py_ssize_t start = 0;
7804     Py_ssize_t end = PY_SSIZE_T_MAX;
7805     int result;
7806 
7807     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7808         return NULL;
7809     if (PyTuple_Check(subobj)) {
7810         Py_ssize_t i;
7811         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7812             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7813                 PyTuple_GET_ITEM(subobj, i));
7814             if (substring == NULL)
7815                 return NULL;
7816             result = tailmatch(self, substring, start, end, +1);
7817             Py_DECREF(substring);
7818             if (result) {
7819                 Py_RETURN_TRUE;
7820             }
7821         }
7822         Py_RETURN_FALSE;
7823     }
7824     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7825     if (substring == NULL) {
7826         if (PyErr_ExceptionMatches(PyExc_TypeError))
7827             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7828                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7829         return NULL;
7830     }
7831     result = tailmatch(self, substring, start, end, +1);
7832     Py_DECREF(substring);
7833     return PyBool_FromLong(result);
7834 }
7835 
7836 
7837 /* Implements do_string_format, which is unicode because of stringlib */
7838 #include "stringlib/string_format.h"
7839 
/* Docstring text for S.format(*args, **kwargs).
   NOTE(review): the implementing function is not defined next to this
   string; presumably it is the do_string_format pulled in from
   stringlib/string_format.h -- confirm in the method table. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> unicode\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
7845 
7846 static PyObject *
unicode__format__(PyObject * self,PyObject * args)7847 unicode__format__(PyObject *self, PyObject *args)
7848 {
7849     PyObject *format_spec;
7850     PyObject *result = NULL;
7851     PyObject *tmp = NULL;
7852 
7853     /* If 2.x, convert format_spec to the same type as value */
7854     /* This is to allow things like u''.format('') */
7855     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7856         goto done;
7857     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7858         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7859                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7860         goto done;
7861     }
7862     tmp = PyObject_Unicode(format_spec);
7863     if (tmp == NULL)
7864         goto done;
7865     format_spec = tmp;
7866 
7867     result = _PyUnicode_FormatAdvanced(self,
7868                                        PyUnicode_AS_UNICODE(format_spec),
7869                                        PyUnicode_GET_SIZE(format_spec));
7870   done:
7871     Py_XDECREF(tmp);
7872     return result;
7873 }
7874 
/* Docstring for the "__format__" method (unicode__format__); referenced
   from the corresponding entry in unicode_methods. */
PyDoc_STRVAR(p_format__doc__,
             "S.__format__(format_spec) -> unicode\n\
\n\
Return a formatted version of S as described by format_spec.");
7879 
7880 static PyObject *
unicode__sizeof__(PyUnicodeObject * v)7881 unicode__sizeof__(PyUnicodeObject *v)
7882 {
7883     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7884                              sizeof(Py_UNICODE) * (v->length + 1));
7885 }
7886 
/* Docstring for the "__sizeof__" method (unicode__sizeof__); referenced
   from the corresponding entry in unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes\n\
\n\
");
7891 
7892 static PyObject *
unicode_getnewargs(PyUnicodeObject * v)7893 unicode_getnewargs(PyUnicodeObject *v)
7894 {
7895     return Py_BuildValue("(u#)", v->str, v->length);
7896 }
7897 
7898 
/* Method table for the unicode type.

   Each entry gives the Python-visible method name, the C implementation,
   the calling-convention flags (METH_NOARGS, METH_O, METH_VARARGS,
   optionally combined with METH_KEYWORDS) and, where present, the
   docstring.  The table is terminated by the {NULL, NULL} sentinel.

   Entries wrapped in "#if 0" and the commented-out "maketrans" line are
   disabled and are not compiled in. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* "format" is implemented by do_string_format from stringlib. */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* helpers are registered without a docstring. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7957 
7958 static PyObject *
unicode_mod(PyObject * v,PyObject * w)7959 unicode_mod(PyObject *v, PyObject *w)
7960 {
7961     if (!PyUnicode_Check(v)) {
7962         Py_INCREF(Py_NotImplemented);
7963         return Py_NotImplemented;
7964     }
7965     return PyUnicode_Format(v, w);
7966 }
7967 
/* Number protocol table for the unicode type.  Only the remainder slot is
   filled in (the "%" formatting operator, see unicode_mod); the slots that
   follow nb_remainder are not listed and are therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
7975 
/* Sequence protocol table for the unicode type.  The two assignment slots
   (sq_ass_item, sq_ass_slice) are 0, so no item or slice assignment
   handler is installed; slots after sq_contains are not listed and are
   zero-initialized. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
7986 
7987 static PyObject*
unicode_subscript(PyUnicodeObject * self,PyObject * item)7988 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7989 {
7990     if (PyIndex_Check(item)) {
7991         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7992         if (i == -1 && PyErr_Occurred())
7993             return NULL;
7994         if (i < 0)
7995             i += PyUnicode_GET_SIZE(self);
7996         return unicode_getitem(self, i);
7997     } else if (PySlice_Check(item)) {
7998         Py_ssize_t start, stop, step, slicelength, cur, i;
7999         Py_UNICODE* source_buf;
8000         Py_UNICODE* result_buf;
8001         PyObject* result;
8002 
8003         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8004                                  &start, &stop, &step, &slicelength) < 0) {
8005             return NULL;
8006         }
8007 
8008         if (slicelength <= 0) {
8009             return PyUnicode_FromUnicode(NULL, 0);
8010         } else if (start == 0 && step == 1 && slicelength == self->length &&
8011                    PyUnicode_CheckExact(self)) {
8012             Py_INCREF(self);
8013             return (PyObject *)self;
8014         } else if (step == 1) {
8015             return PyUnicode_FromUnicode(self->str + start, slicelength);
8016         } else {
8017             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8018             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8019                                                        sizeof(Py_UNICODE));
8020 
8021             if (result_buf == NULL)
8022                 return PyErr_NoMemory();
8023 
8024             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8025                 result_buf[i] = source_buf[cur];
8026             }
8027 
8028             result = PyUnicode_FromUnicode(result_buf, slicelength);
8029             PyObject_FREE(result_buf);
8030             return result;
8031         }
8032     } else {
8033         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8034         return NULL;
8035     }
8036 }
8037 
/* Mapping protocol table for the unicode type: length and subscript
   (index or slice, see unicode_subscript) are provided; the subscript
   assignment slot is explicitly 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8043 
8044 static Py_ssize_t
unicode_buffer_getreadbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8045 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8046                           Py_ssize_t index,
8047                           const void **ptr)
8048 {
8049     if (index != 0) {
8050         PyErr_SetString(PyExc_SystemError,
8051                         "accessing non-existent unicode segment");
8052         return -1;
8053     }
8054     *ptr = (void *) self->str;
8055     return PyUnicode_GET_DATA_SIZE(self);
8056 }
8057 
8058 static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8059 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8060                            const void **ptr)
8061 {
8062     PyErr_SetString(PyExc_TypeError,
8063                     "cannot use unicode as modifiable buffer");
8064     return -1;
8065 }
8066 
8067 static int
unicode_buffer_getsegcount(PyUnicodeObject * self,Py_ssize_t * lenp)8068 unicode_buffer_getsegcount(PyUnicodeObject *self,
8069                            Py_ssize_t *lenp)
8070 {
8071     if (lenp)
8072         *lenp = PyUnicode_GET_DATA_SIZE(self);
8073     return 1;
8074 }
8075 
8076 static Py_ssize_t
unicode_buffer_getcharbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)8077 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8078                           Py_ssize_t index,
8079                           const void **ptr)
8080 {
8081     PyObject *str;
8082 
8083     if (index != 0) {
8084         PyErr_SetString(PyExc_SystemError,
8085                         "accessing non-existent unicode segment");
8086         return -1;
8087     }
8088     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8089     if (str == NULL)
8090         return -1;
8091     *ptr = (void *) PyString_AS_STRING(str);
8092     return PyString_GET_SIZE(str);
8093 }
8094 
8095 /* Helpers for PyUnicode_Format() */
8096 
8097 static PyObject *
getnextarg(PyObject * args,Py_ssize_t arglen,Py_ssize_t * p_argidx)8098 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8099 {
8100     Py_ssize_t argidx = *p_argidx;
8101     if (argidx < arglen) {
8102         (*p_argidx)++;
8103         if (arglen < 0)
8104             return args;
8105         else
8106             return PyTuple_GetItem(args, argidx);
8107     }
8108     PyErr_SetString(PyExc_TypeError,
8109                     "not enough arguments for format string");
8110     return NULL;
8111 }
8112 
/* Bit flags collected while parsing a '%' conversion specification in
   PyUnicode_Format(); each one corresponds to a flag character. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
8118 
8119 static Py_ssize_t
strtounicode(Py_UNICODE * buffer,const char * charbuffer)8120 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8121 {
8122     register Py_ssize_t i;
8123     Py_ssize_t len = strlen(charbuffer);
8124     for (i = len - 1; i >= 0; i--)
8125         buffer[i] = (Py_UNICODE) charbuffer[i];
8126 
8127     return len;
8128 }
8129 
8130 static int
longtounicode(Py_UNICODE * buffer,size_t len,const char * format,long x)8131 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8132 {
8133     Py_ssize_t result;
8134 
8135     PyOS_snprintf((char *)buffer, len, format, x);
8136     result = strtounicode(buffer, (char *)buffer);
8137     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8138 }
8139 
8140 /* XXX To save some code duplication, formatfloat/long/int could have been
8141    shared with stringobject.c, converting from 8-bit to Unicode after the
8142    formatting is done. */
8143 
8144 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8145 
8146 static PyObject *
formatfloat(PyObject * v,int flags,int prec,int type)8147 formatfloat(PyObject *v, int flags, int prec, int type)
8148 {
8149     char *p;
8150     PyObject *result;
8151     double x;
8152 
8153     x = PyFloat_AsDouble(v);
8154     if (x == -1.0 && PyErr_Occurred())
8155         return NULL;
8156 
8157     if (prec < 0)
8158         prec = 6;
8159 
8160     p = PyOS_double_to_string(x, type, prec,
8161                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8162     if (p == NULL)
8163         return NULL;
8164     result = PyUnicode_FromStringAndSize(p, strlen(p));
8165     PyMem_Free(p);
8166     return result;
8167 }
8168 
8169 static PyObject*
formatlong(PyObject * val,int flags,int prec,int type)8170 formatlong(PyObject *val, int flags, int prec, int type)
8171 {
8172     char *buf;
8173     int i, len;
8174     PyObject *str; /* temporary string object. */
8175     PyUnicodeObject *result;
8176 
8177     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8178     if (!str)
8179         return NULL;
8180     result = _PyUnicode_New(len);
8181     if (!result) {
8182         Py_DECREF(str);
8183         return NULL;
8184     }
8185     for (i = 0; i < len; i++)
8186         result->str[i] = buf[i];
8187     result->str[len] = 0;
8188     Py_DECREF(str);
8189     return (PyObject*)result;
8190 }
8191 
8192 static int
formatint(Py_UNICODE * buf,size_t buflen,int flags,int prec,int type,PyObject * v)8193 formatint(Py_UNICODE *buf,
8194           size_t buflen,
8195           int flags,
8196           int prec,
8197           int type,
8198           PyObject *v)
8199 {
8200     /* fmt = '%#.' + `prec` + 'l' + `type`
8201      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8202      *                     + 1 + 1
8203      *                   = 24
8204      */
8205     char fmt[64]; /* plenty big enough! */
8206     char *sign;
8207     long x;
8208 
8209     x = PyInt_AsLong(v);
8210     if (x == -1 && PyErr_Occurred())
8211         return -1;
8212     if (x < 0 && type == 'u') {
8213         type = 'd';
8214     }
8215     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8216         sign = "-";
8217     else
8218         sign = "";
8219     if (prec < 0)
8220         prec = 1;
8221 
8222     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8223      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8224      */
8225     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8226         PyErr_SetString(PyExc_OverflowError,
8227                         "formatted integer is too long (precision too large?)");
8228         return -1;
8229     }
8230 
8231     if ((flags & F_ALT) &&
8232         (type == 'x' || type == 'X')) {
8233         /* When converting under %#x or %#X, there are a number
8234          * of issues that cause pain:
8235          * - when 0 is being converted, the C standard leaves off
8236          *   the '0x' or '0X', which is inconsistent with other
8237          *   %#x/%#X conversions and inconsistent with Python's
8238          *   hex() function
8239          * - there are platforms that violate the standard and
8240          *   convert 0 with the '0x' or '0X'
8241          *   (Metrowerks, Compaq Tru64)
8242          * - there are platforms that give '0x' when converting
8243          *   under %#X, but convert 0 in accordance with the
8244          *   standard (OS/2 EMX)
8245          *
8246          * We can achieve the desired consistency by inserting our
8247          * own '0x' or '0X' prefix, and substituting %x/%X in place
8248          * of %#x/%#X.
8249          *
8250          * Note that this is the same approach as used in
8251          * formatint() in stringobject.c
8252          */
8253         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8254                       sign, type, prec, type);
8255     }
8256     else {
8257         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8258                       sign, (flags&F_ALT) ? "#" : "",
8259                       prec, type);
8260     }
8261     if (sign[0])
8262         return longtounicode(buf, buflen, fmt, -x);
8263     else
8264         return longtounicode(buf, buflen, fmt, x);
8265 }
8266 
8267 static int
formatchar(Py_UNICODE * buf,size_t buflen,PyObject * v)8268 formatchar(Py_UNICODE *buf,
8269            size_t buflen,
8270            PyObject *v)
8271 {
8272     PyObject *unistr;
8273     char *str;
8274     /* presume that the buffer is at least 2 characters long */
8275     if (PyUnicode_Check(v)) {
8276         if (PyUnicode_GET_SIZE(v) != 1)
8277             goto onError;
8278         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8279     }
8280 
8281     else if (PyString_Check(v)) {
8282         if (PyString_GET_SIZE(v) != 1)
8283             goto onError;
8284         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8285            with a UnicodeDecodeError if 'char' is not decodable with the
8286            default encoding (usually ASCII, but it might be something else) */
8287         str = PyString_AS_STRING(v);
8288         if ((unsigned char)str[0] > 0x7F) {
8289             /* the char is not ASCII; try to decode the string using the
8290                default encoding and return -1 to let the UnicodeDecodeError
8291                be raised if the string can't be decoded */
8292             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8293             if (unistr == NULL)
8294                 return -1;
8295             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8296             Py_DECREF(unistr);
8297         }
8298         else
8299             buf[0] = (Py_UNICODE)str[0];
8300     }
8301 
8302     else {
8303         /* Integer input truncated to a character */
8304         long x;
8305         x = PyInt_AsLong(v);
8306         if (x == -1 && PyErr_Occurred())
8307             goto onError;
8308 #ifdef Py_UNICODE_WIDE
8309         if (x < 0 || x > 0x10ffff) {
8310             PyErr_SetString(PyExc_OverflowError,
8311                             "%c arg not in range(0x110000) "
8312                             "(wide Python build)");
8313             return -1;
8314         }
8315 #else
8316         if (x < 0 || x > 0xffff) {
8317             PyErr_SetString(PyExc_OverflowError,
8318                             "%c arg not in range(0x10000) "
8319                             "(narrow Python build)");
8320             return -1;
8321         }
8322 #endif
8323         buf[0] = (Py_UNICODE) x;
8324     }
8325     buf[1] = '\0';
8326     return 1;
8327 
8328   onError:
8329     PyErr_SetString(PyExc_TypeError,
8330                     "%c requires int or char");
8331     return -1;
8332 }
8333 
8334 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8335 
8336    FORMATBUFLEN is the length of the buffer in which the ints &
8337    chars are formatted. XXX This is a magic number. Each formatting
8338    routine does bounds checking to ensure no overflow, but a better
8339    solution may be to malloc a buffer of appropriate size for each
8340    format. For now, the current solution is sufficient.
8341 */
8342 #define FORMATBUFLEN (size_t)120
8343 
PyUnicode_Format(PyObject * format,PyObject * args)8344 PyObject *PyUnicode_Format(PyObject *format,
8345                            PyObject *args)
8346 {
8347     Py_UNICODE *fmt, *res;
8348     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8349     int args_owned = 0;
8350     PyUnicodeObject *result = NULL;
8351     PyObject *dict = NULL;
8352     PyObject *uformat;
8353 
8354     if (format == NULL || args == NULL) {
8355         PyErr_BadInternalCall();
8356         return NULL;
8357     }
8358     uformat = PyUnicode_FromObject(format);
8359     if (uformat == NULL)
8360         return NULL;
8361     fmt = PyUnicode_AS_UNICODE(uformat);
8362     fmtcnt = PyUnicode_GET_SIZE(uformat);
8363 
8364     reslen = rescnt = fmtcnt + 100;
8365     result = _PyUnicode_New(reslen);
8366     if (result == NULL)
8367         goto onError;
8368     res = PyUnicode_AS_UNICODE(result);
8369 
8370     if (PyTuple_Check(args)) {
8371         arglen = PyTuple_Size(args);
8372         argidx = 0;
8373     }
8374     else {
8375         arglen = -1;
8376         argidx = -2;
8377     }
8378     if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8379         !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
8380         dict = args;
8381 
8382     while (--fmtcnt >= 0) {
8383         if (*fmt != '%') {
8384             if (--rescnt < 0) {
8385                 rescnt = fmtcnt + 100;
8386                 reslen += rescnt;
8387                 if (_PyUnicode_Resize(&result, reslen) < 0)
8388                     goto onError;
8389                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8390                 --rescnt;
8391             }
8392             *res++ = *fmt++;
8393         }
8394         else {
8395             /* Got a format specifier */
8396             int flags = 0;
8397             Py_ssize_t width = -1;
8398             int prec = -1;
8399             Py_UNICODE c = '\0';
8400             Py_UNICODE fill;
8401             int isnumok;
8402             PyObject *v = NULL;
8403             PyObject *temp = NULL;
8404             Py_UNICODE *pbuf;
8405             Py_UNICODE sign;
8406             Py_ssize_t len;
8407             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8408 
8409             fmt++;
8410             if (*fmt == '(') {
8411                 Py_UNICODE *keystart;
8412                 Py_ssize_t keylen;
8413                 PyObject *key;
8414                 int pcount = 1;
8415 
8416                 if (dict == NULL) {
8417                     PyErr_SetString(PyExc_TypeError,
8418                                     "format requires a mapping");
8419                     goto onError;
8420                 }
8421                 ++fmt;
8422                 --fmtcnt;
8423                 keystart = fmt;
8424                 /* Skip over balanced parentheses */
8425                 while (pcount > 0 && --fmtcnt >= 0) {
8426                     if (*fmt == ')')
8427                         --pcount;
8428                     else if (*fmt == '(')
8429                         ++pcount;
8430                     fmt++;
8431                 }
8432                 keylen = fmt - keystart - 1;
8433                 if (fmtcnt < 0 || pcount > 0) {
8434                     PyErr_SetString(PyExc_ValueError,
8435                                     "incomplete format key");
8436                     goto onError;
8437                 }
8438 #if 0
8439                 /* keys are converted to strings using UTF-8 and
8440                    then looked up since Python uses strings to hold
8441                    variables names etc. in its namespaces and we
8442                    wouldn't want to break common idioms. */
8443                 key = PyUnicode_EncodeUTF8(keystart,
8444                                            keylen,
8445                                            NULL);
8446 #else
8447                 key = PyUnicode_FromUnicode(keystart, keylen);
8448 #endif
8449                 if (key == NULL)
8450                     goto onError;
8451                 if (args_owned) {
8452                     Py_DECREF(args);
8453                     args_owned = 0;
8454                 }
8455                 args = PyObject_GetItem(dict, key);
8456                 Py_DECREF(key);
8457                 if (args == NULL) {
8458                     goto onError;
8459                 }
8460                 args_owned = 1;
8461                 arglen = -1;
8462                 argidx = -2;
8463             }
8464             while (--fmtcnt >= 0) {
8465                 switch (c = *fmt++) {
8466                 case '-': flags |= F_LJUST; continue;
8467                 case '+': flags |= F_SIGN; continue;
8468                 case ' ': flags |= F_BLANK; continue;
8469                 case '#': flags |= F_ALT; continue;
8470                 case '0': flags |= F_ZERO; continue;
8471                 }
8472                 break;
8473             }
8474             if (c == '*') {
8475                 v = getnextarg(args, arglen, &argidx);
8476                 if (v == NULL)
8477                     goto onError;
8478                 if (!PyInt_Check(v)) {
8479                     PyErr_SetString(PyExc_TypeError,
8480                                     "* wants int");
8481                     goto onError;
8482                 }
8483                 width = PyInt_AsSsize_t(v);
8484                 if (width == -1 && PyErr_Occurred())
8485                     goto onError;
8486                 if (width < 0) {
8487                     flags |= F_LJUST;
8488                     width = -width;
8489                 }
8490                 if (--fmtcnt >= 0)
8491                     c = *fmt++;
8492             }
8493             else if (c >= '0' && c <= '9') {
8494                 width = c - '0';
8495                 while (--fmtcnt >= 0) {
8496                     c = *fmt++;
8497                     if (c < '0' || c > '9')
8498                         break;
8499                     if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
8500                         PyErr_SetString(PyExc_ValueError,
8501                                         "width too big");
8502                         goto onError;
8503                     }
8504                     width = width*10 + (c - '0');
8505                 }
8506             }
8507             if (c == '.') {
8508                 prec = 0;
8509                 if (--fmtcnt >= 0)
8510                     c = *fmt++;
8511                 if (c == '*') {
8512                     v = getnextarg(args, arglen, &argidx);
8513                     if (v == NULL)
8514                         goto onError;
8515                     if (!PyInt_Check(v)) {
8516                         PyErr_SetString(PyExc_TypeError,
8517                                         "* wants int");
8518                         goto onError;
8519                     }
8520                     prec = _PyInt_AsInt(v);
8521                     if (prec == -1 && PyErr_Occurred())
8522                         goto onError;
8523                     if (prec < 0)
8524                         prec = 0;
8525                     if (--fmtcnt >= 0)
8526                         c = *fmt++;
8527                 }
8528                 else if (c >= '0' && c <= '9') {
8529                     prec = c - '0';
8530                     while (--fmtcnt >= 0) {
8531                         c = *fmt++;
8532                         if (c < '0' || c > '9')
8533                             break;
8534                         if (prec > (INT_MAX - ((int)c - '0')) / 10) {
8535                             PyErr_SetString(PyExc_ValueError,
8536                                             "prec too big");
8537                             goto onError;
8538                         }
8539                         prec = prec*10 + (c - '0');
8540                     }
8541                 }
8542             } /* prec */
8543             if (fmtcnt >= 0) {
8544                 if (c == 'h' || c == 'l' || c == 'L') {
8545                     if (--fmtcnt >= 0)
8546                         c = *fmt++;
8547                 }
8548             }
8549             if (fmtcnt < 0) {
8550                 PyErr_SetString(PyExc_ValueError,
8551                                 "incomplete format");
8552                 goto onError;
8553             }
8554             if (c != '%') {
8555                 v = getnextarg(args, arglen, &argidx);
8556                 if (v == NULL)
8557                     goto onError;
8558             }
8559             sign = 0;
8560             fill = ' ';
8561             switch (c) {
8562 
8563             case '%':
8564                 pbuf = formatbuf;
8565                 /* presume that buffer length is at least 1 */
8566                 pbuf[0] = '%';
8567                 len = 1;
8568                 break;
8569 
8570             case 's':
8571             case 'r':
8572                 if (PyUnicode_CheckExact(v) && c == 's') {
8573                     temp = v;
8574                     Py_INCREF(temp);
8575                 }
8576                 else {
8577                     PyObject *unicode;
8578                     if (c == 's')
8579                         temp = PyObject_Unicode(v);
8580                     else
8581                         temp = PyObject_Repr(v);
8582                     if (temp == NULL)
8583                         goto onError;
8584                     if (PyUnicode_Check(temp))
8585                         /* nothing to do */;
8586                     else if (PyString_Check(temp)) {
8587                         /* convert to string to Unicode */
8588                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8589                                                    PyString_GET_SIZE(temp),
8590                                                    NULL,
8591                                                    "strict");
8592                         Py_DECREF(temp);
8593                         temp = unicode;
8594                         if (temp == NULL)
8595                             goto onError;
8596                     }
8597                     else {
8598                         Py_DECREF(temp);
8599                         PyErr_SetString(PyExc_TypeError,
8600                                         "%s argument has non-string str()");
8601                         goto onError;
8602                     }
8603                 }
8604                 pbuf = PyUnicode_AS_UNICODE(temp);
8605                 len = PyUnicode_GET_SIZE(temp);
8606                 if (prec >= 0 && len > prec)
8607                     len = prec;
8608                 break;
8609 
8610             case 'i':
8611             case 'd':
8612             case 'u':
8613             case 'o':
8614             case 'x':
8615             case 'X':
8616                 if (c == 'i')
8617                     c = 'd';
8618                 isnumok = 0;
8619                 if (PyNumber_Check(v)) {
8620                     PyObject *iobj=NULL;
8621 
8622                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8623                         iobj = v;
8624                         Py_INCREF(iobj);
8625                     }
8626                     else {
8627                         iobj = PyNumber_Int(v);
8628                         if (iobj==NULL) iobj = PyNumber_Long(v);
8629                     }
8630                     if (iobj!=NULL) {
8631                         if (PyInt_Check(iobj)) {
8632                             isnumok = 1;
8633                             pbuf = formatbuf;
8634                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8635                                             flags, prec, c, iobj);
8636                             Py_DECREF(iobj);
8637                             if (len < 0)
8638                                 goto onError;
8639                             sign = 1;
8640                         }
8641                         else if (PyLong_Check(iobj)) {
8642                             isnumok = 1;
8643                             temp = formatlong(iobj, flags, prec, c);
8644                             Py_DECREF(iobj);
8645                             if (!temp)
8646                                 goto onError;
8647                             pbuf = PyUnicode_AS_UNICODE(temp);
8648                             len = PyUnicode_GET_SIZE(temp);
8649                             sign = 1;
8650                         }
8651                         else {
8652                             Py_DECREF(iobj);
8653                         }
8654                     }
8655                 }
8656                 if (!isnumok) {
8657                     PyErr_Format(PyExc_TypeError,
8658                                  "%%%c format: a number is required, "
8659                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8660                     goto onError;
8661                 }
8662                 if (flags & F_ZERO)
8663                     fill = '0';
8664                 break;
8665 
8666             case 'e':
8667             case 'E':
8668             case 'f':
8669             case 'F':
8670             case 'g':
8671             case 'G':
8672                 temp = formatfloat(v, flags, prec, c);
8673                 if (temp == NULL)
8674                     goto onError;
8675                 pbuf = PyUnicode_AS_UNICODE(temp);
8676                 len = PyUnicode_GET_SIZE(temp);
8677                 sign = 1;
8678                 if (flags & F_ZERO)
8679                     fill = '0';
8680                 break;
8681 
8682             case 'c':
8683                 pbuf = formatbuf;
8684                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8685                 if (len < 0)
8686                     goto onError;
8687                 break;
8688 
8689             default:
8690                 PyErr_Format(PyExc_ValueError,
8691                              "unsupported format character '%c' (0x%x) "
8692                              "at index %zd",
8693                              (31<=c && c<=126) ? (char)c : '?',
8694                              (int)c,
8695                              (Py_ssize_t)(fmt - 1 -
8696                                           PyUnicode_AS_UNICODE(uformat)));
8697                 goto onError;
8698             }
8699             if (sign) {
8700                 if (*pbuf == '-' || *pbuf == '+') {
8701                     sign = *pbuf++;
8702                     len--;
8703                 }
8704                 else if (flags & F_SIGN)
8705                     sign = '+';
8706                 else if (flags & F_BLANK)
8707                     sign = ' ';
8708                 else
8709                     sign = 0;
8710             }
8711             if (width < len)
8712                 width = len;
8713             if (rescnt - (sign != 0) < width) {
8714                 reslen -= rescnt;
8715                 rescnt = width + fmtcnt + 100;
8716                 reslen += rescnt;
8717                 if (reslen < 0) {
8718                     Py_XDECREF(temp);
8719                     PyErr_NoMemory();
8720                     goto onError;
8721                 }
8722                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8723                     Py_XDECREF(temp);
8724                     goto onError;
8725                 }
8726                 res = PyUnicode_AS_UNICODE(result)
8727                     + reslen - rescnt;
8728             }
8729             if (sign) {
8730                 if (fill != ' ')
8731                     *res++ = sign;
8732                 rescnt--;
8733                 if (width > len)
8734                     width--;
8735             }
8736             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8737                 assert(pbuf[0] == '0');
8738                 assert(pbuf[1] == c);
8739                 if (fill != ' ') {
8740                     *res++ = *pbuf++;
8741                     *res++ = *pbuf++;
8742                 }
8743                 rescnt -= 2;
8744                 width -= 2;
8745                 if (width < 0)
8746                     width = 0;
8747                 len -= 2;
8748             }
8749             if (width > len && !(flags & F_LJUST)) {
8750                 do {
8751                     --rescnt;
8752                     *res++ = fill;
8753                 } while (--width > len);
8754             }
8755             if (fill == ' ') {
8756                 if (sign)
8757                     *res++ = sign;
8758                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8759                     assert(pbuf[0] == '0');
8760                     assert(pbuf[1] == c);
8761                     *res++ = *pbuf++;
8762                     *res++ = *pbuf++;
8763                 }
8764             }
8765             Py_UNICODE_COPY(res, pbuf, len);
8766             res += len;
8767             rescnt -= len;
8768             while (--width >= len) {
8769                 --rescnt;
8770                 *res++ = ' ';
8771             }
8772             if (dict && (argidx < arglen) && c != '%') {
8773                 PyErr_SetString(PyExc_TypeError,
8774                                 "not all arguments converted during string formatting");
8775                 Py_XDECREF(temp);
8776                 goto onError;
8777             }
8778             Py_XDECREF(temp);
8779         } /* '%' */
8780     } /* until end */
8781     if (argidx < arglen && !dict) {
8782         PyErr_SetString(PyExc_TypeError,
8783                         "not all arguments converted during string formatting");
8784         goto onError;
8785     }
8786 
8787     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8788         goto onError;
8789     if (args_owned) {
8790         Py_DECREF(args);
8791     }
8792     Py_DECREF(uformat);
8793     return (PyObject *)result;
8794 
8795   onError:
8796     Py_XDECREF(result);
8797     Py_DECREF(uformat);
8798     if (args_owned) {
8799         Py_DECREF(args);
8800     }
8801     return NULL;
8802 }
8803 
8804 static PyBufferProcs unicode_as_buffer = {
8805     (readbufferproc) unicode_buffer_getreadbuf,
8806     (writebufferproc) unicode_buffer_getwritebuf,
8807     (segcountproc) unicode_buffer_getsegcount,
8808     (charbufferproc) unicode_buffer_getcharbuf,
8809 };
8810 
8811 static PyObject *
8812 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8813 
8814 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8815 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8816 {
8817     PyObject *x = NULL;
8818     static char *kwlist[] = {"string", "encoding", "errors", 0};
8819     char *encoding = NULL;
8820     char *errors = NULL;
8821 
8822     if (type != &PyUnicode_Type)
8823         return unicode_subtype_new(type, args, kwds);
8824     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8825                                      kwlist, &x, &encoding, &errors))
8826         return NULL;
8827     if (x == NULL)
8828         return (PyObject *)_PyUnicode_New(0);
8829     if (encoding == NULL && errors == NULL)
8830         return PyObject_Unicode(x);
8831     else
8832         return PyUnicode_FromEncodedObject(x, encoding, errors);
8833 }
8834 
8835 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8836 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8837 {
8838     PyUnicodeObject *tmp, *pnew;
8839     Py_ssize_t n;
8840 
8841     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8842     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8843     if (tmp == NULL)
8844         return NULL;
8845     assert(PyUnicode_Check(tmp));
8846     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8847     if (pnew == NULL) {
8848         Py_DECREF(tmp);
8849         return NULL;
8850     }
8851     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8852     if (pnew->str == NULL) {
8853         _Py_ForgetReference((PyObject *)pnew);
8854         PyObject_Del(pnew);
8855         Py_DECREF(tmp);
8856         return PyErr_NoMemory();
8857     }
8858     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8859     pnew->length = n;
8860     pnew->hash = tmp->hash;
8861     Py_DECREF(tmp);
8862     return (PyObject *)pnew;
8863 }
8864 
8865 PyDoc_STRVAR(unicode_doc,
8866              "unicode(object='') -> unicode object\n\
8867 unicode(string[, encoding[, errors]]) -> unicode object\n\
8868 \n\
8869 Create a new Unicode object from the given encoded string.\n\
8870 encoding defaults to the current default string encoding.\n\
8871 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8872 
8873 PyTypeObject PyUnicode_Type = {
8874     PyVarObject_HEAD_INIT(&PyType_Type, 0)
8875     "unicode",              /* tp_name */
8876     sizeof(PyUnicodeObject),        /* tp_size */
8877     0,                  /* tp_itemsize */
8878     /* Slots */
8879     (destructor)unicode_dealloc,    /* tp_dealloc */
8880     0,                  /* tp_print */
8881     0,                  /* tp_getattr */
8882     0,                  /* tp_setattr */
8883     0,                  /* tp_compare */
8884     unicode_repr,           /* tp_repr */
8885     &unicode_as_number,         /* tp_as_number */
8886     &unicode_as_sequence,       /* tp_as_sequence */
8887     &unicode_as_mapping,        /* tp_as_mapping */
8888     (hashfunc) unicode_hash,        /* tp_hash*/
8889     0,                  /* tp_call*/
8890     (reprfunc) unicode_str,     /* tp_str */
8891     PyObject_GenericGetAttr,        /* tp_getattro */
8892     0,                  /* tp_setattro */
8893     &unicode_as_buffer,         /* tp_as_buffer */
8894     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8895     Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
8896     unicode_doc,            /* tp_doc */
8897     0,                  /* tp_traverse */
8898     0,                  /* tp_clear */
8899     PyUnicode_RichCompare,      /* tp_richcompare */
8900     0,                  /* tp_weaklistoffset */
8901     0,                  /* tp_iter */
8902     0,                  /* tp_iternext */
8903     unicode_methods,            /* tp_methods */
8904     0,                  /* tp_members */
8905     0,                  /* tp_getset */
8906     &PyBaseString_Type,         /* tp_base */
8907     0,                  /* tp_dict */
8908     0,                  /* tp_descr_get */
8909     0,                  /* tp_descr_set */
8910     0,                  /* tp_dictoffset */
8911     0,                  /* tp_init */
8912     0,                  /* tp_alloc */
8913     unicode_new,            /* tp_new */
8914     PyObject_Del,           /* tp_free */
8915 };
8916 
8917 /* Initialize the Unicode implementation */
8918 
_PyUnicode_Init(void)8919 void _PyUnicode_Init(void)
8920 {
8921     /* XXX - move this array to unicodectype.c ? */
8922     Py_UNICODE linebreak[] = {
8923         0x000A, /* LINE FEED */
8924         0x000D, /* CARRIAGE RETURN */
8925         0x001C, /* FILE SEPARATOR */
8926         0x001D, /* GROUP SEPARATOR */
8927         0x001E, /* RECORD SEPARATOR */
8928         0x0085, /* NEXT LINE */
8929         0x2028, /* LINE SEPARATOR */
8930         0x2029, /* PARAGRAPH SEPARATOR */
8931     };
8932 
8933     /* Init the implementation */
8934     if (!unicode_empty) {
8935         unicode_empty = _PyUnicode_New(0);
8936         if (!unicode_empty)
8937             return;
8938     }
8939 
8940     if (PyType_Ready(&PyUnicode_Type) < 0)
8941         Py_FatalError("Can't initialize 'unicode'");
8942 
8943     /* initialize the linebreak bloom filter */
8944     bloom_linebreak = make_bloom_mask(
8945         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8946         );
8947 
8948     PyType_Ready(&EncodingMapType);
8949 
8950     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8951         Py_FatalError("Can't initialize field name iterator type");
8952 
8953     if (PyType_Ready(&PyFormatterIter_Type) < 0)
8954         Py_FatalError("Can't initialize formatter iter type");
8955 }
8956 
8957 /* Finalize the Unicode implementation */
8958 
8959 int
PyUnicode_ClearFreeList(void)8960 PyUnicode_ClearFreeList(void)
8961 {
8962     int freelist_size = numfree;
8963     PyUnicodeObject *u;
8964 
8965     for (u = free_list; u != NULL;) {
8966         PyUnicodeObject *v = u;
8967         u = *(PyUnicodeObject **)u;
8968         if (v->str)
8969             PyObject_DEL(v->str);
8970         Py_XDECREF(v->defenc);
8971         PyObject_Del(v);
8972         numfree--;
8973     }
8974     free_list = NULL;
8975     assert(numfree == 0);
8976     return freelist_size;
8977 }
8978 
8979 void
_PyUnicode_Fini(void)8980 _PyUnicode_Fini(void)
8981 {
8982     int i;
8983 
8984     Py_CLEAR(unicode_empty);
8985 
8986     for (i = 0; i < 256; i++)
8987         Py_CLEAR(unicode_latin1[i]);
8988 
8989     (void)PyUnicode_ClearFreeList();
8990 }
8991 
8992 #ifdef __cplusplus
8993 }
8994 #endif
8995