1 /*
2     string_format.h -- implementation of string.format().
3 
4     It uses the Objects/stringlib conventions, so that it can be
5     compiled for both unicode and string objects.
6 */
7 
8 
/* Compatibility shim: when building for a Python older than 3.0 the
   name PyLong_FromSsize_t is mapped onto _PyLong_FromSsize_t, so the
   code below can use a single spelling. */
#if PY_VERSION_HEX < 0x03000000
#  define PyLong_FromSsize_t _PyLong_FromSsize_t
#endif
13 
/* Tuning constants for growing the output buffer (see output_extend):
   each resize over-allocates by the current increment.  The increment
   starts at INITIAL_SIZE_INCREMENT and is multiplied by SIZE_MULTIPLIER
   after every resize for as long as it is below MAX_SIZE_INCREMENT. */
#define INITIAL_SIZE_INCREMENT  100
#define SIZE_MULTIPLIER         2
#define MAX_SIZE_INCREMENT      3200
18 
19 
20 /************************************************************************/
21 /***********   Global data structures and forward declarations  *********/
22 /************************************************************************/
23 
24 /*
25    A SubString consists of the characters between two string or
26    unicode pointers.
27 */
28 typedef struct {
29     STRINGLIB_CHAR *ptr;
30     STRINGLIB_CHAR *end;
31 } SubString;
32 
33 
/* Records whether field numbering has been decided yet and, if so,
   whether it is automatic or manual. */
typedef enum {
    ANS_INIT,       /* no empty or numeric field name seen so far */
    ANS_AUTO,       /* first such field name was empty: auto-numbering */
    ANS_MANUAL      /* first such field name was a number: manual */
} AutoNumberState;
39 
40 /* Keeps track of our auto-numbering state, and which number field we're on */
41 typedef struct {
42     AutoNumberState an_state;
43     int an_field_number;
44 } AutoNumber;
45 
46 
47 /* forward declaration for recursion */
48 static PyObject *
49 build_string(SubString *input, PyObject *args, PyObject *kwargs,
50              int recursion_depth, AutoNumber *auto_number);
51 
52 
53 
54 /************************************************************************/
55 /**************************  Utility  functions  ************************/
56 /************************************************************************/
57 
58 static void
AutoNumber_Init(AutoNumber * auto_number)59 AutoNumber_Init(AutoNumber *auto_number)
60 {
61     auto_number->an_state = ANS_INIT;
62     auto_number->an_field_number = 0;
63 }
64 
65 /* fill in a SubString from a pointer and length */
66 Py_LOCAL_INLINE(void)
SubString_init(SubString * str,STRINGLIB_CHAR * p,Py_ssize_t len)67 SubString_init(SubString *str, STRINGLIB_CHAR *p, Py_ssize_t len)
68 {
69     str->ptr = p;
70     if (p == NULL)
71         str->end = NULL;
72     else
73         str->end = str->ptr + len;
74 }
75 
76 /* return a new string.  if str->ptr is NULL, return None */
77 Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString * str)78 SubString_new_object(SubString *str)
79 {
80     if (str->ptr == NULL) {
81         Py_INCREF(Py_None);
82         return Py_None;
83     }
84     return STRINGLIB_NEW(str->ptr, str->end - str->ptr);
85 }
86 
87 /* return a new string.  if str->ptr is NULL, return None */
88 Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString * str)89 SubString_new_object_or_empty(SubString *str)
90 {
91     if (str->ptr == NULL) {
92         return STRINGLIB_NEW(NULL, 0);
93     }
94     return STRINGLIB_NEW(str->ptr, str->end - str->ptr);
95 }
96 
97 /* Return 1 if an error has been detected switching between automatic
98    field numbering and manual field specification, else return 0. Set
99    ValueError on error. */
100 static int
autonumber_state_error(AutoNumberState state,int field_name_is_empty)101 autonumber_state_error(AutoNumberState state, int field_name_is_empty)
102 {
103     if (state == ANS_MANUAL) {
104         if (field_name_is_empty) {
105             PyErr_SetString(PyExc_ValueError, "cannot switch from "
106                             "manual field specification to "
107                             "automatic field numbering");
108             return 1;
109         }
110     }
111     else {
112         if (!field_name_is_empty) {
113             PyErr_SetString(PyExc_ValueError, "cannot switch from "
114                             "automatic field numbering to "
115                             "manual field specification");
116             return 1;
117         }
118     }
119     return 0;
120 }
121 
122 
123 /************************************************************************/
124 /***********    Output string management functions       ****************/
125 /************************************************************************/
126 
127 typedef struct {
128     STRINGLIB_CHAR *ptr;
129     STRINGLIB_CHAR *end;
130     PyObject *obj;
131     Py_ssize_t size_increment;
132 } OutputString;
133 
134 /* initialize an OutputString object, reserving size characters */
135 static int
output_initialize(OutputString * output,Py_ssize_t size)136 output_initialize(OutputString *output, Py_ssize_t size)
137 {
138     output->obj = STRINGLIB_NEW(NULL, size);
139     if (output->obj == NULL)
140         return 0;
141 
142     output->ptr = STRINGLIB_STR(output->obj);
143     output->end = STRINGLIB_LEN(output->obj) + output->ptr;
144     output->size_increment = INITIAL_SIZE_INCREMENT;
145 
146     return 1;
147 }
148 
149 /*
150     output_extend reallocates the output string buffer.
151     It returns a status:  0 for a failed reallocation,
152     1 for success.
153 */
154 
155 static int
output_extend(OutputString * output,Py_ssize_t count)156 output_extend(OutputString *output, Py_ssize_t count)
157 {
158     STRINGLIB_CHAR *startptr = STRINGLIB_STR(output->obj);
159     Py_ssize_t curlen = output->ptr - startptr;
160     Py_ssize_t maxlen = curlen + count + output->size_increment;
161 
162     if (STRINGLIB_RESIZE(&output->obj, maxlen) < 0)
163         return 0;
164     startptr = STRINGLIB_STR(output->obj);
165     output->ptr = startptr + curlen;
166     output->end = startptr + maxlen;
167     if (output->size_increment < MAX_SIZE_INCREMENT)
168         output->size_increment *= SIZE_MULTIPLIER;
169     return 1;
170 }
171 
172 /*
173     output_data dumps characters into our output string
174     buffer.
175 
176     In some cases, it has to reallocate the string.
177 
178     It returns a status:  0 for a failed reallocation,
179     1 for success.
180 */
181 static int
output_data(OutputString * output,const STRINGLIB_CHAR * s,Py_ssize_t count)182 output_data(OutputString *output, const STRINGLIB_CHAR *s, Py_ssize_t count)
183 {
184     if ((count > output->end - output->ptr) && !output_extend(output, count))
185         return 0;
186     memcpy(output->ptr, s, count * sizeof(STRINGLIB_CHAR));
187     output->ptr += count;
188     return 1;
189 }
190 
191 /************************************************************************/
192 /***********  Format string parsing -- integers and identifiers *********/
193 /************************************************************************/
194 
195 static Py_ssize_t
get_integer(const SubString * str)196 get_integer(const SubString *str)
197 {
198     Py_ssize_t accumulator = 0;
199     Py_ssize_t digitval;
200     STRINGLIB_CHAR *p;
201 
202     /* empty string is an error */
203     if (str->ptr >= str->end)
204         return -1;
205 
206     for (p = str->ptr; p < str->end; p++) {
207         digitval = STRINGLIB_TODECIMAL(*p);
208         if (digitval < 0)
209             return -1;
210         /*
211            Detect possible overflow before it happens:
212 
213               accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
214               accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
215         */
216         if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
217             PyErr_Format(PyExc_ValueError,
218                          "Too many decimal digits in format string");
219             return -1;
220         }
221         accumulator = accumulator * 10 + digitval;
222     }
223     return accumulator;
224 }
225 
226 /************************************************************************/
227 /******** Functions to get field objects and specification strings ******/
228 /************************************************************************/
229 
230 /* do the equivalent of obj.name */
231 static PyObject *
getattr(PyObject * obj,SubString * name)232 getattr(PyObject *obj, SubString *name)
233 {
234     PyObject *newobj;
235     PyObject *str = SubString_new_object(name);
236     if (str == NULL)
237         return NULL;
238     newobj = PyObject_GetAttr(obj, str);
239     Py_DECREF(str);
240     return newobj;
241 }
242 
243 /* do the equivalent of obj[idx], where obj is a sequence */
244 static PyObject *
getitem_sequence(PyObject * obj,Py_ssize_t idx)245 getitem_sequence(PyObject *obj, Py_ssize_t idx)
246 {
247     return PySequence_GetItem(obj, idx);
248 }
249 
250 /* do the equivalent of obj[idx], where obj is not a sequence */
251 static PyObject *
getitem_idx(PyObject * obj,Py_ssize_t idx)252 getitem_idx(PyObject *obj, Py_ssize_t idx)
253 {
254     PyObject *newobj;
255     PyObject *idx_obj = PyLong_FromSsize_t(idx);
256     if (idx_obj == NULL)
257         return NULL;
258     newobj = PyObject_GetItem(obj, idx_obj);
259     Py_DECREF(idx_obj);
260     return newobj;
261 }
262 
263 /* do the equivalent of obj[name] */
264 static PyObject *
getitem_str(PyObject * obj,SubString * name)265 getitem_str(PyObject *obj, SubString *name)
266 {
267     PyObject *newobj;
268     PyObject *str = SubString_new_object(name);
269     if (str == NULL)
270         return NULL;
271     newobj = PyObject_GetItem(obj, str);
272     Py_DECREF(str);
273     return newobj;
274 }
275 
276 typedef struct {
277     /* the entire string we're parsing.  we assume that someone else
278        is managing its lifetime, and that it will exist for the
279        lifetime of the iterator.  can be empty */
280     SubString str;
281 
282     /* pointer to where we are inside field_name */
283     STRINGLIB_CHAR *ptr;
284 } FieldNameIterator;
285 
286 
287 static int
FieldNameIterator_init(FieldNameIterator * self,STRINGLIB_CHAR * ptr,Py_ssize_t len)288 FieldNameIterator_init(FieldNameIterator *self, STRINGLIB_CHAR *ptr,
289                        Py_ssize_t len)
290 {
291     SubString_init(&self->str, ptr, len);
292     self->ptr = self->str.ptr;
293     return 1;
294 }
295 
296 static int
_FieldNameIterator_attr(FieldNameIterator * self,SubString * name)297 _FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
298 {
299     STRINGLIB_CHAR c;
300 
301     name->ptr = self->ptr;
302 
303     /* return everything until '.' or '[' */
304     while (self->ptr < self->str.end) {
305         switch (c = *self->ptr++) {
306         case '[':
307         case '.':
308             /* backup so that we this character will be seen next time */
309             self->ptr--;
310             break;
311         default:
312             continue;
313         }
314         break;
315     }
316     /* end of string is okay */
317     name->end = self->ptr;
318     return 1;
319 }
320 
321 static int
_FieldNameIterator_item(FieldNameIterator * self,SubString * name)322 _FieldNameIterator_item(FieldNameIterator *self, SubString *name)
323 {
324     int bracket_seen = 0;
325     STRINGLIB_CHAR c;
326 
327     name->ptr = self->ptr;
328 
329     /* return everything until ']' */
330     while (self->ptr < self->str.end) {
331         switch (c = *self->ptr++) {
332         case ']':
333             bracket_seen = 1;
334             break;
335         default:
336             continue;
337         }
338         break;
339     }
340     /* make sure we ended with a ']' */
341     if (!bracket_seen) {
342         PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
343         return 0;
344     }
345 
346     /* end of string is okay */
347     /* don't include the ']' */
348     name->end = self->ptr-1;
349     return 1;
350 }
351 
352 /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
353 static int
FieldNameIterator_next(FieldNameIterator * self,int * is_attribute,Py_ssize_t * name_idx,SubString * name)354 FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
355                        Py_ssize_t *name_idx, SubString *name)
356 {
357     /* check at end of input */
358     if (self->ptr >= self->str.end)
359         return 1;
360 
361     switch (*self->ptr++) {
362     case '.':
363         *is_attribute = 1;
364         if (_FieldNameIterator_attr(self, name) == 0)
365             return 0;
366         *name_idx = -1;
367         break;
368     case '[':
369         *is_attribute = 0;
370         if (_FieldNameIterator_item(self, name) == 0)
371             return 0;
372         *name_idx = get_integer(name);
373         if (*name_idx == -1 && PyErr_Occurred())
374             return 0;
375         break;
376     default:
377         /* Invalid character follows ']' */
378         PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
379                         "follow ']' in format field specifier");
380         return 0;
381     }
382 
383     /* empty string is an error */
384     if (name->ptr == name->end) {
385         PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
386         return 0;
387     }
388 
389     return 2;
390 }
391 
392 
393 /* input: field_name
394    output: 'first' points to the part before the first '[' or '.'
395            'first_idx' is -1 if 'first' is not an integer, otherwise
396                        it's the value of first converted to an integer
397            'rest' is an iterator to return the rest
398 */
399 static int
field_name_split(STRINGLIB_CHAR * ptr,Py_ssize_t len,SubString * first,Py_ssize_t * first_idx,FieldNameIterator * rest,AutoNumber * auto_number)400 field_name_split(STRINGLIB_CHAR *ptr, Py_ssize_t len, SubString *first,
401                  Py_ssize_t *first_idx, FieldNameIterator *rest,
402                  AutoNumber *auto_number)
403 {
404     STRINGLIB_CHAR c;
405     STRINGLIB_CHAR *p = ptr;
406     STRINGLIB_CHAR *end = ptr + len;
407     int field_name_is_empty;
408     int using_numeric_index;
409 
410     /* find the part up until the first '.' or '[' */
411     while (p < end) {
412         switch (c = *p++) {
413         case '[':
414         case '.':
415             /* backup so that we this character is available to the
416                "rest" iterator */
417             p--;
418             break;
419         default:
420             continue;
421         }
422         break;
423     }
424 
425     /* set up the return values */
426     SubString_init(first, ptr, p - ptr);
427     FieldNameIterator_init(rest, p, end - p);
428 
429     /* see if "first" is an integer, in which case it's used as an index */
430     *first_idx = get_integer(first);
431     if (*first_idx == -1 && PyErr_Occurred())
432         return 0;
433 
434     field_name_is_empty = first->ptr >= first->end;
435 
436     /* If the field name is omitted or if we have a numeric index
437        specified, then we're doing numeric indexing into args. */
438     using_numeric_index = field_name_is_empty || *first_idx != -1;
439 
440     /* We always get here exactly one time for each field we're
441        processing. And we get here in field order (counting by left
442        braces). So this is the perfect place to handle automatic field
443        numbering if the field name is omitted. */
444 
445     /* Check if we need to do the auto-numbering. It's not needed if
446        we're called from string.Format routines, because it's handled
447        in that class by itself. */
448     if (auto_number) {
449         /* Initialize our auto numbering state if this is the first
450            time we're either auto-numbering or manually numbering. */
451         if (auto_number->an_state == ANS_INIT && using_numeric_index)
452             auto_number->an_state = field_name_is_empty ?
453                 ANS_AUTO : ANS_MANUAL;
454 
455         /* Make sure our state is consistent with what we're doing
456            this time through. Only check if we're using a numeric
457            index. */
458         if (using_numeric_index)
459             if (autonumber_state_error(auto_number->an_state,
460                                        field_name_is_empty))
461                 return 0;
462         /* Zero length field means we want to do auto-numbering of the
463            fields. */
464         if (field_name_is_empty)
465             *first_idx = (auto_number->an_field_number)++;
466     }
467 
468     return 1;
469 }
470 
471 
472 /*
473     get_field_object returns the object inside {}, before the
474     format_spec.  It handles getindex and getattr lookups and consumes
475     the entire input string.
476 */
477 static PyObject *
get_field_object(SubString * input,PyObject * args,PyObject * kwargs,AutoNumber * auto_number)478 get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
479                  AutoNumber *auto_number)
480 {
481     PyObject *obj = NULL;
482     int ok;
483     int is_attribute;
484     SubString name;
485     SubString first;
486     Py_ssize_t index;
487     FieldNameIterator rest;
488 
489     if (!field_name_split(input->ptr, input->end - input->ptr, &first,
490                           &index, &rest, auto_number)) {
491         goto error;
492     }
493 
494     if (index == -1) {
495         /* look up in kwargs */
496         PyObject *key = SubString_new_object(&first);
497         if (key == NULL)
498             goto error;
499         if ((kwargs == NULL) || (obj = PyDict_GetItem(kwargs, key)) == NULL) {
500             PyErr_SetObject(PyExc_KeyError, key);
501             Py_DECREF(key);
502             goto error;
503         }
504         Py_DECREF(key);
505         Py_INCREF(obj);
506     }
507     else {
508         /* look up in args */
509         obj = PySequence_GetItem(args, index);
510         if (obj == NULL)
511             goto error;
512     }
513 
514     /* iterate over the rest of the field_name */
515     while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
516                                         &name)) == 2) {
517         PyObject *tmp;
518 
519         if (is_attribute)
520             /* getattr lookup "." */
521             tmp = getattr(obj, &name);
522         else
523             /* getitem lookup "[]" */
524             if (index == -1)
525                 tmp = getitem_str(obj, &name);
526             else
527                 if (PySequence_Check(obj))
528                     tmp = getitem_sequence(obj, index);
529                 else
530                     /* not a sequence */
531                     tmp = getitem_idx(obj, index);
532         if (tmp == NULL)
533             goto error;
534 
535         /* assign to obj */
536         Py_DECREF(obj);
537         obj = tmp;
538     }
539     /* end of iterator, this is the non-error case */
540     if (ok == 1)
541         return obj;
542 error:
543     Py_XDECREF(obj);
544     return NULL;
545 }
546 
547 /************************************************************************/
548 /*****************  Field rendering functions  **************************/
549 /************************************************************************/
550 
551 /*
552     render_field() is the main function in this section.  It takes the
553     field object and field specification string generated by
554     get_field_and_spec, and renders the field into the output string.
555 
556     render_field calls fieldobj.__format__(format_spec) method, and
557     appends to the output.
558 */
559 static int
render_field(PyObject * fieldobj,SubString * format_spec,OutputString * output)560 render_field(PyObject *fieldobj, SubString *format_spec, OutputString *output)
561 {
562     int ok = 0;
563     PyObject *result = NULL;
564     PyObject *format_spec_object = NULL;
565     PyObject *(*formatter)(PyObject *, STRINGLIB_CHAR *, Py_ssize_t) = NULL;
566     STRINGLIB_CHAR* format_spec_start = format_spec->ptr ?
567             format_spec->ptr : NULL;
568     Py_ssize_t format_spec_len = format_spec->ptr ?
569             format_spec->end - format_spec->ptr : 0;
570 
571     /* If we know the type exactly, skip the lookup of __format__ and just
572        call the formatter directly. */
573 #if STRINGLIB_IS_UNICODE
574     if (PyUnicode_CheckExact(fieldobj))
575         formatter = _PyUnicode_FormatAdvanced;
576     /* Unfortunately, there's a problem with checking for int, long,
577        and float here.  If we're being included as unicode, their
578        formatters expect string format_spec args.  For now, just skip
579        this optimization for unicode.  This could be fixed, but it's a
580        hassle. */
581 #else
582     if (PyString_CheckExact(fieldobj))
583         formatter = _PyBytes_FormatAdvanced;
584     else if (PyInt_CheckExact(fieldobj))
585         formatter =_PyInt_FormatAdvanced;
586     else if (PyLong_CheckExact(fieldobj))
587         formatter =_PyLong_FormatAdvanced;
588     else if (PyFloat_CheckExact(fieldobj))
589         formatter = _PyFloat_FormatAdvanced;
590 #endif
591 
592     if (formatter) {
593         /* we know exactly which formatter will be called when __format__ is
594            looked up, so call it directly, instead. */
595         result = formatter(fieldobj, format_spec_start, format_spec_len);
596     }
597     else {
598         /* We need to create an object out of the pointers we have, because
599            __format__ takes a string/unicode object for format_spec. */
600         format_spec_object = STRINGLIB_NEW(format_spec_start,
601                                            format_spec_len);
602         if (format_spec_object == NULL)
603             goto done;
604 
605         result = PyObject_Format(fieldobj, format_spec_object);
606     }
607     if (result == NULL)
608         goto done;
609 
610 #if PY_VERSION_HEX >= 0x03000000
611     assert(PyUnicode_Check(result));
612 #else
613     assert(PyString_Check(result) || PyUnicode_Check(result));
614 
615     /* Convert result to our type.  We could be str, and result could
616        be unicode */
617     {
618         PyObject *tmp = STRINGLIB_TOSTR(result);
619         if (tmp == NULL)
620             goto done;
621         Py_DECREF(result);
622         result = tmp;
623     }
624 #endif
625 
626     ok = output_data(output,
627                      STRINGLIB_STR(result), STRINGLIB_LEN(result));
628 done:
629     Py_XDECREF(format_spec_object);
630     Py_XDECREF(result);
631     return ok;
632 }
633 
634 static int
parse_field(SubString * str,SubString * field_name,SubString * format_spec,STRINGLIB_CHAR * conversion)635 parse_field(SubString *str, SubString *field_name, SubString *format_spec,
636             STRINGLIB_CHAR *conversion)
637 {
638     /* Note this function works if the field name is zero length,
639        which is good.  Zero length field names are handled later, in
640        field_name_split. */
641 
642     STRINGLIB_CHAR c = 0;
643 
644     /* initialize these, as they may be empty */
645     *conversion = '\0';
646     SubString_init(format_spec, NULL, 0);
647 
648     /* Search for the field name.  it's terminated by the end of
649        the string, or a ':' or '!' */
650     field_name->ptr = str->ptr;
651     while (str->ptr < str->end) {
652         switch (c = *(str->ptr++)) {
653         case ':':
654         case '!':
655             break;
656         default:
657             continue;
658         }
659         break;
660     }
661 
662     if (c == '!' || c == ':') {
663         /* we have a format specifier and/or a conversion */
664         /* don't include the last character */
665         field_name->end = str->ptr-1;
666 
667         /* the format specifier is the rest of the string */
668         format_spec->ptr = str->ptr;
669         format_spec->end = str->end;
670 
671         /* see if there's a conversion specifier */
672         if (c == '!') {
673             /* there must be another character present */
674             if (format_spec->ptr >= format_spec->end) {
675                 PyErr_SetString(PyExc_ValueError,
676                                 "end of format while looking for conversion "
677                                 "specifier");
678                 return 0;
679             }
680             *conversion = *(format_spec->ptr++);
681 
682             /* if there is another character, it must be a colon */
683             if (format_spec->ptr < format_spec->end) {
684                 c = *(format_spec->ptr++);
685                 if (c != ':') {
686                     PyErr_SetString(PyExc_ValueError,
687                                     "expected ':' after format specifier");
688                     return 0;
689                 }
690             }
691         }
692     }
693     else
694         /* end of string, there's no format_spec or conversion */
695         field_name->end = str->ptr;
696 
697     return 1;
698 }
699 
700 /************************************************************************/
701 /******* Output string allocation and escape-to-markup processing  ******/
702 /************************************************************************/
703 
704 /* MarkupIterator breaks the string into pieces of either literal
705    text, or things inside {} that need to be marked up.  it is
706    designed to make it easy to wrap a Python iterator around it, for
707    use with the Formatter class */
708 
709 typedef struct {
710     SubString str;
711 } MarkupIterator;
712 
713 static int
MarkupIterator_init(MarkupIterator * self,STRINGLIB_CHAR * ptr,Py_ssize_t len)714 MarkupIterator_init(MarkupIterator *self, STRINGLIB_CHAR *ptr, Py_ssize_t len)
715 {
716     SubString_init(&self->str, ptr, len);
717     return 1;
718 }
719 
720 /* returns 0 on error, 1 on non-error termination, and 2 if it got a
721    string (or something to be expanded) */
722 static int
MarkupIterator_next(MarkupIterator * self,SubString * literal,int * field_present,SubString * field_name,SubString * format_spec,STRINGLIB_CHAR * conversion,int * format_spec_needs_expanding)723 MarkupIterator_next(MarkupIterator *self, SubString *literal,
724                     int *field_present, SubString *field_name,
725                     SubString *format_spec, STRINGLIB_CHAR *conversion,
726                     int *format_spec_needs_expanding)
727 {
728     int at_end;
729     STRINGLIB_CHAR c = 0;
730     STRINGLIB_CHAR *start;
731     int count;
732     Py_ssize_t len;
733     int markup_follows = 0;
734 
735     /* initialize all of the output variables */
736     SubString_init(literal, NULL, 0);
737     SubString_init(field_name, NULL, 0);
738     SubString_init(format_spec, NULL, 0);
739     *conversion = '\0';
740     *format_spec_needs_expanding = 0;
741     *field_present = 0;
742 
743     /* No more input, end of iterator.  This is the normal exit
744        path. */
745     if (self->str.ptr >= self->str.end)
746         return 1;
747 
748     start = self->str.ptr;
749 
750     /* First read any literal text. Read until the end of string, an
751        escaped '{' or '}', or an unescaped '{'.  In order to never
752        allocate memory and so I can just pass pointers around, if
753        there's an escaped '{' or '}' then we'll return the literal
754        including the brace, but no format object.  The next time
755        through, we'll return the rest of the literal, skipping past
756        the second consecutive brace. */
757     while (self->str.ptr < self->str.end) {
758         switch (c = *(self->str.ptr++)) {
759         case '{':
760         case '}':
761             markup_follows = 1;
762             break;
763         default:
764             continue;
765         }
766         break;
767     }
768 
769     at_end = self->str.ptr >= self->str.end;
770     len = self->str.ptr - start;
771 
772     if ((c == '}') && (at_end || (c != *self->str.ptr))) {
773         PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
774                         "in format string");
775         return 0;
776     }
777     if (at_end && c == '{') {
778         PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
779                         "in format string");
780         return 0;
781     }
782     if (!at_end) {
783         if (c == *self->str.ptr) {
784             /* escaped } or {, skip it in the input.  there is no
785                markup object following us, just this literal text */
786             self->str.ptr++;
787             markup_follows = 0;
788         }
789         else
790             len--;
791     }
792 
793     /* record the literal text */
794     literal->ptr = start;
795     literal->end = start + len;
796 
797     if (!markup_follows)
798         return 2;
799 
800     /* this is markup, find the end of the string by counting nested
801        braces.  note that this prohibits escaped braces, so that
802        format_specs cannot have braces in them. */
803     *field_present = 1;
804     count = 1;
805 
806     start = self->str.ptr;
807 
808     /* we know we can't have a zero length string, so don't worry
809        about that case */
810     while (self->str.ptr < self->str.end) {
811         switch (c = *(self->str.ptr++)) {
812         case '{':
813             /* the format spec needs to be recursively expanded.
814                this is an optimization, and not strictly needed */
815             *format_spec_needs_expanding = 1;
816             count++;
817             break;
818         case '}':
819             count--;
820             if (count <= 0) {
821                 /* we're done.  parse and get out */
822                 SubString s;
823 
824                 SubString_init(&s, start, self->str.ptr - 1 - start);
825                 if (parse_field(&s, field_name, format_spec, conversion) == 0)
826                     return 0;
827 
828                 /* success */
829                 return 2;
830             }
831             break;
832         }
833     }
834 
835     /* end of string while searching for matching '}' */
836     PyErr_SetString(PyExc_ValueError, "unmatched '{' in format");
837     return 0;
838 }
839 
840 
841 /* do the !r or !s conversion on obj */
842 static PyObject *
do_conversion(PyObject * obj,STRINGLIB_CHAR conversion)843 do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
844 {
845     /* XXX in pre-3.0, do we need to convert this to unicode, since it
846        might have returned a string? */
847     switch (conversion) {
848     case 'r':
849         return PyObject_Repr(obj);
850     case 's':
851         return STRINGLIB_TOSTR(obj);
852     default:
853         if (conversion > 32 && conversion < 127) {
854                 /* It's the ASCII subrange; casting to char is safe
855                    (assuming the execution character set is an ASCII
856                    superset). */
857                 PyErr_Format(PyExc_ValueError,
858                      "Unknown conversion specifier %c",
859                      (char)conversion);
860         } else
861                 PyErr_Format(PyExc_ValueError,
862                      "Unknown conversion specifier \\x%x",
863                      (unsigned int)conversion);
864         return NULL;
865     }
866 }
867 
868 /* given:
869 
870    {field_name!conversion:format_spec}
871 
872    compute the result and write it to output.
873    format_spec_needs_expanding is an optimization.  if it's false,
874    just output the string directly, otherwise recursively expand the
875    format_spec string.
876 
877    field_name is allowed to be zero length, in which case we
878    are doing auto field numbering.
879 */
880 
881 static int
output_markup(SubString * field_name,SubString * format_spec,int format_spec_needs_expanding,STRINGLIB_CHAR conversion,OutputString * output,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)882 output_markup(SubString *field_name, SubString *format_spec,
883               int format_spec_needs_expanding, STRINGLIB_CHAR conversion,
884               OutputString *output, PyObject *args, PyObject *kwargs,
885               int recursion_depth, AutoNumber *auto_number)
886 {
887     PyObject *tmp = NULL;
888     PyObject *fieldobj = NULL;
889     SubString expanded_format_spec;
890     SubString *actual_format_spec;
891     int result = 0;
892 
893     /* convert field_name to an object */
894     fieldobj = get_field_object(field_name, args, kwargs, auto_number);
895     if (fieldobj == NULL)
896         goto done;
897 
898     if (conversion != '\0') {
899         tmp = do_conversion(fieldobj, conversion);
900         if (tmp == NULL)
901             goto done;
902 
903         /* do the assignment, transferring ownership: fieldobj = tmp */
904         Py_DECREF(fieldobj);
905         fieldobj = tmp;
906         tmp = NULL;
907     }
908 
909     /* if needed, recurively compute the format_spec */
910     if (format_spec_needs_expanding) {
911         tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
912                            auto_number);
913         if (tmp == NULL)
914             goto done;
915 
916         /* note that in the case we're expanding the format string,
917            tmp must be kept around until after the call to
918            render_field. */
919         SubString_init(&expanded_format_spec,
920                        STRINGLIB_STR(tmp), STRINGLIB_LEN(tmp));
921         actual_format_spec = &expanded_format_spec;
922     }
923     else
924         actual_format_spec = format_spec;
925 
926     if (render_field(fieldobj, actual_format_spec, output) == 0)
927         goto done;
928 
929     result = 1;
930 
931 done:
932     Py_XDECREF(fieldobj);
933     Py_XDECREF(tmp);
934 
935     return result;
936 }
937 
938 /*
939     do_markup is the top-level loop for the format() method.  It
940     searches through the format string for escapes to markup codes, and
941     calls other functions to move non-markup text to the output,
942     and to perform the markup to the output.
943 */
944 static int
do_markup(SubString * input,PyObject * args,PyObject * kwargs,OutputString * output,int recursion_depth,AutoNumber * auto_number)945 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
946           OutputString *output, int recursion_depth, AutoNumber *auto_number)
947 {
948     MarkupIterator iter;
949     int format_spec_needs_expanding;
950     int result;
951     int field_present;
952     SubString literal;
953     SubString field_name;
954     SubString format_spec;
955     STRINGLIB_CHAR conversion;
956 
957     MarkupIterator_init(&iter, input->ptr, input->end - input->ptr);
958     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
959                                          &field_name, &format_spec,
960                                          &conversion,
961                                          &format_spec_needs_expanding)) == 2) {
962         if (!output_data(output, literal.ptr, literal.end - literal.ptr))
963             return 0;
964         if (field_present)
965             if (!output_markup(&field_name, &format_spec,
966                                format_spec_needs_expanding, conversion, output,
967                                args, kwargs, recursion_depth, auto_number))
968                 return 0;
969     }
970     return result;
971 }
972 
973 
974 /*
975     build_string allocates the output string and then
976     calls do_markup to do the heavy lifting.
977 */
978 static PyObject *
build_string(SubString * input,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)979 build_string(SubString *input, PyObject *args, PyObject *kwargs,
980              int recursion_depth, AutoNumber *auto_number)
981 {
982     OutputString output;
983     PyObject *result = NULL;
984     Py_ssize_t count;
985 
986     output.obj = NULL; /* needed so cleanup code always works */
987 
988     /* check the recursion level */
989     if (recursion_depth <= 0) {
990         PyErr_SetString(PyExc_ValueError,
991                         "Max string recursion exceeded");
992         goto done;
993     }
994 
995     /* initial size is the length of the format string, plus the size
996        increment.  seems like a reasonable default */
997     if (!output_initialize(&output,
998                            input->end - input->ptr +
999                            INITIAL_SIZE_INCREMENT))
1000         goto done;
1001 
1002     if (!do_markup(input, args, kwargs, &output, recursion_depth,
1003                    auto_number)) {
1004         goto done;
1005     }
1006 
1007     count = output.ptr - STRINGLIB_STR(output.obj);
1008     if (STRINGLIB_RESIZE(&output.obj, count) < 0) {
1009         goto done;
1010     }
1011 
1012     /* transfer ownership to result */
1013     result = output.obj;
1014     output.obj = NULL;
1015 
1016 done:
1017     Py_XDECREF(output.obj);
1018     return result;
1019 }
1020 
1021 /************************************************************************/
1022 /*********** main routine ***********************************************/
1023 /************************************************************************/
1024 
1025 /* this is the main entry point */
1026 static PyObject *
do_string_format(PyObject * self,PyObject * args,PyObject * kwargs)1027 do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
1028 {
1029     SubString input;
1030 
1031     /* PEP 3101 says only 2 levels, so that
1032        "{0:{1}}".format('abc', 's')            # works
1033        "{0:{1:{2}}}".format('abc', 's', '')    # fails
1034     */
1035     int recursion_depth = 2;
1036 
1037     AutoNumber auto_number;
1038 
1039     AutoNumber_Init(&auto_number);
1040     SubString_init(&input, STRINGLIB_STR(self), STRINGLIB_LEN(self));
1041     return build_string(&input, args, kwargs, recursion_depth, &auto_number);
1042 }
1043 
1044 
1045 
1046 /************************************************************************/
1047 /*********** formatteriterator ******************************************/
1048 /************************************************************************/
1049 
/* This is used to implement string.Formatter.vparse().  It exists so
   Formatter can share code with the built in unicode.format() method.
   It's really just a wrapper around MarkupIterator that is callable
   from Python.

   Instances are created by formatter_parser() and advanced by
   formatteriter_next(). */

typedef struct {
    PyObject_HEAD

    /* The string being parsed.  formatter_parser() stores a new
       reference here and formatteriter_dealloc() releases it; the
       SubStrings produced while iterating point into this object. */
    STRINGLIB_OBJECT *str;

    /* Parsing state, initialized over str's character buffer. */
    MarkupIterator it_markup;
} formatteriterobject;
1062 
1063 static void
formatteriter_dealloc(formatteriterobject * it)1064 formatteriter_dealloc(formatteriterobject *it)
1065 {
1066     Py_XDECREF(it->str);
1067     PyObject_FREE(it);
1068 }
1069 
1070 /* returns a tuple:
1071    (literal, field_name, format_spec, conversion)
1072 
1073    literal is any literal text to output.  might be zero length
1074    field_name is the string before the ':'.  might be None
1075    format_spec is the string after the ':'.  mibht be None
1076    conversion is either None, or the string after the '!'
1077 */
1078 static PyObject *
formatteriter_next(formatteriterobject * it)1079 formatteriter_next(formatteriterobject *it)
1080 {
1081     SubString literal;
1082     SubString field_name;
1083     SubString format_spec;
1084     STRINGLIB_CHAR conversion;
1085     int format_spec_needs_expanding;
1086     int field_present;
1087     int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1088                                      &field_name, &format_spec, &conversion,
1089                                      &format_spec_needs_expanding);
1090 
1091     /* all of the SubString objects point into it->str, so no
1092        memory management needs to be done on them */
1093     assert(0 <= result && result <= 2);
1094     if (result == 0 || result == 1)
1095         /* if 0, error has already been set, if 1, iterator is empty */
1096         return NULL;
1097     else {
1098         PyObject *literal_str = NULL;
1099         PyObject *field_name_str = NULL;
1100         PyObject *format_spec_str = NULL;
1101         PyObject *conversion_str = NULL;
1102         PyObject *tuple = NULL;
1103 
1104         literal_str = SubString_new_object(&literal);
1105         if (literal_str == NULL)
1106             goto done;
1107 
1108         field_name_str = SubString_new_object(&field_name);
1109         if (field_name_str == NULL)
1110             goto done;
1111 
1112         /* if field_name is non-zero length, return a string for
1113            format_spec (even if zero length), else return None */
1114         format_spec_str = (field_present ?
1115                            SubString_new_object_or_empty :
1116                            SubString_new_object)(&format_spec);
1117         if (format_spec_str == NULL)
1118             goto done;
1119 
1120         /* if the conversion is not specified, return a None,
1121            otherwise create a one length string with the conversion
1122            character */
1123         if (conversion == '\0') {
1124             conversion_str = Py_None;
1125             Py_INCREF(conversion_str);
1126         }
1127         else
1128             conversion_str = STRINGLIB_NEW(&conversion, 1);
1129         if (conversion_str == NULL)
1130             goto done;
1131 
1132         tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1133                              conversion_str);
1134     done:
1135         Py_XDECREF(literal_str);
1136         Py_XDECREF(field_name_str);
1137         Py_XDECREF(format_spec_str);
1138         Py_XDECREF(conversion_str);
1139         return tuple;
1140     }
1141 }
1142 
1143 static PyMethodDef formatteriter_methods[] = {
1144     {NULL,              NULL}           /* sentinel */
1145 };
1146 
/* Type object for the "formatteriterator" objects created by
   formatter_parser().  Only deallocation, attribute lookup, and the
   iterator protocol (tp_iter / tp_iternext) are filled in; every
   other listed slot is 0, and slots past the last initializer are
   implicitly zero as well. */
static PyTypeObject PyFormatterIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "formatteriterator",                /* tp_name */
    sizeof(formatteriterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)formatteriter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)formatteriter_next,   /* tp_iternext */
    formatteriter_methods,              /* tp_methods */
    0,                                  /* tp_members (slot following tp_methods) */
};
1179 
1180 /* unicode_formatter_parser is used to implement
1181    string.Formatter.vformat.  it parses a string and returns tuples
1182    describing the parsed elements.  It's a wrapper around
1183    stringlib/string_format.h's MarkupIterator */
1184 static PyObject *
formatter_parser(STRINGLIB_OBJECT * self)1185 formatter_parser(STRINGLIB_OBJECT *self)
1186 {
1187     formatteriterobject *it;
1188 
1189     it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1190     if (it == NULL)
1191         return NULL;
1192 
1193     /* take ownership, give the object to the iterator */
1194     Py_INCREF(self);
1195     it->str = self;
1196 
1197     /* initialize the contained MarkupIterator */
1198     MarkupIterator_init(&it->it_markup,
1199                         STRINGLIB_STR(self),
1200                         STRINGLIB_LEN(self));
1201 
1202     return (PyObject *)it;
1203 }
1204 
1205 
1206 /************************************************************************/
1207 /*********** fieldnameiterator ******************************************/
1208 /************************************************************************/
1209 
1210 
/* This is used to implement string.Formatter.vparse().  It parses the
   field name into attribute and item values.  It's a Python-callable
   wrapper around FieldNameIterator.

   Instances are created by formatter_field_name_split() and advanced
   by fieldnameiter_next(). */

typedef struct {
    PyObject_HEAD

    /* The field-name string being parsed.  A reference is held only to
       keep it alive while the iterator exists; it is released in
       fieldnameiter_dealloc(). */
    STRINGLIB_OBJECT *str;

    /* Parsing state, filled in by field_name_split(). */
    FieldNameIterator it_field;
} fieldnameiterobject;
1222 
1223 static void
fieldnameiter_dealloc(fieldnameiterobject * it)1224 fieldnameiter_dealloc(fieldnameiterobject *it)
1225 {
1226     Py_XDECREF(it->str);
1227     PyObject_FREE(it);
1228 }
1229 
1230 /* returns a tuple:
1231    (is_attr, value)
1232    is_attr is true if we used attribute syntax (e.g., '.foo')
1233               false if we used index syntax (e.g., '[foo]')
1234    value is an integer or string
1235 */
1236 static PyObject *
fieldnameiter_next(fieldnameiterobject * it)1237 fieldnameiter_next(fieldnameiterobject *it)
1238 {
1239     int result;
1240     int is_attr;
1241     Py_ssize_t idx;
1242     SubString name;
1243 
1244     result = FieldNameIterator_next(&it->it_field, &is_attr,
1245                                     &idx, &name);
1246     if (result == 0 || result == 1)
1247         /* if 0, error has already been set, if 1, iterator is empty */
1248         return NULL;
1249     else {
1250         PyObject* result = NULL;
1251         PyObject* is_attr_obj = NULL;
1252         PyObject* obj = NULL;
1253 
1254         is_attr_obj = PyBool_FromLong(is_attr);
1255         if (is_attr_obj == NULL)
1256             goto done;
1257 
1258         /* either an integer or a string */
1259         if (idx != -1)
1260             obj = PyLong_FromSsize_t(idx);
1261         else
1262             obj = SubString_new_object(&name);
1263         if (obj == NULL)
1264             goto done;
1265 
1266         /* return a tuple of values */
1267         result = PyTuple_Pack(2, is_attr_obj, obj);
1268 
1269     done:
1270         Py_XDECREF(is_attr_obj);
1271         Py_XDECREF(obj);
1272         return result;
1273     }
1274 }
1275 
1276 static PyMethodDef fieldnameiter_methods[] = {
1277     {NULL,              NULL}           /* sentinel */
1278 };
1279 
/* Type object for the "fieldnameiterator" objects created by
   formatter_field_name_split().  Only deallocation, attribute lookup,
   and the iterator protocol (tp_iter / tp_iternext) are filled in;
   every other listed slot is 0, and slots past the last initializer
   are implicitly zero as well. */
static PyTypeObject PyFieldNameIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "fieldnameiterator",                /* tp_name */
    sizeof(fieldnameiterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
    fieldnameiter_methods,              /* tp_methods */
    0};                                 /* tp_members (slot following tp_methods) */
1311 
1312 /* unicode_formatter_field_name_split is used to implement
1313    string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1314    returns a tuple of (first, rest): "first", the part before the
1315    first '.' or '['; and "rest", an iterator for the rest of the field
1316    name.  it's a wrapper around stringlib/string_format.h's
1317    field_name_split.  The iterator it returns is a
1318    FieldNameIterator */
1319 static PyObject *
formatter_field_name_split(STRINGLIB_OBJECT * self)1320 formatter_field_name_split(STRINGLIB_OBJECT *self)
1321 {
1322     SubString first;
1323     Py_ssize_t first_idx;
1324     fieldnameiterobject *it;
1325 
1326     PyObject *first_obj = NULL;
1327     PyObject *result = NULL;
1328 
1329     it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1330     if (it == NULL)
1331         return NULL;
1332 
1333     /* take ownership, give the object to the iterator.  this is
1334        just to keep the field_name alive */
1335     Py_INCREF(self);
1336     it->str = self;
1337 
1338     /* Pass in auto_number = NULL. We'll return an empty string for
1339        first_obj in that case. */
1340     if (!field_name_split(STRINGLIB_STR(self),
1341                           STRINGLIB_LEN(self),
1342                           &first, &first_idx, &it->it_field, NULL))
1343         goto done;
1344 
1345     /* first becomes an integer, if possible; else a string */
1346     if (first_idx != -1)
1347         first_obj = PyLong_FromSsize_t(first_idx);
1348     else
1349         /* convert "first" into a string object */
1350         first_obj = SubString_new_object(&first);
1351     if (first_obj == NULL)
1352         goto done;
1353 
1354     /* return a tuple of values */
1355     result = PyTuple_Pack(2, first_obj, it);
1356 
1357 done:
1358     Py_XDECREF(it);
1359     Py_XDECREF(first_obj);
1360     return result;
1361 }
1362