1 /*
2     unicode_format.h -- implementation of str.format().
3 */
4 
5 /************************************************************************/
6 /***********   Global data structures and forward declarations  *********/
7 /************************************************************************/
8 
9 /*
10    A SubString consists of the characters between two string or
11    unicode pointers.
12 */
13 typedef struct {
14     PyObject *str; /* borrowed reference */
15     Py_ssize_t start, end;
16 } SubString;
17 
18 
/* How the positional fields seen so far have been numbered. */
typedef enum {
    ANS_INIT = 0,   /* no positional field has been seen yet */
    ANS_AUTO,       /* fields are numbered automatically, as in "{}" */
    ANS_MANUAL      /* fields are numbered explicitly, as in "{0}" */
} AutoNumberState;

/* Auto-numbering bookkeeping: the numbering mode in effect, and the
   index that the next automatically numbered field will receive. */
typedef struct {
    AutoNumberState an_state;
    int an_field_number;
} AutoNumber;
30 
31 
32 /* forward declaration for recursion */
33 static PyObject *
34 build_string(SubString *input, PyObject *args, PyObject *kwargs,
35              int recursion_depth, AutoNumber *auto_number);
36 
37 
38 
39 /************************************************************************/
40 /**************************  Utility  functions  ************************/
41 /************************************************************************/
42 
43 static void
AutoNumber_Init(AutoNumber * auto_number)44 AutoNumber_Init(AutoNumber *auto_number)
45 {
46     auto_number->an_state = ANS_INIT;
47     auto_number->an_field_number = 0;
48 }
49 
50 /* fill in a SubString from a pointer and length */
51 Py_LOCAL_INLINE(void)
SubString_init(SubString * str,PyObject * s,Py_ssize_t start,Py_ssize_t end)52 SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
53 {
54     str->str = s;
55     str->start = start;
56     str->end = end;
57 }
58 
59 /* return a new string.  if str->str is NULL, return None */
60 Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString * str)61 SubString_new_object(SubString *str)
62 {
63     if (str->str == NULL)
64         Py_RETURN_NONE;
65     return PyUnicode_Substring(str->str, str->start, str->end);
66 }
67 
68 /* return a new string.  if str->str is NULL, return a new empty string */
69 Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString * str)70 SubString_new_object_or_empty(SubString *str)
71 {
72     if (str->str == NULL) {
73         return PyUnicode_New(0, 0);
74     }
75     return SubString_new_object(str);
76 }
77 
78 /* Return 1 if an error has been detected switching between automatic
79    field numbering and manual field specification, else return 0. Set
80    ValueError on error. */
81 static int
autonumber_state_error(AutoNumberState state,int field_name_is_empty)82 autonumber_state_error(AutoNumberState state, int field_name_is_empty)
83 {
84     if (state == ANS_MANUAL) {
85         if (field_name_is_empty) {
86             PyErr_SetString(PyExc_ValueError, "cannot switch from "
87                             "manual field specification to "
88                             "automatic field numbering");
89             return 1;
90         }
91     }
92     else {
93         if (!field_name_is_empty) {
94             PyErr_SetString(PyExc_ValueError, "cannot switch from "
95                             "automatic field numbering to "
96                             "manual field specification");
97             return 1;
98         }
99     }
100     return 0;
101 }
102 
103 
104 /************************************************************************/
105 /***********  Format string parsing -- integers and identifiers *********/
106 /************************************************************************/
107 
108 static Py_ssize_t
get_integer(const SubString * str)109 get_integer(const SubString *str)
110 {
111     Py_ssize_t accumulator = 0;
112     Py_ssize_t digitval;
113     Py_ssize_t i;
114 
115     /* empty string is an error */
116     if (str->start >= str->end)
117         return -1;
118 
119     for (i = str->start; i < str->end; i++) {
120         digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
121         if (digitval < 0)
122             return -1;
123         /*
124            Detect possible overflow before it happens:
125 
126               accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
127               accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
128         */
129         if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
130             PyErr_Format(PyExc_ValueError,
131                          "Too many decimal digits in format string");
132             return -1;
133         }
134         accumulator = accumulator * 10 + digitval;
135     }
136     return accumulator;
137 }
138 
139 /************************************************************************/
140 /******** Functions to get field objects and specification strings ******/
141 /************************************************************************/
142 
143 /* do the equivalent of obj.name */
144 static PyObject *
getattr(PyObject * obj,SubString * name)145 getattr(PyObject *obj, SubString *name)
146 {
147     PyObject *newobj;
148     PyObject *str = SubString_new_object(name);
149     if (str == NULL)
150         return NULL;
151     newobj = PyObject_GetAttr(obj, str);
152     Py_DECREF(str);
153     return newobj;
154 }
155 
156 /* do the equivalent of obj[idx], where obj is a sequence */
157 static PyObject *
getitem_sequence(PyObject * obj,Py_ssize_t idx)158 getitem_sequence(PyObject *obj, Py_ssize_t idx)
159 {
160     return PySequence_GetItem(obj, idx);
161 }
162 
163 /* do the equivalent of obj[idx], where obj is not a sequence */
164 static PyObject *
getitem_idx(PyObject * obj,Py_ssize_t idx)165 getitem_idx(PyObject *obj, Py_ssize_t idx)
166 {
167     PyObject *newobj;
168     PyObject *idx_obj = PyLong_FromSsize_t(idx);
169     if (idx_obj == NULL)
170         return NULL;
171     newobj = PyObject_GetItem(obj, idx_obj);
172     Py_DECREF(idx_obj);
173     return newobj;
174 }
175 
176 /* do the equivalent of obj[name] */
177 static PyObject *
getitem_str(PyObject * obj,SubString * name)178 getitem_str(PyObject *obj, SubString *name)
179 {
180     PyObject *newobj;
181     PyObject *str = SubString_new_object(name);
182     if (str == NULL)
183         return NULL;
184     newobj = PyObject_GetItem(obj, str);
185     Py_DECREF(str);
186     return newobj;
187 }
188 
189 typedef struct {
190     /* the entire string we're parsing.  we assume that someone else
191        is managing its lifetime, and that it will exist for the
192        lifetime of the iterator.  can be empty */
193     SubString str;
194 
195     /* index to where we are inside field_name */
196     Py_ssize_t index;
197 } FieldNameIterator;
198 
199 
200 static int
FieldNameIterator_init(FieldNameIterator * self,PyObject * s,Py_ssize_t start,Py_ssize_t end)201 FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
202                        Py_ssize_t start, Py_ssize_t end)
203 {
204     SubString_init(&self->str, s, start, end);
205     self->index = start;
206     return 1;
207 }
208 
209 static int
_FieldNameIterator_attr(FieldNameIterator * self,SubString * name)210 _FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
211 {
212     Py_UCS4 c;
213 
214     name->str = self->str.str;
215     name->start = self->index;
216 
217     /* return everything until '.' or '[' */
218     while (self->index < self->str.end) {
219         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
220         switch (c) {
221         case '[':
222         case '.':
223             /* backup so that we this character will be seen next time */
224             self->index--;
225             break;
226         default:
227             continue;
228         }
229         break;
230     }
231     /* end of string is okay */
232     name->end = self->index;
233     return 1;
234 }
235 
236 static int
_FieldNameIterator_item(FieldNameIterator * self,SubString * name)237 _FieldNameIterator_item(FieldNameIterator *self, SubString *name)
238 {
239     int bracket_seen = 0;
240     Py_UCS4 c;
241 
242     name->str = self->str.str;
243     name->start = self->index;
244 
245     /* return everything until ']' */
246     while (self->index < self->str.end) {
247         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
248         switch (c) {
249         case ']':
250             bracket_seen = 1;
251             break;
252         default:
253             continue;
254         }
255         break;
256     }
257     /* make sure we ended with a ']' */
258     if (!bracket_seen) {
259         PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
260         return 0;
261     }
262 
263     /* end of string is okay */
264     /* don't include the ']' */
265     name->end = self->index-1;
266     return 1;
267 }
268 
269 /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
270 static int
FieldNameIterator_next(FieldNameIterator * self,int * is_attribute,Py_ssize_t * name_idx,SubString * name)271 FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
272                        Py_ssize_t *name_idx, SubString *name)
273 {
274     /* check at end of input */
275     if (self->index >= self->str.end)
276         return 1;
277 
278     switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
279     case '.':
280         *is_attribute = 1;
281         if (_FieldNameIterator_attr(self, name) == 0)
282             return 0;
283         *name_idx = -1;
284         break;
285     case '[':
286         *is_attribute = 0;
287         if (_FieldNameIterator_item(self, name) == 0)
288             return 0;
289         *name_idx = get_integer(name);
290         if (*name_idx == -1 && PyErr_Occurred())
291             return 0;
292         break;
293     default:
294         /* Invalid character follows ']' */
295         PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
296                         "follow ']' in format field specifier");
297         return 0;
298     }
299 
300     /* empty string is an error */
301     if (name->start == name->end) {
302         PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
303         return 0;
304     }
305 
306     return 2;
307 }
308 
309 
310 /* input: field_name
311    output: 'first' points to the part before the first '[' or '.'
312            'first_idx' is -1 if 'first' is not an integer, otherwise
313                        it's the value of first converted to an integer
314            'rest' is an iterator to return the rest
315 */
316 static int
field_name_split(PyObject * str,Py_ssize_t start,Py_ssize_t end,SubString * first,Py_ssize_t * first_idx,FieldNameIterator * rest,AutoNumber * auto_number)317 field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
318                  Py_ssize_t *first_idx, FieldNameIterator *rest,
319                  AutoNumber *auto_number)
320 {
321     Py_UCS4 c;
322     Py_ssize_t i = start;
323     int field_name_is_empty;
324     int using_numeric_index;
325 
326     /* find the part up until the first '.' or '[' */
327     while (i < end) {
328         switch (c = PyUnicode_READ_CHAR(str, i++)) {
329         case '[':
330         case '.':
331             /* backup so that we this character is available to the
332                "rest" iterator */
333             i--;
334             break;
335         default:
336             continue;
337         }
338         break;
339     }
340 
341     /* set up the return values */
342     SubString_init(first, str, start, i);
343     FieldNameIterator_init(rest, str, i, end);
344 
345     /* see if "first" is an integer, in which case it's used as an index */
346     *first_idx = get_integer(first);
347     if (*first_idx == -1 && PyErr_Occurred())
348         return 0;
349 
350     field_name_is_empty = first->start >= first->end;
351 
352     /* If the field name is omitted or if we have a numeric index
353        specified, then we're doing numeric indexing into args. */
354     using_numeric_index = field_name_is_empty || *first_idx != -1;
355 
356     /* We always get here exactly one time for each field we're
357        processing. And we get here in field order (counting by left
358        braces). So this is the perfect place to handle automatic field
359        numbering if the field name is omitted. */
360 
361     /* Check if we need to do the auto-numbering. It's not needed if
362        we're called from string.Format routines, because it's handled
363        in that class by itself. */
364     if (auto_number) {
365         /* Initialize our auto numbering state if this is the first
366            time we're either auto-numbering or manually numbering. */
367         if (auto_number->an_state == ANS_INIT && using_numeric_index)
368             auto_number->an_state = field_name_is_empty ?
369                 ANS_AUTO : ANS_MANUAL;
370 
371         /* Make sure our state is consistent with what we're doing
372            this time through. Only check if we're using a numeric
373            index. */
374         if (using_numeric_index)
375             if (autonumber_state_error(auto_number->an_state,
376                                        field_name_is_empty))
377                 return 0;
378         /* Zero length field means we want to do auto-numbering of the
379            fields. */
380         if (field_name_is_empty)
381             *first_idx = (auto_number->an_field_number)++;
382     }
383 
384     return 1;
385 }
386 
387 
388 /*
389     get_field_object returns the object inside {}, before the
390     format_spec.  It handles getindex and getattr lookups and consumes
391     the entire input string.
392 */
393 static PyObject *
get_field_object(SubString * input,PyObject * args,PyObject * kwargs,AutoNumber * auto_number)394 get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
395                  AutoNumber *auto_number)
396 {
397     PyObject *obj = NULL;
398     int ok;
399     int is_attribute;
400     SubString name;
401     SubString first;
402     Py_ssize_t index;
403     FieldNameIterator rest;
404 
405     if (!field_name_split(input->str, input->start, input->end, &first,
406                           &index, &rest, auto_number)) {
407         goto error;
408     }
409 
410     if (index == -1) {
411         /* look up in kwargs */
412         PyObject *key = SubString_new_object(&first);
413         if (key == NULL) {
414             goto error;
415         }
416         if (kwargs == NULL) {
417             PyErr_SetObject(PyExc_KeyError, key);
418             Py_DECREF(key);
419             goto error;
420         }
421         /* Use PyObject_GetItem instead of PyDict_GetItem because this
422            code is no longer just used with kwargs. It might be passed
423            a non-dict when called through format_map. */
424         obj = PyObject_GetItem(kwargs, key);
425         Py_DECREF(key);
426         if (obj == NULL) {
427             goto error;
428         }
429     }
430     else {
431         /* If args is NULL, we have a format string with a positional field
432            with only kwargs to retrieve it from. This can only happen when
433            used with format_map(), where positional arguments are not
434            allowed. */
435         if (args == NULL) {
436             PyErr_SetString(PyExc_ValueError, "Format string contains "
437                             "positional fields");
438             goto error;
439         }
440 
441         /* look up in args */
442         obj = PySequence_GetItem(args, index);
443         if (obj == NULL)
444             goto error;
445     }
446 
447     /* iterate over the rest of the field_name */
448     while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
449                                         &name)) == 2) {
450         PyObject *tmp;
451 
452         if (is_attribute)
453             /* getattr lookup "." */
454             tmp = getattr(obj, &name);
455         else
456             /* getitem lookup "[]" */
457             if (index == -1)
458                 tmp = getitem_str(obj, &name);
459             else
460                 if (PySequence_Check(obj))
461                     tmp = getitem_sequence(obj, index);
462                 else
463                     /* not a sequence */
464                     tmp = getitem_idx(obj, index);
465         if (tmp == NULL)
466             goto error;
467 
468         /* assign to obj */
469         Py_DECREF(obj);
470         obj = tmp;
471     }
472     /* end of iterator, this is the non-error case */
473     if (ok == 1)
474         return obj;
475 error:
476     Py_XDECREF(obj);
477     return NULL;
478 }
479 
480 /************************************************************************/
481 /*****************  Field rendering functions  **************************/
482 /************************************************************************/
483 
484 /*
485     render_field() is the main function in this section.  It takes the
486     field object and field specification string generated by
487     get_field_and_spec, and renders the field into the output string.
488 
489     render_field calls fieldobj.__format__(format_spec) method, and
490     appends to the output.
491 */
492 static int
render_field(PyObject * fieldobj,SubString * format_spec,_PyUnicodeWriter * writer)493 render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
494 {
495     int ok = 0;
496     PyObject *result = NULL;
497     PyObject *format_spec_object = NULL;
498     int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
499     int err;
500 
501     /* If we know the type exactly, skip the lookup of __format__ and just
502        call the formatter directly. */
503     if (PyUnicode_CheckExact(fieldobj))
504         formatter = _PyUnicode_FormatAdvancedWriter;
505     else if (PyLong_CheckExact(fieldobj))
506         formatter = _PyLong_FormatAdvancedWriter;
507     else if (PyFloat_CheckExact(fieldobj))
508         formatter = _PyFloat_FormatAdvancedWriter;
509     else if (PyComplex_CheckExact(fieldobj))
510         formatter = _PyComplex_FormatAdvancedWriter;
511 
512     if (formatter) {
513         /* we know exactly which formatter will be called when __format__ is
514            looked up, so call it directly, instead. */
515         err = formatter(writer, fieldobj, format_spec->str,
516                         format_spec->start, format_spec->end);
517         return (err == 0);
518     }
519     else {
520         /* We need to create an object out of the pointers we have, because
521            __format__ takes a string/unicode object for format_spec. */
522         if (format_spec->str)
523             format_spec_object = PyUnicode_Substring(format_spec->str,
524                                                      format_spec->start,
525                                                      format_spec->end);
526         else
527             format_spec_object = PyUnicode_New(0, 0);
528         if (format_spec_object == NULL)
529             goto done;
530 
531         result = PyObject_Format(fieldobj, format_spec_object);
532     }
533     if (result == NULL)
534         goto done;
535 
536     if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
537         goto done;
538     ok = 1;
539 
540 done:
541     Py_XDECREF(format_spec_object);
542     Py_XDECREF(result);
543     return ok;
544 }
545 
546 static int
parse_field(SubString * str,SubString * field_name,SubString * format_spec,int * format_spec_needs_expanding,Py_UCS4 * conversion)547 parse_field(SubString *str, SubString *field_name, SubString *format_spec,
548             int *format_spec_needs_expanding, Py_UCS4 *conversion)
549 {
550     /* Note this function works if the field name is zero length,
551        which is good.  Zero length field names are handled later, in
552        field_name_split. */
553 
554     Py_UCS4 c = 0;
555 
556     /* initialize these, as they may be empty */
557     *conversion = '\0';
558     SubString_init(format_spec, NULL, 0, 0);
559 
560     /* Search for the field name.  it's terminated by the end of
561        the string, or a ':' or '!' */
562     field_name->str = str->str;
563     field_name->start = str->start;
564     while (str->start < str->end) {
565         switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
566         case '{':
567             PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
568             return 0;
569         case '[':
570             for (; str->start < str->end; str->start++)
571                 if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
572                     break;
573             continue;
574         case '}':
575         case ':':
576         case '!':
577             break;
578         default:
579             continue;
580         }
581         break;
582     }
583 
584     field_name->end = str->start - 1;
585     if (c == '!' || c == ':') {
586         Py_ssize_t count;
587         /* we have a format specifier and/or a conversion */
588         /* don't include the last character */
589 
590         /* see if there's a conversion specifier */
591         if (c == '!') {
592             /* there must be another character present */
593             if (str->start >= str->end) {
594                 PyErr_SetString(PyExc_ValueError,
595                                 "end of string while looking for conversion "
596                                 "specifier");
597                 return 0;
598             }
599             *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
600 
601             if (str->start < str->end) {
602                 c = PyUnicode_READ_CHAR(str->str, str->start++);
603                 if (c == '}')
604                     return 1;
605                 if (c != ':') {
606                     PyErr_SetString(PyExc_ValueError,
607                                     "expected ':' after conversion specifier");
608                     return 0;
609                 }
610             }
611         }
612         format_spec->str = str->str;
613         format_spec->start = str->start;
614         count = 1;
615         while (str->start < str->end) {
616             switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
617             case '{':
618                 *format_spec_needs_expanding = 1;
619                 count++;
620                 break;
621             case '}':
622                 count--;
623                 if (count == 0) {
624                     format_spec->end = str->start - 1;
625                     return 1;
626                 }
627                 break;
628             default:
629                 break;
630             }
631         }
632 
633         PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
634         return 0;
635     }
636     else if (c != '}') {
637         PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
638         return 0;
639     }
640 
641     return 1;
642 }
643 
644 /************************************************************************/
645 /******* Output string allocation and escape-to-markup processing  ******/
646 /************************************************************************/
647 
648 /* MarkupIterator breaks the string into pieces of either literal
649    text, or things inside {} that need to be marked up.  it is
650    designed to make it easy to wrap a Python iterator around it, for
651    use with the Formatter class */
652 
653 typedef struct {
654     SubString str;
655 } MarkupIterator;
656 
657 static int
MarkupIterator_init(MarkupIterator * self,PyObject * str,Py_ssize_t start,Py_ssize_t end)658 MarkupIterator_init(MarkupIterator *self, PyObject *str,
659                     Py_ssize_t start, Py_ssize_t end)
660 {
661     SubString_init(&self->str, str, start, end);
662     return 1;
663 }
664 
665 /* returns 0 on error, 1 on non-error termination, and 2 if it got a
666    string (or something to be expanded) */
667 static int
MarkupIterator_next(MarkupIterator * self,SubString * literal,int * field_present,SubString * field_name,SubString * format_spec,Py_UCS4 * conversion,int * format_spec_needs_expanding)668 MarkupIterator_next(MarkupIterator *self, SubString *literal,
669                     int *field_present, SubString *field_name,
670                     SubString *format_spec, Py_UCS4 *conversion,
671                     int *format_spec_needs_expanding)
672 {
673     int at_end;
674     Py_UCS4 c = 0;
675     Py_ssize_t start;
676     Py_ssize_t len;
677     int markup_follows = 0;
678 
679     /* initialize all of the output variables */
680     SubString_init(literal, NULL, 0, 0);
681     SubString_init(field_name, NULL, 0, 0);
682     SubString_init(format_spec, NULL, 0, 0);
683     *conversion = '\0';
684     *format_spec_needs_expanding = 0;
685     *field_present = 0;
686 
687     /* No more input, end of iterator.  This is the normal exit
688        path. */
689     if (self->str.start >= self->str.end)
690         return 1;
691 
692     start = self->str.start;
693 
694     /* First read any literal text. Read until the end of string, an
695        escaped '{' or '}', or an unescaped '{'.  In order to never
696        allocate memory and so I can just pass pointers around, if
697        there's an escaped '{' or '}' then we'll return the literal
698        including the brace, but no format object.  The next time
699        through, we'll return the rest of the literal, skipping past
700        the second consecutive brace. */
701     while (self->str.start < self->str.end) {
702         switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
703         case '{':
704         case '}':
705             markup_follows = 1;
706             break;
707         default:
708             continue;
709         }
710         break;
711     }
712 
713     at_end = self->str.start >= self->str.end;
714     len = self->str.start - start;
715 
716     if ((c == '}') && (at_end ||
717                        (c != PyUnicode_READ_CHAR(self->str.str,
718                                                  self->str.start)))) {
719         PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
720                         "in format string");
721         return 0;
722     }
723     if (at_end && c == '{') {
724         PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
725                         "in format string");
726         return 0;
727     }
728     if (!at_end) {
729         if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
730             /* escaped } or {, skip it in the input.  there is no
731                markup object following us, just this literal text */
732             self->str.start++;
733             markup_follows = 0;
734         }
735         else
736             len--;
737     }
738 
739     /* record the literal text */
740     literal->str = self->str.str;
741     literal->start = start;
742     literal->end = start + len;
743 
744     if (!markup_follows)
745         return 2;
746 
747     /* this is markup; parse the field */
748     *field_present = 1;
749     if (!parse_field(&self->str, field_name, format_spec,
750                      format_spec_needs_expanding, conversion))
751         return 0;
752     return 2;
753 }
754 
755 
756 /* do the !r or !s conversion on obj */
757 static PyObject *
do_conversion(PyObject * obj,Py_UCS4 conversion)758 do_conversion(PyObject *obj, Py_UCS4 conversion)
759 {
760     /* XXX in pre-3.0, do we need to convert this to unicode, since it
761        might have returned a string? */
762     switch (conversion) {
763     case 'r':
764         return PyObject_Repr(obj);
765     case 's':
766         return PyObject_Str(obj);
767     case 'a':
768         return PyObject_ASCII(obj);
769     default:
770         if (conversion > 32 && conversion < 127) {
771                 /* It's the ASCII subrange; casting to char is safe
772                    (assuming the execution character set is an ASCII
773                    superset). */
774                 PyErr_Format(PyExc_ValueError,
775                      "Unknown conversion specifier %c",
776                      (char)conversion);
777         } else
778                 PyErr_Format(PyExc_ValueError,
779                      "Unknown conversion specifier \\x%x",
780                      (unsigned int)conversion);
781         return NULL;
782     }
783 }
784 
785 /* given:
786 
787    {field_name!conversion:format_spec}
788 
789    compute the result and write it to output.
790    format_spec_needs_expanding is an optimization.  if it's false,
791    just output the string directly, otherwise recursively expand the
792    format_spec string.
793 
794    field_name is allowed to be zero length, in which case we
795    are doing auto field numbering.
796 */
797 
798 static int
output_markup(SubString * field_name,SubString * format_spec,int format_spec_needs_expanding,Py_UCS4 conversion,_PyUnicodeWriter * writer,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)799 output_markup(SubString *field_name, SubString *format_spec,
800               int format_spec_needs_expanding, Py_UCS4 conversion,
801               _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
802               int recursion_depth, AutoNumber *auto_number)
803 {
804     PyObject *tmp = NULL;
805     PyObject *fieldobj = NULL;
806     SubString expanded_format_spec;
807     SubString *actual_format_spec;
808     int result = 0;
809 
810     /* convert field_name to an object */
811     fieldobj = get_field_object(field_name, args, kwargs, auto_number);
812     if (fieldobj == NULL)
813         goto done;
814 
815     if (conversion != '\0') {
816         tmp = do_conversion(fieldobj, conversion);
817         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
818             goto done;
819 
820         /* do the assignment, transferring ownership: fieldobj = tmp */
821         Py_DECREF(fieldobj);
822         fieldobj = tmp;
823         tmp = NULL;
824     }
825 
826     /* if needed, recurively compute the format_spec */
827     if (format_spec_needs_expanding) {
828         tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
829                            auto_number);
830         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
831             goto done;
832 
833         /* note that in the case we're expanding the format string,
834            tmp must be kept around until after the call to
835            render_field. */
836         SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
837         actual_format_spec = &expanded_format_spec;
838     }
839     else
840         actual_format_spec = format_spec;
841 
842     if (render_field(fieldobj, actual_format_spec, writer) == 0)
843         goto done;
844 
845     result = 1;
846 
847 done:
848     Py_XDECREF(fieldobj);
849     Py_XDECREF(tmp);
850 
851     return result;
852 }
853 
854 /*
855     do_markup is the top-level loop for the format() method.  It
856     searches through the format string for escapes to markup codes, and
857     calls other functions to move non-markup text to the output,
858     and to perform the markup to the output.
859 */
860 static int
do_markup(SubString * input,PyObject * args,PyObject * kwargs,_PyUnicodeWriter * writer,int recursion_depth,AutoNumber * auto_number)861 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
862           _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
863 {
864     MarkupIterator iter;
865     int format_spec_needs_expanding;
866     int result;
867     int field_present;
868     SubString literal;
869     SubString field_name;
870     SubString format_spec;
871     Py_UCS4 conversion;
872 
873     MarkupIterator_init(&iter, input->str, input->start, input->end);
874     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
875                                          &field_name, &format_spec,
876                                          &conversion,
877                                          &format_spec_needs_expanding)) == 2) {
878         if (literal.end != literal.start) {
879             if (!field_present && iter.str.start == iter.str.end)
880                 writer->overallocate = 0;
881             if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
882                                                 literal.start, literal.end) < 0)
883                 return 0;
884         }
885 
886         if (field_present) {
887             if (iter.str.start == iter.str.end)
888                 writer->overallocate = 0;
889             if (!output_markup(&field_name, &format_spec,
890                                format_spec_needs_expanding, conversion, writer,
891                                args, kwargs, recursion_depth, auto_number))
892                 return 0;
893         }
894     }
895     return result;
896 }
897 
898 
899 /*
900     build_string allocates the output string and then
901     calls do_markup to do the heavy lifting.
902 */
903 static PyObject *
build_string(SubString * input,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)904 build_string(SubString *input, PyObject *args, PyObject *kwargs,
905              int recursion_depth, AutoNumber *auto_number)
906 {
907     _PyUnicodeWriter writer;
908 
909     /* check the recursion level */
910     if (recursion_depth <= 0) {
911         PyErr_SetString(PyExc_ValueError,
912                         "Max string recursion exceeded");
913         return NULL;
914     }
915 
916     _PyUnicodeWriter_Init(&writer);
917     writer.overallocate = 1;
918     writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
919 
920     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
921                    auto_number)) {
922         _PyUnicodeWriter_Dealloc(&writer);
923         return NULL;
924     }
925 
926     return _PyUnicodeWriter_Finish(&writer);
927 }
928 
929 /************************************************************************/
930 /*********** main routine ***********************************************/
931 /************************************************************************/
932 
933 /* this is the main entry point */
934 static PyObject *
do_string_format(PyObject * self,PyObject * args,PyObject * kwargs)935 do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
936 {
937     SubString input;
938 
939     /* PEP 3101 says only 2 levels, so that
940        "{0:{1}}".format('abc', 's')            # works
941        "{0:{1:{2}}}".format('abc', 's', '')    # fails
942     */
943     int recursion_depth = 2;
944 
945     AutoNumber auto_number;
946 
947     if (PyUnicode_READY(self) == -1)
948         return NULL;
949 
950     AutoNumber_Init(&auto_number);
951     SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
952     return build_string(&input, args, kwargs, recursion_depth, &auto_number);
953 }
954 
955 static PyObject *
do_string_format_map(PyObject * self,PyObject * obj)956 do_string_format_map(PyObject *self, PyObject *obj)
957 {
958     return do_string_format(self, NULL, obj);
959 }
960 
961 
962 /************************************************************************/
963 /*********** formatteriterator ******************************************/
964 /************************************************************************/
965 
/* This is used to implement string.Formatter.vparse().  It exists so
   Formatter can share code with the built in unicode.format() method.
   It's really just a wrapper around MarkupIterator that is callable
   from Python.

   NOTE(review): string.Formatter's documented parsing method is
   parse(); "vparse" looks like a stale name -- confirm and update. */

typedef struct {
    PyObject_HEAD
    /* the string being parsed; formatter_parser stores a new reference
       here and formatteriter_dealloc releases it */
    PyObject *str;
    /* parsing state; formatter_parser initializes it over all of str */
    MarkupIterator it_markup;
} formatteriterobject;
976 
977 static void
formatteriter_dealloc(formatteriterobject * it)978 formatteriter_dealloc(formatteriterobject *it)
979 {
980     Py_XDECREF(it->str);
981     PyObject_FREE(it);
982 }
983 
984 /* returns a tuple:
985    (literal, field_name, format_spec, conversion)
986 
987    literal is any literal text to output.  might be zero length
988    field_name is the string before the ':'.  might be None
989    format_spec is the string after the ':'.  mibht be None
990    conversion is either None, or the string after the '!'
991 */
992 static PyObject *
formatteriter_next(formatteriterobject * it)993 formatteriter_next(formatteriterobject *it)
994 {
995     SubString literal;
996     SubString field_name;
997     SubString format_spec;
998     Py_UCS4 conversion;
999     int format_spec_needs_expanding;
1000     int field_present;
1001     int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1002                                      &field_name, &format_spec, &conversion,
1003                                      &format_spec_needs_expanding);
1004 
1005     /* all of the SubString objects point into it->str, so no
1006        memory management needs to be done on them */
1007     assert(0 <= result && result <= 2);
1008     if (result == 0 || result == 1)
1009         /* if 0, error has already been set, if 1, iterator is empty */
1010         return NULL;
1011     else {
1012         PyObject *literal_str = NULL;
1013         PyObject *field_name_str = NULL;
1014         PyObject *format_spec_str = NULL;
1015         PyObject *conversion_str = NULL;
1016         PyObject *tuple = NULL;
1017 
1018         literal_str = SubString_new_object(&literal);
1019         if (literal_str == NULL)
1020             goto done;
1021 
1022         field_name_str = SubString_new_object(&field_name);
1023         if (field_name_str == NULL)
1024             goto done;
1025 
1026         /* if field_name is non-zero length, return a string for
1027            format_spec (even if zero length), else return None */
1028         format_spec_str = (field_present ?
1029                            SubString_new_object_or_empty :
1030                            SubString_new_object)(&format_spec);
1031         if (format_spec_str == NULL)
1032             goto done;
1033 
1034         /* if the conversion is not specified, return a None,
1035            otherwise create a one length string with the conversion
1036            character */
1037         if (conversion == '\0') {
1038             conversion_str = Py_None;
1039             Py_INCREF(conversion_str);
1040         }
1041         else
1042             conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1043                                                        &conversion, 1);
1044         if (conversion_str == NULL)
1045             goto done;
1046 
1047         tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1048                              conversion_str);
1049     done:
1050         Py_XDECREF(literal_str);
1051         Py_XDECREF(field_name_str);
1052         Py_XDECREF(format_spec_str);
1053         Py_XDECREF(conversion_str);
1054         return tuple;
1055     }
1056 }
1057 
1058 static PyMethodDef formatteriter_methods[] = {
1059     {NULL,              NULL}           /* sentinel */
1060 };
1061 
/* Type object for formatteriterobject.  Only the slots needed for a
   plain iterator are filled in: deallocation, generic attribute
   lookup, tp_iter (the object is its own iterator) and tp_iternext
   (formatteriter_next).  All other listed slots are 0. */
static PyTypeObject PyFormatterIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "formatteriterator",                /* tp_name */
    sizeof(formatteriterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)formatteriter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)formatteriter_next,   /* tp_iternext */
    formatteriter_methods,              /* tp_methods */
    0,                                  /* tp_members */
};
1094 
1095 /* unicode_formatter_parser is used to implement
1096    string.Formatter.vformat.  it parses a string and returns tuples
1097    describing the parsed elements.  It's a wrapper around
1098    stringlib/string_format.h's MarkupIterator */
1099 static PyObject *
formatter_parser(PyObject * ignored,PyObject * self)1100 formatter_parser(PyObject *ignored, PyObject *self)
1101 {
1102     formatteriterobject *it;
1103 
1104     if (!PyUnicode_Check(self)) {
1105         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1106         return NULL;
1107     }
1108 
1109     if (PyUnicode_READY(self) == -1)
1110         return NULL;
1111 
1112     it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1113     if (it == NULL)
1114         return NULL;
1115 
1116     /* take ownership, give the object to the iterator */
1117     Py_INCREF(self);
1118     it->str = self;
1119 
1120     /* initialize the contained MarkupIterator */
1121     MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1122     return (PyObject *)it;
1123 }
1124 
1125 
1126 /************************************************************************/
1127 /*********** fieldnameiterator ******************************************/
1128 /************************************************************************/
1129 
1130 
/* This is used to implement string.Formatter.vparse().  It parses the
   field name into attribute and item values.  It's a Python-callable
   wrapper around FieldNameIterator

   NOTE(review): "vparse" does not match a documented string.Formatter
   method -- confirm which Formatter method actually relies on this. */

typedef struct {
    PyObject_HEAD
    /* the field-name string; formatter_field_name_split stores a new
       reference here and fieldnameiter_dealloc releases it */
    PyObject *str;
    /* iteration state, filled in by field_name_split */
    FieldNameIterator it_field;
} fieldnameiterobject;
1140 
1141 static void
fieldnameiter_dealloc(fieldnameiterobject * it)1142 fieldnameiter_dealloc(fieldnameiterobject *it)
1143 {
1144     Py_XDECREF(it->str);
1145     PyObject_FREE(it);
1146 }
1147 
1148 /* returns a tuple:
1149    (is_attr, value)
1150    is_attr is true if we used attribute syntax (e.g., '.foo')
1151               false if we used index syntax (e.g., '[foo]')
1152    value is an integer or string
1153 */
1154 static PyObject *
fieldnameiter_next(fieldnameiterobject * it)1155 fieldnameiter_next(fieldnameiterobject *it)
1156 {
1157     int result;
1158     int is_attr;
1159     Py_ssize_t idx;
1160     SubString name;
1161 
1162     result = FieldNameIterator_next(&it->it_field, &is_attr,
1163                                     &idx, &name);
1164     if (result == 0 || result == 1)
1165         /* if 0, error has already been set, if 1, iterator is empty */
1166         return NULL;
1167     else {
1168         PyObject* result = NULL;
1169         PyObject* is_attr_obj = NULL;
1170         PyObject* obj = NULL;
1171 
1172         is_attr_obj = PyBool_FromLong(is_attr);
1173         if (is_attr_obj == NULL)
1174             goto done;
1175 
1176         /* either an integer or a string */
1177         if (idx != -1)
1178             obj = PyLong_FromSsize_t(idx);
1179         else
1180             obj = SubString_new_object(&name);
1181         if (obj == NULL)
1182             goto done;
1183 
1184         /* return a tuple of values */
1185         result = PyTuple_Pack(2, is_attr_obj, obj);
1186 
1187     done:
1188         Py_XDECREF(is_attr_obj);
1189         Py_XDECREF(obj);
1190         return result;
1191     }
1192 }
1193 
1194 static PyMethodDef fieldnameiter_methods[] = {
1195     {NULL,              NULL}           /* sentinel */
1196 };
1197 
/* Type object for fieldnameiterobject.  Only the slots needed for a
   plain iterator are filled in: deallocation, generic attribute
   lookup, tp_iter (the object is its own iterator) and tp_iternext
   (fieldnameiter_next).  All other listed slots are 0. */
static PyTypeObject PyFieldNameIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "fieldnameiterator",                /* tp_name */
    sizeof(fieldnameiterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
    fieldnameiter_methods,              /* tp_methods */
    /* tp_members */
    0};
1229 
1230 /* unicode_formatter_field_name_split is used to implement
1231    string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1232    returns a tuple of (first, rest): "first", the part before the
1233    first '.' or '['; and "rest", an iterator for the rest of the field
1234    name.  it's a wrapper around stringlib/string_format.h's
1235    field_name_split.  The iterator it returns is a
1236    FieldNameIterator */
1237 static PyObject *
formatter_field_name_split(PyObject * ignored,PyObject * self)1238 formatter_field_name_split(PyObject *ignored, PyObject *self)
1239 {
1240     SubString first;
1241     Py_ssize_t first_idx;
1242     fieldnameiterobject *it;
1243 
1244     PyObject *first_obj = NULL;
1245     PyObject *result = NULL;
1246 
1247     if (!PyUnicode_Check(self)) {
1248         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1249         return NULL;
1250     }
1251 
1252     if (PyUnicode_READY(self) == -1)
1253         return NULL;
1254 
1255     it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1256     if (it == NULL)
1257         return NULL;
1258 
1259     /* take ownership, give the object to the iterator.  this is
1260        just to keep the field_name alive */
1261     Py_INCREF(self);
1262     it->str = self;
1263 
1264     /* Pass in auto_number = NULL. We'll return an empty string for
1265        first_obj in that case. */
1266     if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1267                           &first, &first_idx, &it->it_field, NULL))
1268         goto done;
1269 
1270     /* first becomes an integer, if possible; else a string */
1271     if (first_idx != -1)
1272         first_obj = PyLong_FromSsize_t(first_idx);
1273     else
1274         /* convert "first" into a string object */
1275         first_obj = SubString_new_object(&first);
1276     if (first_obj == NULL)
1277         goto done;
1278 
1279     /* return a tuple of values */
1280     result = PyTuple_Pack(2, first_obj, it);
1281 
1282 done:
1283     Py_XDECREF(it);
1284     Py_XDECREF(first_obj);
1285     return result;
1286 }
1287