/*
    unicode_format.h -- implementation of str.format().
*/
4 
/************************************************************************/
/***********   Global data structures and forward declarations  *********/
/************************************************************************/
8 
9 /*
10    A SubString consists of the characters between two string or
11    unicode pointers.
12 */
13 typedef struct {
14     PyObject *str; /* borrowed reference */
15     Py_ssize_t start, end;
16 } SubString;
17 
18 
/* Keep track if we're auto-numbering fields */
typedef enum {
    ANS_INIT,    /* no numbered or auto-numbered field seen yet */
    ANS_AUTO,    /* fields with an omitted name are being numbered for us */
    ANS_MANUAL   /* fields give their numeric index explicitly */
} AutoNumberState;
24 
25 /* Keeps track of our auto-numbering state, and which number field we're on */
26 typedef struct {
27     AutoNumberState an_state;
28     int an_field_number;
29 } AutoNumber;
30 
31 
32 /* forward declaration for recursion */
33 static PyObject *
34 build_string(SubString *input, PyObject *args, PyObject *kwargs,
35              int recursion_depth, AutoNumber *auto_number);
36 
37 
38 
/************************************************************************/
/**************************  Utility  functions  ************************/
/************************************************************************/
42 
43 static void
AutoNumber_Init(AutoNumber * auto_number)44 AutoNumber_Init(AutoNumber *auto_number)
45 {
46     auto_number->an_state = ANS_INIT;
47     auto_number->an_field_number = 0;
48 }
49 
50 /* fill in a SubString from a pointer and length */
51 Py_LOCAL_INLINE(void)
SubString_init(SubString * str,PyObject * s,Py_ssize_t start,Py_ssize_t end)52 SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
53 {
54     str->str = s;
55     str->start = start;
56     str->end = end;
57 }
58 
59 /* return a new string.  if str->str is NULL, return None */
60 Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString * str)61 SubString_new_object(SubString *str)
62 {
63     if (str->str == NULL)
64         Py_RETURN_NONE;
65     return PyUnicode_Substring(str->str, str->start, str->end);
66 }
67 
68 /* return a new string.  if str->str is NULL, return a new empty string */
69 Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString * str)70 SubString_new_object_or_empty(SubString *str)
71 {
72     if (str->str == NULL) {
73         return PyUnicode_New(0, 0);
74     }
75     return SubString_new_object(str);
76 }
77 
78 /* Return 1 if an error has been detected switching between automatic
79    field numbering and manual field specification, else return 0. Set
80    ValueError on error. */
81 static int
autonumber_state_error(AutoNumberState state,int field_name_is_empty)82 autonumber_state_error(AutoNumberState state, int field_name_is_empty)
83 {
84     if (state == ANS_MANUAL) {
85         if (field_name_is_empty) {
86             PyErr_SetString(PyExc_ValueError, "cannot switch from "
87                             "manual field specification to "
88                             "automatic field numbering");
89             return 1;
90         }
91     }
92     else {
93         if (!field_name_is_empty) {
94             PyErr_SetString(PyExc_ValueError, "cannot switch from "
95                             "automatic field numbering to "
96                             "manual field specification");
97             return 1;
98         }
99     }
100     return 0;
101 }
102 
103 
/************************************************************************/
/***********  Format string parsing -- integers and identifiers *********/
/************************************************************************/
107 
108 static Py_ssize_t
get_integer(const SubString * str)109 get_integer(const SubString *str)
110 {
111     Py_ssize_t accumulator = 0;
112     Py_ssize_t digitval;
113     Py_ssize_t i;
114 
115     /* empty string is an error */
116     if (str->start >= str->end)
117         return -1;
118 
119     for (i = str->start; i < str->end; i++) {
120         digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
121         if (digitval < 0)
122             return -1;
123         /*
124            Detect possible overflow before it happens:
125 
126               accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
127               accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
128         */
129         if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
130             PyErr_Format(PyExc_ValueError,
131                          "Too many decimal digits in format string");
132             return -1;
133         }
134         accumulator = accumulator * 10 + digitval;
135     }
136     return accumulator;
137 }
138 
/************************************************************************/
/******** Functions to get field objects and specification strings ******/
/************************************************************************/
142 
143 /* do the equivalent of obj.name */
144 static PyObject *
getattr(PyObject * obj,SubString * name)145 getattr(PyObject *obj, SubString *name)
146 {
147     PyObject *newobj;
148     PyObject *str = SubString_new_object(name);
149     if (str == NULL)
150         return NULL;
151     newobj = PyObject_GetAttr(obj, str);
152     Py_DECREF(str);
153     return newobj;
154 }
155 
156 /* do the equivalent of obj[idx], where obj is a sequence */
157 static PyObject *
getitem_sequence(PyObject * obj,Py_ssize_t idx)158 getitem_sequence(PyObject *obj, Py_ssize_t idx)
159 {
160     return PySequence_GetItem(obj, idx);
161 }
162 
163 /* do the equivalent of obj[idx], where obj is not a sequence */
164 static PyObject *
getitem_idx(PyObject * obj,Py_ssize_t idx)165 getitem_idx(PyObject *obj, Py_ssize_t idx)
166 {
167     PyObject *newobj;
168     PyObject *idx_obj = PyLong_FromSsize_t(idx);
169     if (idx_obj == NULL)
170         return NULL;
171     newobj = PyObject_GetItem(obj, idx_obj);
172     Py_DECREF(idx_obj);
173     return newobj;
174 }
175 
176 /* do the equivalent of obj[name] */
177 static PyObject *
getitem_str(PyObject * obj,SubString * name)178 getitem_str(PyObject *obj, SubString *name)
179 {
180     PyObject *newobj;
181     PyObject *str = SubString_new_object(name);
182     if (str == NULL)
183         return NULL;
184     newobj = PyObject_GetItem(obj, str);
185     Py_DECREF(str);
186     return newobj;
187 }
188 
189 typedef struct {
190     /* the entire string we're parsing.  we assume that someone else
191        is managing its lifetime, and that it will exist for the
192        lifetime of the iterator.  can be empty */
193     SubString str;
194 
195     /* index to where we are inside field_name */
196     Py_ssize_t index;
197 } FieldNameIterator;
198 
199 
200 static int
FieldNameIterator_init(FieldNameIterator * self,PyObject * s,Py_ssize_t start,Py_ssize_t end)201 FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
202                        Py_ssize_t start, Py_ssize_t end)
203 {
204     SubString_init(&self->str, s, start, end);
205     self->index = start;
206     return 1;
207 }
208 
209 static int
_FieldNameIterator_attr(FieldNameIterator * self,SubString * name)210 _FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
211 {
212     Py_UCS4 c;
213 
214     name->str = self->str.str;
215     name->start = self->index;
216 
217     /* return everything until '.' or '[' */
218     while (self->index < self->str.end) {
219         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
220         switch (c) {
221         case '[':
222         case '.':
223             /* backup so that we this character will be seen next time */
224             self->index--;
225             break;
226         default:
227             continue;
228         }
229         break;
230     }
231     /* end of string is okay */
232     name->end = self->index;
233     return 1;
234 }
235 
236 static int
_FieldNameIterator_item(FieldNameIterator * self,SubString * name)237 _FieldNameIterator_item(FieldNameIterator *self, SubString *name)
238 {
239     int bracket_seen = 0;
240     Py_UCS4 c;
241 
242     name->str = self->str.str;
243     name->start = self->index;
244 
245     /* return everything until ']' */
246     while (self->index < self->str.end) {
247         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
248         switch (c) {
249         case ']':
250             bracket_seen = 1;
251             break;
252         default:
253             continue;
254         }
255         break;
256     }
257     /* make sure we ended with a ']' */
258     if (!bracket_seen) {
259         PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
260         return 0;
261     }
262 
263     /* end of string is okay */
264     /* don't include the ']' */
265     name->end = self->index-1;
266     return 1;
267 }
268 
269 /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
270 static int
FieldNameIterator_next(FieldNameIterator * self,int * is_attribute,Py_ssize_t * name_idx,SubString * name)271 FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
272                        Py_ssize_t *name_idx, SubString *name)
273 {
274     /* check at end of input */
275     if (self->index >= self->str.end)
276         return 1;
277 
278     switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
279     case '.':
280         *is_attribute = 1;
281         if (_FieldNameIterator_attr(self, name) == 0)
282             return 0;
283         *name_idx = -1;
284         break;
285     case '[':
286         *is_attribute = 0;
287         if (_FieldNameIterator_item(self, name) == 0)
288             return 0;
289         *name_idx = get_integer(name);
290         if (*name_idx == -1 && PyErr_Occurred())
291             return 0;
292         break;
293     default:
294         /* Invalid character follows ']' */
295         PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
296                         "follow ']' in format field specifier");
297         return 0;
298     }
299 
300     /* empty string is an error */
301     if (name->start == name->end) {
302         PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
303         return 0;
304     }
305 
306     return 2;
307 }
308 
309 
310 /* input: field_name
311    output: 'first' points to the part before the first '[' or '.'
312            'first_idx' is -1 if 'first' is not an integer, otherwise
313                        it's the value of first converted to an integer
314            'rest' is an iterator to return the rest
315 */
316 static int
field_name_split(PyObject * str,Py_ssize_t start,Py_ssize_t end,SubString * first,Py_ssize_t * first_idx,FieldNameIterator * rest,AutoNumber * auto_number)317 field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
318                  Py_ssize_t *first_idx, FieldNameIterator *rest,
319                  AutoNumber *auto_number)
320 {
321     Py_UCS4 c;
322     Py_ssize_t i = start;
323     int field_name_is_empty;
324     int using_numeric_index;
325 
326     /* find the part up until the first '.' or '[' */
327     while (i < end) {
328         switch (c = PyUnicode_READ_CHAR(str, i++)) {
329         case '[':
330         case '.':
331             /* backup so that we this character is available to the
332                "rest" iterator */
333             i--;
334             break;
335         default:
336             continue;
337         }
338         break;
339     }
340 
341     /* set up the return values */
342     SubString_init(first, str, start, i);
343     FieldNameIterator_init(rest, str, i, end);
344 
345     /* see if "first" is an integer, in which case it's used as an index */
346     *first_idx = get_integer(first);
347     if (*first_idx == -1 && PyErr_Occurred())
348         return 0;
349 
350     field_name_is_empty = first->start >= first->end;
351 
352     /* If the field name is omitted or if we have a numeric index
353        specified, then we're doing numeric indexing into args. */
354     using_numeric_index = field_name_is_empty || *first_idx != -1;
355 
356     /* We always get here exactly one time for each field we're
357        processing. And we get here in field order (counting by left
358        braces). So this is the perfect place to handle automatic field
359        numbering if the field name is omitted. */
360 
361     /* Check if we need to do the auto-numbering. It's not needed if
362        we're called from string.Format routines, because it's handled
363        in that class by itself. */
364     if (auto_number) {
365         /* Initialize our auto numbering state if this is the first
366            time we're either auto-numbering or manually numbering. */
367         if (auto_number->an_state == ANS_INIT && using_numeric_index)
368             auto_number->an_state = field_name_is_empty ?
369                 ANS_AUTO : ANS_MANUAL;
370 
371         /* Make sure our state is consistent with what we're doing
372            this time through. Only check if we're using a numeric
373            index. */
374         if (using_numeric_index)
375             if (autonumber_state_error(auto_number->an_state,
376                                        field_name_is_empty))
377                 return 0;
378         /* Zero length field means we want to do auto-numbering of the
379            fields. */
380         if (field_name_is_empty)
381             *first_idx = (auto_number->an_field_number)++;
382     }
383 
384     return 1;
385 }
386 
387 
388 /*
389     get_field_object returns the object inside {}, before the
390     format_spec.  It handles getindex and getattr lookups and consumes
391     the entire input string.
392 */
393 static PyObject *
get_field_object(SubString * input,PyObject * args,PyObject * kwargs,AutoNumber * auto_number)394 get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
395                  AutoNumber *auto_number)
396 {
397     PyObject *obj = NULL;
398     int ok;
399     int is_attribute;
400     SubString name;
401     SubString first;
402     Py_ssize_t index;
403     FieldNameIterator rest;
404 
405     if (!field_name_split(input->str, input->start, input->end, &first,
406                           &index, &rest, auto_number)) {
407         goto error;
408     }
409 
410     if (index == -1) {
411         /* look up in kwargs */
412         PyObject *key = SubString_new_object(&first);
413         if (key == NULL) {
414             goto error;
415         }
416         if (kwargs == NULL) {
417             PyErr_SetObject(PyExc_KeyError, key);
418             Py_DECREF(key);
419             goto error;
420         }
421         /* Use PyObject_GetItem instead of PyDict_GetItem because this
422            code is no longer just used with kwargs. It might be passed
423            a non-dict when called through format_map. */
424         obj = PyObject_GetItem(kwargs, key);
425         Py_DECREF(key);
426         if (obj == NULL) {
427             goto error;
428         }
429     }
430     else {
431         /* If args is NULL, we have a format string with a positional field
432            with only kwargs to retrieve it from. This can only happen when
433            used with format_map(), where positional arguments are not
434            allowed. */
435         if (args == NULL) {
436             PyErr_SetString(PyExc_ValueError, "Format string contains "
437                             "positional fields");
438             goto error;
439         }
440 
441         /* look up in args */
442         obj = PySequence_GetItem(args, index);
443         if (obj == NULL) {
444             PyErr_Format(PyExc_IndexError,
445                          "Replacement index %zd out of range for positional "
446                          "args tuple",
447                          index);
448              goto error;
449         }
450     }
451 
452     /* iterate over the rest of the field_name */
453     while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
454                                         &name)) == 2) {
455         PyObject *tmp;
456 
457         if (is_attribute)
458             /* getattr lookup "." */
459             tmp = getattr(obj, &name);
460         else
461             /* getitem lookup "[]" */
462             if (index == -1)
463                 tmp = getitem_str(obj, &name);
464             else
465                 if (PySequence_Check(obj))
466                     tmp = getitem_sequence(obj, index);
467                 else
468                     /* not a sequence */
469                     tmp = getitem_idx(obj, index);
470         if (tmp == NULL)
471             goto error;
472 
473         /* assign to obj */
474         Py_DECREF(obj);
475         obj = tmp;
476     }
477     /* end of iterator, this is the non-error case */
478     if (ok == 1)
479         return obj;
480 error:
481     Py_XDECREF(obj);
482     return NULL;
483 }
484 
/************************************************************************/
/*****************  Field rendering functions  **************************/
/************************************************************************/
488 
489 /*
490     render_field() is the main function in this section.  It takes the
491     field object and field specification string generated by
492     get_field_and_spec, and renders the field into the output string.
493 
494     render_field calls fieldobj.__format__(format_spec) method, and
495     appends to the output.
496 */
497 static int
render_field(PyObject * fieldobj,SubString * format_spec,_PyUnicodeWriter * writer)498 render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
499 {
500     int ok = 0;
501     PyObject *result = NULL;
502     PyObject *format_spec_object = NULL;
503     int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
504     int err;
505 
506     /* If we know the type exactly, skip the lookup of __format__ and just
507        call the formatter directly. */
508     if (PyUnicode_CheckExact(fieldobj))
509         formatter = _PyUnicode_FormatAdvancedWriter;
510     else if (PyLong_CheckExact(fieldobj))
511         formatter = _PyLong_FormatAdvancedWriter;
512     else if (PyFloat_CheckExact(fieldobj))
513         formatter = _PyFloat_FormatAdvancedWriter;
514     else if (PyComplex_CheckExact(fieldobj))
515         formatter = _PyComplex_FormatAdvancedWriter;
516 
517     if (formatter) {
518         /* we know exactly which formatter will be called when __format__ is
519            looked up, so call it directly, instead. */
520         err = formatter(writer, fieldobj, format_spec->str,
521                         format_spec->start, format_spec->end);
522         return (err == 0);
523     }
524     else {
525         /* We need to create an object out of the pointers we have, because
526            __format__ takes a string/unicode object for format_spec. */
527         if (format_spec->str)
528             format_spec_object = PyUnicode_Substring(format_spec->str,
529                                                      format_spec->start,
530                                                      format_spec->end);
531         else
532             format_spec_object = PyUnicode_New(0, 0);
533         if (format_spec_object == NULL)
534             goto done;
535 
536         result = PyObject_Format(fieldobj, format_spec_object);
537     }
538     if (result == NULL)
539         goto done;
540 
541     if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
542         goto done;
543     ok = 1;
544 
545 done:
546     Py_XDECREF(format_spec_object);
547     Py_XDECREF(result);
548     return ok;
549 }
550 
551 static int
parse_field(SubString * str,SubString * field_name,SubString * format_spec,int * format_spec_needs_expanding,Py_UCS4 * conversion)552 parse_field(SubString *str, SubString *field_name, SubString *format_spec,
553             int *format_spec_needs_expanding, Py_UCS4 *conversion)
554 {
555     /* Note this function works if the field name is zero length,
556        which is good.  Zero length field names are handled later, in
557        field_name_split. */
558 
559     Py_UCS4 c = 0;
560 
561     /* initialize these, as they may be empty */
562     *conversion = '\0';
563     SubString_init(format_spec, NULL, 0, 0);
564 
565     /* Search for the field name.  it's terminated by the end of
566        the string, or a ':' or '!' */
567     field_name->str = str->str;
568     field_name->start = str->start;
569     while (str->start < str->end) {
570         switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
571         case '{':
572             PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
573             return 0;
574         case '[':
575             for (; str->start < str->end; str->start++)
576                 if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
577                     break;
578             continue;
579         case '}':
580         case ':':
581         case '!':
582             break;
583         default:
584             continue;
585         }
586         break;
587     }
588 
589     field_name->end = str->start - 1;
590     if (c == '!' || c == ':') {
591         Py_ssize_t count;
592         /* we have a format specifier and/or a conversion */
593         /* don't include the last character */
594 
595         /* see if there's a conversion specifier */
596         if (c == '!') {
597             /* there must be another character present */
598             if (str->start >= str->end) {
599                 PyErr_SetString(PyExc_ValueError,
600                                 "end of string while looking for conversion "
601                                 "specifier");
602                 return 0;
603             }
604             *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
605 
606             if (str->start < str->end) {
607                 c = PyUnicode_READ_CHAR(str->str, str->start++);
608                 if (c == '}')
609                     return 1;
610                 if (c != ':') {
611                     PyErr_SetString(PyExc_ValueError,
612                                     "expected ':' after conversion specifier");
613                     return 0;
614                 }
615             }
616         }
617         format_spec->str = str->str;
618         format_spec->start = str->start;
619         count = 1;
620         while (str->start < str->end) {
621             switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
622             case '{':
623                 *format_spec_needs_expanding = 1;
624                 count++;
625                 break;
626             case '}':
627                 count--;
628                 if (count == 0) {
629                     format_spec->end = str->start - 1;
630                     return 1;
631                 }
632                 break;
633             default:
634                 break;
635             }
636         }
637 
638         PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
639         return 0;
640     }
641     else if (c != '}') {
642         PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
643         return 0;
644     }
645 
646     return 1;
647 }
648 
/************************************************************************/
/******* Output string allocation and escape-to-markup processing  ******/
/************************************************************************/
652 
653 /* MarkupIterator breaks the string into pieces of either literal
654    text, or things inside {} that need to be marked up.  it is
655    designed to make it easy to wrap a Python iterator around it, for
656    use with the Formatter class */
657 
658 typedef struct {
659     SubString str;
660 } MarkupIterator;
661 
662 static int
MarkupIterator_init(MarkupIterator * self,PyObject * str,Py_ssize_t start,Py_ssize_t end)663 MarkupIterator_init(MarkupIterator *self, PyObject *str,
664                     Py_ssize_t start, Py_ssize_t end)
665 {
666     SubString_init(&self->str, str, start, end);
667     return 1;
668 }
669 
670 /* returns 0 on error, 1 on non-error termination, and 2 if it got a
671    string (or something to be expanded) */
672 static int
MarkupIterator_next(MarkupIterator * self,SubString * literal,int * field_present,SubString * field_name,SubString * format_spec,Py_UCS4 * conversion,int * format_spec_needs_expanding)673 MarkupIterator_next(MarkupIterator *self, SubString *literal,
674                     int *field_present, SubString *field_name,
675                     SubString *format_spec, Py_UCS4 *conversion,
676                     int *format_spec_needs_expanding)
677 {
678     int at_end;
679     Py_UCS4 c = 0;
680     Py_ssize_t start;
681     Py_ssize_t len;
682     int markup_follows = 0;
683 
684     /* initialize all of the output variables */
685     SubString_init(literal, NULL, 0, 0);
686     SubString_init(field_name, NULL, 0, 0);
687     SubString_init(format_spec, NULL, 0, 0);
688     *conversion = '\0';
689     *format_spec_needs_expanding = 0;
690     *field_present = 0;
691 
692     /* No more input, end of iterator.  This is the normal exit
693        path. */
694     if (self->str.start >= self->str.end)
695         return 1;
696 
697     start = self->str.start;
698 
699     /* First read any literal text. Read until the end of string, an
700        escaped '{' or '}', or an unescaped '{'.  In order to never
701        allocate memory and so I can just pass pointers around, if
702        there's an escaped '{' or '}' then we'll return the literal
703        including the brace, but no format object.  The next time
704        through, we'll return the rest of the literal, skipping past
705        the second consecutive brace. */
706     while (self->str.start < self->str.end) {
707         switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
708         case '{':
709         case '}':
710             markup_follows = 1;
711             break;
712         default:
713             continue;
714         }
715         break;
716     }
717 
718     at_end = self->str.start >= self->str.end;
719     len = self->str.start - start;
720 
721     if ((c == '}') && (at_end ||
722                        (c != PyUnicode_READ_CHAR(self->str.str,
723                                                  self->str.start)))) {
724         PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
725                         "in format string");
726         return 0;
727     }
728     if (at_end && c == '{') {
729         PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
730                         "in format string");
731         return 0;
732     }
733     if (!at_end) {
734         if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
735             /* escaped } or {, skip it in the input.  there is no
736                markup object following us, just this literal text */
737             self->str.start++;
738             markup_follows = 0;
739         }
740         else
741             len--;
742     }
743 
744     /* record the literal text */
745     literal->str = self->str.str;
746     literal->start = start;
747     literal->end = start + len;
748 
749     if (!markup_follows)
750         return 2;
751 
752     /* this is markup; parse the field */
753     *field_present = 1;
754     if (!parse_field(&self->str, field_name, format_spec,
755                      format_spec_needs_expanding, conversion))
756         return 0;
757     return 2;
758 }
759 
760 
761 /* do the !r or !s conversion on obj */
762 static PyObject *
do_conversion(PyObject * obj,Py_UCS4 conversion)763 do_conversion(PyObject *obj, Py_UCS4 conversion)
764 {
765     /* XXX in pre-3.0, do we need to convert this to unicode, since it
766        might have returned a string? */
767     switch (conversion) {
768     case 'r':
769         return PyObject_Repr(obj);
770     case 's':
771         return PyObject_Str(obj);
772     case 'a':
773         return PyObject_ASCII(obj);
774     default:
775         if (conversion > 32 && conversion < 127) {
776                 /* It's the ASCII subrange; casting to char is safe
777                    (assuming the execution character set is an ASCII
778                    superset). */
779                 PyErr_Format(PyExc_ValueError,
780                      "Unknown conversion specifier %c",
781                      (char)conversion);
782         } else
783                 PyErr_Format(PyExc_ValueError,
784                      "Unknown conversion specifier \\x%x",
785                      (unsigned int)conversion);
786         return NULL;
787     }
788 }
789 
790 /* given:
791 
792    {field_name!conversion:format_spec}
793 
794    compute the result and write it to output.
795    format_spec_needs_expanding is an optimization.  if it's false,
796    just output the string directly, otherwise recursively expand the
797    format_spec string.
798 
799    field_name is allowed to be zero length, in which case we
800    are doing auto field numbering.
801 */
802 
803 static int
output_markup(SubString * field_name,SubString * format_spec,int format_spec_needs_expanding,Py_UCS4 conversion,_PyUnicodeWriter * writer,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)804 output_markup(SubString *field_name, SubString *format_spec,
805               int format_spec_needs_expanding, Py_UCS4 conversion,
806               _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
807               int recursion_depth, AutoNumber *auto_number)
808 {
809     PyObject *tmp = NULL;
810     PyObject *fieldobj = NULL;
811     SubString expanded_format_spec;
812     SubString *actual_format_spec;
813     int result = 0;
814 
815     /* convert field_name to an object */
816     fieldobj = get_field_object(field_name, args, kwargs, auto_number);
817     if (fieldobj == NULL)
818         goto done;
819 
820     if (conversion != '\0') {
821         tmp = do_conversion(fieldobj, conversion);
822         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
823             goto done;
824 
825         /* do the assignment, transferring ownership: fieldobj = tmp */
826         Py_DECREF(fieldobj);
827         fieldobj = tmp;
828         tmp = NULL;
829     }
830 
831     /* if needed, recursively compute the format_spec */
832     if (format_spec_needs_expanding) {
833         tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
834                            auto_number);
835         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
836             goto done;
837 
838         /* note that in the case we're expanding the format string,
839            tmp must be kept around until after the call to
840            render_field. */
841         SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
842         actual_format_spec = &expanded_format_spec;
843     }
844     else
845         actual_format_spec = format_spec;
846 
847     if (render_field(fieldobj, actual_format_spec, writer) == 0)
848         goto done;
849 
850     result = 1;
851 
852 done:
853     Py_XDECREF(fieldobj);
854     Py_XDECREF(tmp);
855 
856     return result;
857 }
858 
859 /*
860     do_markup is the top-level loop for the format() method.  It
861     searches through the format string for escapes to markup codes, and
862     calls other functions to move non-markup text to the output,
863     and to perform the markup to the output.
864 */
865 static int
do_markup(SubString * input,PyObject * args,PyObject * kwargs,_PyUnicodeWriter * writer,int recursion_depth,AutoNumber * auto_number)866 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
867           _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
868 {
869     MarkupIterator iter;
870     int format_spec_needs_expanding;
871     int result;
872     int field_present;
873     SubString literal;
874     SubString field_name;
875     SubString format_spec;
876     Py_UCS4 conversion;
877 
878     MarkupIterator_init(&iter, input->str, input->start, input->end);
879     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
880                                          &field_name, &format_spec,
881                                          &conversion,
882                                          &format_spec_needs_expanding)) == 2) {
883         if (literal.end != literal.start) {
884             if (!field_present && iter.str.start == iter.str.end)
885                 writer->overallocate = 0;
886             if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
887                                                 literal.start, literal.end) < 0)
888                 return 0;
889         }
890 
891         if (field_present) {
892             if (iter.str.start == iter.str.end)
893                 writer->overallocate = 0;
894             if (!output_markup(&field_name, &format_spec,
895                                format_spec_needs_expanding, conversion, writer,
896                                args, kwargs, recursion_depth, auto_number))
897                 return 0;
898         }
899     }
900     return result;
901 }
902 
903 
904 /*
905     build_string allocates the output string and then
906     calls do_markup to do the heavy lifting.
907 */
908 static PyObject *
build_string(SubString * input,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)909 build_string(SubString *input, PyObject *args, PyObject *kwargs,
910              int recursion_depth, AutoNumber *auto_number)
911 {
912     _PyUnicodeWriter writer;
913 
914     /* check the recursion level */
915     if (recursion_depth <= 0) {
916         PyErr_SetString(PyExc_ValueError,
917                         "Max string recursion exceeded");
918         return NULL;
919     }
920 
921     _PyUnicodeWriter_Init(&writer);
922     writer.overallocate = 1;
923     writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
924 
925     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
926                    auto_number)) {
927         _PyUnicodeWriter_Dealloc(&writer);
928         return NULL;
929     }
930 
931     return _PyUnicodeWriter_Finish(&writer);
932 }
933 
934 /************************************************************************/
935 /*********** main routine ***********************************************/
936 /************************************************************************/
937 
938 /* this is the main entry point */
939 static PyObject *
do_string_format(PyObject * self,PyObject * args,PyObject * kwargs)940 do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
941 {
942     SubString input;
943 
944     /* PEP 3101 says only 2 levels, so that
945        "{0:{1}}".format('abc', 's')            # works
946        "{0:{1:{2}}}".format('abc', 's', '')    # fails
947     */
948     int recursion_depth = 2;
949 
950     AutoNumber auto_number;
951 
952     if (PyUnicode_READY(self) == -1)
953         return NULL;
954 
955     AutoNumber_Init(&auto_number);
956     SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
957     return build_string(&input, args, kwargs, recursion_depth, &auto_number);
958 }
959 
960 static PyObject *
do_string_format_map(PyObject * self,PyObject * obj)961 do_string_format_map(PyObject *self, PyObject *obj)
962 {
963     return do_string_format(self, NULL, obj);
964 }
965 
966 
967 /************************************************************************/
968 /*********** formatteriterator ******************************************/
969 /************************************************************************/
970 
971 /* This is used to implement string.Formatter.vparse().  It exists so
972    Formatter can share code with the built in unicode.format() method.
973    It's really just a wrapper around MarkupIterator that is callable
974    from Python. */
975 
976 typedef struct {
977     PyObject_HEAD
978     PyObject *str;
979     MarkupIterator it_markup;
980 } formatteriterobject;
981 
982 static void
formatteriter_dealloc(formatteriterobject * it)983 formatteriter_dealloc(formatteriterobject *it)
984 {
985     Py_XDECREF(it->str);
986     PyObject_FREE(it);
987 }
988 
989 /* returns a tuple:
990    (literal, field_name, format_spec, conversion)
991 
992    literal is any literal text to output.  might be zero length
993    field_name is the string before the ':'.  might be None
994    format_spec is the string after the ':'.  mibht be None
995    conversion is either None, or the string after the '!'
996 */
997 static PyObject *
formatteriter_next(formatteriterobject * it)998 formatteriter_next(formatteriterobject *it)
999 {
1000     SubString literal;
1001     SubString field_name;
1002     SubString format_spec;
1003     Py_UCS4 conversion;
1004     int format_spec_needs_expanding;
1005     int field_present;
1006     int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1007                                      &field_name, &format_spec, &conversion,
1008                                      &format_spec_needs_expanding);
1009 
1010     /* all of the SubString objects point into it->str, so no
1011        memory management needs to be done on them */
1012     assert(0 <= result && result <= 2);
1013     if (result == 0 || result == 1)
1014         /* if 0, error has already been set, if 1, iterator is empty */
1015         return NULL;
1016     else {
1017         PyObject *literal_str = NULL;
1018         PyObject *field_name_str = NULL;
1019         PyObject *format_spec_str = NULL;
1020         PyObject *conversion_str = NULL;
1021         PyObject *tuple = NULL;
1022 
1023         literal_str = SubString_new_object(&literal);
1024         if (literal_str == NULL)
1025             goto done;
1026 
1027         field_name_str = SubString_new_object(&field_name);
1028         if (field_name_str == NULL)
1029             goto done;
1030 
1031         /* if field_name is non-zero length, return a string for
1032            format_spec (even if zero length), else return None */
1033         format_spec_str = (field_present ?
1034                            SubString_new_object_or_empty :
1035                            SubString_new_object)(&format_spec);
1036         if (format_spec_str == NULL)
1037             goto done;
1038 
1039         /* if the conversion is not specified, return a None,
1040            otherwise create a one length string with the conversion
1041            character */
1042         if (conversion == '\0') {
1043             conversion_str = Py_None;
1044             Py_INCREF(conversion_str);
1045         }
1046         else
1047             conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1048                                                        &conversion, 1);
1049         if (conversion_str == NULL)
1050             goto done;
1051 
1052         tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1053                              conversion_str);
1054     done:
1055         Py_XDECREF(literal_str);
1056         Py_XDECREF(field_name_str);
1057         Py_XDECREF(format_spec_str);
1058         Py_XDECREF(conversion_str);
1059         return tuple;
1060     }
1061 }
1062 
1063 static PyMethodDef formatteriter_methods[] = {
1064     {NULL,              NULL}           /* sentinel */
1065 };
1066 
1067 static PyTypeObject PyFormatterIter_Type = {
1068     PyVarObject_HEAD_INIT(&PyType_Type, 0)
1069     "formatteriterator",                /* tp_name */
1070     sizeof(formatteriterobject),        /* tp_basicsize */
1071     0,                                  /* tp_itemsize */
1072     /* methods */
1073     (destructor)formatteriter_dealloc,  /* tp_dealloc */
1074     0,                                  /* tp_vectorcall_offset */
1075     0,                                  /* tp_getattr */
1076     0,                                  /* tp_setattr */
1077     0,                                  /* tp_as_async */
1078     0,                                  /* tp_repr */
1079     0,                                  /* tp_as_number */
1080     0,                                  /* tp_as_sequence */
1081     0,                                  /* tp_as_mapping */
1082     0,                                  /* tp_hash */
1083     0,                                  /* tp_call */
1084     0,                                  /* tp_str */
1085     PyObject_GenericGetAttr,            /* tp_getattro */
1086     0,                                  /* tp_setattro */
1087     0,                                  /* tp_as_buffer */
1088     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1089     0,                                  /* tp_doc */
1090     0,                                  /* tp_traverse */
1091     0,                                  /* tp_clear */
1092     0,                                  /* tp_richcompare */
1093     0,                                  /* tp_weaklistoffset */
1094     PyObject_SelfIter,                  /* tp_iter */
1095     (iternextfunc)formatteriter_next,   /* tp_iternext */
1096     formatteriter_methods,              /* tp_methods */
1097     0,
1098 };
1099 
1100 /* unicode_formatter_parser is used to implement
1101    string.Formatter.vformat.  it parses a string and returns tuples
1102    describing the parsed elements.  It's a wrapper around
1103    stringlib/string_format.h's MarkupIterator */
1104 static PyObject *
formatter_parser(PyObject * ignored,PyObject * self)1105 formatter_parser(PyObject *ignored, PyObject *self)
1106 {
1107     formatteriterobject *it;
1108 
1109     if (!PyUnicode_Check(self)) {
1110         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1111         return NULL;
1112     }
1113 
1114     if (PyUnicode_READY(self) == -1)
1115         return NULL;
1116 
1117     it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1118     if (it == NULL)
1119         return NULL;
1120 
1121     /* take ownership, give the object to the iterator */
1122     Py_INCREF(self);
1123     it->str = self;
1124 
1125     /* initialize the contained MarkupIterator */
1126     MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1127     return (PyObject *)it;
1128 }
1129 
1130 
1131 /************************************************************************/
1132 /*********** fieldnameiterator ******************************************/
1133 /************************************************************************/
1134 
1135 
1136 /* This is used to implement string.Formatter.vparse().  It parses the
1137    field name into attribute and item values.  It's a Python-callable
1138    wrapper around FieldNameIterator */
1139 
1140 typedef struct {
1141     PyObject_HEAD
1142     PyObject *str;
1143     FieldNameIterator it_field;
1144 } fieldnameiterobject;
1145 
1146 static void
fieldnameiter_dealloc(fieldnameiterobject * it)1147 fieldnameiter_dealloc(fieldnameiterobject *it)
1148 {
1149     Py_XDECREF(it->str);
1150     PyObject_FREE(it);
1151 }
1152 
1153 /* returns a tuple:
1154    (is_attr, value)
1155    is_attr is true if we used attribute syntax (e.g., '.foo')
1156               false if we used index syntax (e.g., '[foo]')
1157    value is an integer or string
1158 */
1159 static PyObject *
fieldnameiter_next(fieldnameiterobject * it)1160 fieldnameiter_next(fieldnameiterobject *it)
1161 {
1162     int result;
1163     int is_attr;
1164     Py_ssize_t idx;
1165     SubString name;
1166 
1167     result = FieldNameIterator_next(&it->it_field, &is_attr,
1168                                     &idx, &name);
1169     if (result == 0 || result == 1)
1170         /* if 0, error has already been set, if 1, iterator is empty */
1171         return NULL;
1172     else {
1173         PyObject* result = NULL;
1174         PyObject* is_attr_obj = NULL;
1175         PyObject* obj = NULL;
1176 
1177         is_attr_obj = PyBool_FromLong(is_attr);
1178         if (is_attr_obj == NULL)
1179             goto done;
1180 
1181         /* either an integer or a string */
1182         if (idx != -1)
1183             obj = PyLong_FromSsize_t(idx);
1184         else
1185             obj = SubString_new_object(&name);
1186         if (obj == NULL)
1187             goto done;
1188 
1189         /* return a tuple of values */
1190         result = PyTuple_Pack(2, is_attr_obj, obj);
1191 
1192     done:
1193         Py_XDECREF(is_attr_obj);
1194         Py_XDECREF(obj);
1195         return result;
1196     }
1197 }
1198 
1199 static PyMethodDef fieldnameiter_methods[] = {
1200     {NULL,              NULL}           /* sentinel */
1201 };
1202 
1203 static PyTypeObject PyFieldNameIter_Type = {
1204     PyVarObject_HEAD_INIT(&PyType_Type, 0)
1205     "fieldnameiterator",                /* tp_name */
1206     sizeof(fieldnameiterobject),        /* tp_basicsize */
1207     0,                                  /* tp_itemsize */
1208     /* methods */
1209     (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
1210     0,                                  /* tp_vectorcall_offset */
1211     0,                                  /* tp_getattr */
1212     0,                                  /* tp_setattr */
1213     0,                                  /* tp_as_async */
1214     0,                                  /* tp_repr */
1215     0,                                  /* tp_as_number */
1216     0,                                  /* tp_as_sequence */
1217     0,                                  /* tp_as_mapping */
1218     0,                                  /* tp_hash */
1219     0,                                  /* tp_call */
1220     0,                                  /* tp_str */
1221     PyObject_GenericGetAttr,            /* tp_getattro */
1222     0,                                  /* tp_setattro */
1223     0,                                  /* tp_as_buffer */
1224     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1225     0,                                  /* tp_doc */
1226     0,                                  /* tp_traverse */
1227     0,                                  /* tp_clear */
1228     0,                                  /* tp_richcompare */
1229     0,                                  /* tp_weaklistoffset */
1230     PyObject_SelfIter,                  /* tp_iter */
1231     (iternextfunc)fieldnameiter_next,   /* tp_iternext */
1232     fieldnameiter_methods,              /* tp_methods */
1233     0};
1234 
1235 /* unicode_formatter_field_name_split is used to implement
1236    string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1237    returns a tuple of (first, rest): "first", the part before the
1238    first '.' or '['; and "rest", an iterator for the rest of the field
1239    name.  it's a wrapper around stringlib/string_format.h's
1240    field_name_split.  The iterator it returns is a
1241    FieldNameIterator */
1242 static PyObject *
formatter_field_name_split(PyObject * ignored,PyObject * self)1243 formatter_field_name_split(PyObject *ignored, PyObject *self)
1244 {
1245     SubString first;
1246     Py_ssize_t first_idx;
1247     fieldnameiterobject *it;
1248 
1249     PyObject *first_obj = NULL;
1250     PyObject *result = NULL;
1251 
1252     if (!PyUnicode_Check(self)) {
1253         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1254         return NULL;
1255     }
1256 
1257     if (PyUnicode_READY(self) == -1)
1258         return NULL;
1259 
1260     it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1261     if (it == NULL)
1262         return NULL;
1263 
1264     /* take ownership, give the object to the iterator.  this is
1265        just to keep the field_name alive */
1266     Py_INCREF(self);
1267     it->str = self;
1268 
1269     /* Pass in auto_number = NULL. We'll return an empty string for
1270        first_obj in that case. */
1271     if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1272                           &first, &first_idx, &it->it_field, NULL))
1273         goto done;
1274 
1275     /* first becomes an integer, if possible; else a string */
1276     if (first_idx != -1)
1277         first_obj = PyLong_FromSsize_t(first_idx);
1278     else
1279         /* convert "first" into a string object */
1280         first_obj = SubString_new_object(&first);
1281     if (first_obj == NULL)
1282         goto done;
1283 
1284     /* return a tuple of values */
1285     result = PyTuple_Pack(2, first_obj, it);
1286 
1287 done:
1288     Py_XDECREF(it);
1289     Py_XDECREF(first_obj);
1290     return result;
1291 }
1292