1 /* Peephole optimizations for bytecode compiler. */
2
3 #include "Python.h"
4
5 #include "Python-ast.h"
6 #include "node.h"
7 #include "ast.h"
8 #include "code.h"
9 #include "symtable.h"
10 #include "opcode.h"
11 #include "wordcode_helpers.h"
12
13 #define UNCONDITIONAL_JUMP(op) (op==JUMP_ABSOLUTE || op==JUMP_FORWARD)
14 #define CONDITIONAL_JUMP(op) (op==POP_JUMP_IF_FALSE || op==POP_JUMP_IF_TRUE \
15 || op==JUMP_IF_FALSE_OR_POP || op==JUMP_IF_TRUE_OR_POP)
16 #define ABSOLUTE_JUMP(op) (op==JUMP_ABSOLUTE \
17 || op==POP_JUMP_IF_FALSE || op==POP_JUMP_IF_TRUE \
18 || op==JUMP_IF_FALSE_OR_POP || op==JUMP_IF_TRUE_OR_POP || op==JUMP_IF_NOT_EXC_MATCH)
19 #define JUMPS_ON_TRUE(op) (op==POP_JUMP_IF_TRUE || op==JUMP_IF_TRUE_OR_POP)
20 #define GETJUMPTGT(arr, i) (get_arg(arr, i) / sizeof(_Py_CODEUNIT) + \
21 (ABSOLUTE_JUMP(_Py_OPCODE(arr[i])) ? 0 : i+1))
22 #define ISBASICBLOCK(blocks, start, end) \
23 (blocks[start]==blocks[end])
24
25
26 /* Scans back N consecutive LOAD_CONST instructions, skipping NOPs,
27 returns index of the Nth last's LOAD_CONST's EXTENDED_ARG prefix.
28 Callers are responsible to check CONST_STACK_LEN beforehand.
29 */
30 static Py_ssize_t
lastn_const_start(const _Py_CODEUNIT * codestr,Py_ssize_t i,Py_ssize_t n)31 lastn_const_start(const _Py_CODEUNIT *codestr, Py_ssize_t i, Py_ssize_t n)
32 {
33 assert(n > 0);
34 for (;;) {
35 i--;
36 assert(i >= 0);
37 if (_Py_OPCODE(codestr[i]) == LOAD_CONST) {
38 if (!--n) {
39 while (i > 0 && _Py_OPCODE(codestr[i-1]) == EXTENDED_ARG) {
40 i--;
41 }
42 return i;
43 }
44 }
45 else {
46 assert(_Py_OPCODE(codestr[i]) == EXTENDED_ARG);
47 }
48 }
49 }
50
51 /* Scans through EXTENDED ARGs, seeking the index of the effective opcode */
52 static Py_ssize_t
find_op(const _Py_CODEUNIT * codestr,Py_ssize_t codelen,Py_ssize_t i)53 find_op(const _Py_CODEUNIT *codestr, Py_ssize_t codelen, Py_ssize_t i)
54 {
55 while (i < codelen && _Py_OPCODE(codestr[i]) == EXTENDED_ARG) {
56 i++;
57 }
58 return i;
59 }
60
61 /* Given the index of the effective opcode,
62 scan back to construct the oparg with EXTENDED_ARG */
63 static unsigned int
get_arg(const _Py_CODEUNIT * codestr,Py_ssize_t i)64 get_arg(const _Py_CODEUNIT *codestr, Py_ssize_t i)
65 {
66 _Py_CODEUNIT word;
67 unsigned int oparg = _Py_OPARG(codestr[i]);
68 if (i >= 1 && _Py_OPCODE(word = codestr[i-1]) == EXTENDED_ARG) {
69 oparg |= _Py_OPARG(word) << 8;
70 if (i >= 2 && _Py_OPCODE(word = codestr[i-2]) == EXTENDED_ARG) {
71 oparg |= _Py_OPARG(word) << 16;
72 if (i >= 3 && _Py_OPCODE(word = codestr[i-3]) == EXTENDED_ARG) {
73 oparg |= _Py_OPARG(word) << 24;
74 }
75 }
76 }
77 return oparg;
78 }
79
80 /* Fill the region with NOPs. */
81 static void
fill_nops(_Py_CODEUNIT * codestr,Py_ssize_t start,Py_ssize_t end)82 fill_nops(_Py_CODEUNIT *codestr, Py_ssize_t start, Py_ssize_t end)
83 {
84 memset(codestr + start, NOP, (end - start) * sizeof(_Py_CODEUNIT));
85 }
86
87 /* Given the index of the effective opcode,
88 attempt to replace the argument, taking into account EXTENDED_ARG.
89 Returns -1 on failure, or the new op index on success */
90 static Py_ssize_t
set_arg(_Py_CODEUNIT * codestr,Py_ssize_t i,unsigned int oparg)91 set_arg(_Py_CODEUNIT *codestr, Py_ssize_t i, unsigned int oparg)
92 {
93 unsigned int curarg = get_arg(codestr, i);
94 int curilen, newilen;
95 if (curarg == oparg)
96 return i;
97 curilen = instrsize(curarg);
98 newilen = instrsize(oparg);
99 if (curilen < newilen) {
100 return -1;
101 }
102
103 write_op_arg(codestr + i + 1 - curilen, _Py_OPCODE(codestr[i]), oparg, newilen);
104 fill_nops(codestr, i + 1 - curilen + newilen, i + 1);
105 return i-curilen+newilen;
106 }
107
108 /* Attempt to write op/arg at end of specified region of memory.
109 Preceding memory in the region is overwritten with NOPs.
110 Returns -1 on failure, op index on success */
111 static Py_ssize_t
copy_op_arg(_Py_CODEUNIT * codestr,Py_ssize_t i,unsigned char op,unsigned int oparg,Py_ssize_t maxi)112 copy_op_arg(_Py_CODEUNIT *codestr, Py_ssize_t i, unsigned char op,
113 unsigned int oparg, Py_ssize_t maxi)
114 {
115 int ilen = instrsize(oparg);
116 if (i + ilen > maxi) {
117 return -1;
118 }
119 write_op_arg(codestr + maxi - ilen, op, oparg, ilen);
120 fill_nops(codestr, i, maxi - ilen);
121 return maxi - 1;
122 }
123
124 /* Replace LOAD_CONST c1, LOAD_CONST c2 ... LOAD_CONST cn, BUILD_TUPLE n
125 with LOAD_CONST (c1, c2, ... cn).
126 The consts table must still be in list form so that the
127 new constant (c1, c2, ... cn) can be appended.
128 Called with codestr pointing to the first LOAD_CONST.
129 */
130 static Py_ssize_t
fold_tuple_on_constants(_Py_CODEUNIT * codestr,Py_ssize_t codelen,Py_ssize_t c_start,Py_ssize_t opcode_end,PyObject * consts,int n)131 fold_tuple_on_constants(_Py_CODEUNIT *codestr, Py_ssize_t codelen,
132 Py_ssize_t c_start, Py_ssize_t opcode_end,
133 PyObject *consts, int n)
134 {
135 /* Pre-conditions */
136 assert(PyList_CheckExact(consts));
137
138 /* Buildup new tuple of constants */
139 PyObject *newconst = PyTuple_New(n);
140 if (newconst == NULL) {
141 return -1;
142 }
143
144 for (Py_ssize_t i = 0, pos = c_start; i < n; i++, pos++) {
145 assert(pos < opcode_end);
146 pos = find_op(codestr, codelen, pos);
147 assert(_Py_OPCODE(codestr[pos]) == LOAD_CONST);
148
149 unsigned int arg = get_arg(codestr, pos);
150 PyObject *constant = PyList_GET_ITEM(consts, arg);
151 Py_INCREF(constant);
152 PyTuple_SET_ITEM(newconst, i, constant);
153 }
154
155 Py_ssize_t index = PyList_GET_SIZE(consts);
156 #if SIZEOF_SIZE_T > SIZEOF_INT
157 if ((size_t)index >= UINT_MAX - 1) {
158 Py_DECREF(newconst);
159 PyErr_SetString(PyExc_OverflowError, "too many constants");
160 return -1;
161 }
162 #endif
163
164 /* Append folded constant onto consts */
165 if (PyList_Append(consts, newconst)) {
166 Py_DECREF(newconst);
167 return -1;
168 }
169 Py_DECREF(newconst);
170
171 return copy_op_arg(codestr, c_start, LOAD_CONST,
172 (unsigned int)index, opcode_end);
173 }
174
175 static unsigned int *
markblocks(_Py_CODEUNIT * code,Py_ssize_t len)176 markblocks(_Py_CODEUNIT *code, Py_ssize_t len)
177 {
178 unsigned int *blocks = PyMem_New(unsigned int, len);
179 int i, j, opcode, blockcnt = 0;
180
181 if (blocks == NULL) {
182 PyErr_NoMemory();
183 return NULL;
184 }
185 memset(blocks, 0, len*sizeof(int));
186
187 /* Mark labels in the first pass */
188 for (i = 0; i < len; i++) {
189 opcode = _Py_OPCODE(code[i]);
190 switch (opcode) {
191 case FOR_ITER:
192 case JUMP_FORWARD:
193 case JUMP_IF_FALSE_OR_POP:
194 case JUMP_IF_TRUE_OR_POP:
195 case POP_JUMP_IF_FALSE:
196 case POP_JUMP_IF_TRUE:
197 case JUMP_IF_NOT_EXC_MATCH:
198 case JUMP_ABSOLUTE:
199 case SETUP_FINALLY:
200 case SETUP_WITH:
201 case SETUP_ASYNC_WITH:
202 j = GETJUMPTGT(code, i);
203 assert(j < len);
204 blocks[j] = 1;
205 break;
206 }
207 }
208 /* Build block numbers in the second pass */
209 for (i = 0; i < len; i++) {
210 blockcnt += blocks[i]; /* increment blockcnt over labels */
211 blocks[i] = blockcnt;
212 }
213 return blocks;
214 }
215
216 /* Perform basic peephole optimizations to components of a code object.
217 The consts object should still be in list form to allow new constants
218 to be appended.
219
220 To keep the optimizer simple, it bails when the lineno table has complex
221 encoding for gaps >= 255.
222
223 Optimizations are restricted to simple transformations occurring within a
224 single basic block. All transformations keep the code size the same or
225 smaller. For those that reduce size, the gaps are initially filled with
226 NOPs. Later those NOPs are removed and the jump addresses retargeted in
227 a single pass. */
228
229 PyObject *
PyCode_Optimize(PyObject * code,PyObject * consts,PyObject * names,PyObject * lnotab_obj)230 PyCode_Optimize(PyObject *code, PyObject* consts, PyObject *names,
231 PyObject *lnotab_obj)
232 {
233 Py_ssize_t h, i, nexti, op_start, tgt;
234 unsigned int j, nops;
235 unsigned char opcode, nextop;
236 _Py_CODEUNIT *codestr = NULL;
237 unsigned char *lnotab;
238 unsigned int cum_orig_offset, last_offset;
239 Py_ssize_t tabsiz;
240 // Count runs of consecutive LOAD_CONSTs
241 unsigned int cumlc = 0, lastlc = 0;
242 unsigned int *blocks = NULL;
243
244 /* Bail out if an exception is set */
245 if (PyErr_Occurred())
246 goto exitError;
247
248 /* Bypass optimization when the lnotab table is too complex */
249 assert(PyBytes_Check(lnotab_obj));
250 lnotab = (unsigned char*)PyBytes_AS_STRING(lnotab_obj);
251 tabsiz = PyBytes_GET_SIZE(lnotab_obj);
252 assert(tabsiz == 0 || Py_REFCNT(lnotab_obj) == 1);
253
254 /* Don't optimize if lnotab contains instruction pointer delta larger
255 than +255 (encoded as multiple bytes), just to keep the peephole optimizer
256 simple. The optimizer leaves line number deltas unchanged. */
257
258 for (i = 0; i < tabsiz; i += 2) {
259 if (lnotab[i] == 255) {
260 goto exitUnchanged;
261 }
262 }
263
264 assert(PyBytes_Check(code));
265 Py_ssize_t codesize = PyBytes_GET_SIZE(code);
266 assert(codesize % sizeof(_Py_CODEUNIT) == 0);
267 Py_ssize_t codelen = codesize / sizeof(_Py_CODEUNIT);
268 if (codelen > INT_MAX) {
269 /* Python assembler is limited to INT_MAX: see assembler.a_offset in
270 compile.c. */
271 goto exitUnchanged;
272 }
273
274 /* Make a modifiable copy of the code string */
275 codestr = (_Py_CODEUNIT *)PyMem_Malloc(codesize);
276 if (codestr == NULL) {
277 PyErr_NoMemory();
278 goto exitError;
279 }
280 memcpy(codestr, PyBytes_AS_STRING(code), codesize);
281
282 blocks = markblocks(codestr, codelen);
283 if (blocks == NULL)
284 goto exitError;
285 assert(PyList_Check(consts));
286
287 for (i=find_op(codestr, codelen, 0) ; i<codelen ; i=nexti) {
288 opcode = _Py_OPCODE(codestr[i]);
289 op_start = i;
290 while (op_start >= 1 && _Py_OPCODE(codestr[op_start-1]) == EXTENDED_ARG) {
291 op_start--;
292 }
293
294 nexti = i + 1;
295 while (nexti < codelen && _Py_OPCODE(codestr[nexti]) == EXTENDED_ARG)
296 nexti++;
297 nextop = nexti < codelen ? _Py_OPCODE(codestr[nexti]) : 0;
298
299 lastlc = cumlc;
300 cumlc = 0;
301
302 switch (opcode) {
303 /* Skip over LOAD_CONST trueconst
304 POP_JUMP_IF_FALSE xx. This improves
305 "while 1" performance. */
306 case LOAD_CONST:
307 cumlc = lastlc + 1;
308 if (nextop != POP_JUMP_IF_FALSE ||
309 !ISBASICBLOCK(blocks, op_start, i + 1)) {
310 break;
311 }
312 PyObject* cnt = PyList_GET_ITEM(consts, get_arg(codestr, i));
313 int is_true = PyObject_IsTrue(cnt);
314 if (is_true == -1) {
315 goto exitError;
316 }
317 if (is_true == 1) {
318 fill_nops(codestr, op_start, nexti + 1);
319 cumlc = 0;
320 }
321 break;
322
323 /* Try to fold tuples of constants.
324 Skip over BUILD_SEQN 1 UNPACK_SEQN 1.
325 Replace BUILD_SEQN 2 UNPACK_SEQN 2 with ROT2.
326 Replace BUILD_SEQN 3 UNPACK_SEQN 3 with ROT3 ROT2. */
327 case BUILD_TUPLE:
328 j = get_arg(codestr, i);
329 if (j > 0 && lastlc >= j) {
330 h = lastn_const_start(codestr, op_start, j);
331 if (ISBASICBLOCK(blocks, h, op_start)) {
332 h = fold_tuple_on_constants(codestr, codelen,
333 h, i+1, consts, j);
334 break;
335 }
336 }
337 if (nextop != UNPACK_SEQUENCE ||
338 !ISBASICBLOCK(blocks, op_start, i + 1) ||
339 j != get_arg(codestr, nexti))
340 break;
341 if (j < 2) {
342 fill_nops(codestr, op_start, nexti + 1);
343 } else if (j == 2) {
344 codestr[op_start] = PACKOPARG(ROT_TWO, 0);
345 fill_nops(codestr, op_start + 1, nexti + 1);
346 } else if (j == 3) {
347 codestr[op_start] = PACKOPARG(ROT_THREE, 0);
348 codestr[op_start + 1] = PACKOPARG(ROT_TWO, 0);
349 fill_nops(codestr, op_start + 2, nexti + 1);
350 }
351 break;
352
353 /* Simplify conditional jump to conditional jump where the
354 result of the first test implies the success of a similar
355 test or the failure of the opposite test.
356 Arises in code like:
357 "a and b or c"
358 "(a and b) and c"
359 "(a or b) or c"
360 "(a or b) and c"
361 x:JUMP_IF_FALSE_OR_POP y y:JUMP_IF_FALSE_OR_POP z
362 --> x:JUMP_IF_FALSE_OR_POP z
363 x:JUMP_IF_FALSE_OR_POP y y:JUMP_IF_TRUE_OR_POP z
364 --> x:POP_JUMP_IF_FALSE y+1
365 where y+1 is the instruction following the second test.
366 */
367 case JUMP_IF_FALSE_OR_POP:
368 case JUMP_IF_TRUE_OR_POP:
369 h = get_arg(codestr, i) / sizeof(_Py_CODEUNIT);
370 tgt = find_op(codestr, codelen, h);
371
372 j = _Py_OPCODE(codestr[tgt]);
373 if (CONDITIONAL_JUMP(j)) {
374 /* NOTE: all possible jumps here are absolute. */
375 if (JUMPS_ON_TRUE(j) == JUMPS_ON_TRUE(opcode)) {
376 /* The second jump will be taken iff the first is.
377 The current opcode inherits its target's
378 stack effect */
379 h = set_arg(codestr, i, get_arg(codestr, tgt));
380 } else {
381 /* The second jump is not taken if the first is (so
382 jump past it), and all conditional jumps pop their
383 argument when they're not taken (so change the
384 first jump to pop its argument when it's taken). */
385 Py_ssize_t arg = (tgt + 1);
386 /* cannot overflow: codelen <= INT_MAX */
387 assert((size_t)arg <= UINT_MAX / sizeof(_Py_CODEUNIT));
388 arg *= sizeof(_Py_CODEUNIT);
389 h = set_arg(codestr, i, (unsigned int)arg);
390 j = opcode == JUMP_IF_TRUE_OR_POP ?
391 POP_JUMP_IF_TRUE : POP_JUMP_IF_FALSE;
392 }
393
394 if (h >= 0) {
395 nexti = h;
396 codestr[nexti] = PACKOPARG(j, _Py_OPARG(codestr[nexti]));
397 break;
398 }
399 }
400 /* Intentional fallthrough */
401
402 /* Replace jumps to unconditional jumps */
403 case POP_JUMP_IF_FALSE:
404 case POP_JUMP_IF_TRUE:
405 case JUMP_FORWARD:
406 case JUMP_ABSOLUTE:
407 h = GETJUMPTGT(codestr, i);
408 tgt = find_op(codestr, codelen, h);
409 /* Replace JUMP_* to a RETURN into just a RETURN */
410 if (UNCONDITIONAL_JUMP(opcode) &&
411 _Py_OPCODE(codestr[tgt]) == RETURN_VALUE) {
412 codestr[op_start] = PACKOPARG(RETURN_VALUE, 0);
413 fill_nops(codestr, op_start + 1, i + 1);
414 } else if (UNCONDITIONAL_JUMP(_Py_OPCODE(codestr[tgt]))) {
415 size_t arg = GETJUMPTGT(codestr, tgt);
416 if (opcode == JUMP_FORWARD) { /* JMP_ABS can go backwards */
417 opcode = JUMP_ABSOLUTE;
418 } else if (!ABSOLUTE_JUMP(opcode)) {
419 if (arg < (size_t)(i + 1)) {
420 break; /* No backward relative jumps */
421 }
422 arg -= i + 1; /* Calc relative jump addr */
423 }
424 /* cannot overflow: codelen <= INT_MAX */
425 assert(arg <= (UINT_MAX / sizeof(_Py_CODEUNIT)));
426 arg *= sizeof(_Py_CODEUNIT);
427 copy_op_arg(codestr, op_start, opcode,
428 (unsigned int)arg, i + 1);
429 }
430 break;
431
432 /* Remove unreachable ops after RETURN */
433 case RETURN_VALUE:
434 h = i + 1;
435 while (h < codelen && ISBASICBLOCK(blocks, i, h))
436 {
437 /* Leave SETUP_FINALLY and RERAISE in place to help find block limits. */
438 if (_Py_OPCODE(codestr[h]) == SETUP_FINALLY || _Py_OPCODE(codestr[h]) == RERAISE) {
439 while (h > i + 1 &&
440 _Py_OPCODE(codestr[h - 1]) == EXTENDED_ARG)
441 {
442 h--;
443 }
444 break;
445 }
446 h++;
447 }
448 if (h > i + 1) {
449 fill_nops(codestr, i + 1, h);
450 nexti = find_op(codestr, codelen, h);
451 }
452 break;
453 }
454 }
455
456 /* Fixup lnotab */
457 for (i = 0, nops = 0; i < codelen; i++) {
458 size_t block = (size_t)i - nops;
459 /* cannot overflow: codelen <= INT_MAX */
460 assert(block <= UINT_MAX);
461 /* original code offset => new code offset */
462 blocks[i] = (unsigned int)block;
463 if (_Py_OPCODE(codestr[i]) == NOP) {
464 nops++;
465 }
466 }
467 cum_orig_offset = 0;
468 last_offset = 0;
469 for (i=0 ; i < tabsiz ; i+=2) {
470 unsigned int offset_delta, new_offset;
471 cum_orig_offset += lnotab[i];
472 assert(cum_orig_offset % sizeof(_Py_CODEUNIT) == 0);
473 new_offset = blocks[cum_orig_offset / sizeof(_Py_CODEUNIT)] *
474 sizeof(_Py_CODEUNIT);
475 offset_delta = new_offset - last_offset;
476 assert(offset_delta <= 255);
477 lnotab[i] = (unsigned char)offset_delta;
478 last_offset = new_offset;
479 }
480
481 /* Remove NOPs and fixup jump targets */
482 for (op_start = i = h = 0; i < codelen; i++, op_start = i) {
483 j = _Py_OPARG(codestr[i]);
484 while (_Py_OPCODE(codestr[i]) == EXTENDED_ARG) {
485 i++;
486 j = j<<8 | _Py_OPARG(codestr[i]);
487 }
488 opcode = _Py_OPCODE(codestr[i]);
489 switch (opcode) {
490 case NOP:continue;
491
492 case JUMP_ABSOLUTE:
493 case POP_JUMP_IF_FALSE:
494 case POP_JUMP_IF_TRUE:
495 case JUMP_IF_FALSE_OR_POP:
496 case JUMP_IF_TRUE_OR_POP:
497 case JUMP_IF_NOT_EXC_MATCH:
498 j = blocks[j / sizeof(_Py_CODEUNIT)] * sizeof(_Py_CODEUNIT);
499 break;
500
501 case FOR_ITER:
502 case JUMP_FORWARD:
503 case SETUP_FINALLY:
504 case SETUP_WITH:
505 case SETUP_ASYNC_WITH:
506 j = blocks[j / sizeof(_Py_CODEUNIT) + i + 1] - blocks[i] - 1;
507 j *= sizeof(_Py_CODEUNIT);
508 break;
509 }
510 Py_ssize_t ilen = i - op_start + 1;
511 if (instrsize(j) > ilen) {
512 goto exitUnchanged;
513 }
514 /* If instrsize(j) < ilen, we'll emit EXTENDED_ARG 0 */
515 if (ilen > 4) {
516 /* Can only happen when PyCode_Optimize() is called with
517 malformed bytecode. */
518 goto exitUnchanged;
519 }
520 write_op_arg(codestr + h, opcode, j, (int)ilen);
521 h += ilen;
522 }
523 assert(h + (Py_ssize_t)nops == codelen);
524
525 PyMem_Free(blocks);
526 code = PyBytes_FromStringAndSize((char *)codestr, h * sizeof(_Py_CODEUNIT));
527 PyMem_Free(codestr);
528 return code;
529
530 exitError:
531 code = NULL;
532
533 exitUnchanged:
534 Py_XINCREF(code);
535 PyMem_Free(blocks);
536 PyMem_Free(codestr);
537 return code;
538 }
539