1 /*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5
6 Copyright (c) 2011, Intel Corporation. All rights reserved.<BR>
7 This program and the accompanying materials are licensed and made available under
8 the terms and conditions of the BSD License that accompanies this distribution.
9 The full text of the license may be found at
10 http://opensource.org/licenses/bsd-license.
11
12 THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
13 WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
14 *
15 * partial history:
16 * 1999-10-24 fl created (based on existing template matcher code)
17 * 2000-03-06 fl first alpha, sort of
18 * 2000-08-01 fl fixes for 1.6b1
19 * 2000-08-07 fl use PyOS_CheckStack() if available
20 * 2000-09-20 fl added expand method
21 * 2001-03-20 fl lots of fixes for 2.1b2
22 * 2001-04-15 fl export copyright as Python attribute, not global
23 * 2001-04-28 fl added __copy__ methods (work in progress)
24 * 2001-05-14 fl fixes for 1.5.2 compatibility
25 * 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
26 * 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
27 * 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
28 * 2001-10-21 fl added sub/subn primitive
29 * 2001-10-24 fl added finditer primitive (for 2.2 only)
30 * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
31 * 2002-11-09 fl fixed empty sub/subn return type
32 * 2003-04-18 mvl fully support 4-byte codes
33 * 2003-10-17 gn implemented non recursive scheme
34 *
35 * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
36 *
37 * This version of the SRE library can be redistributed under CNRI's
38 * Python 1.6 license. For any other use, please contact Secret Labs
39 * AB (info@pythonware.com).
40 *
41 * Portions of this engine have been developed in cooperation with
42 * CNRI. Hewlett-Packard provided funding for 1.6 integration and
43 * other compatibility work.
44 */
45
46 /* Get rid of these macros to prevent collisions between EFI and Python in this file. */
47 #undef RETURN_ERROR
48 #undef RETURN_SUCCESS
49
50 #ifndef SRE_RECURSIVE
51
/* Version and copyright banner string for this copy of the SRE engine. */
static char copyright[] =
    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
54
55 #define PY_SSIZE_T_CLEAN
56
57 #include "Python.h"
58 #include "structmember.h" /* offsetof */
59
60 #include "sre.h"
61
62 #include <ctype.h>
63
/* name of this module, minus the leading underscore */
#if !defined(SRE_MODULE)
#define SRE_MODULE "sre"
#endif

/* module name string "re"; the code that consumes it is outside this
   section of the file */
#define SRE_PY_MODULE "re"

/* defining this one enables tracing (it switches the TRACE macro below
   from a no-op to printf) */
#undef VERBOSE

/* HAVE_UNICODE is set for interpreter versions 1.6 and later, except on
   2.2+ builds where Py_USING_UNICODE is not defined */
#if PY_VERSION_HEX >= 0x01060000
#if PY_VERSION_HEX < 0x02020000 || defined(Py_USING_UNICODE)
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
#endif

/* -------------------------------------------------------------------- */
/* optional features */

/* enables fast searching */
#define USE_FAST_SEARCH

/* enables aggressive inlining (always on for Visual C) */
#undef USE_INLINE

/* enables copy/deepcopy handling (work in progress) */
#undef USE_BUILTIN_COPY

/* for interpreter versions older than 1.6, express PyObject_DEL in terms
   of PyMem_DEL */
#if PY_VERSION_HEX < 0x01060000
#define PyObject_DEL(op) PyMem_DEL((op))
#endif

/* -------------------------------------------------------------------- */

/* LOCAL(type) introduces a file-local (static) function returning `type`.
   Under MSVC it additionally requests inlining and the __fastcall
   convention; with USE_INLINE it requests inlining only. */
#if defined(_MSC_VER)
#pragma optimize("gt", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif

/* error codes (all negative, so callers can test `result < 0`) */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */

/* TRACE takes ONE argument: a fully parenthesised printf argument list,
   e.g. TRACE(("value %d\n", v)).  It expands to nothing unless VERBOSE
   is defined. */
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
#endif
122
123 /* -------------------------------------------------------------------- */
124 /* search engine state */
125
126 /* default character predicates (run sre_chars.py to regenerate tables) */
127
/* Bit flags stored in sre_char_info[]; one entry may carry several. */
#define SRE_DIGIT_MASK 1
#define SRE_SPACE_MASK 2
#define SRE_LINEBREAK_MASK 4
#define SRE_ALNUM_MASK 8
#define SRE_WORD_MASK 16

/* FIXME: this assumes ASCII. create tables in init_sre() instead */

/* Classification table indexed by character code (0..127).  Each entry is
   the OR of the masks above, e.g. 6 = SPACE|LINEBREAK, 24 = ALNUM|WORD,
   25 = DIGIT|ALNUM|WORD, 16 = WORD only. */
static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
25, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
0, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
143
/* Lower-casing table indexed by character code (0..127): entries 65..90
   hold 97..122, every other entry holds its own index.  Used by
   sre_lower(). */
static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127 };
153
/* Table-driven character predicates.  Each yields the matching mask bit
   (non-zero, not necessarily 1) for codes below 128 and 0 for anything
   else.  The argument is expanded more than once, so it must be free of
   side effects.
   NOTE(review): a negative signed argument would pass the `< 128` test and
   index the table out of bounds; the call sites visible in this section
   pass values derived from unsigned character types. */
#define SRE_IS_DIGIT(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
#define SRE_IS_SPACE(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
#define SRE_IS_LINEBREAK(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
#define SRE_IS_ALNUM(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
#define SRE_IS_WORD(ch)\
    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
164
sre_lower(unsigned int ch)165 static unsigned int sre_lower(unsigned int ch)
166 {
167 return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
168 }
169
170 /* locale-specific character predicates */
171 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
172 * warnings when c's type supports only numbers < N+1 */
/* Locale-aware predicates built on <ctype.h>.  Only codes 0..255 are
   forwarded to the ctype functions; larger codes yield 0.  A line break
   is always exactly '\n', and a "word" character is an alphanumeric or
   an underscore.  Arguments are expanded more than once. */
#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
178
/* Lower-case a character code through the C library's tolower().
   Only codes below 256 are converted; anything larger is returned
   unchanged. */
static unsigned int sre_lower_locale(unsigned int ch)
{
    if (ch < 256)
        return (unsigned int)tolower(ch);
    return ch;
}
183
184 /* unicode-specific character predicates */
185
186 #if defined(HAVE_UNICODE)
187
/* Unicode predicates: thin wrappers over the Py_UNICODE_IS* macros, with
   the argument first converted to Py_UNICODE.  A "word" character is an
   alphanumeric or an underscore. */
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
193
sre_lower_unicode(unsigned int ch)194 static unsigned int sre_lower_unicode(unsigned int ch)
195 {
196 return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
197 }
198
199 #endif
200
201 LOCAL(int)
sre_category(SRE_CODE category,unsigned int ch)202 sre_category(SRE_CODE category, unsigned int ch)
203 {
204 switch (category) {
205
206 case SRE_CATEGORY_DIGIT:
207 return SRE_IS_DIGIT(ch);
208 case SRE_CATEGORY_NOT_DIGIT:
209 return !SRE_IS_DIGIT(ch);
210 case SRE_CATEGORY_SPACE:
211 return SRE_IS_SPACE(ch);
212 case SRE_CATEGORY_NOT_SPACE:
213 return !SRE_IS_SPACE(ch);
214 case SRE_CATEGORY_WORD:
215 return SRE_IS_WORD(ch);
216 case SRE_CATEGORY_NOT_WORD:
217 return !SRE_IS_WORD(ch);
218 case SRE_CATEGORY_LINEBREAK:
219 return SRE_IS_LINEBREAK(ch);
220 case SRE_CATEGORY_NOT_LINEBREAK:
221 return !SRE_IS_LINEBREAK(ch);
222
223 case SRE_CATEGORY_LOC_WORD:
224 return SRE_LOC_IS_WORD(ch);
225 case SRE_CATEGORY_LOC_NOT_WORD:
226 return !SRE_LOC_IS_WORD(ch);
227
228 #if defined(HAVE_UNICODE)
229 case SRE_CATEGORY_UNI_DIGIT:
230 return SRE_UNI_IS_DIGIT(ch);
231 case SRE_CATEGORY_UNI_NOT_DIGIT:
232 return !SRE_UNI_IS_DIGIT(ch);
233 case SRE_CATEGORY_UNI_SPACE:
234 return SRE_UNI_IS_SPACE(ch);
235 case SRE_CATEGORY_UNI_NOT_SPACE:
236 return !SRE_UNI_IS_SPACE(ch);
237 case SRE_CATEGORY_UNI_WORD:
238 return SRE_UNI_IS_WORD(ch);
239 case SRE_CATEGORY_UNI_NOT_WORD:
240 return !SRE_UNI_IS_WORD(ch);
241 case SRE_CATEGORY_UNI_LINEBREAK:
242 return SRE_UNI_IS_LINEBREAK(ch);
243 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244 return !SRE_UNI_IS_LINEBREAK(ch);
245 #else
246 case SRE_CATEGORY_UNI_DIGIT:
247 return SRE_IS_DIGIT(ch);
248 case SRE_CATEGORY_UNI_NOT_DIGIT:
249 return !SRE_IS_DIGIT(ch);
250 case SRE_CATEGORY_UNI_SPACE:
251 return SRE_IS_SPACE(ch);
252 case SRE_CATEGORY_UNI_NOT_SPACE:
253 return !SRE_IS_SPACE(ch);
254 case SRE_CATEGORY_UNI_WORD:
255 return SRE_LOC_IS_WORD(ch);
256 case SRE_CATEGORY_UNI_NOT_WORD:
257 return !SRE_LOC_IS_WORD(ch);
258 case SRE_CATEGORY_UNI_LINEBREAK:
259 return SRE_IS_LINEBREAK(ch);
260 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
261 return !SRE_IS_LINEBREAK(ch);
262 #endif
263 }
264 return 0;
265 }
266
267 /* helpers */
268
269 static void
data_stack_dealloc(SRE_STATE * state)270 data_stack_dealloc(SRE_STATE* state)
271 {
272 if (state->data_stack) {
273 PyMem_FREE(state->data_stack);
274 state->data_stack = NULL;
275 }
276 state->data_stack_size = state->data_stack_base = 0;
277 }
278
279 static int
data_stack_grow(SRE_STATE * state,Py_ssize_t size)280 data_stack_grow(SRE_STATE* state, Py_ssize_t size)
281 {
282 Py_ssize_t minsize, cursize;
283 minsize = state->data_stack_base+size;
284 cursize = state->data_stack_size;
285 if (cursize < minsize) {
286 void* stack;
287 cursize = minsize+minsize/4+1024;
288 TRACE(("allocate/grow stack %d\n", cursize));
289 stack = PyMem_REALLOC(state->data_stack, cursize);
290 if (!stack) {
291 data_stack_dealloc(state);
292 return SRE_ERROR_MEMORY;
293 }
294 state->data_stack = (char *)stack;
295 state->data_stack_size = cursize;
296 }
297 return 0;
298 }
299
300 /* generate 8-bit version */
301
/* The SRE_* names below parameterise the matching-engine section further
   down in this file: SRE_CHAR is the character type and the remaining
   macros give each engine function its concrete name for that type. */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
#define SRE_COUNT sre_count
#define SRE_CHARSET sre_charset
#define SRE_INFO sre_info
#define SRE_MATCH sre_match
#define SRE_MATCH_CONTEXT sre_match_context
#define SRE_SEARCH sre_search
#define SRE_LITERAL_TEMPLATE sre_literal_template

#if defined(HAVE_UNICODE)

/* Include "_sre.c" (presumably this very file -- confirm against the build)
   with SRE_RECURSIVE defined, so that only the part outside the
   `#ifndef SRE_RECURSIVE` guard is compiled, using the 8-bit names
   defined above. */
#define SRE_RECURSIVE
#include "_sre.c"
#undef SRE_RECURSIVE

/* drop the 8-bit names before redefining them */
#undef SRE_LITERAL_TEMPLATE
#undef SRE_SEARCH
#undef SRE_MATCH
#undef SRE_MATCH_CONTEXT
#undef SRE_INFO
#undef SRE_CHARSET
#undef SRE_COUNT
#undef SRE_AT
#undef SRE_CHAR

/* generate 16-bit unicode version */
/* (the engine section that follows the guard is compiled with these
   names, i.e. for Py_UNICODE characters) */

#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
#define SRE_COUNT sre_ucount
#define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch
#define SRE_LITERAL_TEMPLATE sre_uliteral_template
#endif
340
341 #endif /* SRE_RECURSIVE */
342
343 /* -------------------------------------------------------------------- */
344 /* String matching engine */
345
346 /* the following section is compiled twice, with different character
347 settings */
348
349 LOCAL(int)
SRE_AT(SRE_STATE * state,SRE_CHAR * ptr,SRE_CODE at)350 SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
351 {
352 /* check if pointer is at given position */
353
354 Py_ssize_t thisp, thatp;
355
356 switch (at) {
357
358 case SRE_AT_BEGINNING:
359 case SRE_AT_BEGINNING_STRING:
360 return ((void*) ptr == state->beginning);
361
362 case SRE_AT_BEGINNING_LINE:
363 return ((void*) ptr == state->beginning ||
364 SRE_IS_LINEBREAK((int) ptr[-1]));
365
366 case SRE_AT_END:
367 return (((void*) (ptr+1) == state->end &&
368 SRE_IS_LINEBREAK((int) ptr[0])) ||
369 ((void*) ptr == state->end));
370
371 case SRE_AT_END_LINE:
372 return ((void*) ptr == state->end ||
373 SRE_IS_LINEBREAK((int) ptr[0]));
374
375 case SRE_AT_END_STRING:
376 return ((void*) ptr == state->end);
377
378 case SRE_AT_BOUNDARY:
379 if (state->beginning == state->end)
380 return 0;
381 thatp = ((void*) ptr > state->beginning) ?
382 SRE_IS_WORD((int) ptr[-1]) : 0;
383 thisp = ((void*) ptr < state->end) ?
384 SRE_IS_WORD((int) ptr[0]) : 0;
385 return thisp != thatp;
386
387 case SRE_AT_NON_BOUNDARY:
388 if (state->beginning == state->end)
389 return 0;
390 thatp = ((void*) ptr > state->beginning) ?
391 SRE_IS_WORD((int) ptr[-1]) : 0;
392 thisp = ((void*) ptr < state->end) ?
393 SRE_IS_WORD((int) ptr[0]) : 0;
394 return thisp == thatp;
395
396 case SRE_AT_LOC_BOUNDARY:
397 if (state->beginning == state->end)
398 return 0;
399 thatp = ((void*) ptr > state->beginning) ?
400 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
401 thisp = ((void*) ptr < state->end) ?
402 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
403 return thisp != thatp;
404
405 case SRE_AT_LOC_NON_BOUNDARY:
406 if (state->beginning == state->end)
407 return 0;
408 thatp = ((void*) ptr > state->beginning) ?
409 SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
410 thisp = ((void*) ptr < state->end) ?
411 SRE_LOC_IS_WORD((int) ptr[0]) : 0;
412 return thisp == thatp;
413
414 #if defined(HAVE_UNICODE)
415 case SRE_AT_UNI_BOUNDARY:
416 if (state->beginning == state->end)
417 return 0;
418 thatp = ((void*) ptr > state->beginning) ?
419 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
420 thisp = ((void*) ptr < state->end) ?
421 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
422 return thisp != thatp;
423
424 case SRE_AT_UNI_NON_BOUNDARY:
425 if (state->beginning == state->end)
426 return 0;
427 thatp = ((void*) ptr > state->beginning) ?
428 SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
429 thisp = ((void*) ptr < state->end) ?
430 SRE_UNI_IS_WORD((int) ptr[0]) : 0;
431 return thisp == thatp;
432 #endif
433
434 }
435
436 return 0;
437 }
438
439 LOCAL(int)
SRE_CHARSET(SRE_CODE * set,SRE_CODE ch)440 SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
441 {
442 /* check if character is a member of the given set */
443
444 int ok = 1;
445
446 for (;;) {
447 switch (*set++) {
448
449 case SRE_OP_FAILURE:
450 return !ok;
451
452 case SRE_OP_LITERAL:
453 /* <LITERAL> <code> */
454 if (ch == set[0])
455 return ok;
456 set++;
457 break;
458
459 case SRE_OP_CATEGORY:
460 /* <CATEGORY> <code> */
461 if (sre_category(set[0], (int) ch))
462 return ok;
463 set += 1;
464 break;
465
466 case SRE_OP_CHARSET:
467 if (sizeof(SRE_CODE) == 2) {
468 /* <CHARSET> <bitmap> (16 bits per code word) */
469 if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
470 return ok;
471 set += 16;
472 }
473 else {
474 /* <CHARSET> <bitmap> (32 bits per code word) */
475 if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
476 return ok;
477 set += 8;
478 }
479 break;
480
481 case SRE_OP_RANGE:
482 /* <RANGE> <lower> <upper> */
483 if (set[0] <= ch && ch <= set[1])
484 return ok;
485 set += 2;
486 break;
487
488 case SRE_OP_NEGATE:
489 ok = !ok;
490 break;
491
492 case SRE_OP_BIGCHARSET:
493 /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
494 {
495 Py_ssize_t count, block;
496 count = *(set++);
497
498 if (sizeof(SRE_CODE) == 2) {
499 block = ((unsigned char*)set)[ch >> 8];
500 set += 128;
501 if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
502 return ok;
503 set += count*16;
504 }
505 else {
506 /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
507 * warnings when c's type supports only numbers < N+1 */
508 if (!(ch & ~65535))
509 block = ((unsigned char*)set)[ch >> 8];
510 else
511 block = -1;
512 set += 64;
513 if (block >=0 &&
514 (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
515 return ok;
516 set += count*8;
517 }
518 break;
519 }
520
521 default:
522 /* internal error -- there's not much we can do about it
523 here, so let's just pretend it didn't match... */
524 return 0;
525 }
526 }
527 }
528
529 LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
530
531 LOCAL(Py_ssize_t)
SRE_COUNT(SRE_STATE * state,SRE_CODE * pattern,Py_ssize_t maxcount)532 SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
533 {
534 SRE_CODE chr;
535 SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
536 SRE_CHAR* end = (SRE_CHAR *)state->end;
537 Py_ssize_t i;
538
539 /* adjust end */
540 if (maxcount < end - ptr && maxcount != 65535)
541 end = ptr + maxcount;
542
543 switch (pattern[0]) {
544
545 case SRE_OP_IN:
546 /* repeated set */
547 TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
548 while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
549 ptr++;
550 break;
551
552 case SRE_OP_ANY:
553 /* repeated dot wildcard. */
554 TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
555 while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
556 ptr++;
557 break;
558
559 case SRE_OP_ANY_ALL:
560 /* repeated dot wildcard. skip to the end of the target
561 string, and backtrack from there */
562 TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
563 ptr = end;
564 break;
565
566 case SRE_OP_LITERAL:
567 /* repeated literal */
568 chr = pattern[1];
569 TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
570 while (ptr < end && (SRE_CODE) *ptr == chr)
571 ptr++;
572 break;
573
574 case SRE_OP_LITERAL_IGNORE:
575 /* repeated literal */
576 chr = pattern[1];
577 TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
578 while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
579 ptr++;
580 break;
581
582 case SRE_OP_NOT_LITERAL:
583 /* repeated non-literal */
584 chr = pattern[1];
585 TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
586 while (ptr < end && (SRE_CODE) *ptr != chr)
587 ptr++;
588 break;
589
590 case SRE_OP_NOT_LITERAL_IGNORE:
591 /* repeated non-literal */
592 chr = pattern[1];
593 TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
594 while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
595 ptr++;
596 break;
597
598 default:
599 /* repeated single character pattern */
600 TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
601 while ((SRE_CHAR*) state->ptr < end) {
602 i = SRE_MATCH(state, pattern);
603 if (i < 0)
604 return i;
605 if (!i)
606 break;
607 }
608 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
609 (SRE_CHAR*) state->ptr - ptr));
610 return (SRE_CHAR*) state->ptr - ptr;
611 }
612
613 TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
614 return ptr - (SRE_CHAR*) state->ptr;
615 }
616
#if 0 /* not used in this release */
/* NOTE: this function is compiled out by the `#if 0` above. */
LOCAL(int)
SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
{
    /* check if an SRE_OP_INFO block matches at the current position.
       returns the number of SRE_CODE objects to skip if successful, 0
       if no match */

    SRE_CHAR* end = state->end;
    SRE_CHAR* ptr = state->ptr;
    Py_ssize_t i;

    /* check minimal length (pattern[3]; zero means "no minimum") */
    if (pattern[3] && (end - ptr) < pattern[3])
        return 0;

    /* check known prefix (only when the prefix is longer than one code) */
    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
        /* <length> <skip> <prefix data> <overlap data> */
        for (i = 0; i < pattern[5]; i++)
            if ((SRE_CODE) ptr[i] != pattern[7 + i])
                return 0;
        return pattern[0] + 2 * pattern[6];
    }
    return pattern[0];
}
#endif
644
645 /* The macros below should be used to protect recursive SRE_MATCH()
646 * calls that *failed* and do *not* return immediately (IOW, those
647 * that will backtrack). Explaining:
648 *
649 * - Recursive SRE_MATCH() returned true: that's usually a success
650 * (besides atypical cases like ASSERT_NOT), therefore there's no
651 * reason to restore lastmark;
652 *
653 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
654 * is returning to the caller: If the current SRE_MATCH() is the
655 * top function of the recursion, returning false will be a matching
656 * failure, and it doesn't matter where lastmark is pointing to.
657 * If it's *not* the top function, it will be a recursive SRE_MATCH()
658 * failure by itself, and the calling SRE_MATCH() will have to deal
659 * with the failure by the same rules explained here (it will restore
660 * lastmark by itself if necessary);
661 *
662 * - Recursive SRE_MATCH() returned false, and will continue the
663 * outside 'for' loop: must be protected when breaking, since the next
664 * OP could potentially depend on lastmark;
665 *
666 * - Recursive SRE_MATCH() returned false, and will be called again
667 * inside a local for/while loop: must be protected between each
668 * loop iteration, since the recursive SRE_MATCH() could do anything,
669 * and could potentially depend on lastmark.
670 *
671 * For more information, check the discussion at SF patch #712900.
672 */
/* Copy state->lastmark and state->lastindex into the current match
   context.  Requires `ctx` and `state` to be in scope. */
#define LASTMARK_SAVE() \
    do { \
        ctx->lastmark = state->lastmark; \
        ctx->lastindex = state->lastindex; \
    } while (0)
/* Copy the values saved by LASTMARK_SAVE() back into the state. */
#define LASTMARK_RESTORE() \
    do { \
        state->lastmark = ctx->lastmark; \
        state->lastindex = ctx->lastindex; \
    } while (0)
683
/* Exit helpers for the matcher.  They rely on the enclosing function
   having a variable `ret` and a label `exit`.
   RETURN_ERROR returns its argument directly from the function, while
   RETURN_FAILURE / RETURN_SUCCESS set ret to 0 / 1 and jump to `exit`. */
#define RETURN_ERROR(i) do { return (i); } while(0)
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)

/* Conditional variants, keyed on the sign of `i`: negative is an error,
   zero a failure, positive a success.  The argument is parenthesised so
   that an expression such as `x & 4` is compared as a whole, but it is
   still evaluated more than once -- pass side-effect-free expressions
   only. */
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
694
/* stringify the argument (used for trace output only) */
#define SFY(x) #x

/* Reserve sizeof(type) bytes at the top of the data stack and point `ptr`
   at them.  Expects the locals `alloc_pos`, `ctx_pos` and `ctx` in the
   enclosing function: alloc_pos receives the byte offset of the new
   object.  If the stack has to grow and that fails, the enclosing
   function returns the (negative) error code.  After a successful grow
   the stack may have moved, so `ctx` is re-derived from ctx_pos (unless
   ctx_pos is -1, i.e. no context exists yet). */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
    alloc_pos = state->data_stack_base; \
    TRACE(("allocating %s in %d (%d)\n", \
           SFY(type), alloc_pos, sizeof(type))); \
    if (state->data_stack_size < alloc_pos+sizeof(type)) { \
        int j = data_stack_grow(state, sizeof(type)); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    ptr = (type*)(state->data_stack+alloc_pos); \
    state->data_stack_base += sizeof(type); \
} while (0)

/* Point `ptr` at the object of the given type stored at byte offset `pos`
   in the data stack. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
    TRACE(("looking up %s at %d\n", SFY(type), pos)); \
    ptr = (type*)(state->data_stack+pos); \
} while (0)

/* Copy `size` bytes from `data` onto the top of the data stack, growing
   the stack first if needed (same grow/error/ctx re-lookup behaviour as
   DATA_STACK_ALLOC). */
#define DATA_STACK_PUSH(state, data, size) \
do { \
    TRACE(("copy data in %p to %d (%d)\n", \
           data, state->data_stack_base, size)); \
    if (state->data_stack_size < state->data_stack_base+size) { \
        int j = data_stack_grow(state, size); \
        if (j < 0) return j; \
        if (ctx_pos != -1) \
            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
    } \
    memcpy(state->data_stack+state->data_stack_base, data, size); \
    state->data_stack_base += size; \
} while (0)

/* Copy the top `size` bytes of the data stack into `data`; the bytes are
   removed from the stack only when `discard` is non-zero. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
    TRACE(("copy data to %p from %d (%d)\n", \
           data, state->data_stack_base-size, size)); \
    memcpy(data, state->data_stack+state->data_stack_base-size, size); \
    if (discard) \
        state->data_stack_base -= size; \
} while (0)

/* Drop the top `size` bytes of the data stack without copying them. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
    TRACE(("discard data from %d (%d)\n", \
           state->data_stack_base-size, size)); \
    state->data_stack_base -= size; \
} while(0)

/* Shorthands that operate on the local `state`; `x` is a pointer and the
   transfer size is sizeof(*x). */
#define DATA_PUSH(x) \
    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_DISCARD(x) \
    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
    DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
    DATA_STACK_LOOKUP_AT(state,t,p,pos)
758
/* Save / restore the first lastmark+1 entries of state->mark on the data
   stack.  Every macro is a no-op unless lastmark > 0.  They expand
   DATA_STACK_* and therefore need the same enclosing locals; MARK_PUSH
   additionally uses the local `i`. */

/* push state->mark[0..lastmark] */
#define MARK_PUSH(lastmark) \
    do if (lastmark > 0) { \
        i = lastmark; /* ctx->lastmark may change if reallocated */ \
        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
    } while (0)
/* copy the saved marks back into state->mark and remove them */
#define MARK_POP(lastmark) \
    do if (lastmark > 0) { \
        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
    } while (0)
/* copy the saved marks back into state->mark but leave them stacked */
#define MARK_POP_KEEP(lastmark) \
    do if (lastmark > 0) { \
        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
    } while (0)
/* remove the saved marks without restoring them */
#define MARK_POP_DISCARD(lastmark) \
    do if (lastmark > 0) { \
        DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
    } while (0)
776
/* Values for SRE_MATCH_CONTEXT.jump.  DO_JUMP stores one of these in the
   context it creates; JUMP_NONE is what the outermost context created at
   the start of SRE_MATCH carries. */
#define JUMP_NONE 0
#define JUMP_MAX_UNTIL_1 1
#define JUMP_MAX_UNTIL_2 2
#define JUMP_MAX_UNTIL_3 3
#define JUMP_MIN_UNTIL_1 4
#define JUMP_MIN_UNTIL_2 5
#define JUMP_MIN_UNTIL_3 6
#define JUMP_REPEAT 7
#define JUMP_REPEAT_ONE_1 8
#define JUMP_REPEAT_ONE_2 9
#define JUMP_MIN_REPEAT_ONE 10
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
791
/* Start matching `nextpattern` in a fresh context without a C-level
   recursive call: allocate a new SRE_MATCH_CONTEXT on the data stack,
   record in it the position of the current context and `jumpvalue`,
   make it the current context and jump to the `entrance` label.
   `jumplabel` is defined right after the goto, marking the point where
   execution resumes; presumably the exit path of SRE_MATCH dispatches on
   ctx->jump to get back here -- that code is outside this section.
   Requires the locals ctx, nextctx, ctx_pos and alloc_pos.  Note that
   the expansion is several statements, not one. */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = jumpvalue; \
    nextctx->pattern = nextpattern; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */ \
802
/* Per-invocation matcher state, stored on the state's data stack. */
typedef struct {
    /* data-stack offset of the context that created this one
       (-1 for the outermost context) */
    Py_ssize_t last_ctx_pos;
    /* one of the JUMP_* values, set when the context is created */
    Py_ssize_t jump;
    /* current position in the subject; initialised from state->ptr */
    SRE_CHAR* ptr;
    /* current position in the compiled pattern */
    SRE_CODE* pattern;
    /* repetition counter (e.g. the SRE_COUNT result in REPEAT_ONE) */
    Py_ssize_t count;
    /* copies of state->lastmark / state->lastindex taken by
       LASTMARK_SAVE() */
    Py_ssize_t lastmark;
    Py_ssize_t lastindex;
    /* opcode-specific scratch value */
    union {
        SRE_CODE chr;    /* literal that starts the tail (REPEAT_ONE) */
        SRE_REPEAT* rep; /* state->repeat as seen on entry (BRANCH) */
    } u;
} SRE_MATCH_CONTEXT;
816
817 /* check if string matches the given pattern. returns <0 for
818 error, 0 for failure, and 1 for success */
819 LOCAL(Py_ssize_t)
SRE_MATCH(SRE_STATE * state,SRE_CODE * pattern)820 SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
821 {
822 SRE_CHAR* end = (SRE_CHAR *)state->end;
823 Py_ssize_t alloc_pos, ctx_pos = -1;
824 Py_ssize_t i, ret = 0;
825 Py_ssize_t jump;
826 unsigned int sigcount=0;
827
828 SRE_MATCH_CONTEXT* ctx;
829 SRE_MATCH_CONTEXT* nextctx;
830
831 TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
832
833 DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
834 ctx->last_ctx_pos = -1;
835 ctx->jump = JUMP_NONE;
836 ctx->pattern = pattern;
837 ctx_pos = alloc_pos;
838
839 entrance:
840
841 ctx->ptr = (SRE_CHAR *)state->ptr;
842
843 if (ctx->pattern[0] == SRE_OP_INFO) {
844 /* optimization info block */
845 /* <INFO> <1=skip> <2=flags> <3=min> ... */
846 if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
847 TRACE(("reject (got %d chars, need %d)\n",
848 (end - ctx->ptr), ctx->pattern[3]));
849 RETURN_FAILURE;
850 }
851 ctx->pattern += ctx->pattern[1] + 1;
852 }
853
854 for (;;) {
855 ++sigcount;
856 if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
857 RETURN_ERROR(SRE_ERROR_INTERRUPTED);
858
859 switch (*ctx->pattern++) {
860
861 case SRE_OP_MARK:
862 /* set mark */
863 /* <MARK> <gid> */
864 TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
865 ctx->ptr, ctx->pattern[0]));
866 i = ctx->pattern[0];
867 if (i & 1)
868 state->lastindex = i/2 + 1;
869 if (i > state->lastmark) {
870 /* state->lastmark is the highest valid index in the
871 state->mark array. If it is increased by more than 1,
872 the intervening marks must be set to NULL to signal
873 that these marks have not been encountered. */
874 Py_ssize_t j = state->lastmark + 1;
875 while (j < i)
876 state->mark[j++] = NULL;
877 state->lastmark = i;
878 }
879 state->mark[i] = ctx->ptr;
880 ctx->pattern++;
881 break;
882
883 case SRE_OP_LITERAL:
884 /* match literal string */
885 /* <LITERAL> <code> */
886 TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
887 ctx->ptr, *ctx->pattern));
888 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
889 RETURN_FAILURE;
890 ctx->pattern++;
891 ctx->ptr++;
892 break;
893
894 case SRE_OP_NOT_LITERAL:
895 /* match anything that is not literal character */
896 /* <NOT_LITERAL> <code> */
897 TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
898 ctx->ptr, *ctx->pattern));
899 if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
900 RETURN_FAILURE;
901 ctx->pattern++;
902 ctx->ptr++;
903 break;
904
905 case SRE_OP_SUCCESS:
906 /* end of pattern */
907 TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
908 state->ptr = ctx->ptr;
909 RETURN_SUCCESS;
910
911 case SRE_OP_AT:
912 /* match at given position */
913 /* <AT> <code> */
914 TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
915 if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
916 RETURN_FAILURE;
917 ctx->pattern++;
918 break;
919
920 case SRE_OP_CATEGORY:
921 /* match at given category */
922 /* <CATEGORY> <code> */
923 TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
924 ctx->ptr, *ctx->pattern));
925 if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
926 RETURN_FAILURE;
927 ctx->pattern++;
928 ctx->ptr++;
929 break;
930
931 case SRE_OP_ANY:
932 /* match anything (except a newline) */
933 /* <ANY> */
934 TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
935 if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
936 RETURN_FAILURE;
937 ctx->ptr++;
938 break;
939
940 case SRE_OP_ANY_ALL:
941 /* match anything */
942 /* <ANY_ALL> */
943 TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
944 if (ctx->ptr >= end)
945 RETURN_FAILURE;
946 ctx->ptr++;
947 break;
948
949 case SRE_OP_IN:
950 /* match set member (or non_member) */
951 /* <IN> <skip> <set> */
952 TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
953 if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
954 RETURN_FAILURE;
955 ctx->pattern += ctx->pattern[0];
956 ctx->ptr++;
957 break;
958
959 case SRE_OP_LITERAL_IGNORE:
960 TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
961 ctx->pattern, ctx->ptr, ctx->pattern[0]));
962 if (ctx->ptr >= end ||
963 state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
964 RETURN_FAILURE;
965 ctx->pattern++;
966 ctx->ptr++;
967 break;
968
969 case SRE_OP_NOT_LITERAL_IGNORE:
970 TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
971 ctx->pattern, ctx->ptr, *ctx->pattern));
972 if (ctx->ptr >= end ||
973 state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
974 RETURN_FAILURE;
975 ctx->pattern++;
976 ctx->ptr++;
977 break;
978
979 case SRE_OP_IN_IGNORE:
980 TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
981 if (ctx->ptr >= end
982 || !SRE_CHARSET(ctx->pattern+1,
983 (SRE_CODE)state->lower(*ctx->ptr)))
984 RETURN_FAILURE;
985 ctx->pattern += ctx->pattern[0];
986 ctx->ptr++;
987 break;
988
989 case SRE_OP_JUMP:
990 case SRE_OP_INFO:
991 /* jump forward */
992 /* <JUMP> <offset> */
993 TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
994 ctx->ptr, ctx->pattern[0]));
995 ctx->pattern += ctx->pattern[0];
996 break;
997
998 case SRE_OP_BRANCH:
999 /* alternation */
1000 /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
1001 TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
1002 LASTMARK_SAVE();
1003 ctx->u.rep = state->repeat;
1004 if (ctx->u.rep)
1005 MARK_PUSH(ctx->lastmark);
1006 for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1007 if (ctx->pattern[1] == SRE_OP_LITERAL &&
1008 (ctx->ptr >= end ||
1009 (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1010 continue;
1011 if (ctx->pattern[1] == SRE_OP_IN &&
1012 (ctx->ptr >= end ||
1013 !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1014 continue;
1015 state->ptr = ctx->ptr;
1016 DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1017 if (ret) {
1018 if (ctx->u.rep)
1019 MARK_POP_DISCARD(ctx->lastmark);
1020 RETURN_ON_ERROR(ret);
1021 RETURN_SUCCESS;
1022 }
1023 if (ctx->u.rep)
1024 MARK_POP_KEEP(ctx->lastmark);
1025 LASTMARK_RESTORE();
1026 }
1027 if (ctx->u.rep)
1028 MARK_POP_DISCARD(ctx->lastmark);
1029 RETURN_FAILURE;
1030
1031 case SRE_OP_REPEAT_ONE:
1032 /* match repeated sequence (maximizing regexp) */
1033
1034 /* this operator only works if the repeated item is
1035 exactly one character wide, and we're not already
1036 collecting backtracking points. for other cases,
1037 use the MAX_REPEAT operator */
1038
1039 /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1040
1041 TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1042 ctx->pattern[1], ctx->pattern[2]));
1043
1044 if (ctx->ptr + ctx->pattern[1] > end)
1045 RETURN_FAILURE; /* cannot match */
1046
1047 state->ptr = ctx->ptr;
1048
1049 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1050 RETURN_ON_ERROR(ret);
1051 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1052 ctx->count = ret;
1053 ctx->ptr += ctx->count;
1054
1055 /* when we arrive here, count contains the number of
1056 matches, and ctx->ptr points to the tail of the target
1057 string. check if the rest of the pattern matches,
1058 and backtrack if not. */
1059
1060 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1061 RETURN_FAILURE;
1062
1063 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1064 /* tail is empty. we're finished */
1065 state->ptr = ctx->ptr;
1066 RETURN_SUCCESS;
1067 }
1068
1069 LASTMARK_SAVE();
1070
1071 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1072 /* tail starts with a literal. skip positions where
1073 the rest of the pattern cannot possibly match */
1074 ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1075 for (;;) {
1076 while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1077 (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1078 ctx->ptr--;
1079 ctx->count--;
1080 }
1081 if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1082 break;
1083 state->ptr = ctx->ptr;
1084 DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1085 ctx->pattern+ctx->pattern[0]);
1086 if (ret) {
1087 RETURN_ON_ERROR(ret);
1088 RETURN_SUCCESS;
1089 }
1090
1091 LASTMARK_RESTORE();
1092
1093 ctx->ptr--;
1094 ctx->count--;
1095 }
1096
1097 } else {
1098 /* general case */
1099 while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1100 state->ptr = ctx->ptr;
1101 DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1102 ctx->pattern+ctx->pattern[0]);
1103 if (ret) {
1104 RETURN_ON_ERROR(ret);
1105 RETURN_SUCCESS;
1106 }
1107 ctx->ptr--;
1108 ctx->count--;
1109 LASTMARK_RESTORE();
1110 }
1111 }
1112 RETURN_FAILURE;
1113
1114 case SRE_OP_MIN_REPEAT_ONE:
1115 /* match repeated sequence (minimizing regexp) */
1116
1117 /* this operator only works if the repeated item is
1118 exactly one character wide, and we're not already
1119 collecting backtracking points. for other cases,
1120 use the MIN_REPEAT operator */
1121
1122 /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1123
1124 TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1125 ctx->pattern[1], ctx->pattern[2]));
1126
1127 if (ctx->ptr + ctx->pattern[1] > end)
1128 RETURN_FAILURE; /* cannot match */
1129
1130 state->ptr = ctx->ptr;
1131
1132 if (ctx->pattern[1] == 0)
1133 ctx->count = 0;
1134 else {
1135 /* count using pattern min as the maximum */
1136 ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1137 RETURN_ON_ERROR(ret);
1138 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1139 if (ret < (Py_ssize_t) ctx->pattern[1])
1140 /* didn't match minimum number of times */
1141 RETURN_FAILURE;
1142 /* advance past minimum matches of repeat */
1143 ctx->count = ret;
1144 ctx->ptr += ctx->count;
1145 }
1146
1147 if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1148 /* tail is empty. we're finished */
1149 state->ptr = ctx->ptr;
1150 RETURN_SUCCESS;
1151
1152 } else {
1153 /* general case */
1154 LASTMARK_SAVE();
1155 while ((Py_ssize_t)ctx->pattern[2] == 65535
1156 || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1157 state->ptr = ctx->ptr;
1158 DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1159 ctx->pattern+ctx->pattern[0]);
1160 if (ret) {
1161 RETURN_ON_ERROR(ret);
1162 RETURN_SUCCESS;
1163 }
1164 state->ptr = ctx->ptr;
1165 ret = SRE_COUNT(state, ctx->pattern+3, 1);
1166 RETURN_ON_ERROR(ret);
1167 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1168 if (ret == 0)
1169 break;
1170 assert(ret == 1);
1171 ctx->ptr++;
1172 ctx->count++;
1173 LASTMARK_RESTORE();
1174 }
1175 }
1176 RETURN_FAILURE;
1177
1178 case SRE_OP_REPEAT:
1179 /* create repeat context. all the hard work is done
1180 by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1181 /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1182 TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1183 ctx->pattern[1], ctx->pattern[2]));
1184
1185 /* install new repeat context */
1186 ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1187 if (!ctx->u.rep) {
1188 PyErr_NoMemory();
1189 RETURN_FAILURE;
1190 }
1191 ctx->u.rep->count = -1;
1192 ctx->u.rep->pattern = ctx->pattern;
1193 ctx->u.rep->prev = state->repeat;
1194 ctx->u.rep->last_ptr = NULL;
1195 state->repeat = ctx->u.rep;
1196
1197 state->ptr = ctx->ptr;
1198 DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1199 state->repeat = ctx->u.rep->prev;
1200 PyObject_FREE(ctx->u.rep);
1201
1202 if (ret) {
1203 RETURN_ON_ERROR(ret);
1204 RETURN_SUCCESS;
1205 }
1206 RETURN_FAILURE;
1207
1208 case SRE_OP_MAX_UNTIL:
1209 /* maximizing repeat */
1210 /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1211
1212 /* FIXME: we probably need to deal with zero-width
1213 matches in here... */
1214
1215 ctx->u.rep = state->repeat;
1216 if (!ctx->u.rep)
1217 RETURN_ERROR(SRE_ERROR_STATE);
1218
1219 state->ptr = ctx->ptr;
1220
1221 ctx->count = ctx->u.rep->count+1;
1222
1223 TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1224 ctx->ptr, ctx->count));
1225
1226 if (ctx->count < ctx->u.rep->pattern[1]) {
1227 /* not enough matches */
1228 ctx->u.rep->count = ctx->count;
1229 DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1230 ctx->u.rep->pattern+3);
1231 if (ret) {
1232 RETURN_ON_ERROR(ret);
1233 RETURN_SUCCESS;
1234 }
1235 ctx->u.rep->count = ctx->count-1;
1236 state->ptr = ctx->ptr;
1237 RETURN_FAILURE;
1238 }
1239
1240 if ((ctx->count < ctx->u.rep->pattern[2] ||
1241 ctx->u.rep->pattern[2] == 65535) &&
1242 state->ptr != ctx->u.rep->last_ptr) {
1243 /* we may have enough matches, but if we can
1244 match another item, do so */
1245 ctx->u.rep->count = ctx->count;
1246 LASTMARK_SAVE();
1247 MARK_PUSH(ctx->lastmark);
1248 /* zero-width match protection */
1249 DATA_PUSH(&ctx->u.rep->last_ptr);
1250 ctx->u.rep->last_ptr = state->ptr;
1251 DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1252 ctx->u.rep->pattern+3);
1253 DATA_POP(&ctx->u.rep->last_ptr);
1254 if (ret) {
1255 MARK_POP_DISCARD(ctx->lastmark);
1256 RETURN_ON_ERROR(ret);
1257 RETURN_SUCCESS;
1258 }
1259 MARK_POP(ctx->lastmark);
1260 LASTMARK_RESTORE();
1261 ctx->u.rep->count = ctx->count-1;
1262 state->ptr = ctx->ptr;
1263 }
1264
1265 /* cannot match more repeated items here. make sure the
1266 tail matches */
1267 state->repeat = ctx->u.rep->prev;
1268 DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1269 RETURN_ON_SUCCESS(ret);
1270 state->repeat = ctx->u.rep;
1271 state->ptr = ctx->ptr;
1272 RETURN_FAILURE;
1273
1274 case SRE_OP_MIN_UNTIL:
1275 /* minimizing repeat */
1276 /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1277
1278 ctx->u.rep = state->repeat;
1279 if (!ctx->u.rep)
1280 RETURN_ERROR(SRE_ERROR_STATE);
1281
1282 state->ptr = ctx->ptr;
1283
1284 ctx->count = ctx->u.rep->count+1;
1285
1286 TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1287 ctx->ptr, ctx->count, ctx->u.rep->pattern));
1288
1289 if (ctx->count < ctx->u.rep->pattern[1]) {
1290 /* not enough matches */
1291 ctx->u.rep->count = ctx->count;
1292 DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1293 ctx->u.rep->pattern+3);
1294 if (ret) {
1295 RETURN_ON_ERROR(ret);
1296 RETURN_SUCCESS;
1297 }
1298 ctx->u.rep->count = ctx->count-1;
1299 state->ptr = ctx->ptr;
1300 RETURN_FAILURE;
1301 }
1302
1303 LASTMARK_SAVE();
1304
1305 /* see if the tail matches */
1306 state->repeat = ctx->u.rep->prev;
1307 DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1308 if (ret) {
1309 RETURN_ON_ERROR(ret);
1310 RETURN_SUCCESS;
1311 }
1312
1313 state->repeat = ctx->u.rep;
1314 state->ptr = ctx->ptr;
1315
1316 LASTMARK_RESTORE();
1317
1318 if (ctx->count >= ctx->u.rep->pattern[2]
1319 && ctx->u.rep->pattern[2] != 65535)
1320 RETURN_FAILURE;
1321
1322 ctx->u.rep->count = ctx->count;
1323 DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1324 ctx->u.rep->pattern+3);
1325 if (ret) {
1326 RETURN_ON_ERROR(ret);
1327 RETURN_SUCCESS;
1328 }
1329 ctx->u.rep->count = ctx->count-1;
1330 state->ptr = ctx->ptr;
1331 RETURN_FAILURE;
1332
1333 case SRE_OP_GROUPREF:
1334 /* match backreference */
1335 TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1336 ctx->ptr, ctx->pattern[0]));
1337 i = ctx->pattern[0];
1338 {
1339 Py_ssize_t groupref = i+i;
1340 if (groupref >= state->lastmark) {
1341 RETURN_FAILURE;
1342 } else {
1343 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1344 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1345 if (!p || !e || e < p)
1346 RETURN_FAILURE;
1347 while (p < e) {
1348 if (ctx->ptr >= end || *ctx->ptr != *p)
1349 RETURN_FAILURE;
1350 p++; ctx->ptr++;
1351 }
1352 }
1353 }
1354 ctx->pattern++;
1355 break;
1356
1357 case SRE_OP_GROUPREF_IGNORE:
1358 /* match backreference */
1359 TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1360 ctx->ptr, ctx->pattern[0]));
1361 i = ctx->pattern[0];
1362 {
1363 Py_ssize_t groupref = i+i;
1364 if (groupref >= state->lastmark) {
1365 RETURN_FAILURE;
1366 } else {
1367 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1368 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1369 if (!p || !e || e < p)
1370 RETURN_FAILURE;
1371 while (p < e) {
1372 if (ctx->ptr >= end ||
1373 state->lower(*ctx->ptr) != state->lower(*p))
1374 RETURN_FAILURE;
1375 p++; ctx->ptr++;
1376 }
1377 }
1378 }
1379 ctx->pattern++;
1380 break;
1381
1382 case SRE_OP_GROUPREF_EXISTS:
1383 TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1384 ctx->ptr, ctx->pattern[0]));
1385 /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1386 i = ctx->pattern[0];
1387 {
1388 Py_ssize_t groupref = i+i;
1389 if (groupref >= state->lastmark) {
1390 ctx->pattern += ctx->pattern[1];
1391 break;
1392 } else {
1393 SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1394 SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1395 if (!p || !e || e < p) {
1396 ctx->pattern += ctx->pattern[1];
1397 break;
1398 }
1399 }
1400 }
1401 ctx->pattern += 2;
1402 break;
1403
1404 case SRE_OP_ASSERT:
1405 /* assert subpattern */
1406 /* <ASSERT> <skip> <back> <pattern> */
1407 TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1408 ctx->ptr, ctx->pattern[1]));
1409 state->ptr = ctx->ptr - ctx->pattern[1];
1410 if (state->ptr < state->beginning)
1411 RETURN_FAILURE;
1412 DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1413 RETURN_ON_FAILURE(ret);
1414 ctx->pattern += ctx->pattern[0];
1415 break;
1416
1417 case SRE_OP_ASSERT_NOT:
1418 /* assert not subpattern */
1419 /* <ASSERT_NOT> <skip> <back> <pattern> */
1420 TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1421 ctx->ptr, ctx->pattern[1]));
1422 state->ptr = ctx->ptr - ctx->pattern[1];
1423 if (state->ptr >= state->beginning) {
1424 DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1425 if (ret) {
1426 RETURN_ON_ERROR(ret);
1427 RETURN_FAILURE;
1428 }
1429 }
1430 ctx->pattern += ctx->pattern[0];
1431 break;
1432
1433 case SRE_OP_FAILURE:
1434 /* immediate failure */
1435 TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1436 RETURN_FAILURE;
1437
1438 default:
1439 TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1440 ctx->pattern[-1]));
1441 RETURN_ERROR(SRE_ERROR_ILLEGAL);
1442 }
1443 }
1444
1445 exit:
1446 ctx_pos = ctx->last_ctx_pos;
1447 jump = ctx->jump;
1448 DATA_POP_DISCARD(ctx);
1449 if (ctx_pos == -1)
1450 return ret;
1451 DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1452
1453 switch (jump) {
1454 case JUMP_MAX_UNTIL_2:
1455 TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1456 goto jump_max_until_2;
1457 case JUMP_MAX_UNTIL_3:
1458 TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1459 goto jump_max_until_3;
1460 case JUMP_MIN_UNTIL_2:
1461 TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1462 goto jump_min_until_2;
1463 case JUMP_MIN_UNTIL_3:
1464 TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1465 goto jump_min_until_3;
1466 case JUMP_BRANCH:
1467 TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1468 goto jump_branch;
1469 case JUMP_MAX_UNTIL_1:
1470 TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1471 goto jump_max_until_1;
1472 case JUMP_MIN_UNTIL_1:
1473 TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1474 goto jump_min_until_1;
1475 case JUMP_REPEAT:
1476 TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1477 goto jump_repeat;
1478 case JUMP_REPEAT_ONE_1:
1479 TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1480 goto jump_repeat_one_1;
1481 case JUMP_REPEAT_ONE_2:
1482 TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1483 goto jump_repeat_one_2;
1484 case JUMP_MIN_REPEAT_ONE:
1485 TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1486 goto jump_min_repeat_one;
1487 case JUMP_ASSERT:
1488 TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1489 goto jump_assert;
1490 case JUMP_ASSERT_NOT:
1491 TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1492 goto jump_assert_not;
1493 case JUMP_NONE:
1494 TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1495 break;
1496 }
1497
1498 return ret; /* should never get here */
1499 }
1500
1501 LOCAL(Py_ssize_t)
SRE_SEARCH(SRE_STATE * state,SRE_CODE * pattern)1502 SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1503 {
1504 SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1505 SRE_CHAR* end = (SRE_CHAR *)state->end;
1506 Py_ssize_t status = 0;
1507 Py_ssize_t prefix_len = 0;
1508 Py_ssize_t prefix_skip = 0;
1509 SRE_CODE* prefix = NULL;
1510 SRE_CODE* charset = NULL;
1511 SRE_CODE* overlap = NULL;
1512 int flags = 0;
1513
1514 if (pattern[0] == SRE_OP_INFO) {
1515 /* optimization info block */
1516 /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
1517
1518 flags = pattern[2];
1519
1520 if (pattern[3] > 1) {
1521 /* adjust end point (but make sure we leave at least one
1522 character in there, so literal search will work) */
1523 end -= pattern[3]-1;
1524 if (end <= ptr)
1525 end = ptr+1;
1526 }
1527
1528 if (flags & SRE_INFO_PREFIX) {
1529 /* pattern starts with a known prefix */
1530 /* <length> <skip> <prefix data> <overlap data> */
1531 prefix_len = pattern[5];
1532 prefix_skip = pattern[6];
1533 prefix = pattern + 7;
1534 overlap = prefix + prefix_len - 1;
1535 } else if (flags & SRE_INFO_CHARSET)
1536 /* pattern starts with a character from a known set */
1537 /* <charset> */
1538 charset = pattern + 5;
1539
1540 pattern += 1 + pattern[1];
1541 }
1542
1543 TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1544 TRACE(("charset = %p\n", charset));
1545
1546 #if defined(USE_FAST_SEARCH)
1547 if (prefix_len > 1) {
1548 /* pattern starts with a known prefix. use the overlap
1549 table to skip forward as fast as we possibly can */
1550 Py_ssize_t i = 0;
1551 end = (SRE_CHAR *)state->end;
1552 while (ptr < end) {
1553 for (;;) {
1554 if ((SRE_CODE) ptr[0] != prefix[i]) {
1555 if (!i)
1556 break;
1557 else
1558 i = overlap[i];
1559 } else {
1560 if (++i == prefix_len) {
1561 /* found a potential match */
1562 TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1563 state->start = ptr + 1 - prefix_len;
1564 state->ptr = ptr + 1 - prefix_len + prefix_skip;
1565 if (flags & SRE_INFO_LITERAL)
1566 return 1; /* we got all of it */
1567 status = SRE_MATCH(state, pattern + 2*prefix_skip);
1568 if (status != 0)
1569 return status;
1570 /* close but no cigar -- try again */
1571 i = overlap[i];
1572 }
1573 break;
1574 }
1575 }
1576 ptr++;
1577 }
1578 return 0;
1579 }
1580 #endif
1581
1582 if (pattern[0] == SRE_OP_LITERAL) {
1583 /* pattern starts with a literal character. this is used
1584 for short prefixes, and if fast search is disabled */
1585 SRE_CODE chr = pattern[1];
1586 end = (SRE_CHAR *)state->end;
1587 for (;;) {
1588 while (ptr < end && (SRE_CODE) ptr[0] != chr)
1589 ptr++;
1590 if (ptr >= end)
1591 return 0;
1592 TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1593 state->start = ptr;
1594 state->ptr = ++ptr;
1595 if (flags & SRE_INFO_LITERAL)
1596 return 1; /* we got all of it */
1597 status = SRE_MATCH(state, pattern + 2);
1598 if (status != 0)
1599 break;
1600 }
1601 } else if (charset) {
1602 /* pattern starts with a character from a known set */
1603 end = (SRE_CHAR *)state->end;
1604 for (;;) {
1605 while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1606 ptr++;
1607 if (ptr >= end)
1608 return 0;
1609 TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1610 state->start = ptr;
1611 state->ptr = ptr;
1612 status = SRE_MATCH(state, pattern);
1613 if (status != 0)
1614 break;
1615 ptr++;
1616 }
1617 } else
1618 /* general case */
1619 while (ptr <= end) {
1620 TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1621 state->start = state->ptr = ptr++;
1622 status = SRE_MATCH(state, pattern);
1623 if (status != 0)
1624 break;
1625 }
1626
1627 return status;
1628 }
1629
1630 LOCAL(int)
SRE_LITERAL_TEMPLATE(SRE_CHAR * ptr,Py_ssize_t len)1631 SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1632 {
1633 /* check if given string is a literal template (i.e. no escapes) */
1634 while (len-- > 0)
1635 if (*ptr++ == '\\')
1636 return 0;
1637 return 1;
1638 }
1639
1640 #if !defined(SRE_RECURSIVE)
1641
1642 /* -------------------------------------------------------------------- */
1643 /* factories and destructors */
1644
1645 /* see sre.h for object declarations */
1646 static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1647 static PyObject*pattern_scanner(PatternObject*, PyObject*);
1648
1649 static PyObject *
sre_codesize(PyObject * self,PyObject * unused)1650 sre_codesize(PyObject* self, PyObject *unused)
1651 {
1652 return Py_BuildValue("l", sizeof(SRE_CODE));
1653 }
1654
1655 static PyObject *
sre_getlower(PyObject * self,PyObject * args)1656 sre_getlower(PyObject* self, PyObject* args)
1657 {
1658 int character, flags;
1659 if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1660 return NULL;
1661 if (flags & SRE_FLAG_LOCALE)
1662 return Py_BuildValue("i", sre_lower_locale(character));
1663 if (flags & SRE_FLAG_UNICODE)
1664 #if defined(HAVE_UNICODE)
1665 return Py_BuildValue("i", sre_lower_unicode(character));
1666 #else
1667 return Py_BuildValue("i", sre_lower_locale(character));
1668 #endif
1669 return Py_BuildValue("i", sre_lower(character));
1670 }
1671
1672 LOCAL(void)
state_reset(SRE_STATE * state)1673 state_reset(SRE_STATE* state)
1674 {
1675 /* FIXME: dynamic! */
1676 /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1677
1678 state->lastmark = -1;
1679 state->lastindex = -1;
1680
1681 state->repeat = NULL;
1682
1683 data_stack_dealloc(state);
1684 }
1685
1686 static void*
getstring(PyObject * string,Py_ssize_t * p_length,int * p_charsize)1687 getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1688 {
1689 /* given a python object, return a data pointer, a length (in
1690 characters), and a character size. return NULL if the object
1691 is not a string (or not compatible) */
1692
1693 PyBufferProcs *buffer;
1694 Py_ssize_t size, bytes;
1695 int charsize;
1696 void* ptr;
1697
1698 #if defined(HAVE_UNICODE)
1699 if (PyUnicode_Check(string)) {
1700 /* unicode strings doesn't always support the buffer interface */
1701 ptr = (void*) PyUnicode_AS_DATA(string);
1702 /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1703 size = PyUnicode_GET_SIZE(string);
1704 charsize = sizeof(Py_UNICODE);
1705
1706 } else {
1707 #endif
1708
1709 /* get pointer to string buffer */
1710 buffer = Py_TYPE(string)->tp_as_buffer;
1711 if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1712 buffer->bf_getsegcount(string, NULL) != 1) {
1713 PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1714 return NULL;
1715 }
1716
1717 /* determine buffer size */
1718 bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1719 if (bytes < 0) {
1720 PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1721 return NULL;
1722 }
1723
1724 /* determine character size */
1725 #if PY_VERSION_HEX >= 0x01060000
1726 size = PyObject_Size(string);
1727 #else
1728 size = PyObject_Length(string);
1729 #endif
1730
1731 if (PyString_Check(string) || bytes == size)
1732 charsize = 1;
1733 #if defined(HAVE_UNICODE)
1734 else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1735 charsize = sizeof(Py_UNICODE);
1736 #endif
1737 else {
1738 PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1739 return NULL;
1740 }
1741
1742 #if defined(HAVE_UNICODE)
1743 }
1744 #endif
1745
1746 *p_length = size;
1747 *p_charsize = charsize;
1748
1749 return ptr;
1750 }
1751
1752 LOCAL(PyObject*)
state_init(SRE_STATE * state,PatternObject * pattern,PyObject * string,Py_ssize_t start,Py_ssize_t end)1753 state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1754 Py_ssize_t start, Py_ssize_t end)
1755 {
1756 /* prepare state object */
1757
1758 Py_ssize_t length;
1759 int charsize;
1760 void* ptr;
1761
1762 memset(state, 0, sizeof(SRE_STATE));
1763
1764 state->lastmark = -1;
1765 state->lastindex = -1;
1766
1767 ptr = getstring(string, &length, &charsize);
1768 if (!ptr)
1769 return NULL;
1770
1771 /* adjust boundaries */
1772 if (start < 0)
1773 start = 0;
1774 else if (start > length)
1775 start = length;
1776
1777 if (end < 0)
1778 end = 0;
1779 else if (end > length)
1780 end = length;
1781
1782 state->charsize = charsize;
1783
1784 state->beginning = ptr;
1785
1786 state->start = (void*) ((char*) ptr + start * state->charsize);
1787 state->end = (void*) ((char*) ptr + end * state->charsize);
1788
1789 Py_INCREF(string);
1790 state->string = string;
1791 state->pos = start;
1792 state->endpos = end;
1793
1794 if (pattern->flags & SRE_FLAG_LOCALE)
1795 state->lower = sre_lower_locale;
1796 else if (pattern->flags & SRE_FLAG_UNICODE)
1797 #if defined(HAVE_UNICODE)
1798 state->lower = sre_lower_unicode;
1799 #else
1800 state->lower = sre_lower_locale;
1801 #endif
1802 else
1803 state->lower = sre_lower;
1804
1805 return string;
1806 }
1807
1808 LOCAL(void)
state_fini(SRE_STATE * state)1809 state_fini(SRE_STATE* state)
1810 {
1811 Py_XDECREF(state->string);
1812 data_stack_dealloc(state);
1813 }
1814
/* Convert `member`, a pointer into the subject buffer, to an offset in
   characters from the start of the string: the byte distance from
   (state)->beginning divided by (state)->charsize. */
#define STATE_OFFSET(state, member) \
    ( ((char*)(member) - (char*)((state)->beginning)) / (state)->charsize )
1818
1819 LOCAL(PyObject*)
state_getslice(SRE_STATE * state,Py_ssize_t index,PyObject * string,int empty)1820 state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1821 {
1822 Py_ssize_t i, j;
1823
1824 index = (index - 1) * 2;
1825
1826 if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1827 if (empty)
1828 /* want empty string */
1829 i = j = 0;
1830 else {
1831 Py_INCREF(Py_None);
1832 return Py_None;
1833 }
1834 } else {
1835 i = STATE_OFFSET(state, state->mark[index]);
1836 j = STATE_OFFSET(state, state->mark[index+1]);
1837 }
1838
1839 return PySequence_GetSlice(string, i, j);
1840 }
1841
1842 static void
pattern_error(int status)1843 pattern_error(int status)
1844 {
1845 switch (status) {
1846 case SRE_ERROR_RECURSION_LIMIT:
1847 PyErr_SetString(
1848 PyExc_RuntimeError,
1849 "maximum recursion limit exceeded"
1850 );
1851 break;
1852 case SRE_ERROR_MEMORY:
1853 PyErr_NoMemory();
1854 break;
1855 case SRE_ERROR_INTERRUPTED:
1856 /* An exception has already been raised, so let it fly */
1857 break;
1858 default:
1859 /* other error codes indicate compiler/engine bugs */
1860 PyErr_SetString(
1861 PyExc_RuntimeError,
1862 "internal error in regular expression engine"
1863 );
1864 }
1865 }
1866
1867 static void
pattern_dealloc(PatternObject * self)1868 pattern_dealloc(PatternObject* self)
1869 {
1870 if (self->weakreflist != NULL)
1871 PyObject_ClearWeakRefs((PyObject *) self);
1872 Py_XDECREF(self->pattern);
1873 Py_XDECREF(self->groupindex);
1874 Py_XDECREF(self->indexgroup);
1875 PyObject_DEL(self);
1876 }
1877
1878 static PyObject*
pattern_match(PatternObject * self,PyObject * args,PyObject * kw)1879 pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1880 {
1881 SRE_STATE state;
1882 int status;
1883
1884 PyObject* string;
1885 Py_ssize_t start = 0;
1886 Py_ssize_t end = PY_SSIZE_T_MAX;
1887 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1888 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1889 &string, &start, &end))
1890 return NULL;
1891
1892 string = state_init(&state, self, string, start, end);
1893 if (!string)
1894 return NULL;
1895
1896 state.ptr = state.start;
1897
1898 TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1899
1900 if (state.charsize == 1) {
1901 status = sre_match(&state, PatternObject_GetCode(self));
1902 } else {
1903 #if defined(HAVE_UNICODE)
1904 status = sre_umatch(&state, PatternObject_GetCode(self));
1905 #endif
1906 }
1907
1908 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1909 if (PyErr_Occurred())
1910 return NULL;
1911
1912 state_fini(&state);
1913
1914 return pattern_new_match(self, &state, status);
1915 }
1916
1917 static PyObject*
pattern_search(PatternObject * self,PyObject * args,PyObject * kw)1918 pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1919 {
1920 SRE_STATE state;
1921 int status;
1922
1923 PyObject* string;
1924 Py_ssize_t start = 0;
1925 Py_ssize_t end = PY_SSIZE_T_MAX;
1926 static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1927 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1928 &string, &start, &end))
1929 return NULL;
1930
1931 string = state_init(&state, self, string, start, end);
1932 if (!string)
1933 return NULL;
1934
1935 TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1936
1937 if (state.charsize == 1) {
1938 status = sre_search(&state, PatternObject_GetCode(self));
1939 } else {
1940 #if defined(HAVE_UNICODE)
1941 status = sre_usearch(&state, PatternObject_GetCode(self));
1942 #endif
1943 }
1944
1945 TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1946
1947 state_fini(&state);
1948
1949 if (PyErr_Occurred())
1950 return NULL;
1951
1952 return pattern_new_match(self, &state, status);
1953 }
1954
1955 static PyObject*
call(char * module,char * function,PyObject * args)1956 call(char* module, char* function, PyObject* args)
1957 {
1958 PyObject* name;
1959 PyObject* mod;
1960 PyObject* func;
1961 PyObject* result;
1962
1963 if (!args)
1964 return NULL;
1965 name = PyString_FromString(module);
1966 if (!name)
1967 return NULL;
1968 mod = PyImport_Import(name);
1969 Py_DECREF(name);
1970 if (!mod)
1971 return NULL;
1972 func = PyObject_GetAttrString(mod, function);
1973 Py_DECREF(mod);
1974 if (!func)
1975 return NULL;
1976 result = PyObject_CallObject(func, args);
1977 Py_DECREF(func);
1978 Py_DECREF(args);
1979 return result;
1980 }
1981
1982 #ifdef USE_BUILTIN_COPY
1983 static int
deepcopy(PyObject ** object,PyObject * memo)1984 deepcopy(PyObject** object, PyObject* memo)
1985 {
1986 PyObject* copy;
1987
1988 copy = call(
1989 "copy", "deepcopy",
1990 PyTuple_Pack(2, *object, memo)
1991 );
1992 if (!copy)
1993 return 0;
1994
1995 Py_DECREF(*object);
1996 *object = copy;
1997
1998 return 1; /* success */
1999 }
2000 #endif
2001
2002 static PyObject*
join_list(PyObject * list,PyObject * string)2003 join_list(PyObject* list, PyObject* string)
2004 {
2005 /* join list elements */
2006
2007 PyObject* joiner;
2008 #if PY_VERSION_HEX >= 0x01060000
2009 PyObject* function;
2010 PyObject* args;
2011 #endif
2012 PyObject* result;
2013
2014 joiner = PySequence_GetSlice(string, 0, 0);
2015 if (!joiner)
2016 return NULL;
2017
2018 if (PyList_GET_SIZE(list) == 0) {
2019 Py_DECREF(list);
2020 return joiner;
2021 }
2022
2023 #if PY_VERSION_HEX >= 0x01060000
2024 function = PyObject_GetAttrString(joiner, "join");
2025 if (!function) {
2026 Py_DECREF(joiner);
2027 return NULL;
2028 }
2029 args = PyTuple_New(1);
2030 if (!args) {
2031 Py_DECREF(function);
2032 Py_DECREF(joiner);
2033 return NULL;
2034 }
2035 PyTuple_SET_ITEM(args, 0, list);
2036 result = PyObject_CallObject(function, args);
2037 Py_DECREF(args); /* also removes list */
2038 Py_DECREF(function);
2039 #else
2040 result = call(
2041 "string", "join",
2042 PyTuple_Pack(2, list, joiner)
2043 );
2044 #endif
2045 Py_DECREF(joiner);
2046
2047 return result;
2048 }
2049
2050 static PyObject*
pattern_findall(PatternObject * self,PyObject * args,PyObject * kw)2051 pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2052 {
2053 SRE_STATE state;
2054 PyObject* list;
2055 int status;
2056 Py_ssize_t i, b, e;
2057
2058 PyObject* string;
2059 Py_ssize_t start = 0;
2060 Py_ssize_t end = PY_SSIZE_T_MAX;
2061 static char* kwlist[] = { "source", "pos", "endpos", NULL };
2062 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2063 &string, &start, &end))
2064 return NULL;
2065
2066 string = state_init(&state, self, string, start, end);
2067 if (!string)
2068 return NULL;
2069
2070 list = PyList_New(0);
2071 if (!list) {
2072 state_fini(&state);
2073 return NULL;
2074 }
2075
2076 while (state.start <= state.end) {
2077
2078 PyObject* item;
2079
2080 state_reset(&state);
2081
2082 state.ptr = state.start;
2083
2084 if (state.charsize == 1) {
2085 status = sre_search(&state, PatternObject_GetCode(self));
2086 } else {
2087 #if defined(HAVE_UNICODE)
2088 status = sre_usearch(&state, PatternObject_GetCode(self));
2089 #endif
2090 }
2091
2092 if (PyErr_Occurred())
2093 goto error;
2094
2095 if (status <= 0) {
2096 if (status == 0)
2097 break;
2098 pattern_error(status);
2099 goto error;
2100 }
2101
2102 /* don't bother to build a match object */
2103 switch (self->groups) {
2104 case 0:
2105 b = STATE_OFFSET(&state, state.start);
2106 e = STATE_OFFSET(&state, state.ptr);
2107 item = PySequence_GetSlice(string, b, e);
2108 if (!item)
2109 goto error;
2110 break;
2111 case 1:
2112 item = state_getslice(&state, 1, string, 1);
2113 if (!item)
2114 goto error;
2115 break;
2116 default:
2117 item = PyTuple_New(self->groups);
2118 if (!item)
2119 goto error;
2120 for (i = 0; i < self->groups; i++) {
2121 PyObject* o = state_getslice(&state, i+1, string, 1);
2122 if (!o) {
2123 Py_DECREF(item);
2124 goto error;
2125 }
2126 PyTuple_SET_ITEM(item, i, o);
2127 }
2128 break;
2129 }
2130
2131 status = PyList_Append(list, item);
2132 Py_DECREF(item);
2133 if (status < 0)
2134 goto error;
2135
2136 if (state.ptr == state.start)
2137 state.start = (void*) ((char*) state.ptr + state.charsize);
2138 else
2139 state.start = state.ptr;
2140
2141 }
2142
2143 state_fini(&state);
2144 return list;
2145
2146 error:
2147 Py_DECREF(list);
2148 state_fini(&state);
2149 return NULL;
2150
2151 }
2152
2153 #if PY_VERSION_HEX >= 0x02020000
2154 static PyObject*
pattern_finditer(PatternObject * pattern,PyObject * args)2155 pattern_finditer(PatternObject* pattern, PyObject* args)
2156 {
2157 PyObject* scanner;
2158 PyObject* search;
2159 PyObject* iterator;
2160
2161 scanner = pattern_scanner(pattern, args);
2162 if (!scanner)
2163 return NULL;
2164
2165 search = PyObject_GetAttrString(scanner, "search");
2166 Py_DECREF(scanner);
2167 if (!search)
2168 return NULL;
2169
2170 iterator = PyCallIter_New(search, Py_None);
2171 Py_DECREF(search);
2172
2173 return iterator;
2174 }
2175 #endif
2176
2177 static PyObject*
pattern_split(PatternObject * self,PyObject * args,PyObject * kw)2178 pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2179 {
2180 SRE_STATE state;
2181 PyObject* list;
2182 PyObject* item;
2183 int status;
2184 Py_ssize_t n;
2185 Py_ssize_t i;
2186 void* last;
2187
2188 PyObject* string;
2189 Py_ssize_t maxsplit = 0;
2190 static char* kwlist[] = { "source", "maxsplit", NULL };
2191 if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2192 &string, &maxsplit))
2193 return NULL;
2194
2195 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2196 if (!string)
2197 return NULL;
2198
2199 list = PyList_New(0);
2200 if (!list) {
2201 state_fini(&state);
2202 return NULL;
2203 }
2204
2205 n = 0;
2206 last = state.start;
2207
2208 while (!maxsplit || n < maxsplit) {
2209
2210 state_reset(&state);
2211
2212 state.ptr = state.start;
2213
2214 if (state.charsize == 1) {
2215 status = sre_search(&state, PatternObject_GetCode(self));
2216 } else {
2217 #if defined(HAVE_UNICODE)
2218 status = sre_usearch(&state, PatternObject_GetCode(self));
2219 #endif
2220 }
2221
2222 if (PyErr_Occurred())
2223 goto error;
2224
2225 if (status <= 0) {
2226 if (status == 0)
2227 break;
2228 pattern_error(status);
2229 goto error;
2230 }
2231
2232 if (state.start == state.ptr) {
2233 if (last == state.end)
2234 break;
2235 /* skip one character */
2236 state.start = (void*) ((char*) state.ptr + state.charsize);
2237 continue;
2238 }
2239
2240 /* get segment before this match */
2241 item = PySequence_GetSlice(
2242 string, STATE_OFFSET(&state, last),
2243 STATE_OFFSET(&state, state.start)
2244 );
2245 if (!item)
2246 goto error;
2247 status = PyList_Append(list, item);
2248 Py_DECREF(item);
2249 if (status < 0)
2250 goto error;
2251
2252 /* add groups (if any) */
2253 for (i = 0; i < self->groups; i++) {
2254 item = state_getslice(&state, i+1, string, 0);
2255 if (!item)
2256 goto error;
2257 status = PyList_Append(list, item);
2258 Py_DECREF(item);
2259 if (status < 0)
2260 goto error;
2261 }
2262
2263 n = n + 1;
2264
2265 last = state.start = state.ptr;
2266
2267 }
2268
2269 /* get segment following last match (even if empty) */
2270 item = PySequence_GetSlice(
2271 string, STATE_OFFSET(&state, last), state.endpos
2272 );
2273 if (!item)
2274 goto error;
2275 status = PyList_Append(list, item);
2276 Py_DECREF(item);
2277 if (status < 0)
2278 goto error;
2279
2280 state_fini(&state);
2281 return list;
2282
2283 error:
2284 Py_DECREF(list);
2285 state_fini(&state);
2286 return NULL;
2287
2288 }
2289
2290 static PyObject*
pattern_subx(PatternObject * self,PyObject * ptemplate,PyObject * string,Py_ssize_t count,Py_ssize_t subn)2291 pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2292 Py_ssize_t count, Py_ssize_t subn)
2293 {
2294 SRE_STATE state;
2295 PyObject* list;
2296 PyObject* item;
2297 PyObject* filter;
2298 PyObject* args;
2299 PyObject* match;
2300 void* ptr;
2301 int status;
2302 Py_ssize_t n;
2303 Py_ssize_t i, b, e;
2304 int bint;
2305 int filter_is_callable;
2306
2307 if (PyCallable_Check(ptemplate)) {
2308 /* sub/subn takes either a function or a template */
2309 filter = ptemplate;
2310 Py_INCREF(filter);
2311 filter_is_callable = 1;
2312 } else {
2313 /* if not callable, check if it's a literal string */
2314 int literal;
2315 ptr = getstring(ptemplate, &n, &bint);
2316 b = bint;
2317 if (ptr) {
2318 if (b == 1) {
2319 literal = sre_literal_template((unsigned char *)ptr, n);
2320 } else {
2321 #if defined(HAVE_UNICODE)
2322 literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2323 #endif
2324 }
2325 } else {
2326 PyErr_Clear();
2327 literal = 0;
2328 }
2329 if (literal) {
2330 filter = ptemplate;
2331 Py_INCREF(filter);
2332 filter_is_callable = 0;
2333 } else {
2334 /* not a literal; hand it over to the template compiler */
2335 filter = call(
2336 SRE_PY_MODULE, "_subx",
2337 PyTuple_Pack(2, self, ptemplate)
2338 );
2339 if (!filter)
2340 return NULL;
2341 filter_is_callable = PyCallable_Check(filter);
2342 }
2343 }
2344
2345 string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2346 if (!string) {
2347 Py_DECREF(filter);
2348 return NULL;
2349 }
2350
2351 list = PyList_New(0);
2352 if (!list) {
2353 Py_DECREF(filter);
2354 state_fini(&state);
2355 return NULL;
2356 }
2357
2358 n = i = 0;
2359
2360 while (!count || n < count) {
2361
2362 state_reset(&state);
2363
2364 state.ptr = state.start;
2365
2366 if (state.charsize == 1) {
2367 status = sre_search(&state, PatternObject_GetCode(self));
2368 } else {
2369 #if defined(HAVE_UNICODE)
2370 status = sre_usearch(&state, PatternObject_GetCode(self));
2371 #endif
2372 }
2373
2374 if (PyErr_Occurred())
2375 goto error;
2376
2377 if (status <= 0) {
2378 if (status == 0)
2379 break;
2380 pattern_error(status);
2381 goto error;
2382 }
2383
2384 b = STATE_OFFSET(&state, state.start);
2385 e = STATE_OFFSET(&state, state.ptr);
2386
2387 if (i < b) {
2388 /* get segment before this match */
2389 item = PySequence_GetSlice(string, i, b);
2390 if (!item)
2391 goto error;
2392 status = PyList_Append(list, item);
2393 Py_DECREF(item);
2394 if (status < 0)
2395 goto error;
2396
2397 } else if (i == b && i == e && n > 0)
2398 /* ignore empty match on latest position */
2399 goto next;
2400
2401 if (filter_is_callable) {
2402 /* pass match object through filter */
2403 match = pattern_new_match(self, &state, 1);
2404 if (!match)
2405 goto error;
2406 args = PyTuple_Pack(1, match);
2407 if (!args) {
2408 Py_DECREF(match);
2409 goto error;
2410 }
2411 item = PyObject_CallObject(filter, args);
2412 Py_DECREF(args);
2413 Py_DECREF(match);
2414 if (!item)
2415 goto error;
2416 } else {
2417 /* filter is literal string */
2418 item = filter;
2419 Py_INCREF(item);
2420 }
2421
2422 /* add to list */
2423 if (item != Py_None) {
2424 status = PyList_Append(list, item);
2425 Py_DECREF(item);
2426 if (status < 0)
2427 goto error;
2428 }
2429
2430 i = e;
2431 n = n + 1;
2432
2433 next:
2434 /* move on */
2435 if (state.ptr == state.start)
2436 state.start = (void*) ((char*) state.ptr + state.charsize);
2437 else
2438 state.start = state.ptr;
2439
2440 }
2441
2442 /* get segment following last match */
2443 if (i < state.endpos) {
2444 item = PySequence_GetSlice(string, i, state.endpos);
2445 if (!item)
2446 goto error;
2447 status = PyList_Append(list, item);
2448 Py_DECREF(item);
2449 if (status < 0)
2450 goto error;
2451 }
2452
2453 state_fini(&state);
2454
2455 Py_DECREF(filter);
2456
2457 /* convert list to single string (also removes list) */
2458 item = join_list(list, string);
2459
2460 if (!item)
2461 return NULL;
2462
2463 if (subn)
2464 return Py_BuildValue("Ni", item, n);
2465
2466 return item;
2467
2468 error:
2469 Py_DECREF(list);
2470 state_fini(&state);
2471 Py_DECREF(filter);
2472 return NULL;
2473
2474 }
2475
2476 static PyObject*
pattern_sub(PatternObject * self,PyObject * args,PyObject * kw)2477 pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2478 {
2479 PyObject* ptemplate;
2480 PyObject* string;
2481 Py_ssize_t count = 0;
2482 static char* kwlist[] = { "repl", "string", "count", NULL };
2483 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2484 &ptemplate, &string, &count))
2485 return NULL;
2486
2487 return pattern_subx(self, ptemplate, string, count, 0);
2488 }
2489
2490 static PyObject*
pattern_subn(PatternObject * self,PyObject * args,PyObject * kw)2491 pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2492 {
2493 PyObject* ptemplate;
2494 PyObject* string;
2495 Py_ssize_t count = 0;
2496 static char* kwlist[] = { "repl", "string", "count", NULL };
2497 if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2498 &ptemplate, &string, &count))
2499 return NULL;
2500
2501 return pattern_subx(self, ptemplate, string, count, 1);
2502 }
2503
2504 static PyObject*
pattern_copy(PatternObject * self,PyObject * unused)2505 pattern_copy(PatternObject* self, PyObject *unused)
2506 {
2507 #ifdef USE_BUILTIN_COPY
2508 PatternObject* copy;
2509 int offset;
2510
2511 copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2512 if (!copy)
2513 return NULL;
2514
2515 offset = offsetof(PatternObject, groups);
2516
2517 Py_XINCREF(self->groupindex);
2518 Py_XINCREF(self->indexgroup);
2519 Py_XINCREF(self->pattern);
2520
2521 memcpy((char*) copy + offset, (char*) self + offset,
2522 sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2523 copy->weakreflist = NULL;
2524
2525 return (PyObject*) copy;
2526 #else
2527 PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2528 return NULL;
2529 #endif
2530 }
2531
2532 static PyObject*
pattern_deepcopy(PatternObject * self,PyObject * memo)2533 pattern_deepcopy(PatternObject* self, PyObject* memo)
2534 {
2535 #ifdef USE_BUILTIN_COPY
2536 PatternObject* copy;
2537
2538 copy = (PatternObject*) pattern_copy(self);
2539 if (!copy)
2540 return NULL;
2541
2542 if (!deepcopy(©->groupindex, memo) ||
2543 !deepcopy(©->indexgroup, memo) ||
2544 !deepcopy(©->pattern, memo)) {
2545 Py_DECREF(copy);
2546 return NULL;
2547 }
2548
2549 #else
2550 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2551 return NULL;
2552 #endif
2553 }
2554
2555 PyDoc_STRVAR(pattern_match_doc,
2556 "match(string[, pos[, endpos]]) --> match object or None.\n\
2557 Matches zero or more characters at the beginning of the string");
2558
2559 PyDoc_STRVAR(pattern_search_doc,
2560 "search(string[, pos[, endpos]]) --> match object or None.\n\
2561 Scan through string looking for a match, and return a corresponding\n\
2562 MatchObject instance. Return None if no position in the string matches.");
2563
2564 PyDoc_STRVAR(pattern_split_doc,
2565 "split(string[, maxsplit = 0]) --> list.\n\
2566 Split string by the occurrences of pattern.");
2567
2568 PyDoc_STRVAR(pattern_findall_doc,
2569 "findall(string[, pos[, endpos]]) --> list.\n\
2570 Return a list of all non-overlapping matches of pattern in string.");
2571
2572 PyDoc_STRVAR(pattern_finditer_doc,
2573 "finditer(string[, pos[, endpos]]) --> iterator.\n\
2574 Return an iterator over all non-overlapping matches for the \n\
2575 RE pattern in string. For each match, the iterator returns a\n\
2576 match object.");
2577
2578 PyDoc_STRVAR(pattern_sub_doc,
2579 "sub(repl, string[, count = 0]) --> newstring\n\
2580 Return the string obtained by replacing the leftmost non-overlapping\n\
2581 occurrences of pattern in string by the replacement repl.");
2582
2583 PyDoc_STRVAR(pattern_subn_doc,
2584 "subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2585 Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2586 the leftmost non-overlapping occurrences of pattern with the\n\
2587 replacement repl.");
2588
2589 PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2590
2591 static PyMethodDef pattern_methods[] = {
2592 {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2593 pattern_match_doc},
2594 {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2595 pattern_search_doc},
2596 {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2597 pattern_sub_doc},
2598 {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2599 pattern_subn_doc},
2600 {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2601 pattern_split_doc},
2602 {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2603 pattern_findall_doc},
2604 #if PY_VERSION_HEX >= 0x02020000
2605 {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2606 pattern_finditer_doc},
2607 #endif
2608 {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2609 {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2610 {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2611 {NULL, NULL}
2612 };
2613
2614 #define PAT_OFF(x) offsetof(PatternObject, x)
2615 static PyMemberDef pattern_members[] = {
2616 {"pattern", T_OBJECT, PAT_OFF(pattern), READONLY},
2617 {"flags", T_INT, PAT_OFF(flags), READONLY},
2618 {"groups", T_PYSSIZET, PAT_OFF(groups), READONLY},
2619 {"groupindex", T_OBJECT, PAT_OFF(groupindex), READONLY},
2620 {NULL} /* Sentinel */
2621 };
2622
2623 statichere PyTypeObject Pattern_Type = {
2624 PyObject_HEAD_INIT(NULL)
2625 0, "_" SRE_MODULE ".SRE_Pattern",
2626 sizeof(PatternObject), sizeof(SRE_CODE),
2627 (destructor)pattern_dealloc, /*tp_dealloc*/
2628 0, /* tp_print */
2629 0, /* tp_getattrn */
2630 0, /* tp_setattr */
2631 0, /* tp_compare */
2632 0, /* tp_repr */
2633 0, /* tp_as_number */
2634 0, /* tp_as_sequence */
2635 0, /* tp_as_mapping */
2636 0, /* tp_hash */
2637 0, /* tp_call */
2638 0, /* tp_str */
2639 0, /* tp_getattro */
2640 0, /* tp_setattro */
2641 0, /* tp_as_buffer */
2642 Py_TPFLAGS_DEFAULT, /* tp_flags */
2643 pattern_doc, /* tp_doc */
2644 0, /* tp_traverse */
2645 0, /* tp_clear */
2646 0, /* tp_richcompare */
2647 offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2648 0, /* tp_iter */
2649 0, /* tp_iternext */
2650 pattern_methods, /* tp_methods */
2651 pattern_members, /* tp_members */
2652 };
2653
2654 static int _validate(PatternObject *self); /* Forward */
2655
2656 static PyObject *
_compile(PyObject * self_,PyObject * args)2657 _compile(PyObject* self_, PyObject* args)
2658 {
2659 /* "compile" pattern descriptor to pattern object */
2660
2661 PatternObject* self;
2662 Py_ssize_t i, n;
2663
2664 PyObject* pattern;
2665 int flags = 0;
2666 PyObject* code;
2667 Py_ssize_t groups = 0;
2668 PyObject* groupindex = NULL;
2669 PyObject* indexgroup = NULL;
2670 if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2671 &PyList_Type, &code, &groups,
2672 &groupindex, &indexgroup))
2673 return NULL;
2674
2675 n = PyList_GET_SIZE(code);
2676 /* coverity[ampersand_in_size] */
2677 self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2678 if (!self)
2679 return NULL;
2680 self->weakreflist = NULL;
2681 self->pattern = NULL;
2682 self->groupindex = NULL;
2683 self->indexgroup = NULL;
2684
2685 self->codesize = n;
2686
2687 for (i = 0; i < n; i++) {
2688 PyObject *o = PyList_GET_ITEM(code, i);
2689 unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2690 : PyLong_AsUnsignedLong(o);
2691 self->code[i] = (SRE_CODE) value;
2692 if ((unsigned long) self->code[i] != value) {
2693 PyErr_SetString(PyExc_OverflowError,
2694 "regular expression code size limit exceeded");
2695 break;
2696 }
2697 }
2698
2699 if (PyErr_Occurred()) {
2700 Py_DECREF(self);
2701 return NULL;
2702 }
2703
2704 Py_INCREF(pattern);
2705 self->pattern = pattern;
2706
2707 self->flags = flags;
2708
2709 self->groups = groups;
2710
2711 Py_XINCREF(groupindex);
2712 self->groupindex = groupindex;
2713
2714 Py_XINCREF(indexgroup);
2715 self->indexgroup = indexgroup;
2716
2717 self->weakreflist = NULL;
2718
2719 if (!_validate(self)) {
2720 Py_DECREF(self);
2721 return NULL;
2722 }
2723
2724 return (PyObject*) self;
2725 }
2726
2727 /* -------------------------------------------------------------------- */
2728 /* Code validation */
2729
/* To learn more about this code, have a look at the _compile() function in
   Lib/sre_compile.py.  The validation functions below check the code array
   for conformance with the code patterns generated there.

   The nice thing about the generated code is that it is position-independent:
   all jumps are relative jumps forward.  Also, jumps don't cross each other:
   the target of a later jump is always earlier than the target of an earlier
   jump.  IOW, this is okay:

   J---------J-------T--------T
    \         \_____/        /
     \______________________/

   but this is not:

   J---------J-------T--------T
    \_________\_____/        /
               \____________/

   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
   bytes wide (the latter if Python is compiled for "wide" unicode support).
*/
2752
/* Defining this one enables tracing of the validator */
#undef VVERBOSE

/* Trace macro for the validator; takes a parenthesized printf argument
   list and expands to nothing unless VVERBOSE is defined */
#if defined(VVERBOSE)
#define VTRACE(v) printf v
#else
#define VTRACE(v)
#endif

/* Report failure: makes the enclosing function return 0 */
#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)

/* Extract opcode, argument, or skip count from code array.

   These macros work on the variables 'code', 'end', 'op', 'arg' and 'skip'
   of the function that uses them.  Each one FAILs when 'code' has already
   reached 'end', and otherwise consumes one code word. */
#define GET_OP                                          \
    do {                                                \
        VTRACE(("%p: ", code));                         \
        if (code >= end) FAIL;                          \
        op = *code++;                                   \
        VTRACE(("%lu (op)\n", (unsigned long)op));      \
    } while (0)
#define GET_ARG                                         \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        arg = *code++;                                  \
        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
    } while (0)
/* GET_SKIP_ADJ additionally requires that the skip target, moved back by
   'adj' words, lies between the skip word itself and 'end' (inclusive).
   The test compares word counts rather than forming code+skip-adj, so no
   out-of-range pointer is ever computed. */
#define GET_SKIP_ADJ(adj)                               \
    do {                                                \
        VTRACE(("%p= ", code));                         \
        if (code >= end) FAIL;                          \
        skip = *code;                                   \
        VTRACE(("%lu (skip to %p)\n",                   \
               (unsigned long)skip, code+skip));        \
        if (skip < (adj) || skip-(adj) > end-code)      \
            FAIL;                                       \
        code++;                                         \
    } while (0)
#define GET_SKIP GET_SKIP_ADJ(0)
2793
2794 static int
_validate_charset(SRE_CODE * code,SRE_CODE * end)2795 _validate_charset(SRE_CODE *code, SRE_CODE *end)
2796 {
2797 /* Some variables are manipulated by the macros above */
2798 SRE_CODE op;
2799 SRE_CODE arg;
2800 SRE_CODE offset;
2801 int i;
2802
2803 while (code < end) {
2804 GET_OP;
2805 switch (op) {
2806
2807 case SRE_OP_NEGATE:
2808 break;
2809
2810 case SRE_OP_LITERAL:
2811 GET_ARG;
2812 break;
2813
2814 case SRE_OP_RANGE:
2815 GET_ARG;
2816 GET_ARG;
2817 break;
2818
2819 case SRE_OP_CHARSET:
2820 offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2821 if (code+offset < code || code+offset > end)
2822 FAIL;
2823 code += offset;
2824 break;
2825
2826 case SRE_OP_BIGCHARSET:
2827 GET_ARG; /* Number of blocks */
2828 offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2829 if (code+offset < code || code+offset > end)
2830 FAIL;
2831 /* Make sure that each byte points to a valid block */
2832 for (i = 0; i < 256; i++) {
2833 if (((unsigned char *)code)[i] >= arg)
2834 FAIL;
2835 }
2836 code += offset;
2837 offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2838 if (code+offset < code || code+offset > end)
2839 FAIL;
2840 code += offset;
2841 break;
2842
2843 case SRE_OP_CATEGORY:
2844 GET_ARG;
2845 switch (arg) {
2846 case SRE_CATEGORY_DIGIT:
2847 case SRE_CATEGORY_NOT_DIGIT:
2848 case SRE_CATEGORY_SPACE:
2849 case SRE_CATEGORY_NOT_SPACE:
2850 case SRE_CATEGORY_WORD:
2851 case SRE_CATEGORY_NOT_WORD:
2852 case SRE_CATEGORY_LINEBREAK:
2853 case SRE_CATEGORY_NOT_LINEBREAK:
2854 case SRE_CATEGORY_LOC_WORD:
2855 case SRE_CATEGORY_LOC_NOT_WORD:
2856 case SRE_CATEGORY_UNI_DIGIT:
2857 case SRE_CATEGORY_UNI_NOT_DIGIT:
2858 case SRE_CATEGORY_UNI_SPACE:
2859 case SRE_CATEGORY_UNI_NOT_SPACE:
2860 case SRE_CATEGORY_UNI_WORD:
2861 case SRE_CATEGORY_UNI_NOT_WORD:
2862 case SRE_CATEGORY_UNI_LINEBREAK:
2863 case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2864 break;
2865 default:
2866 FAIL;
2867 }
2868 break;
2869
2870 default:
2871 FAIL;
2872
2873 }
2874 }
2875
2876 return 1;
2877 }
2878
2879 static int
_validate_inner(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)2880 _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2881 {
2882 /* Some variables are manipulated by the macros above */
2883 SRE_CODE op;
2884 SRE_CODE arg;
2885 SRE_CODE skip;
2886
2887 VTRACE(("code=%p, end=%p\n", code, end));
2888
2889 if (code > end)
2890 FAIL;
2891
2892 while (code < end) {
2893 GET_OP;
2894 switch (op) {
2895
2896 case SRE_OP_MARK:
2897 /* We don't check whether marks are properly nested; the
2898 sre_match() code is robust even if they don't, and the worst
2899 you can get is nonsensical match results. */
2900 GET_ARG;
2901 if (arg > 2*groups+1) {
2902 VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2903 FAIL;
2904 }
2905 break;
2906
2907 case SRE_OP_LITERAL:
2908 case SRE_OP_NOT_LITERAL:
2909 case SRE_OP_LITERAL_IGNORE:
2910 case SRE_OP_NOT_LITERAL_IGNORE:
2911 GET_ARG;
2912 /* The arg is just a character, nothing to check */
2913 break;
2914
2915 case SRE_OP_SUCCESS:
2916 case SRE_OP_FAILURE:
2917 /* Nothing to check; these normally end the matching process */
2918 break;
2919
2920 case SRE_OP_AT:
2921 GET_ARG;
2922 switch (arg) {
2923 case SRE_AT_BEGINNING:
2924 case SRE_AT_BEGINNING_STRING:
2925 case SRE_AT_BEGINNING_LINE:
2926 case SRE_AT_END:
2927 case SRE_AT_END_LINE:
2928 case SRE_AT_END_STRING:
2929 case SRE_AT_BOUNDARY:
2930 case SRE_AT_NON_BOUNDARY:
2931 case SRE_AT_LOC_BOUNDARY:
2932 case SRE_AT_LOC_NON_BOUNDARY:
2933 case SRE_AT_UNI_BOUNDARY:
2934 case SRE_AT_UNI_NON_BOUNDARY:
2935 break;
2936 default:
2937 FAIL;
2938 }
2939 break;
2940
2941 case SRE_OP_ANY:
2942 case SRE_OP_ANY_ALL:
2943 /* These have no operands */
2944 break;
2945
2946 case SRE_OP_IN:
2947 case SRE_OP_IN_IGNORE:
2948 GET_SKIP;
2949 /* Stop 1 before the end; we check the FAILURE below */
2950 if (!_validate_charset(code, code+skip-2))
2951 FAIL;
2952 if (code[skip-2] != SRE_OP_FAILURE)
2953 FAIL;
2954 code += skip-1;
2955 break;
2956
2957 case SRE_OP_INFO:
2958 {
2959 /* A minimal info field is
2960 <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2961 If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2962 more follows. */
2963 SRE_CODE flags, i;
2964 SRE_CODE *newcode;
2965 GET_SKIP;
2966 newcode = code+skip-1;
2967 GET_ARG; flags = arg;
2968 GET_ARG; /* min */
2969 GET_ARG; /* max */
2970 /* Check that only valid flags are present */
2971 if ((flags & ~(SRE_INFO_PREFIX |
2972 SRE_INFO_LITERAL |
2973 SRE_INFO_CHARSET)) != 0)
2974 FAIL;
2975 /* PREFIX and CHARSET are mutually exclusive */
2976 if ((flags & SRE_INFO_PREFIX) &&
2977 (flags & SRE_INFO_CHARSET))
2978 FAIL;
2979 /* LITERAL implies PREFIX */
2980 if ((flags & SRE_INFO_LITERAL) &&
2981 !(flags & SRE_INFO_PREFIX))
2982 FAIL;
2983 /* Validate the prefix */
2984 if (flags & SRE_INFO_PREFIX) {
2985 SRE_CODE prefix_len;
2986 GET_ARG; prefix_len = arg;
2987 GET_ARG; /* prefix skip */
2988 /* Here comes the prefix string */
2989 if (code+prefix_len < code || code+prefix_len > newcode)
2990 FAIL;
2991 code += prefix_len;
2992 /* And here comes the overlap table */
2993 if (code+prefix_len < code || code+prefix_len > newcode)
2994 FAIL;
2995 /* Each overlap value should be < prefix_len */
2996 for (i = 0; i < prefix_len; i++) {
2997 if (code[i] >= prefix_len)
2998 FAIL;
2999 }
3000 code += prefix_len;
3001 }
3002 /* Validate the charset */
3003 if (flags & SRE_INFO_CHARSET) {
3004 if (!_validate_charset(code, newcode-1))
3005 FAIL;
3006 if (newcode[-1] != SRE_OP_FAILURE)
3007 FAIL;
3008 code = newcode;
3009 }
3010 else if (code != newcode) {
3011 VTRACE(("code=%p, newcode=%p\n", code, newcode));
3012 FAIL;
3013 }
3014 }
3015 break;
3016
3017 case SRE_OP_BRANCH:
3018 {
3019 SRE_CODE *target = NULL;
3020 for (;;) {
3021 GET_SKIP;
3022 if (skip == 0)
3023 break;
3024 /* Stop 2 before the end; we check the JUMP below */
3025 if (!_validate_inner(code, code+skip-3, groups))
3026 FAIL;
3027 code += skip-3;
3028 /* Check that it ends with a JUMP, and that each JUMP
3029 has the same target */
3030 GET_OP;
3031 if (op != SRE_OP_JUMP)
3032 FAIL;
3033 GET_SKIP;
3034 if (target == NULL)
3035 target = code+skip-1;
3036 else if (code+skip-1 != target)
3037 FAIL;
3038 }
3039 }
3040 break;
3041
3042 case SRE_OP_REPEAT_ONE:
3043 case SRE_OP_MIN_REPEAT_ONE:
3044 {
3045 SRE_CODE min, max;
3046 GET_SKIP;
3047 GET_ARG; min = arg;
3048 GET_ARG; max = arg;
3049 if (min > max)
3050 FAIL;
3051 #ifdef Py_UNICODE_WIDE
3052 if (max > 65535)
3053 FAIL;
3054 #endif
3055 if (!_validate_inner(code, code+skip-4, groups))
3056 FAIL;
3057 code += skip-4;
3058 GET_OP;
3059 if (op != SRE_OP_SUCCESS)
3060 FAIL;
3061 }
3062 break;
3063
3064 case SRE_OP_REPEAT:
3065 {
3066 SRE_CODE min, max;
3067 GET_SKIP;
3068 GET_ARG; min = arg;
3069 GET_ARG; max = arg;
3070 if (min > max)
3071 FAIL;
3072 #ifdef Py_UNICODE_WIDE
3073 if (max > 65535)
3074 FAIL;
3075 #endif
3076 if (!_validate_inner(code, code+skip-3, groups))
3077 FAIL;
3078 code += skip-3;
3079 GET_OP;
3080 if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3081 FAIL;
3082 }
3083 break;
3084
3085 case SRE_OP_GROUPREF:
3086 case SRE_OP_GROUPREF_IGNORE:
3087 GET_ARG;
3088 if (arg >= groups)
3089 FAIL;
3090 break;
3091
3092 case SRE_OP_GROUPREF_EXISTS:
3093 /* The regex syntax for this is: '(?(group)then|else)', where
3094 'group' is either an integer group number or a group name,
3095 'then' and 'else' are sub-regexes, and 'else' is optional. */
3096 GET_ARG;
3097 if (arg >= groups)
3098 FAIL;
3099 GET_SKIP_ADJ(1);
3100 code--; /* The skip is relative to the first arg! */
3101 /* There are two possibilities here: if there is both a 'then'
3102 part and an 'else' part, the generated code looks like:
3103
3104 GROUPREF_EXISTS
3105 <group>
3106 <skipyes>
3107 ...then part...
3108 JUMP
3109 <skipno>
3110 (<skipyes> jumps here)
3111 ...else part...
3112 (<skipno> jumps here)
3113
3114 If there is only a 'then' part, it looks like:
3115
3116 GROUPREF_EXISTS
3117 <group>
3118 <skip>
3119 ...then part...
3120 (<skip> jumps here)
3121
3122 There is no direct way to decide which it is, and we don't want
3123 to allow arbitrary jumps anywhere in the code; so we just look
3124 for a JUMP opcode preceding our skip target.
3125 */
3126 if (skip >= 3 && code+skip-3 >= code &&
3127 code[skip-3] == SRE_OP_JUMP)
3128 {
3129 VTRACE(("both then and else parts present\n"));
3130 if (!_validate_inner(code+1, code+skip-3, groups))
3131 FAIL;
3132 code += skip-2; /* Position after JUMP, at <skipno> */
3133 GET_SKIP;
3134 if (!_validate_inner(code, code+skip-1, groups))
3135 FAIL;
3136 code += skip-1;
3137 }
3138 else {
3139 VTRACE(("only a then part present\n"));
3140 if (!_validate_inner(code+1, code+skip-1, groups))
3141 FAIL;
3142 code += skip-1;
3143 }
3144 break;
3145
3146 case SRE_OP_ASSERT:
3147 case SRE_OP_ASSERT_NOT:
3148 GET_SKIP;
3149 GET_ARG; /* 0 for lookahead, width for lookbehind */
3150 code--; /* Back up over arg to simplify math below */
3151 if (arg & 0x80000000)
3152 FAIL; /* Width too large */
3153 /* Stop 1 before the end; we check the SUCCESS below */
3154 if (!_validate_inner(code+1, code+skip-2, groups))
3155 FAIL;
3156 code += skip-2;
3157 GET_OP;
3158 if (op != SRE_OP_SUCCESS)
3159 FAIL;
3160 break;
3161
3162 default:
3163 FAIL;
3164
3165 }
3166 }
3167
3168 VTRACE(("okay\n"));
3169 return 1;
3170 }
3171
3172 static int
_validate_outer(SRE_CODE * code,SRE_CODE * end,Py_ssize_t groups)3173 _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3174 {
3175 if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3176 FAIL;
3177 if (groups == 0) /* fix for simplejson */
3178 groups = 100; /* 100 groups should always be safe */
3179 return _validate_inner(code, end-1, groups);
3180 }
3181
3182 static int
_validate(PatternObject * self)3183 _validate(PatternObject *self)
3184 {
3185 if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3186 {
3187 PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3188 return 0;
3189 }
3190 else
3191 VTRACE(("Success!\n"));
3192 return 1;
3193 }
3194
3195 /* -------------------------------------------------------------------- */
3196 /* match methods */
3197
3198 static void
match_dealloc(MatchObject * self)3199 match_dealloc(MatchObject* self)
3200 {
3201 Py_XDECREF(self->regs);
3202 Py_XDECREF(self->string);
3203 Py_DECREF(self->pattern);
3204 PyObject_DEL(self);
3205 }
3206
3207 static PyObject*
match_getslice_by_index(MatchObject * self,Py_ssize_t index,PyObject * def)3208 match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3209 {
3210 if (index < 0 || index >= self->groups) {
3211 /* raise IndexError if we were given a bad group number */
3212 PyErr_SetString(
3213 PyExc_IndexError,
3214 "no such group"
3215 );
3216 return NULL;
3217 }
3218
3219 index *= 2;
3220
3221 if (self->string == Py_None || self->mark[index] < 0) {
3222 /* return default value if the string or group is undefined */
3223 Py_INCREF(def);
3224 return def;
3225 }
3226
3227 return PySequence_GetSlice(
3228 self->string, self->mark[index], self->mark[index+1]
3229 );
3230 }
3231
3232 static Py_ssize_t
match_getindex(MatchObject * self,PyObject * index)3233 match_getindex(MatchObject* self, PyObject* index)
3234 {
3235 Py_ssize_t i;
3236
3237 if (PyInt_Check(index))
3238 return PyInt_AsSsize_t(index);
3239
3240 i = -1;
3241
3242 if (self->pattern->groupindex) {
3243 index = PyObject_GetItem(self->pattern->groupindex, index);
3244 if (index) {
3245 if (PyInt_Check(index) || PyLong_Check(index))
3246 i = PyInt_AsSsize_t(index);
3247 Py_DECREF(index);
3248 } else
3249 PyErr_Clear();
3250 }
3251
3252 return i;
3253 }
3254
3255 static PyObject*
match_getslice(MatchObject * self,PyObject * index,PyObject * def)3256 match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3257 {
3258 return match_getslice_by_index(self, match_getindex(self, index), def);
3259 }
3260
3261 static PyObject*
match_expand(MatchObject * self,PyObject * ptemplate)3262 match_expand(MatchObject* self, PyObject* ptemplate)
3263 {
3264 /* delegate to Python code */
3265 return call(
3266 SRE_PY_MODULE, "_expand",
3267 PyTuple_Pack(3, self->pattern, self, ptemplate)
3268 );
3269 }
3270
3271 static PyObject*
match_group(MatchObject * self,PyObject * args)3272 match_group(MatchObject* self, PyObject* args)
3273 {
3274 PyObject* result;
3275 Py_ssize_t i, size;
3276
3277 size = PyTuple_GET_SIZE(args);
3278
3279 switch (size) {
3280 case 0:
3281 result = match_getslice(self, Py_False, Py_None);
3282 break;
3283 case 1:
3284 result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3285 break;
3286 default:
3287 /* fetch multiple items */
3288 result = PyTuple_New(size);
3289 if (!result)
3290 return NULL;
3291 for (i = 0; i < size; i++) {
3292 PyObject* item = match_getslice(
3293 self, PyTuple_GET_ITEM(args, i), Py_None
3294 );
3295 if (!item) {
3296 Py_DECREF(result);
3297 return NULL;
3298 }
3299 PyTuple_SET_ITEM(result, i, item);
3300 }
3301 break;
3302 }
3303 return result;
3304 }
3305
3306 static PyObject*
match_groups(MatchObject * self,PyObject * args,PyObject * kw)3307 match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3308 {
3309 PyObject* result;
3310 Py_ssize_t index;
3311
3312 PyObject* def = Py_None;
3313 static char* kwlist[] = { "default", NULL };
3314 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3315 return NULL;
3316
3317 result = PyTuple_New(self->groups-1);
3318 if (!result)
3319 return NULL;
3320
3321 for (index = 1; index < self->groups; index++) {
3322 PyObject* item;
3323 item = match_getslice_by_index(self, index, def);
3324 if (!item) {
3325 Py_DECREF(result);
3326 return NULL;
3327 }
3328 PyTuple_SET_ITEM(result, index-1, item);
3329 }
3330
3331 return result;
3332 }
3333
3334 static PyObject*
match_groupdict(MatchObject * self,PyObject * args,PyObject * kw)3335 match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3336 {
3337 PyObject* result;
3338 PyObject* keys;
3339 Py_ssize_t index;
3340
3341 PyObject* def = Py_None;
3342 static char* kwlist[] = { "default", NULL };
3343 if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3344 return NULL;
3345
3346 result = PyDict_New();
3347 if (!result || !self->pattern->groupindex)
3348 return result;
3349
3350 keys = PyMapping_Keys(self->pattern->groupindex);
3351 if (!keys)
3352 goto failed;
3353
3354 for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3355 int status;
3356 PyObject* key;
3357 PyObject* value;
3358 key = PyList_GET_ITEM(keys, index);
3359 if (!key)
3360 goto failed;
3361 value = match_getslice(self, key, def);
3362 if (!value) {
3363 Py_DECREF(key);
3364 goto failed;
3365 }
3366 status = PyDict_SetItem(result, key, value);
3367 Py_DECREF(value);
3368 if (status < 0)
3369 goto failed;
3370 }
3371
3372 Py_DECREF(keys);
3373
3374 return result;
3375
3376 failed:
3377 Py_XDECREF(keys);
3378 Py_DECREF(result);
3379 return NULL;
3380 }
3381
3382 static PyObject*
match_start(MatchObject * self,PyObject * args)3383 match_start(MatchObject* self, PyObject* args)
3384 {
3385 Py_ssize_t index;
3386
3387 PyObject* index_ = Py_False; /* zero */
3388 if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3389 return NULL;
3390
3391 index = match_getindex(self, index_);
3392
3393 if (index < 0 || index >= self->groups) {
3394 PyErr_SetString(
3395 PyExc_IndexError,
3396 "no such group"
3397 );
3398 return NULL;
3399 }
3400
3401 /* mark is -1 if group is undefined */
3402 return Py_BuildValue("i", self->mark[index*2]);
3403 }
3404
3405 static PyObject*
match_end(MatchObject * self,PyObject * args)3406 match_end(MatchObject* self, PyObject* args)
3407 {
3408 Py_ssize_t index;
3409
3410 PyObject* index_ = Py_False; /* zero */
3411 if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3412 return NULL;
3413
3414 index = match_getindex(self, index_);
3415
3416 if (index < 0 || index >= self->groups) {
3417 PyErr_SetString(
3418 PyExc_IndexError,
3419 "no such group"
3420 );
3421 return NULL;
3422 }
3423
3424 /* mark is -1 if group is undefined */
3425 return Py_BuildValue("i", self->mark[index*2+1]);
3426 }
3427
3428 LOCAL(PyObject*)
_pair(Py_ssize_t i1,Py_ssize_t i2)3429 _pair(Py_ssize_t i1, Py_ssize_t i2)
3430 {
3431 PyObject* pair;
3432 PyObject* item;
3433
3434 pair = PyTuple_New(2);
3435 if (!pair)
3436 return NULL;
3437
3438 item = PyInt_FromSsize_t(i1);
3439 if (!item)
3440 goto error;
3441 PyTuple_SET_ITEM(pair, 0, item);
3442
3443 item = PyInt_FromSsize_t(i2);
3444 if (!item)
3445 goto error;
3446 PyTuple_SET_ITEM(pair, 1, item);
3447
3448 return pair;
3449
3450 error:
3451 Py_DECREF(pair);
3452 return NULL;
3453 }
3454
3455 static PyObject*
match_span(MatchObject * self,PyObject * args)3456 match_span(MatchObject* self, PyObject* args)
3457 {
3458 Py_ssize_t index;
3459
3460 PyObject* index_ = Py_False; /* zero */
3461 if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3462 return NULL;
3463
3464 index = match_getindex(self, index_);
3465
3466 if (index < 0 || index >= self->groups) {
3467 PyErr_SetString(
3468 PyExc_IndexError,
3469 "no such group"
3470 );
3471 return NULL;
3472 }
3473
3474 /* marks are -1 if group is undefined */
3475 return _pair(self->mark[index*2], self->mark[index*2+1]);
3476 }
3477
3478 static PyObject*
match_regs(MatchObject * self)3479 match_regs(MatchObject* self)
3480 {
3481 PyObject* regs;
3482 PyObject* item;
3483 Py_ssize_t index;
3484
3485 regs = PyTuple_New(self->groups);
3486 if (!regs)
3487 return NULL;
3488
3489 for (index = 0; index < self->groups; index++) {
3490 item = _pair(self->mark[index*2], self->mark[index*2+1]);
3491 if (!item) {
3492 Py_DECREF(regs);
3493 return NULL;
3494 }
3495 PyTuple_SET_ITEM(regs, index, item);
3496 }
3497
3498 Py_INCREF(regs);
3499 self->regs = regs;
3500
3501 return regs;
3502 }
3503
3504 static PyObject*
match_copy(MatchObject * self,PyObject * unused)3505 match_copy(MatchObject* self, PyObject *unused)
3506 {
3507 #ifdef USE_BUILTIN_COPY
3508 MatchObject* copy;
3509 Py_ssize_t slots, offset;
3510
3511 slots = 2 * (self->pattern->groups+1);
3512
3513 copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3514 if (!copy)
3515 return NULL;
3516
3517 /* this value a constant, but any compiler should be able to
3518 figure that out all by itself */
3519 offset = offsetof(MatchObject, string);
3520
3521 Py_XINCREF(self->pattern);
3522 Py_XINCREF(self->string);
3523 Py_XINCREF(self->regs);
3524
3525 memcpy((char*) copy + offset, (char*) self + offset,
3526 sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3527
3528 return (PyObject*) copy;
3529 #else
3530 PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3531 return NULL;
3532 #endif
3533 }
3534
3535 static PyObject*
match_deepcopy(MatchObject * self,PyObject * memo)3536 match_deepcopy(MatchObject* self, PyObject* memo)
3537 {
3538 #ifdef USE_BUILTIN_COPY
3539 MatchObject* copy;
3540
3541 copy = (MatchObject*) match_copy(self);
3542 if (!copy)
3543 return NULL;
3544
3545 if (!deepcopy((PyObject**) ©->pattern, memo) ||
3546 !deepcopy(©->string, memo) ||
3547 !deepcopy(©->regs, memo)) {
3548 Py_DECREF(copy);
3549 return NULL;
3550 }
3551
3552 #else
3553 PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3554 return NULL;
3555 #endif
3556 }
3557
/* Method table for match objects; referenced from Match_Type.tp_methods.
   Each entry is {python name, C implementation, calling convention}. */
static struct PyMethodDef match_methods[] = {
    {"group", (PyCFunction) match_group, METH_VARARGS},
    {"start", (PyCFunction) match_start, METH_VARARGS},
    {"end", (PyCFunction) match_end, METH_VARARGS},
    {"span", (PyCFunction) match_span, METH_VARARGS},
    /* groups() and groupdict() also accept the "default" keyword */
    {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
    {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
    {"expand", (PyCFunction) match_expand, METH_O},
    {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
    {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
    {NULL, NULL} /* sentinel */
};
3570
3571 static PyObject *
match_lastindex_get(MatchObject * self)3572 match_lastindex_get(MatchObject *self)
3573 {
3574 if (self->lastindex >= 0)
3575 return Py_BuildValue("i", self->lastindex);
3576 Py_INCREF(Py_None);
3577 return Py_None;
3578 }
3579
3580 static PyObject *
match_lastgroup_get(MatchObject * self)3581 match_lastgroup_get(MatchObject *self)
3582 {
3583 if (self->pattern->indexgroup && self->lastindex >= 0) {
3584 PyObject* result = PySequence_GetItem(
3585 self->pattern->indexgroup, self->lastindex
3586 );
3587 if (result)
3588 return result;
3589 PyErr_Clear();
3590 }
3591 Py_INCREF(Py_None);
3592 return Py_None;
3593 }
3594
3595 static PyObject *
match_regs_get(MatchObject * self)3596 match_regs_get(MatchObject *self)
3597 {
3598 if (self->regs) {
3599 Py_INCREF(self->regs);
3600 return self->regs;
3601 } else
3602 return match_regs(self);
3603 }
3604
/* Computed attributes of match objects; referenced from
   Match_Type.tp_getset.  All three have a getter only (setter is NULL). */
static PyGetSetDef match_getset[] = {
    {"lastindex", (getter)match_lastindex_get, (setter)NULL},
    {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
    {"regs", (getter)match_regs_get, (setter)NULL},
    {NULL} /* sentinel */
};
3611
/* Byte offset of member x inside MatchObject */
#define MATCH_OFF(x) offsetof(MatchObject, x)
/* Read-only attributes served directly from MatchObject fields; referenced
   from Match_Type.tp_members.  Note that the Python-level name "re" maps
   to the C field "pattern". */
static PyMemberDef match_members[] = {
    {"string", T_OBJECT, MATCH_OFF(string), READONLY},
    {"re", T_OBJECT, MATCH_OFF(pattern), READONLY},
    {"pos", T_PYSSIZET, MATCH_OFF(pos), READONLY},
    {"endpos", T_PYSSIZET, MATCH_OFF(endpos), READONLY},
    {NULL} /* sentinel */
};
3620
3621
/* FIXME: implement setattr("string", None) as a special case (to
   detach the associated string, if any) */
3624
/* Type object for match objects.  It is a variable-size type: every
   instance carries a trailing array of Py_ssize_t items (the itemsize
   below), allocated with PyObject_NEW_VAR. */
static PyTypeObject Match_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_" SRE_MODULE ".SRE_Match",                /* tp_name */
    sizeof(MatchObject), sizeof(Py_ssize_t),    /* tp_basicsize, tp_itemsize */
    (destructor)match_dealloc,  /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_compare */
    0,                          /* tp_repr */
    0,                          /* tp_as_number */
    0,                          /* tp_as_sequence */
    0,                          /* tp_as_mapping */
    0,                          /* tp_hash */
    0,                          /* tp_call */
    0,                          /* tp_str */
    0,                          /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,         /* tp_flags */
    0,                          /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    0,                          /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    0,                          /* tp_iter */
    0,                          /* tp_iternext */
    match_methods,              /* tp_methods */
    match_members,              /* tp_members */
    match_getset,               /* tp_getset */
};
3656
3657 static PyObject*
pattern_new_match(PatternObject * pattern,SRE_STATE * state,int status)3658 pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3659 {
3660 /* create match object (from state object) */
3661
3662 MatchObject* match;
3663 Py_ssize_t i, j;
3664 char* base;
3665 int n;
3666
3667 if (status > 0) {
3668
3669 /* create match object (with room for extra group marks) */
3670 /* coverity[ampersand_in_size] */
3671 match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3672 2*(pattern->groups+1));
3673 if (!match)
3674 return NULL;
3675
3676 Py_INCREF(pattern);
3677 match->pattern = pattern;
3678
3679 Py_INCREF(state->string);
3680 match->string = state->string;
3681
3682 match->regs = NULL;
3683 match->groups = pattern->groups+1;
3684
3685 /* fill in group slices */
3686
3687 base = (char*) state->beginning;
3688 n = state->charsize;
3689
3690 match->mark[0] = ((char*) state->start - base) / n;
3691 match->mark[1] = ((char*) state->ptr - base) / n;
3692
3693 for (i = j = 0; i < pattern->groups; i++, j+=2)
3694 if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3695 match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3696 match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3697 } else
3698 match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3699
3700 match->pos = state->pos;
3701 match->endpos = state->endpos;
3702
3703 match->lastindex = state->lastindex;
3704
3705 return (PyObject*) match;
3706
3707 } else if (status == 0) {
3708
3709 /* no match */
3710 Py_INCREF(Py_None);
3711 return Py_None;
3712
3713 }
3714
3715 /* internal error */
3716 pattern_error(status);
3717 return NULL;
3718 }
3719
3720
3721 /* -------------------------------------------------------------------- */
3722 /* scanner methods (experimental) */
3723
/* Destructor for scanner objects: finalizes the embedded matching state,
   drops the reference to the pattern (which may still be NULL if
   construction failed part-way) and frees the object itself. */
static void
scanner_dealloc(ScannerObject* self)
{
    state_fini(&self->state);
    Py_XDECREF(self->pattern);
    PyObject_DEL(self);
}
3731
3732 static PyObject*
scanner_match(ScannerObject * self,PyObject * unused)3733 scanner_match(ScannerObject* self, PyObject *unused)
3734 {
3735 SRE_STATE* state = &self->state;
3736 PyObject* match;
3737 int status;
3738
3739 state_reset(state);
3740
3741 state->ptr = state->start;
3742
3743 if (state->charsize == 1) {
3744 status = sre_match(state, PatternObject_GetCode(self->pattern));
3745 } else {
3746 #if defined(HAVE_UNICODE)
3747 status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3748 #endif
3749 }
3750 if (PyErr_Occurred())
3751 return NULL;
3752
3753 match = pattern_new_match((PatternObject*) self->pattern,
3754 state, status);
3755
3756 if (status == 0 || state->ptr == state->start)
3757 state->start = (void*) ((char*) state->ptr + state->charsize);
3758 else
3759 state->start = state->ptr;
3760
3761 return match;
3762 }
3763
3764
3765 static PyObject*
scanner_search(ScannerObject * self,PyObject * unused)3766 scanner_search(ScannerObject* self, PyObject *unused)
3767 {
3768 SRE_STATE* state = &self->state;
3769 PyObject* match;
3770 int status;
3771
3772 state_reset(state);
3773
3774 state->ptr = state->start;
3775
3776 if (state->charsize == 1) {
3777 status = sre_search(state, PatternObject_GetCode(self->pattern));
3778 } else {
3779 #if defined(HAVE_UNICODE)
3780 status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3781 #endif
3782 }
3783 if (PyErr_Occurred())
3784 return NULL;
3785
3786 match = pattern_new_match((PatternObject*) self->pattern,
3787 state, status);
3788
3789 if (status == 0 || state->ptr == state->start)
3790 state->start = (void*) ((char*) state->ptr + state->charsize);
3791 else
3792 state->start = state->ptr;
3793
3794 return match;
3795 }
3796
/* Method table for scanner objects; referenced from
   Scanner_Type.tp_methods.  Both methods take no arguments. */
static PyMethodDef scanner_methods[] = {
    {"match", (PyCFunction) scanner_match, METH_NOARGS},
    {"search", (PyCFunction) scanner_search, METH_NOARGS},
    {NULL, NULL} /* sentinel */
};
3802
/* Byte offset of member x inside ScannerObject */
#define SCAN_OFF(x) offsetof(ScannerObject, x)
/* Read-only attributes of scanner objects; referenced from
   Scanner_Type.tp_members. */
static PyMemberDef scanner_members[] = {
    {"pattern", T_OBJECT, SCAN_OFF(pattern), READONLY},
    {NULL} /* Sentinel */
};
3808
/* Type object for scanner objects (fixed size: itemsize is 0). */
statichere PyTypeObject Scanner_Type = {
    PyObject_HEAD_INIT(NULL)
    0, "_" SRE_MODULE ".SRE_Scanner",   /* ob_size, tp_name */
    sizeof(ScannerObject), 0,           /* tp_basicsize, tp_itemsize */
    (destructor)scanner_dealloc, /*tp_dealloc*/
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_compare */
    0,                          /* tp_repr */
    0,                          /* tp_as_number */
    0,                          /* tp_as_sequence */
    0,                          /* tp_as_mapping */
    0,                          /* tp_hash */
    0,                          /* tp_call */
    0,                          /* tp_str */
    0,                          /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,         /* tp_flags */
    0,                          /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    0,                          /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    0,                          /* tp_iter */
    0,                          /* tp_iternext */
    scanner_methods,            /* tp_methods */
    scanner_members,            /* tp_members */
    0,                          /* tp_getset */
};
3840
3841 static PyObject*
pattern_scanner(PatternObject * pattern,PyObject * args)3842 pattern_scanner(PatternObject* pattern, PyObject* args)
3843 {
3844 /* create search state object */
3845
3846 ScannerObject* self;
3847
3848 PyObject* string;
3849 Py_ssize_t start = 0;
3850 Py_ssize_t end = PY_SSIZE_T_MAX;
3851 if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3852 return NULL;
3853
3854 /* create scanner object */
3855 self = PyObject_NEW(ScannerObject, &Scanner_Type);
3856 if (!self)
3857 return NULL;
3858 self->pattern = NULL;
3859
3860 string = state_init(&self->state, pattern, string, start, end);
3861 if (!string) {
3862 Py_DECREF(self);
3863 return NULL;
3864 }
3865
3866 Py_INCREF(pattern);
3867 self->pattern = (PyObject*) pattern;
3868
3869 return (PyObject*) self;
3870 }
3871
/* Module-level functions; handed to Py_InitModule() in init_sre(). */
static PyMethodDef _functions[] = {
    {"compile", _compile, METH_VARARGS},
    {"getcodesize", sre_codesize, METH_NOARGS},
    {"getlower", sre_getlower, METH_VARARGS},
    {NULL, NULL} /* sentinel */
};
3878
3879 #if PY_VERSION_HEX < 0x02030000
init_sre(void)3880 DL_EXPORT(void) init_sre(void)
3881 #else
3882 PyMODINIT_FUNC init_sre(void)
3883 #endif
3884 {
3885 PyObject* m;
3886 PyObject* d;
3887 PyObject* x;
3888
3889 /* Patch object types */
3890 if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3891 PyType_Ready(&Scanner_Type))
3892 return;
3893
3894 m = Py_InitModule("_" SRE_MODULE, _functions);
3895 if (m == NULL)
3896 return;
3897 d = PyModule_GetDict(m);
3898
3899 x = PyInt_FromLong(SRE_MAGIC);
3900 if (x) {
3901 PyDict_SetItemString(d, "MAGIC", x);
3902 Py_DECREF(x);
3903 }
3904
3905 x = PyInt_FromLong(sizeof(SRE_CODE));
3906 if (x) {
3907 PyDict_SetItemString(d, "CODESIZE", x);
3908 Py_DECREF(x);
3909 }
3910
3911 x = PyString_FromString(copyright);
3912 if (x) {
3913 PyDict_SetItemString(d, "copyright", x);
3914 Py_DECREF(x);
3915 }
3916 }
3917
3918 #endif /* !defined(SRE_RECURSIVE) */
3919
3920 /* vim:ts=4:sw=4:et
3921 */
3922