1/*
2 * Secret Labs' Regular Expression Engine
3 *
4 * regular expression matching engine
5
6    Copyright (c) 2011, Intel Corporation. All rights reserved.<BR>
7    This program and the accompanying materials are licensed and made available under
8    the terms and conditions of the BSD License that accompanies this distribution.
9    The full text of the license may be found at
10    http://opensource.org/licenses/bsd-license.
11
12    THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
13    WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
14 *
15 * partial history:
16 * 1999-10-24 fl  created (based on existing template matcher code)
17 * 2000-03-06 fl  first alpha, sort of
18 * 2000-08-01 fl  fixes for 1.6b1
19 * 2000-08-07 fl  use PyOS_CheckStack() if available
20 * 2000-09-20 fl  added expand method
21 * 2001-03-20 fl  lots of fixes for 2.1b2
22 * 2001-04-15 fl  export copyright as Python attribute, not global
23 * 2001-04-28 fl  added __copy__ methods (work in progress)
24 * 2001-05-14 fl  fixes for 1.5.2 compatibility
25 * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
26 * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
27 * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
28 * 2001-10-21 fl  added sub/subn primitive
29 * 2001-10-24 fl  added finditer primitive (for 2.2 only)
30 * 2001-12-07 fl  fixed memory leak in sub/subn (Guido van Rossum)
31 * 2002-11-09 fl  fixed empty sub/subn return type
32 * 2003-04-18 mvl fully support 4-byte codes
33 * 2003-10-17 gn  implemented non recursive scheme
34 *
35 * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
36 *
37 * This version of the SRE library can be redistributed under CNRI's
38 * Python 1.6 license.  For any other use, please contact Secret Labs
39 * AB (info@pythonware.com).
40 *
41 * Portions of this engine have been developed in cooperation with
42 * CNRI.  Hewlett-Packard provided funding for 1.6 integration and
43 * other compatibility work.
44 */
45
46/* Get rid of these macros to prevent collisions between EFI and Python in this file. */
47#undef  RETURN_ERROR
48#undef  RETURN_SUCCESS
49
50#ifndef SRE_RECURSIVE
51
52static char copyright[] =
53    " SRE 2.2.2 Copyright (c) 1997-2002 by Secret Labs AB ";
54
55#define PY_SSIZE_T_CLEAN
56
57#include "Python.h"
58#include "structmember.h" /* offsetof */
59
60#include "sre.h"
61
62#include <ctype.h>
63
64/* name of this module, minus the leading underscore */
65#if !defined(SRE_MODULE)
66#define SRE_MODULE "sre"
67#endif
68
69#define SRE_PY_MODULE "re"
70
71/* defining this one enables tracing */
72#undef VERBOSE
73
74#if PY_VERSION_HEX >= 0x01060000
75#if PY_VERSION_HEX  < 0x02020000 || defined(Py_USING_UNICODE)
76/* defining this enables unicode support (default under 1.6a1 and later) */
77#define HAVE_UNICODE
78#endif
79#endif
80
81/* -------------------------------------------------------------------- */
82/* optional features */
83
84/* enables fast searching */
85#define USE_FAST_SEARCH
86
87/* enables aggressive inlining (always on for Visual C) */
88#undef USE_INLINE
89
90/* enables copy/deepcopy handling (work in progress) */
91#undef USE_BUILTIN_COPY
92
93#if PY_VERSION_HEX < 0x01060000
94#define PyObject_DEL(op) PyMem_DEL((op))
95#endif
96
97/* -------------------------------------------------------------------- */
98
99#if defined(_MSC_VER)
100#pragma optimize("gt", on) /* doesn't seem to make much difference... */
101#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
102/* fastest possible local call under MSVC */
103#define LOCAL(type) static __inline type __fastcall
104#elif defined(USE_INLINE)
105#define LOCAL(type) static inline type
106#else
107#define LOCAL(type) static type
108#endif
109
110/* error codes */
111#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
112#define SRE_ERROR_STATE -2 /* illegal state */
113#define SRE_ERROR_RECURSION_LIMIT -3 /* runaway recursion */
114#define SRE_ERROR_MEMORY -9 /* out of memory */
115#define SRE_ERROR_INTERRUPTED -10 /* signal handler raised exception */
116
117#if defined(VERBOSE)
118#define TRACE(v) printf v
119#else
120#define TRACE(v)
121#endif
122
123/* -------------------------------------------------------------------- */
124/* search engine state */
125
126/* default character predicates (run sre_chars.py to regenerate tables) */
127
128#define SRE_DIGIT_MASK 1
129#define SRE_SPACE_MASK 2
130#define SRE_LINEBREAK_MASK 4
131#define SRE_ALNUM_MASK 8
132#define SRE_WORD_MASK 16
133
134/* FIXME: this assumes ASCII.  create tables in init_sre() instead */
135
136static char sre_char_info[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 6, 2,
1372, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
1380, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 25, 25, 25, 25, 25, 25, 25,
13925, 25, 0, 0, 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
14024, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0,
1410, 0, 16, 0, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
14224, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 0, 0, 0 };
143
144static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
14510, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
14627, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
14744, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
14861, 62, 63, 64, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
149108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
150122, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
151106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
152120, 121, 122, 123, 124, 125, 126, 127 };
153
154#define SRE_IS_DIGIT(ch)\
155    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
156#define SRE_IS_SPACE(ch)\
157    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_SPACE_MASK) : 0)
158#define SRE_IS_LINEBREAK(ch)\
159    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_LINEBREAK_MASK) : 0)
160#define SRE_IS_ALNUM(ch)\
161    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_ALNUM_MASK) : 0)
162#define SRE_IS_WORD(ch)\
163    ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
164
165static unsigned int sre_lower(unsigned int ch)
166{
167    return ((ch) < 128 ? (unsigned int)sre_char_lower[ch] : ch);
168}
169
170/* locale-specific character predicates */
171/* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
172 * warnings when c's type supports only numbers < N+1 */
173#define SRE_LOC_IS_DIGIT(ch) (!((ch) & ~255) ? isdigit((ch)) : 0)
174#define SRE_LOC_IS_SPACE(ch) (!((ch) & ~255) ? isspace((ch)) : 0)
175#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
176#define SRE_LOC_IS_ALNUM(ch) (!((ch) & ~255) ? isalnum((ch)) : 0)
177#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
178
179static unsigned int sre_lower_locale(unsigned int ch)
180{
181    return ((ch) < 256 ? (unsigned int)tolower((ch)) : ch);
182}
183
184/* unicode-specific character predicates */
185
186#if defined(HAVE_UNICODE)
187
188#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
189#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
190#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
191#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
192#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
193
194static unsigned int sre_lower_unicode(unsigned int ch)
195{
196    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
197}
198
199#endif
200
201LOCAL(int)
202sre_category(SRE_CODE category, unsigned int ch)
203{
204    switch (category) {
205
206    case SRE_CATEGORY_DIGIT:
207        return SRE_IS_DIGIT(ch);
208    case SRE_CATEGORY_NOT_DIGIT:
209        return !SRE_IS_DIGIT(ch);
210    case SRE_CATEGORY_SPACE:
211        return SRE_IS_SPACE(ch);
212    case SRE_CATEGORY_NOT_SPACE:
213        return !SRE_IS_SPACE(ch);
214    case SRE_CATEGORY_WORD:
215        return SRE_IS_WORD(ch);
216    case SRE_CATEGORY_NOT_WORD:
217        return !SRE_IS_WORD(ch);
218    case SRE_CATEGORY_LINEBREAK:
219        return SRE_IS_LINEBREAK(ch);
220    case SRE_CATEGORY_NOT_LINEBREAK:
221        return !SRE_IS_LINEBREAK(ch);
222
223    case SRE_CATEGORY_LOC_WORD:
224        return SRE_LOC_IS_WORD(ch);
225    case SRE_CATEGORY_LOC_NOT_WORD:
226        return !SRE_LOC_IS_WORD(ch);
227
228#if defined(HAVE_UNICODE)
229    case SRE_CATEGORY_UNI_DIGIT:
230        return SRE_UNI_IS_DIGIT(ch);
231    case SRE_CATEGORY_UNI_NOT_DIGIT:
232        return !SRE_UNI_IS_DIGIT(ch);
233    case SRE_CATEGORY_UNI_SPACE:
234        return SRE_UNI_IS_SPACE(ch);
235    case SRE_CATEGORY_UNI_NOT_SPACE:
236        return !SRE_UNI_IS_SPACE(ch);
237    case SRE_CATEGORY_UNI_WORD:
238        return SRE_UNI_IS_WORD(ch);
239    case SRE_CATEGORY_UNI_NOT_WORD:
240        return !SRE_UNI_IS_WORD(ch);
241    case SRE_CATEGORY_UNI_LINEBREAK:
242        return SRE_UNI_IS_LINEBREAK(ch);
243    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
244        return !SRE_UNI_IS_LINEBREAK(ch);
245#else
246    case SRE_CATEGORY_UNI_DIGIT:
247        return SRE_IS_DIGIT(ch);
248    case SRE_CATEGORY_UNI_NOT_DIGIT:
249        return !SRE_IS_DIGIT(ch);
250    case SRE_CATEGORY_UNI_SPACE:
251        return SRE_IS_SPACE(ch);
252    case SRE_CATEGORY_UNI_NOT_SPACE:
253        return !SRE_IS_SPACE(ch);
254    case SRE_CATEGORY_UNI_WORD:
255        return SRE_LOC_IS_WORD(ch);
256    case SRE_CATEGORY_UNI_NOT_WORD:
257        return !SRE_LOC_IS_WORD(ch);
258    case SRE_CATEGORY_UNI_LINEBREAK:
259        return SRE_IS_LINEBREAK(ch);
260    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
261        return !SRE_IS_LINEBREAK(ch);
262#endif
263    }
264    return 0;
265}
266
267/* helpers */
268
269static void
270data_stack_dealloc(SRE_STATE* state)
271{
272    if (state->data_stack) {
273        PyMem_FREE(state->data_stack);
274        state->data_stack = NULL;
275    }
276    state->data_stack_size = state->data_stack_base = 0;
277}
278
279static int
280data_stack_grow(SRE_STATE* state, Py_ssize_t size)
281{
282    Py_ssize_t minsize, cursize;
283    minsize = state->data_stack_base+size;
284    cursize = state->data_stack_size;
285    if (cursize < minsize) {
286        void* stack;
287        cursize = minsize+minsize/4+1024;
288        TRACE(("allocate/grow stack %d\n", cursize));
289        stack = PyMem_REALLOC(state->data_stack, cursize);
290        if (!stack) {
291            data_stack_dealloc(state);
292            return SRE_ERROR_MEMORY;
293        }
294        state->data_stack = (char *)stack;
295        state->data_stack_size = cursize;
296    }
297    return 0;
298}
299
300/* generate 8-bit version */
301
302#define SRE_CHAR unsigned char
303#define SRE_AT sre_at
304#define SRE_COUNT sre_count
305#define SRE_CHARSET sre_charset
306#define SRE_INFO sre_info
307#define SRE_MATCH sre_match
308#define SRE_MATCH_CONTEXT sre_match_context
309#define SRE_SEARCH sre_search
310#define SRE_LITERAL_TEMPLATE sre_literal_template
311
312#if defined(HAVE_UNICODE)
313
314#define SRE_RECURSIVE
315#include "_sre.c"
316#undef SRE_RECURSIVE
317
318#undef SRE_LITERAL_TEMPLATE
319#undef SRE_SEARCH
320#undef SRE_MATCH
321#undef SRE_MATCH_CONTEXT
322#undef SRE_INFO
323#undef SRE_CHARSET
324#undef SRE_COUNT
325#undef SRE_AT
326#undef SRE_CHAR
327
328/* generate 16-bit unicode version */
329
330#define SRE_CHAR Py_UNICODE
331#define SRE_AT sre_uat
332#define SRE_COUNT sre_ucount
333#define SRE_CHARSET sre_ucharset
334#define SRE_INFO sre_uinfo
335#define SRE_MATCH sre_umatch
336#define SRE_MATCH_CONTEXT sre_umatch_context
337#define SRE_SEARCH sre_usearch
338#define SRE_LITERAL_TEMPLATE sre_uliteral_template
339#endif
340
341#endif /* SRE_RECURSIVE */
342
343/* -------------------------------------------------------------------- */
344/* String matching engine */
345
346/* the following section is compiled twice, with different character
347   settings */
348
349LOCAL(int)
350SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
351{
352    /* check if pointer is at given position */
353
354    Py_ssize_t thisp, thatp;
355
356    switch (at) {
357
358    case SRE_AT_BEGINNING:
359    case SRE_AT_BEGINNING_STRING:
360        return ((void*) ptr == state->beginning);
361
362    case SRE_AT_BEGINNING_LINE:
363        return ((void*) ptr == state->beginning ||
364                SRE_IS_LINEBREAK((int) ptr[-1]));
365
366    case SRE_AT_END:
367        return (((void*) (ptr+1) == state->end &&
368                 SRE_IS_LINEBREAK((int) ptr[0])) ||
369                ((void*) ptr == state->end));
370
371    case SRE_AT_END_LINE:
372        return ((void*) ptr == state->end ||
373                SRE_IS_LINEBREAK((int) ptr[0]));
374
375    case SRE_AT_END_STRING:
376        return ((void*) ptr == state->end);
377
378    case SRE_AT_BOUNDARY:
379        if (state->beginning == state->end)
380            return 0;
381        thatp = ((void*) ptr > state->beginning) ?
382            SRE_IS_WORD((int) ptr[-1]) : 0;
383        thisp = ((void*) ptr < state->end) ?
384            SRE_IS_WORD((int) ptr[0]) : 0;
385        return thisp != thatp;
386
387    case SRE_AT_NON_BOUNDARY:
388        if (state->beginning == state->end)
389            return 0;
390        thatp = ((void*) ptr > state->beginning) ?
391            SRE_IS_WORD((int) ptr[-1]) : 0;
392        thisp = ((void*) ptr < state->end) ?
393            SRE_IS_WORD((int) ptr[0]) : 0;
394        return thisp == thatp;
395
396    case SRE_AT_LOC_BOUNDARY:
397        if (state->beginning == state->end)
398            return 0;
399        thatp = ((void*) ptr > state->beginning) ?
400            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
401        thisp = ((void*) ptr < state->end) ?
402            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
403        return thisp != thatp;
404
405    case SRE_AT_LOC_NON_BOUNDARY:
406        if (state->beginning == state->end)
407            return 0;
408        thatp = ((void*) ptr > state->beginning) ?
409            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
410        thisp = ((void*) ptr < state->end) ?
411            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
412        return thisp == thatp;
413
414#if defined(HAVE_UNICODE)
415    case SRE_AT_UNI_BOUNDARY:
416        if (state->beginning == state->end)
417            return 0;
418        thatp = ((void*) ptr > state->beginning) ?
419            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
420        thisp = ((void*) ptr < state->end) ?
421            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
422        return thisp != thatp;
423
424    case SRE_AT_UNI_NON_BOUNDARY:
425        if (state->beginning == state->end)
426            return 0;
427        thatp = ((void*) ptr > state->beginning) ?
428            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
429        thisp = ((void*) ptr < state->end) ?
430            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
431        return thisp == thatp;
432#endif
433
434    }
435
436    return 0;
437}
438
439LOCAL(int)
440SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
441{
442    /* check if character is a member of the given set */
443
444    int ok = 1;
445
446    for (;;) {
447        switch (*set++) {
448
449        case SRE_OP_FAILURE:
450            return !ok;
451
452        case SRE_OP_LITERAL:
453            /* <LITERAL> <code> */
454            if (ch == set[0])
455                return ok;
456            set++;
457            break;
458
459        case SRE_OP_CATEGORY:
460            /* <CATEGORY> <code> */
461            if (sre_category(set[0], (int) ch))
462                return ok;
463            set += 1;
464            break;
465
466        case SRE_OP_CHARSET:
467            if (sizeof(SRE_CODE) == 2) {
468                /* <CHARSET> <bitmap> (16 bits per code word) */
469                if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
470                    return ok;
471                set += 16;
472            }
473            else {
474                /* <CHARSET> <bitmap> (32 bits per code word) */
475                if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
476                    return ok;
477                set += 8;
478            }
479            break;
480
481        case SRE_OP_RANGE:
482            /* <RANGE> <lower> <upper> */
483            if (set[0] <= ch && ch <= set[1])
484                return ok;
485            set += 2;
486            break;
487
488        case SRE_OP_NEGATE:
489            ok = !ok;
490            break;
491
492        case SRE_OP_BIGCHARSET:
493            /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
494        {
495            Py_ssize_t count, block;
496            count = *(set++);
497
498            if (sizeof(SRE_CODE) == 2) {
499                block = ((unsigned char*)set)[ch >> 8];
500                set += 128;
501                if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
502                    return ok;
503                set += count*16;
504            }
505            else {
506                /* !(c & ~N) == (c < N+1) for any unsigned c, this avoids
507                 * warnings when c's type supports only numbers < N+1 */
508                if (!(ch & ~65535))
509                    block = ((unsigned char*)set)[ch >> 8];
510                else
511                    block = -1;
512                set += 64;
513                if (block >=0 &&
514                    (set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
515                    return ok;
516                set += count*8;
517            }
518            break;
519        }
520
521        default:
522            /* internal error -- there's not much we can do about it
523               here, so let's just pretend it didn't match... */
524            return 0;
525        }
526    }
527}
528
529LOCAL(Py_ssize_t) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern);
530
531LOCAL(Py_ssize_t)
532SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount)
533{
534    SRE_CODE chr;
535    SRE_CHAR* ptr = (SRE_CHAR *)state->ptr;
536    SRE_CHAR* end = (SRE_CHAR *)state->end;
537    Py_ssize_t i;
538
539    /* adjust end */
540    if (maxcount < end - ptr && maxcount != 65535)
541        end = ptr + maxcount;
542
543    switch (pattern[0]) {
544
545    case SRE_OP_IN:
546        /* repeated set */
547        TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
548        while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
549            ptr++;
550        break;
551
552    case SRE_OP_ANY:
553        /* repeated dot wildcard. */
554        TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
555        while (ptr < end && !SRE_IS_LINEBREAK(*ptr))
556            ptr++;
557        break;
558
559    case SRE_OP_ANY_ALL:
560        /* repeated dot wildcard.  skip to the end of the target
561           string, and backtrack from there */
562        TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
563        ptr = end;
564        break;
565
566    case SRE_OP_LITERAL:
567        /* repeated literal */
568        chr = pattern[1];
569        TRACE(("|%p|%p|COUNT LITERAL %d\n", pattern, ptr, chr));
570        while (ptr < end && (SRE_CODE) *ptr == chr)
571            ptr++;
572        break;
573
574    case SRE_OP_LITERAL_IGNORE:
575        /* repeated literal */
576        chr = pattern[1];
577        TRACE(("|%p|%p|COUNT LITERAL_IGNORE %d\n", pattern, ptr, chr));
578        while (ptr < end && (SRE_CODE) state->lower(*ptr) == chr)
579            ptr++;
580        break;
581
582    case SRE_OP_NOT_LITERAL:
583        /* repeated non-literal */
584        chr = pattern[1];
585        TRACE(("|%p|%p|COUNT NOT_LITERAL %d\n", pattern, ptr, chr));
586        while (ptr < end && (SRE_CODE) *ptr != chr)
587            ptr++;
588        break;
589
590    case SRE_OP_NOT_LITERAL_IGNORE:
591        /* repeated non-literal */
592        chr = pattern[1];
593        TRACE(("|%p|%p|COUNT NOT_LITERAL_IGNORE %d\n", pattern, ptr, chr));
594        while (ptr < end && (SRE_CODE) state->lower(*ptr) != chr)
595            ptr++;
596        break;
597
598    default:
599        /* repeated single character pattern */
600        TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
601        while ((SRE_CHAR*) state->ptr < end) {
602            i = SRE_MATCH(state, pattern);
603            if (i < 0)
604                return i;
605            if (!i)
606                break;
607        }
608        TRACE(("|%p|%p|COUNT %d\n", pattern, ptr,
609               (SRE_CHAR*) state->ptr - ptr));
610        return (SRE_CHAR*) state->ptr - ptr;
611    }
612
613    TRACE(("|%p|%p|COUNT %d\n", pattern, ptr, ptr - (SRE_CHAR*) state->ptr));
614    return ptr - (SRE_CHAR*) state->ptr;
615}
616
617#if 0 /* not used in this release */
618LOCAL(int)
619SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
620{
621    /* check if an SRE_OP_INFO block matches at the current position.
622       returns the number of SRE_CODE objects to skip if successful, 0
623       if no match */
624
625    SRE_CHAR* end = state->end;
626    SRE_CHAR* ptr = state->ptr;
627    Py_ssize_t i;
628
629    /* check minimal length */
630    if (pattern[3] && (end - ptr) < pattern[3])
631        return 0;
632
633    /* check known prefix */
634    if (pattern[2] & SRE_INFO_PREFIX && pattern[5] > 1) {
635        /* <length> <skip> <prefix data> <overlap data> */
636        for (i = 0; i < pattern[5]; i++)
637            if ((SRE_CODE) ptr[i] != pattern[7 + i])
638                return 0;
639        return pattern[0] + 2 * pattern[6];
640    }
641    return pattern[0];
642}
643#endif
644
645/* The macros below should be used to protect recursive SRE_MATCH()
646 * calls that *failed* and do *not* return immediately (IOW, those
647 * that will backtrack). Explaining:
648 *
649 * - Recursive SRE_MATCH() returned true: that's usually a success
650 *   (besides atypical cases like ASSERT_NOT), therefore there's no
651 *   reason to restore lastmark;
652 *
653 * - Recursive SRE_MATCH() returned false but the current SRE_MATCH()
654 *   is returning to the caller: If the current SRE_MATCH() is the
655 *   top function of the recursion, returning false will be a matching
656 *   failure, and it doesn't matter where lastmark is pointing to.
657 *   If it's *not* the top function, it will be a recursive SRE_MATCH()
658 *   failure by itself, and the calling SRE_MATCH() will have to deal
659 *   with the failure by the same rules explained here (it will restore
660 *   lastmark by itself if necessary);
661 *
662 * - Recursive SRE_MATCH() returned false, and will continue the
663 *   outside 'for' loop: must be protected when breaking, since the next
664 *   OP could potentially depend on lastmark;
665 *
666 * - Recursive SRE_MATCH() returned false, and will be called again
667 *   inside a local for/while loop: must be protected between each
668 *   loop iteration, since the recursive SRE_MATCH() could do anything,
669 *   and could potentially depend on lastmark.
670 *
671 * For more information, check the discussion at SF patch #712900.
672 */
673#define LASTMARK_SAVE()     \
674    do { \
675        ctx->lastmark = state->lastmark; \
676        ctx->lastindex = state->lastindex; \
677    } while (0)
678#define LASTMARK_RESTORE()  \
679    do { \
680        state->lastmark = ctx->lastmark; \
681        state->lastindex = ctx->lastindex; \
682    } while (0)
683
684#define RETURN_ERROR(i) do { return i; } while(0)
685#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
686#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
687
688#define RETURN_ON_ERROR(i) \
689    do { if (i < 0) RETURN_ERROR(i); } while (0)
690#define RETURN_ON_SUCCESS(i) \
691    do { RETURN_ON_ERROR(i); if (i > 0) RETURN_SUCCESS; } while (0)
692#define RETURN_ON_FAILURE(i) \
693    do { RETURN_ON_ERROR(i); if (i == 0) RETURN_FAILURE; } while (0)
694
695#define SFY(x) #x
696
697#define DATA_STACK_ALLOC(state, type, ptr) \
698do { \
699    alloc_pos = state->data_stack_base; \
700    TRACE(("allocating %s in %d (%d)\n", \
701           SFY(type), alloc_pos, sizeof(type))); \
702    if (state->data_stack_size < alloc_pos+sizeof(type)) { \
703        int j = data_stack_grow(state, sizeof(type)); \
704        if (j < 0) return j; \
705        if (ctx_pos != -1) \
706            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
707    } \
708    ptr = (type*)(state->data_stack+alloc_pos); \
709    state->data_stack_base += sizeof(type); \
710} while (0)
711
712#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
713do { \
714    TRACE(("looking up %s at %d\n", SFY(type), pos)); \
715    ptr = (type*)(state->data_stack+pos); \
716} while (0)
717
718#define DATA_STACK_PUSH(state, data, size) \
719do { \
720    TRACE(("copy data in %p to %d (%d)\n", \
721           data, state->data_stack_base, size)); \
722    if (state->data_stack_size < state->data_stack_base+size) { \
723        int j = data_stack_grow(state, size); \
724        if (j < 0) return j; \
725        if (ctx_pos != -1) \
726            DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
727    } \
728    memcpy(state->data_stack+state->data_stack_base, data, size); \
729    state->data_stack_base += size; \
730} while (0)
731
732#define DATA_STACK_POP(state, data, size, discard) \
733do { \
734    TRACE(("copy data to %p from %d (%d)\n", \
735           data, state->data_stack_base-size, size)); \
736    memcpy(data, state->data_stack+state->data_stack_base-size, size); \
737    if (discard) \
738        state->data_stack_base -= size; \
739} while (0)
740
741#define DATA_STACK_POP_DISCARD(state, size) \
742do { \
743    TRACE(("discard data from %d (%d)\n", \
744           state->data_stack_base-size, size)); \
745    state->data_stack_base -= size; \
746} while(0)
747
748#define DATA_PUSH(x) \
749    DATA_STACK_PUSH(state, (x), sizeof(*(x)))
750#define DATA_POP(x) \
751    DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
752#define DATA_POP_DISCARD(x) \
753    DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
754#define DATA_ALLOC(t,p) \
755    DATA_STACK_ALLOC(state, t, p)
756#define DATA_LOOKUP_AT(t,p,pos) \
757    DATA_STACK_LOOKUP_AT(state,t,p,pos)
758
759#define MARK_PUSH(lastmark) \
760    do if (lastmark > 0) { \
761        i = lastmark; /* ctx->lastmark may change if reallocated */ \
762        DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
763    } while (0)
764#define MARK_POP(lastmark) \
765    do if (lastmark > 0) { \
766        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
767    } while (0)
768#define MARK_POP_KEEP(lastmark) \
769    do if (lastmark > 0) { \
770        DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
771    } while (0)
772#define MARK_POP_DISCARD(lastmark) \
773    do if (lastmark > 0) { \
774        DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
775    } while (0)
776
777#define JUMP_NONE            0
778#define JUMP_MAX_UNTIL_1     1
779#define JUMP_MAX_UNTIL_2     2
780#define JUMP_MAX_UNTIL_3     3
781#define JUMP_MIN_UNTIL_1     4
782#define JUMP_MIN_UNTIL_2     5
783#define JUMP_MIN_UNTIL_3     6
784#define JUMP_REPEAT          7
785#define JUMP_REPEAT_ONE_1    8
786#define JUMP_REPEAT_ONE_2    9
787#define JUMP_MIN_REPEAT_ONE  10
788#define JUMP_BRANCH          11
789#define JUMP_ASSERT          12
790#define JUMP_ASSERT_NOT      13
791
792#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
793    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
794    nextctx->last_ctx_pos = ctx_pos; \
795    nextctx->jump = jumpvalue; \
796    nextctx->pattern = nextpattern; \
797    ctx_pos = alloc_pos; \
798    ctx = nextctx; \
799    goto entrance; \
800    jumplabel: \
801    while (0) /* gcc doesn't like labels at end of scopes */ \
802
803typedef struct {
804    Py_ssize_t last_ctx_pos;
805    Py_ssize_t jump;
806    SRE_CHAR* ptr;
807    SRE_CODE* pattern;
808    Py_ssize_t count;
809    Py_ssize_t lastmark;
810    Py_ssize_t lastindex;
811    union {
812        SRE_CODE chr;
813        SRE_REPEAT* rep;
814    } u;
815} SRE_MATCH_CONTEXT;
816
817/* check if string matches the given pattern.  returns <0 for
818   error, 0 for failure, and 1 for success */
819LOCAL(Py_ssize_t)
820SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
821{
822    SRE_CHAR* end = (SRE_CHAR *)state->end;
823    Py_ssize_t alloc_pos, ctx_pos = -1;
824    Py_ssize_t i, ret = 0;
825    Py_ssize_t jump;
826    unsigned int sigcount=0;
827
828    SRE_MATCH_CONTEXT* ctx;
829    SRE_MATCH_CONTEXT* nextctx;
830
831    TRACE(("|%p|%p|ENTER\n", pattern, state->ptr));
832
833    DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
834    ctx->last_ctx_pos = -1;
835    ctx->jump = JUMP_NONE;
836    ctx->pattern = pattern;
837    ctx_pos = alloc_pos;
838
839entrance:
840
841    ctx->ptr = (SRE_CHAR *)state->ptr;
842
843    if (ctx->pattern[0] == SRE_OP_INFO) {
844        /* optimization info block */
845        /* <INFO> <1=skip> <2=flags> <3=min> ... */
846        if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
847            TRACE(("reject (got %d chars, need %d)\n",
848                   (end - ctx->ptr), ctx->pattern[3]));
849            RETURN_FAILURE;
850        }
851        ctx->pattern += ctx->pattern[1] + 1;
852    }
853
854    for (;;) {
855        ++sigcount;
856        if ((0 == (sigcount & 0xfff)) && PyErr_CheckSignals())
857            RETURN_ERROR(SRE_ERROR_INTERRUPTED);
858
859        switch (*ctx->pattern++) {
860
861        case SRE_OP_MARK:
862            /* set mark */
863            /* <MARK> <gid> */
864            TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
865                   ctx->ptr, ctx->pattern[0]));
866            i = ctx->pattern[0];
867            if (i & 1)
868                state->lastindex = i/2 + 1;
869            if (i > state->lastmark) {
870                /* state->lastmark is the highest valid index in the
871                   state->mark array.  If it is increased by more than 1,
872                   the intervening marks must be set to NULL to signal
873                   that these marks have not been encountered. */
874                Py_ssize_t j = state->lastmark + 1;
875                while (j < i)
876                    state->mark[j++] = NULL;
877                state->lastmark = i;
878            }
879            state->mark[i] = ctx->ptr;
880            ctx->pattern++;
881            break;
882
883        case SRE_OP_LITERAL:
884            /* match literal string */
885            /* <LITERAL> <code> */
886            TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
887                   ctx->ptr, *ctx->pattern));
888            if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
889                RETURN_FAILURE;
890            ctx->pattern++;
891            ctx->ptr++;
892            break;
893
894        case SRE_OP_NOT_LITERAL:
895            /* match anything that is not literal character */
896            /* <NOT_LITERAL> <code> */
897            TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
898                   ctx->ptr, *ctx->pattern));
899            if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
900                RETURN_FAILURE;
901            ctx->pattern++;
902            ctx->ptr++;
903            break;
904
905        case SRE_OP_SUCCESS:
906            /* end of pattern */
907            TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
908            state->ptr = ctx->ptr;
909            RETURN_SUCCESS;
910
911        case SRE_OP_AT:
912            /* match at given position */
913            /* <AT> <code> */
914            TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
915            if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
916                RETURN_FAILURE;
917            ctx->pattern++;
918            break;
919
920        case SRE_OP_CATEGORY:
921            /* match at given category */
922            /* <CATEGORY> <code> */
923            TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
924                   ctx->ptr, *ctx->pattern));
925            if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
926                RETURN_FAILURE;
927            ctx->pattern++;
928            ctx->ptr++;
929            break;
930
931        case SRE_OP_ANY:
932            /* match anything (except a newline) */
933            /* <ANY> */
934            TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
935            if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
936                RETURN_FAILURE;
937            ctx->ptr++;
938            break;
939
940        case SRE_OP_ANY_ALL:
941            /* match anything */
942            /* <ANY_ALL> */
943            TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
944            if (ctx->ptr >= end)
945                RETURN_FAILURE;
946            ctx->ptr++;
947            break;
948
949        case SRE_OP_IN:
950            /* match set member (or non_member) */
951            /* <IN> <skip> <set> */
952            TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
953            if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
954                RETURN_FAILURE;
955            ctx->pattern += ctx->pattern[0];
956            ctx->ptr++;
957            break;
958
959        case SRE_OP_LITERAL_IGNORE:
960            TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
961                   ctx->pattern, ctx->ptr, ctx->pattern[0]));
962            if (ctx->ptr >= end ||
963                state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
964                RETURN_FAILURE;
965            ctx->pattern++;
966            ctx->ptr++;
967            break;
968
969        case SRE_OP_NOT_LITERAL_IGNORE:
970            TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
971                   ctx->pattern, ctx->ptr, *ctx->pattern));
972            if (ctx->ptr >= end ||
973                state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
974                RETURN_FAILURE;
975            ctx->pattern++;
976            ctx->ptr++;
977            break;
978
979        case SRE_OP_IN_IGNORE:
980            TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
981            if (ctx->ptr >= end
982                || !SRE_CHARSET(ctx->pattern+1,
983                                (SRE_CODE)state->lower(*ctx->ptr)))
984                RETURN_FAILURE;
985            ctx->pattern += ctx->pattern[0];
986            ctx->ptr++;
987            break;
988
989        case SRE_OP_JUMP:
990        case SRE_OP_INFO:
991            /* jump forward */
992            /* <JUMP> <offset> */
993            TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
994                   ctx->ptr, ctx->pattern[0]));
995            ctx->pattern += ctx->pattern[0];
996            break;
997
998        case SRE_OP_BRANCH:
999            /* alternation */
1000            /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
1001            TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
1002            LASTMARK_SAVE();
1003            ctx->u.rep = state->repeat;
1004            if (ctx->u.rep)
1005                MARK_PUSH(ctx->lastmark);
1006            for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
1007                if (ctx->pattern[1] == SRE_OP_LITERAL &&
1008                    (ctx->ptr >= end ||
1009                     (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
1010                    continue;
1011                if (ctx->pattern[1] == SRE_OP_IN &&
1012                    (ctx->ptr >= end ||
1013                     !SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
1014                    continue;
1015                state->ptr = ctx->ptr;
1016                DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
1017                if (ret) {
1018                    if (ctx->u.rep)
1019                        MARK_POP_DISCARD(ctx->lastmark);
1020                    RETURN_ON_ERROR(ret);
1021                    RETURN_SUCCESS;
1022                }
1023                if (ctx->u.rep)
1024                    MARK_POP_KEEP(ctx->lastmark);
1025                LASTMARK_RESTORE();
1026            }
1027            if (ctx->u.rep)
1028                MARK_POP_DISCARD(ctx->lastmark);
1029            RETURN_FAILURE;
1030
1031        case SRE_OP_REPEAT_ONE:
1032            /* match repeated sequence (maximizing regexp) */
1033
1034            /* this operator only works if the repeated item is
1035               exactly one character wide, and we're not already
1036               collecting backtracking points.  for other cases,
1037               use the MAX_REPEAT operator */
1038
1039            /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1040
1041            TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1042                   ctx->pattern[1], ctx->pattern[2]));
1043
1044            if (ctx->ptr + ctx->pattern[1] > end)
1045                RETURN_FAILURE; /* cannot match */
1046
1047            state->ptr = ctx->ptr;
1048
1049            ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2]);
1050            RETURN_ON_ERROR(ret);
1051            DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1052            ctx->count = ret;
1053            ctx->ptr += ctx->count;
1054
1055            /* when we arrive here, count contains the number of
1056               matches, and ctx->ptr points to the tail of the target
1057               string.  check if the rest of the pattern matches,
1058               and backtrack if not. */
1059
1060            if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1061                RETURN_FAILURE;
1062
1063            if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1064                /* tail is empty.  we're finished */
1065                state->ptr = ctx->ptr;
1066                RETURN_SUCCESS;
1067            }
1068
1069            LASTMARK_SAVE();
1070
1071            if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
1072                /* tail starts with a literal. skip positions where
1073                   the rest of the pattern cannot possibly match */
1074                ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
1075                for (;;) {
1076                    while (ctx->count >= (Py_ssize_t) ctx->pattern[1] &&
1077                           (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
1078                        ctx->ptr--;
1079                        ctx->count--;
1080                    }
1081                    if (ctx->count < (Py_ssize_t) ctx->pattern[1])
1082                        break;
1083                    state->ptr = ctx->ptr;
1084                    DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
1085                            ctx->pattern+ctx->pattern[0]);
1086                    if (ret) {
1087                        RETURN_ON_ERROR(ret);
1088                        RETURN_SUCCESS;
1089                    }
1090
1091                    LASTMARK_RESTORE();
1092
1093                    ctx->ptr--;
1094                    ctx->count--;
1095                }
1096
1097            } else {
1098                /* general case */
1099                while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
1100                    state->ptr = ctx->ptr;
1101                    DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
1102                            ctx->pattern+ctx->pattern[0]);
1103                    if (ret) {
1104                        RETURN_ON_ERROR(ret);
1105                        RETURN_SUCCESS;
1106                    }
1107                    ctx->ptr--;
1108                    ctx->count--;
1109                    LASTMARK_RESTORE();
1110                }
1111            }
1112            RETURN_FAILURE;
1113
1114        case SRE_OP_MIN_REPEAT_ONE:
1115            /* match repeated sequence (minimizing regexp) */
1116
1117            /* this operator only works if the repeated item is
1118               exactly one character wide, and we're not already
1119               collecting backtracking points.  for other cases,
1120               use the MIN_REPEAT operator */
1121
1122            /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
1123
1124            TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
1125                   ctx->pattern[1], ctx->pattern[2]));
1126
1127            if (ctx->ptr + ctx->pattern[1] > end)
1128                RETURN_FAILURE; /* cannot match */
1129
1130            state->ptr = ctx->ptr;
1131
1132            if (ctx->pattern[1] == 0)
1133                ctx->count = 0;
1134            else {
1135                /* count using pattern min as the maximum */
1136                ret = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[1]);
1137                RETURN_ON_ERROR(ret);
1138                DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1139                if (ret < (Py_ssize_t) ctx->pattern[1])
1140                    /* didn't match minimum number of times */
1141                    RETURN_FAILURE;
1142                /* advance past minimum matches of repeat */
1143                ctx->count = ret;
1144                ctx->ptr += ctx->count;
1145            }
1146
1147            if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
1148                /* tail is empty.  we're finished */
1149                state->ptr = ctx->ptr;
1150                RETURN_SUCCESS;
1151
1152            } else {
1153                /* general case */
1154                LASTMARK_SAVE();
1155                while ((Py_ssize_t)ctx->pattern[2] == 65535
1156                       || ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
1157                    state->ptr = ctx->ptr;
1158                    DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
1159                            ctx->pattern+ctx->pattern[0]);
1160                    if (ret) {
1161                        RETURN_ON_ERROR(ret);
1162                        RETURN_SUCCESS;
1163                    }
1164                    state->ptr = ctx->ptr;
1165                    ret = SRE_COUNT(state, ctx->pattern+3, 1);
1166                    RETURN_ON_ERROR(ret);
1167                    DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1168                    if (ret == 0)
1169                        break;
1170                    assert(ret == 1);
1171                    ctx->ptr++;
1172                    ctx->count++;
1173                    LASTMARK_RESTORE();
1174                }
1175            }
1176            RETURN_FAILURE;
1177
1178        case SRE_OP_REPEAT:
1179            /* create repeat context.  all the hard work is done
1180               by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
1181            /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
1182            TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
1183                   ctx->pattern[1], ctx->pattern[2]));
1184
1185            /* install new repeat context */
1186            ctx->u.rep = (SRE_REPEAT*) PyObject_MALLOC(sizeof(*ctx->u.rep));
1187            if (!ctx->u.rep) {
1188                PyErr_NoMemory();
1189                RETURN_FAILURE;
1190            }
1191            ctx->u.rep->count = -1;
1192            ctx->u.rep->pattern = ctx->pattern;
1193            ctx->u.rep->prev = state->repeat;
1194            ctx->u.rep->last_ptr = NULL;
1195            state->repeat = ctx->u.rep;
1196
1197            state->ptr = ctx->ptr;
1198            DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
1199            state->repeat = ctx->u.rep->prev;
1200            PyObject_FREE(ctx->u.rep);
1201
1202            if (ret) {
1203                RETURN_ON_ERROR(ret);
1204                RETURN_SUCCESS;
1205            }
1206            RETURN_FAILURE;
1207
1208        case SRE_OP_MAX_UNTIL:
1209            /* maximizing repeat */
1210            /* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
1211
1212            /* FIXME: we probably need to deal with zero-width
1213               matches in here... */
1214
1215            ctx->u.rep = state->repeat;
1216            if (!ctx->u.rep)
1217                RETURN_ERROR(SRE_ERROR_STATE);
1218
1219            state->ptr = ctx->ptr;
1220
1221            ctx->count = ctx->u.rep->count+1;
1222
1223            TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
1224                   ctx->ptr, ctx->count));
1225
1226            if (ctx->count < ctx->u.rep->pattern[1]) {
1227                /* not enough matches */
1228                ctx->u.rep->count = ctx->count;
1229                DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
1230                        ctx->u.rep->pattern+3);
1231                if (ret) {
1232                    RETURN_ON_ERROR(ret);
1233                    RETURN_SUCCESS;
1234                }
1235                ctx->u.rep->count = ctx->count-1;
1236                state->ptr = ctx->ptr;
1237                RETURN_FAILURE;
1238            }
1239
1240            if ((ctx->count < ctx->u.rep->pattern[2] ||
1241                ctx->u.rep->pattern[2] == 65535) &&
1242                state->ptr != ctx->u.rep->last_ptr) {
1243                /* we may have enough matches, but if we can
1244                   match another item, do so */
1245                ctx->u.rep->count = ctx->count;
1246                LASTMARK_SAVE();
1247                MARK_PUSH(ctx->lastmark);
1248                /* zero-width match protection */
1249                DATA_PUSH(&ctx->u.rep->last_ptr);
1250                ctx->u.rep->last_ptr = state->ptr;
1251                DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
1252                        ctx->u.rep->pattern+3);
1253                DATA_POP(&ctx->u.rep->last_ptr);
1254                if (ret) {
1255                    MARK_POP_DISCARD(ctx->lastmark);
1256                    RETURN_ON_ERROR(ret);
1257                    RETURN_SUCCESS;
1258                }
1259                MARK_POP(ctx->lastmark);
1260                LASTMARK_RESTORE();
1261                ctx->u.rep->count = ctx->count-1;
1262                state->ptr = ctx->ptr;
1263            }
1264
1265            /* cannot match more repeated items here.  make sure the
1266               tail matches */
1267            state->repeat = ctx->u.rep->prev;
1268            DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
1269            RETURN_ON_SUCCESS(ret);
1270            state->repeat = ctx->u.rep;
1271            state->ptr = ctx->ptr;
1272            RETURN_FAILURE;
1273
1274        case SRE_OP_MIN_UNTIL:
1275            /* minimizing repeat */
1276            /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
1277
1278            ctx->u.rep = state->repeat;
1279            if (!ctx->u.rep)
1280                RETURN_ERROR(SRE_ERROR_STATE);
1281
1282            state->ptr = ctx->ptr;
1283
1284            ctx->count = ctx->u.rep->count+1;
1285
1286            TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
1287                   ctx->ptr, ctx->count, ctx->u.rep->pattern));
1288
1289            if (ctx->count < ctx->u.rep->pattern[1]) {
1290                /* not enough matches */
1291                ctx->u.rep->count = ctx->count;
1292                DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
1293                        ctx->u.rep->pattern+3);
1294                if (ret) {
1295                    RETURN_ON_ERROR(ret);
1296                    RETURN_SUCCESS;
1297                }
1298                ctx->u.rep->count = ctx->count-1;
1299                state->ptr = ctx->ptr;
1300                RETURN_FAILURE;
1301            }
1302
1303            LASTMARK_SAVE();
1304
1305            /* see if the tail matches */
1306            state->repeat = ctx->u.rep->prev;
1307            DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
1308            if (ret) {
1309                RETURN_ON_ERROR(ret);
1310                RETURN_SUCCESS;
1311            }
1312
1313            state->repeat = ctx->u.rep;
1314            state->ptr = ctx->ptr;
1315
1316            LASTMARK_RESTORE();
1317
1318            if (ctx->count >= ctx->u.rep->pattern[2]
1319                && ctx->u.rep->pattern[2] != 65535)
1320                RETURN_FAILURE;
1321
1322            ctx->u.rep->count = ctx->count;
1323            DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
1324                    ctx->u.rep->pattern+3);
1325            if (ret) {
1326                RETURN_ON_ERROR(ret);
1327                RETURN_SUCCESS;
1328            }
1329            ctx->u.rep->count = ctx->count-1;
1330            state->ptr = ctx->ptr;
1331            RETURN_FAILURE;
1332
1333        case SRE_OP_GROUPREF:
1334            /* match backreference */
1335            TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
1336                   ctx->ptr, ctx->pattern[0]));
1337            i = ctx->pattern[0];
1338            {
1339                Py_ssize_t groupref = i+i;
1340                if (groupref >= state->lastmark) {
1341                    RETURN_FAILURE;
1342                } else {
1343                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1344                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1345                    if (!p || !e || e < p)
1346                        RETURN_FAILURE;
1347                    while (p < e) {
1348                        if (ctx->ptr >= end || *ctx->ptr != *p)
1349                            RETURN_FAILURE;
1350                        p++; ctx->ptr++;
1351                    }
1352                }
1353            }
1354            ctx->pattern++;
1355            break;
1356
1357        case SRE_OP_GROUPREF_IGNORE:
1358            /* match backreference */
1359            TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
1360                   ctx->ptr, ctx->pattern[0]));
1361            i = ctx->pattern[0];
1362            {
1363                Py_ssize_t groupref = i+i;
1364                if (groupref >= state->lastmark) {
1365                    RETURN_FAILURE;
1366                } else {
1367                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1368                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1369                    if (!p || !e || e < p)
1370                        RETURN_FAILURE;
1371                    while (p < e) {
1372                        if (ctx->ptr >= end ||
1373                            state->lower(*ctx->ptr) != state->lower(*p))
1374                            RETURN_FAILURE;
1375                        p++; ctx->ptr++;
1376                    }
1377                }
1378            }
1379            ctx->pattern++;
1380            break;
1381
1382        case SRE_OP_GROUPREF_EXISTS:
1383            TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
1384                   ctx->ptr, ctx->pattern[0]));
1385            /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
1386            i = ctx->pattern[0];
1387            {
1388                Py_ssize_t groupref = i+i;
1389                if (groupref >= state->lastmark) {
1390                    ctx->pattern += ctx->pattern[1];
1391                    break;
1392                } else {
1393                    SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
1394                    SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
1395                    if (!p || !e || e < p) {
1396                        ctx->pattern += ctx->pattern[1];
1397                        break;
1398                    }
1399                }
1400            }
1401            ctx->pattern += 2;
1402            break;
1403
1404        case SRE_OP_ASSERT:
1405            /* assert subpattern */
1406            /* <ASSERT> <skip> <back> <pattern> */
1407            TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
1408                   ctx->ptr, ctx->pattern[1]));
1409            state->ptr = ctx->ptr - ctx->pattern[1];
1410            if (state->ptr < state->beginning)
1411                RETURN_FAILURE;
1412            DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
1413            RETURN_ON_FAILURE(ret);
1414            ctx->pattern += ctx->pattern[0];
1415            break;
1416
1417        case SRE_OP_ASSERT_NOT:
1418            /* assert not subpattern */
1419            /* <ASSERT_NOT> <skip> <back> <pattern> */
1420            TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
1421                   ctx->ptr, ctx->pattern[1]));
1422            state->ptr = ctx->ptr - ctx->pattern[1];
1423            if (state->ptr >= state->beginning) {
1424                DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
1425                if (ret) {
1426                    RETURN_ON_ERROR(ret);
1427                    RETURN_FAILURE;
1428                }
1429            }
1430            ctx->pattern += ctx->pattern[0];
1431            break;
1432
1433        case SRE_OP_FAILURE:
1434            /* immediate failure */
1435            TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
1436            RETURN_FAILURE;
1437
1438        default:
1439            TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
1440                   ctx->pattern[-1]));
1441            RETURN_ERROR(SRE_ERROR_ILLEGAL);
1442        }
1443    }
1444
1445exit:
1446    ctx_pos = ctx->last_ctx_pos;
1447    jump = ctx->jump;
1448    DATA_POP_DISCARD(ctx);
1449    if (ctx_pos == -1)
1450        return ret;
1451    DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
1452
1453    switch (jump) {
1454        case JUMP_MAX_UNTIL_2:
1455            TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
1456            goto jump_max_until_2;
1457        case JUMP_MAX_UNTIL_3:
1458            TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
1459            goto jump_max_until_3;
1460        case JUMP_MIN_UNTIL_2:
1461            TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
1462            goto jump_min_until_2;
1463        case JUMP_MIN_UNTIL_3:
1464            TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
1465            goto jump_min_until_3;
1466        case JUMP_BRANCH:
1467            TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
1468            goto jump_branch;
1469        case JUMP_MAX_UNTIL_1:
1470            TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
1471            goto jump_max_until_1;
1472        case JUMP_MIN_UNTIL_1:
1473            TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
1474            goto jump_min_until_1;
1475        case JUMP_REPEAT:
1476            TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
1477            goto jump_repeat;
1478        case JUMP_REPEAT_ONE_1:
1479            TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
1480            goto jump_repeat_one_1;
1481        case JUMP_REPEAT_ONE_2:
1482            TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
1483            goto jump_repeat_one_2;
1484        case JUMP_MIN_REPEAT_ONE:
1485            TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
1486            goto jump_min_repeat_one;
1487        case JUMP_ASSERT:
1488            TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
1489            goto jump_assert;
1490        case JUMP_ASSERT_NOT:
1491            TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
1492            goto jump_assert_not;
1493        case JUMP_NONE:
1494            TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
1495            break;
1496    }
1497
1498    return ret; /* should never get here */
1499}
1500
1501LOCAL(Py_ssize_t)
1502SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
1503{
1504    SRE_CHAR* ptr = (SRE_CHAR *)state->start;
1505    SRE_CHAR* end = (SRE_CHAR *)state->end;
1506    Py_ssize_t status = 0;
1507    Py_ssize_t prefix_len = 0;
1508    Py_ssize_t prefix_skip = 0;
1509    SRE_CODE* prefix = NULL;
1510    SRE_CODE* charset = NULL;
1511    SRE_CODE* overlap = NULL;
1512    int flags = 0;
1513
1514    if (pattern[0] == SRE_OP_INFO) {
1515        /* optimization info block */
1516        /* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info>  */
1517
1518        flags = pattern[2];
1519
1520        if (pattern[3] > 1) {
1521            /* adjust end point (but make sure we leave at least one
1522               character in there, so literal search will work) */
1523            end -= pattern[3]-1;
1524            if (end <= ptr)
1525                end = ptr+1;
1526        }
1527
1528        if (flags & SRE_INFO_PREFIX) {
1529            /* pattern starts with a known prefix */
1530            /* <length> <skip> <prefix data> <overlap data> */
1531            prefix_len = pattern[5];
1532            prefix_skip = pattern[6];
1533            prefix = pattern + 7;
1534            overlap = prefix + prefix_len - 1;
1535        } else if (flags & SRE_INFO_CHARSET)
1536            /* pattern starts with a character from a known set */
1537            /* <charset> */
1538            charset = pattern + 5;
1539
1540        pattern += 1 + pattern[1];
1541    }
1542
1543    TRACE(("prefix = %p %d %d\n", prefix, prefix_len, prefix_skip));
1544    TRACE(("charset = %p\n", charset));
1545
1546#if defined(USE_FAST_SEARCH)
1547    if (prefix_len > 1) {
1548        /* pattern starts with a known prefix.  use the overlap
1549           table to skip forward as fast as we possibly can */
1550        Py_ssize_t i = 0;
1551        end = (SRE_CHAR *)state->end;
1552        while (ptr < end) {
1553            for (;;) {
1554                if ((SRE_CODE) ptr[0] != prefix[i]) {
1555                    if (!i)
1556                        break;
1557                    else
1558                        i = overlap[i];
1559                } else {
1560                    if (++i == prefix_len) {
1561                        /* found a potential match */
1562                        TRACE(("|%p|%p|SEARCH SCAN\n", pattern, ptr));
1563                        state->start = ptr + 1 - prefix_len;
1564                        state->ptr = ptr + 1 - prefix_len + prefix_skip;
1565                        if (flags & SRE_INFO_LITERAL)
1566                            return 1; /* we got all of it */
1567                        status = SRE_MATCH(state, pattern + 2*prefix_skip);
1568                        if (status != 0)
1569                            return status;
1570                        /* close but no cigar -- try again */
1571                        i = overlap[i];
1572                    }
1573                    break;
1574                }
1575            }
1576            ptr++;
1577        }
1578        return 0;
1579    }
1580#endif
1581
1582    if (pattern[0] == SRE_OP_LITERAL) {
1583        /* pattern starts with a literal character.  this is used
1584           for short prefixes, and if fast search is disabled */
1585        SRE_CODE chr = pattern[1];
1586        end = (SRE_CHAR *)state->end;
1587        for (;;) {
1588            while (ptr < end && (SRE_CODE) ptr[0] != chr)
1589                ptr++;
1590            if (ptr >= end)
1591                return 0;
1592            TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
1593            state->start = ptr;
1594            state->ptr = ++ptr;
1595            if (flags & SRE_INFO_LITERAL)
1596                return 1; /* we got all of it */
1597            status = SRE_MATCH(state, pattern + 2);
1598            if (status != 0)
1599                break;
1600        }
1601    } else if (charset) {
1602        /* pattern starts with a character from a known set */
1603        end = (SRE_CHAR *)state->end;
1604        for (;;) {
1605            while (ptr < end && !SRE_CHARSET(charset, ptr[0]))
1606                ptr++;
1607            if (ptr >= end)
1608                return 0;
1609            TRACE(("|%p|%p|SEARCH CHARSET\n", pattern, ptr));
1610            state->start = ptr;
1611            state->ptr = ptr;
1612            status = SRE_MATCH(state, pattern);
1613            if (status != 0)
1614                break;
1615            ptr++;
1616        }
1617    } else
1618        /* general case */
1619        while (ptr <= end) {
1620            TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
1621            state->start = state->ptr = ptr++;
1622            status = SRE_MATCH(state, pattern);
1623            if (status != 0)
1624                break;
1625        }
1626
1627    return status;
1628}
1629
1630LOCAL(int)
1631SRE_LITERAL_TEMPLATE(SRE_CHAR* ptr, Py_ssize_t len)
1632{
1633    /* check if given string is a literal template (i.e. no escapes) */
1634    while (len-- > 0)
1635        if (*ptr++ == '\\')
1636            return 0;
1637    return 1;
1638}
1639
1640#if !defined(SRE_RECURSIVE)
1641
1642/* -------------------------------------------------------------------- */
1643/* factories and destructors */
1644
1645/* see sre.h for object declarations */
1646static PyObject*pattern_new_match(PatternObject*, SRE_STATE*, int);
1647static PyObject*pattern_scanner(PatternObject*, PyObject*);
1648
1649static PyObject *
1650sre_codesize(PyObject* self, PyObject *unused)
1651{
1652    return Py_BuildValue("l", sizeof(SRE_CODE));
1653}
1654
1655static PyObject *
1656sre_getlower(PyObject* self, PyObject* args)
1657{
1658    int character, flags;
1659    if (!PyArg_ParseTuple(args, "ii", &character, &flags))
1660        return NULL;
1661    if (flags & SRE_FLAG_LOCALE)
1662        return Py_BuildValue("i", sre_lower_locale(character));
1663    if (flags & SRE_FLAG_UNICODE)
1664#if defined(HAVE_UNICODE)
1665        return Py_BuildValue("i", sre_lower_unicode(character));
1666#else
1667        return Py_BuildValue("i", sre_lower_locale(character));
1668#endif
1669    return Py_BuildValue("i", sre_lower(character));
1670}
1671
1672LOCAL(void)
1673state_reset(SRE_STATE* state)
1674{
1675    /* FIXME: dynamic! */
1676    /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
1677
1678    state->lastmark = -1;
1679    state->lastindex = -1;
1680
1681    state->repeat = NULL;
1682
1683    data_stack_dealloc(state);
1684}
1685
1686static void*
1687getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
1688{
1689    /* given a python object, return a data pointer, a length (in
1690       characters), and a character size.  return NULL if the object
1691       is not a string (or not compatible) */
1692
1693    PyBufferProcs *buffer;
1694    Py_ssize_t size, bytes;
1695    int charsize;
1696    void* ptr;
1697
1698#if defined(HAVE_UNICODE)
1699    if (PyUnicode_Check(string)) {
1700        /* unicode strings doesn't always support the buffer interface */
1701        ptr = (void*) PyUnicode_AS_DATA(string);
1702        /* bytes = PyUnicode_GET_DATA_SIZE(string); */
1703        size = PyUnicode_GET_SIZE(string);
1704        charsize = sizeof(Py_UNICODE);
1705
1706    } else {
1707#endif
1708
1709    /* get pointer to string buffer */
1710    buffer = Py_TYPE(string)->tp_as_buffer;
1711    if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
1712        buffer->bf_getsegcount(string, NULL) != 1) {
1713        PyErr_SetString(PyExc_TypeError, "expected string or buffer");
1714        return NULL;
1715    }
1716
1717    /* determine buffer size */
1718    bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
1719    if (bytes < 0) {
1720        PyErr_SetString(PyExc_TypeError, "buffer has negative size");
1721        return NULL;
1722    }
1723
1724    /* determine character size */
1725#if PY_VERSION_HEX >= 0x01060000
1726    size = PyObject_Size(string);
1727#else
1728    size = PyObject_Length(string);
1729#endif
1730
1731    if (PyString_Check(string) || bytes == size)
1732        charsize = 1;
1733#if defined(HAVE_UNICODE)
1734    else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
1735        charsize = sizeof(Py_UNICODE);
1736#endif
1737    else {
1738        PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
1739        return NULL;
1740    }
1741
1742#if defined(HAVE_UNICODE)
1743    }
1744#endif
1745
1746    *p_length = size;
1747    *p_charsize = charsize;
1748
1749    return ptr;
1750}
1751
1752LOCAL(PyObject*)
1753state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
1754           Py_ssize_t start, Py_ssize_t end)
1755{
1756    /* prepare state object */
1757
1758    Py_ssize_t length;
1759    int charsize;
1760    void* ptr;
1761
1762    memset(state, 0, sizeof(SRE_STATE));
1763
1764    state->lastmark = -1;
1765    state->lastindex = -1;
1766
1767    ptr = getstring(string, &length, &charsize);
1768    if (!ptr)
1769        return NULL;
1770
1771    /* adjust boundaries */
1772    if (start < 0)
1773        start = 0;
1774    else if (start > length)
1775        start = length;
1776
1777    if (end < 0)
1778        end = 0;
1779    else if (end > length)
1780        end = length;
1781
1782    state->charsize = charsize;
1783
1784    state->beginning = ptr;
1785
1786    state->start = (void*) ((char*) ptr + start * state->charsize);
1787    state->end = (void*) ((char*) ptr + end * state->charsize);
1788
1789    Py_INCREF(string);
1790    state->string = string;
1791    state->pos = start;
1792    state->endpos = end;
1793
1794    if (pattern->flags & SRE_FLAG_LOCALE)
1795        state->lower = sre_lower_locale;
1796    else if (pattern->flags & SRE_FLAG_UNICODE)
1797#if defined(HAVE_UNICODE)
1798        state->lower = sre_lower_unicode;
1799#else
1800        state->lower = sre_lower_locale;
1801#endif
1802    else
1803        state->lower = sre_lower;
1804
1805    return string;
1806}
1807
1808LOCAL(void)
1809state_fini(SRE_STATE* state)
1810{
1811    Py_XDECREF(state->string);
1812    data_stack_dealloc(state);
1813}
1814
1815/* calculate offset from start of string */
1816#define STATE_OFFSET(state, member)\
1817    (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
1818
1819LOCAL(PyObject*)
1820state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
1821{
1822    Py_ssize_t i, j;
1823
1824    index = (index - 1) * 2;
1825
1826    if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
1827        if (empty)
1828            /* want empty string */
1829            i = j = 0;
1830        else {
1831            Py_INCREF(Py_None);
1832            return Py_None;
1833        }
1834    } else {
1835        i = STATE_OFFSET(state, state->mark[index]);
1836        j = STATE_OFFSET(state, state->mark[index+1]);
1837    }
1838
1839    return PySequence_GetSlice(string, i, j);
1840}
1841
1842static void
1843pattern_error(int status)
1844{
1845    switch (status) {
1846    case SRE_ERROR_RECURSION_LIMIT:
1847        PyErr_SetString(
1848            PyExc_RuntimeError,
1849            "maximum recursion limit exceeded"
1850            );
1851        break;
1852    case SRE_ERROR_MEMORY:
1853        PyErr_NoMemory();
1854        break;
1855    case SRE_ERROR_INTERRUPTED:
1856    /* An exception has already been raised, so let it fly */
1857        break;
1858    default:
1859        /* other error codes indicate compiler/engine bugs */
1860        PyErr_SetString(
1861            PyExc_RuntimeError,
1862            "internal error in regular expression engine"
1863            );
1864    }
1865}
1866
1867static void
1868pattern_dealloc(PatternObject* self)
1869{
1870    if (self->weakreflist != NULL)
1871        PyObject_ClearWeakRefs((PyObject *) self);
1872    Py_XDECREF(self->pattern);
1873    Py_XDECREF(self->groupindex);
1874    Py_XDECREF(self->indexgroup);
1875    PyObject_DEL(self);
1876}
1877
1878static PyObject*
1879pattern_match(PatternObject* self, PyObject* args, PyObject* kw)
1880{
1881    SRE_STATE state;
1882    int status;
1883
1884    PyObject* string;
1885    Py_ssize_t start = 0;
1886    Py_ssize_t end = PY_SSIZE_T_MAX;
1887    static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1888    if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:match", kwlist,
1889                                     &string, &start, &end))
1890        return NULL;
1891
1892    string = state_init(&state, self, string, start, end);
1893    if (!string)
1894        return NULL;
1895
1896    state.ptr = state.start;
1897
1898    TRACE(("|%p|%p|MATCH\n", PatternObject_GetCode(self), state.ptr));
1899
1900    if (state.charsize == 1) {
1901        status = sre_match(&state, PatternObject_GetCode(self));
1902    } else {
1903#if defined(HAVE_UNICODE)
1904        status = sre_umatch(&state, PatternObject_GetCode(self));
1905#endif
1906    }
1907
1908    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1909    if (PyErr_Occurred())
1910        return NULL;
1911
1912    state_fini(&state);
1913
1914    return pattern_new_match(self, &state, status);
1915}
1916
1917static PyObject*
1918pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
1919{
1920    SRE_STATE state;
1921    int status;
1922
1923    PyObject* string;
1924    Py_ssize_t start = 0;
1925    Py_ssize_t end = PY_SSIZE_T_MAX;
1926    static char* kwlist[] = { "pattern", "pos", "endpos", NULL };
1927    if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:search", kwlist,
1928                                     &string, &start, &end))
1929        return NULL;
1930
1931    string = state_init(&state, self, string, start, end);
1932    if (!string)
1933        return NULL;
1934
1935    TRACE(("|%p|%p|SEARCH\n", PatternObject_GetCode(self), state.ptr));
1936
1937    if (state.charsize == 1) {
1938        status = sre_search(&state, PatternObject_GetCode(self));
1939    } else {
1940#if defined(HAVE_UNICODE)
1941        status = sre_usearch(&state, PatternObject_GetCode(self));
1942#endif
1943    }
1944
1945    TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
1946
1947    state_fini(&state);
1948
1949    if (PyErr_Occurred())
1950        return NULL;
1951
1952    return pattern_new_match(self, &state, status);
1953}
1954
1955static PyObject*
1956call(char* module, char* function, PyObject* args)
1957{
1958    PyObject* name;
1959    PyObject* mod;
1960    PyObject* func;
1961    PyObject* result;
1962
1963    if (!args)
1964        return NULL;
1965    name = PyString_FromString(module);
1966    if (!name)
1967        return NULL;
1968    mod = PyImport_Import(name);
1969    Py_DECREF(name);
1970    if (!mod)
1971        return NULL;
1972    func = PyObject_GetAttrString(mod, function);
1973    Py_DECREF(mod);
1974    if (!func)
1975        return NULL;
1976    result = PyObject_CallObject(func, args);
1977    Py_DECREF(func);
1978    Py_DECREF(args);
1979    return result;
1980}
1981
1982#ifdef USE_BUILTIN_COPY
1983static int
1984deepcopy(PyObject** object, PyObject* memo)
1985{
1986    PyObject* copy;
1987
1988    copy = call(
1989        "copy", "deepcopy",
1990        PyTuple_Pack(2, *object, memo)
1991        );
1992    if (!copy)
1993        return 0;
1994
1995    Py_DECREF(*object);
1996    *object = copy;
1997
1998    return 1; /* success */
1999}
2000#endif
2001
2002static PyObject*
2003join_list(PyObject* list, PyObject* string)
2004{
2005    /* join list elements */
2006
2007    PyObject* joiner;
2008#if PY_VERSION_HEX >= 0x01060000
2009    PyObject* function;
2010    PyObject* args;
2011#endif
2012    PyObject* result;
2013
2014    joiner = PySequence_GetSlice(string, 0, 0);
2015    if (!joiner)
2016        return NULL;
2017
2018    if (PyList_GET_SIZE(list) == 0) {
2019        Py_DECREF(list);
2020        return joiner;
2021    }
2022
2023#if PY_VERSION_HEX >= 0x01060000
2024    function = PyObject_GetAttrString(joiner, "join");
2025    if (!function) {
2026        Py_DECREF(joiner);
2027        return NULL;
2028    }
2029    args = PyTuple_New(1);
2030    if (!args) {
2031        Py_DECREF(function);
2032        Py_DECREF(joiner);
2033        return NULL;
2034    }
2035    PyTuple_SET_ITEM(args, 0, list);
2036    result = PyObject_CallObject(function, args);
2037    Py_DECREF(args); /* also removes list */
2038    Py_DECREF(function);
2039#else
2040    result = call(
2041        "string", "join",
2042        PyTuple_Pack(2, list, joiner)
2043        );
2044#endif
2045    Py_DECREF(joiner);
2046
2047    return result;
2048}
2049
2050static PyObject*
2051pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
2052{
2053    SRE_STATE state;
2054    PyObject* list;
2055    int status;
2056    Py_ssize_t i, b, e;
2057
2058    PyObject* string;
2059    Py_ssize_t start = 0;
2060    Py_ssize_t end = PY_SSIZE_T_MAX;
2061    static char* kwlist[] = { "source", "pos", "endpos", NULL };
2062    if (!PyArg_ParseTupleAndKeywords(args, kw, "O|nn:findall", kwlist,
2063                                     &string, &start, &end))
2064        return NULL;
2065
2066    string = state_init(&state, self, string, start, end);
2067    if (!string)
2068        return NULL;
2069
2070    list = PyList_New(0);
2071    if (!list) {
2072        state_fini(&state);
2073        return NULL;
2074    }
2075
2076    while (state.start <= state.end) {
2077
2078        PyObject* item;
2079
2080        state_reset(&state);
2081
2082        state.ptr = state.start;
2083
2084        if (state.charsize == 1) {
2085            status = sre_search(&state, PatternObject_GetCode(self));
2086        } else {
2087#if defined(HAVE_UNICODE)
2088            status = sre_usearch(&state, PatternObject_GetCode(self));
2089#endif
2090        }
2091
2092        if (PyErr_Occurred())
2093            goto error;
2094
2095        if (status <= 0) {
2096            if (status == 0)
2097                break;
2098            pattern_error(status);
2099            goto error;
2100        }
2101
2102        /* don't bother to build a match object */
2103        switch (self->groups) {
2104        case 0:
2105            b = STATE_OFFSET(&state, state.start);
2106            e = STATE_OFFSET(&state, state.ptr);
2107            item = PySequence_GetSlice(string, b, e);
2108            if (!item)
2109                goto error;
2110            break;
2111        case 1:
2112            item = state_getslice(&state, 1, string, 1);
2113            if (!item)
2114                goto error;
2115            break;
2116        default:
2117            item = PyTuple_New(self->groups);
2118            if (!item)
2119                goto error;
2120            for (i = 0; i < self->groups; i++) {
2121                PyObject* o = state_getslice(&state, i+1, string, 1);
2122                if (!o) {
2123                    Py_DECREF(item);
2124                    goto error;
2125                }
2126                PyTuple_SET_ITEM(item, i, o);
2127            }
2128            break;
2129        }
2130
2131        status = PyList_Append(list, item);
2132        Py_DECREF(item);
2133        if (status < 0)
2134            goto error;
2135
2136        if (state.ptr == state.start)
2137            state.start = (void*) ((char*) state.ptr + state.charsize);
2138        else
2139            state.start = state.ptr;
2140
2141    }
2142
2143    state_fini(&state);
2144    return list;
2145
2146error:
2147    Py_DECREF(list);
2148    state_fini(&state);
2149    return NULL;
2150
2151}
2152
2153#if PY_VERSION_HEX >= 0x02020000
2154static PyObject*
2155pattern_finditer(PatternObject* pattern, PyObject* args)
2156{
2157    PyObject* scanner;
2158    PyObject* search;
2159    PyObject* iterator;
2160
2161    scanner = pattern_scanner(pattern, args);
2162    if (!scanner)
2163        return NULL;
2164
2165    search = PyObject_GetAttrString(scanner, "search");
2166    Py_DECREF(scanner);
2167    if (!search)
2168        return NULL;
2169
2170    iterator = PyCallIter_New(search, Py_None);
2171    Py_DECREF(search);
2172
2173    return iterator;
2174}
2175#endif
2176
2177static PyObject*
2178pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
2179{
2180    SRE_STATE state;
2181    PyObject* list;
2182    PyObject* item;
2183    int status;
2184    Py_ssize_t n;
2185    Py_ssize_t i;
2186    void* last;
2187
2188    PyObject* string;
2189    Py_ssize_t maxsplit = 0;
2190    static char* kwlist[] = { "source", "maxsplit", NULL };
2191    if (!PyArg_ParseTupleAndKeywords(args, kw, "O|n:split", kwlist,
2192                                     &string, &maxsplit))
2193        return NULL;
2194
2195    string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2196    if (!string)
2197        return NULL;
2198
2199    list = PyList_New(0);
2200    if (!list) {
2201        state_fini(&state);
2202        return NULL;
2203    }
2204
2205    n = 0;
2206    last = state.start;
2207
2208    while (!maxsplit || n < maxsplit) {
2209
2210        state_reset(&state);
2211
2212        state.ptr = state.start;
2213
2214        if (state.charsize == 1) {
2215            status = sre_search(&state, PatternObject_GetCode(self));
2216        } else {
2217#if defined(HAVE_UNICODE)
2218            status = sre_usearch(&state, PatternObject_GetCode(self));
2219#endif
2220        }
2221
2222        if (PyErr_Occurred())
2223            goto error;
2224
2225        if (status <= 0) {
2226            if (status == 0)
2227                break;
2228            pattern_error(status);
2229            goto error;
2230        }
2231
2232        if (state.start == state.ptr) {
2233            if (last == state.end)
2234                break;
2235            /* skip one character */
2236            state.start = (void*) ((char*) state.ptr + state.charsize);
2237            continue;
2238        }
2239
2240        /* get segment before this match */
2241        item = PySequence_GetSlice(
2242            string, STATE_OFFSET(&state, last),
2243            STATE_OFFSET(&state, state.start)
2244            );
2245        if (!item)
2246            goto error;
2247        status = PyList_Append(list, item);
2248        Py_DECREF(item);
2249        if (status < 0)
2250            goto error;
2251
2252        /* add groups (if any) */
2253        for (i = 0; i < self->groups; i++) {
2254            item = state_getslice(&state, i+1, string, 0);
2255            if (!item)
2256                goto error;
2257            status = PyList_Append(list, item);
2258            Py_DECREF(item);
2259            if (status < 0)
2260                goto error;
2261        }
2262
2263        n = n + 1;
2264
2265        last = state.start = state.ptr;
2266
2267    }
2268
2269    /* get segment following last match (even if empty) */
2270    item = PySequence_GetSlice(
2271        string, STATE_OFFSET(&state, last), state.endpos
2272        );
2273    if (!item)
2274        goto error;
2275    status = PyList_Append(list, item);
2276    Py_DECREF(item);
2277    if (status < 0)
2278        goto error;
2279
2280    state_fini(&state);
2281    return list;
2282
2283error:
2284    Py_DECREF(list);
2285    state_fini(&state);
2286    return NULL;
2287
2288}
2289
2290static PyObject*
2291pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
2292             Py_ssize_t count, Py_ssize_t subn)
2293{
2294    SRE_STATE state;
2295    PyObject* list;
2296    PyObject* item;
2297    PyObject* filter;
2298    PyObject* args;
2299    PyObject* match;
2300    void* ptr;
2301    int status;
2302    Py_ssize_t n;
2303    Py_ssize_t i, b, e;
2304    int bint;
2305    int filter_is_callable;
2306
2307    if (PyCallable_Check(ptemplate)) {
2308        /* sub/subn takes either a function or a template */
2309        filter = ptemplate;
2310        Py_INCREF(filter);
2311        filter_is_callable = 1;
2312    } else {
2313        /* if not callable, check if it's a literal string */
2314        int literal;
2315        ptr = getstring(ptemplate, &n, &bint);
2316        b = bint;
2317        if (ptr) {
2318            if (b == 1) {
2319                literal = sre_literal_template((unsigned char *)ptr, n);
2320            } else {
2321#if defined(HAVE_UNICODE)
2322                literal = sre_uliteral_template((Py_UNICODE *)ptr, n);
2323#endif
2324            }
2325        } else {
2326            PyErr_Clear();
2327            literal = 0;
2328        }
2329        if (literal) {
2330            filter = ptemplate;
2331            Py_INCREF(filter);
2332            filter_is_callable = 0;
2333        } else {
2334            /* not a literal; hand it over to the template compiler */
2335            filter = call(
2336                SRE_PY_MODULE, "_subx",
2337                PyTuple_Pack(2, self, ptemplate)
2338                );
2339            if (!filter)
2340                return NULL;
2341            filter_is_callable = PyCallable_Check(filter);
2342        }
2343    }
2344
2345    string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
2346    if (!string) {
2347        Py_DECREF(filter);
2348        return NULL;
2349    }
2350
2351    list = PyList_New(0);
2352    if (!list) {
2353        Py_DECREF(filter);
2354        state_fini(&state);
2355        return NULL;
2356    }
2357
2358    n = i = 0;
2359
2360    while (!count || n < count) {
2361
2362        state_reset(&state);
2363
2364        state.ptr = state.start;
2365
2366        if (state.charsize == 1) {
2367            status = sre_search(&state, PatternObject_GetCode(self));
2368        } else {
2369#if defined(HAVE_UNICODE)
2370            status = sre_usearch(&state, PatternObject_GetCode(self));
2371#endif
2372        }
2373
2374        if (PyErr_Occurred())
2375            goto error;
2376
2377        if (status <= 0) {
2378            if (status == 0)
2379                break;
2380            pattern_error(status);
2381            goto error;
2382        }
2383
2384        b = STATE_OFFSET(&state, state.start);
2385        e = STATE_OFFSET(&state, state.ptr);
2386
2387        if (i < b) {
2388            /* get segment before this match */
2389            item = PySequence_GetSlice(string, i, b);
2390            if (!item)
2391                goto error;
2392            status = PyList_Append(list, item);
2393            Py_DECREF(item);
2394            if (status < 0)
2395                goto error;
2396
2397        } else if (i == b && i == e && n > 0)
2398            /* ignore empty match on latest position */
2399            goto next;
2400
2401        if (filter_is_callable) {
2402            /* pass match object through filter */
2403            match = pattern_new_match(self, &state, 1);
2404            if (!match)
2405                goto error;
2406            args = PyTuple_Pack(1, match);
2407            if (!args) {
2408                Py_DECREF(match);
2409                goto error;
2410            }
2411            item = PyObject_CallObject(filter, args);
2412            Py_DECREF(args);
2413            Py_DECREF(match);
2414            if (!item)
2415                goto error;
2416        } else {
2417            /* filter is literal string */
2418            item = filter;
2419            Py_INCREF(item);
2420        }
2421
2422        /* add to list */
2423        if (item != Py_None) {
2424            status = PyList_Append(list, item);
2425            Py_DECREF(item);
2426            if (status < 0)
2427                goto error;
2428        }
2429
2430        i = e;
2431        n = n + 1;
2432
2433next:
2434        /* move on */
2435        if (state.ptr == state.start)
2436            state.start = (void*) ((char*) state.ptr + state.charsize);
2437        else
2438            state.start = state.ptr;
2439
2440    }
2441
2442    /* get segment following last match */
2443    if (i < state.endpos) {
2444        item = PySequence_GetSlice(string, i, state.endpos);
2445        if (!item)
2446            goto error;
2447        status = PyList_Append(list, item);
2448        Py_DECREF(item);
2449        if (status < 0)
2450            goto error;
2451    }
2452
2453    state_fini(&state);
2454
2455    Py_DECREF(filter);
2456
2457    /* convert list to single string (also removes list) */
2458    item = join_list(list, string);
2459
2460    if (!item)
2461        return NULL;
2462
2463    if (subn)
2464        return Py_BuildValue("Ni", item, n);
2465
2466    return item;
2467
2468error:
2469    Py_DECREF(list);
2470    state_fini(&state);
2471    Py_DECREF(filter);
2472    return NULL;
2473
2474}
2475
2476static PyObject*
2477pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
2478{
2479    PyObject* ptemplate;
2480    PyObject* string;
2481    Py_ssize_t count = 0;
2482    static char* kwlist[] = { "repl", "string", "count", NULL };
2483    if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:sub", kwlist,
2484                                     &ptemplate, &string, &count))
2485        return NULL;
2486
2487    return pattern_subx(self, ptemplate, string, count, 0);
2488}
2489
2490static PyObject*
2491pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
2492{
2493    PyObject* ptemplate;
2494    PyObject* string;
2495    Py_ssize_t count = 0;
2496    static char* kwlist[] = { "repl", "string", "count", NULL };
2497    if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|n:subn", kwlist,
2498                                     &ptemplate, &string, &count))
2499        return NULL;
2500
2501    return pattern_subx(self, ptemplate, string, count, 1);
2502}
2503
2504static PyObject*
2505pattern_copy(PatternObject* self, PyObject *unused)
2506{
2507#ifdef USE_BUILTIN_COPY
2508    PatternObject* copy;
2509    int offset;
2510
2511    copy = PyObject_NEW_VAR(PatternObject, &Pattern_Type, self->codesize);
2512    if (!copy)
2513        return NULL;
2514
2515    offset = offsetof(PatternObject, groups);
2516
2517    Py_XINCREF(self->groupindex);
2518    Py_XINCREF(self->indexgroup);
2519    Py_XINCREF(self->pattern);
2520
2521    memcpy((char*) copy + offset, (char*) self + offset,
2522           sizeof(PatternObject) + self->codesize * sizeof(SRE_CODE) - offset);
2523    copy->weakreflist = NULL;
2524
2525    return (PyObject*) copy;
2526#else
2527    PyErr_SetString(PyExc_TypeError, "cannot copy this pattern object");
2528    return NULL;
2529#endif
2530}
2531
2532static PyObject*
2533pattern_deepcopy(PatternObject* self, PyObject* memo)
2534{
2535#ifdef USE_BUILTIN_COPY
2536    PatternObject* copy;
2537
2538    copy = (PatternObject*) pattern_copy(self);
2539    if (!copy)
2540        return NULL;
2541
2542    if (!deepcopy(&copy->groupindex, memo) ||
2543        !deepcopy(&copy->indexgroup, memo) ||
2544        !deepcopy(&copy->pattern, memo)) {
2545        Py_DECREF(copy);
2546        return NULL;
2547    }
2548
2549#else
2550    PyErr_SetString(PyExc_TypeError, "cannot deepcopy this pattern object");
2551    return NULL;
2552#endif
2553}
2554
2555PyDoc_STRVAR(pattern_match_doc,
2556"match(string[, pos[, endpos]]) --> match object or None.\n\
2557    Matches zero or more characters at the beginning of the string");
2558
2559PyDoc_STRVAR(pattern_search_doc,
2560"search(string[, pos[, endpos]]) --> match object or None.\n\
2561    Scan through string looking for a match, and return a corresponding\n\
2562    MatchObject instance. Return None if no position in the string matches.");
2563
2564PyDoc_STRVAR(pattern_split_doc,
2565"split(string[, maxsplit = 0])  --> list.\n\
2566    Split string by the occurrences of pattern.");
2567
2568PyDoc_STRVAR(pattern_findall_doc,
2569"findall(string[, pos[, endpos]]) --> list.\n\
2570   Return a list of all non-overlapping matches of pattern in string.");
2571
2572PyDoc_STRVAR(pattern_finditer_doc,
2573"finditer(string[, pos[, endpos]]) --> iterator.\n\
2574    Return an iterator over all non-overlapping matches for the \n\
2575    RE pattern in string. For each match, the iterator returns a\n\
2576    match object.");
2577
2578PyDoc_STRVAR(pattern_sub_doc,
2579"sub(repl, string[, count = 0]) --> newstring\n\
2580    Return the string obtained by replacing the leftmost non-overlapping\n\
2581    occurrences of pattern in string by the replacement repl.");
2582
2583PyDoc_STRVAR(pattern_subn_doc,
2584"subn(repl, string[, count = 0]) --> (newstring, number of subs)\n\
2585    Return the tuple (new_string, number_of_subs_made) found by replacing\n\
2586    the leftmost non-overlapping occurrences of pattern with the\n\
2587    replacement repl.");
2588
2589PyDoc_STRVAR(pattern_doc, "Compiled regular expression objects");
2590
2591static PyMethodDef pattern_methods[] = {
2592    {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS,
2593        pattern_match_doc},
2594    {"search", (PyCFunction) pattern_search, METH_VARARGS|METH_KEYWORDS,
2595        pattern_search_doc},
2596    {"sub", (PyCFunction) pattern_sub, METH_VARARGS|METH_KEYWORDS,
2597        pattern_sub_doc},
2598    {"subn", (PyCFunction) pattern_subn, METH_VARARGS|METH_KEYWORDS,
2599        pattern_subn_doc},
2600    {"split", (PyCFunction) pattern_split, METH_VARARGS|METH_KEYWORDS,
2601        pattern_split_doc},
2602    {"findall", (PyCFunction) pattern_findall, METH_VARARGS|METH_KEYWORDS,
2603        pattern_findall_doc},
2604#if PY_VERSION_HEX >= 0x02020000
2605    {"finditer", (PyCFunction) pattern_finditer, METH_VARARGS,
2606        pattern_finditer_doc},
2607#endif
2608    {"scanner", (PyCFunction) pattern_scanner, METH_VARARGS},
2609    {"__copy__", (PyCFunction) pattern_copy, METH_NOARGS},
2610    {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_O},
2611    {NULL, NULL}
2612};
2613
2614#define PAT_OFF(x) offsetof(PatternObject, x)
2615static PyMemberDef pattern_members[] = {
2616    {"pattern",    T_OBJECT,    PAT_OFF(pattern),       READONLY},
2617    {"flags",      T_INT,       PAT_OFF(flags),         READONLY},
2618    {"groups",     T_PYSSIZET,  PAT_OFF(groups),        READONLY},
2619    {"groupindex", T_OBJECT,    PAT_OFF(groupindex),    READONLY},
2620    {NULL}  /* Sentinel */
2621};
2622
2623statichere PyTypeObject Pattern_Type = {
2624    PyObject_HEAD_INIT(NULL)
2625    0, "_" SRE_MODULE ".SRE_Pattern",
2626    sizeof(PatternObject), sizeof(SRE_CODE),
2627    (destructor)pattern_dealloc, /*tp_dealloc*/
2628    0,                                  /* tp_print */
2629    0,                                  /* tp_getattrn */
2630    0,          /* tp_setattr */
2631    0,          /* tp_compare */
2632    0,          /* tp_repr */
2633    0,          /* tp_as_number */
2634    0,          /* tp_as_sequence */
2635    0,          /* tp_as_mapping */
2636    0,          /* tp_hash */
2637    0,          /* tp_call */
2638    0,          /* tp_str */
2639    0,          /* tp_getattro */
2640    0,          /* tp_setattro */
2641    0,          /* tp_as_buffer */
2642    Py_TPFLAGS_DEFAULT,           /* tp_flags */
2643    pattern_doc,      /* tp_doc */
2644    0,          /* tp_traverse */
2645    0,          /* tp_clear */
2646    0,          /* tp_richcompare */
2647    offsetof(PatternObject, weakreflist), /* tp_weaklistoffset */
2648    0,          /* tp_iter */
2649    0,          /* tp_iternext */
2650    pattern_methods,      /* tp_methods */
2651    pattern_members,      /* tp_members */
2652};
2653
2654static int _validate(PatternObject *self); /* Forward */
2655
2656static PyObject *
2657_compile(PyObject* self_, PyObject* args)
2658{
2659    /* "compile" pattern descriptor to pattern object */
2660
2661    PatternObject* self;
2662    Py_ssize_t i, n;
2663
2664    PyObject* pattern;
2665    int flags = 0;
2666    PyObject* code;
2667    Py_ssize_t groups = 0;
2668    PyObject* groupindex = NULL;
2669    PyObject* indexgroup = NULL;
2670    if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags,
2671                          &PyList_Type, &code, &groups,
2672                          &groupindex, &indexgroup))
2673        return NULL;
2674
2675    n = PyList_GET_SIZE(code);
2676    /* coverity[ampersand_in_size] */
2677    self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, n);
2678    if (!self)
2679        return NULL;
2680    self->weakreflist = NULL;
2681    self->pattern = NULL;
2682    self->groupindex = NULL;
2683    self->indexgroup = NULL;
2684
2685    self->codesize = n;
2686
2687    for (i = 0; i < n; i++) {
2688        PyObject *o = PyList_GET_ITEM(code, i);
2689        unsigned long value = PyInt_Check(o) ? (unsigned long)PyInt_AsLong(o)
2690                                              : PyLong_AsUnsignedLong(o);
2691        self->code[i] = (SRE_CODE) value;
2692        if ((unsigned long) self->code[i] != value) {
2693            PyErr_SetString(PyExc_OverflowError,
2694                            "regular expression code size limit exceeded");
2695            break;
2696        }
2697    }
2698
2699    if (PyErr_Occurred()) {
2700        Py_DECREF(self);
2701        return NULL;
2702    }
2703
2704    Py_INCREF(pattern);
2705    self->pattern = pattern;
2706
2707    self->flags = flags;
2708
2709    self->groups = groups;
2710
2711    Py_XINCREF(groupindex);
2712    self->groupindex = groupindex;
2713
2714    Py_XINCREF(indexgroup);
2715    self->indexgroup = indexgroup;
2716
2717    self->weakreflist = NULL;
2718
2719    if (!_validate(self)) {
2720        Py_DECREF(self);
2721        return NULL;
2722    }
2723
2724    return (PyObject*) self;
2725}
2726
2727/* -------------------------------------------------------------------- */
2728/* Code validation */
2729
2730/* To learn more about this code, have a look at the _compile() function in
2731   Lib/sre_compile.py.  The validation functions below checks the code array
2732   for conformance with the code patterns generated there.
2733
2734   The nice thing about the generated code is that it is position-independent:
2735   all jumps are relative jumps forward.  Also, jumps don't cross each other:
2736   the target of a later jump is always earlier than the target of an earlier
2737   jump.  IOW, this is okay:
2738
2739   J---------J-------T--------T
2740    \         \_____/        /
2741     \______________________/
2742
2743   but this is not:
2744
2745   J---------J-------T--------T
2746    \_________\_____/        /
2747               \____________/
2748
2749   It also helps that SRE_CODE is always an unsigned type, either 2 bytes or 4
2750   bytes wide (the latter if Python is compiled for "wide" unicode support).
2751*/
2752
2753/* Defining this one enables tracing of the validator */
2754#undef VVERBOSE
2755
2756/* Trace macro for the validator */
2757#if defined(VVERBOSE)
2758#define VTRACE(v) printf v
2759#else
2760#define VTRACE(v)
2761#endif
2762
2763/* Report failure */
2764#define FAIL do { VTRACE(("FAIL: %d\n", __LINE__)); return 0; } while (0)
2765
2766/* Extract opcode, argument, or skip count from code array */
2767#define GET_OP                                          \
2768    do {                                                \
2769        VTRACE(("%p: ", code));                         \
2770        if (code >= end) FAIL;                          \
2771        op = *code++;                                   \
2772        VTRACE(("%lu (op)\n", (unsigned long)op));      \
2773    } while (0)
2774#define GET_ARG                                         \
2775    do {                                                \
2776        VTRACE(("%p= ", code));                         \
2777        if (code >= end) FAIL;                          \
2778        arg = *code++;                                  \
2779        VTRACE(("%lu (arg)\n", (unsigned long)arg));    \
2780    } while (0)
2781#define GET_SKIP_ADJ(adj)                               \
2782    do {                                                \
2783        VTRACE(("%p= ", code));                         \
2784        if (code >= end) FAIL;                          \
2785        skip = *code;                                   \
2786        VTRACE(("%lu (skip to %p)\n",                   \
2787               (unsigned long)skip, code+skip));        \
2788        if (code+skip-adj < code || code+skip-adj > end)\
2789            FAIL;                                       \
2790        code++;                                         \
2791    } while (0)
2792#define GET_SKIP GET_SKIP_ADJ(0)
2793
2794static int
2795_validate_charset(SRE_CODE *code, SRE_CODE *end)
2796{
2797    /* Some variables are manipulated by the macros above */
2798    SRE_CODE op;
2799    SRE_CODE arg;
2800    SRE_CODE offset;
2801    int i;
2802
2803    while (code < end) {
2804        GET_OP;
2805        switch (op) {
2806
2807        case SRE_OP_NEGATE:
2808            break;
2809
2810        case SRE_OP_LITERAL:
2811            GET_ARG;
2812            break;
2813
2814        case SRE_OP_RANGE:
2815            GET_ARG;
2816            GET_ARG;
2817            break;
2818
2819        case SRE_OP_CHARSET:
2820            offset = 32/sizeof(SRE_CODE); /* 32-byte bitmap */
2821            if (code+offset < code || code+offset > end)
2822                FAIL;
2823            code += offset;
2824            break;
2825
2826        case SRE_OP_BIGCHARSET:
2827            GET_ARG; /* Number of blocks */
2828            offset = 256/sizeof(SRE_CODE); /* 256-byte table */
2829            if (code+offset < code || code+offset > end)
2830                FAIL;
2831            /* Make sure that each byte points to a valid block */
2832            for (i = 0; i < 256; i++) {
2833                if (((unsigned char *)code)[i] >= arg)
2834                    FAIL;
2835            }
2836            code += offset;
2837            offset = arg * 32/sizeof(SRE_CODE); /* 32-byte bitmap times arg */
2838            if (code+offset < code || code+offset > end)
2839                FAIL;
2840            code += offset;
2841            break;
2842
2843        case SRE_OP_CATEGORY:
2844            GET_ARG;
2845            switch (arg) {
2846            case SRE_CATEGORY_DIGIT:
2847            case SRE_CATEGORY_NOT_DIGIT:
2848            case SRE_CATEGORY_SPACE:
2849            case SRE_CATEGORY_NOT_SPACE:
2850            case SRE_CATEGORY_WORD:
2851            case SRE_CATEGORY_NOT_WORD:
2852            case SRE_CATEGORY_LINEBREAK:
2853            case SRE_CATEGORY_NOT_LINEBREAK:
2854            case SRE_CATEGORY_LOC_WORD:
2855            case SRE_CATEGORY_LOC_NOT_WORD:
2856            case SRE_CATEGORY_UNI_DIGIT:
2857            case SRE_CATEGORY_UNI_NOT_DIGIT:
2858            case SRE_CATEGORY_UNI_SPACE:
2859            case SRE_CATEGORY_UNI_NOT_SPACE:
2860            case SRE_CATEGORY_UNI_WORD:
2861            case SRE_CATEGORY_UNI_NOT_WORD:
2862            case SRE_CATEGORY_UNI_LINEBREAK:
2863            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
2864                break;
2865            default:
2866                FAIL;
2867            }
2868            break;
2869
2870        default:
2871            FAIL;
2872
2873        }
2874    }
2875
2876    return 1;
2877}
2878
2879static int
2880_validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
2881{
2882    /* Some variables are manipulated by the macros above */
2883    SRE_CODE op;
2884    SRE_CODE arg;
2885    SRE_CODE skip;
2886
2887    VTRACE(("code=%p, end=%p\n", code, end));
2888
2889    if (code > end)
2890        FAIL;
2891
2892    while (code < end) {
2893        GET_OP;
2894        switch (op) {
2895
2896        case SRE_OP_MARK:
2897            /* We don't check whether marks are properly nested; the
2898               sre_match() code is robust even if they don't, and the worst
2899               you can get is nonsensical match results. */
2900            GET_ARG;
2901            if (arg > 2*groups+1) {
2902                VTRACE(("arg=%d, groups=%d\n", (int)arg, (int)groups));
2903                FAIL;
2904            }
2905            break;
2906
2907        case SRE_OP_LITERAL:
2908        case SRE_OP_NOT_LITERAL:
2909        case SRE_OP_LITERAL_IGNORE:
2910        case SRE_OP_NOT_LITERAL_IGNORE:
2911            GET_ARG;
2912            /* The arg is just a character, nothing to check */
2913            break;
2914
2915        case SRE_OP_SUCCESS:
2916        case SRE_OP_FAILURE:
2917            /* Nothing to check; these normally end the matching process */
2918            break;
2919
2920        case SRE_OP_AT:
2921            GET_ARG;
2922            switch (arg) {
2923            case SRE_AT_BEGINNING:
2924            case SRE_AT_BEGINNING_STRING:
2925            case SRE_AT_BEGINNING_LINE:
2926            case SRE_AT_END:
2927            case SRE_AT_END_LINE:
2928            case SRE_AT_END_STRING:
2929            case SRE_AT_BOUNDARY:
2930            case SRE_AT_NON_BOUNDARY:
2931            case SRE_AT_LOC_BOUNDARY:
2932            case SRE_AT_LOC_NON_BOUNDARY:
2933            case SRE_AT_UNI_BOUNDARY:
2934            case SRE_AT_UNI_NON_BOUNDARY:
2935                break;
2936            default:
2937                FAIL;
2938            }
2939            break;
2940
2941        case SRE_OP_ANY:
2942        case SRE_OP_ANY_ALL:
2943            /* These have no operands */
2944            break;
2945
2946        case SRE_OP_IN:
2947        case SRE_OP_IN_IGNORE:
2948            GET_SKIP;
2949            /* Stop 1 before the end; we check the FAILURE below */
2950            if (!_validate_charset(code, code+skip-2))
2951                FAIL;
2952            if (code[skip-2] != SRE_OP_FAILURE)
2953                FAIL;
2954            code += skip-1;
2955            break;
2956
2957        case SRE_OP_INFO:
2958            {
2959                /* A minimal info field is
2960                   <INFO> <1=skip> <2=flags> <3=min> <4=max>;
2961                   If SRE_INFO_PREFIX or SRE_INFO_CHARSET is in the flags,
2962                   more follows. */
2963                SRE_CODE flags, i;
2964                SRE_CODE *newcode;
2965                GET_SKIP;
2966                newcode = code+skip-1;
2967                GET_ARG; flags = arg;
2968                GET_ARG; /* min */
2969                GET_ARG; /* max */
2970                /* Check that only valid flags are present */
2971                if ((flags & ~(SRE_INFO_PREFIX |
2972                               SRE_INFO_LITERAL |
2973                               SRE_INFO_CHARSET)) != 0)
2974                    FAIL;
2975                /* PREFIX and CHARSET are mutually exclusive */
2976                if ((flags & SRE_INFO_PREFIX) &&
2977                    (flags & SRE_INFO_CHARSET))
2978                    FAIL;
2979                /* LITERAL implies PREFIX */
2980                if ((flags & SRE_INFO_LITERAL) &&
2981                    !(flags & SRE_INFO_PREFIX))
2982                    FAIL;
2983                /* Validate the prefix */
2984                if (flags & SRE_INFO_PREFIX) {
2985                    SRE_CODE prefix_len;
2986                    GET_ARG; prefix_len = arg;
2987                    GET_ARG; /* prefix skip */
2988                    /* Here comes the prefix string */
2989                    if (code+prefix_len < code || code+prefix_len > newcode)
2990                        FAIL;
2991                    code += prefix_len;
2992                    /* And here comes the overlap table */
2993                    if (code+prefix_len < code || code+prefix_len > newcode)
2994                        FAIL;
2995                    /* Each overlap value should be < prefix_len */
2996                    for (i = 0; i < prefix_len; i++) {
2997                        if (code[i] >= prefix_len)
2998                            FAIL;
2999                    }
3000                    code += prefix_len;
3001                }
3002                /* Validate the charset */
3003                if (flags & SRE_INFO_CHARSET) {
3004                    if (!_validate_charset(code, newcode-1))
3005                        FAIL;
3006                    if (newcode[-1] != SRE_OP_FAILURE)
3007                        FAIL;
3008                    code = newcode;
3009                }
3010                else if (code != newcode) {
3011                  VTRACE(("code=%p, newcode=%p\n", code, newcode));
3012                    FAIL;
3013                }
3014            }
3015            break;
3016
3017        case SRE_OP_BRANCH:
3018            {
3019                SRE_CODE *target = NULL;
3020                for (;;) {
3021                    GET_SKIP;
3022                    if (skip == 0)
3023                        break;
3024                    /* Stop 2 before the end; we check the JUMP below */
3025                    if (!_validate_inner(code, code+skip-3, groups))
3026                        FAIL;
3027                    code += skip-3;
3028                    /* Check that it ends with a JUMP, and that each JUMP
3029                       has the same target */
3030                    GET_OP;
3031                    if (op != SRE_OP_JUMP)
3032                        FAIL;
3033                    GET_SKIP;
3034                    if (target == NULL)
3035                        target = code+skip-1;
3036                    else if (code+skip-1 != target)
3037                        FAIL;
3038                }
3039            }
3040            break;
3041
3042        case SRE_OP_REPEAT_ONE:
3043        case SRE_OP_MIN_REPEAT_ONE:
3044            {
3045                SRE_CODE min, max;
3046                GET_SKIP;
3047                GET_ARG; min = arg;
3048                GET_ARG; max = arg;
3049                if (min > max)
3050                    FAIL;
3051#ifdef Py_UNICODE_WIDE
3052                if (max > 65535)
3053                    FAIL;
3054#endif
3055                if (!_validate_inner(code, code+skip-4, groups))
3056                    FAIL;
3057                code += skip-4;
3058                GET_OP;
3059                if (op != SRE_OP_SUCCESS)
3060                    FAIL;
3061            }
3062            break;
3063
3064        case SRE_OP_REPEAT:
3065            {
3066                SRE_CODE min, max;
3067                GET_SKIP;
3068                GET_ARG; min = arg;
3069                GET_ARG; max = arg;
3070                if (min > max)
3071                    FAIL;
3072#ifdef Py_UNICODE_WIDE
3073                if (max > 65535)
3074                    FAIL;
3075#endif
3076                if (!_validate_inner(code, code+skip-3, groups))
3077                    FAIL;
3078                code += skip-3;
3079                GET_OP;
3080                if (op != SRE_OP_MAX_UNTIL && op != SRE_OP_MIN_UNTIL)
3081                    FAIL;
3082            }
3083            break;
3084
3085        case SRE_OP_GROUPREF:
3086        case SRE_OP_GROUPREF_IGNORE:
3087            GET_ARG;
3088            if (arg >= groups)
3089                FAIL;
3090            break;
3091
3092        case SRE_OP_GROUPREF_EXISTS:
3093            /* The regex syntax for this is: '(?(group)then|else)', where
3094               'group' is either an integer group number or a group name,
3095               'then' and 'else' are sub-regexes, and 'else' is optional. */
3096            GET_ARG;
3097            if (arg >= groups)
3098                FAIL;
3099            GET_SKIP_ADJ(1);
3100            code--; /* The skip is relative to the first arg! */
3101            /* There are two possibilities here: if there is both a 'then'
3102               part and an 'else' part, the generated code looks like:
3103
3104               GROUPREF_EXISTS
3105               <group>
3106               <skipyes>
3107               ...then part...
3108               JUMP
3109               <skipno>
3110               (<skipyes> jumps here)
3111               ...else part...
3112               (<skipno> jumps here)
3113
3114               If there is only a 'then' part, it looks like:
3115
3116               GROUPREF_EXISTS
3117               <group>
3118               <skip>
3119               ...then part...
3120               (<skip> jumps here)
3121
3122               There is no direct way to decide which it is, and we don't want
3123               to allow arbitrary jumps anywhere in the code; so we just look
3124               for a JUMP opcode preceding our skip target.
3125            */
3126            if (skip >= 3 && code+skip-3 >= code &&
3127                code[skip-3] == SRE_OP_JUMP)
3128            {
3129                VTRACE(("both then and else parts present\n"));
3130                if (!_validate_inner(code+1, code+skip-3, groups))
3131                    FAIL;
3132                code += skip-2; /* Position after JUMP, at <skipno> */
3133                GET_SKIP;
3134                if (!_validate_inner(code, code+skip-1, groups))
3135                    FAIL;
3136                code += skip-1;
3137            }
3138            else {
3139                VTRACE(("only a then part present\n"));
3140                if (!_validate_inner(code+1, code+skip-1, groups))
3141                    FAIL;
3142                code += skip-1;
3143            }
3144            break;
3145
3146        case SRE_OP_ASSERT:
3147        case SRE_OP_ASSERT_NOT:
3148            GET_SKIP;
3149            GET_ARG; /* 0 for lookahead, width for lookbehind */
3150            code--; /* Back up over arg to simplify math below */
3151            if (arg & 0x80000000)
3152                FAIL; /* Width too large */
3153            /* Stop 1 before the end; we check the SUCCESS below */
3154            if (!_validate_inner(code+1, code+skip-2, groups))
3155                FAIL;
3156            code += skip-2;
3157            GET_OP;
3158            if (op != SRE_OP_SUCCESS)
3159                FAIL;
3160            break;
3161
3162        default:
3163            FAIL;
3164
3165        }
3166    }
3167
3168    VTRACE(("okay\n"));
3169    return 1;
3170}
3171
3172static int
3173_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
3174{
3175    if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS)
3176        FAIL;
3177    if (groups == 0)  /* fix for simplejson */
3178        groups = 100; /* 100 groups should always be safe */
3179    return _validate_inner(code, end-1, groups);
3180}
3181
3182static int
3183_validate(PatternObject *self)
3184{
3185    if (!_validate_outer(self->code, self->code+self->codesize, self->groups))
3186    {
3187        PyErr_SetString(PyExc_RuntimeError, "invalid SRE code");
3188        return 0;
3189    }
3190    else
3191        VTRACE(("Success!\n"));
3192    return 1;
3193}
3194
3195/* -------------------------------------------------------------------- */
3196/* match methods */
3197
3198static void
3199match_dealloc(MatchObject* self)
3200{
3201    Py_XDECREF(self->regs);
3202    Py_XDECREF(self->string);
3203    Py_DECREF(self->pattern);
3204    PyObject_DEL(self);
3205}
3206
3207static PyObject*
3208match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
3209{
3210    if (index < 0 || index >= self->groups) {
3211        /* raise IndexError if we were given a bad group number */
3212        PyErr_SetString(
3213            PyExc_IndexError,
3214            "no such group"
3215            );
3216        return NULL;
3217    }
3218
3219    index *= 2;
3220
3221    if (self->string == Py_None || self->mark[index] < 0) {
3222        /* return default value if the string or group is undefined */
3223        Py_INCREF(def);
3224        return def;
3225    }
3226
3227    return PySequence_GetSlice(
3228        self->string, self->mark[index], self->mark[index+1]
3229        );
3230}
3231
3232static Py_ssize_t
3233match_getindex(MatchObject* self, PyObject* index)
3234{
3235    Py_ssize_t i;
3236
3237    if (PyInt_Check(index))
3238        return PyInt_AsSsize_t(index);
3239
3240    i = -1;
3241
3242    if (self->pattern->groupindex) {
3243        index = PyObject_GetItem(self->pattern->groupindex, index);
3244        if (index) {
3245            if (PyInt_Check(index) || PyLong_Check(index))
3246                i = PyInt_AsSsize_t(index);
3247            Py_DECREF(index);
3248        } else
3249            PyErr_Clear();
3250    }
3251
3252    return i;
3253}
3254
3255static PyObject*
3256match_getslice(MatchObject* self, PyObject* index, PyObject* def)
3257{
3258    return match_getslice_by_index(self, match_getindex(self, index), def);
3259}
3260
3261static PyObject*
3262match_expand(MatchObject* self, PyObject* ptemplate)
3263{
3264    /* delegate to Python code */
3265    return call(
3266        SRE_PY_MODULE, "_expand",
3267        PyTuple_Pack(3, self->pattern, self, ptemplate)
3268        );
3269}
3270
3271static PyObject*
3272match_group(MatchObject* self, PyObject* args)
3273{
3274    PyObject* result;
3275    Py_ssize_t i, size;
3276
3277    size = PyTuple_GET_SIZE(args);
3278
3279    switch (size) {
3280    case 0:
3281        result = match_getslice(self, Py_False, Py_None);
3282        break;
3283    case 1:
3284        result = match_getslice(self, PyTuple_GET_ITEM(args, 0), Py_None);
3285        break;
3286    default:
3287        /* fetch multiple items */
3288        result = PyTuple_New(size);
3289        if (!result)
3290            return NULL;
3291        for (i = 0; i < size; i++) {
3292            PyObject* item = match_getslice(
3293                self, PyTuple_GET_ITEM(args, i), Py_None
3294                );
3295            if (!item) {
3296                Py_DECREF(result);
3297                return NULL;
3298            }
3299            PyTuple_SET_ITEM(result, i, item);
3300        }
3301        break;
3302    }
3303    return result;
3304}
3305
3306static PyObject*
3307match_groups(MatchObject* self, PyObject* args, PyObject* kw)
3308{
3309    PyObject* result;
3310    Py_ssize_t index;
3311
3312    PyObject* def = Py_None;
3313    static char* kwlist[] = { "default", NULL };
3314    if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groups", kwlist, &def))
3315        return NULL;
3316
3317    result = PyTuple_New(self->groups-1);
3318    if (!result)
3319        return NULL;
3320
3321    for (index = 1; index < self->groups; index++) {
3322        PyObject* item;
3323        item = match_getslice_by_index(self, index, def);
3324        if (!item) {
3325            Py_DECREF(result);
3326            return NULL;
3327        }
3328        PyTuple_SET_ITEM(result, index-1, item);
3329    }
3330
3331    return result;
3332}
3333
3334static PyObject*
3335match_groupdict(MatchObject* self, PyObject* args, PyObject* kw)
3336{
3337    PyObject* result;
3338    PyObject* keys;
3339    Py_ssize_t index;
3340
3341    PyObject* def = Py_None;
3342    static char* kwlist[] = { "default", NULL };
3343    if (!PyArg_ParseTupleAndKeywords(args, kw, "|O:groupdict", kwlist, &def))
3344        return NULL;
3345
3346    result = PyDict_New();
3347    if (!result || !self->pattern->groupindex)
3348        return result;
3349
3350    keys = PyMapping_Keys(self->pattern->groupindex);
3351    if (!keys)
3352        goto failed;
3353
3354    for (index = 0; index < PyList_GET_SIZE(keys); index++) {
3355        int status;
3356        PyObject* key;
3357        PyObject* value;
3358        key = PyList_GET_ITEM(keys, index);
3359        if (!key)
3360            goto failed;
3361        value = match_getslice(self, key, def);
3362        if (!value) {
3363            Py_DECREF(key);
3364            goto failed;
3365        }
3366        status = PyDict_SetItem(result, key, value);
3367        Py_DECREF(value);
3368        if (status < 0)
3369            goto failed;
3370    }
3371
3372    Py_DECREF(keys);
3373
3374    return result;
3375
3376failed:
3377    Py_XDECREF(keys);
3378    Py_DECREF(result);
3379    return NULL;
3380}
3381
3382static PyObject*
3383match_start(MatchObject* self, PyObject* args)
3384{
3385    Py_ssize_t index;
3386
3387    PyObject* index_ = Py_False; /* zero */
3388    if (!PyArg_UnpackTuple(args, "start", 0, 1, &index_))
3389        return NULL;
3390
3391    index = match_getindex(self, index_);
3392
3393    if (index < 0 || index >= self->groups) {
3394        PyErr_SetString(
3395            PyExc_IndexError,
3396            "no such group"
3397            );
3398        return NULL;
3399    }
3400
3401    /* mark is -1 if group is undefined */
3402    return Py_BuildValue("i", self->mark[index*2]);
3403}
3404
3405static PyObject*
3406match_end(MatchObject* self, PyObject* args)
3407{
3408    Py_ssize_t index;
3409
3410    PyObject* index_ = Py_False; /* zero */
3411    if (!PyArg_UnpackTuple(args, "end", 0, 1, &index_))
3412        return NULL;
3413
3414    index = match_getindex(self, index_);
3415
3416    if (index < 0 || index >= self->groups) {
3417        PyErr_SetString(
3418            PyExc_IndexError,
3419            "no such group"
3420            );
3421        return NULL;
3422    }
3423
3424    /* mark is -1 if group is undefined */
3425    return Py_BuildValue("i", self->mark[index*2+1]);
3426}
3427
3428LOCAL(PyObject*)
3429_pair(Py_ssize_t i1, Py_ssize_t i2)
3430{
3431    PyObject* pair;
3432    PyObject* item;
3433
3434    pair = PyTuple_New(2);
3435    if (!pair)
3436        return NULL;
3437
3438    item = PyInt_FromSsize_t(i1);
3439    if (!item)
3440        goto error;
3441    PyTuple_SET_ITEM(pair, 0, item);
3442
3443    item = PyInt_FromSsize_t(i2);
3444    if (!item)
3445        goto error;
3446    PyTuple_SET_ITEM(pair, 1, item);
3447
3448    return pair;
3449
3450  error:
3451    Py_DECREF(pair);
3452    return NULL;
3453}
3454
3455static PyObject*
3456match_span(MatchObject* self, PyObject* args)
3457{
3458    Py_ssize_t index;
3459
3460    PyObject* index_ = Py_False; /* zero */
3461    if (!PyArg_UnpackTuple(args, "span", 0, 1, &index_))
3462        return NULL;
3463
3464    index = match_getindex(self, index_);
3465
3466    if (index < 0 || index >= self->groups) {
3467        PyErr_SetString(
3468            PyExc_IndexError,
3469            "no such group"
3470            );
3471        return NULL;
3472    }
3473
3474    /* marks are -1 if group is undefined */
3475    return _pair(self->mark[index*2], self->mark[index*2+1]);
3476}
3477
3478static PyObject*
3479match_regs(MatchObject* self)
3480{
3481    PyObject* regs;
3482    PyObject* item;
3483    Py_ssize_t index;
3484
3485    regs = PyTuple_New(self->groups);
3486    if (!regs)
3487        return NULL;
3488
3489    for (index = 0; index < self->groups; index++) {
3490        item = _pair(self->mark[index*2], self->mark[index*2+1]);
3491        if (!item) {
3492            Py_DECREF(regs);
3493            return NULL;
3494        }
3495        PyTuple_SET_ITEM(regs, index, item);
3496    }
3497
3498    Py_INCREF(regs);
3499    self->regs = regs;
3500
3501    return regs;
3502}
3503
3504static PyObject*
3505match_copy(MatchObject* self, PyObject *unused)
3506{
3507#ifdef USE_BUILTIN_COPY
3508    MatchObject* copy;
3509    Py_ssize_t slots, offset;
3510
3511    slots = 2 * (self->pattern->groups+1);
3512
3513    copy = PyObject_NEW_VAR(MatchObject, &Match_Type, slots);
3514    if (!copy)
3515        return NULL;
3516
3517    /* this value a constant, but any compiler should be able to
3518       figure that out all by itself */
3519    offset = offsetof(MatchObject, string);
3520
3521    Py_XINCREF(self->pattern);
3522    Py_XINCREF(self->string);
3523    Py_XINCREF(self->regs);
3524
3525    memcpy((char*) copy + offset, (char*) self + offset,
3526           sizeof(MatchObject) + slots * sizeof(Py_ssize_t) - offset);
3527
3528    return (PyObject*) copy;
3529#else
3530    PyErr_SetString(PyExc_TypeError, "cannot copy this match object");
3531    return NULL;
3532#endif
3533}
3534
3535static PyObject*
3536match_deepcopy(MatchObject* self, PyObject* memo)
3537{
3538#ifdef USE_BUILTIN_COPY
3539    MatchObject* copy;
3540
3541    copy = (MatchObject*) match_copy(self);
3542    if (!copy)
3543        return NULL;
3544
3545    if (!deepcopy((PyObject**) &copy->pattern, memo) ||
3546        !deepcopy(&copy->string, memo) ||
3547        !deepcopy(&copy->regs, memo)) {
3548        Py_DECREF(copy);
3549        return NULL;
3550    }
3551
3552#else
3553    PyErr_SetString(PyExc_TypeError, "cannot deepcopy this match object");
3554    return NULL;
3555#endif
3556}
3557
3558static struct PyMethodDef match_methods[] = {
3559    {"group", (PyCFunction) match_group, METH_VARARGS},
3560    {"start", (PyCFunction) match_start, METH_VARARGS},
3561    {"end", (PyCFunction) match_end, METH_VARARGS},
3562    {"span", (PyCFunction) match_span, METH_VARARGS},
3563    {"groups", (PyCFunction) match_groups, METH_VARARGS|METH_KEYWORDS},
3564    {"groupdict", (PyCFunction) match_groupdict, METH_VARARGS|METH_KEYWORDS},
3565    {"expand", (PyCFunction) match_expand, METH_O},
3566    {"__copy__", (PyCFunction) match_copy, METH_NOARGS},
3567    {"__deepcopy__", (PyCFunction) match_deepcopy, METH_O},
3568    {NULL, NULL}
3569};
3570
3571static PyObject *
3572match_lastindex_get(MatchObject *self)
3573{
3574    if (self->lastindex >= 0)
3575  return Py_BuildValue("i", self->lastindex);
3576    Py_INCREF(Py_None);
3577    return Py_None;
3578}
3579
3580static PyObject *
3581match_lastgroup_get(MatchObject *self)
3582{
3583    if (self->pattern->indexgroup && self->lastindex >= 0) {
3584        PyObject* result = PySequence_GetItem(
3585            self->pattern->indexgroup, self->lastindex
3586            );
3587        if (result)
3588            return result;
3589        PyErr_Clear();
3590    }
3591    Py_INCREF(Py_None);
3592    return Py_None;
3593}
3594
3595static PyObject *
3596match_regs_get(MatchObject *self)
3597{
3598    if (self->regs) {
3599        Py_INCREF(self->regs);
3600        return self->regs;
3601    } else
3602        return match_regs(self);
3603}
3604
3605static PyGetSetDef match_getset[] = {
3606    {"lastindex", (getter)match_lastindex_get, (setter)NULL},
3607    {"lastgroup", (getter)match_lastgroup_get, (setter)NULL},
3608    {"regs",      (getter)match_regs_get,      (setter)NULL},
3609    {NULL}
3610};
3611
3612#define MATCH_OFF(x) offsetof(MatchObject, x)
3613static PyMemberDef match_members[] = {
3614    {"string",  T_OBJECT,   MATCH_OFF(string),  READONLY},
3615    {"re",      T_OBJECT,   MATCH_OFF(pattern), READONLY},
3616    {"pos",     T_PYSSIZET, MATCH_OFF(pos),     READONLY},
3617    {"endpos",  T_PYSSIZET, MATCH_OFF(endpos),  READONLY},
3618    {NULL}
3619};
3620
3621
3622/* FIXME: implement setattr("string", None) as a special case (to
3623   detach the associated string, if any */
3624
3625static PyTypeObject Match_Type = {
3626    PyVarObject_HEAD_INIT(NULL, 0)
3627    "_" SRE_MODULE ".SRE_Match",
3628    sizeof(MatchObject), sizeof(Py_ssize_t),
3629    (destructor)match_dealloc,  /* tp_dealloc */
3630    0,                          /* tp_print */
3631    0,                          /* tp_getattr */
3632    0,                          /* tp_setattr */
3633    0,                          /* tp_compare */
3634    0,                          /* tp_repr */
3635    0,                          /* tp_as_number */
3636    0,                          /* tp_as_sequence */
3637    0,                          /* tp_as_mapping */
3638    0,                          /* tp_hash */
3639    0,                          /* tp_call */
3640    0,                          /* tp_str */
3641    0,                          /* tp_getattro */
3642    0,                          /* tp_setattro */
3643    0,                          /* tp_as_buffer */
3644    Py_TPFLAGS_DEFAULT,
3645    0,                          /* tp_doc */
3646    0,                          /* tp_traverse */
3647    0,                          /* tp_clear */
3648    0,                          /* tp_richcompare */
3649    0,                          /* tp_weaklistoffset */
3650    0,                          /* tp_iter */
3651    0,                          /* tp_iternext */
3652    match_methods,    /* tp_methods */
3653    match_members,    /* tp_members */
3654    match_getset,           /* tp_getset */
3655};
3656
3657static PyObject*
3658pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
3659{
3660    /* create match object (from state object) */
3661
3662    MatchObject* match;
3663    Py_ssize_t i, j;
3664    char* base;
3665    int n;
3666
3667    if (status > 0) {
3668
3669        /* create match object (with room for extra group marks) */
3670        /* coverity[ampersand_in_size] */
3671        match = PyObject_NEW_VAR(MatchObject, &Match_Type,
3672                                 2*(pattern->groups+1));
3673        if (!match)
3674            return NULL;
3675
3676        Py_INCREF(pattern);
3677        match->pattern = pattern;
3678
3679        Py_INCREF(state->string);
3680        match->string = state->string;
3681
3682        match->regs = NULL;
3683        match->groups = pattern->groups+1;
3684
3685        /* fill in group slices */
3686
3687        base = (char*) state->beginning;
3688        n = state->charsize;
3689
3690        match->mark[0] = ((char*) state->start - base) / n;
3691        match->mark[1] = ((char*) state->ptr - base) / n;
3692
3693        for (i = j = 0; i < pattern->groups; i++, j+=2)
3694            if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
3695                match->mark[j+2] = ((char*) state->mark[j] - base) / n;
3696                match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
3697            } else
3698                match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
3699
3700        match->pos = state->pos;
3701        match->endpos = state->endpos;
3702
3703        match->lastindex = state->lastindex;
3704
3705        return (PyObject*) match;
3706
3707    } else if (status == 0) {
3708
3709        /* no match */
3710        Py_INCREF(Py_None);
3711        return Py_None;
3712
3713    }
3714
3715    /* internal error */
3716    pattern_error(status);
3717    return NULL;
3718}
3719
3720
3721/* -------------------------------------------------------------------- */
3722/* scanner methods (experimental) */
3723
3724static void
3725scanner_dealloc(ScannerObject* self)
3726{
3727    state_fini(&self->state);
3728    Py_XDECREF(self->pattern);
3729    PyObject_DEL(self);
3730}
3731
3732static PyObject*
3733scanner_match(ScannerObject* self, PyObject *unused)
3734{
3735    SRE_STATE* state = &self->state;
3736    PyObject* match;
3737    int status;
3738
3739    state_reset(state);
3740
3741    state->ptr = state->start;
3742
3743    if (state->charsize == 1) {
3744        status = sre_match(state, PatternObject_GetCode(self->pattern));
3745    } else {
3746#if defined(HAVE_UNICODE)
3747        status = sre_umatch(state, PatternObject_GetCode(self->pattern));
3748#endif
3749    }
3750    if (PyErr_Occurred())
3751        return NULL;
3752
3753    match = pattern_new_match((PatternObject*) self->pattern,
3754                               state, status);
3755
3756    if (status == 0 || state->ptr == state->start)
3757        state->start = (void*) ((char*) state->ptr + state->charsize);
3758    else
3759        state->start = state->ptr;
3760
3761    return match;
3762}
3763
3764
3765static PyObject*
3766scanner_search(ScannerObject* self, PyObject *unused)
3767{
3768    SRE_STATE* state = &self->state;
3769    PyObject* match;
3770    int status;
3771
3772    state_reset(state);
3773
3774    state->ptr = state->start;
3775
3776    if (state->charsize == 1) {
3777        status = sre_search(state, PatternObject_GetCode(self->pattern));
3778    } else {
3779#if defined(HAVE_UNICODE)
3780        status = sre_usearch(state, PatternObject_GetCode(self->pattern));
3781#endif
3782    }
3783    if (PyErr_Occurred())
3784        return NULL;
3785
3786    match = pattern_new_match((PatternObject*) self->pattern,
3787                               state, status);
3788
3789    if (status == 0 || state->ptr == state->start)
3790        state->start = (void*) ((char*) state->ptr + state->charsize);
3791    else
3792        state->start = state->ptr;
3793
3794    return match;
3795}
3796
3797static PyMethodDef scanner_methods[] = {
3798    {"match", (PyCFunction) scanner_match, METH_NOARGS},
3799    {"search", (PyCFunction) scanner_search, METH_NOARGS},
3800    {NULL, NULL}
3801};
3802
3803#define SCAN_OFF(x) offsetof(ScannerObject, x)
3804static PyMemberDef scanner_members[] = {
3805    {"pattern", T_OBJECT, SCAN_OFF(pattern),  READONLY},
3806    {NULL}  /* Sentinel */
3807};
3808
3809statichere PyTypeObject Scanner_Type = {
3810    PyObject_HEAD_INIT(NULL)
3811    0, "_" SRE_MODULE ".SRE_Scanner",
3812    sizeof(ScannerObject), 0,
3813    (destructor)scanner_dealloc, /*tp_dealloc*/
3814    0,        /* tp_print */
3815    0,        /* tp_getattr */
3816    0,        /* tp_setattr */
3817    0,        /* tp_reserved */
3818    0,        /* tp_repr */
3819    0,        /* tp_as_number */
3820    0,        /* tp_as_sequence */
3821    0,        /* tp_as_mapping */
3822    0,        /* tp_hash */
3823    0,        /* tp_call */
3824    0,        /* tp_str */
3825    0,        /* tp_getattro */
3826    0,        /* tp_setattro */
3827    0,        /* tp_as_buffer */
3828    Py_TPFLAGS_DEFAULT,   /* tp_flags */
3829    0,        /* tp_doc */
3830    0,        /* tp_traverse */
3831    0,        /* tp_clear */
3832    0,        /* tp_richcompare */
3833    0,        /* tp_weaklistoffset */
3834    0,        /* tp_iter */
3835    0,        /* tp_iternext */
3836    scanner_methods,    /* tp_methods */
3837    scanner_members,    /* tp_members */
3838    0,        /* tp_getset */
3839};
3840
3841static PyObject*
3842pattern_scanner(PatternObject* pattern, PyObject* args)
3843{
3844    /* create search state object */
3845
3846    ScannerObject* self;
3847
3848    PyObject* string;
3849    Py_ssize_t start = 0;
3850    Py_ssize_t end = PY_SSIZE_T_MAX;
3851    if (!PyArg_ParseTuple(args, "O|nn:scanner", &string, &start, &end))
3852        return NULL;
3853
3854    /* create scanner object */
3855    self = PyObject_NEW(ScannerObject, &Scanner_Type);
3856    if (!self)
3857        return NULL;
3858    self->pattern = NULL;
3859
3860    string = state_init(&self->state, pattern, string, start, end);
3861    if (!string) {
3862        Py_DECREF(self);
3863        return NULL;
3864    }
3865
3866    Py_INCREF(pattern);
3867    self->pattern = (PyObject*) pattern;
3868
3869    return (PyObject*) self;
3870}
3871
3872static PyMethodDef _functions[] = {
3873    {"compile", _compile, METH_VARARGS},
3874    {"getcodesize", sre_codesize, METH_NOARGS},
3875    {"getlower", sre_getlower, METH_VARARGS},
3876    {NULL, NULL}
3877};
3878
3879#if PY_VERSION_HEX < 0x02030000
3880DL_EXPORT(void) init_sre(void)
3881#else
3882PyMODINIT_FUNC init_sre(void)
3883#endif
3884{
3885    PyObject* m;
3886    PyObject* d;
3887    PyObject* x;
3888
3889    /* Patch object types */
3890    if (PyType_Ready(&Pattern_Type) || PyType_Ready(&Match_Type) ||
3891        PyType_Ready(&Scanner_Type))
3892        return;
3893
3894    m = Py_InitModule("_" SRE_MODULE, _functions);
3895    if (m == NULL)
3896        return;
3897    d = PyModule_GetDict(m);
3898
3899    x = PyInt_FromLong(SRE_MAGIC);
3900    if (x) {
3901        PyDict_SetItemString(d, "MAGIC", x);
3902        Py_DECREF(x);
3903    }
3904
3905    x = PyInt_FromLong(sizeof(SRE_CODE));
3906    if (x) {
3907        PyDict_SetItemString(d, "CODESIZE", x);
3908        Py_DECREF(x);
3909    }
3910
3911    x = PyString_FromString(copyright);
3912    if (x) {
3913        PyDict_SetItemString(d, "copyright", x);
3914        Py_DECREF(x);
3915    }
3916}
3917
3918#endif /* !defined(SRE_RECURSIVE) */
3919
3920/* vim:ts=4:sw=4:et
3921*/
3922