1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2014 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#define NLBLOCK cd             /* Block containing newline information */
50#define PSSTART start_pattern  /* Field containing pattern start */
51#define PSEND   end_pattern    /* Field containing pattern end */
52
53#include "pcre_internal.h"
54
55
56/* When PCRE_DEBUG is defined, we need the pcre(16|32)_printint() function, which
57is also used by pcretest. PCRE_DEBUG is not defined when building a production
58library. We do not need to select pcre16_printint.c specially, because the
59COMPILE_PCREx macro will already be appropriately set. */
60
61#ifdef PCRE_DEBUG
62/* pcre_printint.c should not include any headers */
63#define PCRE_INCLUDED
64#include "pcre_printint.c"
65#undef PCRE_INCLUDED
66#endif
67
68
/* Macro for setting individual bits in class bitmaps. SETBIT(a,b) sets bit
number b in the bitmap a, which is addressed as an array of 8-bit units: the
unit is selected by b/8 and the bit within it by b&7. The bitmap argument and
the complete expansion are parenthesized so that a pointer expression such as
"map + offset" can be passed and so that the macro can be used inside a larger
expression. The argument b appears twice in the expansion, so it must not have
side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)&7)))
72
/* Maximum length value to check against when making sure that the integer that
holds the compiled pattern length does not overflow. We make it a bit less than
INT_MAX to allow for adding in group terminating bytes, so that we don't have
to check them every time. The headroom left below INT_MAX is 20. */

#define OFLOW_MAX (INT_MAX - 20)
79
/* Forward declarations to allow mutual recursion. The parameters are unnamed
in these prototypes; see the function definitions for their meanings. */

static int
  add_list_to_class(pcre_uint8 *, pcre_uchar **, int, compile_data *,
    const pcre_uint32 *, unsigned int);
85
/* Forward declaration of compile_regex(), which takes part in mutual
recursion. The parameters are unnamed here; see the function definition. */

static BOOL
  compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
    pcre_uint32 *, pcre_int32 *, pcre_uint32 *, pcre_int32 *, branch_chain *,
    compile_data *, int *);
90
91
92
93/*************************************************
94*      Code parameters and static tables         *
95*************************************************/
96
97/* This value specifies the size of stack workspace that is used during the
98first pre-compile phase that determines how much memory is required. The regex
99is partly compiled into this space, but the compiled parts are discarded as
100soon as they can be, so that hopefully there will never be an overrun. The code
101does, however, check for an overrun. The largest amount I've seen used is 218,
102so this number is very generous.
103
104The same workspace is used during the second, actual compile phase for
105remembering forward references to groups so that they can be filled in at the
106end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
107is 4 there is plenty of room for most patterns. However, the memory can get
108filled up by repetitions of forward references, for example patterns like
109/(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
110that the workspace is expanded using malloc() in this situation. The value
111below is therefore a minimum, and we put a maximum on it for safety. The
112minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
113kicks in at the same number of forward references in all cases. */
114
/* Minimum workspace size, scaled by LINK_SIZE. */
#define COMPILE_WORK_SIZE (2048*LINK_SIZE)
/* Safety limit on the workspace size: 100 times the minimum. */
#define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
117
118/* This value determines the size of the initial vector that is used for
119remembering named groups during the pre-compile. It is allocated on the stack,
120but if it is too small, it is expanded using malloc(), in a similar way to the
121workspace. The value is the number of slots in the list. */
122
#define NAMED_GROUP_LIST_SIZE  20   /* Slots in the initial named-group list */
124
125/* The overrun tests check for a slightly smaller size so that they detect the
126overrun before it actually does run off the end of the data block. */
127
#define WORK_SIZE_SAFETY_MARGIN (100)   /* Slack used by the overrun tests */
129
/* Private flag bits that are added to firstchar and reqchar. */

#define REQ_CASELESS    0x01            /* Indicates caselessness */
#define REQ_VARY        0x02            /* Reqchar followed non-literal item */

/* Negative values for the firstchar and reqchar flags */

#define REQ_NONE        (-1)
#define REQ_UNSET       (-2)
137
/* Repeated character flags. The constant has type long; the suffix is written
as an upper case L because a lower case l is easily misread as the digit 1. */

#define UTF_LENGTH     0x10000000L      /* The char contains its length. */
141
142/* Table for handling escaped characters in the range '0'-'z'. Positive returns
143are simple data values; negative values are for special things like \d and so
144on. Zero means further processing is needed (for things like \x), or the escape
145is invalid. */
146
147#ifndef EBCDIC
148
149/* This is the "normal" table for ASCII systems or for EBCDIC systems running
150in UTF-8 mode. */
151
/* Two entries per row, covering the characters '0' to 'z' in ASCII order. The
trailing comment on each row names the two characters that the row is for. */

static const short int escapes[] = {
     0,                       0,                          /* '0'  '1' */
     0,                       0,                          /* '2'  '3' */
     0,                       0,                          /* '4'  '5' */
     0,                       0,                          /* '6'  '7' */
     0,                       0,                          /* '8'  '9' */
     CHAR_COLON,              CHAR_SEMICOLON,             /* ':'  ';' */
     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* '<'  '=' */
     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* '>'  '?' */
     CHAR_COMMERCIAL_AT,      -ESC_A,                     /* '@'  'A' */
     -ESC_B,                  -ESC_C,                     /* 'B'  'C' */
     -ESC_D,                  -ESC_E,                     /* 'D'  'E' */
     0,                       -ESC_G,                     /* 'F'  'G' */
     -ESC_H,                  0,                          /* 'H'  'I' */
     0,                       -ESC_K,                     /* 'J'  'K' */
     0,                       0,                          /* 'L'  'M' */
     -ESC_N,                  0,                          /* 'N'  'O' */
     -ESC_P,                  -ESC_Q,                     /* 'P'  'Q' */
     -ESC_R,                  -ESC_S,                     /* 'R'  'S' */
     0,                       0,                          /* 'T'  'U' */
     -ESC_V,                  -ESC_W,                     /* 'V'  'W' */
     -ESC_X,                  0,                          /* 'X'  'Y' */
     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* 'Z'  '[' */
     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* backslash  ']' */
     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* '^'  '_' */
     CHAR_GRAVE_ACCENT,       ESC_a,                      /* '`'  'a' */
     -ESC_b,                  0,                          /* 'b'  'c' */
     -ESC_d,                  ESC_e,                      /* 'd'  'e' */
     ESC_f,                   0,                          /* 'f'  'g' */
     -ESC_h,                  0,                          /* 'h'  'i' */
     0,                       -ESC_k,                     /* 'j'  'k' */
     0,                       0,                          /* 'l'  'm' */
     ESC_n,                   0,                          /* 'n'  'o' */
     -ESC_p,                  0,                          /* 'p'  'q' */
     ESC_r,                   -ESC_s,                     /* 'r'  's' */
     ESC_tee,                 0,                          /* 't'  'u' */
     -ESC_v,                  -ESC_w,                     /* 'v'  'w' */
     0,                       0,                          /* 'x'  'y' */
     -ESC_z                                               /* 'z' */
};
192
193#else
194
195/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
196
/* Eight entries per row. The label at the start of each row appears to be the
hexadecimal EBCDIC code of the row's first entry (48, 50, ... F8), so the table
would start at code 0x48 - confirm against the code that indexes it. */

static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0, ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
222
/* We also need a table of characters that may follow \c in an EBCDIC
environment for characters 0-31. The string holds 32 characters, one for each
of those values, followed by the terminating zero. It is fixed data, so it is
declared const in the same way as the other static tables in this file. */

static const unsigned char ebcdic_escape_c[] = "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
227
228#endif
229
230
231/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
232searched linearly. Put all the names into a single string, in order to reduce
233the number of relocations when a shared library is dynamically linked. The
234string is built from string macros so that it works in UTF-8 mode on EBCDIC
235platforms. */
236
/* One entry of the verbs[] table: the length of the verb's name, plus the
opcode to use when the verb has no argument and the opcode to use when it has
one. A value of -1 in an opcode field means that form is not available. */

typedef struct verbitem {
  int   len;                 /* Length of verb name */
  int   op;                  /* Op when no arg, or -1 if arg mandatory */
  int   op_arg;              /* Op when arg present, or -1 if not allowed */
} verbitem;
242
/* The verb names, concatenated into one string in the same order as the
entries of verbs[]. Judging by the macro names, each STRING_xxx0 macro
presumably supplies the name followed by a zero byte, with the last name
relying on the string literal's own terminator - confirm in the header that
defines them. */

static const char verbnames[] =
  "\0"                       /* Empty name is a shorthand for MARK */
  STRING_MARK0
  STRING_ACCEPT0
  STRING_COMMIT0
  STRING_F0
  STRING_FAIL0
  STRING_PRUNE0
  STRING_SKIP0
  STRING_THEN;
253
/* One entry per verb, in the same order as the names in verbnames[]. The
fields are { name length, op without argument, op with argument }; -1 in the
second field means an argument is mandatory, and -1 in the third field means
an argument is not allowed. The row labels follow the STRING_xxx macro names. */

static const verbitem verbs[] = {
  { 0, -1,        OP_MARK },          /* (empty name) */
  { 4, -1,        OP_MARK },          /* MARK   */
  { 6, OP_ACCEPT, -1 },               /* ACCEPT */
  { 6, OP_COMMIT, -1 },               /* COMMIT */
  { 1, OP_FAIL,   -1 },               /* F      */
  { 4, OP_FAIL,   -1 },               /* FAIL   */
  { 5, OP_PRUNE,  OP_PRUNE_ARG },     /* PRUNE  */
  { 4, OP_SKIP,   OP_SKIP_ARG  },     /* SKIP   */
  { 4, OP_THEN,   OP_THEN_ARG  }      /* THEN   */
};
265
266static const int verbcount = sizeof(verbs)/sizeof(verbitem);
267
268
269/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
270another regex library. */
271
/* Replacement for [[:<:]]. The characters spell out the pattern text
\b(?=\w) followed by a terminating zero. */

static const pcre_uchar sub_start_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
275
/* Replacement for [[:>:]]. The characters spell out the pattern text
\b(?<=\w) followed by a terminating zero. */

static const pcre_uchar sub_end_of_word[] = {
  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
  CHAR_RIGHT_PARENTHESIS, '\0' };
280
281
282/* Tables of names of POSIX character classes and their lengths. The names are
283now all in a single string, to reduce the number of relocations when a shared
284library is dynamically loaded. The list of lengths is terminated by a zero
285length entry. The first three must be alpha, lower, upper, as this is assumed
286for handling case independence. The indices for graph, print, and punct are
287needed, so identify them. */
288
/* The POSIX class names in one string. Their order (alpha, lower, upper,
alnum, ascii, blank, cntrl, digit, graph, print, punct, space, word, xdigit)
is the class index that the other POSIX tables and the PC_xxx values use. */

static const char posix_names[] =
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
  STRING_word0  STRING_xdigit;
294
/* Length of each name in posix_names, in the same order: twelve names of
five characters, then "word" (4) and "xdigit" (6). The final 0 terminates the
list. */

static const pcre_uint8 posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
297
/* Zero-based positions of graph, print, and punct in posix_names. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
301
302
303/* Table of class bit maps for each POSIX class. Each class is formed from a
304base map, with an optional addition or removal of another map. Then, for some
305classes, there is some additional tweaking: for [:blank:] the vertical space
306characters are removed, and for [:alpha:] and [:alnum:] the underscore
307character is removed. The triples in the table consist of the base map offset,
308second map offset or -1 if no second map, and a non-negative value for map
309addition or a negative value for map subtraction (if there are two maps). The
310absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
311remove vertical space characters, 2 => remove underscore. */
312
/* Three values per POSIX class, with the classes in the order of posix_names:
{ base map offset, second map offset or -1, add/subtract and tweak code }. */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
329
330/* Table of substitutes for \d etc when PCRE_UCP is set. They are replaced by
331Unicode property escapes. */
332
333#ifdef SUPPORT_UCP
/* Each array holds the pattern text shown in its comment, terminated by a
zero. An upper case P is the negated form of the property escape. */

static const pcre_uchar string_PNd[]  = {      /* \P{Nd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pNd[]  = {      /* \p{Nd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXsp[] = {      /* \P{Xsp} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXsp[] = {      /* \p{Xsp} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXwd[] = {      /* \P{Xwd} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXwd[] = {      /* \p{Xwd} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
352
/* Replacement pattern text for each escape, in the order \D \d \S \s \W \w:
the negated (upper case) form comes first in each pair. */

static const pcre_uchar *substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */   /* Xsp is Perl space, but from 8.34, Perl */
  string_pXsp,          /* \s */   /* space and POSIX space are the same. */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};
361
362/* The POSIX class substitutes must be in the order of the POSIX class names,
363defined above, and there are both positive and negative cases. NULL means no
364general substitute of a Unicode property escape (\p or \P). However, for some
365POSIX classes (e.g. graph, print, punct) a special property code is compiled
366directly. */
367
/* Each array holds the pattern text shown in its comment, terminated by a
zero. The first six are the positive forms and the last six their negations. */

static const pcre_uchar string_pL[] =   {      /* \p{L} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLl[] =  {      /* \p{Ll} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pLu[] =  {      /* \p{Lu} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_pXan[] = {      /* \p{Xan} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_h[] =    {      /* \h */
  CHAR_BACKSLASH, CHAR_h, '\0' };
static const pcre_uchar string_pXps[] = {      /* \p{Xps} */
  CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PL[] =   {      /* \P{L} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLl[] =  {      /* \P{Ll} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PLu[] =  {      /* \P{Lu} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_PXan[] = {      /* \P{Xan} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
static const pcre_uchar string_H[] =    {      /* \H */
  CHAR_BACKSLASH, CHAR_H, '\0' };
static const pcre_uchar string_PXps[] = {      /* \P{Xps} */
  CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
  CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
402
/* Fourteen entries for the positive POSIX classes, in the order of the class
names, followed by fourteen entries for the negated classes in the same order.
NULL marks a class that has no general property-escape substitute. */

static const pcre_uchar *posix_substitutes[] = {
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  NULL,                 /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */   /* Xps is POSIX space, but from 8.34 */
  string_pXwd,          /* word  */   /* Perl and POSIX space are the same */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  NULL,                 /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */  /* Xps is POSIX space, but from 8.34 */
  string_PXwd,          /* ^word */   /* Perl and POSIX space are the same */
  NULL                  /* ^xdigit */
};
/* Number of entries in posix_substitutes[]. */
#define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))
435#endif
436
/* Two-level stringification. STRING(text) turns its argument into a string
literal exactly as written; XSTRING(text) lets the argument be macro-expanded
first and then stringifies the result. */

#define STRING(text)   #text
#define XSTRING(text)  STRING(text)
439
440/* The texts of compile-time error messages. These are "char *" because they
441are passed to the outside world. Do not ever re-use any error number, because
442they are documented. Always add a new error instead. Messages marked DEAD below
443are no longer used. This used to be a table of strings, but in order to reduce
444the number of relocations needed when a shared library is loaded dynamically,
445it is now one long string. We cannot use a table of offsets, because the
446lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
447simply count through to the one we want - this isn't a performance issue
448because these strings are used only when there is a compilation error.
449
450Each substring ends with \0 to insert a null character. This includes the final
451substring, so that the whole string ends with \0\0, which can be detected when
452counting through. */
453
/* The error messages, in error number order starting at 0. A numbered comment
precedes every fifth message and gives the number of the message that follows
it. */

static const char error_texts[] =
  "no error\0"
  "\\ at end of pattern\0"
  "\\c at end of pattern\0"
  "unrecognized character follows \\\0"
  "numbers out of order in {} quantifier\0"
  /* 5 */
  "number too big in {} quantifier\0"
  "missing terminating ] for character class\0"
  "invalid escape sequence in character class\0"
  "range out of order in character class\0"
  "nothing to repeat\0"
  /* 10 */
  "internal error: invalid forward reference offset\0"
  "internal error: unexpected repeat\0"
  "unrecognized character after (? or (?-\0"
  "POSIX named classes are supported only within a class\0"
  "missing )\0"
  /* 15 */
  "reference to non-existent subpattern\0"
  "erroffset passed as NULL\0"
  "unknown option bit(s) set\0"
  "missing ) after comment\0"
  "parentheses nested too deeply\0"  /** DEAD **/
  /* 20 */
  "regular expression is too large\0"
  "failed to get memory\0"
  "unmatched parentheses\0"
  "internal error: code overflow\0"
  "unrecognized character after (?<\0"
  /* 25 */
  "lookbehind assertion is not fixed length\0"
  "malformed number or name after (?(\0"
  "conditional group contains more than two branches\0"
  "assertion expected after (?(\0"
  "(?R or (?[+-]digits must be followed by )\0"
  /* 30 */
  "unknown POSIX class name\0"
  "POSIX collating elements are not supported\0"
  "this version of PCRE is compiled without UTF support\0"
  "spare error\0"  /** DEAD **/
  "character value in \\x{} or \\o{} is too large\0"
  /* 35 */
  "invalid condition (?(0)\0"
  "\\C not allowed in lookbehind assertion\0"
  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
  "number after (?C is > 255\0"
  "closing ) for (?C expected\0"
  /* 40 */
  "recursive call could loop indefinitely\0"
  "unrecognized character after (?P\0"
  "syntax error in subpattern name (missing terminator)\0"
  "two named subpatterns have the same name\0"
  "invalid UTF-8 string\0"
  /* 45 */
  "support for \\P, \\p, and \\X has not been compiled\0"
  "malformed \\P or \\p sequence\0"
  "unknown property name after \\P or \\p\0"
  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  /* 50 */
  "repeated subpattern is too long\0"    /** DEAD **/
  "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
  "internal error: overran compiling workspace\0"
  "internal error: previously-checked referenced subpattern not found\0"
  "DEFINE group contains more than one branch\0"
  /* 55 */
  "repeating a DEFINE group is not allowed\0"  /** DEAD **/
  "inconsistent NEWLINE options\0"
  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
  "a numbered reference must not be zero\0"
  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
  /* 60 */
  "(*VERB) not recognized or malformed\0"
  "number is too big\0"
  "subpattern name expected\0"
  "digit expected after (?+\0"
  "] is an invalid data character in JavaScript compatibility mode\0"
  /* 65 */
  "different names for subpatterns of the same number are not allowed\0"
  "(*MARK) must have an argument\0"
  "this version of PCRE is not compiled with Unicode property support\0"
#ifndef EBCDIC
  "\\c must be followed by an ASCII character\0"
#else
  "\\c must be followed by a letter or one of [\\]^_?\0"
#endif
  "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
  /* 70 */
  "internal error: unknown opcode in find_fixedlength()\0"
  "\\N is not supported in a class\0"
  "too many forward references\0"
  "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
  "invalid UTF-16 string\0"
  /* 75 */
  "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
  "character value in \\u.... sequence is too large\0"
  "invalid UTF-32 string\0"
  "setting UTF is disabled by the application\0"
  "non-hex character in \\x{} (closing brace missing?)\0"
  /* 80 */
  "non-octal character in \\o{} (closing brace missing?)\0"
  "missing opening brace after \\o\0"
  "parentheses are too deeply nested\0"
  "invalid range in character class\0"
  "group name must start with a non-digit\0"
  /* 85 */
  "parentheses are too deeply nested (stack check)\0"
  "digits missing in \\x{} or \\o{}\0"
  ;
564
565/* Table to identify digits and hex digits. This is used when compiling
566patterns. Note that the tables in chartables are dependent on the locale, and
567may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-f, and A-F as digits when compiling. That is why we have
569a private table here. It costs 256 bytes, but it is a lot faster than doing
570character value tests (at least in some simple cases I timed), and in some
571applications one wants PCRE to compile efficiently as well as match
572efficiently.
573
574For convenience, we use the same bit definitions as in chartables:
575
576  0x04   decimal digit
577  0x08   hexadecimal digit
578
579Then we can use ctype_digit and ctype_xdigit in the code. */
580
/* Decimal digits are recognized with a plain range comparison rather than a
table lookup: that is much faster than a memory read, and the resulting code is
simpler (the compiler turns it into a subtraction and unsigned comparison). The
argument can be evaluated twice, so it must not have side effects. */

#define IS_DIGIT(c) (CHAR_0 <= (c) && (c) <= CHAR_9)
586
587#ifndef EBCDIC
588
589/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
590UTF-8 mode. */
591
/* One entry per character value 0-255. An entry of 0x0c (0x04 | 0x08) marks a
character that is both a decimal and a hexadecimal digit, 0x08 marks one that
is a hexadecimal digit only, and 0x00 marks everything else. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
626
627#else
628
629/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
630
/* One entry per EBCDIC character value 0-255. An entry of 0x0c (0x04 | 0x08)
marks a character that is both a decimal and a hexadecimal digit, 0x08 marks
one that is a hexadecimal digit only, and 0x00 marks everything else. The
right-hand column of each alternate row comment is the hexadecimal value of
the row's first entry. */

static const pcre_uint8 digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
665
/* One entry per EBCDIC character value 0-255 (32 rows of 8). The meaning of
the individual bits is not defined in this part of the file; presumably they
follow the character-type bits of the chartables that this table partially
duplicates - TODO confirm against the chartables definitions. */

static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
699#endif
700
701
/* This table is used to check whether auto-possessification is possible
between adjacent character-type opcodes. The left-hand (repeated) opcode is
used to select the row, and the right-hand opcode is used to select the column.
A value of 1 means that auto-possessification is OK. For example, the second
value in the first row means that \D+\d can be turned into \D++\d.

The Unicode property types (\P and \p) have to be present to fill out the table
because of what their opcode values are, but the table values should always be
zero because property types are handled separately in the code. The last four
columns apply to items that cannot be repeated, so there is no need to have
rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */

/* Table dimensions. Both are counted from FIRST_AUTOTAB_OP; the row count
stops at the last opcode that can appear on the left (be repeated), while the
column count runs on to the last opcode that can appear on the right. The
initializer below supplies 17 rows of 21 values. */

#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)

static const pcre_uint8 autoposstab[APTROWS][APTCOLS] = {
/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
};
738
739
/* This table is used to check whether auto-possessification is possible
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
left-hand (repeated) opcode is used to select the row, and the right-hand
opcode is used to select the column. Both indexes are property types (the PT_
values named in the row and column labels), and each entry is an action code
rather than a simple yes/no flag. The values are as follows:

  0   Always return FALSE (never auto-possessify)
  1   Character groups are distinct (possessify if both are OP_PROP)
  2   Check character categories in the same group (general or particular)
  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)

  4   Check left general category vs right particular category
  5   Check right general category vs left particular category

  6   Left alphanum vs right general category
  7   Left space vs right general category
  8   Left word vs right general category

  9   Right alphanum vs left general category
 10   Right space vs left general category
 11   Right word vs left general category

 12   Left alphanum vs right particular category
 13   Left space vs right particular category
 14   Left word vs right particular category

 15   Right alphanum vs left particular category
 16   Right space vs left particular category
 17   Right word vs left particular category

The PT_SPACE and PT_PXSPACE rows are identical, as are the SPACE and PXSPACE
columns. The PT_ANY and PT_CLIST rows and columns are all zero, so those types
are never auto-possessified through this table.
*/

static const pcre_uint8 propposstab[PT_TABSIZE][PT_TABSIZE] = {
/* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
  { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
  { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
  { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
  { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
  { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
  { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
};
784
/* This table is used to check whether auto-possessification is possible
between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
specifies a general category and the other specifies a particular category. The
row is selected by the general category and the column by the particular
category. The value is 1 if the particular category is not part of the general
category.

There are 7 rows (general categories C, L, M, N, P, S, Z) and 30 columns
(the two-letter particular categories listed in the column header). Each row
is therefore zero exactly in the columns whose name starts with the row's
letter, and 1 everywhere else. */

static const pcre_uint8 catposstab[7][30] = {
/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
};
802
/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
a general or particular category. The properties in each row are those
that apply to the character set in question. Duplication means that a little
unnecessary work is done when checking, but this keeps things much simpler
because they can all use the same code. For more details see the comment where
this table is used.

There is one row per character set, in the order ALNUM, SPACE/PXSPACE, WORD,
and every row has four entries so that the rows can be processed uniformly.
NOTE(review): judging by the ucp_ names, the first three entries of a row are
general categories and the fourth is a particular category - confirm against
the code that indexes this table.

Note: SPACE and PXSPACE used to be different because Perl excluded VT from
"space", but from Perl 5.18 it's included, so both categories are treated the
same here. */

static const pcre_uint8 posspropstab[3][4] = {
  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
};
819
/* This table is used when converting repeating opcodes into possessified
versions as a result of an explicit possessive quantifier such as ++. A zero
value means there is no possessified version - in those cases the item in
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
because all relevant opcodes are less than that.

The table is indexed by opcode value, so the order of the entries has to follow
the numbering of the opcodes, which is defined outside this part of the file;
the trailing comments name the opcode(s) that each entry stands for. Within
every repeat group only the greedy opcode has a non-zero entry: the minimizing
forms, the exact-count forms and the already-possessive forms all map to
zero. */

static const pcre_uint8 opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT */
};
877
878
879
880/*************************************************
881*            Find an error text                  *
882*************************************************/
883
884/* The error texts are now all in one long string, to save on relocations. As
885some of the text is of unknown length, we can't use a table of offsets.
886Instead, just count through the strings. This is not a performance issue
887because it happens only when there has been a compilation error.
888
889Argument:   the error number
890Returns:    pointer to the error string
891*/
892
893static const char *
894find_error_text(int n)
895{
896const char *s = error_texts;
897for (; n > 0; n--)
898  {
899  while (*s++ != CHAR_NULL) {};
900  if (*s == CHAR_NULL) return "Error text not found (please report)";
901  }
902return s;
903}
904
905
906
907/*************************************************
908*           Expand the workspace                 *
909*************************************************/
910
911/* This function is called during the second compiling phase, if the number of
912forward references fills the existing workspace, which is originally a block on
913the stack. A larger block is obtained from malloc() unless the ultimate limit
914has been reached or the increase will be rather small.
915
916Argument: pointer to the compile data block
917Returns:  0 if all went well, else an error number
918*/
919
920static int
921expand_workspace(compile_data *cd)
922{
923pcre_uchar *newspace;
924int newsize = cd->workspace_size * 2;
925
926if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
927if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
928    newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
929 return ERR72;
930
931newspace = (PUBL(malloc))(IN_UCHARS(newsize));
932if (newspace == NULL) return ERR21;
933memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
934cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
935if (cd->workspace_size > COMPILE_WORK_SIZE)
936  (PUBL(free))((void *)cd->start_workspace);
937cd->start_workspace = newspace;
938cd->workspace_size = newsize;
939return 0;
940}
941
942
943
944/*************************************************
945*            Check for counted repeat            *
946*************************************************/
947
948/* This function is called when a '{' is encountered in a place where it might
949start a quantifier. It looks ahead to see if it really is a quantifier or not.
950It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
951where the ddds are digits.
952
953Arguments:
954  p         pointer to the first char after '{'
955
956Returns:    TRUE or FALSE
957*/
958
959static BOOL
960is_counted_repeat(const pcre_uchar *p)
961{
962if (!IS_DIGIT(*p)) return FALSE;
963p++;
964while (IS_DIGIT(*p)) p++;
965if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
966
967if (*p++ != CHAR_COMMA) return FALSE;
968if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
969
970if (!IS_DIGIT(*p)) return FALSE;
971p++;
972while (IS_DIGIT(*p)) p++;
973
974return (*p == CHAR_RIGHT_CURLY_BRACKET);
975}
976
977
978
979/*************************************************
980*            Handle escapes                      *
981*************************************************/
982
/* This function is called when a \ has been encountered. It returns zero for
a data character, whose value is placed in chptr; a positive escape code (one
of the ESC_xxx values) for an escape such as \d that is not a single data
character; or the negated group number for a back reference to group n. When
UTF-8 is enabled, a value greater than 255 may be returned in chptr. On entry,
ptr is pointing at the \. On exit, it is on the final character of the escape
sequence.

The function has a single exit point: *ptrptr and *chptr are always updated
before returning, even when an error has been detected. The error code variable
is written only when an error is found; it is never reset here.

Arguments:
  ptrptr         points to the pattern position pointer
  chptr          points to a returned data character
  errorcodeptr   points to the errorcode variable
  bracount       number of previous extracting brackets
  options        the options bits
  isclass        TRUE if inside a character class

Returns:         zero => a data character
                 positive => a special escape sequence
                 negative => a back reference
                 on error, errorcodeptr is set
*/

static int
check_escape(const pcre_uchar **ptrptr, pcre_uint32 *chptr, int *errorcodeptr,
  int bracount, int options, BOOL isclass)
{
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
BOOL utf = (options & PCRE_UTF8) != 0;
const pcre_uchar *ptr = *ptrptr + 1;
pcre_uint32 c;
int escape = 0;
int i;

GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
ptr--;                            /* Set pointer back to the last byte */

/* If backslash is at the end of the pattern, it's an error. */

if (c == CHAR_NULL) *errorcodeptr = ERR1;

/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. A positive table entry is the
value of the data character itself (escape stays zero); a negative entry is the
negated escape code. */

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
/* Not alphanumeric */
else if (c < CHAR_0 || c > CHAR_z) {}
else if ((i = escapes[c - CHAR_0]) != 0)
  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }

#else           /* EBCDIC coding */
/* Not alphanumeric */
else if (c < CHAR_a || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
else if ((i = escapes[c - 0x48]) != 0)  { if (i > 0) c = (pcre_uint32)i; else escape = -i; }
#endif

/* Escapes that need further processing, or are illegal. This branch is
reached only when the table lookup above stored zero in i. The octal (\0) and
two-digit hex (\x) code below relies on that, because it uses i as a digit
counter without resetting it. */

else
  {
  const pcre_uchar *oldptr;
  BOOL braced, negated, overflow;
  int s;

  switch (c)
    {
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
    error. */

    case CHAR_l:
    case CHAR_L:
    *errorcodeptr = ERR37;
    break;

    case CHAR_u:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      /* In JavaScript, \u must be followed by four hexadecimal digits.
      Otherwise it is a lowercase u letter (c is left unchanged). */
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
        && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
        && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 4; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }

        /* The largest acceptable value depends on the code unit width and on
        whether a UTF mode is in force. */

#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU))
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU))
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU)
#endif
          {
          *errorcodeptr = ERR76;
          }
        /* Surrogate code points are rejected in UTF mode. */
        else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      }
    else
      *errorcodeptr = ERR37;
    break;

    case CHAR_U:
    /* In JavaScript, \U is an uppercase U letter. */
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
    break;

    /* In a character class, \g is just a literal "g". Outside a character
    class, \g must be followed by one of a number of specific things:

    (1) A number, either plain or braced. If positive, it is an absolute
    backreference. If negative, it is a relative backreference. This is a Perl
    5.10 feature.

    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
    is part of Perl's movement towards a unified syntax for back references. As
    this is synonymous with \k{name}, we fudge it up by pretending it really
    was \k.

    (3) For Oniguruma compatibility we also support \g followed by a name or a
    number either in angle brackets or in single quotes. However, these are
    (possibly recursive) subroutine calls, _not_ backreferences. Just return
    the ESC_g code (cf \k). */

    case CHAR_g:
    if (isclass) break;
    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
      {
      escape = ESC_g;
      break;
      }

    /* Handle the Perl-compatible cases */

    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
      {
      /* Look ahead with a separate pointer, leaving ptr where it is. If
      anything other than digits and minus signs is found before the closing
      brace (or the end of the pattern), the contents are taken to be a name
      and the item is handed back as \k, with ptr still on the 'g'. */
      const pcre_uchar *p;
      for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
      if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
        {
        escape = ESC_k;
        break;
        }
      braced = TRUE;
      ptr++;
      }
    else braced = FALSE;

    if (ptr[1] == CHAR_MINUS)
      {
      negated = TRUE;
      ptr++;
      }
    else negated = FALSE;

    /* The integer range is limited by the machine's int representation. The
    test is made before each multiplication so that s * 10 + digit can never
    exceed INT_MAX. */
    s = 0;
    overflow = FALSE;
    while (IS_DIGIT(ptr[1]))
      {
      if (s > INT_MAX / 10 - 1) /* Integer overflow */
        {
        overflow = TRUE;
        break;
        }
      s = s * 10 + (int)(*(++ptr) - CHAR_0);
      }
    if (overflow) /* Integer overflow */
      {
      /* Skip the remaining digits so that ptr ends on the last one. */
      while (IS_DIGIT(ptr[1]))
        ptr++;
      *errorcodeptr = ERR61;
      break;
      }

    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
      {
      *errorcodeptr = ERR57;
      break;
      }

    /* A reference number of zero (which includes the case of no digits at
    all) is not allowed. */

    if (s == 0)
      {
      *errorcodeptr = ERR58;
      break;
      }

    /* A relative reference counts backwards from the most recently opened
    group: -1 is group bracount, -2 is group bracount-1, and so on. It must not
    reach back beyond the first group. */

    if (negated)
      {
      if (s > bracount)
        {
        *errorcodeptr = ERR15;
        break;
        }
      s = bracount - (s - 1);
      }

    escape = -s;
    break;

    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. Perl has changed
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
    recommended to avoid the ambiguities in the old syntax.

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 8 (used to be 10), or if there are that many previous
    extracting left brackets, then it is a back reference. Otherwise, up to
    three octal digits are read to form an escaped byte. Thus \123 is likely to
    be octal 123 (cf \0123, which is octal 012 followed by the literal 3). If
    the octal value is greater than 377, the least significant 8 bits are
    taken. \8 and \9 are treated as the literal characters 8 and 9.

    Inside a character class, \ followed by a digit is always either a literal
    8 or 9 or an octal number. */

    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:

    if (!isclass)
      {
      oldptr = ptr;
      /* The integer range is limited by the machine's int representation. */
      s = (int)(c -CHAR_0);
      overflow = FALSE;
      while (IS_DIGIT(ptr[1]))
        {
        if (s > INT_MAX / 10 - 1) /* Integer overflow */
          {
          overflow = TRUE;
          break;
          }
        s = s * 10 + (int)(*(++ptr) - CHAR_0);
        }
      if (overflow) /* Integer overflow */
        {
        while (IS_DIGIT(ptr[1]))
          ptr++;
        *errorcodeptr = ERR61;
        break;
        }
      if (s < 8 || s <= bracount)  /* Check for back reference */
        {
        escape = -s;
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle a digit following \ when the number is not a back reference. If
    the first digit is 8 or 9, Perl used to generate a binary zero byte and
    then treat the digit as a following literal. At least by Perl 5.18 this
    changed so as not to insert the binary zero. */

    if ((c = *ptr) >= CHAR_8) break;

    /* Fall through with a digit less than 8 */

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. The original code used just to take the least
    significant 8 bits of octal numbers (I think this is what early Perls used
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
    but no more than 3 octal digits.

    At this point i is still zero (see the comment at the start of this
    branch), so the loop reads at most two digits after the first one. */

    case CHAR_0:
    c -= CHAR_0;
    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
        c = c * 8 + *(++ptr) - CHAR_0;
#ifdef COMPILE_PCRE8
    if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
    break;

    /* \o is a relatively new Perl feature, supporting a more general way of
    specifying character codes in octal. The only supported form is \o{ddd}. */

    case CHAR_o:
    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR81; else
    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR86; else
      {
      ptr += 2;
      c = 0;
      overflow = FALSE;
      while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
        {
        register pcre_uint32 cc = *ptr++;
        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
        /* In the 32-bit library the limit tests below do not bound c in
        non-UTF mode, so check that the shift cannot lose bits. */
#ifdef COMPILE_PCRE32
        if (c >= 0x20000000l) { overflow = TRUE; break; }
#endif
        c = (c << 3) + cc - CHAR_0 ;
#if defined COMPILE_PCRE8
        if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
        if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
        if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
        }
      if (overflow)
        {
        while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
        *errorcodeptr = ERR34;
        }
      else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
        {
        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
        }
      else *errorcodeptr = ERR80;
      }
    break;

    /* \x is complicated. In JavaScript, \x must be followed by two hexadecimal
    digits. Otherwise it is a lowercase x letter. */

    case CHAR_x:
    if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
      {
      if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
        && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
        {
        c = 0;
        for (i = 0; i < 2; ++i)
          {
          register pcre_uint32 cc = *(++ptr);
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }
      }    /* End JavaScript handling */

    /* Handle \x in Perl's style. \x{ddd} is a character number which can be
    greater than 0xff in utf or non-8bit mode, but only if the ddd are hex
    digits. If not, { used to be treated as a data character. However, Perl
    seems to read hex digits up to the first non-such, and ignore the rest, so
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
    now gives an error. */

    else
      {
      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
        {
        ptr += 2;
        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          *errorcodeptr = ERR86;
          break;
          }
        c = 0;
        overflow = FALSE;
        while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0)
          {
          register pcre_uint32 cc = *ptr++;
          if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */

          /* In the 32-bit library, check that the shift cannot lose bits. */

#ifdef COMPILE_PCRE32
          if (c >= 0x10000000l) { overflow = TRUE; break; }
#endif

#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
          c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
          c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif

#if defined COMPILE_PCRE8
          if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE16
          if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
#elif defined COMPILE_PCRE32
          if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
#endif
          }

        if (overflow)
          {
          while (MAX_255(*ptr) && (digitab[*ptr] & ctype_xdigit) != 0) ptr++;
          *errorcodeptr = ERR34;
          }

        else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
          {
          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
          }

        /* If the sequence of hex digits does not end with '}', give an error.
        We used just to recognize this construct and fall through to the normal
        \x handling, but nowadays Perl gives an error, which seems much more
        sensible, so we do too. */

        else *errorcodeptr = ERR79;
        }   /* End of \x{} processing */

      /* Read a single-byte hex-defined char (up to two hex digits after \x).
      As in the octal case, i is still zero here and acts as the digit
      counter. With no hex digit at all the result is a binary zero. */

      else
        {
        c = 0;
        while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
          {
          pcre_uint32 cc;                          /* Some compilers don't like */
          cc = *(++ptr);                           /* ++ in initializers */
#ifndef EBCDIC  /* ASCII/UTF-8 coding */
          if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
          c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
#else           /* EBCDIC coding */
          if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
          c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
#endif
          }
        }     /* End of \xdd handling */
      }       /* End of Perl-style \x handling */
    break;

    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
    An error is given if the byte following \c is not an ASCII character. This
    coding is ASCII-specific, but then the whole concept of \cx is
    ASCII-specific. (However, an EBCDIC equivalent has now been added: there
    the upper-cased character is looked up in ebcdic_escape_c, and its index in
    that table, 0 to 31, becomes the result; \c? is handled as a special
    case.) */

    case CHAR_c:
    c = *(++ptr);
    if (c == CHAR_NULL)
      {
      *errorcodeptr = ERR2;
      break;
      }
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
    if (c > 127)  /* Excludes all non-ASCII in either mode */
      {
      *errorcodeptr = ERR68;
      break;
      }
    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
    c ^= 0x40;
#else             /* EBCDIC coding */
    if (c >= CHAR_a && c <= CHAR_z) c += 64;
    if (c == CHAR_QUESTION_MARK)
      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
    else
      {
      for (i = 0; i < 32; i++)
        {
        if (c == ebcdic_escape_c[i]) break;
        }
      if (i < 32) c = i; else *errorcodeptr = ERR68;
      }
#endif
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphanumeric following \ is an error if PCRE_EXTRA was set;
    otherwise, for Perl compatibility, it is a literal. This code looks a bit
    odd, but there used to be some cases other than the default, and there may
    be again in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorcodeptr = ERR3;
      break;
      }
    break;
    }
  }

/* Perl supports \N{name} for character names, as well as plain \N for "not
newline". PCRE does not support \N{name}. However, it does support
quantification such as \N{2,3}. */

if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
     !is_counted_repeat(ptr+2))
  *errorcodeptr = ERR37;

/* If PCRE_UCP is set, we change the values for \d etc. */

if ((options & PCRE_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
  escape += (ESC_DU - ESC_D);

/* Set the pointer to the final character before returning. */

*ptrptr = ptr;
*chptr = c;
return escape;
}
1487
1488
1489
1490#ifdef SUPPORT_UCP
1491/*************************************************
1492*               Handle \P and \p                 *
1493*************************************************/
1494
1495/* This function is called after \P or \p has been encountered, provided that
1496PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1497pointing at the P or p. On exit, it is pointing at the final character of the
1498escape sequence.
1499
Arguments:
1501  ptrptr         points to the pattern position pointer
1502  negptr         points to a boolean that is set TRUE for negation else FALSE
1503  ptypeptr       points to an unsigned int that is set to the type value
1504  pdataptr       points to an unsigned int that is set to the detailed property value
1505  errorcodeptr   points to the error code variable
1506
1507Returns:         TRUE if the type value was found, or FALSE for an invalid type
1508*/
1509
1510static BOOL
1511get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, unsigned int *ptypeptr,
1512  unsigned int *pdataptr, int *errorcodeptr)
1513{
1514pcre_uchar c;
1515int i, bot, top;
1516const pcre_uchar *ptr = *ptrptr;
1517pcre_uchar name[32];
1518
1519c = *(++ptr);
1520if (c == CHAR_NULL) goto ERROR_RETURN;
1521
1522*negptr = FALSE;
1523
1524/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1525negation. */
1526
1527if (c == CHAR_LEFT_CURLY_BRACKET)
1528  {
1529  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1530    {
1531    *negptr = TRUE;
1532    ptr++;
1533    }
1534  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1535    {
1536    c = *(++ptr);
1537    if (c == CHAR_NULL) goto ERROR_RETURN;
1538    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1539    name[i] = c;
1540    }
1541  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1542  name[i] = 0;
1543  }
1544
1545/* Otherwise there is just one following character */
1546
1547else
1548  {
1549  name[0] = c;
1550  name[1] = 0;
1551  }
1552
1553*ptrptr = ptr;
1554
1555/* Search for a recognized property name using binary chop */
1556
1557bot = 0;
1558top = PRIV(utt_size);
1559
1560while (bot < top)
1561  {
1562  int r;
1563  i = (bot + top) >> 1;
1564  r = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1565  if (r == 0)
1566    {
1567    *ptypeptr = PRIV(utt)[i].type;
1568    *pdataptr = PRIV(utt)[i].value;
1569    return TRUE;
1570    }
1571  if (r > 0) bot = i + 1; else top = i;
1572  }
1573
1574*errorcodeptr = ERR47;
1575*ptrptr = ptr;
1576return FALSE;
1577
1578ERROR_RETURN:
1579*errorcodeptr = ERR46;
1580*ptrptr = ptr;
1581return FALSE;
1582}
1583#endif
1584
1585
1586
1587/*************************************************
1588*         Read repeat counts                     *
1589*************************************************/
1590
1591/* Read an item of the form {n,m} and return the values. This is called only
1592after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1593so the syntax is guaranteed to be correct, but we need to check the values.
1594
1595Arguments:
1596  p              pointer to first char after '{'
1597  minp           pointer to int for min
1598  maxp           pointer to int for max
1599                 returned as -1 if no max
1600  errorcodeptr   points to error code variable
1601
1602Returns:         pointer to '}' on success;
1603                 current ptr on error, with errorcodeptr set non-zero
1604*/
1605
1606static const pcre_uchar *
1607read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1608{
1609int min = 0;
1610int max = -1;
1611
1612while (IS_DIGIT(*p))
1613  {
1614  min = min * 10 + (int)(*p++ - CHAR_0);
1615  if (min > 65535)
1616    {
1617    *errorcodeptr = ERR5;
1618    return p;
1619    }
1620  }
1621
1622if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1623  {
1624  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1625    {
1626    max = 0;
1627    while(IS_DIGIT(*p))
1628      {
1629      max = max * 10 + (int)(*p++ - CHAR_0);
1630      if (max > 65535)
1631        {
1632        *errorcodeptr = ERR5;
1633        return p;
1634        }
1635      }
1636    if (max < min)
1637      {
1638      *errorcodeptr = ERR4;
1639      return p;
1640      }
1641    }
1642  }
1643
1644*minp = min;
1645*maxp = max;
1646return p;
1647}
1648
1649
1650
1651/*************************************************
1652*      Find first significant op code            *
1653*************************************************/
1654
1655/* This is called by several functions that scan a compiled expression looking
1656for a fixed first character, or an anchoring op code etc. It skips over things
1657that do not influence this. For some calls, it makes sense to skip negative
1658forward and all backward assertions, and also the \b assertion; for others it
1659does not.
1660
1661Arguments:
1662  code         pointer to the start of the group
1663  skipassert   TRUE if certain assertions are to be skipped
1664
1665Returns:       pointer to the first significant opcode
1666*/
1667
1668static const pcre_uchar*
1669first_significant_code(const pcre_uchar *code, BOOL skipassert)
1670{
1671for (;;)
1672  {
1673  switch ((int)*code)
1674    {
1675    case OP_ASSERT_NOT:
1676    case OP_ASSERTBACK:
1677    case OP_ASSERTBACK_NOT:
1678    if (!skipassert) return code;
1679    do code += GET(code, 1); while (*code == OP_ALT);
1680    code += PRIV(OP_lengths)[*code];
1681    break;
1682
1683    case OP_WORD_BOUNDARY:
1684    case OP_NOT_WORD_BOUNDARY:
1685    if (!skipassert) return code;
1686    /* Fall through */
1687
1688    case OP_CALLOUT:
1689    case OP_CREF:
1690    case OP_DNCREF:
1691    case OP_RREF:
1692    case OP_DNRREF:
1693    case OP_DEF:
1694    code += PRIV(OP_lengths)[*code];
1695    break;
1696
1697    default:
1698    return code;
1699    }
1700  }
1701/* Control never reaches here */
1702}
1703
1704
1705
1706/*************************************************
1707*        Find the fixed length of a branch       *
1708*************************************************/
1709
1710/* Scan a branch and compute the fixed length of subject that will match it,
1711if the length is fixed. This is needed for dealing with backward assertions.
1712In UTF8 mode, the result is in characters rather than bytes. The branch is
1713temporarily terminated with OP_END when this function is called.
1714
1715This function is called when a backward assertion is encountered, so that if it
1716fails, the error message can point to the correct place in the pattern.
1717However, we cannot do this when the assertion contains subroutine calls,
1718because they can be forward references. We solve this by remembering this case
1719and doing the check at the end; a flag specifies which mode we are running in.
1720
1721Arguments:
1722  code     points to the start of the pattern (the bracket)
1723  utf      TRUE in UTF-8 / UTF-16 / UTF-32 mode
1724  atend    TRUE if called when the pattern is complete
1725  cd       the "compile data" structure
1726  recurses    chain of recurse_check to catch mutual recursion
1727
1728Returns:   the fixed length,
1729             or -1 if there is no fixed length,
1730             or -2 if \C was encountered (in UTF-8 mode only)
1731             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1732             or -4 if an unknown opcode was encountered (internal error)
1733*/
1734
1735static int
1736find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd,
1737  recurse_check *recurses)
1738{
1739int length = -1;
1740recurse_check this_recurse;
1741register int branchlength = 0;
1742register pcre_uchar *cc = code + 1 + LINK_SIZE;
1743
1744/* Scan along the opcodes for this branch. If we get to the end of the
1745branch, check the length against that of the other branches. */
1746
1747for (;;)
1748  {
1749  int d;
1750  pcre_uchar *ce, *cs;
1751  register pcre_uchar op = *cc;
1752
1753  switch (op)
1754    {
1755    /* We only need to continue for OP_CBRA (normal capturing bracket) and
1756    OP_BRA (normal non-capturing bracket) because the other variants of these
1757    opcodes are all concerned with unlimited repeated groups, which of course
1758    are not of fixed length. */
1759
1760    case OP_CBRA:
1761    case OP_BRA:
1762    case OP_ONCE:
1763    case OP_ONCE_NC:
1764    case OP_COND:
1765    d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd,
1766      recurses);
1767    if (d < 0) return d;
1768    branchlength += d;
1769    do cc += GET(cc, 1); while (*cc == OP_ALT);
1770    cc += 1 + LINK_SIZE;
1771    break;
1772
1773    /* Reached end of a branch; if it's a ket it is the end of a nested call.
1774    If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1775    an ALT. If it is END it's the end of the outer call. All can be handled by
1776    the same code. Note that we must not include the OP_KETRxxx opcodes here,
1777    because they all imply an unlimited repeat. */
1778
1779    case OP_ALT:
1780    case OP_KET:
1781    case OP_END:
1782    case OP_ACCEPT:
1783    case OP_ASSERT_ACCEPT:
1784    if (length < 0) length = branchlength;
1785      else if (length != branchlength) return -1;
1786    if (*cc != OP_ALT) return length;
1787    cc += 1 + LINK_SIZE;
1788    branchlength = 0;
1789    break;
1790
1791    /* A true recursion implies not fixed length, but a subroutine call may
1792    be OK. If the subroutine is a forward reference, we can't deal with
1793    it until the end of the pattern, so return -3. */
1794
1795    case OP_RECURSE:
1796    if (!atend) return -3;
1797    cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1798    do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1799    if (cc > cs && cc < ce) return -1;                    /* Recursion */
1800    else   /* Check for mutual recursion */
1801      {
1802      recurse_check *r = recurses;
1803      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
1804      if (r != NULL) return -1;   /* Mutual recursion */
1805      }
1806    this_recurse.prev = recurses;
1807    this_recurse.group = cs;
1808    d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd, &this_recurse);
1809    if (d < 0) return d;
1810    branchlength += d;
1811    cc += 1 + LINK_SIZE;
1812    break;
1813
1814    /* Skip over assertive subpatterns */
1815
1816    case OP_ASSERT:
1817    case OP_ASSERT_NOT:
1818    case OP_ASSERTBACK:
1819    case OP_ASSERTBACK_NOT:
1820    do cc += GET(cc, 1); while (*cc == OP_ALT);
1821    cc += 1 + LINK_SIZE;
1822    break;
1823
1824    /* Skip over things that don't match chars */
1825
1826    case OP_MARK:
1827    case OP_PRUNE_ARG:
1828    case OP_SKIP_ARG:
1829    case OP_THEN_ARG:
1830    cc += cc[1] + PRIV(OP_lengths)[*cc];
1831    break;
1832
1833    case OP_CALLOUT:
1834    case OP_CIRC:
1835    case OP_CIRCM:
1836    case OP_CLOSE:
1837    case OP_COMMIT:
1838    case OP_CREF:
1839    case OP_DEF:
1840    case OP_DNCREF:
1841    case OP_DNRREF:
1842    case OP_DOLL:
1843    case OP_DOLLM:
1844    case OP_EOD:
1845    case OP_EODN:
1846    case OP_FAIL:
1847    case OP_NOT_WORD_BOUNDARY:
1848    case OP_PRUNE:
1849    case OP_REVERSE:
1850    case OP_RREF:
1851    case OP_SET_SOM:
1852    case OP_SKIP:
1853    case OP_SOD:
1854    case OP_SOM:
1855    case OP_THEN:
1856    case OP_WORD_BOUNDARY:
1857    cc += PRIV(OP_lengths)[*cc];
1858    break;
1859
1860    /* Handle literal characters */
1861
1862    case OP_CHAR:
1863    case OP_CHARI:
1864    case OP_NOT:
1865    case OP_NOTI:
1866    branchlength++;
1867    cc += 2;
1868#ifdef SUPPORT_UTF
1869    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1870#endif
1871    break;
1872
1873    /* Handle exact repetitions. The count is already in characters, but we
1874    need to skip over a multibyte character in UTF8 mode.  */
1875
1876    case OP_EXACT:
1877    case OP_EXACTI:
1878    case OP_NOTEXACT:
1879    case OP_NOTEXACTI:
1880    branchlength += (int)GET2(cc,1);
1881    cc += 2 + IMM2_SIZE;
1882#ifdef SUPPORT_UTF
1883    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1884#endif
1885    break;
1886
1887    case OP_TYPEEXACT:
1888    branchlength += GET2(cc,1);
1889    if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
1890      cc += 2;
1891    cc += 1 + IMM2_SIZE + 1;
1892    break;
1893
1894    /* Handle single-char matchers */
1895
1896    case OP_PROP:
1897    case OP_NOTPROP:
1898    cc += 2;
1899    /* Fall through */
1900
1901    case OP_HSPACE:
1902    case OP_VSPACE:
1903    case OP_NOT_HSPACE:
1904    case OP_NOT_VSPACE:
1905    case OP_NOT_DIGIT:
1906    case OP_DIGIT:
1907    case OP_NOT_WHITESPACE:
1908    case OP_WHITESPACE:
1909    case OP_NOT_WORDCHAR:
1910    case OP_WORDCHAR:
1911    case OP_ANY:
1912    case OP_ALLANY:
1913    branchlength++;
1914    cc++;
1915    break;
1916
1917    /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1918    otherwise \C is coded as OP_ALLANY. */
1919
1920    case OP_ANYBYTE:
1921    return -2;
1922
1923    /* Check a class for variable quantification */
1924
1925    case OP_CLASS:
1926    case OP_NCLASS:
1927#if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1928    case OP_XCLASS:
1929    /* The original code caused an unsigned overflow in 64 bit systems,
1930    so now we use a conditional statement. */
1931    if (op == OP_XCLASS)
1932      cc += GET(cc, 1);
1933    else
1934      cc += PRIV(OP_lengths)[OP_CLASS];
1935#else
1936    cc += PRIV(OP_lengths)[OP_CLASS];
1937#endif
1938
1939    switch (*cc)
1940      {
1941      case OP_CRSTAR:
1942      case OP_CRMINSTAR:
1943      case OP_CRPLUS:
1944      case OP_CRMINPLUS:
1945      case OP_CRQUERY:
1946      case OP_CRMINQUERY:
1947      case OP_CRPOSSTAR:
1948      case OP_CRPOSPLUS:
1949      case OP_CRPOSQUERY:
1950      return -1;
1951
1952      case OP_CRRANGE:
1953      case OP_CRMINRANGE:
1954      case OP_CRPOSRANGE:
1955      if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1956      branchlength += (int)GET2(cc,1);
1957      cc += 1 + 2 * IMM2_SIZE;
1958      break;
1959
1960      default:
1961      branchlength++;
1962      }
1963    break;
1964
1965    /* Anything else is variable length */
1966
1967    case OP_ANYNL:
1968    case OP_BRAMINZERO:
1969    case OP_BRAPOS:
1970    case OP_BRAPOSZERO:
1971    case OP_BRAZERO:
1972    case OP_CBRAPOS:
1973    case OP_EXTUNI:
1974    case OP_KETRMAX:
1975    case OP_KETRMIN:
1976    case OP_KETRPOS:
1977    case OP_MINPLUS:
1978    case OP_MINPLUSI:
1979    case OP_MINQUERY:
1980    case OP_MINQUERYI:
1981    case OP_MINSTAR:
1982    case OP_MINSTARI:
1983    case OP_MINUPTO:
1984    case OP_MINUPTOI:
1985    case OP_NOTMINPLUS:
1986    case OP_NOTMINPLUSI:
1987    case OP_NOTMINQUERY:
1988    case OP_NOTMINQUERYI:
1989    case OP_NOTMINSTAR:
1990    case OP_NOTMINSTARI:
1991    case OP_NOTMINUPTO:
1992    case OP_NOTMINUPTOI:
1993    case OP_NOTPLUS:
1994    case OP_NOTPLUSI:
1995    case OP_NOTPOSPLUS:
1996    case OP_NOTPOSPLUSI:
1997    case OP_NOTPOSQUERY:
1998    case OP_NOTPOSQUERYI:
1999    case OP_NOTPOSSTAR:
2000    case OP_NOTPOSSTARI:
2001    case OP_NOTPOSUPTO:
2002    case OP_NOTPOSUPTOI:
2003    case OP_NOTQUERY:
2004    case OP_NOTQUERYI:
2005    case OP_NOTSTAR:
2006    case OP_NOTSTARI:
2007    case OP_NOTUPTO:
2008    case OP_NOTUPTOI:
2009    case OP_PLUS:
2010    case OP_PLUSI:
2011    case OP_POSPLUS:
2012    case OP_POSPLUSI:
2013    case OP_POSQUERY:
2014    case OP_POSQUERYI:
2015    case OP_POSSTAR:
2016    case OP_POSSTARI:
2017    case OP_POSUPTO:
2018    case OP_POSUPTOI:
2019    case OP_QUERY:
2020    case OP_QUERYI:
2021    case OP_REF:
2022    case OP_REFI:
2023    case OP_DNREF:
2024    case OP_DNREFI:
2025    case OP_SBRA:
2026    case OP_SBRAPOS:
2027    case OP_SCBRA:
2028    case OP_SCBRAPOS:
2029    case OP_SCOND:
2030    case OP_SKIPZERO:
2031    case OP_STAR:
2032    case OP_STARI:
2033    case OP_TYPEMINPLUS:
2034    case OP_TYPEMINQUERY:
2035    case OP_TYPEMINSTAR:
2036    case OP_TYPEMINUPTO:
2037    case OP_TYPEPLUS:
2038    case OP_TYPEPOSPLUS:
2039    case OP_TYPEPOSQUERY:
2040    case OP_TYPEPOSSTAR:
2041    case OP_TYPEPOSUPTO:
2042    case OP_TYPEQUERY:
2043    case OP_TYPESTAR:
2044    case OP_TYPEUPTO:
2045    case OP_UPTO:
2046    case OP_UPTOI:
2047    return -1;
2048
2049    /* Catch unrecognized opcodes so that when new ones are added they
2050    are not forgotten, as has happened in the past. */
2051
2052    default:
2053    return -4;
2054    }
2055  }
2056/* Control never gets here */
2057}
2058
2059
2060
2061/*************************************************
2062*    Scan compiled regex for specific bracket    *
2063*************************************************/
2064
2065/* This little function scans through a compiled pattern until it finds a
2066capturing bracket with the given number, or, if the number is negative, an
2067instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2068so that it can be called from pcre_study() when finding the minimum matching
2069length.
2070
2071Arguments:
2072  code        points to start of expression
2073  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2074  number      the required bracket number or negative to find a lookbehind
2075
2076Returns:      pointer to the opcode for the bracket, or NULL if not found
2077*/
2078
2079const pcre_uchar *
2080PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2081{
2082for (;;)
2083  {
2084  register pcre_uchar c = *code;
2085
2086  if (c == OP_END) return NULL;
2087
2088  /* XCLASS is used for classes that cannot be represented just by a bit
2089  map. This includes negated single high-valued characters. The length in
2090  the table is zero; the actual length is stored in the compiled code. */
2091
2092  if (c == OP_XCLASS) code += GET(code, 1);
2093
2094  /* Handle recursion */
2095
2096  else if (c == OP_REVERSE)
2097    {
2098    if (number < 0) return (pcre_uchar *)code;
2099    code += PRIV(OP_lengths)[c];
2100    }
2101
2102  /* Handle capturing bracket */
2103
2104  else if (c == OP_CBRA || c == OP_SCBRA ||
2105           c == OP_CBRAPOS || c == OP_SCBRAPOS)
2106    {
2107    int n = (int)GET2(code, 1+LINK_SIZE);
2108    if (n == number) return (pcre_uchar *)code;
2109    code += PRIV(OP_lengths)[c];
2110    }
2111
2112  /* Otherwise, we can get the item's length from the table, except that for
2113  repeated character types, we have to test for \p and \P, which have an extra
2114  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2115  must add in its length. */
2116
2117  else
2118    {
2119    switch(c)
2120      {
2121      case OP_TYPESTAR:
2122      case OP_TYPEMINSTAR:
2123      case OP_TYPEPLUS:
2124      case OP_TYPEMINPLUS:
2125      case OP_TYPEQUERY:
2126      case OP_TYPEMINQUERY:
2127      case OP_TYPEPOSSTAR:
2128      case OP_TYPEPOSPLUS:
2129      case OP_TYPEPOSQUERY:
2130      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2131      break;
2132
2133      case OP_TYPEUPTO:
2134      case OP_TYPEMINUPTO:
2135      case OP_TYPEEXACT:
2136      case OP_TYPEPOSUPTO:
2137      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2138        code += 2;
2139      break;
2140
2141      case OP_MARK:
2142      case OP_PRUNE_ARG:
2143      case OP_SKIP_ARG:
2144      case OP_THEN_ARG:
2145      code += code[1];
2146      break;
2147      }
2148
2149    /* Add in the fixed length from the table */
2150
2151    code += PRIV(OP_lengths)[c];
2152
2153  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2154  a multi-byte character. The length in the table is a minimum, so we have to
2155  arrange to skip the extra bytes. */
2156
2157#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2158    if (utf) switch(c)
2159      {
2160      case OP_CHAR:
2161      case OP_CHARI:
2162      case OP_NOT:
2163      case OP_NOTI:
2164      case OP_EXACT:
2165      case OP_EXACTI:
2166      case OP_NOTEXACT:
2167      case OP_NOTEXACTI:
2168      case OP_UPTO:
2169      case OP_UPTOI:
2170      case OP_NOTUPTO:
2171      case OP_NOTUPTOI:
2172      case OP_MINUPTO:
2173      case OP_MINUPTOI:
2174      case OP_NOTMINUPTO:
2175      case OP_NOTMINUPTOI:
2176      case OP_POSUPTO:
2177      case OP_POSUPTOI:
2178      case OP_NOTPOSUPTO:
2179      case OP_NOTPOSUPTOI:
2180      case OP_STAR:
2181      case OP_STARI:
2182      case OP_NOTSTAR:
2183      case OP_NOTSTARI:
2184      case OP_MINSTAR:
2185      case OP_MINSTARI:
2186      case OP_NOTMINSTAR:
2187      case OP_NOTMINSTARI:
2188      case OP_POSSTAR:
2189      case OP_POSSTARI:
2190      case OP_NOTPOSSTAR:
2191      case OP_NOTPOSSTARI:
2192      case OP_PLUS:
2193      case OP_PLUSI:
2194      case OP_NOTPLUS:
2195      case OP_NOTPLUSI:
2196      case OP_MINPLUS:
2197      case OP_MINPLUSI:
2198      case OP_NOTMINPLUS:
2199      case OP_NOTMINPLUSI:
2200      case OP_POSPLUS:
2201      case OP_POSPLUSI:
2202      case OP_NOTPOSPLUS:
2203      case OP_NOTPOSPLUSI:
2204      case OP_QUERY:
2205      case OP_QUERYI:
2206      case OP_NOTQUERY:
2207      case OP_NOTQUERYI:
2208      case OP_MINQUERY:
2209      case OP_MINQUERYI:
2210      case OP_NOTMINQUERY:
2211      case OP_NOTMINQUERYI:
2212      case OP_POSQUERY:
2213      case OP_POSQUERYI:
2214      case OP_NOTPOSQUERY:
2215      case OP_NOTPOSQUERYI:
2216      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2217      break;
2218      }
2219#else
2220    (void)(utf);  /* Keep compiler happy by referencing function argument */
2221#endif
2222    }
2223  }
2224}
2225
2226
2227
2228/*************************************************
2229*   Scan compiled regex for recursion reference  *
2230*************************************************/
2231
2232/* This little function scans through a compiled pattern until it finds an
2233instance of OP_RECURSE.
2234
2235Arguments:
2236  code        points to start of expression
2237  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
2238
2239Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2240*/
2241
2242static const pcre_uchar *
2243find_recurse(const pcre_uchar *code, BOOL utf)
2244{
2245for (;;)
2246  {
2247  register pcre_uchar c = *code;
2248  if (c == OP_END) return NULL;
2249  if (c == OP_RECURSE) return code;
2250
2251  /* XCLASS is used for classes that cannot be represented just by a bit
2252  map. This includes negated single high-valued characters. The length in
2253  the table is zero; the actual length is stored in the compiled code. */
2254
2255  if (c == OP_XCLASS) code += GET(code, 1);
2256
2257  /* Otherwise, we can get the item's length from the table, except that for
2258  repeated character types, we have to test for \p and \P, which have an extra
2259  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2260  must add in its length. */
2261
2262  else
2263    {
2264    switch(c)
2265      {
2266      case OP_TYPESTAR:
2267      case OP_TYPEMINSTAR:
2268      case OP_TYPEPLUS:
2269      case OP_TYPEMINPLUS:
2270      case OP_TYPEQUERY:
2271      case OP_TYPEMINQUERY:
2272      case OP_TYPEPOSSTAR:
2273      case OP_TYPEPOSPLUS:
2274      case OP_TYPEPOSQUERY:
2275      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2276      break;
2277
2278      case OP_TYPEPOSUPTO:
2279      case OP_TYPEUPTO:
2280      case OP_TYPEMINUPTO:
2281      case OP_TYPEEXACT:
2282      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2283        code += 2;
2284      break;
2285
2286      case OP_MARK:
2287      case OP_PRUNE_ARG:
2288      case OP_SKIP_ARG:
2289      case OP_THEN_ARG:
2290      code += code[1];
2291      break;
2292      }
2293
2294    /* Add in the fixed length from the table */
2295
2296    code += PRIV(OP_lengths)[c];
2297
2298    /* In UTF-8 mode, opcodes that are followed by a character may be followed
2299    by a multi-byte character. The length in the table is a minimum, so we have
2300    to arrange to skip the extra bytes. */
2301
2302#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2303    if (utf) switch(c)
2304      {
2305      case OP_CHAR:
2306      case OP_CHARI:
2307      case OP_NOT:
2308      case OP_NOTI:
2309      case OP_EXACT:
2310      case OP_EXACTI:
2311      case OP_NOTEXACT:
2312      case OP_NOTEXACTI:
2313      case OP_UPTO:
2314      case OP_UPTOI:
2315      case OP_NOTUPTO:
2316      case OP_NOTUPTOI:
2317      case OP_MINUPTO:
2318      case OP_MINUPTOI:
2319      case OP_NOTMINUPTO:
2320      case OP_NOTMINUPTOI:
2321      case OP_POSUPTO:
2322      case OP_POSUPTOI:
2323      case OP_NOTPOSUPTO:
2324      case OP_NOTPOSUPTOI:
2325      case OP_STAR:
2326      case OP_STARI:
2327      case OP_NOTSTAR:
2328      case OP_NOTSTARI:
2329      case OP_MINSTAR:
2330      case OP_MINSTARI:
2331      case OP_NOTMINSTAR:
2332      case OP_NOTMINSTARI:
2333      case OP_POSSTAR:
2334      case OP_POSSTARI:
2335      case OP_NOTPOSSTAR:
2336      case OP_NOTPOSSTARI:
2337      case OP_PLUS:
2338      case OP_PLUSI:
2339      case OP_NOTPLUS:
2340      case OP_NOTPLUSI:
2341      case OP_MINPLUS:
2342      case OP_MINPLUSI:
2343      case OP_NOTMINPLUS:
2344      case OP_NOTMINPLUSI:
2345      case OP_POSPLUS:
2346      case OP_POSPLUSI:
2347      case OP_NOTPOSPLUS:
2348      case OP_NOTPOSPLUSI:
2349      case OP_QUERY:
2350      case OP_QUERYI:
2351      case OP_NOTQUERY:
2352      case OP_NOTQUERYI:
2353      case OP_MINQUERY:
2354      case OP_MINQUERYI:
2355      case OP_NOTMINQUERY:
2356      case OP_NOTMINQUERYI:
2357      case OP_POSQUERY:
2358      case OP_POSQUERYI:
2359      case OP_NOTPOSQUERY:
2360      case OP_NOTPOSQUERYI:
2361      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2362      break;
2363      }
2364#else
2365    (void)(utf);  /* Keep compiler happy by referencing function argument */
2366#endif
2367    }
2368  }
2369}
2370
2371
2372
2373/*************************************************
2374*    Scan compiled branch for non-emptiness      *
2375*************************************************/
2376
2377/* This function scans through a branch of a compiled pattern to see whether it
2378can match the empty string or not. It is called from could_be_empty()
2379below and from compile_branch() when checking for an unlimited repeat of a
2380group that can match nothing. Note that first_significant_code() skips over
2381backward and negative forward assertions when its final argument is TRUE. If we
2382hit an unclosed bracket, we return "empty" - this means we've struck an inner
2383bracket whose current branch will already have been scanned.
2384
2385Arguments:
2386  code        points to start of search
2387  endcode     points to where to stop
2388  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2389  cd          contains pointers to tables etc.
2390  recurses    chain of recurse_check to catch mutual recursion
2391
2392Returns:      TRUE if what is matched could be empty
2393*/
2394
2395static BOOL
2396could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2397  BOOL utf, compile_data *cd, recurse_check *recurses)
2398{
2399register pcre_uchar c;
2400recurse_check this_recurse;
2401
2402for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2403     code < endcode;
2404     code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2405  {
2406  const pcre_uchar *ccode;
2407
2408  c = *code;
2409
2410  /* Skip over forward assertions; the other assertions are skipped by
2411  first_significant_code() with a TRUE final argument. */
2412
2413  if (c == OP_ASSERT)
2414    {
2415    do code += GET(code, 1); while (*code == OP_ALT);
2416    c = *code;
2417    continue;
2418    }
2419
2420  /* For a recursion/subroutine call, if its end has been reached, which
2421  implies a backward reference subroutine call, we can scan it. If it's a
2422  forward reference subroutine call, we can't. To detect forward reference
2423  we have to scan up the list that is kept in the workspace. This function is
2424  called only when doing the real compile, not during the pre-compile that
2425  measures the size of the compiled pattern. */
2426
2427  if (c == OP_RECURSE)
2428    {
2429    const pcre_uchar *scode = cd->start_code + GET(code, 1);
2430    const pcre_uchar *endgroup = scode;
2431    BOOL empty_branch;
2432
2433    /* Test for forward reference or uncompleted reference. This is disabled
2434    when called to scan a completed pattern by setting cd->start_workspace to
2435    NULL. */
2436
2437    if (cd->start_workspace != NULL)
2438      {
2439      const pcre_uchar *tcode;
2440      for (tcode = cd->start_workspace; tcode < cd->hwm; tcode += LINK_SIZE)
2441        if ((int)GET(tcode, 0) == (int)(code + 1 - cd->start_code)) return TRUE;
2442      if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2443      }
2444
2445    /* If the reference is to a completed group, we need to detect whether this
2446    is a recursive call, as otherwise there will be an infinite loop. If it is
2447    a recursion, just skip over it. Simple recursions are easily detected. For
2448    mutual recursions we keep a chain on the stack. */
2449
2450    do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
2451    if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
2452    else
2453      {
2454      recurse_check *r = recurses;
2455      for (r = recurses; r != NULL; r = r->prev)
2456        if (r->group == scode) break;
2457      if (r != NULL) continue;   /* Mutual recursion */
2458      }
2459
2460    /* Completed reference; scan the referenced group, remembering it on the
2461    stack chain to detect mutual recursions. */
2462
2463    empty_branch = FALSE;
2464    this_recurse.prev = recurses;
2465    this_recurse.group = scode;
2466
2467    do
2468      {
2469      if (could_be_empty_branch(scode, endcode, utf, cd, &this_recurse))
2470        {
2471        empty_branch = TRUE;
2472        break;
2473        }
2474      scode += GET(scode, 1);
2475      }
2476    while (*scode == OP_ALT);
2477
2478    if (!empty_branch) return FALSE;  /* All branches are non-empty */
2479    continue;
2480    }
2481
2482  /* Groups with zero repeats can of course be empty; skip them. */
2483
2484  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2485      c == OP_BRAPOSZERO)
2486    {
2487    code += PRIV(OP_lengths)[c];
2488    do code += GET(code, 1); while (*code == OP_ALT);
2489    c = *code;
2490    continue;
2491    }
2492
2493  /* A nested group that is already marked as "could be empty" can just be
2494  skipped. */
2495
2496  if (c == OP_SBRA  || c == OP_SBRAPOS ||
2497      c == OP_SCBRA || c == OP_SCBRAPOS)
2498    {
2499    do code += GET(code, 1); while (*code == OP_ALT);
2500    c = *code;
2501    continue;
2502    }
2503
2504  /* For other groups, scan the branches. */
2505
2506  if (c == OP_BRA  || c == OP_BRAPOS ||
2507      c == OP_CBRA || c == OP_CBRAPOS ||
2508      c == OP_ONCE || c == OP_ONCE_NC ||
2509      c == OP_COND || c == OP_SCOND)
2510    {
2511    BOOL empty_branch;
2512    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2513
2514    /* If a conditional group has only one branch, there is a second, implied,
2515    empty branch, so just skip over the conditional, because it could be empty.
2516    Otherwise, scan the individual branches of the group. */
2517
2518    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2519      code += GET(code, 1);
2520    else
2521      {
2522      empty_branch = FALSE;
2523      do
2524        {
2525        if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd,
2526          recurses)) empty_branch = TRUE;
2527        code += GET(code, 1);
2528        }
2529      while (*code == OP_ALT);
2530      if (!empty_branch) return FALSE;   /* All branches are non-empty */
2531      }
2532
2533    c = *code;
2534    continue;
2535    }
2536
2537  /* Handle the other opcodes */
2538
2539  switch (c)
2540    {
2541    /* Check for quantifiers after a class. XCLASS is used for classes that
2542    cannot be represented just by a bit map. This includes negated single
2543    high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2544    actual length is stored in the compiled code, so we must update "code"
2545    here. */
2546
2547#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2548    case OP_XCLASS:
2549    ccode = code += GET(code, 1);
2550    goto CHECK_CLASS_REPEAT;
2551#endif
2552
2553    case OP_CLASS:
2554    case OP_NCLASS:
2555    ccode = code + PRIV(OP_lengths)[OP_CLASS];
2556
2557#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2558    CHECK_CLASS_REPEAT:
2559#endif
2560
2561    switch (*ccode)
2562      {
2563      case OP_CRSTAR:            /* These could be empty; continue */
2564      case OP_CRMINSTAR:
2565      case OP_CRQUERY:
2566      case OP_CRMINQUERY:
2567      case OP_CRPOSSTAR:
2568      case OP_CRPOSQUERY:
2569      break;
2570
2571      default:                   /* Non-repeat => class must match */
2572      case OP_CRPLUS:            /* These repeats aren't empty */
2573      case OP_CRMINPLUS:
2574      case OP_CRPOSPLUS:
2575      return FALSE;
2576
2577      case OP_CRRANGE:
2578      case OP_CRMINRANGE:
2579      case OP_CRPOSRANGE:
2580      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2581      break;
2582      }
2583    break;
2584
2585    /* Opcodes that must match a character */
2586
2587    case OP_ANY:
2588    case OP_ALLANY:
2589    case OP_ANYBYTE:
2590
2591    case OP_PROP:
2592    case OP_NOTPROP:
2593    case OP_ANYNL:
2594
2595    case OP_NOT_HSPACE:
2596    case OP_HSPACE:
2597    case OP_NOT_VSPACE:
2598    case OP_VSPACE:
2599    case OP_EXTUNI:
2600
2601    case OP_NOT_DIGIT:
2602    case OP_DIGIT:
2603    case OP_NOT_WHITESPACE:
2604    case OP_WHITESPACE:
2605    case OP_NOT_WORDCHAR:
2606    case OP_WORDCHAR:
2607
2608    case OP_CHAR:
2609    case OP_CHARI:
2610    case OP_NOT:
2611    case OP_NOTI:
2612
2613    case OP_PLUS:
2614    case OP_PLUSI:
2615    case OP_MINPLUS:
2616    case OP_MINPLUSI:
2617
2618    case OP_NOTPLUS:
2619    case OP_NOTPLUSI:
2620    case OP_NOTMINPLUS:
2621    case OP_NOTMINPLUSI:
2622
2623    case OP_POSPLUS:
2624    case OP_POSPLUSI:
2625    case OP_NOTPOSPLUS:
2626    case OP_NOTPOSPLUSI:
2627
2628    case OP_EXACT:
2629    case OP_EXACTI:
2630    case OP_NOTEXACT:
2631    case OP_NOTEXACTI:
2632
2633    case OP_TYPEPLUS:
2634    case OP_TYPEMINPLUS:
2635    case OP_TYPEPOSPLUS:
2636    case OP_TYPEEXACT:
2637
2638    return FALSE;
2639
2640    /* These are going to continue, as they may be empty, but we have to
2641    fudge the length for the \p and \P cases. */
2642
2643    case OP_TYPESTAR:
2644    case OP_TYPEMINSTAR:
2645    case OP_TYPEPOSSTAR:
2646    case OP_TYPEQUERY:
2647    case OP_TYPEMINQUERY:
2648    case OP_TYPEPOSQUERY:
2649    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2650    break;
2651
2652    /* Same for these */
2653
2654    case OP_TYPEUPTO:
2655    case OP_TYPEMINUPTO:
2656    case OP_TYPEPOSUPTO:
2657    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2658      code += 2;
2659    break;
2660
2661    /* End of branch */
2662
2663    case OP_KET:
2664    case OP_KETRMAX:
2665    case OP_KETRMIN:
2666    case OP_KETRPOS:
2667    case OP_ALT:
2668    return TRUE;
2669
2670    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2671    MINUPTO, and POSUPTO and their caseless and negative versions may be
2672    followed by a multibyte character. */
2673
2674#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2675    case OP_STAR:
2676    case OP_STARI:
2677    case OP_NOTSTAR:
2678    case OP_NOTSTARI:
2679
2680    case OP_MINSTAR:
2681    case OP_MINSTARI:
2682    case OP_NOTMINSTAR:
2683    case OP_NOTMINSTARI:
2684
2685    case OP_POSSTAR:
2686    case OP_POSSTARI:
2687    case OP_NOTPOSSTAR:
2688    case OP_NOTPOSSTARI:
2689
2690    case OP_QUERY:
2691    case OP_QUERYI:
2692    case OP_NOTQUERY:
2693    case OP_NOTQUERYI:
2694
2695    case OP_MINQUERY:
2696    case OP_MINQUERYI:
2697    case OP_NOTMINQUERY:
2698    case OP_NOTMINQUERYI:
2699
2700    case OP_POSQUERY:
2701    case OP_POSQUERYI:
2702    case OP_NOTPOSQUERY:
2703    case OP_NOTPOSQUERYI:
2704
2705    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2706    break;
2707
2708    case OP_UPTO:
2709    case OP_UPTOI:
2710    case OP_NOTUPTO:
2711    case OP_NOTUPTOI:
2712
2713    case OP_MINUPTO:
2714    case OP_MINUPTOI:
2715    case OP_NOTMINUPTO:
2716    case OP_NOTMINUPTOI:
2717
2718    case OP_POSUPTO:
2719    case OP_POSUPTOI:
2720    case OP_NOTPOSUPTO:
2721    case OP_NOTPOSUPTOI:
2722
2723    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2724    break;
2725#endif
2726
2727    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2728    string. */
2729
2730    case OP_MARK:
2731    case OP_PRUNE_ARG:
2732    case OP_SKIP_ARG:
2733    case OP_THEN_ARG:
2734    code += code[1];
2735    break;
2736
2737    /* None of the remaining opcodes are required to match a character. */
2738
2739    default:
2740    break;
2741    }
2742  }
2743
2744return TRUE;
2745}
2746
2747
2748
2749/*************************************************
2750*    Scan compiled regex for non-emptiness       *
2751*************************************************/
2752
2753/* This function is called to check for left recursive calls. We want to check
2754the current branch of the current pattern to see if it could match the empty
2755string. If it could, we must look outwards for branches at other levels,
2756stopping when we pass beyond the bracket which is the subject of the recursion.
2757This function is called only during the real compile, not during the
2758pre-compile.
2759
2760Arguments:
2761  code        points to start of the recursion
2762  endcode     points to where to stop (current RECURSE item)
2763  bcptr       points to the chain of current (unclosed) branch starts
2764  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2765  cd          pointers to tables etc
2766
2767Returns:      TRUE if what is matched could be empty
2768*/
2769
2770static BOOL
2771could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2772  branch_chain *bcptr, BOOL utf, compile_data *cd)
2773{
2774while (bcptr != NULL && bcptr->current_branch >= code)
2775  {
2776  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd, NULL))
2777    return FALSE;
2778  bcptr = bcptr->outer;
2779  }
2780return TRUE;
2781}
2782
2783
2784
2785/*************************************************
2786*        Base opcode of repeated opcodes         *
2787*************************************************/
2788
2789/* Returns the base opcode for repeated single character type opcodes. If the
2790opcode is not a repeated character type, it returns with the original value.
2791
2792Arguments:  c opcode
2793Returns:    base opcode for the type
2794*/
2795
2796static pcre_uchar
2797get_repeat_base(pcre_uchar c)
2798{
2799return (c > OP_TYPEPOSUPTO)? c :
2800       (c >= OP_TYPESTAR)?   OP_TYPESTAR :
2801       (c >= OP_NOTSTARI)?   OP_NOTSTARI :
2802       (c >= OP_NOTSTAR)?    OP_NOTSTAR :
2803       (c >= OP_STARI)?      OP_STARI :
2804                             OP_STAR;
2805}
2806
2807
2808
#ifdef SUPPORT_UCP
/*************************************************
*        Check a character and a property        *
*************************************************/

/* Called from compare_opcodes() when a property item (OP_PROP or OP_NOTPROP)
is compared with a fixed character. For each property type the function works
out whether the character has the property and returns that result compared
for equality with "negated". So for \p the result is TRUE when the character
does not have the property, and for \P it is TRUE when it does.

Arguments:
  c            the character
  ptype        the property type
  pdata        the data for the type
  negated      TRUE if it's a negated property (\P or \p{^)

Returns:       TRUE if auto-possessifying is OK
               FALSE otherwise, and also for an unrecognized property type
*/

static BOOL
check_char_prop(pcre_uint32 c, unsigned int ptype, unsigned int pdata,
  BOOL negated)
{
const ucd_record *prop = GET_UCD(c);
const pcre_uint32 *set_entry;
unsigned int gentype = PRIV(ucp_gentype)[prop->chartype];

switch(ptype)
  {
  case PT_LAMP:
  return (prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
          prop->chartype == ucp_Lt) == negated;

  case PT_GC:
  return (gentype == pdata) == negated;

  case PT_PC:
  return (prop->chartype == pdata) == negated;

  case PT_SC:
  return (prop->script == pdata) == negated;

  /* The remaining types are PCRE specials. */

  case PT_ALNUM:
  return (gentype == ucp_L || gentype == ucp_N) == negated;

  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
  means that Perl space and POSIX space are now identical. PCRE was changed
  at release 8.34. The explicit horizontal and vertical space characters
  count as spaces whatever their category; anything else must be in
  category Z. */

  case PT_SPACE:    /* Perl space */
  case PT_PXSPACE:  /* POSIX space */
  switch(c)
    {
    HSPACE_CASES:
    VSPACE_CASES:
    return negated;

    default:
    break;
    }
  return (gentype == ucp_Z) == negated;

  case PT_WORD:
  return (gentype == ucp_L || gentype == ucp_N ||
          c == CHAR_UNDERSCORE) == negated;

  /* The loop ends as soon as an entry greater than c is seen, so it relies on
  the set being in ascending order and ending with a value that is larger than
  any character. NOTE(review): the set scanned is the one recorded for the
  character itself (prop->caseset); pdata is not consulted in this case.
  Confirm that this is intended. */

  case PT_CLIST:
  set_entry = PRIV(ucd_caseless_sets) + prop->caseset;
  while (c >= *set_entry)
    {
    if (c == *set_entry) return negated;
    set_entry++;
    }
  return !negated;

  default:
  break;
  }

return FALSE;
}
#endif  /* SUPPORT_UCP */
2890
2891
2892
2893/*************************************************
2894*        Fill the character property list        *
2895*************************************************/
2896
2897/* Checks whether the code points to an opcode that can take part in auto-
2898possessification, and if so, fills a list with its properties.
2899
2900Arguments:
2901  code        points to start of expression
2902  utf         TRUE if in UTF-8 / UTF-16 / UTF-32 mode
2903  fcc         points to case-flipping table
2904  list        points to output list
2905              list[0] will be filled with the opcode
2906              list[1] will be non-zero if this opcode
2907                can match an empty character string
2908              list[2..7] depends on the opcode
2909
2910Returns:      points to the start of the next opcode if *code is accepted
2911              NULL if *code is not accepted
2912*/
2913
2914static const pcre_uchar *
2915get_chr_property_list(const pcre_uchar *code, BOOL utf,
2916  const pcre_uint8 *fcc, pcre_uint32 *list)
2917{
2918pcre_uchar c = *code;
2919pcre_uchar base;
2920const pcre_uchar *end;
2921pcre_uint32 chr;
2922
2923#ifdef SUPPORT_UCP
2924pcre_uint32 *clist_dest;
2925const pcre_uint32 *clist_src;
2926#else
2927utf = utf;  /* Suppress "unused parameter" compiler warning */
2928#endif
2929
2930list[0] = c;
2931list[1] = FALSE;
2932code++;
2933
2934if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
2935  {
2936  base = get_repeat_base(c);
2937  c -= (base - OP_STAR);
2938
2939  if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
2940    code += IMM2_SIZE;
2941
2942  list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT && c != OP_POSPLUS);
2943
2944  switch(base)
2945    {
2946    case OP_STAR:
2947    list[0] = OP_CHAR;
2948    break;
2949
2950    case OP_STARI:
2951    list[0] = OP_CHARI;
2952    break;
2953
2954    case OP_NOTSTAR:
2955    list[0] = OP_NOT;
2956    break;
2957
2958    case OP_NOTSTARI:
2959    list[0] = OP_NOTI;
2960    break;
2961
2962    case OP_TYPESTAR:
2963    list[0] = *code;
2964    code++;
2965    break;
2966    }
2967  c = list[0];
2968  }
2969
2970switch(c)
2971  {
2972  case OP_NOT_DIGIT:
2973  case OP_DIGIT:
2974  case OP_NOT_WHITESPACE:
2975  case OP_WHITESPACE:
2976  case OP_NOT_WORDCHAR:
2977  case OP_WORDCHAR:
2978  case OP_ANY:
2979  case OP_ALLANY:
2980  case OP_ANYNL:
2981  case OP_NOT_HSPACE:
2982  case OP_HSPACE:
2983  case OP_NOT_VSPACE:
2984  case OP_VSPACE:
2985  case OP_EXTUNI:
2986  case OP_EODN:
2987  case OP_EOD:
2988  case OP_DOLL:
2989  case OP_DOLLM:
2990  return code;
2991
2992  case OP_CHAR:
2993  case OP_NOT:
2994  GETCHARINCTEST(chr, code);
2995  list[2] = chr;
2996  list[3] = NOTACHAR;
2997  return code;
2998
2999  case OP_CHARI:
3000  case OP_NOTI:
3001  list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
3002  GETCHARINCTEST(chr, code);
3003  list[2] = chr;
3004
3005#ifdef SUPPORT_UCP
3006  if (chr < 128 || (chr < 256 && !utf))
3007    list[3] = fcc[chr];
3008  else
3009    list[3] = UCD_OTHERCASE(chr);
3010#elif defined SUPPORT_UTF || !defined COMPILE_PCRE8
3011  list[3] = (chr < 256) ? fcc[chr] : chr;
3012#else
3013  list[3] = fcc[chr];
3014#endif
3015
3016  /* The othercase might be the same value. */
3017
3018  if (chr == list[3])
3019    list[3] = NOTACHAR;
3020  else
3021    list[4] = NOTACHAR;
3022  return code;
3023
3024#ifdef SUPPORT_UCP
3025  case OP_PROP:
3026  case OP_NOTPROP:
3027  if (code[0] != PT_CLIST)
3028    {
3029    list[2] = code[0];
3030    list[3] = code[1];
3031    return code + 2;
3032    }
3033
3034  /* Convert only if we have enough space. */
3035
3036  clist_src = PRIV(ucd_caseless_sets) + code[1];
3037  clist_dest = list + 2;
3038  code += 2;
3039
3040  do {
3041     if (clist_dest >= list + 8)
3042       {
3043       /* Early return if there is not enough space. This should never
3044       happen, since all clists are shorter than 5 character now. */
3045       list[2] = code[0];
3046       list[3] = code[1];
3047       return code;
3048       }
3049     *clist_dest++ = *clist_src;
3050     }
3051  while(*clist_src++ != NOTACHAR);
3052
3053  /* All characters are stored. The terminating NOTACHAR
3054  is copied form the clist itself. */
3055
3056  list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
3057  return code;
3058#endif
3059
3060  case OP_NCLASS:
3061  case OP_CLASS:
3062#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3063  case OP_XCLASS:
3064  if (c == OP_XCLASS)
3065    end = code + GET(code, 0) - 1;
3066  else
3067#endif
3068    end = code + 32 / sizeof(pcre_uchar);
3069
3070  switch(*end)
3071    {
3072    case OP_CRSTAR:
3073    case OP_CRMINSTAR:
3074    case OP_CRQUERY:
3075    case OP_CRMINQUERY:
3076    case OP_CRPOSSTAR:
3077    case OP_CRPOSQUERY:
3078    list[1] = TRUE;
3079    end++;
3080    break;
3081
3082    case OP_CRPLUS:
3083    case OP_CRMINPLUS:
3084    case OP_CRPOSPLUS:
3085    end++;
3086    break;
3087
3088    case OP_CRRANGE:
3089    case OP_CRMINRANGE:
3090    case OP_CRPOSRANGE:
3091    list[1] = (GET2(end, 1) == 0);
3092    end += 1 + 2 * IMM2_SIZE;
3093    break;
3094    }
3095  list[2] = (pcre_uint32)(end - code);
3096  return end;
3097  }
3098return NULL;    /* Opcode not accepted */
3099}
3100
3101
3102
3103/*************************************************
3104*    Scan further character sets for match       *
3105*************************************************/
3106
3107/* Checks whether the base and the current opcode have a common character, in
3108which case the base cannot be possessified.
3109
3110Arguments:
3111  code        points to the byte code
3112  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3113  cd          static compile data
3114  base_list   the data list of the base opcode
3115
3116Returns:      TRUE if the auto-possessification is possible
3117*/
3118
3119static BOOL
3120compare_opcodes(const pcre_uchar *code, BOOL utf, const compile_data *cd,
3121  const pcre_uint32 *base_list, const pcre_uchar *base_end, int *rec_limit)
3122{
3123pcre_uchar c;
3124pcre_uint32 list[8];
3125const pcre_uint32 *chr_ptr;
3126const pcre_uint32 *ochr_ptr;
3127const pcre_uint32 *list_ptr;
3128const pcre_uchar *next_code;
3129#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3130const pcre_uchar *xclass_flags;
3131#endif
3132const pcre_uint8 *class_bitset;
3133const pcre_uint8 *set1, *set2, *set_end;
3134pcre_uint32 chr;
3135BOOL accepted, invert_bits;
3136BOOL entered_a_group = FALSE;
3137
3138if (*rec_limit == 0) return FALSE;
3139--(*rec_limit);
3140
3141/* Note: the base_list[1] contains whether the current opcode has greedy
3142(represented by a non-zero value) quantifier. This is a different from
3143other character type lists, which stores here that the character iterator
3144matches to an empty string (also represented by a non-zero value). */
3145
3146for(;;)
3147  {
3148  /* All operations move the code pointer forward.
3149  Therefore infinite recursions are not possible. */
3150
3151  c = *code;
3152
3153  /* Skip over callouts */
3154
3155  if (c == OP_CALLOUT)
3156    {
3157    code += PRIV(OP_lengths)[c];
3158    continue;
3159    }
3160
3161  if (c == OP_ALT)
3162    {
3163    do code += GET(code, 1); while (*code == OP_ALT);
3164    c = *code;
3165    }
3166
3167  switch(c)
3168    {
3169    case OP_END:
3170    case OP_KETRPOS:
3171    /* TRUE only in greedy case. The non-greedy case could be replaced by
3172    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
3173    uses more memory, which we cannot get at this stage.) */
3174
3175    return base_list[1] != 0;
3176
3177    case OP_KET:
3178    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
3179    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
3180    cannot be converted to a possessive form. */
3181
3182    if (base_list[1] == 0) return FALSE;
3183
3184    switch(*(code - GET(code, 1)))
3185      {
3186      case OP_ASSERT:
3187      case OP_ASSERT_NOT:
3188      case OP_ASSERTBACK:
3189      case OP_ASSERTBACK_NOT:
3190      case OP_ONCE:
3191      case OP_ONCE_NC:
3192      /* Atomic sub-patterns and assertions can always auto-possessify their
3193      last iterator. However, if the group was entered as a result of checking
3194      a previous iterator, this is not possible. */
3195
3196      return !entered_a_group;
3197      }
3198
3199    code += PRIV(OP_lengths)[c];
3200    continue;
3201
3202    case OP_ONCE:
3203    case OP_ONCE_NC:
3204    case OP_BRA:
3205    case OP_CBRA:
3206    next_code = code + GET(code, 1);
3207    code += PRIV(OP_lengths)[c];
3208
3209    while (*next_code == OP_ALT)
3210      {
3211      if (!compare_opcodes(code, utf, cd, base_list, base_end, rec_limit))
3212        return FALSE;
3213      code = next_code + 1 + LINK_SIZE;
3214      next_code += GET(next_code, 1);
3215      }
3216
3217    entered_a_group = TRUE;
3218    continue;
3219
3220    case OP_BRAZERO:
3221    case OP_BRAMINZERO:
3222
3223    next_code = code + 1;
3224    if (*next_code != OP_BRA && *next_code != OP_CBRA
3225        && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
3226
3227    do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
3228
3229    /* The bracket content will be checked by the
3230    OP_BRA/OP_CBRA case above. */
3231    next_code += 1 + LINK_SIZE;
3232    if (!compare_opcodes(next_code, utf, cd, base_list, base_end, rec_limit))
3233      return FALSE;
3234
3235    code += PRIV(OP_lengths)[c];
3236    continue;
3237
3238    default:
3239    break;
3240    }
3241
3242  /* Check for a supported opcode, and load its properties. */
3243
3244  code = get_chr_property_list(code, utf, cd->fcc, list);
3245  if (code == NULL) return FALSE;    /* Unsupported */
3246
3247  /* If either opcode is a small character list, set pointers for comparing
3248  characters from that list with another list, or with a property. */
3249
3250  if (base_list[0] == OP_CHAR)
3251    {
3252    chr_ptr = base_list + 2;
3253    list_ptr = list;
3254    }
3255  else if (list[0] == OP_CHAR)
3256    {
3257    chr_ptr = list + 2;
3258    list_ptr = base_list;
3259    }
3260
3261  /* Character bitsets can also be compared to certain opcodes. */
3262
3263  else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
3264#ifdef COMPILE_PCRE8
3265      /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
3266      || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
3267#endif
3268      )
3269    {
3270#ifdef COMPILE_PCRE8
3271    if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
3272#else
3273    if (base_list[0] == OP_CLASS)
3274#endif
3275      {
3276      set1 = (pcre_uint8 *)(base_end - base_list[2]);
3277      list_ptr = list;
3278      }
3279    else
3280      {
3281      set1 = (pcre_uint8 *)(code - list[2]);
3282      list_ptr = base_list;
3283      }
3284
3285    invert_bits = FALSE;
3286    switch(list_ptr[0])
3287      {
3288      case OP_CLASS:
3289      case OP_NCLASS:
3290      set2 = (pcre_uint8 *)
3291        ((list_ptr == list ? code : base_end) - list_ptr[2]);
3292      break;
3293
3294#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3295      case OP_XCLASS:
3296      xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
3297      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
3298      if ((*xclass_flags & XCL_MAP) == 0)
3299        {
3300        /* No bits are set for characters < 256. */
3301        if (list[1] == 0) return TRUE;
3302        /* Might be an empty repeat. */
3303        continue;
3304        }
3305      set2 = (pcre_uint8 *)(xclass_flags + 1);
3306      break;
3307#endif
3308
3309      case OP_NOT_DIGIT:
3310      invert_bits = TRUE;
3311      /* Fall through */
3312      case OP_DIGIT:
3313      set2 = (pcre_uint8 *)(cd->cbits + cbit_digit);
3314      break;
3315
3316      case OP_NOT_WHITESPACE:
3317      invert_bits = TRUE;
3318      /* Fall through */
3319      case OP_WHITESPACE:
3320      set2 = (pcre_uint8 *)(cd->cbits + cbit_space);
3321      break;
3322
3323      case OP_NOT_WORDCHAR:
3324      invert_bits = TRUE;
3325      /* Fall through */
3326      case OP_WORDCHAR:
3327      set2 = (pcre_uint8 *)(cd->cbits + cbit_word);
3328      break;
3329
3330      default:
3331      return FALSE;
3332      }
3333
3334    /* Because the sets are unaligned, we need
3335    to perform byte comparison here. */
3336    set_end = set1 + 32;
3337    if (invert_bits)
3338      {
3339      do
3340        {
3341        if ((*set1++ & ~(*set2++)) != 0) return FALSE;
3342        }
3343      while (set1 < set_end);
3344      }
3345    else
3346      {
3347      do
3348        {
3349        if ((*set1++ & *set2++) != 0) return FALSE;
3350        }
3351      while (set1 < set_end);
3352      }
3353
3354    if (list[1] == 0) return TRUE;
3355    /* Might be an empty repeat. */
3356    continue;
3357    }
3358
3359  /* Some property combinations also acceptable. Unicode property opcodes are
3360  processed specially; the rest can be handled with a lookup table. */
3361
3362  else
3363    {
3364    pcre_uint32 leftop, rightop;
3365
3366    leftop = base_list[0];
3367    rightop = list[0];
3368
3369#ifdef SUPPORT_UCP
3370    accepted = FALSE; /* Always set in non-unicode case. */
3371    if (leftop == OP_PROP || leftop == OP_NOTPROP)
3372      {
3373      if (rightop == OP_EOD)
3374        accepted = TRUE;
3375      else if (rightop == OP_PROP || rightop == OP_NOTPROP)
3376        {
3377        int n;
3378        const pcre_uint8 *p;
3379        BOOL same = leftop == rightop;
3380        BOOL lisprop = leftop == OP_PROP;
3381        BOOL risprop = rightop == OP_PROP;
3382        BOOL bothprop = lisprop && risprop;
3383
3384        /* There's a table that specifies how each combination is to be
3385        processed:
3386          0   Always return FALSE (never auto-possessify)
3387          1   Character groups are distinct (possessify if both are OP_PROP)
3388          2   Check character categories in the same group (general or particular)
3389          3   Return TRUE if the two opcodes are not the same
3390          ... see comments below
3391        */
3392
3393        n = propposstab[base_list[2]][list[2]];
3394        switch(n)
3395          {
3396          case 0: break;
3397          case 1: accepted = bothprop; break;
3398          case 2: accepted = (base_list[3] == list[3]) != same; break;
3399          case 3: accepted = !same; break;
3400
3401          case 4:  /* Left general category, right particular category */
3402          accepted = risprop && catposstab[base_list[3]][list[3]] == same;
3403          break;
3404
3405          case 5:  /* Right general category, left particular category */
3406          accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
3407          break;
3408
3409          /* This code is logically tricky. Think hard before fiddling with it.
3410          The posspropstab table has four entries per row. Each row relates to
3411          one of PCRE's special properties such as ALNUM or SPACE or WORD.
3412          Only WORD actually needs all four entries, but using repeats for the
3413          others means they can all use the same code below.
3414
3415          The first two entries in each row are Unicode general categories, and
3416          apply always, because all the characters they include are part of the
3417          PCRE character set. The third and fourth entries are a general and a
3418          particular category, respectively, that include one or more relevant
3419          characters. One or the other is used, depending on whether the check
3420          is for a general or a particular category. However, in both cases the
3421          category contains more characters than the specials that are defined
3422          for the property being tested against. Therefore, it cannot be used
3423          in a NOTPROP case.
3424
3425          Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
3426          Underscore is covered by ucp_P or ucp_Po. */
3427
3428          case 6:  /* Left alphanum vs right general category */
3429          case 7:  /* Left space vs right general category */
3430          case 8:  /* Left word vs right general category */
3431          p = posspropstab[n-6];
3432          accepted = risprop && lisprop ==
3433            (list[3] != p[0] &&
3434             list[3] != p[1] &&
3435            (list[3] != p[2] || !lisprop));
3436          break;
3437
3438          case 9:   /* Right alphanum vs left general category */
3439          case 10:  /* Right space vs left general category */
3440          case 11:  /* Right word vs left general category */
3441          p = posspropstab[n-9];
3442          accepted = lisprop && risprop ==
3443            (base_list[3] != p[0] &&
3444             base_list[3] != p[1] &&
3445            (base_list[3] != p[2] || !risprop));
3446          break;
3447
3448          case 12:  /* Left alphanum vs right particular category */
3449          case 13:  /* Left space vs right particular category */
3450          case 14:  /* Left word vs right particular category */
3451          p = posspropstab[n-12];
3452          accepted = risprop && lisprop ==
3453            (catposstab[p[0]][list[3]] &&
3454             catposstab[p[1]][list[3]] &&
3455            (list[3] != p[3] || !lisprop));
3456          break;
3457
3458          case 15:  /* Right alphanum vs left particular category */
3459          case 16:  /* Right space vs left particular category */
3460          case 17:  /* Right word vs left particular category */
3461          p = posspropstab[n-15];
3462          accepted = lisprop && risprop ==
3463            (catposstab[p[0]][base_list[3]] &&
3464             catposstab[p[1]][base_list[3]] &&
3465            (base_list[3] != p[3] || !risprop));
3466          break;
3467          }
3468        }
3469      }
3470
3471    else
3472#endif  /* SUPPORT_UCP */
3473
3474    accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
3475           rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
3476           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
3477
3478    if (!accepted) return FALSE;
3479
3480    if (list[1] == 0) return TRUE;
3481    /* Might be an empty repeat. */
3482    continue;
3483    }
3484
3485  /* Control reaches here only if one of the items is a small character list.
3486  All characters are checked against the other side. */
3487
3488  do
3489    {
3490    chr = *chr_ptr;
3491
3492    switch(list_ptr[0])
3493      {
3494      case OP_CHAR:
3495      ochr_ptr = list_ptr + 2;
3496      do
3497        {
3498        if (chr == *ochr_ptr) return FALSE;
3499        ochr_ptr++;
3500        }
3501      while(*ochr_ptr != NOTACHAR);
3502      break;
3503
3504      case OP_NOT:
3505      ochr_ptr = list_ptr + 2;
3506      do
3507        {
3508        if (chr == *ochr_ptr)
3509          break;
3510        ochr_ptr++;
3511        }
3512      while(*ochr_ptr != NOTACHAR);
3513      if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
3514      break;
3515
3516      /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not*
3517      set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3518
3519      case OP_DIGIT:
3520      if (chr < 256 && (cd->ctypes[chr] & ctype_digit) != 0) return FALSE;
3521      break;
3522
3523      case OP_NOT_DIGIT:
3524      if (chr > 255 || (cd->ctypes[chr] & ctype_digit) == 0) return FALSE;
3525      break;
3526
3527      case OP_WHITESPACE:
3528      if (chr < 256 && (cd->ctypes[chr] & ctype_space) != 0) return FALSE;
3529      break;
3530
3531      case OP_NOT_WHITESPACE:
3532      if (chr > 255 || (cd->ctypes[chr] & ctype_space) == 0) return FALSE;
3533      break;
3534
3535      case OP_WORDCHAR:
3536      if (chr < 255 && (cd->ctypes[chr] & ctype_word) != 0) return FALSE;
3537      break;
3538
3539      case OP_NOT_WORDCHAR:
3540      if (chr > 255 || (cd->ctypes[chr] & ctype_word) == 0) return FALSE;
3541      break;
3542
3543      case OP_HSPACE:
3544      switch(chr)
3545        {
3546        HSPACE_CASES: return FALSE;
3547        default: break;
3548        }
3549      break;
3550
3551      case OP_NOT_HSPACE:
3552      switch(chr)
3553        {
3554        HSPACE_CASES: break;
3555        default: return FALSE;
3556        }
3557      break;
3558
3559      case OP_ANYNL:
3560      case OP_VSPACE:
3561      switch(chr)
3562        {
3563        VSPACE_CASES: return FALSE;
3564        default: break;
3565        }
3566      break;
3567
3568      case OP_NOT_VSPACE:
3569      switch(chr)
3570        {
3571        VSPACE_CASES: break;
3572        default: return FALSE;
3573        }
3574      break;
3575
3576      case OP_DOLL:
3577      case OP_EODN:
3578      switch (chr)
3579        {
3580        case CHAR_CR:
3581        case CHAR_LF:
3582        case CHAR_VT:
3583        case CHAR_FF:
3584        case CHAR_NEL:
3585#ifndef EBCDIC
3586        case 0x2028:
3587        case 0x2029:
3588#endif  /* Not EBCDIC */
3589        return FALSE;
3590        }
3591      break;
3592
3593      case OP_EOD:    /* Can always possessify before \z */
3594      break;
3595
3596#ifdef SUPPORT_UCP
3597      case OP_PROP:
3598      case OP_NOTPROP:
3599      if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
3600            list_ptr[0] == OP_NOTPROP))
3601        return FALSE;
3602      break;
3603#endif
3604
3605      case OP_NCLASS:
3606      if (chr > 255) return FALSE;
3607      /* Fall through */
3608
3609      case OP_CLASS:
3610      if (chr > 255) break;
3611      class_bitset = (pcre_uint8 *)
3612        ((list_ptr == list ? code : base_end) - list_ptr[2]);
3613      if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
3614      break;
3615
3616#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3617      case OP_XCLASS:
3618      if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
3619          list_ptr[2] + LINK_SIZE, utf)) return FALSE;
3620      break;
3621#endif
3622
3623      default:
3624      return FALSE;
3625      }
3626
3627    chr_ptr++;
3628    }
3629  while(*chr_ptr != NOTACHAR);
3630
3631  /* At least one character must be matched from this opcode. */
3632
3633  if (list[1] == 0) return TRUE;
3634  }
3635
/* Control never reaches here. There used to be a fail-safe return FALSE; here,
but some compilers complain about an unreachable statement. */
3638
3639}
3640
3641
3642
3643/*************************************************
3644*    Scan compiled regex for auto-possession     *
3645*************************************************/
3646
3647/* Replaces single character iterations with their possessive alternatives
3648if appropriate. This function modifies the compiled opcode!
3649
3650Arguments:
3651  code        points to start of the byte code
3652  utf         TRUE in UTF-8 / UTF-16 / UTF-32 mode
3653  cd          static compile data
3654
3655Returns:      nothing
3656*/
3657
3658static void
3659auto_possessify(pcre_uchar *code, BOOL utf, const compile_data *cd)
3660{
3661register pcre_uchar c;
3662const pcre_uchar *end;
3663pcre_uchar *repeat_opcode;
3664pcre_uint32 list[8];
3665int rec_limit;
3666
3667for (;;)
3668  {
3669  c = *code;
3670
3671  /* When a pattern with bad UTF-8 encoding is compiled with NO_UTF_CHECK,
3672  it may compile without complaining, but may get into a loop here if the code
3673  pointer points to a bad value. This is, of course a documentated possibility,
3674  when NO_UTF_CHECK is set, so it isn't a bug, but we can detect this case and
3675  just give up on this optimization. */
3676
3677  if (c >= OP_TABLE_LENGTH) return;
3678
3679  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
3680    {
3681    c -= get_repeat_base(c) - OP_STAR;
3682    end = (c <= OP_MINUPTO) ?
3683      get_chr_property_list(code, utf, cd->fcc, list) : NULL;
3684    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
3685
3686    rec_limit = 1000;
3687    if (end != NULL && compare_opcodes(end, utf, cd, list, end, &rec_limit))
3688      {
3689      switch(c)
3690        {
3691        case OP_STAR:
3692        *code += OP_POSSTAR - OP_STAR;
3693        break;
3694
3695        case OP_MINSTAR:
3696        *code += OP_POSSTAR - OP_MINSTAR;
3697        break;
3698
3699        case OP_PLUS:
3700        *code += OP_POSPLUS - OP_PLUS;
3701        break;
3702
3703        case OP_MINPLUS:
3704        *code += OP_POSPLUS - OP_MINPLUS;
3705        break;
3706
3707        case OP_QUERY:
3708        *code += OP_POSQUERY - OP_QUERY;
3709        break;
3710
3711        case OP_MINQUERY:
3712        *code += OP_POSQUERY - OP_MINQUERY;
3713        break;
3714
3715        case OP_UPTO:
3716        *code += OP_POSUPTO - OP_UPTO;
3717        break;
3718
3719        case OP_MINUPTO:
3720        *code += OP_POSUPTO - OP_MINUPTO;
3721        break;
3722        }
3723      }
3724    c = *code;
3725    }
3726  else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
3727    {
3728#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3729    if (c == OP_XCLASS)
3730      repeat_opcode = code + GET(code, 1);
3731    else
3732#endif
3733      repeat_opcode = code + 1 + (32 / sizeof(pcre_uchar));
3734
3735    c = *repeat_opcode;
3736    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
3737      {
3738      /* end must not be NULL. */
3739      end = get_chr_property_list(code, utf, cd->fcc, list);
3740
3741      list[1] = (c & 1) == 0;
3742
3743      rec_limit = 1000;
3744      if (compare_opcodes(end, utf, cd, list, end, &rec_limit))
3745        {
3746        switch (c)
3747          {
3748          case OP_CRSTAR:
3749          case OP_CRMINSTAR:
3750          *repeat_opcode = OP_CRPOSSTAR;
3751          break;
3752
3753          case OP_CRPLUS:
3754          case OP_CRMINPLUS:
3755          *repeat_opcode = OP_CRPOSPLUS;
3756          break;
3757
3758          case OP_CRQUERY:
3759          case OP_CRMINQUERY:
3760          *repeat_opcode = OP_CRPOSQUERY;
3761          break;
3762
3763          case OP_CRRANGE:
3764          case OP_CRMINRANGE:
3765          *repeat_opcode = OP_CRPOSRANGE;
3766          break;
3767          }
3768        }
3769      }
3770    c = *code;
3771    }
3772
3773  switch(c)
3774    {
3775    case OP_END:
3776    return;
3777
3778    case OP_TYPESTAR:
3779    case OP_TYPEMINSTAR:
3780    case OP_TYPEPLUS:
3781    case OP_TYPEMINPLUS:
3782    case OP_TYPEQUERY:
3783    case OP_TYPEMINQUERY:
3784    case OP_TYPEPOSSTAR:
3785    case OP_TYPEPOSPLUS:
3786    case OP_TYPEPOSQUERY:
3787    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
3788    break;
3789
3790    case OP_TYPEUPTO:
3791    case OP_TYPEMINUPTO:
3792    case OP_TYPEEXACT:
3793    case OP_TYPEPOSUPTO:
3794    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
3795      code += 2;
3796    break;
3797
3798#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3799    case OP_XCLASS:
3800    code += GET(code, 1);
3801    break;
3802#endif
3803
3804    case OP_MARK:
3805    case OP_PRUNE_ARG:
3806    case OP_SKIP_ARG:
3807    case OP_THEN_ARG:
3808    code += code[1];
3809    break;
3810    }
3811
3812  /* Add in the fixed length from the table */
3813
3814  code += PRIV(OP_lengths)[c];
3815
3816  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
3817  a multi-byte character. The length in the table is a minimum, so we have to
3818  arrange to skip the extra bytes. */
3819
3820#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3821  if (utf) switch(c)
3822    {
3823    case OP_CHAR:
3824    case OP_CHARI:
3825    case OP_NOT:
3826    case OP_NOTI:
3827    case OP_STAR:
3828    case OP_MINSTAR:
3829    case OP_PLUS:
3830    case OP_MINPLUS:
3831    case OP_QUERY:
3832    case OP_MINQUERY:
3833    case OP_UPTO:
3834    case OP_MINUPTO:
3835    case OP_EXACT:
3836    case OP_POSSTAR:
3837    case OP_POSPLUS:
3838    case OP_POSQUERY:
3839    case OP_POSUPTO:
3840    case OP_STARI:
3841    case OP_MINSTARI:
3842    case OP_PLUSI:
3843    case OP_MINPLUSI:
3844    case OP_QUERYI:
3845    case OP_MINQUERYI:
3846    case OP_UPTOI:
3847    case OP_MINUPTOI:
3848    case OP_EXACTI:
3849    case OP_POSSTARI:
3850    case OP_POSPLUSI:
3851    case OP_POSQUERYI:
3852    case OP_POSUPTOI:
3853    case OP_NOTSTAR:
3854    case OP_NOTMINSTAR:
3855    case OP_NOTPLUS:
3856    case OP_NOTMINPLUS:
3857    case OP_NOTQUERY:
3858    case OP_NOTMINQUERY:
3859    case OP_NOTUPTO:
3860    case OP_NOTMINUPTO:
3861    case OP_NOTEXACT:
3862    case OP_NOTPOSSTAR:
3863    case OP_NOTPOSPLUS:
3864    case OP_NOTPOSQUERY:
3865    case OP_NOTPOSUPTO:
3866    case OP_NOTSTARI:
3867    case OP_NOTMINSTARI:
3868    case OP_NOTPLUSI:
3869    case OP_NOTMINPLUSI:
3870    case OP_NOTQUERYI:
3871    case OP_NOTMINQUERYI:
3872    case OP_NOTUPTOI:
3873    case OP_NOTMINUPTOI:
3874    case OP_NOTEXACTI:
3875    case OP_NOTPOSSTARI:
3876    case OP_NOTPOSPLUSI:
3877    case OP_NOTPOSQUERYI:
3878    case OP_NOTPOSUPTOI:
3879    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
3880    break;
3881    }
3882#else
3883  (void)(utf);  /* Keep compiler happy by referencing function argument */
3884#endif
3885  }
3886}
3887
3888
3889
3890/*************************************************
3891*           Check for POSIX class syntax         *
3892*************************************************/
3893
3894/* This function is called when the sequence "[:" or "[." or "[=" is
3895encountered in a character class. It checks whether this is followed by a
3896sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
3897reach an unescaped ']' without the special preceding character, return FALSE.
3898
3899Originally, this function only recognized a sequence of letters between the
3900terminators, but it seems that Perl recognizes any sequence of characters,
3901though of course unknown POSIX names are subsequently rejected. Perl gives an
3902"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
3903didn't consider this to be a POSIX class. Likewise for [:1234:].
3904
3905The problem in trying to be exactly like Perl is in the handling of escapes. We
3906have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
3907class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
3908below handles the special cases \\ and \], but does not try to do any other
3909escape processing. This makes it different from Perl for cases such as
3910[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
3911not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
3912when Perl does, I think.
3913
3914A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
3915It seems that the appearance of a nested POSIX class supersedes an apparent
3916external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
3917a digit.
3918
3919In Perl, unescaped square brackets may also appear as part of class names. For
3920example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
3921[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
3922seem right at all. PCRE does not allow closing square brackets in POSIX class
3923names.
3924
3925Arguments:
3926  ptr      pointer to the initial [
3927  endptr   where to return the end pointer
3928
3929Returns:   TRUE or FALSE
3930*/
3931
3932static BOOL
3933check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
3934{
3935pcre_uchar terminator;          /* Don't combine these lines; the Solaris cc */
3936terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
3937for (++ptr; *ptr != CHAR_NULL; ptr++)
3938  {
3939  if (*ptr == CHAR_BACKSLASH &&
3940      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET ||
3941       ptr[1] == CHAR_BACKSLASH))
3942    ptr++;
3943  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
3944            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
3945  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
3946    {
3947    *endptr = ptr;
3948    return TRUE;
3949    }
3950  }
3951return FALSE;
3952}
3953
3954
3955
3956
3957/*************************************************
3958*          Check POSIX class name                *
3959*************************************************/
3960
3961/* This function is called to check the name given in a POSIX-style class entry
3962such as [:alnum:].
3963
3964Arguments:
3965  ptr        points to the first letter
3966  len        the length of the name
3967
3968Returns:     a value representing the name, or -1 if unknown
3969*/
3970
3971static int
3972check_posix_name(const pcre_uchar *ptr, int len)
3973{
3974const char *pn = posix_names;
3975register int yield = 0;
3976while (posix_name_lengths[yield] != 0)
3977  {
3978  if (len == posix_name_lengths[yield] &&
3979    STRNCMP_UC_C8(ptr, pn, (unsigned int)len) == 0) return yield;
3980  pn += posix_name_lengths[yield] + 1;
3981  yield++;
3982  }
3983return -1;
3984}
3985
3986
3987/*************************************************
3988*    Adjust OP_RECURSE items in repeated group   *
3989*************************************************/
3990
3991/* OP_RECURSE items contain an offset from the start of the regex to the group
3992that is referenced. This means that groups can be replicated for fixed
3993repetition simply by copying (because the recursion is allowed to refer to
3994earlier groups that are outside the current group). However, when a group is
3995optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
3996inserted before it, after it has been compiled. This means that any OP_RECURSE
3997items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
3999is called, the partially compiled regex must be temporarily terminated with
4000OP_END.
4001
4002This function has been extended to cope with forward references for recursions
4003and subroutine calls. It must check the list of such references for the
4004group we are dealing with. If it finds that one of the recursions in the
4005current group is on this list, it does not adjust the value in the reference
4006(which is a group number). After the group has been scanned, all the offsets in
4007the forward reference list for the group are adjusted.
4008
4009Arguments:
4010  group      points to the start of the group
4011  adjust     the amount by which the group is to be moved
4012  utf        TRUE in UTF-8 / UTF-16 / UTF-32 mode
4013  cd         contains pointers to tables etc.
4014  save_hwm_offset   the hwm forward reference offset at the start of the group
4015
4016Returns:     nothing
4017*/
4018
4019static void
4020adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
4021  size_t save_hwm_offset)
4022{
4023int offset;
4024pcre_uchar *hc;
4025pcre_uchar *ptr = group;
4026
4027while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
4028  {
4029  for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4030       hc += LINK_SIZE)
4031    {
4032    offset = (int)GET(hc, 0);
4033    if (cd->start_code + offset == ptr + 1) break;
4034    }
4035
4036  /* If we have not found this recursion on the forward reference list, adjust
4037  the recursion's offset if it's after the start of this group. */
4038
4039  if (hc >= cd->hwm)
4040    {
4041    offset = (int)GET(ptr, 1);
4042    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
4043    }
4044
4045  ptr += 1 + LINK_SIZE;
4046  }
4047
4048/* Now adjust all forward reference offsets for the group. */
4049
4050for (hc = (pcre_uchar *)cd->start_workspace + save_hwm_offset; hc < cd->hwm;
4051     hc += LINK_SIZE)
4052  {
4053  offset = (int)GET(hc, 0);
4054  PUT(hc, 0, offset + adjust);
4055  }
4056}
4057
4058
4059
4060/*************************************************
4061*        Insert an automatic callout point       *
4062*************************************************/
4063
4064/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
4065callout points before each pattern item.
4066
4067Arguments:
4068  code           current code pointer
4069  ptr            current pattern pointer
4070  cd             pointers to tables etc
4071
4072Returns:         new code pointer
4073*/
4074
4075static pcre_uchar *
4076auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
4077{
4078*code++ = OP_CALLOUT;
4079*code++ = 255;
4080PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
4081PUT(code, LINK_SIZE, 0);                       /* Default length */
4082return code + 2 * LINK_SIZE;
4083}
4084
4085
4086
4087/*************************************************
4088*         Complete a callout item                *
4089*************************************************/
4090
4091/* A callout item contains the length of the next item in the pattern, which
4092we can't fill in till after we have reached the relevant point. This is used
4093for both automatic and manual callouts.
4094
4095Arguments:
4096  previous_callout   points to previous callout item
4097  ptr                current pattern pointer
4098  cd                 pointers to tables etc
4099
4100Returns:             nothing
4101*/
4102
4103static void
4104complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
4105{
4106int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
4107PUT(previous_callout, 2 + LINK_SIZE, length);
4108}
4109
4110
4111
4112#ifdef SUPPORT_UCP
4113/*************************************************
4114*           Get othercase range                  *
4115*************************************************/
4116
4117/* This function is passed the start and end of a class range, in UTF-8 mode
4118with UCP support. It searches up the characters, looking for ranges of
4119characters in the "other" case. Each call returns the next one, updating the
4120start address. A character with multiple other cases is returned on its own
4121with a special return value.
4122
4123Arguments:
4124  cptr        points to starting character value; updated
4125  d           end value
4126  ocptr       where to put start of othercase range
4127  odptr       where to put end of othercase range
4128
4129Yield:        -1 when no more
4130               0 when a range is returned
4131              >0 the CASESET offset for char with multiple other cases
4132                in this case, ocptr contains the original
4133*/
4134
4135static int
4136get_othercase_range(pcre_uint32 *cptr, pcre_uint32 d, pcre_uint32 *ocptr,
4137  pcre_uint32 *odptr)
4138{
4139pcre_uint32 c, othercase, next;
4140unsigned int co;
4141
4142/* Find the first character that has an other case. If it has multiple other
4143cases, return its case offset value. */
4144
4145for (c = *cptr; c <= d; c++)
4146  {
4147  if ((co = UCD_CASESET(c)) != 0)
4148    {
4149    *ocptr = c++;   /* Character that has the set */
4150    *cptr = c;      /* Rest of input range */
4151    return (int)co;
4152    }
4153  if ((othercase = UCD_OTHERCASE(c)) != c) break;
4154  }
4155
4156if (c > d) return -1;  /* Reached end of range */
4157
4158/* Found a character that has a single other case. Search for the end of the
4159range, which is either the end of the input range, or a character that has zero
4160or more than one other cases. */
4161
4162*ocptr = othercase;
4163next = othercase + 1;
4164
4165for (++c; c <= d; c++)
4166  {
4167  if ((co = UCD_CASESET(c)) != 0 || UCD_OTHERCASE(c) != next) break;
4168  next++;
4169  }
4170
4171*odptr = next - 1;     /* End of othercase range */
4172*cptr = c;             /* Rest of input range */
4173return 0;
4174}
4175#endif  /* SUPPORT_UCP */
4176
4177
4178
4179/*************************************************
4180*        Add a character or range to a class     *
4181*************************************************/
4182
4183/* This function packages up the logic of adding a character or range of
4184characters to a class. The character values in the arguments will be within the
4185valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
4186mutually recursive with the function immediately below.
4187
4188Arguments:
4189  classbits     the bit map for characters < 256
4190  uchardptr     points to the pointer for extra data
4191  options       the options word
4192  cd            contains pointers to tables etc.
4193  start         start of range character
4194  end           end of range character
4195
4196Returns:        the number of < 256 characters added
4197                the pointer to extra data is updated
4198*/
4199
4200static int
4201add_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4202  compile_data *cd, pcre_uint32 start, pcre_uint32 end)
4203{
4204pcre_uint32 c;
4205pcre_uint32 classbits_end = (end <= 0xff ? end : 0xff);
4206int n8 = 0;
4207
4208/* If caseless matching is required, scan the range and process alternate
4209cases. In Unicode, there are 8-bit characters that have alternate cases that
4210are greater than 255 and vice-versa. Sometimes we can just extend the original
4211range. */
4212
4213if ((options & PCRE_CASELESS) != 0)
4214  {
4215#ifdef SUPPORT_UCP
4216  if ((options & PCRE_UTF8) != 0)
4217    {
4218    int rc;
4219    pcre_uint32 oc, od;
4220
4221    options &= ~PCRE_CASELESS;   /* Remove for recursive calls */
4222    c = start;
4223
4224    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
4225      {
4226      /* Handle a single character that has more than one other case. */
4227
4228      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cd,
4229        PRIV(ucd_caseless_sets) + rc, oc);
4230
4231      /* Do nothing if the other case range is within the original range. */
4232
4233      else if (oc >= start && od <= end) continue;
4234
4235      /* Extend the original range if there is overlap, noting that if oc < c, we
4236      can't have od > end because a subrange is always shorter than the basic
4237      range. Otherwise, use a recursive call to add the additional range. */
4238
4239      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
4240      else if (od > end && oc <= end + 1)
4241        {
4242        end = od;       /* Extend upwards */
4243        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
4244        }
4245      else n8 += add_to_class(classbits, uchardptr, options, cd, oc, od);
4246      }
4247    }
4248  else
4249#endif  /* SUPPORT_UCP */
4250
4251  /* Not UTF-mode, or no UCP */
4252
4253  for (c = start; c <= classbits_end; c++)
4254    {
4255    SETBIT(classbits, cd->fcc[c]);
4256    n8++;
4257    }
4258  }
4259
4260/* Now handle the original range. Adjust the final value according to the bit
4261length - this means that the same lists of (e.g.) horizontal spaces can be used
4262in all cases. */
4263
4264#if defined COMPILE_PCRE8
4265#ifdef SUPPORT_UTF
4266  if ((options & PCRE_UTF8) == 0)
4267#endif
4268  if (end > 0xff) end = 0xff;
4269
4270#elif defined COMPILE_PCRE16
4271#ifdef SUPPORT_UTF
4272  if ((options & PCRE_UTF16) == 0)
4273#endif
4274  if (end > 0xffff) end = 0xffff;
4275
4276#endif /* COMPILE_PCRE[8|16] */
4277
4278/* Use the bitmap for characters < 256. Otherwise use extra data.*/
4279
4280for (c = start; c <= classbits_end; c++)
4281  {
4282  /* Regardless of start, c will always be <= 255. */
4283  SETBIT(classbits, c);
4284  n8++;
4285  }
4286
4287#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4288if (start <= 0xff) start = 0xff + 1;
4289
4290if (end >= start)
4291  {
4292  pcre_uchar *uchardata = *uchardptr;
4293#ifdef SUPPORT_UTF
4294  if ((options & PCRE_UTF8) != 0)  /* All UTFs use the same flag bit */
4295    {
4296    if (start < end)
4297      {
4298      *uchardata++ = XCL_RANGE;
4299      uchardata += PRIV(ord2utf)(start, uchardata);
4300      uchardata += PRIV(ord2utf)(end, uchardata);
4301      }
4302    else if (start == end)
4303      {
4304      *uchardata++ = XCL_SINGLE;
4305      uchardata += PRIV(ord2utf)(start, uchardata);
4306      }
4307    }
4308  else
4309#endif  /* SUPPORT_UTF */
4310
4311  /* Without UTF support, character values are constrained by the bit length,
4312  and can only be > 256 for 16-bit and 32-bit libraries. */
4313
4314#ifdef COMPILE_PCRE8
4315    {}
4316#else
4317  if (start < end)
4318    {
4319    *uchardata++ = XCL_RANGE;
4320    *uchardata++ = start;
4321    *uchardata++ = end;
4322    }
4323  else if (start == end)
4324    {
4325    *uchardata++ = XCL_SINGLE;
4326    *uchardata++ = start;
4327    }
4328#endif
4329
4330  *uchardptr = uchardata;   /* Updata extra data pointer */
4331  }
4332#endif /* SUPPORT_UTF || !COMPILE_PCRE8 */
4333
4334return n8;    /* Number of 8-bit characters */
4335}
4336
4337
4338
4339
4340/*************************************************
4341*        Add a list of characters to a class     *
4342*************************************************/
4343
4344/* This function is used for adding a list of case-equivalent characters to a
4345class, and also for adding a list of horizontal or vertical whitespace. If the
4346list is in order (which it should be), ranges of characters are detected and
4347handled appropriately. This function is mutually recursive with the function
4348above.
4349
4350Arguments:
4351  classbits     the bit map for characters < 256
4352  uchardptr     points to the pointer for extra data
4353  options       the options word
4354  cd            contains pointers to tables etc.
4355  p             points to row of 32-bit values, terminated by NOTACHAR
4356  except        character to omit; this is used when adding lists of
4357                  case-equivalent characters to avoid including the one we
4358                  already know about
4359
4360Returns:        the number of < 256 characters added
4361                the pointer to extra data is updated
4362*/
4363
4364static int
4365add_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr, int options,
4366  compile_data *cd, const pcre_uint32 *p, unsigned int except)
4367{
4368int n8 = 0;
4369while (p[0] < NOTACHAR)
4370  {
4371  int n = 0;
4372  if (p[0] != except)
4373    {
4374    while(p[n+1] == p[0] + n + 1) n++;
4375    n8 += add_to_class(classbits, uchardptr, options, cd, p[0], p[n]);
4376    }
4377  p += n + 1;
4378  }
4379return n8;
4380}
4381
4382
4383
4384/*************************************************
4385*    Add characters not in a list to a class     *
4386*************************************************/
4387
4388/* This function is used for adding the complement of a list of horizontal or
4389vertical whitespace to a class. The list must be in order.
4390
4391Arguments:
4392  classbits     the bit map for characters < 256
4393  uchardptr     points to the pointer for extra data
4394  options       the options word
4395  cd            contains pointers to tables etc.
4396  p             points to row of 32-bit values, terminated by NOTACHAR
4397
4398Returns:        the number of < 256 characters added
4399                the pointer to extra data is updated
4400*/
4401
4402static int
4403add_not_list_to_class(pcre_uint8 *classbits, pcre_uchar **uchardptr,
4404  int options, compile_data *cd, const pcre_uint32 *p)
4405{
4406BOOL utf = (options & PCRE_UTF8) != 0;
4407int n8 = 0;
4408if (p[0] > 0)
4409  n8 += add_to_class(classbits, uchardptr, options, cd, 0, p[0] - 1);
4410while (p[0] < NOTACHAR)
4411  {
4412  while (p[1] == p[0] + 1) p++;
4413  n8 += add_to_class(classbits, uchardptr, options, cd, p[0] + 1,
4414    (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
4415  p++;
4416  }
4417return n8;
4418}
4419
4420
4421
4422/*************************************************
4423*           Compile one branch                   *
4424*************************************************/
4425
/* Scan the pattern, compiling it into a vector. If the options are
4427changed during the branch, the pointer is used to change the external options
4428bits. This function is used during the pre-compile phase when we are trying
4429to find out the amount of memory needed, as well as during the real compile
4430phase. The value of lengthptr distinguishes the two phases.
4431
4432Arguments:
4433  optionsptr        pointer to the option bits
4434  codeptr           points to the pointer to the current code point
4435  ptrptr            points to the current pattern pointer
4436  errorcodeptr      points to error code variable
4437  firstcharptr      place to put the first required character
4438  firstcharflagsptr place to put the first character flags, or a negative number
4439  reqcharptr        place to put the last required character
4440  reqcharflagsptr   place to put the last required character flags, or a negative number
4441  bcptr             points to current branch chain
4442  cond_depth        conditional nesting depth
4443  cd                contains pointers to tables etc.
4444  lengthptr         NULL during the real compile phase
4445                    points to length accumulator during pre-compile phase
4446
4447Returns:            TRUE on success
4448                    FALSE, with *errorcodeptr set non-zero on error
4449*/
4450
4451static BOOL
4452compile_branch(int *optionsptr, pcre_uchar **codeptr,
4453  const pcre_uchar **ptrptr, int *errorcodeptr,
4454  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
4455  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
4456  branch_chain *bcptr, int cond_depth,
4457  compile_data *cd, int *lengthptr)
4458{
4459int repeat_type, op_type;
4460int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
4461int bravalue = 0;
4462int greedy_default, greedy_non_default;
4463pcre_uint32 firstchar, reqchar;
4464pcre_int32 firstcharflags, reqcharflags;
4465pcre_uint32 zeroreqchar, zerofirstchar;
4466pcre_int32 zeroreqcharflags, zerofirstcharflags;
4467pcre_int32 req_caseopt, reqvary, tempreqvary;
4468int options = *optionsptr;               /* May change dynamically */
4469int after_manual_callout = 0;
4470int length_prevgroup = 0;
4471register pcre_uint32 c;
4472int escape;
4473register pcre_uchar *code = *codeptr;
4474pcre_uchar *last_code = code;
4475pcre_uchar *orig_code = code;
4476pcre_uchar *tempcode;
4477BOOL inescq = FALSE;
4478BOOL groupsetfirstchar = FALSE;
4479const pcre_uchar *ptr = *ptrptr;
4480const pcre_uchar *tempptr;
4481const pcre_uchar *nestptr = NULL;
4482pcre_uchar *previous = NULL;
4483pcre_uchar *previous_callout = NULL;
4484size_t item_hwm_offset = 0;
4485pcre_uint8 classbits[32];
4486
4487/* We can fish out the UTF-8 setting once and for all into a BOOL, but we
4488must not do this for other options (e.g. PCRE_EXTENDED) because they may change
4489dynamically as we process the pattern. */
4490
4491#ifdef SUPPORT_UTF
4492/* PCRE_UTF[16|32] have the same value as PCRE_UTF8. */
4493BOOL utf = (options & PCRE_UTF8) != 0;
4494#ifndef COMPILE_PCRE32
4495pcre_uchar utf_chars[6];
4496#endif
4497#else
4498BOOL utf = FALSE;
4499#endif
4500
4501/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
4502class_uchardata always so that it can be passed to add_to_class() always,
4503though it will not be used in non-UTF 8-bit cases. This avoids having to supply
4504alternative calls for the different cases. */
4505
4506pcre_uchar *class_uchardata;
4507#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4508BOOL xclass;
4509pcre_uchar *class_uchardata_base;
4510#endif
4511
4512#ifdef PCRE_DEBUG
4513if (lengthptr != NULL) DPRINTF((">> start branch\n"));
4514#endif
4515
4516/* Set up the default and non-default settings for greediness */
4517
4518greedy_default = ((options & PCRE_UNGREEDY) != 0);
4519greedy_non_default = greedy_default ^ 1;
4520
4521/* Initialize no first byte, no required byte. REQ_UNSET means "no char
4522matching encountered yet". It gets changed to REQ_NONE if we hit something that
4523matches a non-fixed char first char; reqchar just remains unset if we never
4524find one.
4525
4526When we hit a repeat whose minimum is zero, we may have to adjust these values
4527to take the zero repeat into account. This is implemented by setting them to
zerofirstchar and zeroreqchar when such a repeat is encountered. The individual
4529item types that can be repeated set these backoff variables appropriately. */
4530
4531firstchar = reqchar = zerofirstchar = zeroreqchar = 0;
4532firstcharflags = reqcharflags = zerofirstcharflags = zeroreqcharflags = REQ_UNSET;
4533
4534/* The variable req_caseopt contains either the REQ_CASELESS value
4535or zero, according to the current setting of the caseless flag. The
REQ_CASELESS leaves the lower 28 bits empty. It is added into the
4537firstchar or reqchar variables to record the case status of the
4538value. This is used only for ASCII characters. */
4539
4540req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
4541
4542/* Switch on next character until the end of the branch */
4543
4544for (;; ptr++)
4545  {
4546  BOOL negate_class;
4547  BOOL should_flip_negation;
4548  BOOL possessive_quantifier;
4549  BOOL is_quantifier;
4550  BOOL is_recurse;
4551  BOOL reset_bracount;
4552  int class_has_8bitchar;
4553  int class_one_char;
4554#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4555  BOOL xclass_has_prop;
4556#endif
4557  int newoptions;
4558  int recno;
4559  int refsign;
4560  int skipbytes;
4561  pcre_uint32 subreqchar, subfirstchar;
4562  pcre_int32 subreqcharflags, subfirstcharflags;
4563  int terminator;
4564  unsigned int mclength;
4565  unsigned int tempbracount;
4566  pcre_uint32 ec;
4567  pcre_uchar mcbuffer[8];
4568
4569  /* Get next character in the pattern */
4570
4571  c = *ptr;
4572
4573  /* If we are at the end of a nested substitution, revert to the outer level
4574  string. Nesting only happens one level deep. */
4575
4576  if (c == CHAR_NULL && nestptr != NULL)
4577    {
4578    ptr = nestptr;
4579    nestptr = NULL;
4580    c = *ptr;
4581    }
4582
4583  /* If we are in the pre-compile phase, accumulate the length used for the
4584  previous cycle of this loop. */
4585
4586  if (lengthptr != NULL)
4587    {
4588#ifdef PCRE_DEBUG
4589    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
4590#endif
4591    if (code > cd->start_workspace + cd->workspace_size -
4592        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
4593      {
4594      *errorcodeptr = ERR52;
4595      goto FAILED;
4596      }
4597
4598    /* There is at least one situation where code goes backwards: this is the
4599    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4600    the class is simply eliminated. However, it is created first, so we have to
4601    allow memory for it. Therefore, don't ever reduce the length at this point.
4602    */
4603
4604    if (code < last_code) code = last_code;
4605
4606    /* Paranoid check for integer overflow */
4607
4608    if (OFLOW_MAX - *lengthptr < code - last_code)
4609      {
4610      *errorcodeptr = ERR20;
4611      goto FAILED;
4612      }
4613
4614    *lengthptr += (int)(code - last_code);
4615    DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
4616      (int)(code - last_code), c, c));
4617
4618    /* If "previous" is set and it is not at the start of the work space, move
4619    it back to there, in order to avoid filling up the work space. Otherwise,
4620    if "previous" is NULL, reset the current code pointer to the start. */
4621
4622    if (previous != NULL)
4623      {
4624      if (previous > orig_code)
4625        {
4626        memmove(orig_code, previous, IN_UCHARS(code - previous));
4627        code -= previous - orig_code;
4628        previous = orig_code;
4629        }
4630      }
4631    else code = orig_code;
4632
4633    /* Remember where this code item starts so we can pick up the length
4634    next time round. */
4635
4636    last_code = code;
4637    }
4638
4639  /* In the real compile phase, just check the workspace used by the forward
4640  reference list. */
4641
4642  else if (cd->hwm > cd->start_workspace + cd->workspace_size)
4643    {
4644    *errorcodeptr = ERR52;
4645    goto FAILED;
4646    }
4647
4648  /* If in \Q...\E, check for the end; if not, we have a literal */
4649
4650  if (inescq && c != CHAR_NULL)
4651    {
4652    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4653      {
4654      inescq = FALSE;
4655      ptr++;
4656      continue;
4657      }
4658    else
4659      {
4660      if (previous_callout != NULL)
4661        {
4662        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4663          complete_callout(previous_callout, ptr, cd);
4664        previous_callout = NULL;
4665        }
4666      if ((options & PCRE_AUTO_CALLOUT) != 0)
4667        {
4668        previous_callout = code;
4669        code = auto_callout(code, ptr, cd);
4670        }
4671      goto NORMAL_CHAR;
4672      }
4673    /* Control does not reach here. */
4674    }
4675
4676  /* In extended mode, skip white space and comments. We need a loop in order
4677  to check for more white space and more comments after a comment. */
4678
4679  if ((options & PCRE_EXTENDED) != 0)
4680    {
4681    for (;;)
4682      {
4683      while (MAX_255(c) && (cd->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4684      if (c != CHAR_NUMBER_SIGN) break;
4685      ptr++;
4686      while (*ptr != CHAR_NULL)
4687        {
4688        if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4689          {                          /* IS_NEWLINE sets cd->nllen. */
4690          ptr += cd->nllen;
4691          break;
4692          }
4693        ptr++;
4694#ifdef SUPPORT_UTF
4695        if (utf) FORWARDCHAR(ptr);
4696#endif
4697        }
4698      c = *ptr;     /* Either NULL or the char after a newline */
4699      }
4700    }
4701
4702  /* See if the next thing is a quantifier. */
4703
4704  is_quantifier =
4705    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4706    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4707
4708  /* Fill in length of a previous callout, except when the next thing is a
4709  quantifier or when processing a property substitution string in UCP mode. */
4710
4711  if (!is_quantifier && previous_callout != NULL && nestptr == NULL &&
4712       after_manual_callout-- <= 0)
4713    {
4714    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4715      complete_callout(previous_callout, ptr, cd);
4716    previous_callout = NULL;
4717    }
4718
4719  /* Create auto callout, except for quantifiers, or while processing property
4720  strings that are substituted for \w etc in UCP mode. */
4721
4722  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier && nestptr == NULL)
4723    {
4724    previous_callout = code;
4725    code = auto_callout(code, ptr, cd);
4726    }
4727
4728  /* Process the next pattern item. */
4729
4730  switch(c)
4731    {
4732    /* ===================================================================*/
4733    case CHAR_NULL:                /* The branch terminates at string end */
4734    case CHAR_VERTICAL_LINE:       /* or | or ) */
4735    case CHAR_RIGHT_PARENTHESIS:
4736    *firstcharptr = firstchar;
4737    *firstcharflagsptr = firstcharflags;
4738    *reqcharptr = reqchar;
4739    *reqcharflagsptr = reqcharflags;
4740    *codeptr = code;
4741    *ptrptr = ptr;
4742    if (lengthptr != NULL)
4743      {
4744      if (OFLOW_MAX - *lengthptr < code - last_code)
4745        {
4746        *errorcodeptr = ERR20;
4747        goto FAILED;
4748        }
4749      *lengthptr += (int)(code - last_code);   /* To include callout length */
4750      DPRINTF((">> end branch\n"));
4751      }
4752    return TRUE;
4753
4754
4755    /* ===================================================================*/
4756    /* Handle single-character metacharacters. In multiline mode, ^ disables
4757    the setting of any following char as a first character. */
4758
4759    case CHAR_CIRCUMFLEX_ACCENT:
4760    previous = NULL;
4761    if ((options & PCRE_MULTILINE) != 0)
4762      {
4763      if (firstcharflags == REQ_UNSET)
4764        zerofirstcharflags = firstcharflags = REQ_NONE;
4765      *code++ = OP_CIRCM;
4766      }
4767    else *code++ = OP_CIRC;
4768    break;
4769
4770    case CHAR_DOLLAR_SIGN:
4771    previous = NULL;
4772    *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4773    break;
4774
4775    /* There can never be a first char if '.' is first, whatever happens about
4776    repeats. The value of reqchar doesn't change either. */
4777
4778    case CHAR_DOT:
4779    if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4780    zerofirstchar = firstchar;
4781    zerofirstcharflags = firstcharflags;
4782    zeroreqchar = reqchar;
4783    zeroreqcharflags = reqcharflags;
4784    previous = code;
4785    item_hwm_offset = cd->hwm - cd->start_workspace;
4786    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4787    break;
4788
4789
4790    /* ===================================================================*/
4791    /* Character classes. If the included characters are all < 256, we build a
4792    32-byte bitmap of the permitted characters, except in the special case
4793    where there is only one such character. For negated classes, we build the
4794    map as usual, then invert it at the end. However, we use a different opcode
4795    so that data characters > 255 can be handled correctly.
4796
4797    If the class contains characters outside the 0-255 range, a different
4798    opcode is compiled. It may optionally have a bit map for characters < 256,
4799    but those above are explicitly listed afterwards. A flag byte tells
4800    whether the bitmap is present, and whether this is a negated class or not.
4801
4802    In JavaScript compatibility mode, an isolated ']' causes an error. In
4803    default (Perl) mode, it is treated as a data character. */
4804
4805    case CHAR_RIGHT_SQUARE_BRACKET:
4806    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4807      {
4808      *errorcodeptr = ERR64;
4809      goto FAILED;
4810      }
4811    goto NORMAL_CHAR;
4812
4813    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4814    used for "start of word" and "end of word". As these are otherwise illegal
4815    sequences, we don't break anything by recognizing them. They are replaced
4816    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
4817    erroneous and are handled by the normal code below. */
4818
4819    case CHAR_LEFT_SQUARE_BRACKET:
4820    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4821      {
4822      nestptr = ptr + 7;
4823      ptr = sub_start_of_word - 1;
4824      continue;
4825      }
4826
4827    if (STRNCMP_UC_C8(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4828      {
4829      nestptr = ptr + 7;
4830      ptr = sub_end_of_word - 1;
4831      continue;
4832      }
4833
4834    /* Handle a real character class. */
4835
4836    previous = code;
4837    item_hwm_offset = cd->hwm - cd->start_workspace;
4838
4839    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4840    they are encountered at the top level, so we'll do that too. */
4841
4842    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4843         ptr[1] == CHAR_EQUALS_SIGN) &&
4844        check_posix_syntax(ptr, &tempptr))
4845      {
4846      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
4847      goto FAILED;
4848      }
4849
4850    /* If the first character is '^', set the negation flag and skip it. Also,
4851    if the first few characters (either before or after ^) are \Q\E or \E we
4852    skip them too. This makes for compatibility with Perl. */
4853
4854    negate_class = FALSE;
4855    for (;;)
4856      {
4857      c = *(++ptr);
4858      if (c == CHAR_BACKSLASH)
4859        {
4860        if (ptr[1] == CHAR_E)
4861          ptr++;
4862        else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4863          ptr += 3;
4864        else
4865          break;
4866        }
4867      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4868        negate_class = TRUE;
4869      else break;
4870      }
4871
4872    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
4873    an initial ']' is taken as a data character -- the code below handles
4874    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
4875    [^] must match any character, so generate OP_ALLANY. */
4876
4877    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4878        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
4879      {
4880      *code++ = negate_class? OP_ALLANY : OP_FAIL;
4881      if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
4882      zerofirstchar = firstchar;
4883      zerofirstcharflags = firstcharflags;
4884      break;
4885      }
4886
4887    /* If a class contains a negative special such as \S, we need to flip the
4888    negation flag at the end, so that support for characters > 255 works
4889    correctly (they are all included in the class). */
4890
4891    should_flip_negation = FALSE;
4892
4893    /* Extended class (xclass) will be used when characters > 255
4894    might match. */
4895
4896#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4897    xclass = FALSE;
4898    class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4899    class_uchardata_base = class_uchardata;   /* Save the start */
4900#endif
4901
4902    /* For optimization purposes, we track some properties of the class:
4903    class_has_8bitchar will be non-zero if the class contains at least one <
4904    256 character; class_one_char will be 1 if the class contains just one
4905    character; xclass_has_prop will be TRUE if unicode property checks
4906    are present in the class. */
4907
4908    class_has_8bitchar = 0;
4909    class_one_char = 0;
4910#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4911    xclass_has_prop = FALSE;
4912#endif
4913
4914    /* Initialize the 32-char bit map to all zeros. We build the map in a
4915    temporary bit of memory, in case the class contains fewer than two
4916    8-bit characters because in that case the compiled code doesn't use the bit
4917    map. */
4918
4919    memset(classbits, 0, 32 * sizeof(pcre_uint8));
4920
4921    /* Process characters until ] is reached. By writing this as a "do" it
4922    means that an initial ] is taken as a data character. At the start of the
4923    loop, c contains the first byte of the character. */
4924
4925    if (c != CHAR_NULL) do
4926      {
4927      const pcre_uchar *oldptr;
4928
4929#ifdef SUPPORT_UTF
4930      if (utf && HAS_EXTRALEN(c))
4931        {                           /* Braces are required because the */
4932        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4933        }
4934#endif
4935
4936#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4937      /* In the pre-compile phase, accumulate the length of any extra
4938      data and reset the pointer. This is so that very large classes that
4939      contain a zillion > 255 characters no longer overwrite the work space
4940      (which is on the stack). We have to remember that there was XCLASS data,
4941      however. */
4942
4943      if (class_uchardata > class_uchardata_base) xclass = TRUE;
4944
4945      if (lengthptr != NULL && class_uchardata > class_uchardata_base)
4946        {
4947        *lengthptr += (int)(class_uchardata - class_uchardata_base);
4948        class_uchardata = class_uchardata_base;
4949        }
4950#endif
4951
4952      /* Inside \Q...\E everything is literal except \E */
4953
4954      if (inescq)
4955        {
4956        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4957          {
4958          inescq = FALSE;                   /* Reset literal state */
4959          ptr++;                            /* Skip the 'E' */
4960          continue;                         /* Carry on with next */
4961          }
4962        goto CHECK_RANGE;                   /* Could be range if \E follows */
4963        }
4964
4965      /* Handle POSIX class names. Perl allows a negation extension of the
4966      form [:^name:]. A square bracket that doesn't match the syntax is
4967      treated as a literal. We also recognize the POSIX constructions
4968      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4969      5.6 and 5.8 do. */
4970
4971      if (c == CHAR_LEFT_SQUARE_BRACKET &&
4972          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4973           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4974        {
4975        BOOL local_negate = FALSE;
4976        int posix_class, taboffset, tabopt;
4977        register const pcre_uint8 *cbits = cd->cbits;
4978        pcre_uint8 pbits[32];
4979
4980        if (ptr[1] != CHAR_COLON)
4981          {
4982          *errorcodeptr = ERR31;
4983          goto FAILED;
4984          }
4985
4986        ptr += 2;
4987        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4988          {
4989          local_negate = TRUE;
4990          should_flip_negation = TRUE;  /* Note negative special */
4991          ptr++;
4992          }
4993
4994        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4995        if (posix_class < 0)
4996          {
4997          *errorcodeptr = ERR30;
4998          goto FAILED;
4999          }
5000
5001        /* If matching is caseless, upper and lower are converted to
5002        alpha. This relies on the fact that the class table starts with
5003        alpha, lower, upper as the first 3 entries. */
5004
5005        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
5006          posix_class = 0;
5007
5008        /* When PCRE_UCP is set, some of the POSIX classes are converted to
5009        different escape sequences that use Unicode properties \p or \P. Others
5010        that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
5011        directly. */
5012
5013#ifdef SUPPORT_UCP
5014        if ((options & PCRE_UCP) != 0)
5015          {
5016          unsigned int ptype = 0;
5017          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
5018
5019          /* The posix_substitutes table specifies which POSIX classes can be
5020          converted to \p or \P items. */
5021
5022          if (posix_substitutes[pc] != NULL)
5023            {
5024            nestptr = tempptr + 1;
5025            ptr = posix_substitutes[pc] - 1;
5026            continue;
5027            }
5028
5029          /* There are three other classes that generate special property calls
5030          that are recognized only in an XCLASS. */
5031
5032          else switch(posix_class)
5033            {
5034            case PC_GRAPH:
5035            ptype = PT_PXGRAPH;
5036            /* Fall through */
5037            case PC_PRINT:
5038            if (ptype == 0) ptype = PT_PXPRINT;
5039            /* Fall through */
5040            case PC_PUNCT:
5041            if (ptype == 0) ptype = PT_PXPUNCT;
5042            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
5043            *class_uchardata++ = ptype;
5044            *class_uchardata++ = 0;
5045            xclass_has_prop = TRUE;
5046            ptr = tempptr + 1;
5047            continue;
5048
5049            /* For the other POSIX classes (ascii, xdigit) we are going to fall
5050            through to the non-UCP case and build a bit map for characters with
5051            code points less than 256. If we are in a negated POSIX class
5052            within a non-negated overall class, characters with code points
5053            greater than 255 must all match. In the special case where we have
5054            not yet generated any xclass data, and this is the final item in
5055            the overall class, we need do nothing: later on, the opcode
5056            OP_NCLASS will be used to indicate that characters greater than 255
5057            are acceptable. If we have already seen an xclass item or one may
5058            follow (we have to assume that it might if this is not the end of
5059            the class), explicitly match all wide codepoints. */
5060
5061            default:
5062            if (!negate_class && local_negate &&
5063                (xclass || tempptr[2] != CHAR_RIGHT_SQUARE_BRACKET))
5064              {
5065              *class_uchardata++ = XCL_RANGE;
5066              class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5067              class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
5068              }
5069            break;
5070            }
5071          }
5072#endif
5073        /* In the non-UCP case, or when UCP makes no difference, we build the
5074        bit map for the POSIX class in a chunk of local store because we may be
5075        adding and subtracting from it, and we don't want to subtract bits that
5076        may be in the main map already. At the end we or the result into the
5077        bit map that is being built. */
5078
5079        posix_class *= 3;
5080
5081        /* Copy in the first table (always present) */
5082
5083        memcpy(pbits, cbits + posix_class_maps[posix_class],
5084          32 * sizeof(pcre_uint8));
5085
5086        /* If there is a second table, add or remove it as required. */
5087
5088        taboffset = posix_class_maps[posix_class + 1];
5089        tabopt = posix_class_maps[posix_class + 2];
5090
5091        if (taboffset >= 0)
5092          {
5093          if (tabopt >= 0)
5094            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
5095          else
5096            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
5097          }
5098
5099        /* Now see if we need to remove any special characters. An option
5100        value of 1 removes vertical space and 2 removes underscore. */
5101
5102        if (tabopt < 0) tabopt = -tabopt;
5103        if (tabopt == 1) pbits[1] &= ~0x3c;
5104          else if (tabopt == 2) pbits[11] &= 0x7f;
5105
5106        /* Add the POSIX table or its complement into the main table that is
5107        being built and we are done. */
5108
5109        if (local_negate)
5110          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
5111        else
5112          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
5113
5114        ptr = tempptr + 1;
5115        /* Every class contains at least one < 256 character. */
5116        class_has_8bitchar = 1;
5117        /* Every class contains at least two characters. */
5118        class_one_char = 2;
5119        continue;    /* End of POSIX syntax handling */
5120        }
5121
5122      /* Backslash may introduce a single character, or it may introduce one
5123      of the specials, which just set a flag. The sequence \b is a special
5124      case. Inside a class (and only there) it is treated as backspace. We
5125      assume that other escapes have more than one character in them, so
5126      speculatively set both class_has_8bitchar and class_one_char bigger
5127      than one. Unrecognized escapes fall through and are either treated
5128      as literal characters (by default), or are faulted if
5129      PCRE_EXTRA is set. */
5130
5131      if (c == CHAR_BACKSLASH)
5132        {
5133        escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options,
5134          TRUE);
5135        if (*errorcodeptr != 0) goto FAILED;
5136        if (escape == 0) c = ec;
5137        else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
5138        else if (escape == ESC_N)          /* \N is not supported in a class */
5139          {
5140          *errorcodeptr = ERR71;
5141          goto FAILED;
5142          }
5143        else if (escape == ESC_Q)            /* Handle start of quoted string */
5144          {
5145          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5146            {
5147            ptr += 2; /* avoid empty string */
5148            }
5149          else inescq = TRUE;
5150          continue;
5151          }
5152        else if (escape == ESC_E) continue;  /* Ignore orphan \E */
5153
5154        else
5155          {
5156          register const pcre_uint8 *cbits = cd->cbits;
5157          /* Every class contains at least two < 256 characters. */
5158          class_has_8bitchar++;
5159          /* Every class contains at least two characters. */
5160          class_one_char += 2;
5161
5162          switch (escape)
5163            {
5164#ifdef SUPPORT_UCP
5165            case ESC_du:     /* These are the values given for \d etc */
5166            case ESC_DU:     /* when PCRE_UCP is set. We replace the */
5167            case ESC_wu:     /* escape sequence with an appropriate \p */
5168            case ESC_WU:     /* or \P to test Unicode properties instead */
5169            case ESC_su:     /* of the default ASCII testing. */
5170            case ESC_SU:
5171            nestptr = ptr;
5172            ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
5173            class_has_8bitchar--;                /* Undo! */
5174            continue;
5175#endif
5176            case ESC_d:
5177            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
5178            continue;
5179
5180            case ESC_D:
5181            should_flip_negation = TRUE;
5182            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
5183            continue;
5184
5185            case ESC_w:
5186            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
5187            continue;
5188
5189            case ESC_W:
5190            should_flip_negation = TRUE;
5191            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
5192            continue;
5193
5194            /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
5195            5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
5196            previously set by something earlier in the character class.
5197            Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
5198            we could just adjust the appropriate bit. From PCRE 8.34 we no
5199            longer treat \s and \S specially. */
5200
5201            case ESC_s:
5202            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
5203            continue;
5204
5205            case ESC_S:
5206            should_flip_negation = TRUE;
5207            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
5208            continue;
5209
5210            /* The rest apply in both UCP and non-UCP cases. */
5211
5212            case ESC_h:
5213            (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5214              PRIV(hspace_list), NOTACHAR);
5215            continue;
5216
5217            case ESC_H:
5218            (void)add_not_list_to_class(classbits, &class_uchardata, options,
5219              cd, PRIV(hspace_list));
5220            continue;
5221
5222            case ESC_v:
5223            (void)add_list_to_class(classbits, &class_uchardata, options, cd,
5224              PRIV(vspace_list), NOTACHAR);
5225            continue;
5226
5227            case ESC_V:
5228            (void)add_not_list_to_class(classbits, &class_uchardata, options,
5229              cd, PRIV(vspace_list));
5230            continue;
5231
5232            case ESC_p:
5233            case ESC_P:
5234#ifdef SUPPORT_UCP
5235              {
5236              BOOL negated;
5237              unsigned int ptype = 0, pdata = 0;
5238              if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
5239                goto FAILED;
5240              *class_uchardata++ = ((escape == ESC_p) != negated)?
5241                XCL_PROP : XCL_NOTPROP;
5242              *class_uchardata++ = ptype;
5243              *class_uchardata++ = pdata;
5244              xclass_has_prop = TRUE;
5245              class_has_8bitchar--;                /* Undo! */
5246              continue;
5247              }
5248#else
5249            *errorcodeptr = ERR45;
5250            goto FAILED;
5251#endif
5252            /* Unrecognized escapes are faulted if PCRE is running in its
5253            strict mode. By default, for compatibility with Perl, they are
5254            treated as literals. */
5255
5256            default:
5257            if ((options & PCRE_EXTRA) != 0)
5258              {
5259              *errorcodeptr = ERR7;
5260              goto FAILED;
5261              }
5262            class_has_8bitchar--;    /* Undo the speculative increase. */
5263            class_one_char -= 2;     /* Undo the speculative increase. */
5264            c = *ptr;                /* Get the final character and fall through */
5265            break;
5266            }
5267          }
5268
5269        /* Fall through if the escape just defined a single character (c >= 0).
5270        This may be greater than 256. */
5271
5272        escape = 0;
5273
5274        }   /* End of backslash handling */
5275
5276      /* A character may be followed by '-' to form a range. However, Perl does
5277      not permit ']' to be the end of the range. A '-' character at the end is
5278      treated as a literal. Perl ignores orphaned \E sequences entirely. The
5279      code for handling \Q and \E is messy. */
5280
5281      CHECK_RANGE:
5282      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5283        {
5284        inescq = FALSE;
5285        ptr += 2;
5286        }
5287      oldptr = ptr;
5288
5289      /* Remember if \r or \n were explicitly used */
5290
5291      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5292
5293      /* Check for range */
5294
5295      if (!inescq && ptr[1] == CHAR_MINUS)
5296        {
5297        pcre_uint32 d;
5298        ptr += 2;
5299        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
5300
5301        /* If we hit \Q (not followed by \E) at this point, go into escaped
5302        mode. */
5303
5304        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
5305          {
5306          ptr += 2;
5307          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
5308            { ptr += 2; continue; }
5309          inescq = TRUE;
5310          break;
5311          }
5312
5313        /* Minus (hyphen) at the end of a class is treated as a literal, so put
5314        back the pointer and jump to handle the character that preceded it. */
5315
5316        if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
5317          {
5318          ptr = oldptr;
5319          goto CLASS_SINGLE_CHARACTER;
5320          }
5321
5322        /* Otherwise, we have a potential range; pick up the next character */
5323
5324#ifdef SUPPORT_UTF
5325        if (utf)
5326          {                           /* Braces are required because the */
5327          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
5328          }
5329        else
5330#endif
5331        d = *ptr;  /* Not UTF-8 mode */
5332
5333        /* The second part of a range can be a single-character escape
5334        sequence, but not any of the other escapes. Perl treats a hyphen as a
5335        literal in such circumstances. However, in Perl's warning mode, a
5336        warning is given, so PCRE now faults it as it is almost certainly a
5337        mistake on the user's part. */
5338
5339        if (!inescq)
5340          {
5341          if (d == CHAR_BACKSLASH)
5342            {
5343            int descape;
5344            descape = check_escape(&ptr, &d, errorcodeptr, cd->bracount, options, TRUE);
5345            if (*errorcodeptr != 0) goto FAILED;
5346
5347            /* 0 means a character was put into d; \b is backspace; any other
5348            special causes an error. */
5349
5350            if (descape != 0)
5351              {
5352              if (descape == ESC_b) d = CHAR_BS; else
5353                {
5354                *errorcodeptr = ERR83;
5355                goto FAILED;
5356                }
5357              }
5358            }
5359
5360          /* A hyphen followed by a POSIX class is treated in the same way. */
5361
5362          else if (d == CHAR_LEFT_SQUARE_BRACKET &&
5363                   (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
5364                    ptr[1] == CHAR_EQUALS_SIGN) &&
5365                   check_posix_syntax(ptr, &tempptr))
5366            {
5367            *errorcodeptr = ERR83;
5368            goto FAILED;
5369            }
5370          }
5371
5372        /* Check that the two values are in the correct order. Optimize
5373        one-character ranges. */
5374
5375        if (d < c)
5376          {
5377          *errorcodeptr = ERR8;
5378          goto FAILED;
5379          }
5380        if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
5381
5382        /* We have found a character range, so single character optimizations
5383        cannot be done anymore. Any value greater than 1 indicates that there
5384        is more than one character. */
5385
5386        class_one_char = 2;
5387
5388        /* Remember an explicit \r or \n, and add the range to the class. */
5389
5390        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
5391
5392        class_has_8bitchar +=
5393          add_to_class(classbits, &class_uchardata, options, cd, c, d);
5394
5395        continue;   /* Go get the next char in the class */
5396        }
5397
5398      /* Handle a single character - we can get here for a normal non-escape
5399      char, or after \ that introduces a single character or for an apparent
5400      range that isn't. Only the value 1 matters for class_one_char, so don't
5401      increase it if it is already 2 or more ... just in case there's a class
5402      with a zillion characters in it. */
5403
5404      CLASS_SINGLE_CHARACTER:
5405      if (class_one_char < 2) class_one_char++;
5406
5407      /* If xclass_has_prop is false and class_one_char is 1, we have the first
5408      single character in the class, and there have been no prior ranges, or
5409      XCLASS items generated by escapes. If this is the final character in the
5410      class, we can optimize by turning the item into a 1-character OP_CHAR[I]
5411      if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
5412      can cause firstchar to be set. Otherwise, there can be no first char if
5413      this item is first, whatever repeat count may follow. In the case of
5414      reqchar, save the previous value for reinstating. */
5415
5416      if (!inescq &&
5417#ifdef SUPPORT_UCP
5418          !xclass_has_prop &&
5419#endif
5420          class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
5421        {
5422        ptr++;
5423        zeroreqchar = reqchar;
5424        zeroreqcharflags = reqcharflags;
5425
5426        if (negate_class)
5427          {
5428#ifdef SUPPORT_UCP
5429          int d;
5430#endif
5431          if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5432          zerofirstchar = firstchar;
5433          zerofirstcharflags = firstcharflags;
5434
5435          /* For caseless UTF-8 mode when UCP support is available, check
5436          whether this character has more than one other case. If so, generate
5437          a special OP_NOTPROP item instead of OP_NOTI. */
5438
5439#ifdef SUPPORT_UCP
5440          if (utf && (options & PCRE_CASELESS) != 0 &&
5441              (d = UCD_CASESET(c)) != 0)
5442            {
5443            *code++ = OP_NOTPROP;
5444            *code++ = PT_CLIST;
5445            *code++ = d;
5446            }
5447          else
5448#endif
5449          /* Char has only one other case, or UCP not available */
5450
5451            {
5452            *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
5453#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5454            if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5455              code += PRIV(ord2utf)(c, code);
5456            else
5457#endif
5458              *code++ = c;
5459            }
5460
5461          /* We are finished with this character class */
5462
5463          goto END_CLASS;
5464          }
5465
5466        /* For a single, positive character, get the value into mcbuffer, and
5467        then we can handle this with the normal one-character code. */
5468
5469#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5470        if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
5471          mclength = PRIV(ord2utf)(c, mcbuffer);
5472        else
5473#endif
5474          {
5475          mcbuffer[0] = c;
5476          mclength = 1;
5477          }
5478        goto ONE_CHAR;
5479        }       /* End of 1-char optimization */
5480
5481      /* There is more than one character in the class, or an XCLASS item
5482      has been generated. Add this character to the class. */
5483
5484      class_has_8bitchar +=
5485        add_to_class(classbits, &class_uchardata, options, cd, c, c);
5486      }
5487
5488    /* Loop until ']' reached. This "while" is the end of the "do" far above.
5489    If we are at the end of an internal nested string, revert to the outer
5490    string. */
5491
5492    while (((c = *(++ptr)) != CHAR_NULL ||
5493           (nestptr != NULL &&
5494             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != CHAR_NULL)) &&
5495           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
5496
5497    /* Check for missing terminating ']' */
5498
5499    if (c == CHAR_NULL)
5500      {
5501      *errorcodeptr = ERR6;
5502      goto FAILED;
5503      }
5504
5505    /* We will need an XCLASS if data has been placed in class_uchardata. In
5506    the second phase this is a sufficient test. However, in the pre-compile
5507    phase, class_uchardata gets emptied to prevent workspace overflow, so
5508    only if the very last character in the class needs XCLASS will it contain
5509    anything at this point. For this reason, xclass gets set TRUE above when
5510    class_uchardata is emptied, and that's why this code is the way it is here
5511    instead of just doing a test on class_uchardata below. */
5512
5513#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5514    if (class_uchardata > class_uchardata_base) xclass = TRUE;
5515#endif
5516
5517    /* If this is the first thing in the branch, there can be no first char
5518    setting, whatever the repeat count. Any reqchar setting must remain
5519    unchanged after any kind of repeat. */
5520
5521    if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
5522    zerofirstchar = firstchar;
5523    zerofirstcharflags = firstcharflags;
5524    zeroreqchar = reqchar;
5525    zeroreqcharflags = reqcharflags;
5526
5527    /* If there are characters with values > 255, we have to compile an
5528    extended class, with its own opcode, unless there was a negated special
5529    such as \S in the class, and PCRE_UCP is not set, because in that case all
5530    characters > 255 are in the class, so any that were explicitly given as
5531    well can be ignored. If (when there are explicit characters > 255 that must
5532    be listed) there are no characters < 256, we can omit the bitmap in the
5533    actual compiled code. */
5534
5535#ifdef SUPPORT_UTF
5536    if (xclass && (xclass_has_prop || !should_flip_negation ||
5537        (options & PCRE_UCP) != 0))
5538#elif !defined COMPILE_PCRE8
5539    if (xclass && (xclass_has_prop || !should_flip_negation))
5540#endif
5541#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5542      {
5543      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5544      *code++ = OP_XCLASS;
5545      code += LINK_SIZE;
5546      *code = negate_class? XCL_NOT:0;
5547      if (xclass_has_prop) *code |= XCL_HASPROP;
5548
5549      /* If the map is required, move up the extra data to make room for it;
5550      otherwise just move the code pointer to the end of the extra data. */
5551
5552      if (class_has_8bitchar > 0)
5553        {
5554        *code++ |= XCL_MAP;
5555        memmove(code + (32 / sizeof(pcre_uchar)), code,
5556          IN_UCHARS(class_uchardata - code));
5557        if (negate_class && !xclass_has_prop)
5558          for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5559        memcpy(code, classbits, 32);
5560        code = class_uchardata + (32 / sizeof(pcre_uchar));
5561        }
5562      else code = class_uchardata;
5563
5564      /* Now fill in the complete length of the item */
5565
5566      PUT(previous, 1, (int)(code - previous));
5567      break;   /* End of class handling */
5568      }
5569
5570    /* Even though any XCLASS list is now discarded, we must allow for
5571    its memory. */
5572
5573    if (lengthptr != NULL)
5574      *lengthptr += (int)(class_uchardata - class_uchardata_base);
5575#endif
5576
5577    /* If there are no characters > 255, or they are all to be included or
5578    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5579    whole class was negated and whether there were negative specials such as \S
5580    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5581    negating it if necessary. */
5582
5583    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5584    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5585      {
5586      if (negate_class)
5587        for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5588      memcpy(code, classbits, 32);
5589      }
5590    code += 32 / sizeof(pcre_uchar);
5591
5592    END_CLASS:
5593    break;
5594
5595
5596    /* ===================================================================*/
5597    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5598    has been tested above. */
5599
5600    case CHAR_LEFT_CURLY_BRACKET:
5601    if (!is_quantifier) goto NORMAL_CHAR;
5602    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5603    if (*errorcodeptr != 0) goto FAILED;
5604    goto REPEAT;
5605
5606    case CHAR_ASTERISK:
5607    repeat_min = 0;
5608    repeat_max = -1;
5609    goto REPEAT;
5610
5611    case CHAR_PLUS:
5612    repeat_min = 1;
5613    repeat_max = -1;
5614    goto REPEAT;
5615
5616    case CHAR_QUESTION_MARK:
5617    repeat_min = 0;
5618    repeat_max = 1;
5619
5620    REPEAT:
5621    if (previous == NULL)
5622      {
5623      *errorcodeptr = ERR9;
5624      goto FAILED;
5625      }
5626
5627    if (repeat_min == 0)
5628      {
5629      firstchar = zerofirstchar;    /* Adjust for zero repeat */
5630      firstcharflags = zerofirstcharflags;
5631      reqchar = zeroreqchar;        /* Ditto */
5632      reqcharflags = zeroreqcharflags;
5633      }
5634
5635    /* Remember whether this is a variable length repeat */
5636
5637    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5638
5639    op_type = 0;                    /* Default single-char op codes */
5640    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5641
5642    /* Save start of previous item, in case we have to move it up in order to
5643    insert something before it. */
5644
5645    tempcode = previous;
5646
5647    /* Before checking for a possessive quantifier, we must skip over
5648    whitespace and comments in extended mode because Perl allows white space at
5649    this point. */
5650
5651    if ((options & PCRE_EXTENDED) != 0)
5652      {
5653      const pcre_uchar *p = ptr + 1;
5654      for (;;)
5655        {
5656        while (MAX_255(*p) && (cd->ctypes[*p] & ctype_space) != 0) p++;
5657        if (*p != CHAR_NUMBER_SIGN) break;
5658        p++;
5659        while (*p != CHAR_NULL)
5660          {
5661          if (IS_NEWLINE(p))         /* For non-fixed-length newline cases, */
5662            {                        /* IS_NEWLINE sets cd->nllen. */
5663            p += cd->nllen;
5664            break;
5665            }
5666          p++;
5667#ifdef SUPPORT_UTF
5668          if (utf) FORWARDCHAR(p);
5669#endif
5670          }           /* Loop for comment characters */
5671        }             /* Loop for multiple comments */
5672      ptr = p - 1;    /* Character before the next significant one. */
5673      }
5674
5675    /* If the next character is '+', we have a possessive quantifier. This
5676    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
5677    If the next character is '?' this is a minimizing repeat, by default,
5678    but if PCRE_UNGREEDY is set, it works the other way round. We change the
5679    repeat type to the non-default. */
5680
5681    if (ptr[1] == CHAR_PLUS)
5682      {
5683      repeat_type = 0;                  /* Force greedy */
5684      possessive_quantifier = TRUE;
5685      ptr++;
5686      }
5687    else if (ptr[1] == CHAR_QUESTION_MARK)
5688      {
5689      repeat_type = greedy_non_default;
5690      ptr++;
5691      }
5692    else repeat_type = greedy_default;
5693
5694    /* If previous was a recursion call, wrap it in atomic brackets so that
5695    previous becomes the atomic group. All recursions were so wrapped in the
5696    past, but it no longer happens for non-repeated recursions. In fact, the
5697    repeated ones could be re-implemented independently so as not to need this,
5698    but for the moment we rely on the code for repeating groups. */
5699
5700    if (*previous == OP_RECURSE)
5701      {
5702      memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
5703      *previous = OP_ONCE;
5704      PUT(previous, 1, 2 + 2*LINK_SIZE);
5705      previous[2 + 2*LINK_SIZE] = OP_KET;
5706      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5707      code += 2 + 2 * LINK_SIZE;
5708      length_prevgroup = 3 + 3*LINK_SIZE;
5709
5710      /* When actually compiling, we need to check whether this was a forward
5711      reference, and if so, adjust the offset. */
5712
5713      if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
5714        {
5715        int offset = GET(cd->hwm, -LINK_SIZE);
5716        if (offset == previous + 1 - cd->start_code)
5717          PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
5718        }
5719      }
5720
5721    /* Now handle repetition for the different types of item. */
5722
5723    /* If previous was a character or negated character match, abolish the item
5724    and generate a repeat item instead. If a char item has a minimum of more
5725    than one, ensure that it is set in reqchar - it might not be if a sequence
5726    such as x{3} is the first thing in a branch because the x will have gone
5727    into firstchar instead.  */
5728
5729    if (*previous == OP_CHAR || *previous == OP_CHARI
5730        || *previous == OP_NOT || *previous == OP_NOTI)
5731      {
5732      switch (*previous)
5733        {
5734        default: /* Make compiler happy. */
5735        case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5736        case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5737        case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5738        case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5739        }
5740
5741      /* Deal with UTF characters that take up more than one character. It's
5742      easier to write this out separately than try to macrify it. Use c to
5743      hold the length of the character in bytes, plus UTF_LENGTH to flag that
5744      it's a length rather than a small character. */
5745
5746#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5747      if (utf && NOT_FIRSTCHAR(code[-1]))
5748        {
5749        pcre_uchar *lastchar = code - 1;
5750        BACKCHAR(lastchar);
5751        c = (int)(code - lastchar);     /* Length of UTF-8 character */
5752        memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
5753        c |= UTF_LENGTH;                /* Flag c as a length */
5754        }
5755      else
5756#endif /* SUPPORT_UTF */
5757
5758      /* Handle the case of a single character - either with no UTF support, or
5759      with UTF disabled, or for a single character UTF character. */
5760        {
5761        c = code[-1];
5762        if (*previous <= OP_CHARI && repeat_min > 1)
5763          {
5764          reqchar = c;
5765          reqcharflags = req_caseopt | cd->req_varyopt;
5766          }
5767        }
5768
5769      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5770      }
5771
5772    /* If previous was a character type match (\d or similar), abolish it and
5773    create a suitable repeat item. The code is shared with single-character
5774    repeats by setting op_type to add a suitable offset into repeat_type. Note
5775    that the Unicode property types will be present only when SUPPORT_UCP is
5776    defined, but we don't wrap the little bits of code here because it just
5777    makes it horribly messy. */
5778
5779    else if (*previous < OP_EODN)
5780      {
5781      pcre_uchar *oldcode;
5782      int prop_type, prop_value;
5783      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
5784      c = *previous;
5785
5786      OUTPUT_SINGLE_REPEAT:
5787      if (*previous == OP_PROP || *previous == OP_NOTPROP)
5788        {
5789        prop_type = previous[1];
5790        prop_value = previous[2];
5791        }
5792      else prop_type = prop_value = -1;
5793
5794      oldcode = code;
5795      code = previous;                  /* Usually overwrite previous item */
5796
5797      /* If the maximum is zero then the minimum must also be zero; Perl allows
5798      this case, so we do too - by simply omitting the item altogether. */
5799
5800      if (repeat_max == 0) goto END_REPEAT;
5801
5802      /* Combine the op_type with the repeat_type */
5803
5804      repeat_type += op_type;
5805
5806      /* A minimum of zero is handled either as the special case * or ?, or as
5807      an UPTO, with the maximum given. */
5808
5809      if (repeat_min == 0)
5810        {
5811        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5812          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5813        else
5814          {
5815          *code++ = OP_UPTO + repeat_type;
5816          PUT2INC(code, 0, repeat_max);
5817          }
5818        }
5819
5820      /* A repeat minimum of 1 is optimized into some special cases. If the
5821      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5822      left in place and, if the maximum is greater than 1, we use OP_UPTO with
5823      one less than the maximum. */
5824
5825      else if (repeat_min == 1)
5826        {
5827        if (repeat_max == -1)
5828          *code++ = OP_PLUS + repeat_type;
5829        else
5830          {
5831          code = oldcode;                 /* leave previous item in place */
5832          if (repeat_max == 1) goto END_REPEAT;
5833          *code++ = OP_UPTO + repeat_type;
5834          PUT2INC(code, 0, repeat_max - 1);
5835          }
5836        }
5837
5838      /* The case {n,n} is just an EXACT, while the general case {n,m} is
5839      handled as an EXACT followed by an UPTO. */
5840
5841      else
5842        {
5843        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5844        PUT2INC(code, 0, repeat_min);
5845
5846        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
5847        we have to insert the character for the previous code. For a repeated
5848        Unicode property match, there are two extra bytes that define the
5849        required property. In UTF-8 mode, long characters have their length in
5850        c, with the UTF_LENGTH bit as a flag. */
5851
5852        if (repeat_max < 0)
5853          {
5854#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5855          if (utf && (c & UTF_LENGTH) != 0)
5856            {
5857            memcpy(code, utf_chars, IN_UCHARS(c & 7));
5858            code += c & 7;
5859            }
5860          else
5861#endif
5862            {
5863            *code++ = c;
5864            if (prop_type >= 0)
5865              {
5866              *code++ = prop_type;
5867              *code++ = prop_value;
5868              }
5869            }
5870          *code++ = OP_STAR + repeat_type;
5871          }
5872
5873        /* Else insert an UPTO if the max is greater than the min, again
5874        preceded by the character, for the previously inserted code. If the
5875        UPTO is just for 1 instance, we can use QUERY instead. */
5876
5877        else if (repeat_max != repeat_min)
5878          {
5879#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5880          if (utf && (c & UTF_LENGTH) != 0)
5881            {
5882            memcpy(code, utf_chars, IN_UCHARS(c & 7));
5883            code += c & 7;
5884            }
5885          else
5886#endif
5887          *code++ = c;
5888          if (prop_type >= 0)
5889            {
5890            *code++ = prop_type;
5891            *code++ = prop_value;
5892            }
5893          repeat_max -= repeat_min;
5894
5895          if (repeat_max == 1)
5896            {
5897            *code++ = OP_QUERY + repeat_type;
5898            }
5899          else
5900            {
5901            *code++ = OP_UPTO + repeat_type;
5902            PUT2INC(code, 0, repeat_max);
5903            }
5904          }
5905        }
5906
5907      /* The character or character type itself comes last in all cases. */
5908
5909#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
5910      if (utf && (c & UTF_LENGTH) != 0)
5911        {
5912        memcpy(code, utf_chars, IN_UCHARS(c & 7));
5913        code += c & 7;
5914        }
5915      else
5916#endif
5917      *code++ = c;
5918
5919      /* For a repeated Unicode property match, there are two extra bytes that
5920      define the required property. */
5921
5922#ifdef SUPPORT_UCP
5923      if (prop_type >= 0)
5924        {
5925        *code++ = prop_type;
5926        *code++ = prop_value;
5927        }
5928#endif
5929      }
5930
5931    /* If previous was a character class or a back reference, we put the repeat
5932    stuff after it, but just skip the item if the repeat was {0,0}. */
5933
5934    else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5935#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5936             *previous == OP_XCLASS ||
5937#endif
5938             *previous == OP_REF   || *previous == OP_REFI ||
5939             *previous == OP_DNREF || *previous == OP_DNREFI)
5940      {
5941      if (repeat_max == 0)
5942        {
5943        code = previous;
5944        goto END_REPEAT;
5945        }
5946
5947      if (repeat_min == 0 && repeat_max == -1)
5948        *code++ = OP_CRSTAR + repeat_type;
5949      else if (repeat_min == 1 && repeat_max == -1)
5950        *code++ = OP_CRPLUS + repeat_type;
5951      else if (repeat_min == 0 && repeat_max == 1)
5952        *code++ = OP_CRQUERY + repeat_type;
5953      else
5954        {
5955        *code++ = OP_CRRANGE + repeat_type;
5956        PUT2INC(code, 0, repeat_min);
5957        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5958        PUT2INC(code, 0, repeat_max);
5959        }
5960      }
5961
5962    /* If previous was a bracket group, we may have to replicate it in certain
5963    cases. Note that at this point we can encounter only the "basic" bracket
5964    opcodes such as BRA and CBRA, as this is the place where they get converted
5965    into the more special varieties such as BRAPOS and SBRA. A test for >=
5966    OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5967    ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5968    Originally, PCRE did not allow repetition of assertions, but now it does,
5969    for Perl compatibility. */
5970
5971    else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5972      {
5973      register int i;
5974      int len = (int)(code - previous);
5975      size_t base_hwm_offset = item_hwm_offset;
5976      pcre_uchar *bralink = NULL;
5977      pcre_uchar *brazeroptr = NULL;
5978
5979      /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5980      we just ignore the repeat. */
5981
5982      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5983        goto END_REPEAT;
5984
5985      /* There is no sense in actually repeating assertions. The only potential
5986      use of repetition is in cases when the assertion is optional. Therefore,
5987      if the minimum is greater than zero, just ignore the repeat. If the
5988      maximum is not zero or one, set it to 1. */
5989
5990      if (*previous < OP_ONCE)    /* Assertion */
5991        {
5992        if (repeat_min > 0) goto END_REPEAT;
5993        if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5994        }
5995
5996      /* The case of a zero minimum is special because of the need to stick
5997      OP_BRAZERO in front of it, and because the group appears once in the
5998      data, whereas in other cases it appears the minimum number of times. For
5999      this reason, it is simplest to treat this case separately, as otherwise
6000      the code gets far too messy. There are several special subcases when the
6001      minimum is zero. */
6002
6003      if (repeat_min == 0)
6004        {
6005        /* If the maximum is also zero, we used to just omit the group from the
6006        output altogether, like this:
6007
6008        ** if (repeat_max == 0)
6009        **   {
6010        **   code = previous;
6011        **   goto END_REPEAT;
6012        **   }
6013
6014        However, that fails when a group or a subgroup within it is referenced
6015        as a subroutine from elsewhere in the pattern, so now we stick in
6016        OP_SKIPZERO in front of it so that it is skipped on execution. As we
6017        don't have a list of which groups are referenced, we cannot do this
6018        selectively.
6019
6020        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
6021        and do no more at this point. However, we do need to adjust any
6022        OP_RECURSE calls inside the group that refer to the group itself or any
6023        internal or forward referenced group, because the offset is from the
6024        start of the whole regex. Temporarily terminate the pattern while doing
6025        this. */
6026
6027        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
6028          {
6029          *code = OP_END;
6030          adjust_recurse(previous, 1, utf, cd, item_hwm_offset);
6031          memmove(previous + 1, previous, IN_UCHARS(len));
6032          code++;
6033          if (repeat_max == 0)
6034            {
6035            *previous++ = OP_SKIPZERO;
6036            goto END_REPEAT;
6037            }
6038          brazeroptr = previous;    /* Save for possessive optimizing */
6039          *previous++ = OP_BRAZERO + repeat_type;
6040          }
6041
6042        /* If the maximum is greater than 1 and limited, we have to replicate
6043        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
6044        The first one has to be handled carefully because it's the original
6045        copy, which has to be moved up. The remainder can be handled by code
6046        that is common with the non-zero minimum case below. We have to
6047        adjust the value of repeat_max, since one less copy is required. Once
6048        again, we may have to adjust any OP_RECURSE calls inside the group. */
6049
6050        else
6051          {
6052          int offset;
6053          *code = OP_END;
6054          adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, item_hwm_offset);
6055          memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
6056          code += 2 + LINK_SIZE;
6057          *previous++ = OP_BRAZERO + repeat_type;
6058          *previous++ = OP_BRA;
6059
6060          /* We chain together the bracket offset fields that have to be
6061          filled in later when the ends of the brackets are reached. */
6062
6063          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
6064          bralink = previous;
6065          PUTINC(previous, 0, offset);
6066          }
6067
6068        repeat_max--;
6069        }
6070
6071      /* If the minimum is greater than zero, replicate the group as many
6072      times as necessary, and adjust the maximum to the number of subsequent
6073      copies that we need. If we set a first char from the group, and didn't
6074      set a required char, copy the latter from the former. If there are any
6075      forward reference subroutine calls in the group, there will be entries on
6076      the workspace list; replicate these with an appropriate increment. */
6077
6078      else
6079        {
6080        if (repeat_min > 1)
6081          {
6082          /* In the pre-compile phase, we don't actually do the replication. We
6083          just adjust the length as if we had. Do some paranoid checks for
6084          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
6085          integer type when available, otherwise double. */
6086
6087          if (lengthptr != NULL)
6088            {
6089            int delta = (repeat_min - 1)*length_prevgroup;
6090            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
6091                  (INT64_OR_DOUBLE)length_prevgroup >
6092                    (INT64_OR_DOUBLE)INT_MAX ||
6093                OFLOW_MAX - *lengthptr < delta)
6094              {
6095              *errorcodeptr = ERR20;
6096              goto FAILED;
6097              }
6098            *lengthptr += delta;
6099            }
6100
6101          /* This is compiling for real. If there is a set first byte for
6102          the group, and we have not yet set a "required byte", set it. Make
6103          sure there is enough workspace for copying forward references before
6104          doing the copy. */
6105
6106          else
6107            {
6108            if (groupsetfirstchar && reqcharflags < 0)
6109              {
6110              reqchar = firstchar;
6111              reqcharflags = firstcharflags;
6112              }
6113
6114            for (i = 1; i < repeat_min; i++)
6115              {
6116              pcre_uchar *hc;
6117              size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6118              memcpy(code, previous, IN_UCHARS(len));
6119
6120              while (cd->hwm > cd->start_workspace + cd->workspace_size -
6121                     WORK_SIZE_SAFETY_MARGIN -
6122                     (this_hwm_offset - base_hwm_offset))
6123                {
6124                *errorcodeptr = expand_workspace(cd);
6125                if (*errorcodeptr != 0) goto FAILED;
6126                }
6127
6128              for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6129                   hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6130                   hc += LINK_SIZE)
6131                {
6132                PUT(cd->hwm, 0, GET(hc, 0) + len);
6133                cd->hwm += LINK_SIZE;
6134                }
6135              base_hwm_offset = this_hwm_offset;
6136              code += len;
6137              }
6138            }
6139          }
6140
6141        if (repeat_max > 0) repeat_max -= repeat_min;
6142        }
6143
6144      /* This code is common to both the zero and non-zero minimum cases. If
6145      the maximum is limited, it replicates the group in a nested fashion,
6146      remembering the bracket starts on a stack. In the case of a zero minimum,
6147      the first one was set up above. In all cases the repeat_max now specifies
6148      the number of additional copies needed. Again, we must remember to
6149      replicate entries on the forward reference list. */
6150
6151      if (repeat_max >= 0)
6152        {
6153        /* In the pre-compile phase, we don't actually do the replication. We
6154        just adjust the length as if we had. For each repetition we must add 1
6155        to the length for BRAZERO and for all but the last repetition we must
6156        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
6157        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
6158        a 64-bit integer type when available, otherwise double. */
6159
6160        if (lengthptr != NULL && repeat_max > 0)
6161          {
6162          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
6163                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
6164          if ((INT64_OR_DOUBLE)repeat_max *
6165                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
6166                  > (INT64_OR_DOUBLE)INT_MAX ||
6167              OFLOW_MAX - *lengthptr < delta)
6168            {
6169            *errorcodeptr = ERR20;
6170            goto FAILED;
6171            }
6172          *lengthptr += delta;
6173          }
6174
6175        /* This is compiling for real */
6176
6177        else for (i = repeat_max - 1; i >= 0; i--)
6178          {
6179          pcre_uchar *hc;
6180          size_t this_hwm_offset = cd->hwm - cd->start_workspace;
6181
6182          *code++ = OP_BRAZERO + repeat_type;
6183
6184          /* All but the final copy start a new nesting, maintaining the
6185          chain of brackets outstanding. */
6186
6187          if (i != 0)
6188            {
6189            int offset;
6190            *code++ = OP_BRA;
6191            offset = (bralink == NULL)? 0 : (int)(code - bralink);
6192            bralink = code;
6193            PUTINC(code, 0, offset);
6194            }
6195
6196          memcpy(code, previous, IN_UCHARS(len));
6197
6198          /* Ensure there is enough workspace for forward references before
6199          copying them. */
6200
6201          while (cd->hwm > cd->start_workspace + cd->workspace_size -
6202                 WORK_SIZE_SAFETY_MARGIN -
6203                 (this_hwm_offset - base_hwm_offset))
6204            {
6205            *errorcodeptr = expand_workspace(cd);
6206            if (*errorcodeptr != 0) goto FAILED;
6207            }
6208
6209          for (hc = (pcre_uchar *)cd->start_workspace + base_hwm_offset;
6210               hc < (pcre_uchar *)cd->start_workspace + this_hwm_offset;
6211               hc += LINK_SIZE)
6212            {
6213            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
6214            cd->hwm += LINK_SIZE;
6215            }
6216          base_hwm_offset = this_hwm_offset;
6217          code += len;
6218          }
6219
6220        /* Now chain through the pending brackets, and fill in their length
6221        fields (which are holding the chain links pro tem). */
6222
6223        while (bralink != NULL)
6224          {
6225          int oldlinkoffset;
6226          int offset = (int)(code - bralink + 1);
6227          pcre_uchar *bra = code - offset;
6228          oldlinkoffset = GET(bra, 1);
6229          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
6230          *code++ = OP_KET;
6231          PUTINC(code, 0, offset);
6232          PUT(bra, 1, offset);
6233          }
6234        }
6235
6236      /* If the maximum is unlimited, set a repeater in the final copy. For
6237      ONCE brackets, that's all we need to do. However, possessively repeated
6238      ONCE brackets can be converted into non-capturing brackets, as the
6239      behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
6240      deal with possessive ONCEs specially.
6241
6242      Otherwise, when we are doing the actual compile phase, check to see
6243      whether this group is one that could match an empty string. If so,
6244      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
6245      that runtime checking can be done. [This check is also applied to ONCE
6246      groups at runtime, but in a different way.]
6247
6248      Then, if the quantifier was possessive and the bracket is not a
6249      conditional, we convert the BRA code to the POS form, and the KET code to
6250      KETRPOS. (It turns out to be convenient at runtime to detect this kind of
6251      subpattern at both the start and at the end.) The use of special opcodes
6252      makes it possible to reduce greatly the stack usage in pcre_exec(). If
6253      the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
6254
6255      Then, if the minimum number of matches is 1 or 0, cancel the possessive
6256      flag so that the default action below, of wrapping everything inside
6257      atomic brackets, does not happen. When the minimum is greater than 1,
6258      there will be earlier copies of the group, and so we still have to wrap
6259      the whole thing. */
6260
6261      else
6262        {
6263        pcre_uchar *ketcode = code - 1 - LINK_SIZE;
6264        pcre_uchar *bracode = ketcode - GET(ketcode, 1);
6265
6266        /* Convert possessive ONCE brackets to non-capturing */
6267
6268        if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
6269            possessive_quantifier) *bracode = OP_BRA;
6270
6271        /* For non-possessive ONCE brackets, all we need to do is to
6272        set the KET. */
6273
6274        if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
6275          *ketcode = OP_KETRMAX + repeat_type;
6276
6277        /* Handle non-ONCE brackets and possessive ONCEs (which have been
6278        converted to non-capturing above). */
6279
6280        else
6281          {
6282          /* In the compile phase, check for empty string matching. */
6283
6284          if (lengthptr == NULL)
6285            {
6286            pcre_uchar *scode = bracode;
6287            do
6288              {
6289              if (could_be_empty_branch(scode, ketcode, utf, cd, NULL))
6290                {
6291                *bracode += OP_SBRA - OP_BRA;
6292                break;
6293                }
6294              scode += GET(scode, 1);
6295              }
6296            while (*scode == OP_ALT);
6297            }
6298
6299          /* A conditional group with only one branch has an implicit empty
6300          alternative branch. */
6301
6302          if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
6303            *bracode = OP_SCOND;
6304
6305          /* Handle possessive quantifiers. */
6306
6307          if (possessive_quantifier)
6308            {
6309            /* For COND brackets, we wrap the whole thing in a possessively
6310            repeated non-capturing bracket, because we have not invented POS
6311            versions of the COND opcodes. Because we are moving code along, we
6312            must ensure that any pending recursive references are updated. */
6313
6314            if (*bracode == OP_COND || *bracode == OP_SCOND)
6315              {
6316              int nlen = (int)(code - bracode);
6317              *code = OP_END;
6318              adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6319              memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
6320              code += 1 + LINK_SIZE;
6321              nlen += 1 + LINK_SIZE;
6322              *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
6323              *code++ = OP_KETRPOS;
6324              PUTINC(code, 0, nlen);
6325              PUT(bracode, 1, nlen);
6326              }
6327
6328            /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
6329
6330            else
6331              {
6332              *bracode += 1;              /* Switch to xxxPOS opcodes */
6333              *ketcode = OP_KETRPOS;
6334              }
6335
6336            /* If the minimum is zero, mark it as possessive, then unset the
6337            possessive flag when the minimum is 0 or 1. */
6338
6339            if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
6340            if (repeat_min < 2) possessive_quantifier = FALSE;
6341            }
6342
6343          /* Non-possessive quantifier */
6344
6345          else *ketcode = OP_KETRMAX + repeat_type;
6346          }
6347        }
6348      }
6349
    /* If previous is OP_FAIL, it was generated by an empty class [] in
    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
    by (*FAIL) or (?!), set previous to NULL, which gives a "nothing to repeat"
    error above. We can just ignore the repeat in the JS case. */
6354
6355    else if (*previous == OP_FAIL) goto END_REPEAT;
6356
6357    /* Else there's some kind of shambles */
6358
6359    else
6360      {
6361      *errorcodeptr = ERR11;
6362      goto FAILED;
6363      }
6364
6365    /* If the character following a repeat is '+', possessive_quantifier is
6366    TRUE. For some opcodes, there are special alternative opcodes for this
6367    case. For anything else, we wrap the entire repeated item inside OP_ONCE
6368    brackets. Logically, the '+' notation is just syntactic sugar, taken from
6369    Sun's Java package, but the special opcodes can optimize it.
6370
6371    Some (but not all) possessively repeated subpatterns have already been
6372    completely handled in the code just above. For them, possessive_quantifier
6373    is always FALSE at this stage. Note that the repeated item starts at
6374    tempcode, not at previous, which might be the first part of a string whose
6375    (former) last char we repeated. */
6376
6377    if (possessive_quantifier)
6378      {
6379      int len;
6380
6381      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
6382      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
6383      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
6384      remains is greater than zero, there's a further opcode that can be
6385      handled. If not, do nothing, leaving the EXACT alone. */
6386
6387      switch(*tempcode)
6388        {
6389        case OP_TYPEEXACT:
6390        tempcode += PRIV(OP_lengths)[*tempcode] +
6391          ((tempcode[1 + IMM2_SIZE] == OP_PROP
6392          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
6393        break;
6394
6395        /* CHAR opcodes are used for exacts whose count is 1. */
6396
6397        case OP_CHAR:
6398        case OP_CHARI:
6399        case OP_NOT:
6400        case OP_NOTI:
6401        case OP_EXACT:
6402        case OP_EXACTI:
6403        case OP_NOTEXACT:
6404        case OP_NOTEXACTI:
6405        tempcode += PRIV(OP_lengths)[*tempcode];
6406#ifdef SUPPORT_UTF
6407        if (utf && HAS_EXTRALEN(tempcode[-1]))
6408          tempcode += GET_EXTRALEN(tempcode[-1]);
6409#endif
6410        break;
6411
6412        /* For the class opcodes, the repeat operator appears at the end;
6413        adjust tempcode to point to it. */
6414
6415        case OP_CLASS:
6416        case OP_NCLASS:
6417        tempcode += 1 + 32/sizeof(pcre_uchar);
6418        break;
6419
6420#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6421        case OP_XCLASS:
6422        tempcode += GET(tempcode, 1);
6423        break;
6424#endif
6425        }
6426
6427      /* If tempcode is equal to code (which points to the end of the repeated
6428      item), it means we have skipped an EXACT item but there is no following
6429      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
6430      all other cases, tempcode will be pointing to the repeat opcode, and will
6431      be less than code, so the value of len will be greater than 0. */
6432
6433      len = (int)(code - tempcode);
6434      if (len > 0)
6435        {
6436        unsigned int repcode = *tempcode;
6437
6438        /* There is a table for possessifying opcodes, all of which are less
6439        than OP_CALLOUT. A zero entry means there is no possessified version.
6440        */
6441
6442        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
6443          *tempcode = opcode_possessify[repcode];
6444
6445        /* For opcode without a special possessified version, wrap the item in
6446        ONCE brackets. Because we are moving code along, we must ensure that any
6447        pending recursive references are updated. */
6448
6449        else
6450          {
6451          *code = OP_END;
6452          adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6453          memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6454          code += 1 + LINK_SIZE;
6455          len += 1 + LINK_SIZE;
6456          tempcode[0] = OP_ONCE;
6457          *code++ = OP_KET;
6458          PUTINC(code, 0, len);
6459          PUT(tempcode, 1, len);
6460          }
6461        }
6462
6463#ifdef NEVER
6464      if (len > 0) switch (*tempcode)
6465        {
6466        case OP_STAR:  *tempcode = OP_POSSTAR; break;
6467        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
6468        case OP_QUERY: *tempcode = OP_POSQUERY; break;
6469        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
6470
6471        case OP_STARI:  *tempcode = OP_POSSTARI; break;
6472        case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
6473        case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
6474        case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
6475
6476        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
6477        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
6478        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
6479        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
6480
6481        case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
6482        case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
6483        case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
6484        case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
6485
6486        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
6487        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
6488        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
6489        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
6490
6491        case OP_CRSTAR:   *tempcode = OP_CRPOSSTAR; break;
6492        case OP_CRPLUS:   *tempcode = OP_CRPOSPLUS; break;
6493        case OP_CRQUERY:  *tempcode = OP_CRPOSQUERY; break;
6494        case OP_CRRANGE:  *tempcode = OP_CRPOSRANGE; break;
6495
6496        /* Because we are moving code along, we must ensure that any
6497        pending recursive references are updated. */
6498
6499        default:
6500        *code = OP_END;
6501        adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, item_hwm_offset);
6502        memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
6503        code += 1 + LINK_SIZE;
6504        len += 1 + LINK_SIZE;
6505        tempcode[0] = OP_ONCE;
6506        *code++ = OP_KET;
6507        PUTINC(code, 0, len);
6508        PUT(tempcode, 1, len);
6509        break;
6510        }
6511#endif
6512      }
6513
    /* In all cases we no longer have a previous item. We also set the
    "follows varying string" flag for subsequently encountered reqchars if
    it isn't already set and we have just passed a varying length item. */
6517
6518    END_REPEAT:
6519    previous = NULL;
6520    cd->req_varyopt |= reqvary;
6521    break;
6522
6523
6524    /* ===================================================================*/
6525    /* Start of nested parenthesized sub-expression, or comment or lookahead or
6526    lookbehind or option setting or condition or all the other extended
6527    parenthesis forms.  */
6528
6529    case CHAR_LEFT_PARENTHESIS:
6530    ptr++;
6531
6532    /* First deal with comments. Putting this code right at the start ensures
6533    that comments have no bad side effects. */
6534
6535    if (ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
6536      {
6537      ptr += 2;
6538      while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6539      if (*ptr == CHAR_NULL)
6540        {
6541        *errorcodeptr = ERR18;
6542        goto FAILED;
6543        }
6544      continue;
6545      }
6546
6547    /* Now deal with various "verbs" that can be introduced by '*'. */
6548
6549    if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
6550         || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
6551      {
6552      int i, namelen;
6553      int arglen = 0;
6554      const char *vn = verbnames;
6555      const pcre_uchar *name = ptr + 1;
6556      const pcre_uchar *arg = NULL;
6557      previous = NULL;
6558      ptr++;
6559      while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
6560      namelen = (int)(ptr - name);
6561
6562      /* It appears that Perl allows any characters whatsoever, other than
6563      a closing parenthesis, to appear in arguments, so we no longer insist on
6564      letters, digits, and underscores. */
6565
6566      if (*ptr == CHAR_COLON)
6567        {
6568        arg = ++ptr;
6569        while (*ptr != CHAR_NULL && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
6570        arglen = (int)(ptr - arg);
6571        if ((unsigned int)arglen > MAX_MARK)
6572          {
6573          *errorcodeptr = ERR75;
6574          goto FAILED;
6575          }
6576        }
6577
6578      if (*ptr != CHAR_RIGHT_PARENTHESIS)
6579        {
6580        *errorcodeptr = ERR60;
6581        goto FAILED;
6582        }
6583
6584      /* Scan the table of verb names */
6585
6586      for (i = 0; i < verbcount; i++)
6587        {
6588        if (namelen == verbs[i].len &&
6589            STRNCMP_UC_C8(name, vn, namelen) == 0)
6590          {
6591          int setverb;
6592
6593          /* Check for open captures before ACCEPT and convert it to
6594          ASSERT_ACCEPT if in an assertion. */
6595
6596          if (verbs[i].op == OP_ACCEPT)
6597            {
6598            open_capitem *oc;
6599            if (arglen != 0)
6600              {
6601              *errorcodeptr = ERR59;
6602              goto FAILED;
6603              }
6604            cd->had_accept = TRUE;
6605            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6606              {
6607              *code++ = OP_CLOSE;
6608              PUT2INC(code, 0, oc->number);
6609              }
6610            setverb = *code++ =
6611              (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6612
6613            /* Do not set firstchar after *ACCEPT */
6614            if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
6615            }
6616
6617          /* Handle other cases with/without an argument */
6618
6619          else if (arglen == 0)
6620            {
6621            if (verbs[i].op < 0)   /* Argument is mandatory */
6622              {
6623              *errorcodeptr = ERR66;
6624              goto FAILED;
6625              }
6626            setverb = *code++ = verbs[i].op;
6627            }
6628
6629          else
6630            {
6631            if (verbs[i].op_arg < 0)   /* Argument is forbidden */
6632              {
6633              *errorcodeptr = ERR59;
6634              goto FAILED;
6635              }
6636            setverb = *code++ = verbs[i].op_arg;
6637            if (lengthptr != NULL)    /* In pass 1 just add in the length */
6638              {                       /* to avoid potential workspace */
6639              *lengthptr += arglen;   /* overflow. */
6640              *code++ = 0;
6641              }
6642            else
6643              {
6644              *code++ = arglen;
6645              memcpy(code, arg, IN_UCHARS(arglen));
6646              code += arglen;
6647              }
6648            *code++ = 0;
6649            }
6650
6651          switch (setverb)
6652            {
6653            case OP_THEN:
6654            case OP_THEN_ARG:
6655            cd->external_flags |= PCRE_HASTHEN;
6656            break;
6657
6658            case OP_PRUNE:
6659            case OP_PRUNE_ARG:
6660            case OP_SKIP:
6661            case OP_SKIP_ARG:
6662            cd->had_pruneorskip = TRUE;
6663            break;
6664            }
6665
6666          break;  /* Found verb, exit loop */
6667          }
6668
6669        vn += verbs[i].len + 1;
6670        }
6671
6672      if (i < verbcount) continue;    /* Successfully handled a verb */
6673      *errorcodeptr = ERR60;          /* Verb not recognized */
6674      goto FAILED;
6675      }
6676
6677    /* Initialize for "real" parentheses */
6678
6679    newoptions = options;
6680    skipbytes = 0;
6681    bravalue = OP_CBRA;
6682    item_hwm_offset = cd->hwm - cd->start_workspace;
6683    reset_bracount = FALSE;
6684
6685    /* Deal with the extended parentheses; all are introduced by '?', and the
6686    appearance of any of them means that this is not a capturing group. */
6687
6688    if (*ptr == CHAR_QUESTION_MARK)
6689      {
6690      int i, set, unset, namelen;
6691      int *optset;
6692      const pcre_uchar *name;
6693      pcre_uchar *slot;
6694
6695      switch (*(++ptr))
6696        {
6697        /* ------------------------------------------------------------ */
6698        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6699        reset_bracount = TRUE;
6700        cd->dupgroups = TRUE;     /* Record (?| encountered */
6701        /* Fall through */
6702
6703        /* ------------------------------------------------------------ */
6704        case CHAR_COLON:          /* Non-capturing bracket */
6705        bravalue = OP_BRA;
6706        ptr++;
6707        break;
6708
6709
6710        /* ------------------------------------------------------------ */
6711        case CHAR_LEFT_PARENTHESIS:
6712        bravalue = OP_COND;       /* Conditional group */
6713        tempptr = ptr;
6714
6715        /* A condition can be an assertion, a number (referring to a numbered
6716        group's having been set), a name (referring to a named group), or 'R',
6717        referring to recursion. R<digits> and R&name are also permitted for
6718        recursion tests.
6719
6720        There are ways of testing a named group: (?(name)) is used by Python;
6721        Perl 5.10 onwards uses (?(<name>) or (?('name')).
6722
6723        There is one unfortunate ambiguity, caused by history. 'R' can be the
6724        recursive thing or the name 'R' (and similarly for 'R' followed by
6725        digits). We look for a name first; if not found, we try the other case.
6726
6727        For compatibility with auto-callouts, we allow a callout to be
6728        specified before a condition that is an assertion. First, check for the
6729        syntax of a callout; if found, adjust the temporary pointer that is
6730        used to check for an assertion condition. That's all that is needed! */
6731
6732        if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6733          {
6734          for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6735          if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6736            tempptr += i + 1;
6737          }
6738
6739        /* For conditions that are assertions, check the syntax, and then exit
6740        the switch. This will take control down to where bracketed groups,
6741        including assertions, are processed. */
6742
6743        if (tempptr[1] == CHAR_QUESTION_MARK &&
6744              (tempptr[2] == CHAR_EQUALS_SIGN ||
6745               tempptr[2] == CHAR_EXCLAMATION_MARK ||
6746                 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6747                   (tempptr[3] == CHAR_EQUALS_SIGN ||
6748                    tempptr[3] == CHAR_EXCLAMATION_MARK))))
6749          {
6750          cd->iscondassert = TRUE;
6751          break;
6752          }
6753
6754        /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6755        need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6756
6757        code[1+LINK_SIZE] = OP_CREF;
6758        skipbytes = 1+IMM2_SIZE;
6759        refsign = -1;     /* => not a number */
6760        namelen = -1;     /* => not a name; must set to avoid warning */
6761        name = NULL;      /* Always set to avoid warning */
6762        recno = 0;        /* Always set to avoid warning */
6763
6764        /* Check for a test for recursion in a named group. */
6765
6766        ptr++;
6767        if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6768          {
6769          terminator = -1;
6770          ptr += 2;
6771          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6772          }
6773
6774        /* Check for a test for a named group's having been set, using the Perl
6775        syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6776        syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6777
6778        else if (*ptr == CHAR_LESS_THAN_SIGN)
6779          {
6780          terminator = CHAR_GREATER_THAN_SIGN;
6781          ptr++;
6782          }
6783        else if (*ptr == CHAR_APOSTROPHE)
6784          {
6785          terminator = CHAR_APOSTROPHE;
6786          ptr++;
6787          }
6788        else
6789          {
6790          terminator = CHAR_NULL;
6791          if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6792            else if (IS_DIGIT(*ptr)) refsign = 0;
6793          }
6794
6795        /* Handle a number */
6796
6797        if (refsign >= 0)
6798          {
6799          while (IS_DIGIT(*ptr))
6800            {
6801            if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6802              {
6803              while (IS_DIGIT(*ptr)) ptr++;
6804              *errorcodeptr = ERR61;
6805              goto FAILED;
6806              }
6807            recno = recno * 10 + (int)(*ptr - CHAR_0);
6808            ptr++;
6809            }
6810          }
6811
6812        /* Otherwise we expect to read a name; anything else is an error. When
6813        a name is one of a number of duplicates, a different opcode is used and
6814        it needs more memory. Unfortunately we cannot tell whether a name is a
6815        duplicate in the first pass, so we have to allow for more memory. */
6816
6817        else
6818          {
6819          if (IS_DIGIT(*ptr))
6820            {
6821            *errorcodeptr = ERR84;
6822            goto FAILED;
6823            }
6824          if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_word) == 0)
6825            {
6826            *errorcodeptr = ERR28;   /* Assertion expected */
6827            goto FAILED;
6828            }
6829          name = ptr++;
6830          while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
6831            {
6832            ptr++;
6833            }
6834          namelen = (int)(ptr - name);
6835          if (lengthptr != NULL) skipbytes += IMM2_SIZE;
6836          }
6837
6838        /* Check the terminator */
6839
6840        if ((terminator > 0 && *ptr++ != (pcre_uchar)terminator) ||
6841            *ptr++ != CHAR_RIGHT_PARENTHESIS)
6842          {
6843          ptr--;                  /* Error offset */
6844          *errorcodeptr = ERR26;  /* Malformed number or name */
6845          goto FAILED;
6846          }
6847
6848        /* Do no further checking in the pre-compile phase. */
6849
6850        if (lengthptr != NULL) break;
6851
6852        /* In the real compile we do the work of looking for the actual
6853        reference. If refsign is not negative, it means we have a number in
6854        recno. */
6855
6856        if (refsign >= 0)
6857          {
6858          if (recno <= 0)
6859            {
6860            *errorcodeptr = ERR35;
6861            goto FAILED;
6862            }
6863          if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6864            cd->bracount - recno + 1 : recno + cd->bracount;
6865          if (recno <= 0 || recno > cd->final_bracount)
6866            {
6867            *errorcodeptr = ERR15;
6868            goto FAILED;
6869            }
6870          PUT2(code, 2+LINK_SIZE, recno);
6871          if (recno > cd->top_backref) cd->top_backref = recno;
6872          break;
6873          }
6874
6875        /* Otherwise look for the name. */
6876
6877        slot = cd->name_table;
6878        for (i = 0; i < cd->names_found; i++)
6879          {
6880          if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
6881          slot += cd->name_entry_size;
6882          }
6883
6884        /* Found the named subpattern. If the name is duplicated, add one to
6885        the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6886        appropriate data values. Otherwise, just insert the unique subpattern
6887        number. */
6888
6889        if (i < cd->names_found)
6890          {
6891          int offset = i++;
6892          int count = 1;
6893          recno = GET2(slot, 0);   /* Number from first found */
6894          if (recno > cd->top_backref) cd->top_backref = recno;
6895          for (; i < cd->names_found; i++)
6896            {
6897            slot += cd->name_entry_size;
6898            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) != 0 ||
6899              (slot+IMM2_SIZE)[namelen] != 0) break;
6900            count++;
6901            }
6902
6903          if (count > 1)
6904            {
6905            PUT2(code, 2+LINK_SIZE, offset);
6906            PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6907            skipbytes += IMM2_SIZE;
6908            code[1+LINK_SIZE]++;
6909            }
6910          else  /* Not a duplicated name */
6911            {
6912            PUT2(code, 2+LINK_SIZE, recno);
6913            }
6914          }
6915
6916        /* If terminator == CHAR_NULL it means that the name followed directly
6917        after the opening parenthesis [e.g. (?(abc)...] and in this case there
6918        are some further alternatives to try. For the cases where terminator !=
6919        CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6920        we have now checked all the possibilities, so give an error. */
6921
6922        else if (terminator != CHAR_NULL)
6923          {
6924          *errorcodeptr = ERR15;
6925          goto FAILED;
6926          }
6927
6928        /* Check for (?(R) for recursion. Allow digits after R to specify a
6929        specific group number. */
6930
6931        else if (*name == CHAR_R)
6932          {
6933          recno = 0;
6934          for (i = 1; i < namelen; i++)
6935            {
6936            if (!IS_DIGIT(name[i]))
6937              {
6938              *errorcodeptr = ERR15;
6939              goto FAILED;
6940              }
6941            if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6942              {
6943              *errorcodeptr = ERR61;
6944              goto FAILED;
6945              }
6946            recno = recno * 10 + name[i] - CHAR_0;
6947            }
6948          if (recno == 0) recno = RREF_ANY;
6949          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6950          PUT2(code, 2+LINK_SIZE, recno);
6951          }
6952
6953        /* Similarly, check for the (?(DEFINE) "condition", which is always
6954        false. */
6955
6956        else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
6957          {
6958          code[1+LINK_SIZE] = OP_DEF;
6959          skipbytes = 1;
6960          }
6961
6962        /* Reference to an unidentified subpattern. */
6963
6964        else
6965          {
6966          *errorcodeptr = ERR15;
6967          goto FAILED;
6968          }
6969        break;
6970
6971
6972        /* ------------------------------------------------------------ */
6973        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6974        bravalue = OP_ASSERT;
6975        cd->assert_depth += 1;
6976        ptr++;
6977        break;
6978
6979        /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6980        thing to do, but Perl allows all assertions to be quantified, and when
6981        they contain capturing parentheses there may be a potential use for
6982        this feature. Not that that applies to a quantified (?!) but we allow
6983        it for uniformity. */
6984
6985        /* ------------------------------------------------------------ */
6986        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6987        ptr++;
6988        if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6989             ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6990            (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6991          {
6992          *code++ = OP_FAIL;
6993          previous = NULL;
6994          continue;
6995          }
6996        bravalue = OP_ASSERT_NOT;
6997        cd->assert_depth += 1;
6998        break;
6999
7000
7001        /* ------------------------------------------------------------ */
7002        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
7003        switch (ptr[1])
7004          {
7005          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
7006          bravalue = OP_ASSERTBACK;
7007          cd->assert_depth += 1;
7008          ptr += 2;
7009          break;
7010
7011          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
7012          bravalue = OP_ASSERTBACK_NOT;
7013          cd->assert_depth += 1;
7014          ptr += 2;
7015          break;
7016
7017          default:                /* Could be name define, else bad */
7018          if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
7019            goto DEFINE_NAME;
7020          ptr++;                  /* Correct offset for error */
7021          *errorcodeptr = ERR24;
7022          goto FAILED;
7023          }
7024        break;
7025
7026
7027        /* ------------------------------------------------------------ */
7028        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
7029        bravalue = OP_ONCE;
7030        ptr++;
7031        break;
7032
7033
7034        /* ------------------------------------------------------------ */
7035        case CHAR_C:                 /* Callout - may be followed by digits; */
7036        previous_callout = code;     /* Save for later completion */
7037        after_manual_callout = 1;    /* Skip one item before completing */
7038        *code++ = OP_CALLOUT;
7039          {
7040          int n = 0;
7041          ptr++;
7042          while(IS_DIGIT(*ptr))
7043            n = n * 10 + *ptr++ - CHAR_0;
7044          if (*ptr != CHAR_RIGHT_PARENTHESIS)
7045            {
7046            *errorcodeptr = ERR39;
7047            goto FAILED;
7048            }
7049          if (n > 255)
7050            {
7051            *errorcodeptr = ERR38;
7052            goto FAILED;
7053            }
7054          *code++ = n;
7055          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
7056          PUT(code, LINK_SIZE, 0);                          /* Default length */
7057          code += 2 * LINK_SIZE;
7058          }
7059        previous = NULL;
7060        continue;
7061
7062
7063        /* ------------------------------------------------------------ */
7064        case CHAR_P:              /* Python-style named subpattern handling */
7065        if (*(++ptr) == CHAR_EQUALS_SIGN ||
7066            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
7067          {
7068          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
7069          terminator = CHAR_RIGHT_PARENTHESIS;
7070          goto NAMED_REF_OR_RECURSE;
7071          }
7072        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
7073          {
7074          *errorcodeptr = ERR41;
7075          goto FAILED;
7076          }
7077        /* Fall through to handle (?P< as (?< is handled */
7078
7079
7080        /* ------------------------------------------------------------ */
7081        DEFINE_NAME:    /* Come here from (?< handling */
7082        case CHAR_APOSTROPHE:
7083        terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
7084          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7085        name = ++ptr;
7086        if (IS_DIGIT(*ptr))
7087          {
7088          *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7089          goto FAILED;
7090          }
7091        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7092        namelen = (int)(ptr - name);
7093
7094        /* In the pre-compile phase, do a syntax check, remember the longest
7095        name, and then remember the group in a vector, expanding it if
7096        necessary. Duplicates for the same number are skipped; other duplicates
7097        are checked for validity. In the actual compile, there is nothing to
7098        do. */
7099
7100        if (lengthptr != NULL)
7101          {
7102          named_group *ng;
7103          pcre_uint32 number = cd->bracount + 1;
7104
7105          if (*ptr != (pcre_uchar)terminator)
7106            {
7107            *errorcodeptr = ERR42;
7108            goto FAILED;
7109            }
7110
7111          if (cd->names_found >= MAX_NAME_COUNT)
7112            {
7113            *errorcodeptr = ERR49;
7114            goto FAILED;
7115            }
7116
7117          if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
7118            {
7119            cd->name_entry_size = namelen + IMM2_SIZE + 1;
7120            if (namelen > MAX_NAME_SIZE)
7121              {
7122              *errorcodeptr = ERR48;
7123              goto FAILED;
7124              }
7125            }
7126
7127          /* Scan the list to check for duplicates. For duplicate names, if the
7128          number is the same, break the loop, which causes the name to be
7129          discarded; otherwise, if DUPNAMES is not set, give an error.
7130          If it is set, allow the name with a different number, but continue
7131          scanning in case this is a duplicate with the same number. For
7132          non-duplicate names, give an error if the number is duplicated. */
7133
7134          ng = cd->named_groups;
7135          for (i = 0; i < cd->names_found; i++, ng++)
7136            {
7137            if (namelen == ng->length &&
7138                STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7139              {
7140              if (ng->number == number) break;
7141              if ((options & PCRE_DUPNAMES) == 0)
7142                {
7143                *errorcodeptr = ERR43;
7144                goto FAILED;
7145                }
7146              cd->dupnames = TRUE;  /* Duplicate names exist */
7147              }
7148            else if (ng->number == number)
7149              {
7150              *errorcodeptr = ERR65;
7151              goto FAILED;
7152              }
7153            }
7154
7155          if (i >= cd->names_found)     /* Not a duplicate with same number */
7156            {
7157            /* Increase the list size if necessary */
7158
7159            if (cd->names_found >= cd->named_group_list_size)
7160              {
7161              int newsize = cd->named_group_list_size * 2;
7162              named_group *newspace = (PUBL(malloc))
7163                (newsize * sizeof(named_group));
7164
7165              if (newspace == NULL)
7166                {
7167                *errorcodeptr = ERR21;
7168                goto FAILED;
7169                }
7170
7171              memcpy(newspace, cd->named_groups,
7172                cd->named_group_list_size * sizeof(named_group));
7173              if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
7174                (PUBL(free))((void *)cd->named_groups);
7175              cd->named_groups = newspace;
7176              cd->named_group_list_size = newsize;
7177              }
7178
7179            cd->named_groups[cd->names_found].name = name;
7180            cd->named_groups[cd->names_found].length = namelen;
7181            cd->named_groups[cd->names_found].number = number;
7182            cd->names_found++;
7183            }
7184          }
7185
7186        ptr++;                    /* Move past > or ' in both passes. */
7187        goto NUMBERED_GROUP;
7188
7189
7190        /* ------------------------------------------------------------ */
7191        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
7192        terminator = CHAR_RIGHT_PARENTHESIS;
7193        is_recurse = TRUE;
7194        /* Fall through */
7195
7196        /* We come here from the Python syntax above that handles both
7197        references (?P=name) and recursion (?P>name), as well as falling
7198        through from the Perl recursion syntax (?&name). We also come here from
7199        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
7200        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
7201
7202        NAMED_REF_OR_RECURSE:
7203        name = ++ptr;
7204        if (IS_DIGIT(*ptr))
7205          {
7206          *errorcodeptr = ERR84;   /* Group name must start with non-digit */
7207          goto FAILED;
7208          }
7209        while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
7210        namelen = (int)(ptr - name);
7211
7212        /* In the pre-compile phase, do a syntax check. We used to just set
7213        a dummy reference number, because it was not used in the first pass.
7214        However, with the change of recursive back references to be atomic,
7215        we have to look for the number so that this state can be identified, as
7216        otherwise the incorrect length is computed. If it's not a backwards
7217        reference, the dummy number will do. */
7218
7219        if (lengthptr != NULL)
7220          {
7221          named_group *ng;
7222          recno = 0;
7223
7224          if (namelen == 0)
7225            {
7226            *errorcodeptr = ERR62;
7227            goto FAILED;
7228            }
7229          if (*ptr != (pcre_uchar)terminator)
7230            {
7231            *errorcodeptr = ERR42;
7232            goto FAILED;
7233            }
7234          if (namelen > MAX_NAME_SIZE)
7235            {
7236            *errorcodeptr = ERR48;
7237            goto FAILED;
7238            }
7239
7240          /* Count named back references. */
7241
7242          if (!is_recurse) cd->namedrefcount++;
7243
7244          /* We have to allow for a named reference to a duplicated name (this
7245          cannot be determined until the second pass). This needs an extra
7246          16-bit data item. */
7247
7248          *lengthptr += IMM2_SIZE;
7249
7250          /* If this is a forward reference and we are within a (?|...) group,
7251          the reference may end up as the number of a group which we are
7252          currently inside, that is, it could be a recursive reference. In the
7253          real compile this will be picked up and the reference wrapped with
7254          OP_ONCE to make it atomic, so we must allow space in case this occurs. */
7255
7256          /* In fact, this can happen for a non-forward reference because
7257          another group with the same number might be created later. This
7258          issue is fixed "properly" in PCRE2. As PCRE1 is now in maintenance
7259          only mode, we finesse the bug by allowing more memory always. */
7260
7261          *lengthptr += 2 + 2*LINK_SIZE;
7262
7263          /* It is even worse than that. The current reference may be to an
7264          existing named group with a different number (so apparently not
7265          recursive) but which later on is also attached to a group with the
7266          current number. This can only happen if (?| has been previously
7267          encountered. In that case, we allow yet more memory, just in case.
7268          (Again, this is fixed "properly" in PCRE2.) */
7269
7270          if (cd->dupgroups) *lengthptr += 4 + 4*LINK_SIZE;
7271
7272          /* Otherwise, check for recursion here. The name table does not exist
7273          in the first pass; instead we must scan the list of names encountered
7274          so far in order to get the number. If the name is not found, leave
7275          the value of recno as 0 for a forward reference. */
7276
7277          else
7278            {
7279            ng = cd->named_groups;
7280            for (i = 0; i < cd->names_found; i++, ng++)
7281              {
7282              if (namelen == ng->length &&
7283                  STRNCMP_UC_UC(name, ng->name, namelen) == 0)
7284                {
7285                open_capitem *oc;
7286                recno = ng->number;
7287                if (is_recurse) break;
7288                for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7289                  {
7290                  if (oc->number == recno)
7291                    {
7292                    oc->flag = TRUE;
7293                    break;
7294                    }
7295                  }
7296                }
7297              }
7298            }
7299          }
7300
7301        /* In the real compile, search the name table. We check the name
7302        first, and then check that we have reached the end of the name in the
7303        table. That way, if the name is longer than any in the table, the
7304        comparison will fail without reading beyond the table entry. */
7305
7306        else
7307          {
7308          slot = cd->name_table;
7309          for (i = 0; i < cd->names_found; i++)
7310            {
7311            if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
7312                slot[IMM2_SIZE+namelen] == 0)
7313              break;
7314            slot += cd->name_entry_size;
7315            }
7316
7317          if (i < cd->names_found)
7318            {
7319            recno = GET2(slot, 0);
7320            }
7321          else
7322            {
7323            *errorcodeptr = ERR15;
7324            goto FAILED;
7325            }
7326          }
7327
7328        /* In both phases, for recursions, we can now go to the code that
7329        handles numerical recursion. */
7330
7331        if (is_recurse) goto HANDLE_RECURSION;
7332
7333        /* In the second pass we must see if the name is duplicated. If so, we
7334        generate a different opcode. */
7335
7336        if (lengthptr == NULL && cd->dupnames)
7337          {
7338          int count = 1;
7339          unsigned int index = i;
7340          pcre_uchar *cslot = slot + cd->name_entry_size;
7341
7342          for (i++; i < cd->names_found; i++)
7343            {
7344            if (STRCMP_UC_UC(slot + IMM2_SIZE, cslot + IMM2_SIZE) != 0) break;
7345            count++;
7346            cslot += cd->name_entry_size;
7347            }
7348
7349          if (count > 1)
7350            {
7351            if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7352            previous = code;
7353            item_hwm_offset = cd->hwm - cd->start_workspace;
7354            *code++ = ((options & PCRE_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7355            PUT2INC(code, 0, index);
7356            PUT2INC(code, 0, count);
7357
7358            /* Process each potentially referenced group. */
7359
7360            for (; slot < cslot; slot += cd->name_entry_size)
7361              {
7362              open_capitem *oc;
7363              recno = GET2(slot, 0);
7364              cd->backref_map |= (recno < 32)? (1 << recno) : 1;
7365              if (recno > cd->top_backref) cd->top_backref = recno;
7366
7367              /* Check to see if this back reference is recursive, that is, it
7368              is inside the group that it references. A flag is set so that the
7369              group can be made atomic. */
7370
7371              for (oc = cd->open_caps; oc != NULL; oc = oc->next)
7372                {
7373                if (oc->number == recno)
7374                  {
7375                  oc->flag = TRUE;
7376                  break;
7377                  }
7378                }
7379              }
7380
7381            continue;  /* End of back ref handling */
7382            }
7383          }
7384
7385        /* First pass, or a non-duplicated name. */
7386
7387        goto HANDLE_REFERENCE;
7388
7389
7390        /* ------------------------------------------------------------ */
7391        case CHAR_R:              /* Recursion, same as (?0) */
7392        recno = 0;
7393        if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
7394          {
7395          *errorcodeptr = ERR29;
7396          goto FAILED;
7397          }
7398        goto HANDLE_RECURSION;
7399
7400
7401        /* ------------------------------------------------------------ */
7402        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
7403        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
7404        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
7405          {
7406          const pcre_uchar *called;
7407          terminator = CHAR_RIGHT_PARENTHESIS;
7408
7409          /* Come here from the \g<...> and \g'...' code (Oniguruma
7410          compatibility). However, the syntax has been checked to ensure that
7411          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
7412          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
7413          ever be taken. */
7414
7415          HANDLE_NUMERICAL_RECURSION:
7416
7417          if ((refsign = *ptr) == CHAR_PLUS)
7418            {
7419            ptr++;
7420            if (!IS_DIGIT(*ptr))
7421              {
7422              *errorcodeptr = ERR63;
7423              goto FAILED;
7424              }
7425            }
7426          else if (refsign == CHAR_MINUS)
7427            {
7428            if (!IS_DIGIT(ptr[1]))
7429              goto OTHER_CHAR_AFTER_QUERY;
7430            ptr++;
7431            }
7432
7433          recno = 0;
7434          while(IS_DIGIT(*ptr))
7435            {
7436            if (recno > INT_MAX / 10 - 1) /* Integer overflow */
7437              {
7438              while (IS_DIGIT(*ptr)) ptr++;
7439              *errorcodeptr = ERR61;
7440              goto FAILED;
7441              }
7442            recno = recno * 10 + *ptr++ - CHAR_0;
7443            }
7444
7445          if (*ptr != (pcre_uchar)terminator)
7446            {
7447            *errorcodeptr = ERR29;
7448            goto FAILED;
7449            }
7450
7451          if (refsign == CHAR_MINUS)
7452            {
7453            if (recno == 0)
7454              {
7455              *errorcodeptr = ERR58;
7456              goto FAILED;
7457              }
7458            recno = cd->bracount - recno + 1;
7459            if (recno <= 0)
7460              {
7461              *errorcodeptr = ERR15;
7462              goto FAILED;
7463              }
7464            }
7465          else if (refsign == CHAR_PLUS)
7466            {
7467            if (recno == 0)
7468              {
7469              *errorcodeptr = ERR58;
7470              goto FAILED;
7471              }
7472            recno += cd->bracount;
7473            }
7474
7475          /* Come here from code above that handles a named recursion */
7476
7477          HANDLE_RECURSION:
7478
7479          previous = code;
7480          item_hwm_offset = cd->hwm - cd->start_workspace;
7481          called = cd->start_code;
7482
7483          /* When we are actually compiling, find the bracket that is being
7484          referenced. Temporarily end the regex in case it doesn't exist before
7485          this point. If we end up with a forward reference, first check that
7486          the bracket does occur later so we can give the error (and position)
7487          now. Then remember this forward reference in the workspace so it can
7488          be filled in at the end. */
7489
7490          if (lengthptr == NULL)
7491            {
7492            *code = OP_END;
7493            if (recno != 0)
7494              called = PRIV(find_bracket)(cd->start_code, utf, recno);
7495
7496            /* Forward reference */
7497
7498            if (called == NULL)
7499              {
7500              if (recno > cd->final_bracount)
7501                {
7502                *errorcodeptr = ERR15;
7503                goto FAILED;
7504                }
7505
7506              /* Fudge the value of "called" so that when it is inserted as an
7507              offset below, what is actually inserted is the reference number
7508              of the group. Then remember the forward reference. */
7509
7510              called = cd->start_code + recno;
7511              if (cd->hwm >= cd->start_workspace + cd->workspace_size -
7512                  WORK_SIZE_SAFETY_MARGIN)
7513                {
7514                *errorcodeptr = expand_workspace(cd);
7515                if (*errorcodeptr != 0) goto FAILED;
7516                }
7517              PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
7518              }
7519
7520            /* If not a forward reference, and the subpattern is still open,
7521            this is a recursive call. We check to see if this is a left
7522            recursion that could loop for ever, and diagnose that case. We
7523            must not, however, do this check if we are in a conditional
7524            subpattern because the condition might be testing for recursion in
7525            a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
7526            Forever loops are also detected at runtime, so those that occur in
7527            conditional subpatterns will be picked up then. */
7528
7529            else if (GET(called, 1) == 0 && cond_depth <= 0 &&
7530                     could_be_empty(called, code, bcptr, utf, cd))
7531              {
7532              *errorcodeptr = ERR40;
7533              goto FAILED;
7534              }
7535            }
7536
7537          /* Insert the recursion/subroutine item. It does not have a set first
7538          character (relevant if it is repeated, because it will then be
7539          wrapped with ONCE brackets). */
7540
7541          *code = OP_RECURSE;
7542          PUT(code, 1, (int)(called - cd->start_code));
7543          code += 1 + LINK_SIZE;
7544          groupsetfirstchar = FALSE;
7545          }
7546
7547        /* Can't determine a first byte now */
7548
7549        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
7550        continue;
7551
7552
7553        /* ------------------------------------------------------------ */
7554        default:              /* Other characters: check option setting */
7555        OTHER_CHAR_AFTER_QUERY:
7556        set = unset = 0;
7557        optset = &set;
7558
7559        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
7560          {
7561          switch (*ptr++)
7562            {
7563            case CHAR_MINUS: optset = &unset; break;
7564
7565            case CHAR_J:    /* Record that it changed in the external options */
7566            *optset |= PCRE_DUPNAMES;
7567            cd->external_flags |= PCRE_JCHANGED;
7568            break;
7569
7570            case CHAR_i: *optset |= PCRE_CASELESS; break;
7571            case CHAR_m: *optset |= PCRE_MULTILINE; break;
7572            case CHAR_s: *optset |= PCRE_DOTALL; break;
7573            case CHAR_x: *optset |= PCRE_EXTENDED; break;
7574            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
7575            case CHAR_X: *optset |= PCRE_EXTRA; break;
7576
7577            default:  *errorcodeptr = ERR12;
7578                      ptr--;    /* Correct the offset */
7579                      goto FAILED;
7580            }
7581          }
7582
7583        /* Set up the changed option bits, but don't change anything yet. */
7584
7585        newoptions = (options | set) & (~unset);
7586
7587        /* If the options ended with ')' this is not the start of a nested
7588        group with option changes, so the options change at this level. If this
7589        item is right at the start of the pattern, the options can be
7590        abstracted and made external in the pre-compile phase, and ignored in
7591        the compile phase. This can be helpful when matching -- for instance in
7592        caseless checking of required bytes.
7593
7594        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
7595        definitely *not* at the start of the pattern because something has been
7596        compiled. In the pre-compile phase, however, the code pointer can have
7597        that value after the start, because it gets reset as code is discarded
7598        during the pre-compile. However, this can happen only at top level - if
7599        we are within parentheses, the starting BRA will still be present. At
7600        any parenthesis level, the length value can be used to test if anything
7601        has been compiled at that level. Thus, a test for both these conditions
7602        is necessary to ensure we correctly detect the start of the pattern in
7603        both phases.
7604
7605        If we are not at the pattern start, reset the greedy defaults and the
7606        case value for firstchar and reqchar. */
7607
7608        if (*ptr == CHAR_RIGHT_PARENTHESIS)
7609          {
7610          if (code == cd->start_code + 1 + LINK_SIZE &&
7611               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
7612            {
7613            cd->external_options = newoptions;
7614            }
7615          else
7616            {
7617            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
7618            greedy_non_default = greedy_default ^ 1;
7619            req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
7620            }
7621
7622          /* Change options at this level, and pass them back for use
7623          in subsequent branches. */
7624
7625          *optionsptr = options = newoptions;
7626          previous = NULL;       /* This item can't be repeated */
7627          continue;              /* It is complete */
7628          }
7629
7630        /* If the options ended with ':' we are heading into a nested group
7631        with possible change of options. Such groups are non-capturing and are
7632        not assertions of any kind. All we need to do is skip over the ':';
7633        the newoptions value is handled below. */
7634
7635        bravalue = OP_BRA;
7636        ptr++;
7637        }     /* End of switch for character following (? */
7638      }       /* End of (? handling */
7639
7640    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
7641    is set, all unadorned brackets become non-capturing and behave like (?:...)
7642    brackets. */
7643
7644    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
7645      {
7646      bravalue = OP_BRA;
7647      }
7648
7649    /* Else we have a capturing group. */
7650
7651    else
7652      {
7653      NUMBERED_GROUP:
7654      cd->bracount += 1;
7655      PUT2(code, 1+LINK_SIZE, cd->bracount);
7656      skipbytes = IMM2_SIZE;
7657      }
7658
7659    /* Process nested bracketed regex. First check for parentheses nested too
7660    deeply. */
7661
7662    if ((cd->parens_depth += 1) > PARENS_NEST_LIMIT)
7663      {
7664      *errorcodeptr = ERR82;
7665      goto FAILED;
7666      }
7667
7668    /* All assertions used not to be repeatable, but this was changed for Perl
7669    compatibility. All kinds can now be repeated except for assertions that are
7670    conditions (Perl also forbids these to be repeated). We copy code into a
7671    non-register variable (tempcode) in order to be able to pass its address
7672    because some compilers complain otherwise. At the start of a conditional
7673    group whose condition is an assertion, cd->iscondassert is set. We unset it
7674    here so as to allow assertions later in the group to be quantified. */
7675
7676    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7677        cd->iscondassert)
7678      {
7679      previous = NULL;
7680      cd->iscondassert = FALSE;
7681      }
7682    else
7683      {
7684      previous = code;
7685      item_hwm_offset = cd->hwm - cd->start_workspace;
7686      }
7687
7688    *code = bravalue;
7689    tempcode = code;
7690    tempreqvary = cd->req_varyopt;        /* Save value before bracket */
7691    tempbracount = cd->bracount;          /* Save value before bracket */
7692    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7693
7694    if (!compile_regex(
7695         newoptions,                      /* The complete new option state */
7696         &tempcode,                       /* Where to put code (updated) */
7697         &ptr,                            /* Input pointer (updated) */
7698         errorcodeptr,                    /* Where to put an error message */
7699         (bravalue == OP_ASSERTBACK ||
7700          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7701         reset_bracount,                  /* True if (?| group */
7702         skipbytes,                       /* Skip over bracket number */
7703         cond_depth +
7704           ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7705         &subfirstchar,                   /* For possible first char */
7706         &subfirstcharflags,
7707         &subreqchar,                     /* For possible last char */
7708         &subreqcharflags,
7709         bcptr,                           /* Current branch chain */
7710         cd,                              /* Tables block */
7711         (lengthptr == NULL)? NULL :      /* Actual compile phase */
7712           &length_prevgroup              /* Pre-compile phase */
7713         ))
7714      goto FAILED;
7715
7716    cd->parens_depth -= 1;
7717
7718    /* If this was an atomic group and there are no capturing groups within it,
7719    generate OP_ONCE_NC instead of OP_ONCE. */
7720
7721    if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
7722      *code = OP_ONCE_NC;
7723
7724    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7725      cd->assert_depth -= 1;
7726
7727    /* At the end of compiling, code is still pointing to the start of the
7728    group, while tempcode has been updated to point past the end of the group.
7729    The pattern pointer (ptr) is on the bracket.
7730
7731    If this is a conditional bracket, check that there are no more than
7732    two branches in the group, or just one if it's a DEFINE group. We do this
7733    in the real compile phase, not in the pre-pass, where the whole group may
7734    not be available. */
7735
7736    if (bravalue == OP_COND && lengthptr == NULL)
7737      {
7738      pcre_uchar *tc = code;
7739      int condcount = 0;
7740
7741      do {
7742         condcount++;
7743         tc += GET(tc,1);
7744         }
7745      while (*tc != OP_KET);
7746
7747      /* A DEFINE group is never obeyed inline (the "condition" is always
7748      false). It must have only one branch. */
7749
7750      if (code[LINK_SIZE+1] == OP_DEF)
7751        {
7752        if (condcount > 1)
7753          {
7754          *errorcodeptr = ERR54;
7755          goto FAILED;
7756          }
7757        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
7758        }
7759
7760      /* A "normal" conditional group. If there is just one branch, we must not
7761      make use of its firstchar or reqchar, because this is equivalent to an
7762      empty second branch. */
7763
7764      else
7765        {
7766        if (condcount > 2)
7767          {
7768          *errorcodeptr = ERR27;
7769          goto FAILED;
7770          }
7771        if (condcount == 1) subfirstcharflags = subreqcharflags = REQ_NONE;
7772        }
7773      }
7774
7775    /* Error if hit end of pattern */
7776
7777    if (*ptr != CHAR_RIGHT_PARENTHESIS)
7778      {
7779      *errorcodeptr = ERR14;
7780      goto FAILED;
7781      }
7782
7783    /* In the pre-compile phase, update the length by the length of the group,
7784    less the brackets at either end. Then reduce the compiled code to just a
7785    set of non-capturing brackets so that it doesn't use much memory if it is
7786    duplicated by a quantifier.*/
7787
7788    if (lengthptr != NULL)
7789      {
7790      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7791        {
7792        *errorcodeptr = ERR20;
7793        goto FAILED;
7794        }
7795      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7796      code++;   /* This already contains bravalue */
7797      PUTINC(code, 0, 1 + LINK_SIZE);
7798      *code++ = OP_KET;
7799      PUTINC(code, 0, 1 + LINK_SIZE);
7800      break;    /* No need to waste time with special character handling */
7801      }
7802
7803    /* Otherwise update the main code pointer to the end of the group. */
7804
7805    code = tempcode;
7806
7807    /* For a DEFINE group, required and first character settings are not
7808    relevant. */
7809
7810    if (bravalue == OP_DEF) break;
7811
7812    /* Handle updating of the required and first characters for other types of
7813    group. Update for normal brackets of all kinds, and conditions with two
7814    branches (see code above). If the bracket is followed by a quantifier with
7815    zero repeat, we have to back off. Hence the definition of zeroreqchar and
7816    zerofirstchar outside the main loop so that they can be accessed for the
7817    back off. */
7818
7819    zeroreqchar = reqchar;
7820    zeroreqcharflags = reqcharflags;
7821    zerofirstchar = firstchar;
7822    zerofirstcharflags = firstcharflags;
7823    groupsetfirstchar = FALSE;
7824
7825    if (bravalue >= OP_ONCE)
7826      {
7827      /* If we have not yet set a firstchar in this branch, take it from the
7828      subpattern, remembering that it was set here so that a repeat of more
7829      than one can replicate it as reqchar if necessary. If the subpattern has
7830      no firstchar, set "none" for the whole branch. In both cases, a zero
7831      repeat forces firstchar to "none". */
7832
7833      if (firstcharflags == REQ_UNSET)
7834        {
7835        if (subfirstcharflags >= 0)
7836          {
7837          firstchar = subfirstchar;
7838          firstcharflags = subfirstcharflags;
7839          groupsetfirstchar = TRUE;
7840          }
7841        else firstcharflags = REQ_NONE;
7842        zerofirstcharflags = REQ_NONE;
7843        }
7844
7845      /* If firstchar was previously set, convert the subpattern's firstchar
7846      into reqchar if there wasn't one, using the vary flag that was in
7847      existence beforehand. */
7848
7849      else if (subfirstcharflags >= 0 && subreqcharflags < 0)
7850        {
7851        subreqchar = subfirstchar;
7852        subreqcharflags = subfirstcharflags | tempreqvary;
7853        }
7854
7855      /* If the subpattern set a required byte (or set a first byte that isn't
7856      really the first byte - see above), set it. */
7857
7858      if (subreqcharflags >= 0)
7859        {
7860        reqchar = subreqchar;
7861        reqcharflags = subreqcharflags;
7862        }
7863      }
7864
7865    /* For a forward assertion, we take the reqchar, if set. This can be
7866    helpful if the pattern that follows the assertion doesn't set a different
7867    char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
7868    for an assertion, however because it leads to incorrect effect for patterns
7869    such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
7870    of a firstchar. This is overcome by a scan at the end if there's no
7871    firstchar, looking for an asserted first char. */
7872
7873    else if (bravalue == OP_ASSERT && subreqcharflags >= 0)
7874      {
7875      reqchar = subreqchar;
7876      reqcharflags = subreqcharflags;
7877      }
7878    break;     /* End of processing '(' */
7879
7880
7881    /* ===================================================================*/
7882    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7883    are arranged to be the negation of the corresponding OP_values in the
7884    default case when PCRE_UCP is not set. For the back references, the values
7885    are negative the reference number. Only back references and those types
7886    that consume a character may be repeated. We can test for values between
7887    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7888    ever created. */
7889
7890    case CHAR_BACKSLASH:
7891    tempptr = ptr;
7892    escape = check_escape(&ptr, &ec, errorcodeptr, cd->bracount, options, FALSE);
7893    if (*errorcodeptr != 0) goto FAILED;
7894
7895    if (escape == 0)                  /* The escape coded a single character */
7896      c = ec;
7897    else
7898      {
7899      if (escape == ESC_Q)            /* Handle start of quoted string */
7900        {
7901        if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
7902          ptr += 2;               /* avoid empty string */
7903            else inescq = TRUE;
7904        continue;
7905        }
7906
7907      if (escape == ESC_E) continue;  /* Perl ignores an orphan \E */
7908
7909      /* For metasequences that actually match a character, we disable the
7910      setting of a first character if it hasn't already been set. */
7911
7912      if (firstcharflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7913        firstcharflags = REQ_NONE;
7914
7915      /* Set values to reset to if this is followed by a zero repeat. */
7916
7917      zerofirstchar = firstchar;
7918      zerofirstcharflags = firstcharflags;
7919      zeroreqchar = reqchar;
7920      zeroreqcharflags = reqcharflags;
7921
7922      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7923      is a subroutine call by number (Oniguruma syntax). In fact, the value
7924      ESC_g is returned only for these cases. So we don't need to check for <
7925      or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7926      -n, and for the Perl syntax \g{name} the result is ESC_k (as
7927      that is a synonym for a named back reference). */
7928
7929      if (escape == ESC_g)
7930        {
7931        const pcre_uchar *p;
7932        pcre_uint32 cf;
7933
7934        item_hwm_offset = cd->hwm - cd->start_workspace;   /* Normally this is set when '(' is read */
7935        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7936          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7937
        /* These two statements stop the compiler from warning about possibly
7939        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7940        fact, because we do the check for a number below, the paths that
7941        would actually be in error are never taken. */
7942
7943        skipbytes = 0;
7944        reset_bracount = FALSE;
7945
7946        /* If it's not a signed or unsigned number, treat it as a name. */
7947
7948        cf = ptr[1];
7949        if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7950          {
7951          is_recurse = TRUE;
7952          goto NAMED_REF_OR_RECURSE;
7953          }
7954
7955        /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7956        or a digit. */
7957
7958        p = ptr + 2;
7959        while (IS_DIGIT(*p)) p++;
7960        if (*p != (pcre_uchar)terminator)
7961          {
7962          *errorcodeptr = ERR57;
7963          goto FAILED;
7964          }
7965        ptr++;
7966        goto HANDLE_NUMERICAL_RECURSION;
7967        }
7968
7969      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7970      We also support \k{name} (.NET syntax).  */
7971
7972      if (escape == ESC_k)
7973        {
7974        if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7975          ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7976          {
7977          *errorcodeptr = ERR69;
7978          goto FAILED;
7979          }
7980        is_recurse = FALSE;
7981        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7982          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7983          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7984        goto NAMED_REF_OR_RECURSE;
7985        }
7986
7987      /* Back references are handled specially; must disable firstchar if
7988      not set to cope with cases like (?=(\w+))\1: which would otherwise set
7989      ':' later. */
7990
7991      if (escape < 0)
7992        {
7993        open_capitem *oc;
7994        recno = -escape;
7995
7996        /* Come here from named backref handling when the reference is to a
        single group (i.e. not to a duplicated name). */
7998
7999        HANDLE_REFERENCE:
8000        if (firstcharflags == REQ_UNSET) firstcharflags = REQ_NONE;
8001        previous = code;
8002        item_hwm_offset = cd->hwm - cd->start_workspace;
8003        *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
8004        PUT2INC(code, 0, recno);
8005        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
8006        if (recno > cd->top_backref) cd->top_backref = recno;
8007
        /* Check to see if this back reference is recursive, that is, it
8009        is inside the group that it references. A flag is set so that the
8010        group can be made atomic. */
8011
8012        for (oc = cd->open_caps; oc != NULL; oc = oc->next)
8013          {
8014          if (oc->number == recno)
8015            {
8016            oc->flag = TRUE;
8017            break;
8018            }
8019          }
8020        }
8021
8022      /* So are Unicode property matches, if supported. */
8023
8024#ifdef SUPPORT_UCP
8025      else if (escape == ESC_P || escape == ESC_p)
8026        {
8027        BOOL negated;
8028        unsigned int ptype = 0, pdata = 0;
8029        if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr))
8030          goto FAILED;
8031        previous = code;
8032        item_hwm_offset = cd->hwm - cd->start_workspace;
8033        *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
8034        *code++ = ptype;
8035        *code++ = pdata;
8036        }
8037#else
8038
8039      /* If Unicode properties are not supported, \X, \P, and \p are not
8040      allowed. */
8041
8042      else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
8043        {
8044        *errorcodeptr = ERR45;
8045        goto FAILED;
8046        }
8047#endif
8048
8049      /* For the rest (including \X when Unicode properties are supported), we
8050      can obtain the OP value by negating the escape value in the default
8051      situation when PCRE_UCP is not set. When it *is* set, we substitute
8052      Unicode property tests. Note that \b and \B do a one-character
8053      lookbehind, and \A also behaves as if it does. */
8054
8055      else
8056        {
8057        if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
8058             cd->max_lookbehind == 0)
8059          cd->max_lookbehind = 1;
8060#ifdef SUPPORT_UCP
8061        if (escape >= ESC_DU && escape <= ESC_wu)
8062          {
8063          nestptr = ptr + 1;                   /* Where to resume */
8064          ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
8065          }
8066        else
8067#endif
8068        /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
8069        so that it works in DFA mode and in lookbehinds. */
8070
8071          {
8072          previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
8073          item_hwm_offset = cd->hwm - cd->start_workspace;
8074          *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
8075          }
8076        }
8077      continue;
8078      }
8079
8080    /* We have a data character whose value is in c. In UTF-8 mode it may have
8081    a value > 127. We set its representation in the length/buffer, and then
8082    handle it as a data character. */
8083
8084#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
8085    if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
8086      mclength = PRIV(ord2utf)(c, mcbuffer);
8087    else
8088#endif
8089
8090     {
8091     mcbuffer[0] = c;
8092     mclength = 1;
8093     }
8094    goto ONE_CHAR;
8095
8096
8097    /* ===================================================================*/
8098    /* Handle a literal character. It is guaranteed not to be whitespace or #
8099    when the extended flag is set. If we are in a UTF mode, it may be a
8100    multi-unit literal character. */
8101
8102    default:
8103    NORMAL_CHAR:
8104    mclength = 1;
8105    mcbuffer[0] = c;
8106
8107#ifdef SUPPORT_UTF
8108    if (utf && HAS_EXTRALEN(c))
8109      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
8110#endif
8111
8112    /* At this point we have the character's bytes in mcbuffer, and the length
8113    in mclength. When not in UTF-8 mode, the length is always 1. */
8114
8115    ONE_CHAR:
8116    previous = code;
8117    item_hwm_offset = cd->hwm - cd->start_workspace;
8118
8119    /* For caseless UTF-8 mode when UCP support is available, check whether
8120    this character has more than one other case. If so, generate a special
8121    OP_PROP item instead of OP_CHARI. */
8122
8123#ifdef SUPPORT_UCP
8124    if (utf && (options & PCRE_CASELESS) != 0)
8125      {
8126      GETCHAR(c, mcbuffer);
8127      if ((c = UCD_CASESET(c)) != 0)
8128        {
8129        *code++ = OP_PROP;
8130        *code++ = PT_CLIST;
8131        *code++ = c;
8132        if (firstcharflags == REQ_UNSET)
8133          firstcharflags = zerofirstcharflags = REQ_NONE;
8134        break;
8135        }
8136      }
8137#endif
8138
8139    /* Caseful matches, or not one of the multicase characters. */
8140
8141    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8142    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
8143
8144    /* Remember if \r or \n were seen */
8145
8146    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8147      cd->external_flags |= PCRE_HASCRORLF;
8148
8149    /* Set the first and required bytes appropriately. If no previous first
8150    byte, set it from this character, but revert to none on a zero repeat.
8151    Otherwise, leave the firstchar value alone, and don't change it on a zero
8152    repeat. */
8153
8154    if (firstcharflags == REQ_UNSET)
8155      {
8156      zerofirstcharflags = REQ_NONE;
8157      zeroreqchar = reqchar;
8158      zeroreqcharflags = reqcharflags;
8159
8160      /* If the character is more than one byte long, we can set firstchar
8161      only if it is not to be matched caselessly. */
8162
8163      if (mclength == 1 || req_caseopt == 0)
8164        {
8165        firstchar = mcbuffer[0] | req_caseopt;
8166        firstchar = mcbuffer[0];
8167        firstcharflags = req_caseopt;
8168
8169        if (mclength != 1)
8170          {
8171          reqchar = code[-1];
8172          reqcharflags = cd->req_varyopt;
8173          }
8174        }
8175      else firstcharflags = reqcharflags = REQ_NONE;
8176      }
8177
8178    /* firstchar was previously set; we can set reqchar only if the length is
8179    1 or the matching is caseful. */
8180
8181    else
8182      {
8183      zerofirstchar = firstchar;
8184      zerofirstcharflags = firstcharflags;
8185      zeroreqchar = reqchar;
8186      zeroreqcharflags = reqcharflags;
8187      if (mclength == 1 || req_caseopt == 0)
8188        {
8189        reqchar = code[-1];
8190        reqcharflags = req_caseopt | cd->req_varyopt;
8191        }
8192      }
8193
8194    break;            /* End of literal character handling */
8195    }
8196  }                   /* end of big loop */
8197
8198
8199/* Control never reaches here by falling through, only by a goto for all the
8200error states. Pass back the position in the pattern so that it can be displayed
8201to the user for diagnosing the error. */
8202
8203FAILED:
8204*ptrptr = ptr;
8205return FALSE;
8206}
8207
8208
8209
8210/*************************************************
8211*     Compile sequence of alternatives           *
8212*************************************************/
8213
8214/* On entry, ptr is pointing past the bracket character, but on return it
8215points to the closing bracket, or vertical bar, or end of string. The code
8216variable is pointing at the byte into which the BRA operator has been stored.
8217This function is used during the pre-compile phase when we are trying to find
8218out the amount of memory needed, as well as during the real compile phase. The
8219value of lengthptr distinguishes the two phases.
8220
8221Arguments:
8222  options           option bits, including any changes for this subpattern
8223  codeptr           -> the address of the current code pointer
8224  ptrptr            -> the address of the current pattern pointer
8225  errorcodeptr      -> pointer to error code variable
8226  lookbehind        TRUE if this is a lookbehind assertion
8227  reset_bracount    TRUE to reset the count for each branch
8228  skipbytes         skip this many bytes at start (for brackets and OP_COND)
8229  cond_depth        depth of nesting for conditional subpatterns
8230  firstcharptr      place to put the first required character
8231  firstcharflagsptr place to put the first character flags, or a negative number
8232  reqcharptr        place to put the last required character
8233  reqcharflagsptr   place to put the last required character flags, or a negative number
8234  bcptr             pointer to the chain of currently open branches
8235  cd                points to the data block with tables pointers etc.
8236  lengthptr         NULL during the real compile phase
8237                    points to length accumulator during pre-compile phase
8238
8239Returns:            TRUE on success
8240*/
8241
8242static BOOL
8243compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
8244  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
8245  int cond_depth,
8246  pcre_uint32 *firstcharptr, pcre_int32 *firstcharflagsptr,
8247  pcre_uint32 *reqcharptr, pcre_int32 *reqcharflagsptr,
8248  branch_chain *bcptr, compile_data *cd, int *lengthptr)
8249{
8250const pcre_uchar *ptr = *ptrptr;
8251pcre_uchar *code = *codeptr;
8252pcre_uchar *last_branch = code;
8253pcre_uchar *start_bracket = code;
8254pcre_uchar *reverse_count = NULL;
8255open_capitem capitem;
8256int capnumber = 0;
8257pcre_uint32 firstchar, reqchar;
8258pcre_int32 firstcharflags, reqcharflags;
8259pcre_uint32 branchfirstchar, branchreqchar;
8260pcre_int32 branchfirstcharflags, branchreqcharflags;
8261int length;
8262unsigned int orig_bracount;
8263unsigned int max_bracount;
8264branch_chain bc;
8265size_t save_hwm_offset;
8266
8267/* If set, call the external function that checks for stack availability. */
8268
8269if (PUBL(stack_guard) != NULL && PUBL(stack_guard)())
8270  {
8271  *errorcodeptr= ERR85;
8272  return FALSE;
8273  }
8274
8275/* Miscellaneous initialization */
8276
8277bc.outer = bcptr;
8278bc.current_branch = code;
8279
8280firstchar = reqchar = 0;
8281firstcharflags = reqcharflags = REQ_UNSET;
8282
8283save_hwm_offset = cd->hwm - cd->start_workspace;
8284
8285/* Accumulate the length for use in the pre-compile phase. Start with the
8286length of the BRA and KET and any extra bytes that are required at the
8287beginning. We accumulate in a local variable to save frequent testing of
8288lenthptr for NULL. We cannot do this by looking at the value of code at the
8289start and end of each alternative, because compiled items are discarded during
8290the pre-compile phase so that the work space is not exceeded. */
8291
8292length = 2 + 2*LINK_SIZE + skipbytes;
8293
8294/* WARNING: If the above line is changed for any reason, you must also change
8295the code that abstracts option settings at the start of the pattern and makes
8296them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
8297pre-compile phase to find out whether anything has yet been compiled or not. */
8298
8299/* If this is a capturing subpattern, add to the chain of open capturing items
8300so that we can detect them if (*ACCEPT) is encountered. This is also used to
8301detect groups that contain recursive back references to themselves. Note that
8302only OP_CBRA need be tested here; changing this opcode to one of its variants,
8303e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
8304
8305if (*code == OP_CBRA)
8306  {
8307  capnumber = GET2(code, 1 + LINK_SIZE);
8308  capitem.number = capnumber;
8309  capitem.next = cd->open_caps;
8310  capitem.flag = FALSE;
8311  cd->open_caps = &capitem;
8312  }
8313
8314/* Offset is set zero to mark that this bracket is still open */
8315
8316PUT(code, 1, 0);
8317code += 1 + LINK_SIZE + skipbytes;
8318
8319/* Loop for each alternative branch */
8320
8321orig_bracount = max_bracount = cd->bracount;
8322for (;;)
8323  {
8324  /* For a (?| group, reset the capturing bracket count so that each branch
8325  uses the same numbers. */
8326
8327  if (reset_bracount) cd->bracount = orig_bracount;
8328
8329  /* Set up dummy OP_REVERSE if lookbehind assertion */
8330
8331  if (lookbehind)
8332    {
8333    *code++ = OP_REVERSE;
8334    reverse_count = code;
8335    PUTINC(code, 0, 0);
8336    length += 1 + LINK_SIZE;
8337    }
8338
8339  /* Now compile the branch; in the pre-compile phase its length gets added
8340  into the length. */
8341
8342  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
8343        &branchfirstcharflags, &branchreqchar, &branchreqcharflags, &bc,
8344        cond_depth, cd, (lengthptr == NULL)? NULL : &length))
8345    {
8346    *ptrptr = ptr;
8347    return FALSE;
8348    }
8349
8350  /* Keep the highest bracket count in case (?| was used and some branch
8351  has fewer than the rest. */
8352
8353  if (cd->bracount > max_bracount) max_bracount = cd->bracount;
8354
8355  /* In the real compile phase, there is some post-processing to be done. */
8356
8357  if (lengthptr == NULL)
8358    {
8359    /* If this is the first branch, the firstchar and reqchar values for the
8360    branch become the values for the regex. */
8361
8362    if (*last_branch != OP_ALT)
8363      {
8364      firstchar = branchfirstchar;
8365      firstcharflags = branchfirstcharflags;
8366      reqchar = branchreqchar;
8367      reqcharflags = branchreqcharflags;
8368      }
8369
8370    /* If this is not the first branch, the first char and reqchar have to
8371    match the values from all the previous branches, except that if the
8372    previous value for reqchar didn't have REQ_VARY set, it can still match,
8373    and we set REQ_VARY for the regex. */
8374
8375    else
8376      {
8377      /* If we previously had a firstchar, but it doesn't match the new branch,
8378      we have to abandon the firstchar for the regex, but if there was
8379      previously no reqchar, it takes on the value of the old firstchar. */
8380
8381      if (firstcharflags >= 0 &&
8382          (firstcharflags != branchfirstcharflags || firstchar != branchfirstchar))
8383        {
8384        if (reqcharflags < 0)
8385          {
8386          reqchar = firstchar;
8387          reqcharflags = firstcharflags;
8388          }
8389        firstcharflags = REQ_NONE;
8390        }
8391
8392      /* If we (now or from before) have no firstchar, a firstchar from the
8393      branch becomes a reqchar if there isn't a branch reqchar. */
8394
8395      if (firstcharflags < 0 && branchfirstcharflags >= 0 && branchreqcharflags < 0)
8396        {
8397        branchreqchar = branchfirstchar;
8398        branchreqcharflags = branchfirstcharflags;
8399        }
8400
8401      /* Now ensure that the reqchars match */
8402
8403      if (((reqcharflags & ~REQ_VARY) != (branchreqcharflags & ~REQ_VARY)) ||
8404          reqchar != branchreqchar)
8405        reqcharflags = REQ_NONE;
8406      else
8407        {
8408        reqchar = branchreqchar;
8409        reqcharflags |= branchreqcharflags; /* To "or" REQ_VARY */
8410        }
8411      }
8412
8413    /* If lookbehind, check that this branch matches a fixed-length string, and
8414    put the length into the OP_REVERSE item. Temporarily mark the end of the
8415    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
8416    because there may be forward references that we can't check here. Set a
8417    flag to cause another lookbehind check at the end. Why not do it all at the
8418    end? Because common, erroneous checks are picked up here and the offset of
8419    the problem can be shown. */
8420
8421    if (lookbehind)
8422      {
8423      int fixed_length;
8424      *code = OP_END;
8425      fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
8426        FALSE, cd, NULL);
8427      DPRINTF(("fixed length = %d\n", fixed_length));
8428      if (fixed_length == -3)
8429        {
8430        cd->check_lookbehind = TRUE;
8431        }
8432      else if (fixed_length < 0)
8433        {
8434        *errorcodeptr = (fixed_length == -2)? ERR36 :
8435                        (fixed_length == -4)? ERR70: ERR25;
8436        *ptrptr = ptr;
8437        return FALSE;
8438        }
8439      else
8440        {
8441        if (fixed_length > cd->max_lookbehind)
8442          cd->max_lookbehind = fixed_length;
8443        PUT(reverse_count, 0, fixed_length);
8444        }
8445      }
8446    }
8447
8448  /* Reached end of expression, either ')' or end of pattern. In the real
8449  compile phase, go back through the alternative branches and reverse the chain
8450  of offsets, with the field in the BRA item now becoming an offset to the
8451  first alternative. If there are no alternatives, it points to the end of the
8452  group. The length in the terminating ket is always the length of the whole
8453  bracketed item. Return leaving the pointer at the terminating char. */
8454
8455  if (*ptr != CHAR_VERTICAL_LINE)
8456    {
8457    if (lengthptr == NULL)
8458      {
8459      int branch_length = (int)(code - last_branch);
8460      do
8461        {
8462        int prev_length = GET(last_branch, 1);
8463        PUT(last_branch, 1, branch_length);
8464        branch_length = prev_length;
8465        last_branch -= branch_length;
8466        }
8467      while (branch_length > 0);
8468      }
8469
8470    /* Fill in the ket */
8471
8472    *code = OP_KET;
8473    PUT(code, 1, (int)(code - start_bracket));
8474    code += 1 + LINK_SIZE;
8475
8476    /* If it was a capturing subpattern, check to see if it contained any
8477    recursive back references. If so, we must wrap it in atomic brackets.
8478    Because we are moving code along, we must ensure that any pending recursive
8479    references are updated. In any event, remove the block from the chain. */
8480
8481    if (capnumber > 0)
8482      {
8483      if (cd->open_caps->flag)
8484        {
8485        *code = OP_END;
8486        adjust_recurse(start_bracket, 1 + LINK_SIZE,
8487          (options & PCRE_UTF8) != 0, cd, save_hwm_offset);
8488        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
8489          IN_UCHARS(code - start_bracket));
8490        *start_bracket = OP_ONCE;
8491        code += 1 + LINK_SIZE;
8492        PUT(start_bracket, 1, (int)(code - start_bracket));
8493        *code = OP_KET;
8494        PUT(code, 1, (int)(code - start_bracket));
8495        code += 1 + LINK_SIZE;
8496        length += 2 + 2*LINK_SIZE;
8497        }
8498      cd->open_caps = cd->open_caps->next;
8499      }
8500
8501    /* Retain the highest bracket number, in case resetting was used. */
8502
8503    cd->bracount = max_bracount;
8504
8505    /* Set values to pass back */
8506
8507    *codeptr = code;
8508    *ptrptr = ptr;
8509    *firstcharptr = firstchar;
8510    *firstcharflagsptr = firstcharflags;
8511    *reqcharptr = reqchar;
8512    *reqcharflagsptr = reqcharflags;
8513    if (lengthptr != NULL)
8514      {
8515      if (OFLOW_MAX - *lengthptr < length)
8516        {
8517        *errorcodeptr = ERR20;
8518        return FALSE;
8519        }
8520      *lengthptr += length;
8521      }
8522    return TRUE;
8523    }
8524
8525  /* Another branch follows. In the pre-compile phase, we can move the code
8526  pointer back to where it was for the start of the first branch. (That is,
8527  pretend that each branch is the only one.)
8528
8529  In the real compile phase, insert an ALT node. Its length field points back
8530  to the previous branch while the bracket remains open. At the end the chain
8531  is reversed. It's done like this so that the start of the bracket has a
8532  zero offset until it is closed, making it possible to detect recursion. */
8533
8534  if (lengthptr != NULL)
8535    {
8536    code = *codeptr + 1 + LINK_SIZE + skipbytes;
8537    length += 1 + LINK_SIZE;
8538    }
8539  else
8540    {
8541    *code = OP_ALT;
8542    PUT(code, 1, (int)(code - last_branch));
8543    bc.current_branch = last_branch = code;
8544    code += 1 + LINK_SIZE;
8545    }
8546
8547  ptr++;
8548  }
8549/* Control never reaches here */
8550}
8551
8552
8553
8554
8555/*************************************************
8556*          Check for anchored expression         *
8557*************************************************/
8558
8559/* Try to find out if this is an anchored regular expression. Consider each
8560alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8561all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8562it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8563be found, because ^ generates OP_CIRCM in that mode.
8564
8565We can also consider a regex to be anchored if OP_SOM starts all its branches.
8566This is the code for \G, which means "match at start of match position, taking
8567into account the match offset".
8568
8569A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8570because that will try the rest of the pattern at all possible matching points,
8571so there is no point trying again.... er ....
8572
8573.... except when the .* appears inside capturing parentheses, and there is a
8574subsequent back reference to those parentheses. We haven't enough information
8575to catch that case precisely.
8576
8577At first, the best we could do was to detect when .* was in capturing brackets
8578and the highest back reference was greater than or equal to that level.
8579However, by keeping a bitmap of the first 31 back references, we can catch some
8580of the more common cases more precisely.
8581
8582... A second exception is when the .* appears inside an atomic group, because
8583this prevents the number of characters it matches from being adjusted.
8584
8585Arguments:
8586  code           points to start of expression (the bracket)
8587  bracket_map    a bitmap of which brackets we are inside while testing; this
8588                  handles up to substring 31; after that we just have to take
8589                  the less precise approach
8590  cd             points to the compile data block
8591  atomcount      atomic group level
8592
8593Returns:     TRUE or FALSE
8594*/
8595
8596static BOOL
8597is_anchored(register const pcre_uchar *code, unsigned int bracket_map,
8598  compile_data *cd, int atomcount)
8599{
8600do {
8601   const pcre_uchar *scode = first_significant_code(
8602     code + PRIV(OP_lengths)[*code], FALSE);
8603   register int op = *scode;
8604
8605   /* Non-capturing brackets */
8606
8607   if (op == OP_BRA  || op == OP_BRAPOS ||
8608       op == OP_SBRA || op == OP_SBRAPOS)
8609     {
8610     if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8611     }
8612
8613   /* Capturing brackets */
8614
8615   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8616            op == OP_SCBRA || op == OP_SCBRAPOS)
8617     {
8618     int n = GET2(scode, 1+LINK_SIZE);
8619     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8620     if (!is_anchored(scode, new_map, cd, atomcount)) return FALSE;
8621     }
8622
8623   /* Positive forward assertions and conditions */
8624
8625   else if (op == OP_ASSERT || op == OP_COND)
8626     {
8627     if (!is_anchored(scode, bracket_map, cd, atomcount)) return FALSE;
8628     }
8629
8630   /* Atomic groups */
8631
8632   else if (op == OP_ONCE || op == OP_ONCE_NC)
8633     {
8634     if (!is_anchored(scode, bracket_map, cd, atomcount + 1))
8635       return FALSE;
8636     }
8637
8638   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8639   it isn't in brackets that are or may be referenced or inside an atomic
8640   group. */
8641
8642   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8643             op == OP_TYPEPOSSTAR))
8644     {
8645     if (scode[1] != OP_ALLANY || (bracket_map & cd->backref_map) != 0 ||
8646         atomcount > 0 || cd->had_pruneorskip)
8647       return FALSE;
8648     }
8649
8650   /* Check for explicit anchoring */
8651
8652   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8653
8654   code += GET(code, 1);
8655   }
8656while (*code == OP_ALT);   /* Loop for each alternative */
8657return TRUE;
8658}
8659
8660
8661
8662/*************************************************
8663*         Check for starting with ^ or .*        *
8664*************************************************/
8665
8666/* This is called to find out if every branch starts with ^ or .* so that
8667"first char" processing can be done to speed things up in multiline
8668matching and for non-DOTALL patterns that start with .* (which must start at
8669the beginning or after \n). As in the case of is_anchored() (see above), we
8670have to take account of back references to capturing brackets that contain .*
8671because in that case we can't make the assumption. Also, the appearance of .*
8672inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8673count, because once again the assumption no longer holds.
8674
8675Arguments:
8676  code           points to start of expression (the bracket)
8677  bracket_map    a bitmap of which brackets we are inside while testing; this
8678                  handles up to substring 31; after that we just have to take
8679                  the less precise approach
8680  cd             points to the compile data
8681  atomcount      atomic group level
8682
8683Returns:         TRUE or FALSE
8684*/
8685
8686static BOOL
8687is_startline(const pcre_uchar *code, unsigned int bracket_map,
8688  compile_data *cd, int atomcount)
8689{
8690do {
8691   const pcre_uchar *scode = first_significant_code(
8692     code + PRIV(OP_lengths)[*code], FALSE);
8693   register int op = *scode;
8694
8695   /* If we are at the start of a conditional assertion group, *both* the
8696   conditional assertion *and* what follows the condition must satisfy the test
8697   for start of line. Other kinds of condition fail. Note that there may be an
8698   auto-callout at the start of a condition. */
8699
8700   if (op == OP_COND)
8701     {
8702     scode += 1 + LINK_SIZE;
8703     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8704     switch (*scode)
8705       {
8706       case OP_CREF:
8707       case OP_DNCREF:
8708       case OP_RREF:
8709       case OP_DNRREF:
8710       case OP_DEF:
8711       case OP_FAIL:
8712       return FALSE;
8713
8714       default:     /* Assertion */
8715       if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8716       do scode += GET(scode, 1); while (*scode == OP_ALT);
8717       scode += 1 + LINK_SIZE;
8718       break;
8719       }
8720     scode = first_significant_code(scode, FALSE);
8721     op = *scode;
8722     }
8723
8724   /* Non-capturing brackets */
8725
8726   if (op == OP_BRA  || op == OP_BRAPOS ||
8727       op == OP_SBRA || op == OP_SBRAPOS)
8728     {
8729     if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8730     }
8731
8732   /* Capturing brackets */
8733
8734   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8735            op == OP_SCBRA || op == OP_SCBRAPOS)
8736     {
8737     int n = GET2(scode, 1+LINK_SIZE);
8738     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
8739     if (!is_startline(scode, new_map, cd, atomcount)) return FALSE;
8740     }
8741
8742   /* Positive forward assertions */
8743
8744   else if (op == OP_ASSERT)
8745     {
8746     if (!is_startline(scode, bracket_map, cd, atomcount)) return FALSE;
8747     }
8748
8749   /* Atomic brackets */
8750
8751   else if (op == OP_ONCE || op == OP_ONCE_NC)
8752     {
8753     if (!is_startline(scode, bracket_map, cd, atomcount + 1)) return FALSE;
8754     }
8755
8756   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8757   brackets that may be referenced, as long as the pattern does not contain
8758   *PRUNE or *SKIP, because these break the feature. Consider, for example,
8759   /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8760   start of a line. */
8761
8762   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8763     {
8764     if (scode[1] != OP_ANY || (bracket_map & cd->backref_map) != 0 ||
8765         atomcount > 0 || cd->had_pruneorskip)
8766       return FALSE;
8767     }
8768
8769   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8770   in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8771   because the number of characters matched by .* cannot be adjusted inside
8772   them. */
8773
8774   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8775
8776   /* Move on to the next alternative */
8777
8778   code += GET(code, 1);
8779   }
8780while (*code == OP_ALT);  /* Loop for each alternative */
8781return TRUE;
8782}
8783
8784
8785
8786/*************************************************
8787*       Check for asserted fixed first char      *
8788*************************************************/
8789
8790/* During compilation, the "first char" settings from forward assertions are
8791discarded, because they can cause conflicts with actual literals that follow.
8792However, if we end up without a first char setting for an unanchored pattern,
8793it is worth scanning the regex to see if there is an initial asserted first
8794char. If all branches start with the same asserted char, or with a
8795non-conditional bracket all of whose alternatives start with the same asserted
8796char (recurse ad lib), then we return that char, with the flags set to zero or
8797REQ_CASELESS; otherwise return zero with REQ_NONE in the flags.
8798
8799Arguments:
8800  code       points to start of expression (the bracket)
8801  flags      points to the first char flags, or to REQ_NONE
8802  inassert   TRUE if in an assertion
8803
8804Returns:     the fixed first char, or 0 with REQ_NONE in flags
8805*/
8806
8807static pcre_uint32
8808find_firstassertedchar(const pcre_uchar *code, pcre_int32 *flags,
8809  BOOL inassert)
8810{
8811register pcre_uint32 c = 0;
8812int cflags = REQ_NONE;
8813
8814*flags = REQ_NONE;
8815do {
8816   pcre_uint32 d;
8817   int dflags;
8818   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8819             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8820   const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
8821     TRUE);
8822   register pcre_uchar op = *scode;
8823
8824   switch(op)
8825     {
8826     default:
8827     return 0;
8828
8829     case OP_BRA:
8830     case OP_BRAPOS:
8831     case OP_CBRA:
8832     case OP_SCBRA:
8833     case OP_CBRAPOS:
8834     case OP_SCBRAPOS:
8835     case OP_ASSERT:
8836     case OP_ONCE:
8837     case OP_ONCE_NC:
8838     d = find_firstassertedchar(scode, &dflags, op == OP_ASSERT);
8839     if (dflags < 0)
8840       return 0;
8841     if (cflags < 0) { c = d; cflags = dflags; } else if (c != d || cflags != dflags) return 0;
8842     break;
8843
8844     case OP_EXACT:
8845     scode += IMM2_SIZE;
8846     /* Fall through */
8847
8848     case OP_CHAR:
8849     case OP_PLUS:
8850     case OP_MINPLUS:
8851     case OP_POSPLUS:
8852     if (!inassert) return 0;
8853     if (cflags < 0) { c = scode[1]; cflags = 0; }
8854       else if (c != scode[1]) return 0;
8855     break;
8856
8857     case OP_EXACTI:
8858     scode += IMM2_SIZE;
8859     /* Fall through */
8860
8861     case OP_CHARI:
8862     case OP_PLUSI:
8863     case OP_MINPLUSI:
8864     case OP_POSPLUSI:
8865     if (!inassert) return 0;
8866     if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8867       else if (c != scode[1]) return 0;
8868     break;
8869     }
8870
8871   code += GET(code, 1);
8872   }
8873while (*code == OP_ALT);
8874
8875*flags = cflags;
8876return c;
8877}
8878
8879
8880
8881/*************************************************
8882*     Add an entry to the name/number table      *
8883*************************************************/
8884
8885/* This function is called between compiling passes to add an entry to the
8886name/number table, maintaining alphabetical order. Checking for permitted
8887and forbidden duplicates has already been done.
8888
8889Arguments:
8890  cd           the compile data block
8891  name         the name to add
8892  length       the length of the name
8893  groupno      the group number
8894
8895Returns:       nothing
8896*/
8897
8898static void
8899add_name(compile_data *cd, const pcre_uchar *name, int length,
8900  unsigned int groupno)
8901{
8902int i;
8903pcre_uchar *slot = cd->name_table;
8904
8905for (i = 0; i < cd->names_found; i++)
8906  {
8907  int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(length));
8908  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8909    crc = -1; /* Current name is a substring */
8910
8911  /* Make space in the table and break the loop for an earlier name. For a
8912  duplicate or later name, carry on. We do this for duplicates so that in the
8913  simple case (when ?(| is not used) they are in order of their numbers. In all
8914  cases they are in the order in which they appear in the pattern. */
8915
8916  if (crc < 0)
8917    {
8918    memmove(slot + cd->name_entry_size, slot,
8919      IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
8920    break;
8921    }
8922
8923  /* Continue the loop for a later or duplicate name */
8924
8925  slot += cd->name_entry_size;
8926  }
8927
8928PUT2(slot, 0, groupno);
8929memcpy(slot + IMM2_SIZE, name, IN_UCHARS(length));
8930slot[IMM2_SIZE + length] = 0;
8931cd->names_found++;
8932}
8933
8934
8935
8936/*************************************************
8937*        Compile a Regular Expression            *
8938*************************************************/
8939
8940/* This function takes a string and returns a pointer to a block of store
8941holding a compiled version of the expression. The original API for this
8942function had no error code return variable; it is retained for backwards
8943compatibility. The new function is given a new name.
8944
8945Arguments:
8946  pattern       the regular expression
8947  options       various option bits
8948  errorcodeptr  pointer to error code variable (pcre_compile2() only)
8949                  can be NULL if you don't want a code value
8950  errorptr      pointer to pointer to error text
8951  erroroffset   ptr offset in pattern where error was detected
8952  tables        pointer to character tables or NULL
8953
8954Returns:        pointer to compiled data block, or NULL on error,
8955                with errorptr and erroroffset set
8956*/
8957
/* One definition is built, under the name that goes with the selected code
unit width. Each of them simply forwards its arguments to the corresponding
*_compile2() function, passing NULL as the error code pointer. */

#if defined COMPILE_PCRE8
PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char *pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#elif defined COMPILE_PCRE16
PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#elif defined COMPILE_PCRE32
PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
pcre32_compile(PCRE_SPTR32 pattern, int options, const char **errorptr,
  int *erroroffset, const unsigned char *tables)
{
return pcre32_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
}
#endif
8980
8981
8982#if defined COMPILE_PCRE8
8983PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
8984pcre_compile2(const char *pattern, int options, int *errorcodeptr,
8985  const char **errorptr, int *erroroffset, const unsigned char *tables)
8986#elif defined COMPILE_PCRE16
8987PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
8988pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
8989  const char **errorptr, int *erroroffset, const unsigned char *tables)
8990#elif defined COMPILE_PCRE32
8991PCRE_EXP_DEFN pcre32 * PCRE_CALL_CONVENTION
8992pcre32_compile2(PCRE_SPTR32 pattern, int options, int *errorcodeptr,
8993  const char **errorptr, int *erroroffset, const unsigned char *tables)
8994#endif
8995{
8996REAL_PCRE *re;
8997int length = 1;  /* For final END opcode */
8998pcre_int32 firstcharflags, reqcharflags;
8999pcre_uint32 firstchar, reqchar;
9000pcre_uint32 limit_match = PCRE_UINT32_MAX;
9001pcre_uint32 limit_recursion = PCRE_UINT32_MAX;
9002int newline;
9003int errorcode = 0;
9004int skipatstart = 0;
9005BOOL utf;
9006BOOL never_utf = FALSE;
9007size_t size;
9008pcre_uchar *code;
9009const pcre_uchar *codestart;
9010const pcre_uchar *ptr;
9011compile_data compile_block;
9012compile_data *cd = &compile_block;
9013
9014/* This space is used for "compiling" into during the first phase, when we are
9015computing the amount of memory that is needed. Compiled items are thrown away
9016as soon as possible, so that a fairly large buffer should be sufficient for
9017this purpose. The same space is used in the second phase for remembering where
9018to fill in forward references to subpatterns. That may overflow, in which case
9019new memory is obtained from malloc(). */
9020
9021pcre_uchar cworkspace[COMPILE_WORK_SIZE];
9022
9023/* This vector is used for remembering name groups during the pre-compile. In a
9024similar way to cworkspace, it can be expanded using malloc() if necessary. */
9025
9026named_group named_groups[NAMED_GROUP_LIST_SIZE];
9027
9028/* Set this early so that early errors get offset 0. */
9029
9030ptr = (const pcre_uchar *)pattern;
9031
9032/* We can't pass back an error message if errorptr is NULL; I guess the best we
9033can do is just return NULL, but we can set a code value if there is a code
9034pointer. */
9035
9036if (errorptr == NULL)
9037  {
9038  if (errorcodeptr != NULL) *errorcodeptr = 99;
9039  return NULL;
9040  }
9041
9042*errorptr = NULL;
9043if (errorcodeptr != NULL) *errorcodeptr = ERR0;
9044
9045/* However, we can give a message for this error */
9046
9047if (erroroffset == NULL)
9048  {
9049  errorcode = ERR16;
9050  goto PCRE_EARLY_ERROR_RETURN2;
9051  }
9052
9053*erroroffset = 0;
9054
9055/* Set up pointers to the individual character tables */
9056
9057if (tables == NULL) tables = PRIV(default_tables);
9058cd->lcc = tables + lcc_offset;
9059cd->fcc = tables + fcc_offset;
9060cd->cbits = tables + cbits_offset;
9061cd->ctypes = tables + ctypes_offset;
9062
9063/* Check that all undefined public option bits are zero */
9064
9065if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
9066  {
9067  errorcode = ERR17;
9068  goto PCRE_EARLY_ERROR_RETURN;
9069  }
9070
9071/* If PCRE_NEVER_UTF is set, remember it. */
9072
9073if ((options & PCRE_NEVER_UTF) != 0) never_utf = TRUE;
9074
9075/* Check for global one-time settings at the start of the pattern, and remember
9076the offset for later. */
9077
9078cd->external_flags = 0;   /* Initialize here for LIMIT_MATCH/RECURSION */
9079
9080while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
9081       ptr[skipatstart+1] == CHAR_ASTERISK)
9082  {
9083  int newnl = 0;
9084  int newbsr = 0;
9085
9086/* For completeness and backward compatibility, (*UTFn) is supported in the
9087relevant libraries, but (*UTF) is generic and always supported. Note that
9088PCRE_UTF8 == PCRE_UTF16 == PCRE_UTF32. */
9089
9090#ifdef COMPILE_PCRE8
9091  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF8_RIGHTPAR, 5) == 0)
9092    { skipatstart += 7; options |= PCRE_UTF8; continue; }
9093#endif
9094#ifdef COMPILE_PCRE16
9095  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF16_RIGHTPAR, 6) == 0)
9096    { skipatstart += 8; options |= PCRE_UTF16; continue; }
9097#endif
9098#ifdef COMPILE_PCRE32
9099  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF32_RIGHTPAR, 6) == 0)
9100    { skipatstart += 8; options |= PCRE_UTF32; continue; }
9101#endif
9102
9103  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 4) == 0)
9104    { skipatstart += 6; options |= PCRE_UTF8; continue; }
9105  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
9106    { skipatstart += 6; options |= PCRE_UCP; continue; }
9107  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_AUTO_POSSESS_RIGHTPAR, 16) == 0)
9108    { skipatstart += 18; options |= PCRE_NO_AUTO_POSSESS; continue; }
9109  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
9110    { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
9111
9112  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_MATCH_EQ, 12) == 0)
9113    {
9114    pcre_uint32 c = 0;
9115    int p = skipatstart + 14;
9116    while (isdigit(ptr[p]))
9117      {
9118      if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow */
9119      c = c*10 + ptr[p++] - CHAR_0;
9120      }
9121    if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9122    if (c < limit_match)
9123      {
9124      limit_match = c;
9125      cd->external_flags |= PCRE_MLSET;
9126      }
9127    skipatstart = p;
9128    continue;
9129    }
9130
9131  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LIMIT_RECURSION_EQ, 16) == 0)
9132    {
9133    pcre_uint32 c = 0;
9134    int p = skipatstart + 18;
9135    while (isdigit(ptr[p]))
9136      {
9137      if (c > PCRE_UINT32_MAX / 10 - 1) break;   /* Integer overflow check */
9138      c = c*10 + ptr[p++] - CHAR_0;
9139      }
9140    if (ptr[p++] != CHAR_RIGHT_PARENTHESIS) break;
9141    if (c < limit_recursion)
9142      {
9143      limit_recursion = c;
9144      cd->external_flags |= PCRE_RLSET;
9145      }
9146    skipatstart = p;
9147    continue;
9148    }
9149
9150  if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
9151    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
9152  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
9153    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
9154  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
9155    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
9156  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
9157    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
9158  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
9159    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
9160
9161  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
9162    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
9163  else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
9164    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
9165
9166  if (newnl != 0)
9167    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
9168  else if (newbsr != 0)
9169    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
9170  else break;
9171  }
9172
9173/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
9174utf = (options & PCRE_UTF8) != 0;
9175if (utf && never_utf)
9176  {
9177  errorcode = ERR78;
9178  goto PCRE_EARLY_ERROR_RETURN2;
9179  }
9180
9181/* Can't support UTF unless PCRE has been compiled to include the code. The
9182return of an error code from PRIV(valid_utf)() is a new feature, introduced in
9183release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
9184not used here. */
9185
9186#ifdef SUPPORT_UTF
9187if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
9188     (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
9189  {
9190#if defined COMPILE_PCRE8
9191  errorcode = ERR44;
9192#elif defined COMPILE_PCRE16
9193  errorcode = ERR74;
9194#elif defined COMPILE_PCRE32
9195  errorcode = ERR77;
9196#endif
9197  goto PCRE_EARLY_ERROR_RETURN2;
9198  }
9199#else
9200if (utf)
9201  {
9202  errorcode = ERR32;
9203  goto PCRE_EARLY_ERROR_RETURN;
9204  }
9205#endif
9206
9207/* Can't support UCP unless PCRE has been compiled to include the code. */
9208
9209#ifndef SUPPORT_UCP
9210if ((options & PCRE_UCP) != 0)
9211  {
9212  errorcode = ERR67;
9213  goto PCRE_EARLY_ERROR_RETURN;
9214  }
9215#endif
9216
9217/* Check validity of \R options. */
9218
9219if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
9220     (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
9221  {
9222  errorcode = ERR56;
9223  goto PCRE_EARLY_ERROR_RETURN;
9224  }
9225
9226/* Handle different types of newline. The three bits give seven cases. The
9227current code allows for fixed one- or two-byte sequences, plus "any" and
9228"anycrlf". */
9229
9230switch (options & PCRE_NEWLINE_BITS)
9231  {
9232  case 0: newline = NEWLINE; break;   /* Build-time default */
9233  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
9234  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
9235  case PCRE_NEWLINE_CR+
9236       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
9237  case PCRE_NEWLINE_ANY: newline = -1; break;
9238  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
9239  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
9240  }
9241
9242if (newline == -2)
9243  {
9244  cd->nltype = NLTYPE_ANYCRLF;
9245  }
9246else if (newline < 0)
9247  {
9248  cd->nltype = NLTYPE_ANY;
9249  }
9250else
9251  {
9252  cd->nltype = NLTYPE_FIXED;
9253  if (newline > 255)
9254    {
9255    cd->nllen = 2;
9256    cd->nl[0] = (newline >> 8) & 255;
9257    cd->nl[1] = newline & 255;
9258    }
9259  else
9260    {
9261    cd->nllen = 1;
9262    cd->nl[0] = newline;
9263    }
9264  }
9265
9266/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
9267references to help in deciding whether (.*) can be treated as anchored or not.
9268*/
9269
9270cd->top_backref = 0;
9271cd->backref_map = 0;
9272
9273/* Reflect pattern for debugging output */
9274
9275DPRINTF(("------------------------------------------------------------------\n"));
9276#ifdef PCRE_DEBUG
9277print_puchar(stdout, (PCRE_PUCHAR)pattern);
9278#endif
9279DPRINTF(("\n"));
9280
9281/* Pretend to compile the pattern while actually just accumulating the length
9282of memory required. This behaviour is triggered by passing a non-NULL final
9283argument to compile_regex(). We pass a block of workspace (cworkspace) for it
9284to compile parts of the pattern into; the compiled code is discarded when it is
9285no longer needed, so hopefully this workspace will never overflow, though there
9286is a test for its doing so. */
9287
9288cd->bracount = cd->final_bracount = 0;
9289cd->names_found = 0;
9290cd->name_entry_size = 0;
9291cd->name_table = NULL;
9292cd->dupnames = FALSE;
9293cd->dupgroups = FALSE;
9294cd->namedrefcount = 0;
9295cd->start_code = cworkspace;
9296cd->hwm = cworkspace;
9297cd->iscondassert = FALSE;
9298cd->start_workspace = cworkspace;
9299cd->workspace_size = COMPILE_WORK_SIZE;
9300cd->named_groups = named_groups;
9301cd->named_group_list_size = NAMED_GROUP_LIST_SIZE;
9302cd->start_pattern = (const pcre_uchar *)pattern;
9303cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
9304cd->req_varyopt = 0;
9305cd->parens_depth = 0;
9306cd->assert_depth = 0;
9307cd->max_lookbehind = 0;
9308cd->external_options = options;
9309cd->open_caps = NULL;
9310
9311/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
9312don't need to look at the result of the function here. The initial options have
9313been put into the cd block so that they can be changed if an option setting is
9314found within the regex right at the beginning. Bringing initial option settings
9315outside can help speed up starting point checks. */
9316
9317ptr += skipatstart;
9318code = cworkspace;
9319*code = OP_BRA;
9320
9321(void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
9322  FALSE, 0, 0, &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL,
9323  cd, &length);
9324if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
9325
9326DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
9327  (int)(cd->hwm - cworkspace)));
9328
9329if (length > MAX_PATTERN_SIZE)
9330  {
9331  errorcode = ERR20;
9332  goto PCRE_EARLY_ERROR_RETURN;
9333  }
9334
9335/* Compute the size of the data block for storing the compiled pattern. Integer
9336overflow should no longer be possible because nowadays we limit the maximum
9337value of cd->names_found and cd->name_entry_size. */
9338
9339size = sizeof(REAL_PCRE) +
9340  (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
9341
9342/* Get the memory. */
9343
9344re = (REAL_PCRE *)(PUBL(malloc))(size);
9345if (re == NULL)
9346  {
9347  errorcode = ERR21;
9348  goto PCRE_EARLY_ERROR_RETURN;
9349  }
9350
9351/* Put in the magic number, and save the sizes, initial options, internal
9352flags, and character table pointer. NULL is used for the default character
9353tables. The nullpad field is at the end; it's there to help in the case when a
9354regex compiled on a system with 4-byte pointers is run on another with 8-byte
9355pointers. */
9356
9357re->magic_number = MAGIC_NUMBER;
9358re->size = (int)size;
9359re->options = cd->external_options;
9360re->flags = cd->external_flags;
9361re->limit_match = limit_match;
9362re->limit_recursion = limit_recursion;
9363re->first_char = 0;
9364re->req_char = 0;
9365re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
9366re->name_entry_size = cd->name_entry_size;
9367re->name_count = cd->names_found;
9368re->ref_count = 0;
9369re->tables = (tables == PRIV(default_tables))? NULL : tables;
9370re->nullpad = NULL;
9371#ifdef COMPILE_PCRE32
9372re->dummy = 0;
9373#else
9374re->dummy1 = re->dummy2 = re->dummy3 = 0;
9375#endif
9376
9377/* The starting points of the name/number translation table and of the code are
9378passed around in the compile data block. The start/end pattern and initial
9379options are already set from the pre-compile phase, as is the name_entry_size
9380field. Reset the bracket count and the names_found field. Also reset the hwm
9381field; this time it's used for remembering forward references to subpatterns.
9382*/
9383
9384cd->final_bracount = cd->bracount;  /* Save for checking forward references */
9385cd->parens_depth = 0;
9386cd->assert_depth = 0;
9387cd->bracount = 0;
9388cd->max_lookbehind = 0;
9389cd->name_table = (pcre_uchar *)re + re->name_table_offset;
9390codestart = cd->name_table + re->name_entry_size * re->name_count;
9391cd->start_code = codestart;
9392cd->hwm = (pcre_uchar *)(cd->start_workspace);
9393cd->iscondassert = FALSE;
9394cd->req_varyopt = 0;
9395cd->had_accept = FALSE;
9396cd->had_pruneorskip = FALSE;
9397cd->check_lookbehind = FALSE;
9398cd->open_caps = NULL;
9399
9400/* If any named groups were found, create the name/number table from the list
9401created in the first pass. */
9402
9403if (cd->names_found > 0)
9404  {
9405  int i = cd->names_found;
9406  named_group *ng = cd->named_groups;
9407  cd->names_found = 0;
9408  for (; i > 0; i--, ng++)
9409    add_name(cd, ng->name, ng->length, ng->number);
9410  if (cd->named_group_list_size > NAMED_GROUP_LIST_SIZE)
9411    (PUBL(free))((void *)cd->named_groups);
9412  }
9413
9414/* Set up a starting, non-extracting bracket, then compile the expression. On
9415error, errorcode will be set non-zero, so we don't need to look at the result
9416of the function here. */
9417
9418ptr = (const pcre_uchar *)pattern + skipatstart;
9419code = (pcre_uchar *)codestart;
9420*code = OP_BRA;
9421(void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
9422  &firstchar, &firstcharflags, &reqchar, &reqcharflags, NULL, cd, NULL);
9423re->top_bracket = cd->bracount;
9424re->top_backref = cd->top_backref;
9425re->max_lookbehind = cd->max_lookbehind;
9426re->flags = cd->external_flags | PCRE_MODE;
9427
9428if (cd->had_accept)
9429  {
9430  reqchar = 0;              /* Must disable after (*ACCEPT) */
9431  reqcharflags = REQ_NONE;
9432  }
9433
9434/* If not reached end of pattern on success, there's an excess bracket. */
9435
9436if (errorcode == 0 && *ptr != CHAR_NULL) errorcode = ERR22;
9437
9438/* Fill in the terminating state and check for disastrous overflow, but
9439if debugging, leave the test till after things are printed out. */
9440
9441*code++ = OP_END;
9442
9443#ifndef PCRE_DEBUG
9444if (code - codestart > length) errorcode = ERR23;
9445#endif
9446
9447#ifdef SUPPORT_VALGRIND
9448/* If the estimated length exceeds the really used length, mark the extra
9449allocated memory as unaddressable, so that any out-of-bound reads can be
9450detected. */
9451VALGRIND_MAKE_MEM_NOACCESS(code, (length - (code - codestart)) * sizeof(pcre_uchar));
9452#endif
9453
9454/* Fill in any forward references that are required. There may be repeated
9455references; optimize for them, as searching a large regex takes time. */
9456
9457if (cd->hwm > cd->start_workspace)
9458  {
9459  int prev_recno = -1;
9460  const pcre_uchar *groupptr = NULL;
9461  while (errorcode == 0 && cd->hwm > cd->start_workspace)
9462    {
9463    int offset, recno;
9464    cd->hwm -= LINK_SIZE;
9465    offset = GET(cd->hwm, 0);
9466
9467    /* Check that the hwm handling hasn't gone wrong. This whole area is
9468    rewritten in PCRE2 because there are some obscure cases. */
9469
9470    if (offset == 0 || codestart[offset-1] != OP_RECURSE)
9471      {
9472      errorcode = ERR10;
9473      break;
9474      }
9475
9476    recno = GET(codestart, offset);
9477    if (recno != prev_recno)
9478      {
9479      groupptr = PRIV(find_bracket)(codestart, utf, recno);
9480      prev_recno = recno;
9481      }
9482    if (groupptr == NULL) errorcode = ERR53;
9483      else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
9484    }
9485  }
9486
9487/* If the workspace had to be expanded, free the new memory. Set the pointer to
9488NULL to indicate that forward references have been filled in. */
9489
9490if (cd->workspace_size > COMPILE_WORK_SIZE)
9491  (PUBL(free))((void *)cd->start_workspace);
9492cd->start_workspace = NULL;
9493
9494/* Give an error if there's back reference to a non-existent capturing
9495subpattern. */
9496
9497if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
9498
9499/* Unless disabled, check whether any single character iterators can be
9500auto-possessified. The function overwrites the appropriate opcode values, so
9501the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
9502used in this code because at least one compiler gives a warning about loss of
9503"const" attribute if the cast (pcre_uchar *)codestart is used directly in the
9504function call. */
9505
9506if (errorcode == 0 && (options & PCRE_NO_AUTO_POSSESS) == 0)
9507  {
9508  pcre_uchar *temp = (pcre_uchar *)codestart;
9509  auto_possessify(temp, utf, cd);
9510  }
9511
9512/* If there were any lookbehind assertions that contained OP_RECURSE
9513(recursions or subroutine calls), a flag is set for them to be checked here,
9514because they may contain forward references. Actual recursions cannot be fixed
9515length, but subroutine calls can. It is done like this so that those without
9516OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
9517exceptional ones forgo this. We scan the pattern to check that they are fixed
9518length, and set their lengths. */
9519
9520if (errorcode == 0 && cd->check_lookbehind)
9521  {
9522  pcre_uchar *cc = (pcre_uchar *)codestart;
9523
9524  /* Loop, searching for OP_REVERSE items, and process those that do not have
9525  their length set. (Actually, it will also re-process any that have a length
9526  of zero, but that is a pathological case, and it does no harm.) When we find
9527  one, we temporarily terminate the branch it is in while we scan it. */
9528
9529  for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
9530       cc != NULL;
9531       cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
9532    {
9533    if (GET(cc, 1) == 0)
9534      {
9535      int fixed_length;
9536      pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
9537      int end_op = *be;
9538      *be = OP_END;
9539      fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
9540        cd, NULL);
9541      *be = end_op;
9542      DPRINTF(("fixed length = %d\n", fixed_length));
9543      if (fixed_length < 0)
9544        {
9545        errorcode = (fixed_length == -2)? ERR36 :
9546                    (fixed_length == -4)? ERR70 : ERR25;
9547        break;
9548        }
9549      if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
9550      PUT(cc, 1, fixed_length);
9551      }
9552    cc += 1 + LINK_SIZE;
9553    }
9554  }
9555
9556/* Failed to compile, or error while post-processing */
9557
9558if (errorcode != 0)
9559  {
9560  (PUBL(free))(re);
9561  PCRE_EARLY_ERROR_RETURN:
9562  *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
9563  PCRE_EARLY_ERROR_RETURN2:
9564  *errorptr = find_error_text(errorcode);
9565  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
9566  return NULL;
9567  }
9568
9569/* If the anchored option was not passed, set the flag if we can determine that
9570the pattern is anchored by virtue of ^ characters or \A or anything else, such
9571as starting with non-atomic .* when DOTALL is set and there are no occurrences
9572of *PRUNE or *SKIP.
9573
9574Otherwise, if we know what the first byte has to be, save it, because that
9575speeds up unanchored matches no end. If not, see if we can set the
9576PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
9577start with ^, and also when all branches start with non-atomic .* for
9578non-DOTALL matches when *PRUNE and *SKIP are not present. */
9579
9580if ((re->options & PCRE_ANCHORED) == 0)
9581  {
9582  if (is_anchored(codestart, 0, cd, 0)) re->options |= PCRE_ANCHORED;
9583  else
9584    {
9585    if (firstcharflags < 0)
9586      firstchar = find_firstassertedchar(codestart, &firstcharflags, FALSE);
9587    if (firstcharflags >= 0)   /* Remove caseless flag for non-caseable chars */
9588      {
9589#if defined COMPILE_PCRE8
9590      re->first_char = firstchar & 0xff;
9591#elif defined COMPILE_PCRE16
9592      re->first_char = firstchar & 0xffff;
9593#elif defined COMPILE_PCRE32
9594      re->first_char = firstchar;
9595#endif
9596      if ((firstcharflags & REQ_CASELESS) != 0)
9597        {
9598#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9599        /* We ignore non-ASCII first chars in 8 bit mode. */
9600        if (utf)
9601          {
9602          if (re->first_char < 128)
9603            {
9604            if (cd->fcc[re->first_char] != re->first_char)
9605              re->flags |= PCRE_FCH_CASELESS;
9606            }
9607          else if (UCD_OTHERCASE(re->first_char) != re->first_char)
9608            re->flags |= PCRE_FCH_CASELESS;
9609          }
9610        else
9611#endif
9612        if (MAX_255(re->first_char)
9613            && cd->fcc[re->first_char] != re->first_char)
9614          re->flags |= PCRE_FCH_CASELESS;
9615        }
9616
9617      re->flags |= PCRE_FIRSTSET;
9618      }
9619
9620    else if (is_startline(codestart, 0, cd, 0)) re->flags |= PCRE_STARTLINE;
9621    }
9622  }
9623
9624/* For an anchored pattern, we use the "required byte" only if it follows a
9625variable length item in the regex. Remove the caseless flag for non-caseable
9626bytes. */
9627
9628if (reqcharflags >= 0 &&
9629     ((re->options & PCRE_ANCHORED) == 0 || (reqcharflags & REQ_VARY) != 0))
9630  {
9631#if defined COMPILE_PCRE8
9632  re->req_char = reqchar & 0xff;
9633#elif defined COMPILE_PCRE16
9634  re->req_char = reqchar & 0xffff;
9635#elif defined COMPILE_PCRE32
9636  re->req_char = reqchar;
9637#endif
9638  if ((reqcharflags & REQ_CASELESS) != 0)
9639    {
9640#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
9641    /* We ignore non-ASCII required chars in 8 bit mode. */
9642    if (utf)
9643      {
9644      if (re->req_char < 128)
9645        {
9646        if (cd->fcc[re->req_char] != re->req_char)
9647          re->flags |= PCRE_RCH_CASELESS;
9648        }
9649      else if (UCD_OTHERCASE(re->req_char) != re->req_char)
9650        re->flags |= PCRE_RCH_CASELESS;
9651      }
9652    else
9653#endif
9654    if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
9655      re->flags |= PCRE_RCH_CASELESS;
9656    }
9657
9658  re->flags |= PCRE_REQCHSET;
9659  }
9660
9661/* Print out the compiled data if debugging is enabled. This is never the
9662case when building a production library. */
9663
9664#ifdef PCRE_DEBUG
9665printf("Length = %d top_bracket = %d top_backref = %d\n",
9666  length, re->top_bracket, re->top_backref);
9667
9668printf("Options=%08x\n", re->options);
9669
9670if ((re->flags & PCRE_FIRSTSET) != 0)
9671  {
9672  pcre_uchar ch = re->first_char;
9673  const char *caseless =
9674    ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
9675  if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
9676    else printf("First char = \\x%02x%s\n", ch, caseless);
9677  }
9678
9679if ((re->flags & PCRE_REQCHSET) != 0)
9680  {
9681  pcre_uchar ch = re->req_char;
9682  const char *caseless =
9683    ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
9684  if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
9685    else printf("Req char = \\x%02x%s\n", ch, caseless);
9686  }
9687
9688#if defined COMPILE_PCRE8
9689pcre_printint((pcre *)re, stdout, TRUE);
9690#elif defined COMPILE_PCRE16
9691pcre16_printint((pcre *)re, stdout, TRUE);
9692#elif defined COMPILE_PCRE32
9693pcre32_printint((pcre *)re, stdout, TRUE);
9694#endif
9695
9696/* This check is done here in the debugging case so that the code that
9697was compiled can be seen. */
9698
9699if (code - codestart > length)
9700  {
9701  (PUBL(free))(re);
9702  *errorptr = find_error_text(ERR23);
9703  *erroroffset = ptr - (pcre_uchar *)pattern;
9704  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
9705  return NULL;
9706  }
9707#endif   /* PCRE_DEBUG */
9708
9709/* Check for a pattern that can match an empty string, so that this information
9710can be provided to applications. */
9711
9712do
9713  {
9714  if (could_be_empty_branch(codestart, code, utf, cd, NULL))
9715    {
9716    re->flags |= PCRE_MATCH_EMPTY;
9717    break;
9718    }
9719  codestart += GET(codestart, 1);
9720  }
9721while (*codestart == OP_ALT);
9722
9723#if defined COMPILE_PCRE8
9724return (pcre *)re;
9725#elif defined COMPILE_PCRE16
9726return (pcre16 *)re;
9727#elif defined COMPILE_PCRE32
9728return (pcre32 *)re;
9729#endif
9730}
9731
9732/* End of pcre_compile.c */
9733