1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9           Copyright (c) 1997-2010 University of Cambridge
10
11-----------------------------------------------------------------------------
12Redistribution and use in source and binary forms, with or without
13modification, are permitted provided that the following conditions are met:
14
15    * Redistributions of source code must retain the above copyright notice,
16      this list of conditions and the following disclaimer.
17
18    * Redistributions in binary form must reproduce the above copyright
19      notice, this list of conditions and the following disclaimer in the
20      documentation and/or other materials provided with the distribution.
21
22    * Neither the name of the University of Cambridge nor the names of its
23      contributors may be used to endorse or promote products derived from
24      this software without specific prior written permission.
25
26THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36POSSIBILITY OF SUCH DAMAGE.
37-----------------------------------------------------------------------------
38*/
39
40
41/* This module contains the external function pcre_compile(), along with
42supporting internal functions that are not used by other modules. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49#define NLBLOCK cd             /* Block containing newline information */
50#define PSSTART start_pattern  /* Field containing processed string start */
51#define PSEND   end_pattern    /* Field containing processed string end */
52
53#include "pcre_internal.h"
54
55
56/* When PCRE_DEBUG is defined, we need the pcre_printint() function, which is
57also used by pcretest. PCRE_DEBUG is not defined when building a production
58library. */
59
60#ifdef PCRE_DEBUG
61#include "pcre_printint.src"
62#endif
63
64
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte array a. Each use of an argument is parenthesized, as is the whole
expansion, so that an expression argument such as "c + 8" selects the intended
byte and bit. The argument b is evaluated twice, so it must not have side
effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
68
/* Upper bound used when checking that the integer holding the compiled pattern
length has not overflowed. It is set a little below INT_MAX so that the bytes
which terminate a group can be added in without a separate check each time. */

#define OFLOW_MAX (INT_MAX - 20)
75
76
77/*************************************************
78*      Code parameters and static tables         *
79*************************************************/
80
/* COMPILE_WORK_SIZE is the size of the stack workspace used by the first,
pre-compile phase, whose job is to determine how much memory is required. The
regex is partly compiled into this space, and the compiled parts are discarded
as soon as they can be, so an overrun is not expected, although the code does
check for one. The largest amount seen in use was 218, so this value is very
generous.

The second, actual compile phase reuses the same workspace to remember forward
references to groups, so that they can be filled in at the end. Each entry in
that list occupies LINK_SIZE bytes, which leaves plenty of room even when
LINK_SIZE is 4. */

#define COMPILE_WORK_SIZE (4096)

/* Overrun tests compare against this slightly smaller size, so that an overrun
is noticed before anything is written beyond the end of the data block. */

#define WORK_SIZE_CHECK (COMPILE_WORK_SIZE - 100)
99
100
101/* Table for handling escaped characters in the range '0'-'z'. Positive returns
102are simple data values; negative values are for special things like \d and so
103on. Zero means further processing is needed (for things like \x), or the escape
104is invalid. */
105
106#ifndef EBCDIC
107
108/* This is the "normal" table for ASCII systems or for EBCDIC systems running
109in UTF-8 mode. */
110
111static const short int escapes[] = {
112     0,                       0,
113     0,                       0,
114     0,                       0,
115     0,                       0,
116     0,                       0,
117     CHAR_COLON,              CHAR_SEMICOLON,
118     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
119     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
120     CHAR_COMMERCIAL_AT,      -ESC_A,
121     -ESC_B,                  -ESC_C,
122     -ESC_D,                  -ESC_E,
123     0,                       -ESC_G,
124     -ESC_H,                  0,
125     0,                       -ESC_K,
126     0,                       0,
127     -ESC_N,                  0,
128     -ESC_P,                  -ESC_Q,
129     -ESC_R,                  -ESC_S,
130     0,                       0,
131     -ESC_V,                  -ESC_W,
132     -ESC_X,                  0,
133     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
134     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
135     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
136     CHAR_GRAVE_ACCENT,       7,
137     -ESC_b,                  0,
138     -ESC_d,                  ESC_e,
139     ESC_f,                   0,
140     -ESC_h,                  0,
141     0,                       -ESC_k,
142     0,                       0,
143     ESC_n,                   0,
144     -ESC_p,                  0,
145     ESC_r,                   -ESC_s,
146     ESC_tee,                 0,
147     -ESC_v,                  -ESC_w,
148     0,                       0,
149     -ESC_z
150};
151
152#else
153
154/* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
155
156static const short int escapes[] = {
157/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
158/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
159/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
160/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
161/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
162/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
163/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
164/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
165/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
166/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
167/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
168/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
169/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
170/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
171/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
172/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
173/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
174/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
175/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
176/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
177/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
178/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
179/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
180};
181#endif
182
183
184/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
185searched linearly. Put all the names into a single string, in order to reduce
186the number of relocations when a shared library is dynamically linked. The
187string is built from string macros so that it works in UTF-8 mode on EBCDIC
188platforms. */
189
/* One entry of the verb table. The name itself is not stored here; only its
length is, and the text lives in the verbnames string. */

typedef struct verbitem {
  int   len;      /* Number of characters in the verb name */
  int   op;       /* Opcode used when the verb has no argument; -1 means an
                     argument is mandatory */
  int   op_arg;   /* Opcode used when the verb has an argument; -1 means an
                     argument is not allowed */
} verbitem;
195
196static const char verbnames[] =
197  "\0"                       /* Empty name is a shorthand for MARK */
198  STRING_MARK0
199  STRING_ACCEPT0
200  STRING_COMMIT0
201  STRING_F0
202  STRING_FAIL0
203  STRING_PRUNE0
204  STRING_SKIP0
205  STRING_THEN;
206
207static const verbitem verbs[] = {
208  { 0, -1,        OP_MARK },
209  { 4, -1,        OP_MARK },
210  { 6, OP_ACCEPT, -1 },
211  { 6, OP_COMMIT, -1 },
212  { 1, OP_FAIL,   -1 },
213  { 4, OP_FAIL,   -1 },
214  { 5, OP_PRUNE,  OP_PRUNE_ARG },
215  { 4, OP_SKIP,   OP_SKIP_ARG  },
216  { 4, OP_THEN,   OP_THEN_ARG  }
217};
218
219static const int verbcount = sizeof(verbs)/sizeof(verbitem);
220
221
222/* Tables of names of POSIX character classes and their lengths. The names are
223now all in a single string, to reduce the number of relocations when a shared
224library is dynamically loaded. The list of lengths is terminated by a zero
225length entry. The first three must be alpha, lower, upper, as this is assumed
226for handling case independence. */
227
228static const char posix_names[] =
229  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
230  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
231  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
232  STRING_word0  STRING_xdigit;
233
234static const uschar posix_name_lengths[] = {
235  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
236
237/* Table of class bit maps for each POSIX class. Each class is formed from a
238base map, with an optional addition or removal of another map. Then, for some
239classes, there is some additional tweaking: for [:blank:] the vertical space
240characters are removed, and for [:alpha:] and [:alnum:] the underscore
241character is removed. The triples in the table consist of the base map offset,
242second map offset or -1 if no second map, and a non-negative value for map
243addition or a negative value for map subtraction (if there are two maps). The
244absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
245remove vertical space characters, 2 => remove underscore. */
246
247static const int posix_class_maps[] = {
248  cbit_word,  cbit_digit, -2,             /* alpha */
249  cbit_lower, -1,          0,             /* lower */
250  cbit_upper, -1,          0,             /* upper */
251  cbit_word,  -1,          2,             /* alnum - word without underscore */
252  cbit_print, cbit_cntrl,  0,             /* ascii */
253  cbit_space, -1,          1,             /* blank - a GNU extension */
254  cbit_cntrl, -1,          0,             /* cntrl */
255  cbit_digit, -1,          0,             /* digit */
256  cbit_graph, -1,          0,             /* graph */
257  cbit_print, -1,          0,             /* print */
258  cbit_punct, -1,          0,             /* punct */
259  cbit_space, -1,          0,             /* space */
260  cbit_word,  -1,          0,             /* word - a Perl extension */
261  cbit_xdigit,-1,          0              /* xdigit */
262};
263
264/* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
265substitutes must be in the order of the names, defined above, and there are
266both positive and negative cases. NULL means no substitute. */
267
268#ifdef SUPPORT_UCP
269static const uschar *substitutes[] = {
270  (uschar *)"\\P{Nd}",    /* \D */
271  (uschar *)"\\p{Nd}",    /* \d */
272  (uschar *)"\\P{Xsp}",   /* \S */       /* NOTE: Xsp is Perl space */
273  (uschar *)"\\p{Xsp}",   /* \s */
274  (uschar *)"\\P{Xwd}",   /* \W */
275  (uschar *)"\\p{Xwd}"    /* \w */
276};
277
278static const uschar *posix_substitutes[] = {
279  (uschar *)"\\p{L}",     /* alpha */
280  (uschar *)"\\p{Ll}",    /* lower */
281  (uschar *)"\\p{Lu}",    /* upper */
282  (uschar *)"\\p{Xan}",   /* alnum */
283  NULL,                   /* ascii */
284  (uschar *)"\\h",        /* blank */
285  NULL,                   /* cntrl */
286  (uschar *)"\\p{Nd}",    /* digit */
287  NULL,                   /* graph */
288  NULL,                   /* print */
289  NULL,                   /* punct */
290  (uschar *)"\\p{Xps}",   /* space */    /* NOTE: Xps is POSIX space */
291  (uschar *)"\\p{Xwd}",   /* word */
292  NULL,                   /* xdigit */
293  /* Negated cases */
294  (uschar *)"\\P{L}",     /* ^alpha */
295  (uschar *)"\\P{Ll}",    /* ^lower */
296  (uschar *)"\\P{Lu}",    /* ^upper */
297  (uschar *)"\\P{Xan}",   /* ^alnum */
298  NULL,                   /* ^ascii */
299  (uschar *)"\\H",        /* ^blank */
300  NULL,                   /* ^cntrl */
301  (uschar *)"\\P{Nd}",    /* ^digit */
302  NULL,                   /* ^graph */
303  NULL,                   /* ^print */
304  NULL,                   /* ^punct */
305  (uschar *)"\\P{Xps}",   /* ^space */   /* NOTE: Xps is POSIX space */
306  (uschar *)"\\P{Xwd}",   /* ^word */
307  NULL                    /* ^xdigit */
308};
309#define POSIX_SUBSIZE (sizeof(posix_substitutes)/sizeof(uschar *))
310#endif
311
/* Two-step stringification. STRING() turns its argument into a string literal
exactly as written; XSTRING() lets its argument be macro-expanded first and
then stringifies the result. */

#define STRING(a)  #a
#define XSTRING(s) STRING(s)
314
315/* The texts of compile-time error messages. These are "char *" because they
316are passed to the outside world. Do not ever re-use any error number, because
317they are documented. Always add a new error instead. Messages marked DEAD below
318are no longer used. This used to be a table of strings, but in order to reduce
319the number of relocations needed when a shared library is loaded dynamically,
320it is now one long string. We cannot use a table of offsets, because the
321lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
322simply count through to the one we want - this isn't a performance issue
323because these strings are used only when there is a compilation error.
324
325Each substring ends with \0 to insert a null character. This includes the final
326substring, so that the whole string ends with \0\0, which can be detected when
327counting through. */
328
329static const char error_texts[] =
330  "no error\0"
331  "\\ at end of pattern\0"
332  "\\c at end of pattern\0"
333  "unrecognized character follows \\\0"
334  "numbers out of order in {} quantifier\0"
335  /* 5 */
336  "number too big in {} quantifier\0"
337  "missing terminating ] for character class\0"
338  "invalid escape sequence in character class\0"
339  "range out of order in character class\0"
340  "nothing to repeat\0"
341  /* 10 */
342  "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
343  "internal error: unexpected repeat\0"
344  "unrecognized character after (? or (?-\0"
345  "POSIX named classes are supported only within a class\0"
346  "missing )\0"
347  /* 15 */
348  "reference to non-existent subpattern\0"
349  "erroffset passed as NULL\0"
350  "unknown option bit(s) set\0"
351  "missing ) after comment\0"
352  "parentheses nested too deeply\0"  /** DEAD **/
353  /* 20 */
354  "regular expression is too large\0"
355  "failed to get memory\0"
356  "unmatched parentheses\0"
357  "internal error: code overflow\0"
358  "unrecognized character after (?<\0"
359  /* 25 */
360  "lookbehind assertion is not fixed length\0"
361  "malformed number or name after (?(\0"
362  "conditional group contains more than two branches\0"
363  "assertion expected after (?(\0"
364  "(?R or (?[+-]digits must be followed by )\0"
365  /* 30 */
366  "unknown POSIX class name\0"
367  "POSIX collating elements are not supported\0"
368  "this version of PCRE is not compiled with PCRE_UTF8 support\0"
369  "spare error\0"  /** DEAD **/
370  "character value in \\x{...} sequence is too large\0"
371  /* 35 */
372  "invalid condition (?(0)\0"
373  "\\C not allowed in lookbehind assertion\0"
374  "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
375  "number after (?C is > 255\0"
376  "closing ) for (?C expected\0"
377  /* 40 */
378  "recursive call could loop indefinitely\0"
379  "unrecognized character after (?P\0"
380  "syntax error in subpattern name (missing terminator)\0"
381  "two named subpatterns have the same name\0"
382  "invalid UTF-8 string\0"
383  /* 45 */
384  "support for \\P, \\p, and \\X has not been compiled\0"
385  "malformed \\P or \\p sequence\0"
386  "unknown property name after \\P or \\p\0"
387  "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
388  "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
389  /* 50 */
390  "repeated subpattern is too long\0"    /** DEAD **/
391  "octal value is greater than \\377 (not in UTF-8 mode)\0"
392  "internal error: overran compiling workspace\0"
393  "internal error: previously-checked referenced subpattern not found\0"
394  "DEFINE group contains more than one branch\0"
395  /* 55 */
396  "repeating a DEFINE group is not allowed\0"
397  "inconsistent NEWLINE options\0"
398  "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
399  "a numbered reference must not be zero\0"
400  "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
401  /* 60 */
402  "(*VERB) not recognized\0"
403  "number is too big\0"
404  "subpattern name expected\0"
405  "digit expected after (?+\0"
406  "] is an invalid data character in JavaScript compatibility mode\0"
407  /* 65 */
408  "different names for subpatterns of the same number are not allowed\0"
409  "(*MARK) must have an argument\0"
410  "this version of PCRE is not compiled with PCRE_UCP support\0"
411  "\\c must be followed by an ASCII character\0"
412  ;
413
414/* Table to identify digits and hex digits. This is used when compiling
415patterns. Note that the tables in chartables are dependent on the locale, and
416may mark arbitrary characters as digits - but the PCRE compiling code expects
417to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
418a private table here. It costs 256 bytes, but it is a lot faster than doing
419character value tests (at least in some simple cases I timed), and in some
420applications one wants PCRE to compile efficiently as well as match
421efficiently.
422
423For convenience, we use the same bit definitions as in chartables:
424
425  0x04   decimal digit
426  0x08   hexadecimal digit
427
428Then we can use ctype_digit and ctype_xdigit in the code. */
429
430#ifndef EBCDIC
431
432/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
433UTF-8 mode. */
434
static const unsigned char digitab[] =
  {
  /* Each row is labelled with the character code of its first entry. Only
  the entries for 0-9 (0x0c) and for A-F and a-f (0x08) are non-zero. */
  /* 0x00 */ 0,0,0,0,0,0,0,0,
  /* 0x08 */ 0,0,0,0,0,0,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0,
  /* 0x18 */ 0,0,0,0,0,0,0,0,
  /* 0x20 */ 0,0,0,0,0,0,0,0,                          /* space - ' */
  /* 0x28 */ 0,0,0,0,0,0,0,0,                          /* ( - / */
  /* 0x30 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,  /* 0 - 7 */
  /* 0x38 */ 0x0c,0x0c,0,0,0,0,0,0,                    /* 8 - ? */
  /* 0x40 */ 0,0x08,0x08,0x08,0x08,0x08,0x08,0,        /* @ - G */
  /* 0x48 */ 0,0,0,0,0,0,0,0,                          /* H - O */
  /* 0x50 */ 0,0,0,0,0,0,0,0,                          /* P - W */
  /* 0x58 */ 0,0,0,0,0,0,0,0,                          /* X - _ */
  /* 0x60 */ 0,0x08,0x08,0x08,0x08,0x08,0x08,0,        /* ` - g */
  /* 0x68 */ 0,0,0,0,0,0,0,0,                          /* h - o */
  /* 0x70 */ 0,0,0,0,0,0,0,0,                          /* p - w */
  /* 0x78 */ 0,0,0,0,0,0,0,0,                          /* x - 127 */
  /* 0x80 */ 0,0,0,0,0,0,0,0,
  /* 0x88 */ 0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0,
  /* 0x98 */ 0,0,0,0,0,0,0,0,
  /* 0xA0 */ 0,0,0,0,0,0,0,0,
  /* 0xA8 */ 0,0,0,0,0,0,0,0,
  /* 0xB0 */ 0,0,0,0,0,0,0,0,
  /* 0xB8 */ 0,0,0,0,0,0,0,0,
  /* 0xC0 */ 0,0,0,0,0,0,0,0,
  /* 0xC8 */ 0,0,0,0,0,0,0,0,
  /* 0xD0 */ 0,0,0,0,0,0,0,0,
  /* 0xD8 */ 0,0,0,0,0,0,0,0,
  /* 0xE0 */ 0,0,0,0,0,0,0,0,
  /* 0xE8 */ 0,0,0,0,0,0,0,0,
  /* 0xF0 */ 0,0,0,0,0,0,0,0,
  /* 0xF8 */ 0,0,0,0,0,0,0,0};
469
470#else
471
472/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
473
static const unsigned char digitab[] =
  {
  /* Each row is labelled with the EBCDIC code of its first entry. Only the
  entries for a-f and A-F (0x08) and for 0-9 (0x0c) are non-zero. */
  /* 0x00 */ 0,0,0,0,0,0,0,0,
  /* 0x08 */ 0,0,0,0,0,0,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0,
  /* 0x18 */ 0,0,0,0,0,0,0,0,
  /* 0x20 */ 0,0,0,0,0,0,0,0,
  /* 0x28 */ 0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0,
  /* 0x38 */ 0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0,
  /* 0x48 */ 0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0,
  /* 0x58 */ 0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0,
  /* 0x68 */ 0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0,
  /* 0x78 */ 0,0,0,0,0,0,0,0,
  /* 0x80 */ 0,0x08,0x08,0x08,0x08,0x08,0x08,0,        /* 128 - g */
  /* 0x88 */ 0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0,
  /* 0x98 */ 0,0,0,0,0,0,0,0,
  /* 0xA0 */ 0,0,0,0,0,0,0,0,
  /* 0xA8 */ 0,0,0,0,0,0,0,0,
  /* 0xB0 */ 0,0,0,0,0,0,0,0,
  /* 0xB8 */ 0,0,0,0,0,0,0,0,
  /* 0xC0 */ 0,0x08,0x08,0x08,0x08,0x08,0x08,0,        /* { - G */
  /* 0xC8 */ 0,0,0,0,0,0,0,0,
  /* 0xD0 */ 0,0,0,0,0,0,0,0,
  /* 0xD8 */ 0,0,0,0,0,0,0,0,
  /* 0xE0 */ 0,0,0,0,0,0,0,0,
  /* 0xE8 */ 0,0,0,0,0,0,0,0,
  /* 0xF0 */ 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,  /* 0 - 7 */
  /* 0xF8 */ 0x0c,0x0c,0,0,0,0,0,0};                   /* 8 - 255 */
508
/* Partial duplicate of the character table, for EBCDIC. Each row is labelled
with the EBCDIC code of its first entry. check_escape() tests entries against
the mask 0x0E to decide whether a character is alphanumeric. */

static const unsigned char ebcdic_chartab[] = {
  /* 0x00 */ 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x08 */ 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
  /* 0x10 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x18 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x20 */ 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
  /* 0x28 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x30 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x38 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x40 */ 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x48 */ 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,
  /* 0x50 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x58 */ 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,
  /* 0x60 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x68 */ 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,
  /* 0x70 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x78 */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0x80 */ 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,  /* 128 - g */
  /* 0x88 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /* h - 143 */
  /* 0x90 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,  /* 144 - p */
  /* 0x98 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /* q - 159 */
  /* 0xA0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,  /* 160 - x */
  /* 0xA8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /* y - 175 */
  /* 0xB0 */ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
  /* 0xB8 */ 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,
  /* 0xC0 */ 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,  /* { - G */
  /* 0xC8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /* H - 207 */
  /* 0xD0 */ 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,  /* } - P */
  /* 0xD8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /* Q - 223 */
  /* 0xE0 */ 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,  /* backslash - X */
  /* 0xE8 */ 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,  /* Y - 239 */
  /* 0xF0 */ 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,  /* 0 - 7 */
  /* 0xF8 */ 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00}; /* 8 - 255 */
542#endif
543
544
/* Forward declaration, to allow mutual recursion */
546
547static BOOL
548  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
549    int *, int *, branch_chain *, compile_data *, int *);
550
551
552
553/*************************************************
554*            Find an error text                  *
555*************************************************/
556
557/* The error texts are now all in one long string, to save on relocations. As
558some of the text is of unknown length, we can't use a table of offsets.
559Instead, just count through the strings. This is not a performance issue
560because it happens only when there has been a compilation error.
561
562Argument:   the error number
563Returns:    pointer to the error string
564*/
565
566static const char *
567find_error_text(int n)
568{
569const char *s = error_texts;
570for (; n > 0; n--)
571  {
572  while (*s++ != 0) {};
573  if (*s == 0) return "Error text not found (please report)";
574  }
575return s;
576}
577
578
579/*************************************************
580*            Handle escapes                      *
581*************************************************/
582
583/* This function is called when a \ has been encountered. It either returns a
584positive value for a simple escape such as \n, or a negative value which
585encodes one of the more complicated things such as \d. A backreference to group
586n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
587UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
588ptr is pointing at the \. On exit, it is on the final character of the escape
589sequence.
590
591Arguments:
592  ptrptr         points to the pattern position pointer
593  errorcodeptr   points to the errorcode variable
594  bracount       number of previous extracting brackets
595  options        the options bits
596  isclass        TRUE if inside a character class
597
598Returns:         zero or positive => a data character
599                 negative => a special escape sequence
600                 on error, errorcodeptr is set
601*/
602
603static int
604check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
605  int options, BOOL isclass)
606{
607BOOL utf8 = (options & PCRE_UTF8) != 0;
608const uschar *ptr = *ptrptr + 1;
609int c, i;
610
611GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
612ptr--;                            /* Set pointer back to the last byte */
613
614/* If backslash is at the end of the pattern, it's an error. */
615
616if (c == 0) *errorcodeptr = ERR1;
617
618/* Non-alphanumerics are literals. For digits or letters, do an initial lookup
619in a table. A non-zero result is something that can be returned immediately.
620Otherwise further processing may be required. */
621
622#ifndef EBCDIC  /* ASCII/UTF-8 coding */
623else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
624else if ((i = escapes[c - CHAR_0]) != 0) c = i;
625
626#else           /* EBCDIC coding */
627else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
628else if ((i = escapes[c - 0x48]) != 0)  c = i;
629#endif
630
631/* Escapes that need further processing, or are illegal. */
632
633else
634  {
635  const uschar *oldptr;
636  BOOL braced, negated;
637
638  switch (c)
639    {
640    /* A number of Perl escapes are not handled by PCRE. We give an explicit
641    error. */
642
643    case CHAR_l:
644    case CHAR_L:
645    case CHAR_u:
646    case CHAR_U:
647    *errorcodeptr = ERR37;
648    break;
649
650    /* \g must be followed by one of a number of specific things:
651
652    (1) A number, either plain or braced. If positive, it is an absolute
653    backreference. If negative, it is a relative backreference. This is a Perl
654    5.10 feature.
655
656    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
657    is part of Perl's movement towards a unified syntax for back references. As
658    this is synonymous with \k{name}, we fudge it up by pretending it really
659    was \k.
660
661    (3) For Oniguruma compatibility we also support \g followed by a name or a
662    number either in angle brackets or in single quotes. However, these are
663    (possibly recursive) subroutine calls, _not_ backreferences. Just return
664    the -ESC_g code (cf \k). */
665
666    case CHAR_g:
667    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
668      {
669      c = -ESC_g;
670      break;
671      }
672
673    /* Handle the Perl-compatible cases */
674
675    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
676      {
677      const uschar *p;
678      for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
679        if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
680      if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
681        {
682        c = -ESC_k;
683        break;
684        }
685      braced = TRUE;
686      ptr++;
687      }
688    else braced = FALSE;
689
690    if (ptr[1] == CHAR_MINUS)
691      {
692      negated = TRUE;
693      ptr++;
694      }
695    else negated = FALSE;
696
697    c = 0;
698    while ((digitab[ptr[1]] & ctype_digit) != 0)
699      c = c * 10 + *(++ptr) - CHAR_0;
700
701    if (c < 0)   /* Integer overflow */
702      {
703      *errorcodeptr = ERR61;
704      break;
705      }
706
707    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
708      {
709      *errorcodeptr = ERR57;
710      break;
711      }
712
713    if (c == 0)
714      {
715      *errorcodeptr = ERR58;
716      break;
717      }
718
719    if (negated)
720      {
721      if (c > bracount)
722        {
723        *errorcodeptr = ERR15;
724        break;
725        }
726      c = bracount - (c - 1);
727      }
728
729    c = -(ESC_REF + c);
730    break;
731
732    /* The handling of escape sequences consisting of a string of digits
733    starting with one that is not zero is not straightforward. By experiment,
734    the way Perl works seems to be as follows:
735
736    Outside a character class, the digits are read as a decimal number. If the
737    number is less than 10, or if there are that many previous extracting
738    left brackets, then it is a back reference. Otherwise, up to three octal
739    digits are read to form an escaped byte. Thus \123 is likely to be octal
740    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
741    value is greater than 377, the least significant 8 bits are taken. Inside a
742    character class, \ followed by a digit is always an octal number. */
743
744    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
745    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
746
747    if (!isclass)
748      {
749      oldptr = ptr;
750      c -= CHAR_0;
751      while ((digitab[ptr[1]] & ctype_digit) != 0)
752        c = c * 10 + *(++ptr) - CHAR_0;
753      if (c < 0)    /* Integer overflow */
754        {
755        *errorcodeptr = ERR61;
756        break;
757        }
758      if (c < 10 || c <= bracount)
759        {
760        c = -(ESC_REF + c);
761        break;
762        }
763      ptr = oldptr;      /* Put the pointer back and fall through */
764      }
765
766    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
767    generates a binary zero byte and treats the digit as a following literal.
768    Thus we have to pull back the pointer by one. */
769
770    if ((c = *ptr) >= CHAR_8)
771      {
772      ptr--;
773      c = 0;
774      break;
775      }
776
777    /* \0 always starts an octal number, but we may drop through to here with a
778    larger first octal digit. The original code used just to take the least
779    significant 8 bits of octal numbers (I think this is what early Perls used
780    to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
781    than 3 octal digits. */
782
783    case CHAR_0:
784    c -= CHAR_0;
785    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
786        c = c * 8 + *(++ptr) - CHAR_0;
787    if (!utf8 && c > 255) *errorcodeptr = ERR51;
788    break;
789
790    /* \x is complicated. \x{ddd} is a character number which can be greater
791    than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
792    treated as a data character. */
793
794    case CHAR_x:
795    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
796      {
797      const uschar *pt = ptr + 2;
798      int count = 0;
799
800      c = 0;
801      while ((digitab[*pt] & ctype_xdigit) != 0)
802        {
803        register int cc = *pt++;
804        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
805        count++;
806
807#ifndef EBCDIC  /* ASCII/UTF-8 coding */
808        if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
809        c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
810#else           /* EBCDIC coding */
811        if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
812        c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
813#endif
814        }
815
816      if (*pt == CHAR_RIGHT_CURLY_BRACKET)
817        {
818        if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
819        ptr = pt;
820        break;
821        }
822
823      /* If the sequence of hex digits does not end with '}', then we don't
824      recognize this construct; fall through to the normal \x handling. */
825      }
826
827    /* Read just a single-byte hex-defined char */
828
829    c = 0;
830    while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
831      {
832      int cc;                                  /* Some compilers don't like */
833      cc = *(++ptr);                           /* ++ in initializers */
834#ifndef EBCDIC  /* ASCII/UTF-8 coding */
835      if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
836      c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
837#else           /* EBCDIC coding */
838      if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
839      c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
840#endif
841      }
842    break;
843
844    /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
845    An error is given if the byte following \c is not an ASCII character. This
846    coding is ASCII-specific, but then the whole concept of \cx is
847    ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
848
849    case CHAR_c:
850    c = *(++ptr);
851    if (c == 0)
852      {
853      *errorcodeptr = ERR2;
854      break;
855      }
856#ifndef EBCDIC    /* ASCII/UTF-8 coding */
857    if (c > 127)  /* Excludes all non-ASCII in either mode */
858      {
859      *errorcodeptr = ERR68;
860      break;
861      }
862    if (c >= CHAR_a && c <= CHAR_z) c -= 32;
863    c ^= 0x40;
864#else             /* EBCDIC coding */
865    if (c >= CHAR_a && c <= CHAR_z) c += 64;
866    c ^= 0xC0;
867#endif
868    break;
869
870    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
871    other alphanumeric following \ is an error if PCRE_EXTRA was set;
872    otherwise, for Perl compatibility, it is a literal. This code looks a bit
873    odd, but there used to be some cases other than the default, and there may
874    be again in future, so I haven't "optimized" it. */
875
876    default:
877    if ((options & PCRE_EXTRA) != 0) switch(c)
878      {
879      default:
880      *errorcodeptr = ERR3;
881      break;
882      }
883    break;
884    }
885  }
886
887/* Perl supports \N{name} for character names, as well as plain \N for "not
888newline". PCRE does not support \N{name}. */
889
890if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET)
891  *errorcodeptr = ERR37;
892
893/* If PCRE_UCP is set, we change the values for \d etc. */
894
895if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
896  c -= (ESC_DU - ESC_D);
897
898/* Set the pointer to the final character before returning. */
899
900*ptrptr = ptr;
901return c;
902}
903
904
905
906#ifdef SUPPORT_UCP
907/*************************************************
908*               Handle \P and \p                 *
909*************************************************/
910
911/* This function is called after \P or \p has been encountered, provided that
912PCRE is compiled with support for Unicode properties. On entry, ptrptr is
913pointing at the P or p. On exit, it is pointing at the final character of the
914escape sequence.
915
916Argument:
917  ptrptr         points to the pattern position pointer
918  negptr         points to a boolean that is set TRUE for negation else FALSE
919  dptr           points to an int that is set to the detailed property value
920  errorcodeptr   points to the error code variable
921
922Returns:         type value from ucp_type_table, or -1 for an invalid type
923*/
924
925static int
926get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
927{
928int c, i, bot, top;
929const uschar *ptr = *ptrptr;
930char name[32];
931
932c = *(++ptr);
933if (c == 0) goto ERROR_RETURN;
934
935*negptr = FALSE;
936
937/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
938negation. */
939
940if (c == CHAR_LEFT_CURLY_BRACKET)
941  {
942  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
943    {
944    *negptr = TRUE;
945    ptr++;
946    }
947  for (i = 0; i < (int)sizeof(name) - 1; i++)
948    {
949    c = *(++ptr);
950    if (c == 0) goto ERROR_RETURN;
951    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
952    name[i] = c;
953    }
954  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
955  name[i] = 0;
956  }
957
958/* Otherwise there is just one following character */
959
960else
961  {
962  name[0] = c;
963  name[1] = 0;
964  }
965
966*ptrptr = ptr;
967
968/* Search for a recognized property name using binary chop */
969
970bot = 0;
971top = _pcre_utt_size;
972
973while (bot < top)
974  {
975  i = (bot + top) >> 1;
976  c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
977  if (c == 0)
978    {
979    *dptr = _pcre_utt[i].value;
980    return _pcre_utt[i].type;
981    }
982  if (c > 0) bot = i + 1; else top = i;
983  }
984
985*errorcodeptr = ERR47;
986*ptrptr = ptr;
987return -1;
988
989ERROR_RETURN:
990*errorcodeptr = ERR46;
991*ptrptr = ptr;
992return -1;
993}
994#endif
995
996
997
998
999/*************************************************
1000*            Check for counted repeat            *
1001*************************************************/
1002
1003/* This function is called when a '{' is encountered in a place where it might
1004start a quantifier. It looks ahead to see if it really is a quantifier or not.
1005It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1006where the ddds are digits.
1007
1008Arguments:
1009  p         pointer to the first char after '{'
1010
1011Returns:    TRUE or FALSE
1012*/
1013
1014static BOOL
1015is_counted_repeat(const uschar *p)
1016{
1017if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1018while ((digitab[*p] & ctype_digit) != 0) p++;
1019if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1020
1021if (*p++ != CHAR_COMMA) return FALSE;
1022if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1023
1024if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1025while ((digitab[*p] & ctype_digit) != 0) p++;
1026
1027return (*p == CHAR_RIGHT_CURLY_BRACKET);
1028}
1029
1030
1031
1032/*************************************************
1033*         Read repeat counts                     *
1034*************************************************/
1035
1036/* Read an item of the form {n,m} and return the values. This is called only
1037after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1038so the syntax is guaranteed to be correct, but we need to check the values.
1039
1040Arguments:
1041  p              pointer to first char after '{'
1042  minp           pointer to int for min
1043  maxp           pointer to int for max
1044                 returned as -1 if no max
1045  errorcodeptr   points to error code variable
1046
1047Returns:         pointer to '}' on success;
1048                 current ptr on error, with errorcodeptr set non-zero
1049*/
1050
1051static const uschar *
1052read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
1053{
1054int min = 0;
1055int max = -1;
1056
1057/* Read the minimum value and do a paranoid check: a negative value indicates
1058an integer overflow. */
1059
1060while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
1061if (min < 0 || min > 65535)
1062  {
1063  *errorcodeptr = ERR5;
1064  return p;
1065  }
1066
1067/* Read the maximum value if there is one, and again do a paranoid on its size.
1068Also, max must not be less than min. */
1069
1070if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1071  {
1072  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1073    {
1074    max = 0;
1075    while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
1076    if (max < 0 || max > 65535)
1077      {
1078      *errorcodeptr = ERR5;
1079      return p;
1080      }
1081    if (max < min)
1082      {
1083      *errorcodeptr = ERR4;
1084      return p;
1085      }
1086    }
1087  }
1088
1089/* Fill in the required variables, and pass back the pointer to the terminating
1090'}'. */
1091
1092*minp = min;
1093*maxp = max;
1094return p;
1095}
1096
1097
1098
1099/*************************************************
1100*  Subroutine for finding forward reference      *
1101*************************************************/
1102
1103/* This recursive function is called only from find_parens() below. The
1104top-level call starts at the beginning of the pattern. All other calls must
1105start at a parenthesis. It scans along a pattern's text looking for capturing
1106subpatterns, and counting them. If it finds a named pattern that matches the
1107name it is given, it returns its number. Alternatively, if the name is NULL, it
1108returns when it reaches a given numbered subpattern. Recursion is used to keep
1109track of subpatterns that reset the capturing group numbers - the (?| feature.
1110
1111This function was originally called only from the second pass, in which we know
1112that if (?< or (?' or (?P< is encountered, the name will be correctly
1113terminated because that is checked in the first pass. There is now one call to
1114this function in the first pass, to check for a recursive back reference by
1115name (so that we can make the whole group atomic). In this case, we need check
1116only up to the current position in the pattern, and that is still OK because
1117and previous occurrences will have been checked. To make this work, the test
1118for "end of pattern" is a check against cd->end_pattern in the main loop,
1119instead of looking for a binary zero. This means that the special first-pass
1120call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1121processing items within the loop are OK, because afterwards the main loop will
1122terminate.)
1123
1124Arguments:
1125  ptrptr       address of the current character pointer (updated)
1126  cd           compile background data
1127  name         name to seek, or NULL if seeking a numbered subpattern
1128  lorn         name length, or subpattern number if name is NULL
1129  xmode        TRUE if we are in /x mode
1130  utf8         TRUE if we are in UTF-8 mode
1131  count        pointer to the current capturing subpattern number (updated)
1132
1133Returns:       the number of the named subpattern, or -1 if not found
1134*/
1135
1136static int
1137find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1138  BOOL xmode, BOOL utf8, int *count)
1139{
1140uschar *ptr = *ptrptr;
1141int start_count = *count;
1142int hwm_count = start_count;
1143BOOL dup_parens = FALSE;
1144
1145/* If the first character is a parenthesis, check on the type of group we are
1146dealing with. The very first call may not start with a parenthesis. */
1147
1148if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1149  {
1150  /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1151
1152  if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1153
1154  /* Handle a normal, unnamed capturing parenthesis. */
1155
1156  else if (ptr[1] != CHAR_QUESTION_MARK)
1157    {
1158    *count += 1;
1159    if (name == NULL && *count == lorn) return *count;
1160    ptr++;
1161    }
1162
1163  /* All cases now have (? at the start. Remember when we are in a group
1164  where the parenthesis numbers are duplicated. */
1165
1166  else if (ptr[2] == CHAR_VERTICAL_LINE)
1167    {
1168    ptr += 3;
1169    dup_parens = TRUE;
1170    }
1171
1172  /* Handle comments; all characters are allowed until a ket is reached. */
1173
1174  else if (ptr[2] == CHAR_NUMBER_SIGN)
1175    {
1176    for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1177    goto FAIL_EXIT;
1178    }
1179
1180  /* Handle a condition. If it is an assertion, just carry on so that it
1181  is processed as normal. If not, skip to the closing parenthesis of the
1182  condition (there can't be any nested parens). */
1183
1184  else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1185    {
1186    ptr += 2;
1187    if (ptr[1] != CHAR_QUESTION_MARK)
1188      {
1189      while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1190      if (*ptr != 0) ptr++;
1191      }
1192    }
1193
1194  /* Start with (? but not a condition. */
1195
1196  else
1197    {
1198    ptr += 2;
1199    if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1200
1201    /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1202
1203    if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1204        ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1205      {
1206      int term;
1207      const uschar *thisname;
1208      *count += 1;
1209      if (name == NULL && *count == lorn) return *count;
1210      term = *ptr++;
1211      if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1212      thisname = ptr;
1213      while (*ptr != term) ptr++;
1214      if (name != NULL && lorn == ptr - thisname &&
1215          strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1216        return *count;
1217      term++;
1218      }
1219    }
1220  }
1221
1222/* Past any initial parenthesis handling, scan for parentheses or vertical
1223bars. Stop if we get to cd->end_pattern. Note that this is important for the
1224first-pass call when this value is temporarily adjusted to stop at the current
1225position. So DO NOT change this to a test for binary zero. */
1226
1227for (; ptr < cd->end_pattern; ptr++)
1228  {
1229  /* Skip over backslashed characters and also entire \Q...\E */
1230
1231  if (*ptr == CHAR_BACKSLASH)
1232    {
1233    if (*(++ptr) == 0) goto FAIL_EXIT;
1234    if (*ptr == CHAR_Q) for (;;)
1235      {
1236      while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1237      if (*ptr == 0) goto FAIL_EXIT;
1238      if (*(++ptr) == CHAR_E) break;
1239      }
1240    continue;
1241    }
1242
1243  /* Skip over character classes; this logic must be similar to the way they
1244  are handled for real. If the first character is '^', skip it. Also, if the
1245  first few characters (either before or after ^) are \Q\E or \E we skip them
1246  too. This makes for compatibility with Perl. Note the use of STR macros to
1247  encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1248
1249  if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1250    {
1251    BOOL negate_class = FALSE;
1252    for (;;)
1253      {
1254      if (ptr[1] == CHAR_BACKSLASH)
1255        {
1256        if (ptr[2] == CHAR_E)
1257          ptr+= 2;
1258        else if (strncmp((const char *)ptr+2,
1259                 STR_Q STR_BACKSLASH STR_E, 3) == 0)
1260          ptr += 4;
1261        else
1262          break;
1263        }
1264      else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1265        {
1266        negate_class = TRUE;
1267        ptr++;
1268        }
1269      else break;
1270      }
1271
1272    /* If the next character is ']', it is a data character that must be
1273    skipped, except in JavaScript compatibility mode. */
1274
1275    if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1276        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1277      ptr++;
1278
1279    while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1280      {
1281      if (*ptr == 0) return -1;
1282      if (*ptr == CHAR_BACKSLASH)
1283        {
1284        if (*(++ptr) == 0) goto FAIL_EXIT;
1285        if (*ptr == CHAR_Q) for (;;)
1286          {
1287          while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1288          if (*ptr == 0) goto FAIL_EXIT;
1289          if (*(++ptr) == CHAR_E) break;
1290          }
1291        continue;
1292        }
1293      }
1294    continue;
1295    }
1296
1297  /* Skip comments in /x mode */
1298
1299  if (xmode && *ptr == CHAR_NUMBER_SIGN)
1300    {
1301    ptr++;
1302    while (*ptr != 0)
1303      {
1304      if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1305      ptr++;
1306#ifdef SUPPORT_UTF8
1307      if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
1308#endif
1309      }
1310    if (*ptr == 0) goto FAIL_EXIT;
1311    continue;
1312    }
1313
1314  /* Check for the special metacharacters */
1315
1316  if (*ptr == CHAR_LEFT_PARENTHESIS)
1317    {
1318    int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, count);
1319    if (rc > 0) return rc;
1320    if (*ptr == 0) goto FAIL_EXIT;
1321    }
1322
1323  else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1324    {
1325    if (dup_parens && *count < hwm_count) *count = hwm_count;
1326    goto FAIL_EXIT;
1327    }
1328
1329  else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1330    {
1331    if (*count > hwm_count) hwm_count = *count;
1332    *count = start_count;
1333    }
1334  }
1335
1336FAIL_EXIT:
1337*ptrptr = ptr;
1338return -1;
1339}
1340
1341
1342
1343
1344/*************************************************
1345*       Find forward referenced subpattern       *
1346*************************************************/
1347
1348/* This function scans along a pattern's text looking for capturing
1349subpatterns, and counting them. If it finds a named pattern that matches the
1350name it is given, it returns its number. Alternatively, if the name is NULL, it
1351returns when it reaches a given numbered subpattern. This is used for forward
1352references to subpatterns. We used to be able to start this scan from the
1353current compiling point, using the current count value from cd->bracount, and
1354do it all in a single loop, but the addition of the possibility of duplicate
1355subpattern numbers means that we have to scan from the very start, in order to
1356take account of such duplicates, and to use a recursive function to keep track
1357of the different types of group.
1358
1359Arguments:
1360  cd           compile background data
1361  name         name to seek, or NULL if seeking a numbered subpattern
1362  lorn         name length, or subpattern number if name is NULL
1363  xmode        TRUE if we are in /x mode
1364  utf8         TRUE if we are in UTF-8 mode
1365
1366Returns:       the number of the found subpattern, or -1 if not found
1367*/
1368
1369static int
1370find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode,
1371  BOOL utf8)
1372{
1373uschar *ptr = (uschar *)cd->start_pattern;
1374int count = 0;
1375int rc;
1376
1377/* If the pattern does not start with an opening parenthesis, the first call
1378to find_parens_sub() will scan right to the end (if necessary). However, if it
1379does start with a parenthesis, find_parens_sub() will return when it hits the
1380matching closing parens. That is why we have to have a loop. */
1381
1382for (;;)
1383  {
1384  rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf8, &count);
1385  if (rc > 0 || *ptr++ == 0) break;
1386  }
1387
1388return rc;
1389}
1390
1391
1392
1393
1394/*************************************************
1395*      Find first significant op code            *
1396*************************************************/
1397
1398/* This is called by several functions that scan a compiled expression looking
1399for a fixed first character, or an anchoring op code etc. It skips over things
1400that do not influence this. For some calls, a change of option is important.
1401For some calls, it makes sense to skip negative forward and all backward
1402assertions, and also the \b assertion; for others it does not.
1403
1404Arguments:
1405  code         pointer to the start of the group
1406  options      pointer to external options
1407  optbit       the option bit whose changing is significant, or
1408                 zero if none are
1409  skipassert   TRUE if certain assertions are to be skipped
1410
1411Returns:       pointer to the first significant opcode
1412*/
1413
1414static const uschar*
1415first_significant_code(const uschar *code, int *options, int optbit,
1416  BOOL skipassert)
1417{
1418for (;;)
1419  {
1420  switch ((int)*code)
1421    {
1422    case OP_OPT:
1423    if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1424      *options = (int)code[1];
1425    code += 2;
1426    break;
1427
1428    case OP_ASSERT_NOT:
1429    case OP_ASSERTBACK:
1430    case OP_ASSERTBACK_NOT:
1431    if (!skipassert) return code;
1432    do code += GET(code, 1); while (*code == OP_ALT);
1433    code += _pcre_OP_lengths[*code];
1434    break;
1435
1436    case OP_WORD_BOUNDARY:
1437    case OP_NOT_WORD_BOUNDARY:
1438    if (!skipassert) return code;
1439    /* Fall through */
1440
1441    case OP_CALLOUT:
1442    case OP_CREF:
1443    case OP_NCREF:
1444    case OP_RREF:
1445    case OP_NRREF:
1446    case OP_DEF:
1447    code += _pcre_OP_lengths[*code];
1448    break;
1449
1450    default:
1451    return code;
1452    }
1453  }
1454/* Control never reaches here */
1455}
1456
1457
1458
1459
1460/*************************************************
1461*        Find the fixed length of a branch       *
1462*************************************************/
1463
1464/* Scan a branch and compute the fixed length of subject that will match it,
1465if the length is fixed. This is needed for dealing with backward assertions.
1466In UTF8 mode, the result is in characters rather than bytes. The branch is
1467temporarily terminated with OP_END when this function is called.
1468
1469This function is called when a backward assertion is encountered, so that if it
1470fails, the error message can point to the correct place in the pattern.
1471However, we cannot do this when the assertion contains subroutine calls,
1472because they can be forward references. We solve this by remembering this case
1473and doing the check at the end; a flag specifies which mode we are running in.
1474
1475Arguments:
1476  code     points to the start of the pattern (the bracket)
1477  options  the compiling options
1478  atend    TRUE if called when the pattern is complete
1479  cd       the "compile data" structure
1480
1481Returns:   the fixed length,
1482             or -1 if there is no fixed length,
1483             or -2 if \C was encountered
1484             or -3 if an OP_RECURSE item was encountered and atend is FALSE
1485*/
1486
1487static int
1488find_fixedlength(uschar *code, int options, BOOL atend, compile_data *cd)
1489{
1490int length = -1;
1491
1492register int branchlength = 0;
1493register uschar *cc = code + 1 + LINK_SIZE;
1494
1495/* Scan along the opcodes for this branch. If we get to the end of the
1496branch, check the length against that of the other branches. */
1497
1498for (;;)
1499  {
1500  int d;
1501  uschar *ce, *cs;
1502  register int op = *cc;
1503  switch (op)
1504    {
1505    case OP_CBRA:
1506    case OP_BRA:
1507    case OP_ONCE:
1508    case OP_COND:
1509    d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options, atend, cd);
1510    if (d < 0) return d;
1511    branchlength += d;
1512    do cc += GET(cc, 1); while (*cc == OP_ALT);
1513    cc += 1 + LINK_SIZE;
1514    break;
1515
1516    /* Reached end of a branch; if it's a ket it is the end of a nested
1517    call. If it's ALT it is an alternation in a nested call. If it is
1518    END it's the end of the outer call. All can be handled by the same code. */
1519
1520    case OP_ALT:
1521    case OP_KET:
1522    case OP_KETRMAX:
1523    case OP_KETRMIN:
1524    case OP_END:
1525    if (length < 0) length = branchlength;
1526      else if (length != branchlength) return -1;
1527    if (*cc != OP_ALT) return length;
1528    cc += 1 + LINK_SIZE;
1529    branchlength = 0;
1530    break;
1531
1532    /* A true recursion implies not fixed length, but a subroutine call may
1533    be OK. If the subroutine is a forward reference, we can't deal with
1534    it until the end of the pattern, so return -3. */
1535
1536    case OP_RECURSE:
1537    if (!atend) return -3;
1538    cs = ce = (uschar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1539    do ce += GET(ce, 1); while (*ce == OP_ALT);       /* End subpattern */
1540    if (cc > cs && cc < ce) return -1;                /* Recursion */
1541    d = find_fixedlength(cs + 2, options, atend, cd);
1542    if (d < 0) return d;
1543    branchlength += d;
1544    cc += 1 + LINK_SIZE;
1545    break;
1546
1547    /* Skip over assertive subpatterns */
1548
1549    case OP_ASSERT:
1550    case OP_ASSERT_NOT:
1551    case OP_ASSERTBACK:
1552    case OP_ASSERTBACK_NOT:
1553    do cc += GET(cc, 1); while (*cc == OP_ALT);
1554    /* Fall through */
1555
1556    /* Skip over things that don't match chars */
1557
1558    case OP_REVERSE:
1559    case OP_CREF:
1560    case OP_NCREF:
1561    case OP_RREF:
1562    case OP_NRREF:
1563    case OP_DEF:
1564    case OP_OPT:
1565    case OP_CALLOUT:
1566    case OP_SOD:
1567    case OP_SOM:
1568    case OP_SET_SOM:
1569    case OP_EOD:
1570    case OP_EODN:
1571    case OP_CIRC:
1572    case OP_DOLL:
1573    case OP_NOT_WORD_BOUNDARY:
1574    case OP_WORD_BOUNDARY:
1575    cc += _pcre_OP_lengths[*cc];
1576    break;
1577
1578    /* Handle literal characters */
1579
1580    case OP_CHAR:
1581    case OP_CHARNC:
1582    case OP_NOT:
1583    branchlength++;
1584    cc += 2;
1585#ifdef SUPPORT_UTF8
1586    if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1587      cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1588#endif
1589    break;
1590
1591    /* Handle exact repetitions. The count is already in characters, but we
1592    need to skip over a multibyte character in UTF8 mode.  */
1593
1594    case OP_EXACT:
1595    branchlength += GET2(cc,1);
1596    cc += 4;
1597#ifdef SUPPORT_UTF8
1598    if ((options & PCRE_UTF8) != 0 && cc[-1] >= 0xc0)
1599      cc += _pcre_utf8_table4[cc[-1] & 0x3f];
1600#endif
1601    break;
1602
1603    case OP_TYPEEXACT:
1604    branchlength += GET2(cc,1);
1605    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1606    cc += 4;
1607    break;
1608
1609    /* Handle single-char matchers */
1610
1611    case OP_PROP:
1612    case OP_NOTPROP:
1613    cc += 2;
1614    /* Fall through */
1615
1616    case OP_NOT_DIGIT:
1617    case OP_DIGIT:
1618    case OP_NOT_WHITESPACE:
1619    case OP_WHITESPACE:
1620    case OP_NOT_WORDCHAR:
1621    case OP_WORDCHAR:
1622    case OP_ANY:
1623    case OP_ALLANY:
1624    branchlength++;
1625    cc++;
1626    break;
1627
1628    /* The single-byte matcher isn't allowed */
1629
1630    case OP_ANYBYTE:
1631    return -2;
1632
1633    /* Check a class for variable quantification */
1634
1635#ifdef SUPPORT_UTF8
1636    case OP_XCLASS:
1637    cc += GET(cc, 1) - 33;
1638    /* Fall through */
1639#endif
1640
1641    case OP_CLASS:
1642    case OP_NCLASS:
1643    cc += 33;
1644
1645    switch (*cc)
1646      {
1647      case OP_CRSTAR:
1648      case OP_CRMINSTAR:
1649      case OP_CRQUERY:
1650      case OP_CRMINQUERY:
1651      return -1;
1652
1653      case OP_CRRANGE:
1654      case OP_CRMINRANGE:
1655      if (GET2(cc,1) != GET2(cc,3)) return -1;
1656      branchlength += GET2(cc,1);
1657      cc += 5;
1658      break;
1659
1660      default:
1661      branchlength++;
1662      }
1663    break;
1664
1665    /* Anything else is variable length */
1666
1667    default:
1668    return -1;
1669    }
1670  }
1671/* Control never gets here */
1672}
1673
1674
1675
1676
1677/*************************************************
1678*    Scan compiled regex for specific bracket    *
1679*************************************************/
1680
1681/* This little function scans through a compiled pattern until it finds a
1682capturing bracket with the given number, or, if the number is negative, an
1683instance of OP_REVERSE for a lookbehind. The function is global in the C sense
1684so that it can be called from pcre_study() when finding the minimum matching
1685length.
1686
1687Arguments:
1688  code        points to start of expression
1689  utf8        TRUE in UTF-8 mode
1690  number      the required bracket number or negative to find a lookbehind
1691
1692Returns:      pointer to the opcode for the bracket, or NULL if not found
1693*/
1694
1695const uschar *
1696_pcre_find_bracket(const uschar *code, BOOL utf8, int number)
1697{
1698for (;;)
1699  {
1700  register int c = *code;
1701  if (c == OP_END) return NULL;
1702
1703  /* XCLASS is used for classes that cannot be represented just by a bit
1704  map. This includes negated single high-valued characters. The length in
1705  the table is zero; the actual length is stored in the compiled code. */
1706
1707  if (c == OP_XCLASS) code += GET(code, 1);
1708
1709  /* Handle recursion */
1710
1711  else if (c == OP_REVERSE)
1712    {
1713    if (number < 0) return (uschar *)code;
1714    code += _pcre_OP_lengths[c];
1715    }
1716
1717  /* Handle capturing bracket */
1718
1719  else if (c == OP_CBRA)
1720    {
1721    int n = GET2(code, 1+LINK_SIZE);
1722    if (n == number) return (uschar *)code;
1723    code += _pcre_OP_lengths[c];
1724    }
1725
1726  /* Otherwise, we can get the item's length from the table, except that for
1727  repeated character types, we have to test for \p and \P, which have an extra
1728  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1729  must add in its length. */
1730
1731  else
1732    {
1733    switch(c)
1734      {
1735      case OP_TYPESTAR:
1736      case OP_TYPEMINSTAR:
1737      case OP_TYPEPLUS:
1738      case OP_TYPEMINPLUS:
1739      case OP_TYPEQUERY:
1740      case OP_TYPEMINQUERY:
1741      case OP_TYPEPOSSTAR:
1742      case OP_TYPEPOSPLUS:
1743      case OP_TYPEPOSQUERY:
1744      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1745      break;
1746
1747      case OP_TYPEUPTO:
1748      case OP_TYPEMINUPTO:
1749      case OP_TYPEEXACT:
1750      case OP_TYPEPOSUPTO:
1751      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1752      break;
1753
1754      case OP_MARK:
1755      case OP_PRUNE_ARG:
1756      case OP_SKIP_ARG:
1757      code += code[1];
1758      break;
1759
1760      case OP_THEN_ARG:
1761      code += code[1+LINK_SIZE];
1762      break;
1763      }
1764
1765    /* Add in the fixed length from the table */
1766
1767    code += _pcre_OP_lengths[c];
1768
1769  /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1770  a multi-byte character. The length in the table is a minimum, so we have to
1771  arrange to skip the extra bytes. */
1772
1773#ifdef SUPPORT_UTF8
1774    if (utf8) switch(c)
1775      {
1776      case OP_CHAR:
1777      case OP_CHARNC:
1778      case OP_EXACT:
1779      case OP_UPTO:
1780      case OP_MINUPTO:
1781      case OP_POSUPTO:
1782      case OP_STAR:
1783      case OP_MINSTAR:
1784      case OP_POSSTAR:
1785      case OP_PLUS:
1786      case OP_MINPLUS:
1787      case OP_POSPLUS:
1788      case OP_QUERY:
1789      case OP_MINQUERY:
1790      case OP_POSQUERY:
1791      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1792      break;
1793      }
1794#else
1795    (void)(utf8);  /* Keep compiler happy by referencing function argument */
1796#endif
1797    }
1798  }
1799}
1800
1801
1802
1803/*************************************************
1804*   Scan compiled regex for recursion reference  *
1805*************************************************/
1806
1807/* This little function scans through a compiled pattern until it finds an
1808instance of OP_RECURSE.
1809
1810Arguments:
1811  code        points to start of expression
1812  utf8        TRUE in UTF-8 mode
1813
1814Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1815*/
1816
1817static const uschar *
1818find_recurse(const uschar *code, BOOL utf8)
1819{
1820for (;;)
1821  {
1822  register int c = *code;
1823  if (c == OP_END) return NULL;
1824  if (c == OP_RECURSE) return code;
1825
1826  /* XCLASS is used for classes that cannot be represented just by a bit
1827  map. This includes negated single high-valued characters. The length in
1828  the table is zero; the actual length is stored in the compiled code. */
1829
1830  if (c == OP_XCLASS) code += GET(code, 1);
1831
1832  /* Otherwise, we can get the item's length from the table, except that for
1833  repeated character types, we have to test for \p and \P, which have an extra
1834  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
1835  must add in its length. */
1836
1837  else
1838    {
1839    switch(c)
1840      {
1841      case OP_TYPESTAR:
1842      case OP_TYPEMINSTAR:
1843      case OP_TYPEPLUS:
1844      case OP_TYPEMINPLUS:
1845      case OP_TYPEQUERY:
1846      case OP_TYPEMINQUERY:
1847      case OP_TYPEPOSSTAR:
1848      case OP_TYPEPOSPLUS:
1849      case OP_TYPEPOSQUERY:
1850      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1851      break;
1852
1853      case OP_TYPEPOSUPTO:
1854      case OP_TYPEUPTO:
1855      case OP_TYPEMINUPTO:
1856      case OP_TYPEEXACT:
1857      if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1858      break;
1859
1860      case OP_MARK:
1861      case OP_PRUNE_ARG:
1862      case OP_SKIP_ARG:
1863      code += code[1];
1864      break;
1865
1866      case OP_THEN_ARG:
1867      code += code[1+LINK_SIZE];
1868      break;
1869      }
1870
1871    /* Add in the fixed length from the table */
1872
1873    code += _pcre_OP_lengths[c];
1874
1875    /* In UTF-8 mode, opcodes that are followed by a character may be followed
1876    by a multi-byte character. The length in the table is a minimum, so we have
1877    to arrange to skip the extra bytes. */
1878
1879#ifdef SUPPORT_UTF8
1880    if (utf8) switch(c)
1881      {
1882      case OP_CHAR:
1883      case OP_CHARNC:
1884      case OP_EXACT:
1885      case OP_UPTO:
1886      case OP_MINUPTO:
1887      case OP_POSUPTO:
1888      case OP_STAR:
1889      case OP_MINSTAR:
1890      case OP_POSSTAR:
1891      case OP_PLUS:
1892      case OP_MINPLUS:
1893      case OP_POSPLUS:
1894      case OP_QUERY:
1895      case OP_MINQUERY:
1896      case OP_POSQUERY:
1897      if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1898      break;
1899      }
1900#else
1901    (void)(utf8);  /* Keep compiler happy by referencing function argument */
1902#endif
1903    }
1904  }
1905}
1906
1907
1908
1909/*************************************************
1910*    Scan compiled branch for non-emptiness      *
1911*************************************************/
1912
1913/* This function scans through a branch of a compiled pattern to see whether it
1914can match the empty string or not. It is called from could_be_empty()
1915below and from compile_branch() when checking for an unlimited repeat of a
1916group that can match nothing. Note that first_significant_code() skips over
1917backward and negative forward assertions when its final argument is TRUE. If we
1918hit an unclosed bracket, we return "empty" - this means we've struck an inner
1919bracket whose current branch will already have been scanned.
1920
1921Arguments:
1922  code        points to start of search
1923  endcode     points to where to stop
1924  utf8        TRUE if in UTF8 mode
1925  cd          contains pointers to tables etc.
1926
1927Returns:      TRUE if what is matched could be empty
1928*/
1929
1930static BOOL
1931could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8,
1932  compile_data *cd)
1933{
1934register int c;
1935for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1936     code < endcode;
1937     code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1938  {
1939  const uschar *ccode;
1940
1941  c = *code;
1942
1943  /* Skip over forward assertions; the other assertions are skipped by
1944  first_significant_code() with a TRUE final argument. */
1945
1946  if (c == OP_ASSERT)
1947    {
1948    do code += GET(code, 1); while (*code == OP_ALT);
1949    c = *code;
1950    continue;
1951    }
1952
1953  /* Groups with zero repeats can of course be empty; skip them. */
1954
1955  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1956    {
1957    code += _pcre_OP_lengths[c];
1958    do code += GET(code, 1); while (*code == OP_ALT);
1959    c = *code;
1960    continue;
1961    }
1962
1963  /* For a recursion/subroutine call, if its end has been reached, which
1964  implies a subroutine call, we can scan it. */
1965
1966  if (c == OP_RECURSE)
1967    {
1968    BOOL empty_branch = FALSE;
1969    const uschar *scode = cd->start_code + GET(code, 1);
1970    if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
1971    do
1972      {
1973      if (could_be_empty_branch(scode, endcode, utf8, cd))
1974        {
1975        empty_branch = TRUE;
1976        break;
1977        }
1978      scode += GET(scode, 1);
1979      }
1980    while (*scode == OP_ALT);
1981    if (!empty_branch) return FALSE;  /* All branches are non-empty */
1982    continue;
1983    }
1984
1985  /* For other groups, scan the branches. */
1986
1987  if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1988    {
1989    BOOL empty_branch;
1990    if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1991
1992    /* If a conditional group has only one branch, there is a second, implied,
1993    empty branch, so just skip over the conditional, because it could be empty.
1994    Otherwise, scan the individual branches of the group. */
1995
1996    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1997      code += GET(code, 1);
1998    else
1999      {
2000      empty_branch = FALSE;
2001      do
2002        {
2003        if (!empty_branch && could_be_empty_branch(code, endcode, utf8, cd))
2004          empty_branch = TRUE;
2005        code += GET(code, 1);
2006        }
2007      while (*code == OP_ALT);
2008      if (!empty_branch) return FALSE;   /* All branches are non-empty */
2009      }
2010
2011    c = *code;
2012    continue;
2013    }
2014
2015  /* Handle the other opcodes */
2016
2017  switch (c)
2018    {
2019    /* Check for quantifiers after a class. XCLASS is used for classes that
2020    cannot be represented just by a bit map. This includes negated single
2021    high-valued characters. The length in _pcre_OP_lengths[] is zero; the
2022    actual length is stored in the compiled code, so we must update "code"
2023    here. */
2024
2025#ifdef SUPPORT_UTF8
2026    case OP_XCLASS:
2027    ccode = code += GET(code, 1);
2028    goto CHECK_CLASS_REPEAT;
2029#endif
2030
2031    case OP_CLASS:
2032    case OP_NCLASS:
2033    ccode = code + 33;
2034
2035#ifdef SUPPORT_UTF8
2036    CHECK_CLASS_REPEAT:
2037#endif
2038
2039    switch (*ccode)
2040      {
2041      case OP_CRSTAR:            /* These could be empty; continue */
2042      case OP_CRMINSTAR:
2043      case OP_CRQUERY:
2044      case OP_CRMINQUERY:
2045      break;
2046
2047      default:                   /* Non-repeat => class must match */
2048      case OP_CRPLUS:            /* These repeats aren't empty */
2049      case OP_CRMINPLUS:
2050      return FALSE;
2051
2052      case OP_CRRANGE:
2053      case OP_CRMINRANGE:
2054      if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2055      break;
2056      }
2057    break;
2058
2059    /* Opcodes that must match a character */
2060
2061    case OP_PROP:
2062    case OP_NOTPROP:
2063    case OP_EXTUNI:
2064    case OP_NOT_DIGIT:
2065    case OP_DIGIT:
2066    case OP_NOT_WHITESPACE:
2067    case OP_WHITESPACE:
2068    case OP_NOT_WORDCHAR:
2069    case OP_WORDCHAR:
2070    case OP_ANY:
2071    case OP_ALLANY:
2072    case OP_ANYBYTE:
2073    case OP_CHAR:
2074    case OP_CHARNC:
2075    case OP_NOT:
2076    case OP_PLUS:
2077    case OP_MINPLUS:
2078    case OP_POSPLUS:
2079    case OP_EXACT:
2080    case OP_NOTPLUS:
2081    case OP_NOTMINPLUS:
2082    case OP_NOTPOSPLUS:
2083    case OP_NOTEXACT:
2084    case OP_TYPEPLUS:
2085    case OP_TYPEMINPLUS:
2086    case OP_TYPEPOSPLUS:
2087    case OP_TYPEEXACT:
2088    return FALSE;
2089
2090    /* These are going to continue, as they may be empty, but we have to
2091    fudge the length for the \p and \P cases. */
2092
2093    case OP_TYPESTAR:
2094    case OP_TYPEMINSTAR:
2095    case OP_TYPEPOSSTAR:
2096    case OP_TYPEQUERY:
2097    case OP_TYPEMINQUERY:
2098    case OP_TYPEPOSQUERY:
2099    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2100    break;
2101
2102    /* Same for these */
2103
2104    case OP_TYPEUPTO:
2105    case OP_TYPEMINUPTO:
2106    case OP_TYPEPOSUPTO:
2107    if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
2108    break;
2109
2110    /* End of branch */
2111
2112    case OP_KET:
2113    case OP_KETRMAX:
2114    case OP_KETRMIN:
2115    case OP_ALT:
2116    return TRUE;
2117
2118    /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2119    MINUPTO, and POSUPTO may be followed by a multibyte character */
2120
2121#ifdef SUPPORT_UTF8
2122    case OP_STAR:
2123    case OP_MINSTAR:
2124    case OP_POSSTAR:
2125    case OP_QUERY:
2126    case OP_MINQUERY:
2127    case OP_POSQUERY:
2128    if (utf8 && code[1] >= 0xc0) code += _pcre_utf8_table4[code[1] & 0x3f];
2129    break;
2130
2131    case OP_UPTO:
2132    case OP_MINUPTO:
2133    case OP_POSUPTO:
2134    if (utf8 && code[3] >= 0xc0) code += _pcre_utf8_table4[code[3] & 0x3f];
2135    break;
2136#endif
2137
2138    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2139    string. */
2140
2141    case OP_MARK:
2142    case OP_PRUNE_ARG:
2143    case OP_SKIP_ARG:
2144    code += code[1];
2145    break;
2146
2147    case OP_THEN_ARG:
2148    code += code[1+LINK_SIZE];
2149    break;
2150
2151    /* None of the remaining opcodes are required to match a character. */
2152
2153    default:
2154    break;
2155    }
2156  }
2157
2158return TRUE;
2159}
2160
2161
2162
2163/*************************************************
2164*    Scan compiled regex for non-emptiness       *
2165*************************************************/
2166
2167/* This function is called to check for left recursive calls. We want to check
2168the current branch of the current pattern to see if it could match the empty
2169string. If it could, we must look outwards for branches at other levels,
2170stopping when we pass beyond the bracket which is the subject of the recursion.
2171
2172Arguments:
2173  code        points to start of the recursion
2174  endcode     points to where to stop (current RECURSE item)
2175  bcptr       points to the chain of current (unclosed) branch starts
2176  utf8        TRUE if in UTF-8 mode
2177  cd          pointers to tables etc
2178
2179Returns:      TRUE if what is matched could be empty
2180*/
2181
2182static BOOL
2183could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
2184  BOOL utf8, compile_data *cd)
2185{
2186while (bcptr != NULL && bcptr->current_branch >= code)
2187  {
2188  if (!could_be_empty_branch(bcptr->current_branch, endcode, utf8, cd))
2189    return FALSE;
2190  bcptr = bcptr->outer;
2191  }
2192return TRUE;
2193}
2194
2195
2196
2197/*************************************************
2198*           Check for POSIX class syntax         *
2199*************************************************/
2200
2201/* This function is called when the sequence "[:" or "[." or "[=" is
2202encountered in a character class. It checks whether this is followed by a
2203sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2204reach an unescaped ']' without the special preceding character, return FALSE.
2205
2206Originally, this function only recognized a sequence of letters between the
2207terminators, but it seems that Perl recognizes any sequence of characters,
2208though of course unknown POSIX names are subsequently rejected. Perl gives an
2209"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2210didn't consider this to be a POSIX class. Likewise for [:1234:].
2211
2212The problem in trying to be exactly like Perl is in the handling of escapes. We
2213have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2214class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2215below handles the special case of \], but does not try to do any other escape
2216processing. This makes it different from Perl for cases such as [:l\ower:]
2217where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2218"l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2219I think.
2220
2221Arguments:
2222  ptr      pointer to the initial [
2223  endptr   where to return the end pointer
2224
2225Returns:   TRUE or FALSE
2226*/
2227
2228static BOOL
2229check_posix_syntax(const uschar *ptr, const uschar **endptr)
2230{
2231int terminator;          /* Don't combine these lines; the Solaris cc */
2232terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2233for (++ptr; *ptr != 0; ptr++)
2234  {
2235  if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2236    {
2237    if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2238    if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2239      {
2240      *endptr = ptr;
2241      return TRUE;
2242      }
2243    }
2244  }
2245return FALSE;
2246}
2247
2248
2249
2250
2251/*************************************************
2252*          Check POSIX class name                *
2253*************************************************/
2254
2255/* This function is called to check the name given in a POSIX-style class entry
2256such as [:alnum:].
2257
2258Arguments:
2259  ptr        points to the first letter
2260  len        the length of the name
2261
2262Returns:     a value representing the name, or -1 if unknown
2263*/
2264
2265static int
2266check_posix_name(const uschar *ptr, int len)
2267{
2268const char *pn = posix_names;
2269register int yield = 0;
2270while (posix_name_lengths[yield] != 0)
2271  {
2272  if (len == posix_name_lengths[yield] &&
2273    strncmp((const char *)ptr, pn, len) == 0) return yield;
2274  pn += posix_name_lengths[yield] + 1;
2275  yield++;
2276  }
2277return -1;
2278}
2279
2280
2281/*************************************************
2282*    Adjust OP_RECURSE items in repeated group   *
2283*************************************************/
2284
2285/* OP_RECURSE items contain an offset from the start of the regex to the group
2286that is referenced. This means that groups can be replicated for fixed
2287repetition simply by copying (because the recursion is allowed to refer to
2288earlier groups that are outside the current group). However, when a group is
2289optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2290inserted before it, after it has been compiled. This means that any OP_RECURSE
2291items within it that refer to the group itself or any contained groups have to
2292have their offsets adjusted. That one of the jobs of this function. Before it
2293is called, the partially compiled regex must be temporarily terminated with
2294OP_END.
2295
2296This function has been extended with the possibility of forward references for
2297recursions and subroutine calls. It must also check the list of such references
2298for the group we are dealing with. If it finds that one of the recursions in
2299the current group is on this list, it adjusts the offset in the list, not the
2300value in the reference (which is a group number).
2301
2302Arguments:
2303  group      points to the start of the group
2304  adjust     the amount by which the group is to be moved
2305  utf8       TRUE in UTF-8 mode
2306  cd         contains pointers to tables etc.
2307  save_hwm   the hwm forward reference pointer at the start of the group
2308
2309Returns:     nothing
2310*/
2311
2312static void
2313adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2314  uschar *save_hwm)
2315{
2316uschar *ptr = group;
2317
2318while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2319  {
2320  int offset;
2321  uschar *hc;
2322
2323  /* See if this recursion is on the forward reference list. If so, adjust the
2324  reference. */
2325
2326  for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2327    {
2328    offset = GET(hc, 0);
2329    if (cd->start_code + offset == ptr + 1)
2330      {
2331      PUT(hc, 0, offset + adjust);
2332      break;
2333      }
2334    }
2335
2336  /* Otherwise, adjust the recursion offset if it's after the start of this
2337  group. */
2338
2339  if (hc >= cd->hwm)
2340    {
2341    offset = GET(ptr, 1);
2342    if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2343    }
2344
2345  ptr += 1 + LINK_SIZE;
2346  }
2347}
2348
2349
2350
2351/*************************************************
2352*        Insert an automatic callout point       *
2353*************************************************/
2354
2355/* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2356callout points before each pattern item.
2357
2358Arguments:
2359  code           current code pointer
2360  ptr            current pattern pointer
2361  cd             pointers to tables etc
2362
2363Returns:         new code pointer
2364*/
2365
2366static uschar *
2367auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2368{
2369*code++ = OP_CALLOUT;
2370*code++ = 255;
2371PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2372PUT(code, LINK_SIZE, 0);                       /* Default length */
2373return code + 2*LINK_SIZE;
2374}
2375
2376
2377
2378/*************************************************
2379*         Complete a callout item                *
2380*************************************************/
2381
2382/* A callout item contains the length of the next item in the pattern, which
2383we can't fill in till after we have reached the relevant point. This is used
2384for both automatic and manual callouts.
2385
2386Arguments:
2387  previous_callout   points to previous callout item
2388  ptr                current pattern pointer
2389  cd                 pointers to tables etc
2390
2391Returns:             nothing
2392*/
2393
2394static void
2395complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2396{
2397int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2398PUT(previous_callout, 2 + LINK_SIZE, length);
2399}
2400
2401
2402
2403#ifdef SUPPORT_UCP
2404/*************************************************
2405*           Get othercase range                  *
2406*************************************************/
2407
2408/* This function is passed the start and end of a class range, in UTF-8 mode
2409with UCP support. It searches up the characters, looking for internal ranges of
2410characters in the "other" case. Each call returns the next one, updating the
2411start address.
2412
2413Arguments:
2414  cptr        points to starting character value; updated
2415  d           end value
2416  ocptr       where to put start of othercase range
2417  odptr       where to put end of othercase range
2418
2419Yield:        TRUE when range returned; FALSE when no more
2420*/
2421
2422static BOOL
2423get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2424  unsigned int *odptr)
2425{
2426unsigned int c, othercase, next;
2427
2428for (c = *cptr; c <= d; c++)
2429  { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2430
2431if (c > d) return FALSE;
2432
2433*ocptr = othercase;
2434next = othercase + 1;
2435
2436for (++c; c <= d; c++)
2437  {
2438  if (UCD_OTHERCASE(c) != next) break;
2439  next++;
2440  }
2441
2442*odptr = next - 1;
2443*cptr = c;
2444
2445return TRUE;
2446}
2447
2448
2449
2450/*************************************************
2451*        Check a character and a property        *
2452*************************************************/
2453
2454/* This function is called by check_auto_possessive() when a property item
2455is adjacent to a fixed character.
2456
2457Arguments:
2458  c            the character
2459  ptype        the property type
2460  pdata        the data for the type
2461  negated      TRUE if it's a negated property (\P or \p{^)
2462
2463Returns:       TRUE if auto-possessifying is OK
2464*/
2465
2466static BOOL
2467check_char_prop(int c, int ptype, int pdata, BOOL negated)
2468{
2469const ucd_record *prop = GET_UCD(c);
2470switch(ptype)
2471  {
2472  case PT_LAMP:
2473  return (prop->chartype == ucp_Lu ||
2474          prop->chartype == ucp_Ll ||
2475          prop->chartype == ucp_Lt) == negated;
2476
2477  case PT_GC:
2478  return (pdata == _pcre_ucp_gentype[prop->chartype]) == negated;
2479
2480  case PT_PC:
2481  return (pdata == prop->chartype) == negated;
2482
2483  case PT_SC:
2484  return (pdata == prop->script) == negated;
2485
2486  /* These are specials */
2487
2488  case PT_ALNUM:
2489  return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2490          _pcre_ucp_gentype[prop->chartype] == ucp_N) == negated;
2491
2492  case PT_SPACE:    /* Perl space */
2493  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2494          c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2495          == negated;
2496
2497  case PT_PXSPACE:  /* POSIX space */
2498  return (_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2499          c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2500          c == CHAR_FF || c == CHAR_CR)
2501          == negated;
2502
2503  case PT_WORD:
2504  return (_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2505          _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2506          c == CHAR_UNDERSCORE) == negated;
2507  }
2508return FALSE;
2509}
2510#endif  /* SUPPORT_UCP */
2511
2512
2513
2514/*************************************************
2515*     Check if auto-possessifying is possible    *
2516*************************************************/
2517
2518/* This function is called for unlimited repeats of certain items, to see
2519whether the next thing could possibly match the repeated item. If not, it makes
2520sense to automatically possessify the repeated item.
2521
2522Arguments:
2523  previous      pointer to the repeated opcode
2524  utf8          TRUE in UTF-8 mode
2525  ptr           next character in pattern
2526  options       options bits
2527  cd            contains pointers to tables etc.
2528
2529Returns:        TRUE if possessifying is wanted
2530*/
2531
2532static BOOL
2533check_auto_possessive(const uschar *previous, BOOL utf8, const uschar *ptr,
2534  int options, compile_data *cd)
2535{
2536int c, next;
2537int op_code = *previous++;
2538
2539/* Skip whitespace and comments in extended mode */
2540
2541if ((options & PCRE_EXTENDED) != 0)
2542  {
2543  for (;;)
2544    {
2545    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2546    if (*ptr == CHAR_NUMBER_SIGN)
2547      {
2548      ptr++;
2549      while (*ptr != 0)
2550        {
2551        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2552        ptr++;
2553#ifdef SUPPORT_UTF8
2554        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2555#endif
2556        }
2557      }
2558    else break;
2559    }
2560  }
2561
2562/* If the next item is one that we can handle, get its value. A non-negative
2563value is a character, a negative value is an escape value. */
2564
2565if (*ptr == CHAR_BACKSLASH)
2566  {
2567  int temperrorcode = 0;
2568  next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2569  if (temperrorcode != 0) return FALSE;
2570  ptr++;    /* Point after the escape sequence */
2571  }
2572
2573else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2574  {
2575#ifdef SUPPORT_UTF8
2576  if (utf8) { GETCHARINC(next, ptr); } else
2577#endif
2578  next = *ptr++;
2579  }
2580
2581else return FALSE;
2582
2583/* Skip whitespace and comments in extended mode */
2584
2585if ((options & PCRE_EXTENDED) != 0)
2586  {
2587  for (;;)
2588    {
2589    while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2590    if (*ptr == CHAR_NUMBER_SIGN)
2591      {
2592      ptr++;
2593      while (*ptr != 0)
2594        {
2595        if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2596        ptr++;
2597#ifdef SUPPORT_UTF8
2598        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
2599#endif
2600        }
2601      }
2602    else break;
2603    }
2604  }
2605
2606/* If the next thing is itself optional, we have to give up. */
2607
2608if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2609  strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2610    return FALSE;
2611
2612/* Now compare the next item with the previous opcode. First, handle cases when
2613the next item is a character. */
2614
2615if (next >= 0) switch(op_code)
2616  {
2617  case OP_CHAR:
2618#ifdef SUPPORT_UTF8
2619  GETCHARTEST(c, previous);
2620#else
2621  c = *previous;
2622#endif
2623  return c != next;
2624
2625  /* For CHARNC (caseless character) we must check the other case. If we have
2626  Unicode property support, we can use it to test the other case of
2627  high-valued characters. */
2628
2629  case OP_CHARNC:
2630#ifdef SUPPORT_UTF8
2631  GETCHARTEST(c, previous);
2632#else
2633  c = *previous;
2634#endif
2635  if (c == next) return FALSE;
2636#ifdef SUPPORT_UTF8
2637  if (utf8)
2638    {
2639    unsigned int othercase;
2640    if (next < 128) othercase = cd->fcc[next]; else
2641#ifdef SUPPORT_UCP
2642    othercase = UCD_OTHERCASE((unsigned int)next);
2643#else
2644    othercase = NOTACHAR;
2645#endif
2646    return (unsigned int)c != othercase;
2647    }
2648  else
2649#endif  /* SUPPORT_UTF8 */
2650  return (c != cd->fcc[next]);  /* Non-UTF-8 mode */
2651
2652  /* For OP_NOT, its data is always a single-byte character. */
2653
2654  case OP_NOT:
2655  if ((c = *previous) == next) return TRUE;
2656  if ((options & PCRE_CASELESS) == 0) return FALSE;
2657#ifdef SUPPORT_UTF8
2658  if (utf8)
2659    {
2660    unsigned int othercase;
2661    if (next < 128) othercase = cd->fcc[next]; else
2662#ifdef SUPPORT_UCP
2663    othercase = UCD_OTHERCASE(next);
2664#else
2665    othercase = NOTACHAR;
2666#endif
2667    return (unsigned int)c == othercase;
2668    }
2669  else
2670#endif  /* SUPPORT_UTF8 */
2671  return (c == cd->fcc[next]);  /* Non-UTF-8 mode */
2672
2673  /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
2674  When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
2675
2676  case OP_DIGIT:
2677  return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2678
2679  case OP_NOT_DIGIT:
2680  return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2681
2682  case OP_WHITESPACE:
2683  return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2684
2685  case OP_NOT_WHITESPACE:
2686  return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2687
2688  case OP_WORDCHAR:
2689  return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2690
2691  case OP_NOT_WORDCHAR:
2692  return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2693
2694  case OP_HSPACE:
2695  case OP_NOT_HSPACE:
2696  switch(next)
2697    {
2698    case 0x09:
2699    case 0x20:
2700    case 0xa0:
2701    case 0x1680:
2702    case 0x180e:
2703    case 0x2000:
2704    case 0x2001:
2705    case 0x2002:
2706    case 0x2003:
2707    case 0x2004:
2708    case 0x2005:
2709    case 0x2006:
2710    case 0x2007:
2711    case 0x2008:
2712    case 0x2009:
2713    case 0x200A:
2714    case 0x202f:
2715    case 0x205f:
2716    case 0x3000:
2717    return op_code == OP_NOT_HSPACE;
2718    default:
2719    return op_code != OP_NOT_HSPACE;
2720    }
2721
2722  case OP_ANYNL:
2723  case OP_VSPACE:
2724  case OP_NOT_VSPACE:
2725  switch(next)
2726    {
2727    case 0x0a:
2728    case 0x0b:
2729    case 0x0c:
2730    case 0x0d:
2731    case 0x85:
2732    case 0x2028:
2733    case 0x2029:
2734    return op_code == OP_NOT_VSPACE;
2735    default:
2736    return op_code != OP_NOT_VSPACE;
2737    }
2738
2739#ifdef SUPPORT_UCP
2740  case OP_PROP:
2741  return check_char_prop(next, previous[0], previous[1], FALSE);
2742
2743  case OP_NOTPROP:
2744  return check_char_prop(next, previous[0], previous[1], TRUE);
2745#endif
2746
2747  default:
2748  return FALSE;
2749  }
2750
2751
2752/* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
2753is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
2754generated only when PCRE_UCP is *not* set, that is, when only ASCII
2755characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
2756replaced by OP_PROP codes when PCRE_UCP is set. */
2757
2758switch(op_code)
2759  {
2760  case OP_CHAR:
2761  case OP_CHARNC:
2762#ifdef SUPPORT_UTF8
2763  GETCHARTEST(c, previous);
2764#else
2765  c = *previous;
2766#endif
2767  switch(-next)
2768    {
2769    case ESC_d:
2770    return c > 127 || (cd->ctypes[c] & ctype_digit) == 0;
2771
2772    case ESC_D:
2773    return c <= 127 && (cd->ctypes[c] & ctype_digit) != 0;
2774
2775    case ESC_s:
2776    return c > 127 || (cd->ctypes[c] & ctype_space) == 0;
2777
2778    case ESC_S:
2779    return c <= 127 && (cd->ctypes[c] & ctype_space) != 0;
2780
2781    case ESC_w:
2782    return c > 127 || (cd->ctypes[c] & ctype_word) == 0;
2783
2784    case ESC_W:
2785    return c <= 127 && (cd->ctypes[c] & ctype_word) != 0;
2786
2787    case ESC_h:
2788    case ESC_H:
2789    switch(c)
2790      {
2791      case 0x09:
2792      case 0x20:
2793      case 0xa0:
2794      case 0x1680:
2795      case 0x180e:
2796      case 0x2000:
2797      case 0x2001:
2798      case 0x2002:
2799      case 0x2003:
2800      case 0x2004:
2801      case 0x2005:
2802      case 0x2006:
2803      case 0x2007:
2804      case 0x2008:
2805      case 0x2009:
2806      case 0x200A:
2807      case 0x202f:
2808      case 0x205f:
2809      case 0x3000:
2810      return -next != ESC_h;
2811      default:
2812      return -next == ESC_h;
2813      }
2814
2815    case ESC_v:
2816    case ESC_V:
2817    switch(c)
2818      {
2819      case 0x0a:
2820      case 0x0b:
2821      case 0x0c:
2822      case 0x0d:
2823      case 0x85:
2824      case 0x2028:
2825      case 0x2029:
2826      return -next != ESC_v;
2827      default:
2828      return -next == ESC_v;
2829      }
2830
2831    /* When PCRE_UCP is set, these values get generated for \d etc. Find
2832    their substitutions and process them. The result will always be either
2833    -ESC_p or -ESC_P. Then fall through to process those values. */
2834
2835#ifdef SUPPORT_UCP
2836    case ESC_du:
2837    case ESC_DU:
2838    case ESC_wu:
2839    case ESC_WU:
2840    case ESC_su:
2841    case ESC_SU:
2842      {
2843      int temperrorcode = 0;
2844      ptr = substitutes[-next - ESC_DU];
2845      next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
2846      if (temperrorcode != 0) return FALSE;
2847      ptr++;    /* For compatibility */
2848      }
2849    /* Fall through */
2850
2851    case ESC_p:
2852    case ESC_P:
2853      {
2854      int ptype, pdata, errorcodeptr;
2855      BOOL negated;
2856
2857      ptr--;      /* Make ptr point at the p or P */
2858      ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
2859      if (ptype < 0) return FALSE;
2860      ptr++;      /* Point past the final curly ket */
2861
2862      /* If the property item is optional, we have to give up. (When generated
2863      from \d etc by PCRE_UCP, this test will have been applied much earlier,
2864      to the original \d etc. At this point, ptr will point to a zero byte.) */
2865
2866      if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2867        strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2868          return FALSE;
2869
2870      /* Do the property check. */
2871
2872      return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
2873      }
2874#endif
2875
2876    default:
2877    return FALSE;
2878    }
2879
2880  /* In principle, support for Unicode properties should be integrated here as
2881  well. It means re-organizing the above code so as to get hold of the property
2882  values before switching on the op-code. However, I wonder how many patterns
2883  combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
2884  these op-codes are never generated.) */
2885
2886  case OP_DIGIT:
2887  return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2888         next == -ESC_h || next == -ESC_v || next == -ESC_R;
2889
2890  case OP_NOT_DIGIT:
2891  return next == -ESC_d;
2892
2893  case OP_WHITESPACE:
2894  return next == -ESC_S || next == -ESC_d || next == -ESC_w || next == -ESC_R;
2895
2896  case OP_NOT_WHITESPACE:
2897  return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2898
2899  case OP_HSPACE:
2900  return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
2901         next == -ESC_w || next == -ESC_v || next == -ESC_R;
2902
2903  case OP_NOT_HSPACE:
2904  return next == -ESC_h;
2905
2906  /* Can't have \S in here because VT matches \S (Perl anomaly) */
2907  case OP_ANYNL:
2908  case OP_VSPACE:
2909  return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2910
2911  case OP_NOT_VSPACE:
2912  return next == -ESC_v || next == -ESC_R;
2913
2914  case OP_WORDCHAR:
2915  return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
2916         next == -ESC_v || next == -ESC_R;
2917
2918  case OP_NOT_WORDCHAR:
2919  return next == -ESC_w || next == -ESC_d;
2920
2921  default:
2922  return FALSE;
2923  }
2924
2925/* Control does not reach here */
2926}
2927
2928
2929
2930/*************************************************
2931*           Compile one branch                   *
2932*************************************************/
2933
2934/* Scan the pattern, compiling it into a vector. If the options are
2935changed during the branch, the pointer is used to change the external options
2936bits. This function is used during the pre-compile phase when we are trying
2937to find out the amount of memory needed, as well as during the real compile
2938phase. The value of lengthptr distinguishes the two phases.
2939
2940Arguments:
2941  optionsptr     pointer to the option bits
2942  codeptr        points to the pointer to the current code point
2943  ptrptr         points to the current pattern pointer
2944  errorcodeptr   points to error code variable
2945  firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2946  reqbyteptr     set to the last literal character required, else < 0
2947  bcptr          points to current branch chain
2948  cd             contains pointers to tables etc.
2949  lengthptr      NULL during the real compile phase
2950                 points to length accumulator during pre-compile phase
2951
2952Returns:         TRUE on success
2953                 FALSE, with *errorcodeptr set non-zero on error
2954*/
2955
2956static BOOL
2957compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2958  int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2959  compile_data *cd, int *lengthptr)
2960{
2961int repeat_type, op_type;
2962int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2963int bravalue = 0;
2964int greedy_default, greedy_non_default;
2965int firstbyte, reqbyte;
2966int zeroreqbyte, zerofirstbyte;
2967int req_caseopt, reqvary, tempreqvary;
2968int options = *optionsptr;
2969int after_manual_callout = 0;
2970int length_prevgroup = 0;
2971register int c;
2972register uschar *code = *codeptr;
2973uschar *last_code = code;
2974uschar *orig_code = code;
2975uschar *tempcode;
2976BOOL inescq = FALSE;
2977BOOL groupsetfirstbyte = FALSE;
2978const uschar *ptr = *ptrptr;
2979const uschar *tempptr;
2980const uschar *nestptr = NULL;
2981uschar *previous = NULL;
2982uschar *previous_callout = NULL;
2983uschar *save_hwm = NULL;
2984uschar classbits[32];
2985
2986#ifdef SUPPORT_UTF8
2987BOOL class_utf8;
2988BOOL utf8 = (options & PCRE_UTF8) != 0;
2989uschar *class_utf8data;
2990uschar *class_utf8data_base;
2991uschar utf8_char[6];
2992#else
2993BOOL utf8 = FALSE;
2994uschar *utf8_char = NULL;
2995#endif
2996
2997#ifdef PCRE_DEBUG
2998if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2999#endif
3000
3001/* Set up the default and non-default settings for greediness */
3002
3003greedy_default = ((options & PCRE_UNGREEDY) != 0);
3004greedy_non_default = greedy_default ^ 1;
3005
3006/* Initialize no first byte, no required byte. REQ_UNSET means "no char
3007matching encountered yet". It gets changed to REQ_NONE if we hit something that
3008matches a non-fixed char first char; reqbyte just remains unset if we never
3009find one.
3010
3011When we hit a repeat whose minimum is zero, we may have to adjust these values
3012to take the zero repeat into account. This is implemented by setting them to
3013zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
3014item types that can be repeated set these backoff variables appropriately. */
3015
3016firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
3017
3018/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3019according to the current setting of the caseless flag. REQ_CASELESS is a bit
3020value > 255. It is added into the firstbyte or reqbyte variables to record the
3021case status of the value. This is used only for ASCII characters. */
3022
3023req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3024
3025/* Switch on next character until the end of the branch */
3026
3027for (;; ptr++)
3028  {
3029  BOOL negate_class;
3030  BOOL should_flip_negation;
3031  BOOL possessive_quantifier;
3032  BOOL is_quantifier;
3033  BOOL is_recurse;
3034  BOOL reset_bracount;
3035  int class_charcount;
3036  int class_lastchar;
3037  int newoptions;
3038  int recno;
3039  int refsign;
3040  int skipbytes;
3041  int subreqbyte;
3042  int subfirstbyte;
3043  int terminator;
3044  int mclength;
3045  uschar mcbuffer[8];
3046
3047  /* Get next byte in the pattern */
3048
3049  c = *ptr;
3050
3051  /* If we are at the end of a nested substitution, revert to the outer level
3052  string. Nesting only happens one level deep. */
3053
3054  if (c == 0 && nestptr != NULL)
3055    {
3056    ptr = nestptr;
3057    nestptr = NULL;
3058    c = *ptr;
3059    }
3060
3061  /* If we are in the pre-compile phase, accumulate the length used for the
3062  previous cycle of this loop. */
3063
3064  if (lengthptr != NULL)
3065    {
3066#ifdef PCRE_DEBUG
3067    if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3068#endif
3069    if (code > cd->start_workspace + WORK_SIZE_CHECK)   /* Check for overrun */
3070      {
3071      *errorcodeptr = ERR52;
3072      goto FAILED;
3073      }
3074
3075    /* There is at least one situation where code goes backwards: this is the
3076    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3077    the class is simply eliminated. However, it is created first, so we have to
3078    allow memory for it. Therefore, don't ever reduce the length at this point.
3079    */
3080
3081    if (code < last_code) code = last_code;
3082
3083    /* Paranoid check for integer overflow */
3084
3085    if (OFLOW_MAX - *lengthptr < code - last_code)
3086      {
3087      *errorcodeptr = ERR20;
3088      goto FAILED;
3089      }
3090
3091    *lengthptr += (int)(code - last_code);
3092    DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
3093
3094    /* If "previous" is set and it is not at the start of the work space, move
3095    it back to there, in order to avoid filling up the work space. Otherwise,
3096    if "previous" is NULL, reset the current code pointer to the start. */
3097
3098    if (previous != NULL)
3099      {
3100      if (previous > orig_code)
3101        {
3102        memmove(orig_code, previous, code - previous);
3103        code -= previous - orig_code;
3104        previous = orig_code;
3105        }
3106      }
3107    else code = orig_code;
3108
3109    /* Remember where this code item starts so we can pick up the length
3110    next time round. */
3111
3112    last_code = code;
3113    }
3114
3115  /* In the real compile phase, just check the workspace used by the forward
3116  reference list. */
3117
3118  else if (cd->hwm > cd->start_workspace + WORK_SIZE_CHECK)
3119    {
3120    *errorcodeptr = ERR52;
3121    goto FAILED;
3122    }
3123
3124  /* If in \Q...\E, check for the end; if not, we have a literal */
3125
3126  if (inescq && c != 0)
3127    {
3128    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3129      {
3130      inescq = FALSE;
3131      ptr++;
3132      continue;
3133      }
3134    else
3135      {
3136      if (previous_callout != NULL)
3137        {
3138        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
3139          complete_callout(previous_callout, ptr, cd);
3140        previous_callout = NULL;
3141        }
3142      if ((options & PCRE_AUTO_CALLOUT) != 0)
3143        {
3144        previous_callout = code;
3145        code = auto_callout(code, ptr, cd);
3146        }
3147      goto NORMAL_CHAR;
3148      }
3149    }
3150
3151  /* Fill in length of a previous callout, except when the next thing is
3152  a quantifier. */
3153
3154  is_quantifier =
3155    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3156    (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3157
3158  if (!is_quantifier && previous_callout != NULL &&
3159       after_manual_callout-- <= 0)
3160    {
3161    if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
3162      complete_callout(previous_callout, ptr, cd);
3163    previous_callout = NULL;
3164    }
3165
3166  /* In extended mode, skip white space and comments */
3167
3168  if ((options & PCRE_EXTENDED) != 0)
3169    {
3170    if ((cd->ctypes[c] & ctype_space) != 0) continue;
3171    if (c == CHAR_NUMBER_SIGN)
3172      {
3173      ptr++;
3174      while (*ptr != 0)
3175        {
3176        if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3177        ptr++;
3178#ifdef SUPPORT_UTF8
3179        if (utf8) while ((*ptr & 0xc0) == 0x80) ptr++;
3180#endif
3181        }
3182      if (*ptr != 0) continue;
3183
3184      /* Else fall through to handle end of string */
3185      c = 0;
3186      }
3187    }
3188
3189  /* No auto callout for quantifiers. */
3190
3191  if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3192    {
3193    previous_callout = code;
3194    code = auto_callout(code, ptr, cd);
3195    }
3196
3197  switch(c)
3198    {
3199    /* ===================================================================*/
3200    case 0:                        /* The branch terminates at string end */
3201    case CHAR_VERTICAL_LINE:       /* or | or ) */
3202    case CHAR_RIGHT_PARENTHESIS:
3203    *firstbyteptr = firstbyte;
3204    *reqbyteptr = reqbyte;
3205    *codeptr = code;
3206    *ptrptr = ptr;
3207    if (lengthptr != NULL)
3208      {
3209      if (OFLOW_MAX - *lengthptr < code - last_code)
3210        {
3211        *errorcodeptr = ERR20;
3212        goto FAILED;
3213        }
3214      *lengthptr += (int)(code - last_code);   /* To include callout length */
3215      DPRINTF((">> end branch\n"));
3216      }
3217    return TRUE;
3218
3219
3220    /* ===================================================================*/
3221    /* Handle single-character metacharacters. In multiline mode, ^ disables
3222    the setting of any following char as a first character. */
3223
3224    case CHAR_CIRCUMFLEX_ACCENT:
3225    if ((options & PCRE_MULTILINE) != 0)
3226      {
3227      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3228      }
3229    previous = NULL;
3230    *code++ = OP_CIRC;
3231    break;
3232
3233    case CHAR_DOLLAR_SIGN:
3234    previous = NULL;
3235    *code++ = OP_DOLL;
3236    break;
3237
3238    /* There can never be a first char if '.' is first, whatever happens about
3239    repeats. The value of reqbyte doesn't change either. */
3240
3241    case CHAR_DOT:
3242    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3243    zerofirstbyte = firstbyte;
3244    zeroreqbyte = reqbyte;
3245    previous = code;
3246    *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3247    break;
3248
3249
3250    /* ===================================================================*/
3251    /* Character classes. If the included characters are all < 256, we build a
3252    32-byte bitmap of the permitted characters, except in the special case
3253    where there is only one such character. For negated classes, we build the
3254    map as usual, then invert it at the end. However, we use a different opcode
3255    so that data characters > 255 can be handled correctly.
3256
3257    If the class contains characters outside the 0-255 range, a different
3258    opcode is compiled. It may optionally have a bit map for characters < 256,
3259    but those above are explicitly listed afterwards. A flag byte tells
3260    whether the bitmap is present, and whether this is a negated class or not.
3261
3262    In JavaScript compatibility mode, an isolated ']' causes an error. In
3263    default (Perl) mode, it is treated as a data character. */
3264
3265    case CHAR_RIGHT_SQUARE_BRACKET:
3266    if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3267      {
3268      *errorcodeptr = ERR64;
3269      goto FAILED;
3270      }
3271    goto NORMAL_CHAR;
3272
3273    case CHAR_LEFT_SQUARE_BRACKET:
3274    previous = code;
3275
3276    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3277    they are encountered at the top level, so we'll do that too. */
3278
3279    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3280         ptr[1] == CHAR_EQUALS_SIGN) &&
3281        check_posix_syntax(ptr, &tempptr))
3282      {
3283      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3284      goto FAILED;
3285      }
3286
3287    /* If the first character is '^', set the negation flag and skip it. Also,
3288    if the first few characters (either before or after ^) are \Q\E or \E we
3289    skip them too. This makes for compatibility with Perl. */
3290
3291    negate_class = FALSE;
3292    for (;;)
3293      {
3294      c = *(++ptr);
3295      if (c == CHAR_BACKSLASH)
3296        {
3297        if (ptr[1] == CHAR_E)
3298          ptr++;
3299        else if (strncmp((const char *)ptr+1,
3300                          STR_Q STR_BACKSLASH STR_E, 3) == 0)
3301          ptr += 3;
3302        else
3303          break;
3304        }
3305      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3306        negate_class = TRUE;
3307      else break;
3308      }
3309
3310    /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3311    an initial ']' is taken as a data character -- the code below handles
3312    that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3313    [^] must match any character, so generate OP_ALLANY. */
3314
3315    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3316        (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3317      {
3318      *code++ = negate_class? OP_ALLANY : OP_FAIL;
3319      if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3320      zerofirstbyte = firstbyte;
3321      break;
3322      }
3323
3324    /* If a class contains a negative special such as \S, we need to flip the
3325    negation flag at the end, so that support for characters > 255 works
3326    correctly (they are all included in the class). */
3327
3328    should_flip_negation = FALSE;
3329
3330    /* Keep a count of chars with values < 256 so that we can optimize the case
3331    of just a single character (as long as it's < 256). However, for higher
3332    valued UTF-8 characters, we don't yet do any optimization. */
3333
3334    class_charcount = 0;
3335    class_lastchar = -1;
3336
3337    /* Initialize the 32-char bit map to all zeros. We build the map in a
3338    temporary bit of memory, in case the class contains only 1 character (less
3339    than 256), because in that case the compiled code doesn't use the bit map.
3340    */
3341
3342    memset(classbits, 0, 32 * sizeof(uschar));
3343
3344#ifdef SUPPORT_UTF8
3345    class_utf8 = FALSE;                       /* No chars >= 256 */
3346    class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
3347    class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
3348#endif
3349
3350    /* Process characters until ] is reached. By writing this as a "do" it
3351    means that an initial ] is taken as a data character. At the start of the
3352    loop, c contains the first byte of the character. */
3353
3354    if (c != 0) do
3355      {
3356      const uschar *oldptr;
3357
3358#ifdef SUPPORT_UTF8
3359      if (utf8 && c > 127)
3360        {                           /* Braces are required because the */
3361        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3362        }
3363
3364      /* In the pre-compile phase, accumulate the length of any UTF-8 extra
3365      data and reset the pointer. This is so that very large classes that
3366      contain a zillion UTF-8 characters no longer overwrite the work space
3367      (which is on the stack). */
3368
3369      if (lengthptr != NULL)
3370        {
3371        *lengthptr += class_utf8data - class_utf8data_base;
3372        class_utf8data = class_utf8data_base;
3373        }
3374
3375#endif
3376
3377      /* Inside \Q...\E everything is literal except \E */
3378
3379      if (inescq)
3380        {
3381        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3382          {
3383          inescq = FALSE;                   /* Reset literal state */
3384          ptr++;                            /* Skip the 'E' */
3385          continue;                         /* Carry on with next */
3386          }
3387        goto CHECK_RANGE;                   /* Could be range if \E follows */
3388        }
3389
3390      /* Handle POSIX class names. Perl allows a negation extension of the
3391      form [:^name:]. A square bracket that doesn't match the syntax is
3392      treated as a literal. We also recognize the POSIX constructions
3393      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3394      5.6 and 5.8 do. */
3395
3396      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3397          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3398           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3399        {
3400        BOOL local_negate = FALSE;
3401        int posix_class, taboffset, tabopt;
3402        register const uschar *cbits = cd->cbits;
3403        uschar pbits[32];
3404
3405        if (ptr[1] != CHAR_COLON)
3406          {
3407          *errorcodeptr = ERR31;
3408          goto FAILED;
3409          }
3410
3411        ptr += 2;
3412        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3413          {
3414          local_negate = TRUE;
3415          should_flip_negation = TRUE;  /* Note negative special */
3416          ptr++;
3417          }
3418
3419        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3420        if (posix_class < 0)
3421          {
3422          *errorcodeptr = ERR30;
3423          goto FAILED;
3424          }
3425
3426        /* If matching is caseless, upper and lower are converted to
3427        alpha. This relies on the fact that the class table starts with
3428        alpha, lower, upper as the first 3 entries. */
3429
3430        if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3431          posix_class = 0;
3432
3433        /* When PCRE_UCP is set, some of the POSIX classes are converted to
3434        different escape sequences that use Unicode properties. */
3435
3436#ifdef SUPPORT_UCP
3437        if ((options & PCRE_UCP) != 0)
3438          {
3439          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3440          if (posix_substitutes[pc] != NULL)
3441            {
3442            nestptr = tempptr + 1;
3443            ptr = posix_substitutes[pc] - 1;
3444            continue;
3445            }
3446          }
3447#endif
3448        /* In the non-UCP case, we build the bit map for the POSIX class in a
3449        chunk of local store because we may be adding and subtracting from it,
3450        and we don't want to subtract bits that may be in the main map already.
3451        At the end we or the result into the bit map that is being built. */
3452
3453        posix_class *= 3;
3454
3455        /* Copy in the first table (always present) */
3456
3457        memcpy(pbits, cbits + posix_class_maps[posix_class],
3458          32 * sizeof(uschar));
3459
3460        /* If there is a second table, add or remove it as required. */
3461
3462        taboffset = posix_class_maps[posix_class + 1];
3463        tabopt = posix_class_maps[posix_class + 2];
3464
3465        if (taboffset >= 0)
3466          {
3467          if (tabopt >= 0)
3468            for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3469          else
3470            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3471          }
3472
3473        /* Now see if we need to remove any special characters. An option
3474        value of 1 removes vertical space and 2 removes underscore. */
3475
3476        if (tabopt < 0) tabopt = -tabopt;
3477        if (tabopt == 1) pbits[1] &= ~0x3c;
3478          else if (tabopt == 2) pbits[11] &= 0x7f;
3479
3480        /* Add the POSIX table or its complement into the main table that is
3481        being built and we are done. */
3482
3483        if (local_negate)
3484          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3485        else
3486          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3487
3488        ptr = tempptr + 1;
3489        class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
3490        continue;    /* End of POSIX syntax handling */
3491        }
3492
3493      /* Backslash may introduce a single character, or it may introduce one
3494      of the specials, which just set a flag. The sequence \b is a special
3495      case. Inside a class (and only there) it is treated as backspace. We
3496      assume that other escapes have more than one character in them, so set
3497      class_charcount bigger than one. Unrecognized escapes fall through and
3498      are either treated as literal characters (by default), or are faulted if
3499      PCRE_EXTRA is set. */
3500
3501      if (c == CHAR_BACKSLASH)
3502        {
3503        c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3504        if (*errorcodeptr != 0) goto FAILED;
3505
3506        if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
3507        else if (-c == ESC_Q)            /* Handle start of quoted string */
3508          {
3509          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3510            {
3511            ptr += 2; /* avoid empty string */
3512            }
3513          else inescq = TRUE;
3514          continue;
3515          }
3516        else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3517
3518        if (c < 0)
3519          {
3520          register const uschar *cbits = cd->cbits;
3521          class_charcount += 2;     /* Greater than 1 is what matters */
3522
3523          switch (-c)
3524            {
3525#ifdef SUPPORT_UCP
3526            case ESC_du:     /* These are the values given for \d etc */
3527            case ESC_DU:     /* when PCRE_UCP is set. We replace the */
3528            case ESC_wu:     /* escape sequence with an appropriate \p */
3529            case ESC_WU:     /* or \P to test Unicode properties instead */
3530            case ESC_su:     /* of the default ASCII testing. */
3531            case ESC_SU:
3532            nestptr = ptr;
3533            ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
3534            class_charcount -= 2;                /* Undo! */
3535            continue;
3536#endif
3537            case ESC_d:
3538            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3539            continue;
3540
3541            case ESC_D:
3542            should_flip_negation = TRUE;
3543            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3544            continue;
3545
3546            case ESC_w:
3547            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3548            continue;
3549
3550            case ESC_W:
3551            should_flip_negation = TRUE;
3552            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3553            continue;
3554
3555            /* Perl 5.004 onwards omits VT from \s, but we must preserve it
3556            if it was previously set by something earlier in the character
3557            class. */
3558
3559            case ESC_s:
3560            classbits[0] |= cbits[cbit_space];
3561            classbits[1] |= cbits[cbit_space+1] & ~0x08;
3562            for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3563            continue;
3564
3565            case ESC_S:
3566            should_flip_negation = TRUE;
3567            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3568            classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3569            continue;
3570
3571            case ESC_h:
3572            SETBIT(classbits, 0x09); /* VT */
3573            SETBIT(classbits, 0x20); /* SPACE */
3574            SETBIT(classbits, 0xa0); /* NSBP */
3575#ifdef SUPPORT_UTF8
3576            if (utf8)
3577              {
3578              class_utf8 = TRUE;
3579              *class_utf8data++ = XCL_SINGLE;
3580              class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3581              *class_utf8data++ = XCL_SINGLE;
3582              class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3583              *class_utf8data++ = XCL_RANGE;
3584              class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3585              class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3586              *class_utf8data++ = XCL_SINGLE;
3587              class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3588              *class_utf8data++ = XCL_SINGLE;
3589              class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3590              *class_utf8data++ = XCL_SINGLE;
3591              class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3592              }
3593#endif
3594            continue;
3595
3596            case ESC_H:
3597            for (c = 0; c < 32; c++)
3598              {
3599              int x = 0xff;
3600              switch (c)
3601                {
3602                case 0x09/8: x ^= 1 << (0x09%8); break;
3603                case 0x20/8: x ^= 1 << (0x20%8); break;
3604                case 0xa0/8: x ^= 1 << (0xa0%8); break;
3605                default: break;
3606                }
3607              classbits[c] |= x;
3608              }
3609
3610#ifdef SUPPORT_UTF8
3611            if (utf8)
3612              {
3613              class_utf8 = TRUE;
3614              *class_utf8data++ = XCL_RANGE;
3615              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3616              class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3617              *class_utf8data++ = XCL_RANGE;
3618              class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3619              class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3620              *class_utf8data++ = XCL_RANGE;
3621              class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3622              class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3623              *class_utf8data++ = XCL_RANGE;
3624              class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3625              class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3626              *class_utf8data++ = XCL_RANGE;
3627              class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3628              class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3629              *class_utf8data++ = XCL_RANGE;
3630              class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3631              class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3632              *class_utf8data++ = XCL_RANGE;
3633              class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3634              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3635              }
3636#endif
3637            continue;
3638
3639            case ESC_v:
3640            SETBIT(classbits, 0x0a); /* LF */
3641            SETBIT(classbits, 0x0b); /* VT */
3642            SETBIT(classbits, 0x0c); /* FF */
3643            SETBIT(classbits, 0x0d); /* CR */
3644            SETBIT(classbits, 0x85); /* NEL */
3645#ifdef SUPPORT_UTF8
3646            if (utf8)
3647              {
3648              class_utf8 = TRUE;
3649              *class_utf8data++ = XCL_RANGE;
3650              class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3651              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3652              }
3653#endif
3654            continue;
3655
3656            case ESC_V:
3657            for (c = 0; c < 32; c++)
3658              {
3659              int x = 0xff;
3660              switch (c)
3661                {
3662                case 0x0a/8: x ^= 1 << (0x0a%8);
3663                             x ^= 1 << (0x0b%8);
3664                             x ^= 1 << (0x0c%8);
3665                             x ^= 1 << (0x0d%8);
3666                             break;
3667                case 0x85/8: x ^= 1 << (0x85%8); break;
3668                default: break;
3669                }
3670              classbits[c] |= x;
3671              }
3672
3673#ifdef SUPPORT_UTF8
3674            if (utf8)
3675              {
3676              class_utf8 = TRUE;
3677              *class_utf8data++ = XCL_RANGE;
3678              class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3679              class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3680              *class_utf8data++ = XCL_RANGE;
3681              class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3682              class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3683              }
3684#endif
3685            continue;
3686
3687#ifdef SUPPORT_UCP
3688            case ESC_p:
3689            case ESC_P:
3690              {
3691              BOOL negated;
3692              int pdata;
3693              int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3694              if (ptype < 0) goto FAILED;
3695              class_utf8 = TRUE;
3696              *class_utf8data++ = ((-c == ESC_p) != negated)?
3697                XCL_PROP : XCL_NOTPROP;
3698              *class_utf8data++ = ptype;
3699              *class_utf8data++ = pdata;
3700              class_charcount -= 2;   /* Not a < 256 character */
3701              continue;
3702              }
3703#endif
3704            /* Unrecognized escapes are faulted if PCRE is running in its
3705            strict mode. By default, for compatibility with Perl, they are
3706            treated as literals. */
3707
3708            default:
3709            if ((options & PCRE_EXTRA) != 0)
3710              {
3711              *errorcodeptr = ERR7;
3712              goto FAILED;
3713              }
3714            class_charcount -= 2;  /* Undo the default count from above */
3715            c = *ptr;              /* Get the final character and fall through */
3716            break;
3717            }
3718          }
3719
3720        /* Fall through if we have a single character (c >= 0). This may be
3721        greater than 256 in UTF-8 mode. */
3722
3723        }   /* End of backslash handling */
3724
3725      /* A single character may be followed by '-' to form a range. However,
3726      Perl does not permit ']' to be the end of the range. A '-' character
3727      at the end is treated as a literal. Perl ignores orphaned \E sequences
3728      entirely. The code for handling \Q and \E is messy. */
3729
3730      CHECK_RANGE:
3731      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3732        {
3733        inescq = FALSE;
3734        ptr += 2;
3735        }
3736
3737      oldptr = ptr;
3738
3739      /* Remember \r or \n */
3740
3741      if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3742
3743      /* Check for range */
3744
3745      if (!inescq && ptr[1] == CHAR_MINUS)
3746        {
3747        int d;
3748        ptr += 2;
3749        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3750
3751        /* If we hit \Q (not followed by \E) at this point, go into escaped
3752        mode. */
3753
3754        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3755          {
3756          ptr += 2;
3757          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3758            { ptr += 2; continue; }
3759          inescq = TRUE;
3760          break;
3761          }
3762
3763        if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3764          {
3765          ptr = oldptr;
3766          goto LONE_SINGLE_CHARACTER;
3767          }
3768
3769#ifdef SUPPORT_UTF8
3770        if (utf8)
3771          {                           /* Braces are required because the */
3772          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3773          }
3774        else
3775#endif
3776        d = *ptr;  /* Not UTF-8 mode */
3777
3778        /* The second part of a range can be a single-character escape, but
3779        not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3780        in such circumstances. */
3781
3782        if (!inescq && d == CHAR_BACKSLASH)
3783          {
3784          d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3785          if (*errorcodeptr != 0) goto FAILED;
3786
3787          /* \b is backspace; any other special means the '-' was literal */
3788
3789          if (d < 0)
3790            {
3791            if (d == -ESC_b) d = CHAR_BS; else
3792              {
3793              ptr = oldptr;
3794              goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3795              }
3796            }
3797          }
3798
3799        /* Check that the two values are in the correct order. Optimize
3800        one-character ranges */
3801
3802        if (d < c)
3803          {
3804          *errorcodeptr = ERR8;
3805          goto FAILED;
3806          }
3807
3808        if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3809
3810        /* Remember \r or \n */
3811
3812        if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3813
3814        /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3815        matching, we have to use an XCLASS with extra data items. Caseless
3816        matching for characters > 127 is available only if UCP support is
3817        available. */
3818
3819#ifdef SUPPORT_UTF8
3820        if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3821          {
3822          class_utf8 = TRUE;
3823
3824          /* With UCP support, we can find the other case equivalents of
3825          the relevant characters. There may be several ranges. Optimize how
3826          they fit with the basic range. */
3827
3828#ifdef SUPPORT_UCP
3829          if ((options & PCRE_CASELESS) != 0)
3830            {
3831            unsigned int occ, ocd;
3832            unsigned int cc = c;
3833            unsigned int origd = d;
3834            while (get_othercase_range(&cc, origd, &occ, &ocd))
3835              {
3836              if (occ >= (unsigned int)c &&
3837                  ocd <= (unsigned int)d)
3838                continue;                          /* Skip embedded ranges */
3839
3840              if (occ < (unsigned int)c  &&
3841                  ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3842                {                                  /* if there is overlap,   */
3843                c = occ;                           /* noting that if occ < c */
3844                continue;                          /* we can't have ocd > d  */
3845                }                                  /* because a subrange is  */
3846              if (ocd > (unsigned int)d &&
3847                  occ <= (unsigned int)d + 1)      /* always shorter than    */
3848                {                                  /* the basic range.       */
3849                d = ocd;
3850                continue;
3851                }
3852
3853              if (occ == ocd)
3854                {
3855                *class_utf8data++ = XCL_SINGLE;
3856                }
3857              else
3858                {
3859                *class_utf8data++ = XCL_RANGE;
3860                class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3861                }
3862              class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3863              }
3864            }
3865#endif  /* SUPPORT_UCP */
3866
3867          /* Now record the original range, possibly modified for UCP caseless
3868          overlapping ranges. */
3869
3870          *class_utf8data++ = XCL_RANGE;
3871          class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3872          class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3873
3874          /* With UCP support, we are done. Without UCP support, there is no
3875          caseless matching for UTF-8 characters > 127; we can use the bit map
3876          for the smaller ones. */
3877
3878#ifdef SUPPORT_UCP
3879          continue;    /* With next character in the class */
3880#else
3881          if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3882
3883          /* Adjust upper limit and fall through to set up the map */
3884
3885          d = 127;
3886
3887#endif  /* SUPPORT_UCP */
3888          }
3889#endif  /* SUPPORT_UTF8 */
3890
3891        /* We use the bit map for all cases when not in UTF-8 mode; else
3892        ranges that lie entirely within 0-127 when there is UCP support; else
3893        for partial ranges without UCP support. */
3894
3895        class_charcount += d - c + 1;
3896        class_lastchar = d;
3897
3898        /* We can save a bit of time by skipping this in the pre-compile. */
3899
3900        if (lengthptr == NULL) for (; c <= d; c++)
3901          {
3902          classbits[c/8] |= (1 << (c&7));
3903          if ((options & PCRE_CASELESS) != 0)
3904            {
3905            int uc = cd->fcc[c];           /* flip case */
3906            classbits[uc/8] |= (1 << (uc&7));
3907            }
3908          }
3909
3910        continue;   /* Go get the next char in the class */
3911        }
3912
3913      /* Handle a lone single character - we can get here for a normal
3914      non-escape char, or after \ that introduces a single character or for an
3915      apparent range that isn't. */
3916
3917      LONE_SINGLE_CHARACTER:
3918
3919      /* Handle a character that cannot go in the bit map */
3920
3921#ifdef SUPPORT_UTF8
3922      if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3923        {
3924        class_utf8 = TRUE;
3925        *class_utf8data++ = XCL_SINGLE;
3926        class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3927
3928#ifdef SUPPORT_UCP
3929        if ((options & PCRE_CASELESS) != 0)
3930          {
3931          unsigned int othercase;
3932          if ((othercase = UCD_OTHERCASE(c)) != c)
3933            {
3934            *class_utf8data++ = XCL_SINGLE;
3935            class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3936            }
3937          }
3938#endif  /* SUPPORT_UCP */
3939
3940        }
3941      else
3942#endif  /* SUPPORT_UTF8 */
3943
3944      /* Handle a single-byte character */
3945        {
3946        classbits[c/8] |= (1 << (c&7));
3947        if ((options & PCRE_CASELESS) != 0)
3948          {
3949          c = cd->fcc[c];   /* flip case */
3950          classbits[c/8] |= (1 << (c&7));
3951          }
3952        class_charcount++;
3953        class_lastchar = c;
3954        }
3955      }
3956
3957    /* Loop until ']' reached. This "while" is the end of the "do" far above.
3958    If we are at the end of an internal nested string, revert to the outer
3959    string. */
3960
3961    while (((c = *(++ptr)) != 0 ||
3962           (nestptr != NULL &&
3963             (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
3964           (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3965
3966    /* Check for missing terminating ']' */
3967
3968    if (c == 0)
3969      {
3970      *errorcodeptr = ERR6;
3971      goto FAILED;
3972      }
3973
3974    /* If class_charcount is 1, we saw precisely one character whose value is
3975    less than 256. As long as there were no characters >= 128 and there was no
3976    use of \p or \P, in other words, no use of any XCLASS features, we can
3977    optimize.
3978
3979    In UTF-8 mode, we can optimize the negative case only if there were no
3980    characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3981    operate on single-bytes only. This is an historical hangover. Maybe one day
3982    we can tidy these opcodes to handle multi-byte characters.
3983
3984    The optimization throws away the bit map. We turn the item into a
3985    1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3986    that OP_NOT does not support multibyte characters. In the positive case, it
3987    can cause firstbyte to be set. Otherwise, there can be no first char if
3988    this item is first, whatever repeat count may follow. In the case of
3989    reqbyte, save the previous value for reinstating. */
3990
3991#ifdef SUPPORT_UTF8
3992    if (class_charcount == 1 && !class_utf8 &&
3993      (!utf8 || !negate_class || class_lastchar < 128))
3994#else
3995    if (class_charcount == 1)
3996#endif
3997      {
3998      zeroreqbyte = reqbyte;
3999
4000      /* The OP_NOT opcode works on one-byte characters only. */
4001
4002      if (negate_class)
4003        {
4004        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4005        zerofirstbyte = firstbyte;
4006        *code++ = OP_NOT;
4007        *code++ = class_lastchar;
4008        break;
4009        }
4010
4011      /* For a single, positive character, get the value into mcbuffer, and
4012      then we can handle this with the normal one-character code. */
4013
4014#ifdef SUPPORT_UTF8
4015      if (utf8 && class_lastchar > 127)
4016        mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
4017      else
4018#endif
4019        {
4020        mcbuffer[0] = class_lastchar;
4021        mclength = 1;
4022        }
4023      goto ONE_CHAR;
4024      }       /* End of 1-char optimization */
4025
4026    /* The general case - not the one-char optimization. If this is the first
4027    thing in the branch, there can be no first char setting, whatever the
4028    repeat count. Any reqbyte setting must remain unchanged after any kind of
4029    repeat. */
4030
4031    if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4032    zerofirstbyte = firstbyte;
4033    zeroreqbyte = reqbyte;
4034
4035    /* If there are characters with values > 255, we have to compile an
4036    extended class, with its own opcode, unless there was a negated special
4037    such as \S in the class, and PCRE_UCP is not set, because in that case all
4038    characters > 255 are in the class, so any that were explicitly given as
4039    well can be ignored. If (when there are explicit characters > 255 that must
4040    be listed) there are no characters < 256, we can omit the bitmap in the
4041    actual compiled code. */
4042
4043#ifdef SUPPORT_UTF8
4044    if (class_utf8 && (!should_flip_negation || (options & PCRE_UCP) != 0))
4045      {
4046      *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
4047      *code++ = OP_XCLASS;
4048      code += LINK_SIZE;
4049      *code = negate_class? XCL_NOT : 0;
4050
4051      /* If the map is required, move up the extra data to make room for it;
4052      otherwise just move the code pointer to the end of the extra data. */
4053
4054      if (class_charcount > 0)
4055        {
4056        *code++ |= XCL_MAP;
4057        memmove(code + 32, code, class_utf8data - code);
4058        memcpy(code, classbits, 32);
4059        code = class_utf8data + 32;
4060        }
4061      else code = class_utf8data;
4062
4063      /* Now fill in the complete length of the item */
4064
4065      PUT(previous, 1, code - previous);
4066      break;   /* End of class handling */
4067      }
4068#endif
4069
4070    /* If there are no characters > 255, or they are all to be included or
4071    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4072    whole class was negated and whether there were negative specials such as \S
4073    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4074    negating it if necessary. */
4075
4076    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4077    if (negate_class)
4078      {
4079      if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4080        for (c = 0; c < 32; c++) code[c] = ~classbits[c];
4081      }
4082    else
4083      {
4084      memcpy(code, classbits, 32);
4085      }
4086    code += 32;
4087    break;
4088
4089
4090    /* ===================================================================*/
4091    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4092    has been tested above. */
4093
4094    case CHAR_LEFT_CURLY_BRACKET:
4095    if (!is_quantifier) goto NORMAL_CHAR;
4096    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4097    if (*errorcodeptr != 0) goto FAILED;
4098    goto REPEAT;
4099
4100    case CHAR_ASTERISK:
4101    repeat_min = 0;
4102    repeat_max = -1;
4103    goto REPEAT;
4104
4105    case CHAR_PLUS:
4106    repeat_min = 1;
4107    repeat_max = -1;
4108    goto REPEAT;
4109
4110    case CHAR_QUESTION_MARK:
4111    repeat_min = 0;
4112    repeat_max = 1;
4113
4114    REPEAT:
4115    if (previous == NULL)
4116      {
4117      *errorcodeptr = ERR9;
4118      goto FAILED;
4119      }
4120
4121    if (repeat_min == 0)
4122      {
4123      firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
4124      reqbyte = zeroreqbyte;        /* Ditto */
4125      }
4126
4127    /* Remember whether this is a variable length repeat */
4128
4129    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4130
4131    op_type = 0;                    /* Default single-char op codes */
4132    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4133
4134    /* Save start of previous item, in case we have to move it up to make space
4135    for an inserted OP_ONCE for the additional '+' extension. */
4136
4137    tempcode = previous;
4138
4139    /* If the next character is '+', we have a possessive quantifier. This
4140    implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4141    If the next character is '?' this is a minimizing repeat, by default,
4142    but if PCRE_UNGREEDY is set, it works the other way round. We change the
4143    repeat type to the non-default. */
4144
4145    if (ptr[1] == CHAR_PLUS)
4146      {
4147      repeat_type = 0;                  /* Force greedy */
4148      possessive_quantifier = TRUE;
4149      ptr++;
4150      }
4151    else if (ptr[1] == CHAR_QUESTION_MARK)
4152      {
4153      repeat_type = greedy_non_default;
4154      ptr++;
4155      }
4156    else repeat_type = greedy_default;
4157
    /* If previous was a character match, abolish the item and generate a
    repeat item instead. If a char item has a minimum of more than one, ensure
    that it is set in reqbyte - it might not be if a sequence such as x{3} is
    the first thing in a branch because the x will have gone into firstbyte
    instead.  */
4163
4164    if (*previous == OP_CHAR || *previous == OP_CHARNC)
4165      {
4166      /* Deal with UTF-8 characters that take up more than one byte. It's
4167      easier to write this out separately than try to macrify it. Use c to
4168      hold the length of the character in bytes, plus 0x80 to flag that it's a
4169      length rather than a small character. */
4170
4171#ifdef SUPPORT_UTF8
4172      if (utf8 && (code[-1] & 0x80) != 0)
4173        {
4174        uschar *lastchar = code - 1;
4175        while((*lastchar & 0xc0) == 0x80) lastchar--;
4176        c = code - lastchar;            /* Length of UTF-8 character */
4177        memcpy(utf8_char, lastchar, c); /* Save the char */
4178        c |= 0x80;                      /* Flag c as a length */
4179        }
4180      else
4181#endif
4182
4183      /* Handle the case of a single byte - either with no UTF8 support, or
4184      with UTF-8 disabled, or for a UTF-8 character < 128. */
4185
4186        {
4187        c = code[-1];
4188        if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
4189        }
4190
4191      /* If the repetition is unlimited, it pays to see if the next thing on
4192      the line is something that cannot possibly match this character. If so,
4193      automatically possessifying this item gains some performance in the case
4194      where the match fails. */
4195
4196      if (!possessive_quantifier &&
4197          repeat_max < 0 &&
4198          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4199        {
4200        repeat_type = 0;    /* Force greedy */
4201        possessive_quantifier = TRUE;
4202        }
4203
4204      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4205      }
4206
    /* If previous was a single negated character ([^a] or similar), we use
    one of the special opcodes, replacing it. The code is shared with single-
    character repeats by setting op_type to add a suitable offset into
    repeat_type. We can also test for auto-possessification. OP_NOT is
    currently used only for single-byte chars. */
4212
4213    else if (*previous == OP_NOT)
4214      {
4215      op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
4216      c = previous[1];
4217      if (!possessive_quantifier &&
4218          repeat_max < 0 &&
4219          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4220        {
4221        repeat_type = 0;    /* Force greedy */
4222        possessive_quantifier = TRUE;
4223        }
4224      goto OUTPUT_SINGLE_REPEAT;
4225      }
4226
    /* If previous was a character type match (\d or similar), abolish it and
    create a suitable repeat item. The code is shared with single-character
    repeats by setting op_type to add a suitable offset into repeat_type. Note
    that the Unicode property types will be present only when SUPPORT_UCP is
    defined, but we don't wrap the little bits of code here because it just
    makes it horribly messy. */
4233
4234    else if (*previous < OP_EODN)
4235      {
4236      uschar *oldcode;
4237      int prop_type, prop_value;
4238      op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4239      c = *previous;
4240
4241      if (!possessive_quantifier &&
4242          repeat_max < 0 &&
4243          check_auto_possessive(previous, utf8, ptr + 1, options, cd))
4244        {
4245        repeat_type = 0;    /* Force greedy */
4246        possessive_quantifier = TRUE;
4247        }
4248
4249      OUTPUT_SINGLE_REPEAT:
4250      if (*previous == OP_PROP || *previous == OP_NOTPROP)
4251        {
4252        prop_type = previous[1];
4253        prop_value = previous[2];
4254        }
4255      else prop_type = prop_value = -1;
4256
4257      oldcode = code;
4258      code = previous;                  /* Usually overwrite previous item */
4259
4260      /* If the maximum is zero then the minimum must also be zero; Perl allows
4261      this case, so we do too - by simply omitting the item altogether. */
4262
4263      if (repeat_max == 0) goto END_REPEAT;
4264
4265      /*--------------------------------------------------------------------*/
4266      /* This code is obsolete from release 8.00; the restriction was finally
4267      removed: */
4268
4269      /* All real repeats make it impossible to handle partial matching (maybe
4270      one day we will be able to remove this restriction). */
4271
4272      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4273      /*--------------------------------------------------------------------*/
4274
4275      /* Combine the op_type with the repeat_type */
4276
4277      repeat_type += op_type;
4278
4279      /* A minimum of zero is handled either as the special case * or ?, or as
4280      an UPTO, with the maximum given. */
4281
4282      if (repeat_min == 0)
4283        {
4284        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4285          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4286        else
4287          {
4288          *code++ = OP_UPTO + repeat_type;
4289          PUT2INC(code, 0, repeat_max);
4290          }
4291        }
4292
4293      /* A repeat minimum of 1 is optimized into some special cases. If the
4294      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4295      left in place and, if the maximum is greater than 1, we use OP_UPTO with
4296      one less than the maximum. */
4297
4298      else if (repeat_min == 1)
4299        {
4300        if (repeat_max == -1)
4301          *code++ = OP_PLUS + repeat_type;
4302        else
4303          {
4304          code = oldcode;                 /* leave previous item in place */
4305          if (repeat_max == 1) goto END_REPEAT;
4306          *code++ = OP_UPTO + repeat_type;
4307          PUT2INC(code, 0, repeat_max - 1);
4308          }
4309        }
4310
4311      /* The case {n,n} is just an EXACT, while the general case {n,m} is
4312      handled as an EXACT followed by an UPTO. */
4313
4314      else
4315        {
4316        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4317        PUT2INC(code, 0, repeat_min);
4318
4319        /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4320        we have to insert the character for the previous code. For a repeated
4321        Unicode property match, there are two extra bytes that define the
4322        required property. In UTF-8 mode, long characters have their length in
4323        c, with the 0x80 bit as a flag. */
4324
4325        if (repeat_max < 0)
4326          {
4327#ifdef SUPPORT_UTF8
4328          if (utf8 && c >= 128)
4329            {
4330            memcpy(code, utf8_char, c & 7);
4331            code += c & 7;
4332            }
4333          else
4334#endif
4335            {
4336            *code++ = c;
4337            if (prop_type >= 0)
4338              {
4339              *code++ = prop_type;
4340              *code++ = prop_value;
4341              }
4342            }
4343          *code++ = OP_STAR + repeat_type;
4344          }
4345
4346        /* Else insert an UPTO if the max is greater than the min, again
4347        preceded by the character, for the previously inserted code. If the
4348        UPTO is just for 1 instance, we can use QUERY instead. */
4349
4350        else if (repeat_max != repeat_min)
4351          {
4352#ifdef SUPPORT_UTF8
4353          if (utf8 && c >= 128)
4354            {
4355            memcpy(code, utf8_char, c & 7);
4356            code += c & 7;
4357            }
4358          else
4359#endif
4360          *code++ = c;
4361          if (prop_type >= 0)
4362            {
4363            *code++ = prop_type;
4364            *code++ = prop_value;
4365            }
4366          repeat_max -= repeat_min;
4367
4368          if (repeat_max == 1)
4369            {
4370            *code++ = OP_QUERY + repeat_type;
4371            }
4372          else
4373            {
4374            *code++ = OP_UPTO + repeat_type;
4375            PUT2INC(code, 0, repeat_max);
4376            }
4377          }
4378        }
4379
4380      /* The character or character type itself comes last in all cases. */
4381
4382#ifdef SUPPORT_UTF8
4383      if (utf8 && c >= 128)
4384        {
4385        memcpy(code, utf8_char, c & 7);
4386        code += c & 7;
4387        }
4388      else
4389#endif
4390      *code++ = c;
4391
4392      /* For a repeated Unicode property match, there are two extra bytes that
4393      define the required property. */
4394
4395#ifdef SUPPORT_UCP
4396      if (prop_type >= 0)
4397        {
4398        *code++ = prop_type;
4399        *code++ = prop_value;
4400        }
4401#endif
4402      }
4403
4404    /* If previous was a character class or a back reference, we put the repeat
4405    stuff after it, but just skip the item if the repeat was {0,0}. */
4406
4407    else if (*previous == OP_CLASS ||
4408             *previous == OP_NCLASS ||
4409#ifdef SUPPORT_UTF8
4410             *previous == OP_XCLASS ||
4411#endif
4412             *previous == OP_REF)
4413      {
4414      if (repeat_max == 0)
4415        {
4416        code = previous;
4417        goto END_REPEAT;
4418        }
4419
4420      /*--------------------------------------------------------------------*/
4421      /* This code is obsolete from release 8.00; the restriction was finally
4422      removed: */
4423
4424      /* All real repeats make it impossible to handle partial matching (maybe
4425      one day we will be able to remove this restriction). */
4426
4427      /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4428      /*--------------------------------------------------------------------*/
4429
4430      if (repeat_min == 0 && repeat_max == -1)
4431        *code++ = OP_CRSTAR + repeat_type;
4432      else if (repeat_min == 1 && repeat_max == -1)
4433        *code++ = OP_CRPLUS + repeat_type;
4434      else if (repeat_min == 0 && repeat_max == 1)
4435        *code++ = OP_CRQUERY + repeat_type;
4436      else
4437        {
4438        *code++ = OP_CRRANGE + repeat_type;
4439        PUT2INC(code, 0, repeat_min);
4440        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
4441        PUT2INC(code, 0, repeat_max);
4442        }
4443      }
4444
4445    /* If previous was a bracket group, we may have to replicate it in certain
4446    cases. */
4447
4448    else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4449             *previous == OP_ONCE || *previous == OP_COND)
4450      {
4451      register int i;
4452      int ketoffset = 0;
4453      int len = (int)(code - previous);
4454      uschar *bralink = NULL;
4455
4456      /* Repeating a DEFINE group is pointless */
4457
4458      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4459        {
4460        *errorcodeptr = ERR55;
4461        goto FAILED;
4462        }
4463
4464      /* If the maximum repeat count is unlimited, find the end of the bracket
4465      by scanning through from the start, and compute the offset back to it
4466      from the current code pointer. There may be an OP_OPT setting following
4467      the final KET, so we can't find the end just by going back from the code
4468      pointer. */
4469
4470      if (repeat_max == -1)
4471        {
4472        register uschar *ket = previous;
4473        do ket += GET(ket, 1); while (*ket != OP_KET);
4474        ketoffset = (int)(code - ket);
4475        }
4476
4477      /* The case of a zero minimum is special because of the need to stick
4478      OP_BRAZERO in front of it, and because the group appears once in the
4479      data, whereas in other cases it appears the minimum number of times. For
4480      this reason, it is simplest to treat this case separately, as otherwise
4481      the code gets far too messy. There are several special subcases when the
4482      minimum is zero. */
4483
4484      if (repeat_min == 0)
4485        {
4486        /* If the maximum is also zero, we used to just omit the group from the
4487        output altogether, like this:
4488
4489        ** if (repeat_max == 0)
4490        **   {
4491        **   code = previous;
4492        **   goto END_REPEAT;
4493        **   }
4494
4495        However, that fails when a group is referenced as a subroutine from
4496        elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4497        so that it is skipped on execution. As we don't have a list of which
4498        groups are referenced, we cannot do this selectively.
4499
4500        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4501        and do no more at this point. However, we do need to adjust any
4502        OP_RECURSE calls inside the group that refer to the group itself or any
4503        internal or forward referenced group, because the offset is from the
4504        start of the whole regex. Temporarily terminate the pattern while doing
4505        this. */
4506
4507        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4508          {
4509          *code = OP_END;
4510          adjust_recurse(previous, 1, utf8, cd, save_hwm);
4511          memmove(previous+1, previous, len);
4512          code++;
4513          if (repeat_max == 0)
4514            {
4515            *previous++ = OP_SKIPZERO;
4516            goto END_REPEAT;
4517            }
4518          *previous++ = OP_BRAZERO + repeat_type;
4519          }
4520
4521        /* If the maximum is greater than 1 and limited, we have to replicate
4522        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4523        The first one has to be handled carefully because it's the original
4524        copy, which has to be moved up. The remainder can be handled by code
4525        that is common with the non-zero minimum case below. We have to
4526        adjust the value of repeat_max, since one less copy is required. Once
4527        again, we may have to adjust any OP_RECURSE calls inside the group. */
4528
4529        else
4530          {
4531          int offset;
4532          *code = OP_END;
4533          adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4534          memmove(previous + 2 + LINK_SIZE, previous, len);
4535          code += 2 + LINK_SIZE;
4536          *previous++ = OP_BRAZERO + repeat_type;
4537          *previous++ = OP_BRA;
4538
4539          /* We chain together the bracket offset fields that have to be
4540          filled in later when the ends of the brackets are reached. */
4541
4542          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
4543          bralink = previous;
4544          PUTINC(previous, 0, offset);
4545          }
4546
4547        repeat_max--;
4548        }
4549
4550      /* If the minimum is greater than zero, replicate the group as many
4551      times as necessary, and adjust the maximum to the number of subsequent
4552      copies that we need. If we set a first char from the group, and didn't
4553      set a required char, copy the latter from the former. If there are any
4554      forward reference subroutine calls in the group, there will be entries on
4555      the workspace list; replicate these with an appropriate increment. */
4556
4557      else
4558        {
4559        if (repeat_min > 1)
4560          {
4561          /* In the pre-compile phase, we don't actually do the replication. We
4562          just adjust the length as if we had. Do some paranoid checks for
4563          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
4564          integer type when available, otherwise double. */
4565
4566          if (lengthptr != NULL)
4567            {
4568            int delta = (repeat_min - 1)*length_prevgroup;
4569            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
4570                  (INT64_OR_DOUBLE)length_prevgroup >
4571                    (INT64_OR_DOUBLE)INT_MAX ||
4572                OFLOW_MAX - *lengthptr < delta)
4573              {
4574              *errorcodeptr = ERR20;
4575              goto FAILED;
4576              }
4577            *lengthptr += delta;
4578            }
4579
4580          /* This is compiling for real */
4581
4582          else
4583            {
4584            if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4585            for (i = 1; i < repeat_min; i++)
4586              {
4587              uschar *hc;
4588              uschar *this_hwm = cd->hwm;
4589              memcpy(code, previous, len);
4590              for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4591                {
4592                PUT(cd->hwm, 0, GET(hc, 0) + len);
4593                cd->hwm += LINK_SIZE;
4594                }
4595              save_hwm = this_hwm;
4596              code += len;
4597              }
4598            }
4599          }
4600
4601        if (repeat_max > 0) repeat_max -= repeat_min;
4602        }
4603
4604      /* This code is common to both the zero and non-zero minimum cases. If
4605      the maximum is limited, it replicates the group in a nested fashion,
4606      remembering the bracket starts on a stack. In the case of a zero minimum,
4607      the first one was set up above. In all cases the repeat_max now specifies
4608      the number of additional copies needed. Again, we must remember to
4609      replicate entries on the forward reference list. */
4610
4611      if (repeat_max >= 0)
4612        {
4613        /* In the pre-compile phase, we don't actually do the replication. We
4614        just adjust the length as if we had. For each repetition we must add 1
4615        to the length for BRAZERO and for all but the last repetition we must
4616        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4617        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
4618        a 64-bit integer type when available, otherwise double. */
4619
4620        if (lengthptr != NULL && repeat_max > 0)
4621          {
4622          int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4623                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4624          if ((INT64_OR_DOUBLE)repeat_max *
4625                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4626                  > (INT64_OR_DOUBLE)INT_MAX ||
4627              OFLOW_MAX - *lengthptr < delta)
4628            {
4629            *errorcodeptr = ERR20;
4630            goto FAILED;
4631            }
4632          *lengthptr += delta;
4633          }
4634
4635        /* This is compiling for real */
4636
4637        else for (i = repeat_max - 1; i >= 0; i--)
4638          {
4639          uschar *hc;
4640          uschar *this_hwm = cd->hwm;
4641
4642          *code++ = OP_BRAZERO + repeat_type;
4643
4644          /* All but the final copy start a new nesting, maintaining the
4645          chain of brackets outstanding. */
4646
4647          if (i != 0)
4648            {
4649            int offset;
4650            *code++ = OP_BRA;
4651            offset = (bralink == NULL)? 0 : (int)(code - bralink);
4652            bralink = code;
4653            PUTINC(code, 0, offset);
4654            }
4655
4656          memcpy(code, previous, len);
4657          for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4658            {
4659            PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4660            cd->hwm += LINK_SIZE;
4661            }
4662          save_hwm = this_hwm;
4663          code += len;
4664          }
4665
4666        /* Now chain through the pending brackets, and fill in their length
4667        fields (which are holding the chain links pro tem). */
4668
4669        while (bralink != NULL)
4670          {
4671          int oldlinkoffset;
4672          int offset = (int)(code - bralink + 1);
4673          uschar *bra = code - offset;
4674          oldlinkoffset = GET(bra, 1);
4675          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4676          *code++ = OP_KET;
4677          PUTINC(code, 0, offset);
4678          PUT(bra, 1, offset);
4679          }
4680        }
4681
4682      /* If the maximum is unlimited, set a repeater in the final copy. We
4683      can't just offset backwards from the current code point, because we
4684      don't know if there's been an options resetting after the ket. The
4685      correct offset was computed above.
4686
4687      Then, when we are doing the actual compile phase, check to see whether
4688      this group is a non-atomic one that could match an empty string. If so,
4689      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4690      that runtime checking can be done. [This check is also applied to
4691      atomic groups at runtime, but in a different way.] */
4692
4693      else
4694        {
4695        uschar *ketcode = code - ketoffset;
4696        uschar *bracode = ketcode - GET(ketcode, 1);
4697        *ketcode = OP_KETRMAX + repeat_type;
4698        if (lengthptr == NULL && *bracode != OP_ONCE)
4699          {
4700          uschar *scode = bracode;
4701          do
4702            {
4703            if (could_be_empty_branch(scode, ketcode, utf8, cd))
4704              {
4705              *bracode += OP_SBRA - OP_BRA;
4706              break;
4707              }
4708            scode += GET(scode, 1);
4709            }
4710          while (*scode == OP_ALT);
4711          }
4712        }
4713      }
4714
4715    /* If previous is OP_FAIL, it was generated by an empty class [] in
4716    JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4717    by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4718    error above. We can just ignore the repeat in JS case. */
4719
4720    else if (*previous == OP_FAIL) goto END_REPEAT;
4721
4722    /* Else there's some kind of shambles */
4723
4724    else
4725      {
4726      *errorcodeptr = ERR11;
4727      goto FAILED;
4728      }
4729
4730    /* If the character following a repeat is '+', or if certain optimization
4731    tests above succeeded, possessive_quantifier is TRUE. For some of the
4732    simpler opcodes, there is a special alternative opcode for this. For
4733    anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4734    The '+' notation is just syntactic sugar, taken from Sun's Java package,
4735    but the special opcodes can optimize it a bit. The repeated item starts at
4736    tempcode, not at previous, which might be the first part of a string whose
4737    (former) last char we repeated.
4738
4739    Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4740    an 'upto' may follow. We skip over an 'exact' item, and then test the
4741    length of what remains before proceeding. */
4742
4743    if (possessive_quantifier)
4744      {
4745      int len;
4746
4747      if (*tempcode == OP_TYPEEXACT)
4748        tempcode += _pcre_OP_lengths[*tempcode] +
4749          ((tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP)? 2 : 0);
4750
4751      else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
4752        {
4753        tempcode += _pcre_OP_lengths[*tempcode];
4754#ifdef SUPPORT_UTF8
4755        if (utf8 && tempcode[-1] >= 0xc0)
4756          tempcode += _pcre_utf8_table4[tempcode[-1] & 0x3f];
4757#endif
4758        }
4759
4760      len = (int)(code - tempcode);
4761      if (len > 0) switch (*tempcode)
4762        {
4763        case OP_STAR:  *tempcode = OP_POSSTAR; break;
4764        case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4765        case OP_QUERY: *tempcode = OP_POSQUERY; break;
4766        case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4767
4768        case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4769        case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4770        case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4771        case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4772
4773        case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4774        case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4775        case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4776        case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4777
4778        /* Because we are moving code along, we must ensure that any
4779        pending recursive references are updated. */
4780
4781        default:
4782        *code = OP_END;
4783        adjust_recurse(tempcode, 1 + LINK_SIZE, utf8, cd, save_hwm);
4784        memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4785        code += 1 + LINK_SIZE;
4786        len += 1 + LINK_SIZE;
4787        tempcode[0] = OP_ONCE;
4788        *code++ = OP_KET;
4789        PUTINC(code, 0, len);
4790        PUT(tempcode, 1, len);
4791        break;
4792        }
4793      }
4794
4795    /* In all cases we no longer have a previous item. We also set the
4796    "follows varying string" flag for subsequently encountered reqbytes if
4797    it isn't already set and we have just passed a varying length item. */
4798
4799    END_REPEAT:
4800    previous = NULL;
4801    cd->req_varyopt |= reqvary;
4802    break;
4803
4804
4805    /* ===================================================================*/
4806    /* Start of nested parenthesized sub-expression, or comment or lookahead or
4807    lookbehind or option setting or condition or all the other extended
4808    parenthesis forms.  */
4809
4810    case CHAR_LEFT_PARENTHESIS:
4811    newoptions = options;
4812    skipbytes = 0;
4813    bravalue = OP_CBRA;
4814    save_hwm = cd->hwm;
4815    reset_bracount = FALSE;
4816
4817    /* First deal with various "verbs" that can be introduced by '*'. */
4818
4819    if (*(++ptr) == CHAR_ASTERISK &&
4820         ((cd->ctypes[ptr[1]] & ctype_letter) != 0 || ptr[1] == ':'))
4821      {
4822      int i, namelen;
4823      int arglen = 0;
4824      const char *vn = verbnames;
4825      const uschar *name = ptr + 1;
4826      const uschar *arg = NULL;
4827      previous = NULL;
4828      while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4829      namelen = (int)(ptr - name);
4830
4831      if (*ptr == CHAR_COLON)
4832        {
4833        arg = ++ptr;
4834        while ((cd->ctypes[*ptr] & (ctype_letter|ctype_digit)) != 0
4835          || *ptr == '_') ptr++;
4836        arglen = (int)(ptr - arg);
4837        }
4838
4839      if (*ptr != CHAR_RIGHT_PARENTHESIS)
4840        {
4841        *errorcodeptr = ERR60;
4842        goto FAILED;
4843        }
4844
4845      /* Scan the table of verb names */
4846
4847      for (i = 0; i < verbcount; i++)
4848        {
4849        if (namelen == verbs[i].len &&
4850            strncmp((char *)name, vn, namelen) == 0)
4851          {
4852          /* Check for open captures before ACCEPT */
4853
4854          if (verbs[i].op == OP_ACCEPT)
4855            {
4856            open_capitem *oc;
4857            cd->had_accept = TRUE;
4858            for (oc = cd->open_caps; oc != NULL; oc = oc->next)
4859              {
4860              *code++ = OP_CLOSE;
4861              PUT2INC(code, 0, oc->number);
4862              }
4863            }
4864
4865          /* Handle the cases with/without an argument */
4866
4867          if (arglen == 0)
4868            {
4869            if (verbs[i].op < 0)   /* Argument is mandatory */
4870              {
4871              *errorcodeptr = ERR66;
4872              goto FAILED;
4873              }
4874            *code = verbs[i].op;
4875            if (*code++ == OP_THEN)
4876              {
4877              PUT(code, 0, code - bcptr->current_branch - 1);
4878              code += LINK_SIZE;
4879              }
4880            }
4881
4882          else
4883            {
4884            if (verbs[i].op_arg < 0)   /* Argument is forbidden */
4885              {
4886              *errorcodeptr = ERR59;
4887              goto FAILED;
4888              }
4889            *code = verbs[i].op_arg;
4890            if (*code++ == OP_THEN_ARG)
4891              {
4892              PUT(code, 0, code - bcptr->current_branch - 1);
4893              code += LINK_SIZE;
4894              }
4895            *code++ = arglen;
4896            memcpy(code, arg, arglen);
4897            code += arglen;
4898            *code++ = 0;
4899            }
4900
4901          break;  /* Found verb, exit loop */
4902          }
4903
4904        vn += verbs[i].len + 1;
4905        }
4906
4907      if (i < verbcount) continue;    /* Successfully handled a verb */
4908      *errorcodeptr = ERR60;          /* Verb not recognized */
4909      goto FAILED;
4910      }
4911
4912    /* Deal with the extended parentheses; all are introduced by '?', and the
4913    appearance of any of them means that this is not a capturing group. */
4914
4915    else if (*ptr == CHAR_QUESTION_MARK)
4916      {
4917      int i, set, unset, namelen;
4918      int *optset;
4919      const uschar *name;
4920      uschar *slot;
4921
4922      switch (*(++ptr))
4923        {
4924        case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4925        ptr++;
4926        while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4927        if (*ptr == 0)
4928          {
4929          *errorcodeptr = ERR18;
4930          goto FAILED;
4931          }
4932        continue;
4933
4934
4935        /* ------------------------------------------------------------ */
4936        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4937        reset_bracount = TRUE;
4938        /* Fall through */
4939
4940        /* ------------------------------------------------------------ */
4941        case CHAR_COLON:          /* Non-capturing bracket */
4942        bravalue = OP_BRA;
4943        ptr++;
4944        break;
4945
4946
4947        /* ------------------------------------------------------------ */
4948        case CHAR_LEFT_PARENTHESIS:
4949        bravalue = OP_COND;       /* Conditional group */
4950
4951        /* A condition can be an assertion, a number (referring to a numbered
4952        group), a name (referring to a named group), or 'R', referring to
4953        recursion. R<digits> and R&name are also permitted for recursion tests.
4954
4955        There are several syntaxes for testing a named group: (?(name)) is used
4956        by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4957
4958        There are two unfortunate ambiguities, caused by history. (a) 'R' can
4959        be the recursive thing or the name 'R' (and similarly for 'R' followed
4960        by digits), and (b) a number could be a name that consists of digits.
4961        In both cases, we look for a name first; if not found, we try the other
4962        cases. */
4963
4964        /* For conditions that are assertions, check the syntax, and then exit
4965        the switch. This will take control down to where bracketed groups,
4966        including assertions, are processed. */
4967
4968        if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4969            ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4970          break;
4971
4972        /* Most other conditions use OP_CREF (a couple change to OP_RREF
4973        below), and all need to skip 3 bytes at the start of the group. */
4974
4975        code[1+LINK_SIZE] = OP_CREF;
4976        skipbytes = 3;
4977        refsign = -1;
4978
4979        /* Check for a test for recursion in a named group. */
4980
4981        if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4982          {
4983          terminator = -1;
4984          ptr += 2;
4985          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4986          }
4987
4988        /* Check for a test for a named group's having been set, using the Perl
4989        syntax (?(<name>) or (?('name') */
4990
4991        else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4992          {
4993          terminator = CHAR_GREATER_THAN_SIGN;
4994          ptr++;
4995          }
4996        else if (ptr[1] == CHAR_APOSTROPHE)
4997          {
4998          terminator = CHAR_APOSTROPHE;
4999          ptr++;
5000          }
5001        else
5002          {
5003          terminator = 0;
5004          if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5005          }
5006
5007        /* We now expect to read a name; anything else is an error */
5008
5009        if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
5010          {
5011          ptr += 1;  /* To get the right offset */
5012          *errorcodeptr = ERR28;
5013          goto FAILED;
5014          }
5015
5016        /* Read the name, but also get it as a number if it's all digits */
5017
5018        recno = 0;
5019        name = ++ptr;
5020        while ((cd->ctypes[*ptr] & ctype_word) != 0)
5021          {
5022          if (recno >= 0)
5023            recno = ((digitab[*ptr] & ctype_digit) != 0)?
5024              recno * 10 + *ptr - CHAR_0 : -1;
5025          ptr++;
5026          }
5027        namelen = (int)(ptr - name);
5028
5029        if ((terminator > 0 && *ptr++ != terminator) ||
5030            *ptr++ != CHAR_RIGHT_PARENTHESIS)
5031          {
5032          ptr--;      /* Error offset */
5033          *errorcodeptr = ERR26;
5034          goto FAILED;
5035          }
5036
5037        /* Do no further checking in the pre-compile phase. */
5038
5039        if (lengthptr != NULL) break;
5040
5041        /* In the real compile we do the work of looking for the actual
5042        reference. If the string started with "+" or "-" we require the rest to
5043        be digits, in which case recno will be set. */
5044
5045        if (refsign > 0)
5046          {
5047          if (recno <= 0)
5048            {
5049            *errorcodeptr = ERR58;
5050            goto FAILED;
5051            }
5052          recno = (refsign == CHAR_MINUS)?
5053            cd->bracount - recno + 1 : recno +cd->bracount;
5054          if (recno <= 0 || recno > cd->final_bracount)
5055            {
5056            *errorcodeptr = ERR15;
5057            goto FAILED;
5058            }
5059          PUT2(code, 2+LINK_SIZE, recno);
5060          break;
5061          }
5062
5063        /* Otherwise (did not start with "+" or "-"), start by looking for the
5064        name. If we find a name, add one to the opcode to change OP_CREF or
5065        OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5066        except they record that the reference was originally to a name. The
5067        information is used to check duplicate names. */
5068
5069        slot = cd->name_table;
5070        for (i = 0; i < cd->names_found; i++)
5071          {
5072          if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
5073          slot += cd->name_entry_size;
5074          }
5075
5076        /* Found a previous named subpattern */
5077
5078        if (i < cd->names_found)
5079          {
5080          recno = GET2(slot, 0);
5081          PUT2(code, 2+LINK_SIZE, recno);
5082          code[1+LINK_SIZE]++;
5083          }
5084
5085        /* Search the pattern for a forward reference */
5086
5087        else if ((i = find_parens(cd, name, namelen,
5088                        (options & PCRE_EXTENDED) != 0, utf8)) > 0)
5089          {
5090          PUT2(code, 2+LINK_SIZE, i);
5091          code[1+LINK_SIZE]++;
5092          }
5093
5094        /* If terminator == 0 it means that the name followed directly after
5095        the opening parenthesis [e.g. (?(abc)...] and in this case there are
5096        some further alternatives to try. For the cases where terminator != 0
5097        [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5098        now checked all the possibilities, so give an error. */
5099
5100        else if (terminator != 0)
5101          {
5102          *errorcodeptr = ERR15;
5103          goto FAILED;
5104          }
5105
5106        /* Check for (?(R) for recursion. Allow digits after R to specify a
5107        specific group number. */
5108
5109        else if (*name == CHAR_R)
5110          {
5111          recno = 0;
5112          for (i = 1; i < namelen; i++)
5113            {
5114            if ((digitab[name[i]] & ctype_digit) == 0)
5115              {
5116              *errorcodeptr = ERR15;
5117              goto FAILED;
5118              }
5119            recno = recno * 10 + name[i] - CHAR_0;
5120            }
5121          if (recno == 0) recno = RREF_ANY;
5122          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
5123          PUT2(code, 2+LINK_SIZE, recno);
5124          }
5125
5126        /* Similarly, check for the (?(DEFINE) "condition", which is always
5127        false. */
5128
5129        else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
5130          {
5131          code[1+LINK_SIZE] = OP_DEF;
5132          skipbytes = 1;
5133          }
5134
5135        /* Check for the "name" actually being a subpattern number. We are
5136        in the second pass here, so final_bracount is set. */
5137
5138        else if (recno > 0 && recno <= cd->final_bracount)
5139          {
5140          PUT2(code, 2+LINK_SIZE, recno);
5141          }
5142
5143        /* Either an unidentified subpattern, or a reference to (?(0) */
5144
5145        else
5146          {
5147          *errorcodeptr = (recno == 0)? ERR35: ERR15;
5148          goto FAILED;
5149          }
5150        break;
5151
5152
5153        /* ------------------------------------------------------------ */
5154        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5155        bravalue = OP_ASSERT;
5156        ptr++;
5157        break;
5158
5159
5160        /* ------------------------------------------------------------ */
5161        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
5162        ptr++;
5163        if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
5164          {
5165          *code++ = OP_FAIL;
5166          previous = NULL;
5167          continue;
5168          }
5169        bravalue = OP_ASSERT_NOT;
5170        break;
5171
5172
5173        /* ------------------------------------------------------------ */
5174        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
5175        switch (ptr[1])
5176          {
5177          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5178          bravalue = OP_ASSERTBACK;
5179          ptr += 2;
5180          break;
5181
5182          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5183          bravalue = OP_ASSERTBACK_NOT;
5184          ptr += 2;
5185          break;
5186
5187          default:                /* Could be name define, else bad */
5188          if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
5189          ptr++;                  /* Correct offset for error */
5190          *errorcodeptr = ERR24;
5191          goto FAILED;
5192          }
5193        break;
5194
5195
5196        /* ------------------------------------------------------------ */
5197        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
5198        bravalue = OP_ONCE;
5199        ptr++;
5200        break;
5201
5202
5203        /* ------------------------------------------------------------ */
5204        case CHAR_C:                 /* Callout - may be followed by digits; */
5205        previous_callout = code;  /* Save for later completion */
5206        after_manual_callout = 1; /* Skip one item before completing */
5207        *code++ = OP_CALLOUT;
5208          {
5209          int n = 0;
5210          while ((digitab[*(++ptr)] & ctype_digit) != 0)
5211            n = n * 10 + *ptr - CHAR_0;
5212          if (*ptr != CHAR_RIGHT_PARENTHESIS)
5213            {
5214            *errorcodeptr = ERR39;
5215            goto FAILED;
5216            }
5217          if (n > 255)
5218            {
5219            *errorcodeptr = ERR38;
5220            goto FAILED;
5221            }
5222          *code++ = n;
5223          PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
5224          PUT(code, LINK_SIZE, 0);                          /* Default length */
5225          code += 2 * LINK_SIZE;
5226          }
5227        previous = NULL;
5228        continue;
5229
5230
5231        /* ------------------------------------------------------------ */
5232        case CHAR_P:              /* Python-style named subpattern handling */
5233        if (*(++ptr) == CHAR_EQUALS_SIGN ||
5234            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
5235          {
5236          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
5237          terminator = CHAR_RIGHT_PARENTHESIS;
5238          goto NAMED_REF_OR_RECURSE;
5239          }
5240        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
5241          {
5242          *errorcodeptr = ERR41;
5243          goto FAILED;
5244          }
5245        /* Fall through to handle (?P< as (?< is handled */
5246
5247
5248        /* ------------------------------------------------------------ */
5249        DEFINE_NAME:    /* Come here from (?< handling */
5250        case CHAR_APOSTROPHE:
5251          {
5252          terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
5253            CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5254          name = ++ptr;
5255
5256          while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5257          namelen = (int)(ptr - name);
5258
5259          /* In the pre-compile phase, just do a syntax check. */
5260
5261          if (lengthptr != NULL)
5262            {
5263            if (*ptr != terminator)
5264              {
5265              *errorcodeptr = ERR42;
5266              goto FAILED;
5267              }
5268            if (cd->names_found >= MAX_NAME_COUNT)
5269              {
5270              *errorcodeptr = ERR49;
5271              goto FAILED;
5272              }
5273            if (namelen + 3 > cd->name_entry_size)
5274              {
5275              cd->name_entry_size = namelen + 3;
5276              if (namelen > MAX_NAME_SIZE)
5277                {
5278                *errorcodeptr = ERR48;
5279                goto FAILED;
5280                }
5281              }
5282            }
5283
5284          /* In the real compile, create the entry in the table, maintaining
5285          alphabetical order. Duplicate names for different numbers are
5286          permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
5287          number are always OK. (An existing number can be re-used if (?|
5288          appears in the pattern.) In either event, a duplicate name results in
5289          a duplicate entry in the table, even if the number is the same. This
5290          is because the number of names, and hence the table size, is computed
5291          in the pre-compile, and it affects various numbers and pointers which
5292          would all have to be modified, and the compiled code moved down, if
5293          duplicates with the same number were omitted from the table. This
5294          doesn't seem worth the hassle. However, *different* names for the
5295          same number are not permitted. */
5296
5297          else
5298            {
5299            BOOL dupname = FALSE;
5300            slot = cd->name_table;
5301
5302            for (i = 0; i < cd->names_found; i++)
5303              {
5304              int crc = memcmp(name, slot+2, namelen);
5305              if (crc == 0)
5306                {
5307                if (slot[2+namelen] == 0)
5308                  {
5309                  if (GET2(slot, 0) != cd->bracount + 1 &&
5310                      (options & PCRE_DUPNAMES) == 0)
5311                    {
5312                    *errorcodeptr = ERR43;
5313                    goto FAILED;
5314                    }
5315                  else dupname = TRUE;
5316                  }
5317                else crc = -1;      /* Current name is a substring */
5318                }
5319
5320              /* Make space in the table and break the loop for an earlier
5321              name. For a duplicate or later name, carry on. We do this for
5322              duplicates so that in the simple case (when (?| is not used) they
5323              are in order of their numbers. */
5324
5325              if (crc < 0)
5326                {
5327                memmove(slot + cd->name_entry_size, slot,
5328                  (cd->names_found - i) * cd->name_entry_size);
5329                break;
5330                }
5331
5332              /* Continue the loop for a later or duplicate name */
5333
5334              slot += cd->name_entry_size;
5335              }
5336
5337            /* For non-duplicate names, check for a duplicate number before
5338            adding the new name. */
5339
5340            if (!dupname)
5341              {
5342              uschar *cslot = cd->name_table;
5343              for (i = 0; i < cd->names_found; i++)
5344                {
5345                if (cslot != slot)
5346                  {
5347                  if (GET2(cslot, 0) == cd->bracount + 1)
5348                    {
5349                    *errorcodeptr = ERR65;
5350                    goto FAILED;
5351                    }
5352                  }
5353                else i--;
5354                cslot += cd->name_entry_size;
5355                }
5356              }
5357
5358            PUT2(slot, 0, cd->bracount + 1);
5359            memcpy(slot + 2, name, namelen);
5360            slot[2+namelen] = 0;
5361            }
5362          }
5363
5364        /* In both pre-compile and compile, count the number of names we've
5365        encountered. */
5366
5367        cd->names_found++;
5368        ptr++;                    /* Move past > or ' */
5369        goto NUMBERED_GROUP;
5370
5371
5372        /* ------------------------------------------------------------ */
5373        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
5374        terminator = CHAR_RIGHT_PARENTHESIS;
5375        is_recurse = TRUE;
5376        /* Fall through */
5377
5378        /* We come here from the Python syntax above that handles both
5379        references (?P=name) and recursion (?P>name), as well as falling
5380        through from the Perl recursion syntax (?&name). We also come here from
5381        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
5382        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
5383
5384        NAMED_REF_OR_RECURSE:
5385        name = ++ptr;
5386        while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
5387        namelen = (int)(ptr - name);
5388
5389        /* In the pre-compile phase, do a syntax check. We used to just set
5390        a dummy reference number, because it was not used in the first pass.
5391        However, with the change of recursive back references to be atomic,
5392        we have to look for the number so that this state can be identified, as
5393        otherwise the incorrect length is computed. If it's not a backwards
5394        reference, the dummy number will do. */
5395
5396        if (lengthptr != NULL)
5397          {
5398          const uschar *temp;
5399
5400          if (namelen == 0)
5401            {
5402            *errorcodeptr = ERR62;
5403            goto FAILED;
5404            }
5405          if (*ptr != terminator)
5406            {
5407            *errorcodeptr = ERR42;
5408            goto FAILED;
5409            }
5410          if (namelen > MAX_NAME_SIZE)
5411            {
5412            *errorcodeptr = ERR48;
5413            goto FAILED;
5414            }
5415
5416          /* The name table does not exist in the first pass, so we cannot
5417          do a simple search as in the code below. Instead, we have to scan the
5418          pattern to find the number. It is important that we scan it only as
5419          far as we have got because the syntax of named subpatterns has not
5420          been checked for the rest of the pattern, and find_parens() assumes
5421          correct syntax. In any case, it's a waste of resources to scan
5422          further. We stop the scan at the current point by temporarily
5423          adjusting the value of cd->end_pattern. */
5424
5425          temp = cd->end_pattern;
5426          cd->end_pattern = ptr;
5427          recno = find_parens(cd, name, namelen,
5428            (options & PCRE_EXTENDED) != 0, utf8);
5429          cd->end_pattern = temp;
5430          if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
5431          }
5432
5433        /* In the real compile, seek the name in the table. We check the name
5434        first, and then check that we have reached the end of the name in the
5435        table. That way, if the name is longer than any in the table,
5436        the comparison will fail without reading beyond the table entry. */
5437
5438        else
5439          {
5440          slot = cd->name_table;
5441          for (i = 0; i < cd->names_found; i++)
5442            {
5443            if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
5444                slot[2+namelen] == 0)
5445              break;
5446            slot += cd->name_entry_size;
5447            }
5448
5449          if (i < cd->names_found)         /* Back reference */
5450            {
5451            recno = GET2(slot, 0);
5452            }
5453          else if ((recno =                /* Forward back reference */
5454                    find_parens(cd, name, namelen,
5455                      (options & PCRE_EXTENDED) != 0, utf8)) <= 0)
5456            {
5457            *errorcodeptr = ERR15;
5458            goto FAILED;
5459            }
5460          }
5461
5462        /* In both phases, we can now go to the code that handles numerical
5463        recursion or backreferences. */
5464
5465        if (is_recurse) goto HANDLE_RECURSION;
5466          else goto HANDLE_REFERENCE;
5467
5468
5469        /* ------------------------------------------------------------ */
5470        case CHAR_R:              /* Recursion */
5471        ptr++;                    /* Same as (?0)      */
5472        /* Fall through */
5473
5474
5475        /* ------------------------------------------------------------ */
5476        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
5477        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5478        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5479          {
5480          const uschar *called;
5481          terminator = CHAR_RIGHT_PARENTHESIS;
5482
5483          /* Come here from the \g<...> and \g'...' code (Oniguruma
5484          compatibility). However, the syntax has been checked to ensure that
5485          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
5486          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
5487          ever be taken. */
5488
5489          HANDLE_NUMERICAL_RECURSION:
5490
5491          if ((refsign = *ptr) == CHAR_PLUS)
5492            {
5493            ptr++;
5494            if ((digitab[*ptr] & ctype_digit) == 0)
5495              {
5496              *errorcodeptr = ERR63;
5497              goto FAILED;
5498              }
5499            }
5500          else if (refsign == CHAR_MINUS)
5501            {
5502            if ((digitab[ptr[1]] & ctype_digit) == 0)
5503              goto OTHER_CHAR_AFTER_QUERY;
5504            ptr++;
5505            }
5506
5507          recno = 0;
5508          while((digitab[*ptr] & ctype_digit) != 0)
5509            recno = recno * 10 + *ptr++ - CHAR_0;
5510
5511          if (*ptr != terminator)
5512            {
5513            *errorcodeptr = ERR29;
5514            goto FAILED;
5515            }
5516
5517          if (refsign == CHAR_MINUS)
5518            {
5519            if (recno == 0)
5520              {
5521              *errorcodeptr = ERR58;
5522              goto FAILED;
5523              }
5524            recno = cd->bracount - recno + 1;
5525            if (recno <= 0)
5526              {
5527              *errorcodeptr = ERR15;
5528              goto FAILED;
5529              }
5530            }
5531          else if (refsign == CHAR_PLUS)
5532            {
5533            if (recno == 0)
5534              {
5535              *errorcodeptr = ERR58;
5536              goto FAILED;
5537              }
5538            recno += cd->bracount;
5539            }
5540
5541          /* Come here from code above that handles a named recursion */
5542
5543          HANDLE_RECURSION:
5544
5545          previous = code;
5546          called = cd->start_code;
5547
5548          /* When we are actually compiling, find the bracket that is being
5549          referenced. Temporarily end the regex in case it doesn't exist before
5550          this point. If we end up with a forward reference, first check that
5551          the bracket does occur later so we can give the error (and position)
5552          now. Then remember this forward reference in the workspace so it can
5553          be filled in at the end. */
5554
5555          if (lengthptr == NULL)
5556            {
5557            *code = OP_END;
5558            if (recno != 0)
5559              called = _pcre_find_bracket(cd->start_code, utf8, recno);
5560
5561            /* Forward reference */
5562
5563            if (called == NULL)
5564              {
5565              if (find_parens(cd, NULL, recno,
5566                    (options & PCRE_EXTENDED) != 0, utf8) < 0)
5567                {
5568                *errorcodeptr = ERR15;
5569                goto FAILED;
5570                }
5571
5572              /* Fudge the value of "called" so that when it is inserted as an
5573              offset below, what is actually inserted is the reference number
5574              of the group. */
5575
5576              called = cd->start_code + recno;
5577              PUTINC(cd->hwm, 0, (int)(code + 2 + LINK_SIZE - cd->start_code));
5578              }
5579
5580            /* If not a forward reference, and the subpattern is still open,
5581            this is a recursive call. We check to see if this is a left
5582            recursion that could loop for ever, and diagnose that case. */
5583
5584            else if (GET(called, 1) == 0 &&
5585                     could_be_empty(called, code, bcptr, utf8, cd))
5586              {
5587              *errorcodeptr = ERR40;
5588              goto FAILED;
5589              }
5590            }
5591
5592          /* Insert the recursion/subroutine item, automatically wrapped inside
5593          "once" brackets. Set up a "previous group" length so that a
5594          subsequent quantifier will work. */
5595
5596          *code = OP_ONCE;
5597          PUT(code, 1, 2 + 2*LINK_SIZE);
5598          code += 1 + LINK_SIZE;
5599
5600          *code = OP_RECURSE;
5601          PUT(code, 1, (int)(called - cd->start_code));
5602          code += 1 + LINK_SIZE;
5603
5604          *code = OP_KET;
5605          PUT(code, 1, 2 + 2*LINK_SIZE);
5606          code += 1 + LINK_SIZE;
5607
5608          length_prevgroup = 3 + 3*LINK_SIZE;
5609          }
5610
5611        /* Can't determine a first byte now */
5612
5613        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5614        continue;
5615
5616
5617        /* ------------------------------------------------------------ */
5618        default:              /* Other characters: check option setting */
5619        OTHER_CHAR_AFTER_QUERY:
5620        set = unset = 0;
5621        optset = &set;
5622
5623        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5624          {
5625          switch (*ptr++)
5626            {
5627            case CHAR_MINUS: optset = &unset; break;
5628
5629            case CHAR_J:    /* Record that it changed in the external options */
5630            *optset |= PCRE_DUPNAMES;
5631            cd->external_flags |= PCRE_JCHANGED;
5632            break;
5633
5634            case CHAR_i: *optset |= PCRE_CASELESS; break;
5635            case CHAR_m: *optset |= PCRE_MULTILINE; break;
5636            case CHAR_s: *optset |= PCRE_DOTALL; break;
5637            case CHAR_x: *optset |= PCRE_EXTENDED; break;
5638            case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5639            case CHAR_X: *optset |= PCRE_EXTRA; break;
5640
5641            default:  *errorcodeptr = ERR12;
5642                      ptr--;    /* Correct the offset */
5643                      goto FAILED;
5644            }
5645          }
5646
5647        /* Set up the changed option bits, but don't change anything yet. */
5648
5649        newoptions = (options | set) & (~unset);
5650
5651        /* If the options ended with ')' this is not the start of a nested
5652        group with option changes, so the options change at this level. If this
5653        item is right at the start of the pattern, the options can be
5654        abstracted and made external in the pre-compile phase, and ignored in
5655        the compile phase. This can be helpful when matching -- for instance in
5656        caseless checking of required bytes.
5657
5658        If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5659        definitely *not* at the start of the pattern because something has been
5660        compiled. In the pre-compile phase, however, the code pointer can have
5661        that value after the start, because it gets reset as code is discarded
5662        during the pre-compile. However, this can happen only at top level - if
5663        we are within parentheses, the starting BRA will still be present. At
5664        any parenthesis level, the length value can be used to test if anything
5665        has been compiled at that level. Thus, a test for both these conditions
5666        is necessary to ensure we correctly detect the start of the pattern in
5667        both phases.
5668
5669        If we are not at the pattern start, compile code to change the ims
5670        options if this setting actually changes any of them, and reset the
5671        greedy defaults and the case value for firstbyte and reqbyte. */
5672
5673        if (*ptr == CHAR_RIGHT_PARENTHESIS)
5674          {
5675          if (code == cd->start_code + 1 + LINK_SIZE &&
5676               (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5677            {
5678            cd->external_options = newoptions;
5679            }
5680          else
5681            {
5682            if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5683              {
5684              *code++ = OP_OPT;
5685              *code++ = newoptions & PCRE_IMS;
5686              }
5687            greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5688            greedy_non_default = greedy_default ^ 1;
5689            req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5690            }
5691
5692          /* Change options at this level, and pass them back for use
5693          in subsequent branches. When not at the start of the pattern, this
5694          information is also necessary so that a resetting item can be
5695          compiled at the end of a group (if we are in a group). */
5696
5697          *optionsptr = options = newoptions;
5698          previous = NULL;       /* This item can't be repeated */
5699          continue;              /* It is complete */
5700          }
5701
5702        /* If the options ended with ':' we are heading into a nested group
5703        with possible change of options. Such groups are non-capturing and are
5704        not assertions of any kind. All we need to do is skip over the ':';
5705        the newoptions value is handled below. */
5706
5707        bravalue = OP_BRA;
5708        ptr++;
5709        }     /* End of switch for character following (? */
5710      }       /* End of (? handling */
5711
5712    /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
5713    is set, all unadorned brackets become non-capturing and behave like (?:...)
5714    brackets. */
5715
5716    else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5717      {
5718      bravalue = OP_BRA;
5719      }
5720
5721    /* Else we have a capturing group. */
5722
5723    else
5724      {
5725      NUMBERED_GROUP:
5726      cd->bracount += 1;
5727      PUT2(code, 1+LINK_SIZE, cd->bracount);
5728      skipbytes = 2;
5729      }
5730
5731    /* Process nested bracketed regex. Assertions may not be repeated, but
5732    other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5733    non-register variable in order to be able to pass its address because some
5734    compilers complain otherwise. Pass in a new setting for the ims options if
5735    they have changed. */
5736
5737    previous = (bravalue >= OP_ONCE)? code : NULL;
5738    *code = bravalue;
5739    tempcode = code;
5740    tempreqvary = cd->req_varyopt;     /* Save value before bracket */
5741    length_prevgroup = 0;              /* Initialize for pre-compile phase */
5742
5743    if (!compile_regex(
5744         newoptions,                   /* The complete new option state */
5745         options & PCRE_IMS,           /* The previous ims option state */
5746         &tempcode,                    /* Where to put code (updated) */
5747         &ptr,                         /* Input pointer (updated) */
5748         errorcodeptr,                 /* Where to put an error message */
5749         (bravalue == OP_ASSERTBACK ||
5750          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5751         reset_bracount,               /* True if (?| group */
5752         skipbytes,                    /* Skip over bracket number */
5753         &subfirstbyte,                /* For possible first char */
5754         &subreqbyte,                  /* For possible last char */
5755         bcptr,                        /* Current branch chain */
5756         cd,                           /* Tables block */
5757         (lengthptr == NULL)? NULL :   /* Actual compile phase */
5758           &length_prevgroup           /* Pre-compile phase */
5759         ))
5760      goto FAILED;
5761
5762    /* At the end of compiling, code is still pointing to the start of the
5763    group, while tempcode has been updated to point past the end of the group
5764    and any option resetting that may follow it. The pattern pointer (ptr)
5765    is on the bracket. */
5766
5767    /* If this is a conditional bracket, check that there are no more than
5768    two branches in the group, or just one if it's a DEFINE group. We do this
5769    in the real compile phase, not in the pre-pass, where the whole group may
5770    not be available. */
5771
5772    if (bravalue == OP_COND && lengthptr == NULL)
5773      {
5774      uschar *tc = code;
5775      int condcount = 0;
5776
5777      do {
5778         condcount++;
5779         tc += GET(tc,1);
5780         }
5781      while (*tc != OP_KET);
5782
5783      /* A DEFINE group is never obeyed inline (the "condition" is always
5784      false). It must have only one branch. */
5785
5786      if (code[LINK_SIZE+1] == OP_DEF)
5787        {
5788        if (condcount > 1)
5789          {
5790          *errorcodeptr = ERR54;
5791          goto FAILED;
5792          }
5793        bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5794        }
5795
5796      /* A "normal" conditional group. If there is just one branch, we must not
5797      make use of its firstbyte or reqbyte, because this is equivalent to an
5798      empty second branch. */
5799
5800      else
5801        {
5802        if (condcount > 2)
5803          {
5804          *errorcodeptr = ERR27;
5805          goto FAILED;
5806          }
5807        if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5808        }
5809      }
5810
5811    /* Error if hit end of pattern */
5812
5813    if (*ptr != CHAR_RIGHT_PARENTHESIS)
5814      {
5815      *errorcodeptr = ERR14;
5816      goto FAILED;
5817      }
5818
5819    /* In the pre-compile phase, update the length by the length of the group,
5820    less the brackets at either end. Then reduce the compiled code to just a
5821    set of non-capturing brackets so that it doesn't use much memory if it is
5822    duplicated by a quantifier.*/
5823
5824    if (lengthptr != NULL)
5825      {
5826      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5827        {
5828        *errorcodeptr = ERR20;
5829        goto FAILED;
5830        }
5831      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5832      *code++ = OP_BRA;
5833      PUTINC(code, 0, 1 + LINK_SIZE);
5834      *code++ = OP_KET;
5835      PUTINC(code, 0, 1 + LINK_SIZE);
5836      break;    /* No need to waste time with special character handling */
5837      }
5838
5839    /* Otherwise update the main code pointer to the end of the group. */
5840
5841    code = tempcode;
5842
5843    /* For a DEFINE group, required and first character settings are not
5844    relevant. */
5845
5846    if (bravalue == OP_DEF) break;
5847
5848    /* Handle updating of the required and first characters for other types of
5849    group. Update for normal brackets of all kinds, and conditions with two
5850    branches (see code above). If the bracket is followed by a quantifier with
5851    zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5852    zerofirstbyte outside the main loop so that they can be accessed for the
5853    back off. */
5854
5855    zeroreqbyte = reqbyte;
5856    zerofirstbyte = firstbyte;
5857    groupsetfirstbyte = FALSE;
5858
5859    if (bravalue >= OP_ONCE)
5860      {
5861      /* If we have not yet set a firstbyte in this branch, take it from the
5862      subpattern, remembering that it was set here so that a repeat of more
5863      than one can replicate it as reqbyte if necessary. If the subpattern has
5864      no firstbyte, set "none" for the whole branch. In both cases, a zero
5865      repeat forces firstbyte to "none". */
5866
5867      if (firstbyte == REQ_UNSET)
5868        {
5869        if (subfirstbyte >= 0)
5870          {
5871          firstbyte = subfirstbyte;
5872          groupsetfirstbyte = TRUE;
5873          }
5874        else firstbyte = REQ_NONE;
5875        zerofirstbyte = REQ_NONE;
5876        }
5877
5878      /* If firstbyte was previously set, convert the subpattern's firstbyte
5879      into reqbyte if there wasn't one, using the vary flag that was in
5880      existence beforehand. */
5881
5882      else if (subfirstbyte >= 0 && subreqbyte < 0)
5883        subreqbyte = subfirstbyte | tempreqvary;
5884
5885      /* If the subpattern set a required byte (or set a first byte that isn't
5886      really the first byte - see above), set it. */
5887
5888      if (subreqbyte >= 0) reqbyte = subreqbyte;
5889      }
5890
5891    /* For a forward assertion, we take the reqbyte, if set. This can be
5892    helpful if the pattern that follows the assertion doesn't set a different
5893    char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5894    for an assertion, however because it leads to incorrect effect for patterns
5895    such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5896    of a firstbyte. This is overcome by a scan at the end if there's no
5897    firstbyte, looking for an asserted first char. */
5898
5899    else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5900    break;     /* End of processing '(' */
5901
5902
5903    /* ===================================================================*/
5904    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5905    are arranged to be the negation of the corresponding OP_values in the
5906    default case when PCRE_UCP is not set. For the back references, the values
5907    are ESC_REF plus the reference number. Only back references and those types
5908    that consume a character may be repeated. We can test for values between
5909    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
5910    ever created. */
5911
5912    case CHAR_BACKSLASH:
5913    tempptr = ptr;
5914    c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5915    if (*errorcodeptr != 0) goto FAILED;
5916
5917    if (c < 0)
5918      {
5919      if (-c == ESC_Q)            /* Handle start of quoted string */
5920        {
5921        if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5922          ptr += 2;               /* avoid empty string */
5923            else inescq = TRUE;
5924        continue;
5925        }
5926
5927      if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5928
5929      /* For metasequences that actually match a character, we disable the
5930      setting of a first character if it hasn't already been set. */
5931
5932      if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5933        firstbyte = REQ_NONE;
5934
5935      /* Set values to reset to if this is followed by a zero repeat. */
5936
5937      zerofirstbyte = firstbyte;
5938      zeroreqbyte = reqbyte;
5939
5940      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5941      is a subroutine call by number (Oniguruma syntax). In fact, the value
5942      -ESC_g is returned only for these cases. So we don't need to check for <
5943      or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5944      -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5945      that is a synonym for a named back reference). */
5946
5947      if (-c == ESC_g)
5948        {
5949        const uschar *p;
5950        save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5951        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5952          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5953
5954        /* These two statements stop the compiler from warning about possibly
5955        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5956        fact, because we actually check for a number below, the paths that
5957        would actually be in error are never taken. */
5958
5959        skipbytes = 0;
5960        reset_bracount = FALSE;
5961
5962        /* Test for a name */
5963
5964        if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5965          {
5966          BOOL isnumber = TRUE;
5967          for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5968            {
5969            if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5970            if ((cd->ctypes[*p] & ctype_word) == 0) break;
5971            }
5972          if (*p != terminator)
5973            {
5974            *errorcodeptr = ERR57;
5975            break;
5976            }
5977          if (isnumber)
5978            {
5979            ptr++;
5980            goto HANDLE_NUMERICAL_RECURSION;
5981            }
5982          is_recurse = TRUE;
5983          goto NAMED_REF_OR_RECURSE;
5984          }
5985
5986        /* Test a signed number in angle brackets or quotes. */
5987
5988        p = ptr + 2;
5989        while ((digitab[*p] & ctype_digit) != 0) p++;
5990        if (*p != terminator)
5991          {
5992          *errorcodeptr = ERR57;
5993          break;
5994          }
5995        ptr++;
5996        goto HANDLE_NUMERICAL_RECURSION;
5997        }
5998
5999      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6000      We also support \k{name} (.NET syntax) */
6001
6002      if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
6003          ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
6004        {
6005        is_recurse = FALSE;
6006        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6007          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6008          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6009        goto NAMED_REF_OR_RECURSE;
6010        }
6011
6012      /* Back references are handled specially; must disable firstbyte if
6013      not set to cope with cases like (?=(\w+))\1: which would otherwise set
6014      ':' later. */
6015
6016      if (-c >= ESC_REF)
6017        {
6018        open_capitem *oc;
6019        recno = -c - ESC_REF;
6020
6021        HANDLE_REFERENCE:    /* Come here from named backref handling */
6022        if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
6023        previous = code;
6024        *code++ = OP_REF;
6025        PUT2INC(code, 0, recno);
6026        cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6027        if (recno > cd->top_backref) cd->top_backref = recno;
6028
6029        /* Check to see if this back reference is recursive, that is, it
6030        is inside the group that it references. A flag is set so that the
6031        group can be made atomic. */
6032
6033        for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6034          {
6035          if (oc->number == recno)
6036            {
6037            oc->flag = TRUE;
6038            break;
6039            }
6040          }
6041        }
6042
6043      /* So are Unicode property matches, if supported. */
6044
6045#ifdef SUPPORT_UCP
6046      else if (-c == ESC_P || -c == ESC_p)
6047        {
6048        BOOL negated;
6049        int pdata;
6050        int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6051        if (ptype < 0) goto FAILED;
6052        previous = code;
6053        *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6054        *code++ = ptype;
6055        *code++ = pdata;
6056        }
6057#else
6058
6059      /* If Unicode properties are not supported, \X, \P, and \p are not
6060      allowed. */
6061
6062      else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6063        {
6064        *errorcodeptr = ERR45;
6065        goto FAILED;
6066        }
6067#endif
6068
6069      /* For the rest (including \X when Unicode properties are supported), we
6070      can obtain the OP value by negating the escape value in the default
6071      situation when PCRE_UCP is not set. When it *is* set, we substitute
6072      Unicode property tests. */
6073
6074      else
6075        {
6076#ifdef SUPPORT_UCP
6077        if (-c >= ESC_DU && -c <= ESC_wu)
6078          {
6079          nestptr = ptr + 1;                   /* Where to resume */
6080          ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6081          }
6082        else
6083#endif
6084          {
6085          previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6086          *code++ = -c;
6087          }
6088        }
6089      continue;
6090      }
6091
6092    /* We have a data character whose value is in c. In UTF-8 mode it may have
6093    a value > 127. We set its representation in the length/buffer, and then
6094    handle it as a data character. */
6095
6096#ifdef SUPPORT_UTF8
6097    if (utf8 && c > 127)
6098      mclength = _pcre_ord2utf8(c, mcbuffer);
6099    else
6100#endif
6101
6102     {
6103     mcbuffer[0] = c;
6104     mclength = 1;
6105     }
6106    goto ONE_CHAR;
6107
6108
6109    /* ===================================================================*/
6110    /* Handle a literal character. It is guaranteed not to be whitespace or #
6111    when the extended flag is set. If we are in UTF-8 mode, it may be a
6112    multi-byte literal character. */
6113
6114    default:
6115    NORMAL_CHAR:
6116    mclength = 1;
6117    mcbuffer[0] = c;
6118
6119#ifdef SUPPORT_UTF8
6120    if (utf8 && c >= 0xc0)
6121      {
6122      while ((ptr[1] & 0xc0) == 0x80)
6123        mcbuffer[mclength++] = *(++ptr);
6124      }
6125#endif
6126
6127    /* At this point we have the character's bytes in mcbuffer, and the length
6128    in mclength. When not in UTF-8 mode, the length is always 1. */
6129
6130    ONE_CHAR:
6131    previous = code;
6132    *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
6133    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6134
6135    /* Remember if \r or \n were seen */
6136
6137    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6138      cd->external_flags |= PCRE_HASCRORLF;
6139
6140    /* Set the first and required bytes appropriately. If no previous first
6141    byte, set it from this character, but revert to none on a zero repeat.
6142    Otherwise, leave the firstbyte value alone, and don't change it on a zero
6143    repeat. */
6144
6145    if (firstbyte == REQ_UNSET)
6146      {
6147      zerofirstbyte = REQ_NONE;
6148      zeroreqbyte = reqbyte;
6149
6150      /* If the character is more than one byte long, we can set firstbyte
6151      only if it is not to be matched caselessly. */
6152
6153      if (mclength == 1 || req_caseopt == 0)
6154        {
6155        firstbyte = mcbuffer[0] | req_caseopt;
6156        if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
6157        }
6158      else firstbyte = reqbyte = REQ_NONE;
6159      }
6160
6161    /* firstbyte was previously set; we can set reqbyte only if the length is
6162    1 or the matching is caseful. */
6163
6164    else
6165      {
6166      zerofirstbyte = firstbyte;
6167      zeroreqbyte = reqbyte;
6168      if (mclength == 1 || req_caseopt == 0)
6169        reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
6170      }
6171
6172    break;            /* End of literal character handling */
6173    }
6174  }                   /* end of big loop */
6175
6176
6177/* Control never reaches here by falling through, only by a goto for all the
6178error states. Pass back the position in the pattern so that it can be displayed
6179to the user for diagnosing the error. */
6180
6181FAILED:
6182*ptrptr = ptr;
6183return FALSE;
6184}
6185
6186
6187
6188
6189/*************************************************
6190*     Compile sequence of alternatives           *
6191*************************************************/
6192
/* On entry, ptr is pointing past the bracket character, but on return it
points to the closing bracket, or vertical bar, or end of string. The code
variable is pointing at the byte into which the BRA operator has been stored.
If the ims options are changed at the start (for a (?ims: group) or during any
branch, we need to insert an OP_OPT item at the start of every following branch
to ensure they get set correctly at run time, and also pass the new options
into every subsequent branch compile.

This function is used during the pre-compile phase when we are trying to find
out the amount of memory needed, as well as during the real compile phase. The
value of lengthptr distinguishes the two phases.

Arguments:
  options        option bits, including any changes for this subpattern
  oldims         previous settings of ims option bits
  codeptr        -> the address of the current code pointer
  ptrptr         -> the address of the current pattern pointer
  errorcodeptr   -> pointer to error code variable
  lookbehind     TRUE if this is a lookbehind assertion
  reset_bracount TRUE to reset the count for each branch
  skipbytes      skip this many bytes at start (for brackets and OP_COND)
  firstbyteptr   place to put the first required character, or a negative number
  reqbyteptr     place to put the last required character, or a negative number
  bcptr          pointer to the chain of currently open branches
  cd             points to the data block with tables pointers etc.
  lengthptr      NULL during the real compile phase
                 points to length accumulator during pre-compile phase

Returns:         TRUE on success, with *codeptr, *ptrptr, *firstbyteptr and
                 *reqbyteptr updated, and (in the pre-compile phase) the
                 length of this group added to *lengthptr.
                 FALSE if compile_branch() fails, if find_fixedlength()
                 reports a negative length other than -3 for a lookbehind
                 branch (ERR25 or ERR36), or if adding this group's length
                 to *lengthptr would exceed OFLOW_MAX (ERR20).
*/

static BOOL
compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
  int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
  int *lengthptr)
{
const uschar *ptr = *ptrptr;
uschar *code = *codeptr;
uschar *last_branch = code;
uschar *start_bracket = code;
uschar *reverse_count = NULL;
open_capitem capitem;
int capnumber = 0;
int firstbyte, reqbyte;
int branchfirstbyte, branchreqbyte;
int length;
int orig_bracount;
int max_bracount;
int old_external_options = cd->external_options;
branch_chain bc;

/* Link a record for this group onto the chain of enclosing groups; it is
passed down to compile_branch() for every alternative. */

bc.outer = bcptr;
bc.current_branch = code;

firstbyte = reqbyte = REQ_UNSET;

/* Accumulate the length for use in the pre-compile phase. Start with the
length of the BRA and KET and any extra bytes that are required at the
beginning. We accumulate in a local variable to save frequent testing of
lengthptr for NULL. We cannot do this by looking at the value of code at the
start and end of each alternative, because compiled items are discarded during
the pre-compile phase so that the work space is not exceeded. */

length = 2 + 2*LINK_SIZE + skipbytes;

/* WARNING: If the above line is changed for any reason, you must also change
the code that abstracts option settings at the start of the pattern and makes
them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
pre-compile phase to find out whether anything has yet been compiled or not. */

/* If this is a capturing subpattern, add to the chain of open capturing items
so that we can detect them if (*ACCEPT) is encountered. This is also used to
detect groups that contain recursive back references to themselves. */

if (*code == OP_CBRA)
  {
  capnumber = GET2(code, 1 + LINK_SIZE);
  capitem.number = capnumber;
  capitem.next = cd->open_caps;
  capitem.flag = FALSE;
  cd->open_caps = &capitem;
  }

/* Offset is set zero to mark that this bracket is still open */

PUT(code, 1, 0);
code += 1 + LINK_SIZE + skipbytes;

/* Loop for each alternative branch */

orig_bracount = max_bracount = cd->bracount;
for (;;)
  {
  /* For a (?| group, reset the capturing bracket count so that each branch
  uses the same numbers. */

  if (reset_bracount) cd->bracount = orig_bracount;

  /* Handle a change of ims options at the start of the branch */

  if ((options & PCRE_IMS) != oldims)
    {
    *code++ = OP_OPT;
    *code++ = options & PCRE_IMS;
    length += 2;
    }

  /* Set up dummy OP_REVERSE if lookbehind assertion. The position of its
  count field is remembered so that the real length can be filled in once the
  branch has been compiled. */

  if (lookbehind)
    {
    *code++ = OP_REVERSE;
    reverse_count = code;
    PUTINC(code, 0, 0);
    length += 1 + LINK_SIZE;
    }

  /* Now compile the branch; in the pre-compile phase its length gets added
  into the length. On failure, pass the current pattern position back to the
  caller. */

  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
        &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
    {
    *ptrptr = ptr;
    return FALSE;
    }

  /* If the external options have changed during this branch, it means that we
  are at the top level, and a leading option setting has been encountered. We
  need to re-set the original option values to take account of this so that,
  during the pre-compile phase, we know to allow for a re-set at the start of
  subsequent branches. */

  if (old_external_options != cd->external_options)
    oldims = cd->external_options & PCRE_IMS;

  /* Keep the highest bracket count in case (?| was used and some branch
  has fewer than the rest. */

  if (cd->bracount > max_bracount) max_bracount = cd->bracount;

  /* In the real compile phase, there is some post-processing to be done. */

  if (lengthptr == NULL)
    {
    /* If this is the first branch, the firstbyte and reqbyte values for the
    branch become the values for the regex. */

    if (*last_branch != OP_ALT)
      {
      firstbyte = branchfirstbyte;
      reqbyte = branchreqbyte;
      }

    /* If this is not the first branch, the first char and reqbyte have to
    match the values from all the previous branches, except that if the
    previous value for reqbyte didn't have REQ_VARY set, it can still match,
    and we set REQ_VARY for the regex. */

    else
      {
      /* If we previously had a firstbyte, but it doesn't match the new branch,
      we have to abandon the firstbyte for the regex, but if there was
      previously no reqbyte, it takes on the value of the old firstbyte. */

      if (firstbyte >= 0 && firstbyte != branchfirstbyte)
        {
        if (reqbyte < 0) reqbyte = firstbyte;
        firstbyte = REQ_NONE;
        }

      /* If we (now or from before) have no firstbyte, a firstbyte from the
      branch becomes a reqbyte if there isn't a branch reqbyte. */

      if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
          branchreqbyte = branchfirstbyte;

      /* Now ensure that the reqbytes match */

      if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
        reqbyte = REQ_NONE;
      else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
      }

    /* If lookbehind, check that this branch matches a fixed-length string, and
    put the length into the OP_REVERSE item. Temporarily mark the end of the
    branch with OP_END. If the branch contains OP_RECURSE, the result is -3
    because there may be forward references that we can't check here. Set a
    flag to cause another lookbehind check at the end. Why not do it all at the
    end? Because common, erroneous checks are picked up here and the offset of
    the problem can be shown. */

    if (lookbehind)
      {
      int fixed_length;
      *code = OP_END;
      fixed_length = find_fixedlength(last_branch, options, FALSE, cd);
      DPRINTF(("fixed length = %d\n", fixed_length));
      if (fixed_length == -3)
        {
        cd->check_lookbehind = TRUE;
        }
      else if (fixed_length < 0)
        {
        /* A result of -2 is reported as ERR36; any other negative result
        (apart from -3, handled above) is reported as ERR25. */

        *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
        *ptrptr = ptr;
        return FALSE;
        }
      else { PUT(reverse_count, 0, fixed_length); }
      }
    }

  /* Reached end of expression, either ')' or end of pattern. In the real
  compile phase, go back through the alternative branches and reverse the chain
  of offsets, with the field in the BRA item now becoming an offset to the
  first alternative. If there are no alternatives, it points to the end of the
  group. The length in the terminating ket is always the length of the whole
  bracketed item. If any of the ims options were changed inside the group,
  compile a resetting op-code following, except at the very end of the pattern.
  Return leaving the pointer at the terminating char. */

  if (*ptr != CHAR_VERTICAL_LINE)
    {
    if (lengthptr == NULL)
      {
      /* Each link field currently holds the distance back to the previous
      branch (zero in the opening bracket). Walk backwards, replacing each
      with the distance forward to the next branch or to the ket; the zero in
      the opening bracket terminates the loop. */

      int branch_length = (int)(code - last_branch);
      do
        {
        int prev_length = GET(last_branch, 1);
        PUT(last_branch, 1, branch_length);
        branch_length = prev_length;
        last_branch -= branch_length;
        }
      while (branch_length > 0);
      }

    /* Fill in the ket */

    *code = OP_KET;
    PUT(code, 1, (int)(code - start_bracket));
    code += 1 + LINK_SIZE;

    /* If it was a capturing subpattern, check to see if it contained any
    recursive back references. If so, we must wrap it in atomic brackets.
    In any event, remove the block from the chain. */

    if (capnumber > 0)
      {
      if (cd->open_caps->flag)
        {
        /* Shift the whole group up by the size of a bracket header to make
        room for an OP_ONCE in front of it, then append the matching OP_KET.
        The extra header and ket are added to the pre-compile length. */

        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
          code - start_bracket);
        *start_bracket = OP_ONCE;
        code += 1 + LINK_SIZE;
        PUT(start_bracket, 1, (int)(code - start_bracket));
        *code = OP_KET;
        PUT(code, 1, (int)(code - start_bracket));
        code += 1 + LINK_SIZE;
        length += 2 + 2*LINK_SIZE;
        }
      cd->open_caps = cd->open_caps->next;
      }

    /* Reset options if needed. */

    if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
      {
      *code++ = OP_OPT;
      *code++ = oldims;
      length += 2;
      }

    /* Retain the highest bracket number, in case resetting was used. */

    cd->bracount = max_bracount;

    /* Set values to pass back */

    *codeptr = code;
    *ptrptr = ptr;
    *firstbyteptr = firstbyte;
    *reqbyteptr = reqbyte;
    if (lengthptr != NULL)
      {
      /* Refuse to let the caller's accumulated length exceed OFLOW_MAX. */

      if (OFLOW_MAX - *lengthptr < length)
        {
        *errorcodeptr = ERR20;
        return FALSE;
        }
      *lengthptr += length;
      }
    return TRUE;
    }

  /* Another branch follows. In the pre-compile phase, we can move the code
  pointer back to where it was for the start of the first branch. (That is,
  pretend that each branch is the only one.)

  In the real compile phase, insert an ALT node. Its length field points back
  to the previous branch while the bracket remains open. At the end the chain
  is reversed. It's done like this so that the start of the bracket has a
  zero offset until it is closed, making it possible to detect recursion. */

  if (lengthptr != NULL)
    {
    code = *codeptr + 1 + LINK_SIZE + skipbytes;
    length += 1 + LINK_SIZE;
    }
  else
    {
    *code = OP_ALT;
    PUT(code, 1, (int)(code - last_branch));
    bc.current_branch = last_branch = code;
    code += 1 + LINK_SIZE;
    }

  /* Step over the vertical bar and compile the next alternative. */

  ptr++;
  }
/* Control never reaches here */
}
6514
6515
6516
6517
6518/*************************************************
6519*          Check for anchored expression         *
6520*************************************************/
6521
6522/* Try to find out if this is an anchored regular expression. Consider each
6523alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
6524all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
6525it's anchored. However, if this is a multiline pattern, then only OP_SOD
6526counts, since OP_CIRC can match in the middle.
6527
6528We can also consider a regex to be anchored if OP_SOM starts all its branches.
6529This is the code for \G, which means "match at start of match position, taking
6530into account the match offset".
6531
6532A branch is also implicitly anchored if it starts with .* and DOTALL is set,
6533because that will try the rest of the pattern at all possible matching points,
6534so there is no point trying again.... er ....
6535
6536.... except when the .* appears inside capturing parentheses, and there is a
6537subsequent back reference to those parentheses. We haven't enough information
6538to catch that case precisely.
6539
6540At first, the best we could do was to detect when .* was in capturing brackets
6541and the highest back reference was greater than or equal to that level.
6542However, by keeping a bitmap of the first 31 back references, we can catch some
6543of the more common cases more precisely.
6544
6545Arguments:
6546  code           points to start of expression (the bracket)
6547  options        points to the options setting
6548  bracket_map    a bitmap of which brackets we are inside while testing; this
6549                  handles up to substring 31; after that we just have to take
6550                  the less precise approach
6551  backref_map    the back reference bitmap
6552
6553Returns:     TRUE or FALSE
6554*/
6555
6556static BOOL
6557is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
6558  unsigned int backref_map)
6559{
6560do {
6561   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6562     options, PCRE_MULTILINE, FALSE);
6563   register int op = *scode;
6564
6565   /* Non-capturing brackets */
6566
6567   if (op == OP_BRA)
6568     {
6569     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6570     }
6571
6572   /* Capturing brackets */
6573
6574   else if (op == OP_CBRA)
6575     {
6576     int n = GET2(scode, 1+LINK_SIZE);
6577     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6578     if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
6579     }
6580
6581   /* Other brackets */
6582
6583   else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
6584     {
6585     if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
6586     }
6587
6588   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
6589   it isn't in brackets that are or may be referenced. */
6590
6591   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
6592             op == OP_TYPEPOSSTAR))
6593     {
6594     if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
6595       return FALSE;
6596     }
6597
6598   /* Check for explicit anchoring */
6599
6600   else if (op != OP_SOD && op != OP_SOM &&
6601           ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
6602     return FALSE;
6603   code += GET(code, 1);
6604   }
6605while (*code == OP_ALT);   /* Loop for each alternative */
6606return TRUE;
6607}
6608
6609
6610
6611/*************************************************
6612*         Check for starting with ^ or .*        *
6613*************************************************/
6614
6615/* This is called to find out if every branch starts with ^ or .* so that
6616"first char" processing can be done to speed things up in multiline
6617matching and for non-DOTALL patterns that start with .* (which must start at
6618the beginning or after \n). As in the case of is_anchored() (see above), we
6619have to take account of back references to capturing brackets that contain .*
6620because in that case we can't make the assumption.
6621
6622Arguments:
6623  code           points to start of expression (the bracket)
6624  bracket_map    a bitmap of which brackets we are inside while testing; this
6625                  handles up to substring 31; after that we just have to take
6626                  the less precise approach
6627  backref_map    the back reference bitmap
6628
6629Returns:         TRUE or FALSE
6630*/
6631
6632static BOOL
6633is_startline(const uschar *code, unsigned int bracket_map,
6634  unsigned int backref_map)
6635{
6636do {
6637   const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6638     NULL, 0, FALSE);
6639   register int op = *scode;
6640
6641   /* If we are at the start of a conditional assertion group, *both* the
6642   conditional assertion *and* what follows the condition must satisfy the test
6643   for start of line. Other kinds of condition fail. Note that there may be an
6644   auto-callout at the start of a condition. */
6645
6646   if (op == OP_COND)
6647     {
6648     scode += 1 + LINK_SIZE;
6649     if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6650     switch (*scode)
6651       {
6652       case OP_CREF:
6653       case OP_NCREF:
6654       case OP_RREF:
6655       case OP_NRREF:
6656       case OP_DEF:
6657       return FALSE;
6658
6659       default:     /* Assertion */
6660       if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6661       do scode += GET(scode, 1); while (*scode == OP_ALT);
6662       scode += 1 + LINK_SIZE;
6663       break;
6664       }
6665     scode = first_significant_code(scode, NULL, 0, FALSE);
6666     op = *scode;
6667     }
6668
6669   /* Non-capturing brackets */
6670
6671   if (op == OP_BRA)
6672     {
6673     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6674     }
6675
6676   /* Capturing brackets */
6677
6678   else if (op == OP_CBRA)
6679     {
6680     int n = GET2(scode, 1+LINK_SIZE);
6681     int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6682     if (!is_startline(scode, new_map, backref_map)) return FALSE;
6683     }
6684
6685   /* Other brackets */
6686
6687   else if (op == OP_ASSERT || op == OP_ONCE)
6688     {
6689     if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6690     }
6691
6692   /* .* means "start at start or after \n" if it isn't in brackets that
6693   may be referenced. */
6694
6695   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6696     {
6697     if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6698     }
6699
6700   /* Check for explicit circumflex */
6701
6702   else if (op != OP_CIRC) return FALSE;
6703
6704   /* Move on to the next alternative */
6705
6706   code += GET(code, 1);
6707   }
6708while (*code == OP_ALT);  /* Loop for each alternative */
6709return TRUE;
6710}
6711
6712
6713
6714/*************************************************
6715*       Check for asserted fixed first char      *
6716*************************************************/
6717
6718/* During compilation, the "first char" settings from forward assertions are
6719discarded, because they can cause conflicts with actual literals that follow.
6720However, if we end up without a first char setting for an unanchored pattern,
6721it is worth scanning the regex to see if there is an initial asserted first
6722char. If all branches start with the same asserted char, or with a bracket all
6723of whose alternatives start with the same asserted char (recurse ad lib), then
6724we return that char, otherwise -1.
6725
6726Arguments:
6727  code       points to start of expression (the bracket)
6728  options    pointer to the options (used to check casing changes)
6729  inassert   TRUE if in an assertion
6730
6731Returns:     -1 or the fixed first char
6732*/
6733
6734static int
6735find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6736{
6737register int c = -1;
6738do {
6739   int d;
6740   const uschar *scode =
6741     first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6742   register int op = *scode;
6743
6744   switch(op)
6745     {
6746     default:
6747     return -1;
6748
6749     case OP_BRA:
6750     case OP_CBRA:
6751     case OP_ASSERT:
6752     case OP_ONCE:
6753     case OP_COND:
6754     if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6755       return -1;
6756     if (c < 0) c = d; else if (c != d) return -1;
6757     break;
6758
6759     case OP_EXACT:       /* Fall through */
6760     scode += 2;
6761
6762     case OP_CHAR:
6763     case OP_CHARNC:
6764     case OP_PLUS:
6765     case OP_MINPLUS:
6766     case OP_POSPLUS:
6767     if (!inassert) return -1;
6768     if (c < 0)
6769       {
6770       c = scode[1];
6771       if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6772       }
6773     else if (c != scode[1]) return -1;
6774     break;
6775     }
6776
6777   code += GET(code, 1);
6778   }
6779while (*code == OP_ALT);
6780return c;
6781}
6782
6783
6784
6785/*************************************************
6786*        Compile a Regular Expression            *
6787*************************************************/
6788
6789/* This function takes a string and returns a pointer to a block of store
6790holding a compiled version of the expression. The original API for this
6791function had no error code return variable; it is retained for backwards
6792compatibility. The new function is given a new name.
6793
6794Arguments:
6795  pattern       the regular expression
6796  options       various option bits
6797  errorcodeptr  pointer to error code variable (pcre_compile2() only)
6798                  can be NULL if you don't want a code value
6799  errorptr      pointer to pointer to error text
6800  erroroffset   ptr offset in pattern where error was detected
6801  tables        pointer to character tables or NULL
6802
6803Returns:        pointer to compiled data block, or NULL on error,
6804                with errorptr and erroroffset set
6805*/
6806
6807PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6808pcre_compile(const char *pattern, int options, const char **errorptr,
6809  int *erroroffset, const unsigned char *tables)
6810{
6811return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6812}
6813
6814
6815PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6816pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6817  const char **errorptr, int *erroroffset, const unsigned char *tables)
6818{
6819real_pcre *re;
6820int length = 1;  /* For final END opcode */
6821int firstbyte, reqbyte, newline;
6822int errorcode = 0;
6823int skipatstart = 0;
6824BOOL utf8;
6825size_t size;
6826uschar *code;
6827const uschar *codestart;
6828const uschar *ptr;
6829compile_data compile_block;
6830compile_data *cd = &compile_block;
6831
6832/* This space is used for "compiling" into during the first phase, when we are
6833computing the amount of memory that is needed. Compiled items are thrown away
6834as soon as possible, so that a fairly large buffer should be sufficient for
6835this purpose. The same space is used in the second phase for remembering where
6836to fill in forward references to subpatterns. */
6837
6838uschar cworkspace[COMPILE_WORK_SIZE];
6839
6840/* Set this early so that early errors get offset 0. */
6841
6842ptr = (const uschar *)pattern;
6843
6844/* We can't pass back an error message if errorptr is NULL; I guess the best we
6845can do is just return NULL, but we can set a code value if there is a code
6846pointer. */
6847
6848if (errorptr == NULL)
6849  {
6850  if (errorcodeptr != NULL) *errorcodeptr = 99;
6851  return NULL;
6852  }
6853
6854*errorptr = NULL;
6855if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6856
6857/* However, we can give a message for this error */
6858
6859if (erroroffset == NULL)
6860  {
6861  errorcode = ERR16;
6862  goto PCRE_EARLY_ERROR_RETURN2;
6863  }
6864
6865*erroroffset = 0;
6866
6867/* Set up pointers to the individual character tables */
6868
6869if (tables == NULL) tables = _pcre_default_tables;
6870cd->lcc = tables + lcc_offset;
6871cd->fcc = tables + fcc_offset;
6872cd->cbits = tables + cbits_offset;
6873cd->ctypes = tables + ctypes_offset;
6874
6875/* Check that all undefined public option bits are zero */
6876
6877if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6878  {
6879  errorcode = ERR17;
6880  goto PCRE_EARLY_ERROR_RETURN;
6881  }
6882
6883/* Check for global one-time settings at the start of the pattern, and remember
6884the offset for later. */
6885
6886while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6887       ptr[skipatstart+1] == CHAR_ASTERISK)
6888  {
6889  int newnl = 0;
6890  int newbsr = 0;
6891
6892  if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6893    { skipatstart += 7; options |= PCRE_UTF8; continue; }
6894  else if (strncmp((char *)(ptr+skipatstart+2), STRING_UCP_RIGHTPAR, 4) == 0)
6895    { skipatstart += 6; options |= PCRE_UCP; continue; }
6896  else if (strncmp((char *)(ptr+skipatstart+2), STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
6897    { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
6898
6899  if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6900    { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6901  else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6902    { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6903  else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6904    { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6905  else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6906    { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6907  else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6908    { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6909
6910  else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6911    { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6912  else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6913    { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6914
6915  if (newnl != 0)
6916    options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6917  else if (newbsr != 0)
6918    options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6919  else break;
6920  }
6921
6922utf8 = (options & PCRE_UTF8) != 0;
6923
6924/* Can't support UTF8 unless PCRE has been compiled to include the code. */
6925
6926#ifdef SUPPORT_UTF8
6927if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6928     (*erroroffset = _pcre_valid_utf8((USPTR)pattern, -1)) >= 0)
6929  {
6930  errorcode = ERR44;
6931  goto PCRE_EARLY_ERROR_RETURN2;
6932  }
6933#else
6934if (utf8)
6935  {
6936  errorcode = ERR32;
6937  goto PCRE_EARLY_ERROR_RETURN;
6938  }
6939#endif
6940
6941/* Can't support UCP unless PCRE has been compiled to include the code. */
6942
6943#ifndef SUPPORT_UCP
6944if ((options & PCRE_UCP) != 0)
6945  {
6946  errorcode = ERR67;
6947  goto PCRE_EARLY_ERROR_RETURN;
6948  }
6949#endif
6950
6951/* Check validity of \R options. */
6952
6953switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6954  {
6955  case 0:
6956  case PCRE_BSR_ANYCRLF:
6957  case PCRE_BSR_UNICODE:
6958  break;
6959  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6960  }
6961
6962/* Handle different types of newline. The three bits give seven cases. The
6963current code allows for fixed one- or two-byte sequences, plus "any" and
6964"anycrlf". */
6965
6966switch (options & PCRE_NEWLINE_BITS)
6967  {
6968  case 0: newline = NEWLINE; break;   /* Build-time default */
6969  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6970  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6971  case PCRE_NEWLINE_CR+
6972       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6973  case PCRE_NEWLINE_ANY: newline = -1; break;
6974  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6975  default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6976  }
6977
6978if (newline == -2)
6979  {
6980  cd->nltype = NLTYPE_ANYCRLF;
6981  }
6982else if (newline < 0)
6983  {
6984  cd->nltype = NLTYPE_ANY;
6985  }
6986else
6987  {
6988  cd->nltype = NLTYPE_FIXED;
6989  if (newline > 255)
6990    {
6991    cd->nllen = 2;
6992    cd->nl[0] = (newline >> 8) & 255;
6993    cd->nl[1] = newline & 255;
6994    }
6995  else
6996    {
6997    cd->nllen = 1;
6998    cd->nl[0] = newline;
6999    }
7000  }
7001
7002/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7003references to help in deciding whether (.*) can be treated as anchored or not.
7004*/
7005
7006cd->top_backref = 0;
7007cd->backref_map = 0;
7008
7009/* Reflect pattern for debugging output */
7010
7011DPRINTF(("------------------------------------------------------------------\n"));
7012DPRINTF(("%s\n", pattern));
7013
7014/* Pretend to compile the pattern while actually just accumulating the length
7015of memory required. This behaviour is triggered by passing a non-NULL final
7016argument to compile_regex(). We pass a block of workspace (cworkspace) for it
7017to compile parts of the pattern into; the compiled code is discarded when it is
7018no longer needed, so hopefully this workspace will never overflow, though there
7019is a test for its doing so. */
7020
7021cd->bracount = cd->final_bracount = 0;
7022cd->names_found = 0;
7023cd->name_entry_size = 0;
7024cd->name_table = NULL;
7025cd->start_workspace = cworkspace;
7026cd->start_code = cworkspace;
7027cd->hwm = cworkspace;
7028cd->start_pattern = (const uschar *)pattern;
7029cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
7030cd->req_varyopt = 0;
7031cd->external_options = options;
7032cd->external_flags = 0;
7033cd->open_caps = NULL;
7034
7035/* Now do the pre-compile. On error, errorcode will be set non-zero, so we
7036don't need to look at the result of the function here. The initial options have
7037been put into the cd block so that they can be changed if an option setting is
7038found within the regex right at the beginning. Bringing initial option settings
7039outside can help speed up starting point checks. */
7040
7041ptr += skipatstart;
7042code = cworkspace;
7043*code = OP_BRA;
7044(void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
7045  &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
7046  &length);
7047if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7048
7049DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
7050  cd->hwm - cworkspace));
7051
7052if (length > MAX_PATTERN_SIZE)
7053  {
7054  errorcode = ERR20;
7055  goto PCRE_EARLY_ERROR_RETURN;
7056  }
7057
7058/* Compute the size of data block needed and get it, either from malloc or
7059externally provided function. Integer overflow should no longer be possible
7060because nowadays we limit the maximum value of cd->names_found and
7061cd->name_entry_size. */
7062
7063size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
7064re = (real_pcre *)(pcre_malloc)(size);
7065
7066if (re == NULL)
7067  {
7068  errorcode = ERR21;
7069  goto PCRE_EARLY_ERROR_RETURN;
7070  }
7071
7072/* Put in the magic number, and save the sizes, initial options, internal
7073flags, and character table pointer. NULL is used for the default character
7074tables. The nullpad field is at the end; it's there to help in the case when a
7075regex compiled on a system with 4-byte pointers is run on another with 8-byte
7076pointers. */
7077
7078re->magic_number = MAGIC_NUMBER;
7079re->size = (int)size;
7080re->options = cd->external_options;
7081re->flags = cd->external_flags;
7082re->dummy1 = 0;
7083re->first_byte = 0;
7084re->req_byte = 0;
7085re->name_table_offset = sizeof(real_pcre);
7086re->name_entry_size = cd->name_entry_size;
7087re->name_count = cd->names_found;
7088re->ref_count = 0;
7089re->tables = (tables == _pcre_default_tables)? NULL : tables;
7090re->nullpad = NULL;
7091
7092/* The starting points of the name/number translation table and of the code are
7093passed around in the compile data block. The start/end pattern and initial
7094options are already set from the pre-compile phase, as is the name_entry_size
7095field. Reset the bracket count and the names_found field. Also reset the hwm
7096field; this time it's used for remembering forward references to subpatterns.
7097*/
7098
7099cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7100cd->bracount = 0;
7101cd->names_found = 0;
7102cd->name_table = (uschar *)re + re->name_table_offset;
7103codestart = cd->name_table + re->name_entry_size * re->name_count;
7104cd->start_code = codestart;
7105cd->hwm = cworkspace;
7106cd->req_varyopt = 0;
7107cd->had_accept = FALSE;
7108cd->check_lookbehind = FALSE;
7109cd->open_caps = NULL;
7110
7111/* Set up a starting, non-extracting bracket, then compile the expression. On
7112error, errorcode will be set non-zero, so we don't need to look at the result
7113of the function here. */
7114
7115ptr = (const uschar *)pattern + skipatstart;
7116code = (uschar *)codestart;
7117*code = OP_BRA;
7118(void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
7119  &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
7120re->top_bracket = cd->bracount;
7121re->top_backref = cd->top_backref;
7122re->flags = cd->external_flags;
7123
7124if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
7125
7126/* If not reached end of pattern on success, there's an excess bracket. */
7127
7128if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
7129
7130/* Fill in the terminating state and check for disastrous overflow, but
7131if debugging, leave the test till after things are printed out. */
7132
7133*code++ = OP_END;
7134
7135#ifndef PCRE_DEBUG
7136if (code - codestart > length) errorcode = ERR23;
7137#endif
7138
7139/* Fill in any forward references that are required. */
7140
7141while (errorcode == 0 && cd->hwm > cworkspace)
7142  {
7143  int offset, recno;
7144  const uschar *groupptr;
7145  cd->hwm -= LINK_SIZE;
7146  offset = GET(cd->hwm, 0);
7147  recno = GET(codestart, offset);
7148  groupptr = _pcre_find_bracket(codestart, utf8, recno);
7149  if (groupptr == NULL) errorcode = ERR53;
7150    else PUT(((uschar *)codestart), offset, (int)(groupptr - codestart));
7151  }
7152
7153/* Give an error if there's back reference to a non-existent capturing
7154subpattern. */
7155
7156if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
7157
7158/* If there were any lookbehind assertions that contained OP_RECURSE
7159(recursions or subroutine calls), a flag is set for them to be checked here,
7160because they may contain forward references. Actual recursions can't be fixed
7161length, but subroutine calls can. It is done like this so that those without
7162OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
7163exceptional ones forgo this. We scan the pattern to check that they are fixed
7164length, and set their lengths. */
7165
7166if (cd->check_lookbehind)
7167  {
7168  uschar *cc = (uschar *)codestart;
7169
7170  /* Loop, searching for OP_REVERSE items, and process those that do not have
7171  their length set. (Actually, it will also re-process any that have a length
7172  of zero, but that is a pathological case, and it does no harm.) When we find
7173  one, we temporarily terminate the branch it is in while we scan it. */
7174
7175  for (cc = (uschar *)_pcre_find_bracket(codestart, utf8, -1);
7176       cc != NULL;
7177       cc = (uschar *)_pcre_find_bracket(cc, utf8, -1))
7178    {
7179    if (GET(cc, 1) == 0)
7180      {
7181      int fixed_length;
7182      uschar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
7183      int end_op = *be;
7184      *be = OP_END;
7185      fixed_length = find_fixedlength(cc, re->options, TRUE, cd);
7186      *be = end_op;
7187      DPRINTF(("fixed length = %d\n", fixed_length));
7188      if (fixed_length < 0)
7189        {
7190        errorcode = (fixed_length == -2)? ERR36 : ERR25;
7191        break;
7192        }
7193      PUT(cc, 1, fixed_length);
7194      }
7195    cc += 1 + LINK_SIZE;
7196    }
7197  }
7198
7199/* Failed to compile, or error while post-processing */
7200
7201if (errorcode != 0)
7202  {
7203  (pcre_free)(re);
7204  PCRE_EARLY_ERROR_RETURN:
7205  *erroroffset = (int)(ptr - (const uschar *)pattern);
7206  PCRE_EARLY_ERROR_RETURN2:
7207  *errorptr = find_error_text(errorcode);
7208  if (errorcodeptr != NULL) *errorcodeptr = errorcode;
7209  return NULL;
7210  }
7211
7212/* If the anchored option was not passed, set the flag if we can determine that
7213the pattern is anchored by virtue of ^ characters or \A or anything else (such
7214as starting with .* when DOTALL is set).
7215
7216Otherwise, if we know what the first byte has to be, save it, because that
7217speeds up unanchored matches no end. If not, see if we can set the
7218PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
7219start with ^, and also when all branches start with .* for non-DOTALL matches.
7220*/
7221
7222if ((re->options & PCRE_ANCHORED) == 0)
7223  {
7224  int temp_options = re->options;   /* May get changed during these scans */
7225  if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
7226    re->options |= PCRE_ANCHORED;
7227  else
7228    {
7229    if (firstbyte < 0)
7230      firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
7231    if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
7232      {
7233      int ch = firstbyte & 255;
7234      re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
7235         cd->fcc[ch] == ch)? ch : firstbyte;
7236      re->flags |= PCRE_FIRSTSET;
7237      }
7238    else if (is_startline(codestart, 0, cd->backref_map))
7239      re->flags |= PCRE_STARTLINE;
7240    }
7241  }
7242
7243/* For an anchored pattern, we use the "required byte" only if it follows a
7244variable length item in the regex. Remove the caseless flag for non-caseable
7245bytes. */
7246
7247if (reqbyte >= 0 &&
7248     ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
7249  {
7250  int ch = reqbyte & 255;
7251  re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
7252    cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
7253  re->flags |= PCRE_REQCHSET;
7254  }
7255
7256/* Print out the compiled data if debugging is enabled. This is never the
7257case when building a production library. */
7258
7259#ifdef PCRE_DEBUG
7260printf("Length = %d top_bracket = %d top_backref = %d\n",
7261  length, re->top_bracket, re->top_backref);
7262
7263printf("Options=%08x\n", re->options);
7264
7265if ((re->flags & PCRE_FIRSTSET) != 0)
7266  {
7267  int ch = re->first_byte & 255;
7268  const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
7269    "" : " (caseless)";
7270  if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
7271    else printf("First char = \\x%02x%s\n", ch, caseless);
7272  }
7273
7274if ((re->flags & PCRE_REQCHSET) != 0)
7275  {
7276  int ch = re->req_byte & 255;
7277  const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
7278    "" : " (caseless)";
7279  if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
7280    else printf("Req char = \\x%02x%s\n", ch, caseless);
7281  }
7282
7283pcre_printint(re, stdout, TRUE);
7284
7285/* This check is done here in the debugging case so that the code that
7286was compiled can be seen. */
7287
7288if (code - codestart > length)
7289  {
7290  (pcre_free)(re);
7291  *errorptr = find_error_text(ERR23);
7292  *erroroffset = ptr - (uschar *)pattern;
7293  if (errorcodeptr != NULL) *errorcodeptr = ERR23;
7294  return NULL;
7295  }
7296#endif   /* PCRE_DEBUG */
7297
7298return (pcre *)re;
7299}
7300
7301/* End of pcre_compile.c */
7302