pcre2_compile.c revision 8b979b2abae173bb836d8e85a842cfd00447d4be
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9     Original API code Copyright (c) 1997-2012 University of Cambridge
10         New API code Copyright (c) 2016 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42#ifdef HAVE_CONFIG_H
43#include "config.h"
44#endif
45
46#define NLBLOCK cb             /* Block containing newline information */
47#define PSSTART start_pattern  /* Field containing processed string start */
48#define PSEND   end_pattern    /* Field containing processed string end */
49
50#include "pcre2_internal.h"
51
52/* In rare error cases debugging might require calling pcre2_printint(). */
53
54#if 0
55#ifdef EBCDIC
56#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
57#else
58#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
59#endif
60#include "pcre2_printint.c"
61#define CALL_PRINTINT
62#endif
63
/* A few definitions depend on the code unit width. They are selected here, in
one place, so that the rest of the code needs fewer #if tests.

XDIGIT(c) looks up a code unit in xdigitab[]. In the 16-bit and 32-bit
libraries the lookup is guarded by MAX_255(), and 0xff (the table's value for
a non-hex digit) is used when the guard fails.

STRING_UTFn_RIGHTPAR expands to TWO items: the width-specific "UTFn)" string
followed by its length. */

#if PCRE2_CODE_UNIT_WIDTH == 8
#define XDIGIT(c)                xdigitab[c]
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5

#elif PCRE2_CODE_UNIT_WIDTH == 16
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6

#else  /* 32-bit */
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
#endif
81
82/* Function definitions to allow mutual recursion */
83
84static unsigned int
85  add_list_to_class(uint8_t *, PCRE2_UCHAR **, uint32_t, compile_block *,
86    const uint32_t *, unsigned int);
87
88static BOOL
89  compile_regex(uint32_t, PCRE2_UCHAR **, PCRE2_SPTR *, int *, BOOL, BOOL,
90    uint32_t, int, uint32_t *, int32_t *, uint32_t *, int32_t *,
91    branch_chain *, compile_block *, size_t *);
92
93
94
95/*************************************************
96*      Code parameters and static tables         *
97*************************************************/
98
99/* This value specifies the size of stack workspace, which is used in different
100ways in the different pattern scans. The group-identifying pre-scan uses it to
101handle nesting, and needs it to be 16-bit aligned.
102
103During the first compiling phase, when determining how much memory is required,
104the regex is partly compiled into this space, but the compiled parts are
105discarded as soon as they can be, so that hopefully there will never be an
106overrun. The code does, however, check for an overrun, which can occur for
107pathological patterns. The size of the workspace depends on LINK_SIZE because
108the length of compiled items varies with this.
109
110In the real compile phase, the workspace is used for remembering data about
111numbered groups, provided there are not too many of them (if there are, extra
112memory is acquired). For this phase the memory must be 32-bit aligned. Having
113defined the size in code units, we set up C32_WORK_SIZE as the number of
114elements in the 32-bit vector. */
115
116#define COMPILE_WORK_SIZE (2048*LINK_SIZE)   /* Size in code units */
117
118#define C32_WORK_SIZE \
119  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint32_t))
120
121/* The overrun tests check for a slightly smaller size so that they detect the
122overrun before it actually does run off the end of the data block. */
123
124#define WORK_SIZE_SAFETY_MARGIN (100)
125
126/* This value determines the size of the initial vector that is used for
127remembering named groups during the pre-compile. It is allocated on the stack,
128but if it is too small, it is expanded, in a similar way to the workspace. The
129value is the number of slots in the list. */
130
131#define NAMED_GROUP_LIST_SIZE  20
132
133/* The original PCRE required patterns to be zero-terminated, and it simplifies
134the compiling code if it is guaranteed that there is a zero code unit at the
135end of the pattern, because this means that tests for coding sequences such as
136(*SKIP) or even just (?<= can check a sequence of code units without having to
137keep checking for the end of the pattern. The new PCRE2 API allows zero code
138units within patterns if a positive length is given, but in order to keep most
139of the compiling code as it was, we copy such patterns and add a zero on the
140end. This value determines the size of space on the stack that is used if the
141pattern fits; if not, heap memory is used. */
142
143#define COPIED_PATTERN_SIZE 1024
144
145/* Maximum length value to check against when making sure that the variable
146that holds the compiled pattern length does not overflow. We make it a bit less
147than INT_MAX to allow for adding in group terminating bytes, so that we don't
148have to check them every time. */
149
150#define OFLOW_MAX (INT_MAX - 20)
151
/* Macro for setting individual bits in class bitmaps: it sets bit number (b)
in the byte vector (a), using eight bits per byte with the least significant
bit of each byte first. It took some experimenting to figure out how to stop
gcc 5.3.0 from warning with -Wconversion. This version gets a warning:

  #define SETBIT(a,b) a[(b)/8] |= (uint8_t)(1 << ((b)&7))

Let's hope the apparently less efficient version isn't actually so bad if the
compiler is clever with identical subexpressions.

Both parameters are parenthesized and the whole expansion is a single
parenthesized expression, so that an argument such as "ptr + n" is indexed as
a whole. Each argument is evaluated more than once, so arguments must not have
side effects. */

#define SETBIT(a,b) \
  ((a)[(b)/8] = (uint8_t)((a)[(b)/8] | (1 << ((b)&7))))
162
/* Private flag bits that are added to firstcu and reqcu. */

#define REQ_CASELESS    0x00000001      /* Indicates caselessness */
#define REQ_VARY        0x00000002      /* reqcu followed non-literal item */

/* Negative values for the firstcu and reqcu flags. */

#define REQ_NONE        (-1)            /* Found not fixed char */
#define REQ_UNSET       (-2)            /* Not yet found anything */
170
/* Flags used in the groupinfo vector: four flag bits at the top of a 32-bit
value, and a mask for its low-order 16 bits. */

#define GI_SET_COULD_BE_EMPTY  (1u << 31)
#define GI_COULD_BE_EMPTY      (1u << 30)
#define GI_NOT_FIXED_LENGTH    (1u << 29)
#define GI_SET_FIXED_LENGTH    (1u << 28)
#define GI_FIXED_LENGTH_MASK   0xffffu
178
/* This bit (which is greater than any UTF value) is used to indicate that a
variable contains a number of code units instead of an actual code point. The
constant has type long; the suffix is written as an upper case L because a
lower case one is too easily misread as the digit 1. */

#define UTF_LENGTH     0x10000000L
183
184/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
185and is fast (a good compiler can turn it into a subtraction and unsigned
186comparison). */
187
188#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
189
/* Table to identify hex digits. The tables in chartables are dependent on the
locale, and may mark arbitrary characters as digits. We want to recognize only
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
costs 256 bytes, but it is a lot faster than doing character value tests (at
least in some simple cases I timed), and in some applications one wants PCRE to
compile efficiently as well as match efficiently. The value in the table is
the binary hex digit value, or 0xff for non-hex digits. */
197
/* xdigitab[] maps a character value in the range 0-255 to the value of the
hex digit it represents, or to 0xff if it is not a hex digit. The dimension is
given explicitly because XDIGIT() indexes the table with values up to 255;
with a stated size the compiler rejects any excess initializer.

This is the "normal" case, for ASCII systems, and EBCDIC systems running in
UTF-8 mode. */

#ifndef EBCDIC
static const uint8_t xdigitab[256] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */

#else

/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */

static const uint8_t xdigitab[256] =
  {
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
#endif  /* EBCDIC */
276
277
278/* Table for handling alphanumeric escaped characters. Positive returns are
279simple data values; negative values are for special things like \d and so on.
280Zero means further processing is needed (for things like \x), or the escape is
281invalid. */
282
283/* This is the "normal" table for ASCII systems or for EBCDIC systems running
284in UTF-8 mode. It runs from '0' to 'z'. */
285
286#ifndef EBCDIC
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters for which
the escapes[] table has an entry. UPPER_CASE() yields the upper case form of a
lower case ASCII letter by subtracting 32; its argument is parenthesized so
that an expression argument (for example a conditional expression) is
converted as a whole. */

#define ESCAPES_FIRST       CHAR_0
#define ESCAPES_LAST        CHAR_z
#define UPPER_CASE(c)       ((c)-32)
290
291static const short int escapes[] = {
292     0,                       0,
293     0,                       0,
294     0,                       0,
295     0,                       0,
296     0,                       0,
297     CHAR_COLON,              CHAR_SEMICOLON,
298     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
299     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
300     CHAR_COMMERCIAL_AT,      -ESC_A,
301     -ESC_B,                  -ESC_C,
302     -ESC_D,                  -ESC_E,
303     0,                       -ESC_G,
304     -ESC_H,                  0,
305     0,                       -ESC_K,
306     0,                       0,
307     -ESC_N,                  0,
308     -ESC_P,                  -ESC_Q,
309     -ESC_R,                  -ESC_S,
310     0,                       0,
311     -ESC_V,                  -ESC_W,
312     -ESC_X,                  0,
313     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
314     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
315     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
316     CHAR_GRAVE_ACCENT,       ESC_a,
317     -ESC_b,                  0,
318     -ESC_d,                  ESC_e,
319     ESC_f,                   0,
320     -ESC_h,                  0,
321     0,                       -ESC_k,
322     0,                       0,
323     ESC_n,                   0,
324     -ESC_p,                  0,
325     ESC_r,                   -ESC_s,
326     ESC_tee,                 0,
327     -ESC_v,                  -ESC_w,
328     0,                       0,
329     -ESC_z
330};
331
332#else
333
334/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
335It runs from 'a' to '9'. For some minimal testing of EBCDIC features, the code
336is sometimes compiled on an ASCII system. In this case, we must not use CHAR_a
337because it is defined as 'a', which of course picks up the ASCII value. */
338
/* ESCAPES_FIRST and ESCAPES_LAST are the first and last characters for which
the EBCDIC escapes[] table has an entry. In a real EBCDIC environment a lower
case letter is converted to upper case by adding 64; when testing in an ASCII
environment 32 is subtracted instead. The argument of UPPER_CASE() is
parenthesized so that an expression argument is converted as a whole. */

#if 'a' == 0x81                    /* Check for a real EBCDIC environment */
#define ESCAPES_FIRST       CHAR_a
#define ESCAPES_LAST        CHAR_9
#define UPPER_CASE(c)       ((c)+64)
#else                              /* Testing in an ASCII environment */
#define ESCAPES_FIRST  ((unsigned char)'\x81')   /* EBCDIC 'a' */
#define ESCAPES_LAST   ((unsigned char)'\xf9')   /* EBCDIC '9' */
#define UPPER_CASE(c)  ((c)-32)
#endif
348
349static const short int escapes[] = {
350/*  80 */        ESC_a, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
351/*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
352/*  90 */     0,     0, -ESC_k,       0,      0, ESC_n,      0, -ESC_p,
353/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
354/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
355/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
356/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
357/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
358/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
359/*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
360/*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
361/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
362/*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
363/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
364/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
365/*  F8 */     0,     0
366};
367
/* In an EBCDIC environment a table is also needed of the characters that may
follow \c, one for each of the characters 0-31. It is written as two adjacent
string literals of sixteen characters each, which the compiler joins into a
single string. */

static unsigned char ebcdic_escape_c[] =
  "@ABCDEFGHIJKLMNO"
  "PQRSTUVWXYZ[\\]^_";
372
373#endif   /* EBCDIC */
374
375
376/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
377searched linearly. Put all the names into a single string, in order to reduce
378the number of relocations when a shared library is dynamically linked. The
379string is built from string macros so that it works in UTF-8 mode on EBCDIC
380platforms. */
381
382typedef struct verbitem {
383  int   len;                 /* Length of verb name */
384  int   op;                  /* Op when no arg, or -1 if arg mandatory */
385  int   op_arg;              /* Op when arg present, or -1 if not allowed */
386} verbitem;
387
388static const char verbnames[] =
389  "\0"                       /* Empty name is a shorthand for MARK */
390  STRING_MARK0
391  STRING_ACCEPT0
392  STRING_COMMIT0
393  STRING_F0
394  STRING_FAIL0
395  STRING_PRUNE0
396  STRING_SKIP0
397  STRING_THEN;
398
399static const verbitem verbs[] = {
400  { 0, -1,        OP_MARK },
401  { 4, -1,        OP_MARK },
402  { 6, OP_ACCEPT, -1 },
403  { 6, OP_COMMIT, -1 },
404  { 1, OP_FAIL,   -1 },
405  { 4, OP_FAIL,   -1 },
406  { 5, OP_PRUNE,  OP_PRUNE_ARG },
407  { 4, OP_SKIP,   OP_SKIP_ARG  },
408  { 4, OP_THEN,   OP_THEN_ARG  }
409};
410
411static const int verbcount = sizeof(verbs)/sizeof(verbitem);
412
413
414/* Substitutes for [[:<:]] and [[:>:]], which mean start and end of word in
415another regex library. */
416
417static const PCRE2_UCHAR sub_start_of_word[] = {
418  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
419  CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w, CHAR_RIGHT_PARENTHESIS, '\0' };
420
421static const PCRE2_UCHAR sub_end_of_word[] = {
422  CHAR_BACKSLASH, CHAR_b, CHAR_LEFT_PARENTHESIS, CHAR_QUESTION_MARK,
423  CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, CHAR_BACKSLASH, CHAR_w,
424  CHAR_RIGHT_PARENTHESIS, '\0' };
425
426
427/* Tables of names of POSIX character classes and their lengths. The names are
428now all in a single string, to reduce the number of relocations when a shared
429library is dynamically loaded. The list of lengths is terminated by a zero
430length entry. The first three must be alpha, lower, upper, as this is assumed
431for handling case independence. The indices for graph, print, and punct are
432needed, so identify them. */
433
434static const char posix_names[] =
435  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
436  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
437  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
438  STRING_word0  STRING_xdigit;
439
/* Lengths of the POSIX class names, in the same order as the names, ending
with a zero entry that marks the end of the list. */

static const uint8_t posix_name_lengths[] = {
  5,     /* alpha */
  5,     /* lower */
  5,     /* upper */
  5,     /* alnum */
  5,     /* ascii */
  5,     /* blank */
  5,     /* cntrl */
  5,     /* digit */
  5,     /* graph */
  5,     /* print */
  5,     /* punct */
  5,     /* space */
  4,     /* word */
  6,     /* xdigit */
  0 };   /* end of list */

/* Positions in the list of the classes that need to be identified. */

#define PC_GRAPH  8
#define PC_PRINT  9
#define PC_PUNCT 10
446
447
448/* Table of class bit maps for each POSIX class. Each class is formed from a
449base map, with an optional addition or removal of another map. Then, for some
450classes, there is some additional tweaking: for [:blank:] the vertical space
451characters are removed, and for [:alpha:] and [:alnum:] the underscore
452character is removed. The triples in the table consist of the base map offset,
453second map offset or -1 if no second map, and a non-negative value for map
454addition or a negative value for map subtraction (if there are two maps). The
455absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
456remove vertical space characters, 2 => remove underscore. */
457
458static const int posix_class_maps[] = {
459  cbit_word,  cbit_digit, -2,             /* alpha */
460  cbit_lower, -1,          0,             /* lower */
461  cbit_upper, -1,          0,             /* upper */
462  cbit_word,  -1,          2,             /* alnum - word without underscore */
463  cbit_print, cbit_cntrl,  0,             /* ascii */
464  cbit_space, -1,          1,             /* blank - a GNU extension */
465  cbit_cntrl, -1,          0,             /* cntrl */
466  cbit_digit, -1,          0,             /* digit */
467  cbit_graph, -1,          0,             /* graph */
468  cbit_print, -1,          0,             /* print */
469  cbit_punct, -1,          0,             /* punct */
470  cbit_space, -1,          0,             /* space */
471  cbit_word,  -1,          0,             /* word - a Perl extension */
472  cbit_xdigit,-1,          0              /* xdigit */
473};
474
/* Table of substitutes for \d etc when PCRE2_UCP is set. They are replaced by
Unicode property escapes. */

#ifdef SUPPORT_UNICODE

/* Every property substitute is a zero-terminated string of code units of the
form backslash, p or P, left curly bracket, a property name of one to three
characters, right curly bracket. These local macros build the initializer for
such a string; they are undefined again once the tables have been set up. */

#define SUBST_PROP1(pP,n1) \
  { CHAR_BACKSLASH, pP, CHAR_LEFT_CURLY_BRACKET, \
    n1, CHAR_RIGHT_CURLY_BRACKET, '\0' }

#define SUBST_PROP2(pP,n1,n2) \
  { CHAR_BACKSLASH, pP, CHAR_LEFT_CURLY_BRACKET, \
    n1, n2, CHAR_RIGHT_CURLY_BRACKET, '\0' }

#define SUBST_PROP3(pP,n1,n2,n3) \
  { CHAR_BACKSLASH, pP, CHAR_LEFT_CURLY_BRACKET, \
    n1, n2, n3, CHAR_RIGHT_CURLY_BRACKET, '\0' }

static const PCRE2_UCHAR string_PNd[]  =
  SUBST_PROP2(CHAR_P, CHAR_N, CHAR_d);
static const PCRE2_UCHAR string_pNd[]  =
  SUBST_PROP2(CHAR_p, CHAR_N, CHAR_d);
static const PCRE2_UCHAR string_PXsp[] =
  SUBST_PROP3(CHAR_P, CHAR_X, CHAR_s, CHAR_p);
static const PCRE2_UCHAR string_pXsp[] =
  SUBST_PROP3(CHAR_p, CHAR_X, CHAR_s, CHAR_p);
static const PCRE2_UCHAR string_PXwd[] =
  SUBST_PROP3(CHAR_P, CHAR_X, CHAR_w, CHAR_d);
static const PCRE2_UCHAR string_pXwd[] =
  SUBST_PROP3(CHAR_p, CHAR_X, CHAR_w, CHAR_d);

/* Xsp is Perl space, but from 8.34, Perl space and POSIX space are the
same. */

static PCRE2_SPTR substitutes[] = {
  string_PNd,           /* \D */
  string_pNd,           /* \d */
  string_PXsp,          /* \S */
  string_pXsp,          /* \s */
  string_PXwd,          /* \W */
  string_pXwd           /* \w */
};

/* The POSIX class substitutes must be in the order of the POSIX class names,
defined above, and there are both positive and negative cases. NULL means no
general substitute of a Unicode property escape (\p or \P). However, for some
POSIX classes (e.g. graph, print, punct) a special property code is compiled
directly. */

static const PCRE2_UCHAR string_pCc[]  =
  SUBST_PROP2(CHAR_p, CHAR_C, CHAR_c);
static const PCRE2_UCHAR string_pL[]   =
  SUBST_PROP1(CHAR_p, CHAR_L);
static const PCRE2_UCHAR string_pLl[]  =
  SUBST_PROP2(CHAR_p, CHAR_L, CHAR_l);
static const PCRE2_UCHAR string_pLu[]  =
  SUBST_PROP2(CHAR_p, CHAR_L, CHAR_u);
static const PCRE2_UCHAR string_pXan[] =
  SUBST_PROP3(CHAR_p, CHAR_X, CHAR_a, CHAR_n);
static const PCRE2_UCHAR string_h[]    =
  { CHAR_BACKSLASH, CHAR_h, '\0' };
static const PCRE2_UCHAR string_pXps[] =
  SUBST_PROP3(CHAR_p, CHAR_X, CHAR_p, CHAR_s);
static const PCRE2_UCHAR string_PCc[]  =
  SUBST_PROP2(CHAR_P, CHAR_C, CHAR_c);
static const PCRE2_UCHAR string_PL[]   =
  SUBST_PROP1(CHAR_P, CHAR_L);
static const PCRE2_UCHAR string_PLl[]  =
  SUBST_PROP2(CHAR_P, CHAR_L, CHAR_l);
static const PCRE2_UCHAR string_PLu[]  =
  SUBST_PROP2(CHAR_P, CHAR_L, CHAR_u);
static const PCRE2_UCHAR string_PXan[] =
  SUBST_PROP3(CHAR_P, CHAR_X, CHAR_a, CHAR_n);
static const PCRE2_UCHAR string_H[]    =
  { CHAR_BACKSLASH, CHAR_H, '\0' };
static const PCRE2_UCHAR string_PXps[] =
  SUBST_PROP3(CHAR_P, CHAR_X, CHAR_p, CHAR_s);

#undef SUBST_PROP1
#undef SUBST_PROP2
#undef SUBST_PROP3

/* Xps is POSIX space, but from 8.34 Perl and POSIX space are the same. */

static PCRE2_SPTR posix_substitutes[] = {
  /* Positive cases */
  string_pL,            /* alpha */
  string_pLl,           /* lower */
  string_pLu,           /* upper */
  string_pXan,          /* alnum */
  NULL,                 /* ascii */
  string_h,             /* blank */
  string_pCc,           /* cntrl */
  string_pNd,           /* digit */
  NULL,                 /* graph */
  NULL,                 /* print */
  NULL,                 /* punct */
  string_pXps,          /* space */
  string_pXwd,          /* word  */
  NULL,                 /* xdigit */
  /* Negated cases */
  string_PL,            /* ^alpha */
  string_PLl,           /* ^lower */
  string_PLu,           /* ^upper */
  string_PXan,          /* ^alnum */
  NULL,                 /* ^ascii */
  string_H,             /* ^blank */
  string_PCc,           /* ^cntrl */
  string_PNd,           /* ^digit */
  NULL,                 /* ^graph */
  NULL,                 /* ^print */
  NULL,                 /* ^punct */
  string_PXps,          /* ^space */
  string_PXwd,          /* ^word */
  NULL                  /* ^xdigit */
};

/* Number of entries in posix_substitutes[]. */

#define POSIX_SUBSIZE \
  (sizeof(posix_substitutes) / sizeof(posix_substitutes[0]))

#endif  /* SUPPORT_UNICODE */
587
/* Mask for checking option settings: the inclusive OR of the PCRE2_xxx
compile option bits listed here, one per line. */

#define PUBLIC_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_FIRSTLINE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)
599
600/* Compile time error code numbers. They are given names so that they can more
601easily be tracked. When a new number is added, the tables called eint1 and
602eint2 in pcre2posix.c may need to be updated, and a new error text must be
603added to compile_error_texts in pcre2_error.c. */
604
605enum { ERR0 = COMPILE_ERROR_BASE,
606       ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,  ERR10,
607       ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR20,
608       ERR21, ERR22, ERR23, ERR24, ERR25, ERR26, ERR27, ERR28, ERR29, ERR30,
609       ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR40,
610       ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR50,
611       ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR60,
612       ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68, ERR69, ERR70,
613       ERR71, ERR72, ERR73, ERR74, ERR75, ERR76, ERR77, ERR78, ERR79, ERR80,
614       ERR81, ERR82, ERR83, ERR84, ERR85, ERR86, ERR87, ERR88 };
615
616/* Error codes that correspond to negative error codes returned by
617find_fixedlength(). */
618
619static int fixed_length_errors[] =
620  {
621  ERR0,    /* Not an error */
622  ERR0,    /* Not an error; -1 is used for "process later" */
623  ERR25,   /* Lookbehind is not fixed length */
624  ERR36,   /* \C in lookbehind is not allowed */
625  ERR87,   /* Lookbehind is too long */
626  ERR86,   /* Pattern too complicated */
627  ERR70    /* Internal error: unknown opcode encountered */
628  };
629
/* Definitions for the table of start-of-pattern options such as (*UTF) and
settings such as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and
backward compatibility, (*UTFn) is supported in the relevant libraries, but
(*UTF) is generic and always supported.

Each table entry has a type, which says how its value is to be used: */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMR     /* Read integer value for recursion limit */
};

/* The layout of a table entry. */

typedef struct pso {
  const uint8_t *name;    /* Text of the item */
  uint16_t length;        /* Length of the text */
  uint16_t type;          /* One of the PSO_xxx values */
  uint32_t value;         /* Value, whose meaning depends on the type */
} pso;
648
649/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
650
651static pso pso_list[] = {
652  { (uint8_t *)STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
653  { (uint8_t *)STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
654  { (uint8_t *)STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
655  { (uint8_t *)STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
656  { (uint8_t *)STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
657  { (uint8_t *)STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
658  { (uint8_t *)STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
659  { (uint8_t *)STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
660  { (uint8_t *)STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
661  { (uint8_t *)STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
662  { (uint8_t *)STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMR, 0 },
663  { (uint8_t *)STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
664  { (uint8_t *)STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
665  { (uint8_t *)STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
666  { (uint8_t *)STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
667  { (uint8_t *)STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
668  { (uint8_t *)STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
669  { (uint8_t *)STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
670};
671
672/* This table is used when converting repeating opcodes into possessified
673versions as a result of an explicit possessive quantifier such as ++. A zero
674value means there is no possessified version - in those cases the item in
675question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
676because all relevant opcodes are less than that. */
677
/* NOTE(review): the table is presumably indexed directly by opcode number, so
the entries must stay in step with the opcode definitions; the comment on each
row names the opcodes that the row is intended to cover. */

static const uint8_t opcode_possessify[] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */

  0,                       /* NOTI */
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
  0,                       /* EXACT */
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */

  OP_POSSTARI, 0,          /* STARI, MINSTARI */
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
  0,                       /* EXACTI */
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
  0,                       /* NOTEXACT */
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */

  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
  0,                       /* NOTEXACTI */
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */

  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
  0,                       /* TYPEEXACT */
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */

  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */

  0, 0, 0,                 /* CLASS, NCLASS, XCLASS */
  0, 0,                    /* REF, REFI */
  0, 0,                    /* DNREF, DNREFI */
  0, 0                     /* RECURSE, CALLOUT (last entry; table stops here) */
};
729
730
731
732/*************************************************
733*               Copy compiled code               *
734*************************************************/
735
736/* Compiled JIT code cannot be copied, so the new compiled block has no
737associated JIT data. */
738
739PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
740pcre2_code_copy(const pcre2_code *code)
741{
742PCRE2_SIZE* ref_count;
743pcre2_code *newcode;
744
745if (code == NULL) return NULL;
746newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
747if (newcode == NULL) return NULL;
748memcpy(newcode, code, code->blocksize);
749newcode->executable_jit = NULL;
750
751/* If the code is one that has been deserialized, increment the reference count
752in the decoded tables. */
753
754if ((code->flags & PCRE2_DEREF_TABLES) != 0)
755  {
756  ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
757  (*ref_count)++;
758  }
759
760return newcode;
761}
762
763
764
765/*************************************************
766*               Free compiled code               *
767*************************************************/
768
769PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
770pcre2_code_free(pcre2_code *code)
771{
772PCRE2_SIZE* ref_count;
773
774if (code != NULL)
775  {
776  if (code->executable_jit != NULL)
777    PRIV(jit_free)(code->executable_jit, &code->memctl);
778
779  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
780    {
781    /* Decoded tables belong to the codes after deserialization, and they must
782    be freed when there are no more reference to them. The *ref_count should
783    always be > 0. */
784
785    ref_count = (PCRE2_SIZE *)(code->tables + tables_length);
786    if (*ref_count > 0)
787      {
788      (*ref_count)--;
789      if (*ref_count == 0)
790        code->memctl.free((void *)code->tables, code->memctl.memory_data);
791      }
792    }
793
794  code->memctl.free(code, code->memctl.memory_data);
795  }
796}
797
798
799
800/*************************************************
801*        Insert an automatic callout point       *
802*************************************************/
803
804/* This function is called when the PCRE2_AUTO_CALLOUT option is set, to insert
805callout points before each pattern item.
806
807Arguments:
808  code           current code pointer
809  ptr            current pattern pointer
810  cb             general compile-time data
811
812Returns:         new code pointer
813*/
814
815static PCRE2_UCHAR *
816auto_callout(PCRE2_UCHAR *code, PCRE2_SPTR ptr, compile_block *cb)
817{
818code[0] = OP_CALLOUT;
819PUT(code, 1, ptr - cb->start_pattern);  /* Pattern offset */
820PUT(code, 1 + LINK_SIZE, 0);            /* Default length */
821code[1 + 2*LINK_SIZE] = 255;
822return code + PRIV(OP_lengths)[OP_CALLOUT];
823}
824
825
826
827/*************************************************
828*         Complete a callout item                *
829*************************************************/
830
831/* A callout item contains the length of the next item in the pattern, which
832we can't fill in till after we have reached the relevant point. This is used
833for both automatic and manual callouts.
834
835Arguments:
836  previous_callout   points to previous callout item
837  ptr                current pattern pointer
838  cb                 general compile-time data
839
840Returns:             nothing
841*/
842
843static void
844complete_callout(PCRE2_UCHAR *previous_callout, PCRE2_SPTR ptr,
845  compile_block *cb)
846{
847size_t length = (size_t)(ptr - cb->start_pattern - GET(previous_callout, 1));
848PUT(previous_callout, 1 + LINK_SIZE, length);
849}
850
851
852
853/*************************************************
854*        Find the fixed length of a branch       *
855*************************************************/
856
857/* Scan a branch and compute the fixed length of subject that will match it, if
858the length is fixed. This is needed for dealing with lookbehind assertions. In
859UTF mode, the result is in code units rather than bytes. The branch is
860temporarily terminated with OP_END when this function is called.
861
862This function is called when a lookbehind assertion is encountered, so that if
863it fails, the error message can point to the correct place in the pattern.
864However, we cannot do this when the assertion contains subroutine calls,
865because they can be forward references. We solve this by remembering this case
866and doing the check at the end; a flag specifies which mode we are running in.
867
868Lookbehind lengths are held in 16-bit fields and the maximum value is defined
869as LOOKBEHIND_MAX.
870
871Arguments:
872  code        points to the start of the pattern (the bracket)
873  utf         TRUE in UTF mode
874  atend       TRUE if called when the pattern is complete
875  cb          the "compile data" structure
876  recurses    chain of recurse_check to catch mutual recursion
877  countptr    pointer to counter, to catch over-complexity
878
879Returns:   if non-negative, the fixed length,
880             or -1 if an OP_RECURSE item was encountered and atend is FALSE
881             or -2 if there is no fixed length,
882             or -3 if \C was encountered (in UTF mode only)
883             or -4 if length is too long
884             or -5 if regex is too complicated
885             or -6 if an unknown opcode was encountered (internal error)
886*/
887
#define FFL_LATER           (-1)   /* OP_RECURSE seen and atend is FALSE */
#define FFL_NOTFIXED        (-2)   /* There is no fixed length */
#define FFL_BACKSLASHC      (-3)   /* \C encountered (UTF mode only) */
#define FFL_TOOLONG         (-4)   /* Length exceeds LOOKBEHIND_MAX */
#define FFL_TOOCOMPLICATED  (-5)   /* Call counter limit exceeded */
#define FFL_UNKNOWNOP       (-6)   /* Unknown opcode (internal error) */
894
static int
find_fixedlength(PCRE2_UCHAR *code, BOOL utf, BOOL atend, compile_block *cb,
  recurse_check *recurses, int *countptr)
{
uint32_t length = 0xffffffffu;   /* Length shared by branches so far; all-ones = unset */
uint32_t group = 0;              /* Capture group number; stays 0 if not capturing */
uint32_t groupinfo = 0;          /* Cached per-group flags/length from cb->groupinfo */
recurse_check this_recurse;      /* Chain link used when following OP_RECURSE */
register uint32_t branchlength = 0;               /* Running length of current branch */
register PCRE2_UCHAR *cc = code + 1 + LINK_SIZE;  /* Scan pointer, past opcode + link */

/* If this is a capturing group, we may have the answer cached, but we can only
use this information if there are no (?| groups in the pattern, because
otherwise group numbers are not unique. */

if (*code == OP_CBRA || *code == OP_CBRAPOS || *code == OP_SCBRA ||
    *code == OP_SCBRAPOS)
  {
  group = GET2(cc, 0);
  cc += IMM2_SIZE;     /* Step over the group number as well */
  groupinfo = cb->groupinfo[group];
  if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0)
    {
    if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return FFL_NOTFIXED;
    if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
      return groupinfo & GI_FIXED_LENGTH_MASK;
    }
  }

/* A large and/or complex regex can take too long to process. This can happen
more often when (?| groups are present in the pattern. */

if ((*countptr)++ > 2000) return FFL_TOOCOMPLICATED;

/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */

for (;;)
  {
  int d;
  PCRE2_UCHAR *ce, *cs;
  register PCRE2_UCHAR op = *cc;

  /* Checked before every item, so a length that reaches the end-of-branch
  code below never exceeds LOOKBEHIND_MAX. */

  if (branchlength > LOOKBEHIND_MAX) return FFL_TOOLONG;

  switch (op)
    {
    /* We only need to continue for OP_CBRA (normal capturing bracket) and
    OP_BRA (normal non-capturing bracket) because the other variants of these
    opcodes are all concerned with unlimited repeated groups, which of course
    are not of fixed length. */

    case OP_CBRA:
    case OP_BRA:
    case OP_ONCE:
    case OP_ONCE_NC:
    case OP_COND:
    d = find_fixedlength(cc, utf, atend, cb, recurses, countptr);
    if (d < 0) return d;      /* Propagate any FFL_xxx failure unchanged */
    branchlength += (uint32_t)d;
    do cc += GET(cc, 1); while (*cc == OP_ALT);   /* Move to the group's end */
    cc += 1 + LINK_SIZE;                          /* ... and past its ket */
    break;

    /* Reached end of a branch; if it's a ket it is the end of a nested call.
    If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
    an ALT. If it is END it's the end of the outer call. All can be handled by
    the same code. Note that we must not include the OP_KETRxxx opcodes here,
    because they all imply an unlimited repeat. */

    case OP_ALT:
    case OP_KET:
    case OP_END:
    case OP_ACCEPT:
    case OP_ASSERT_ACCEPT:
    if (length == 0xffffffffu) length = branchlength;
      else if (length != branchlength) goto ISNOTFIXED;
    if (*cc != OP_ALT)
      {
      /* No further alternatives: the result is final. For a capturing group
      it is stored in cb->groupinfo (this store is not conditional on
      PCRE2_DUPCAPUSED; only the cache lookup at the top is). */

      if (group > 0)
        {
        groupinfo |= (uint32_t)(GI_SET_FIXED_LENGTH | length);
        cb->groupinfo[group] = groupinfo;
        }
      return (int)length;
      }
    cc += 1 + LINK_SIZE;
    branchlength = 0;         /* Start measuring the next alternative */
    break;

    /* A true recursion implies not fixed length, but a subroutine call may
    be OK. If the subroutine is a forward reference, we can't deal with
    it until the end of the pattern, so return FFL_LATER. */

    case OP_RECURSE:
    if (!atend) return FFL_LATER;
    cs = ce = (PCRE2_UCHAR *)cb->start_code + GET(cc, 1); /* Start subpattern */
    do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
    if (cc > cs && cc < ce) goto ISNOTFIXED;          /* Recursion */
    else   /* Check for mutual recursion */
      {
      /* The called group is already on the chain of groups being followed. */
      recurse_check *r = recurses;
      for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
      if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
      }
    this_recurse.prev = recurses;
    this_recurse.group = cs;
    d = find_fixedlength(cs, utf, atend, cb, &this_recurse, countptr);
    if (d < 0) return d;
    branchlength += (uint32_t)d;
    cc += 1 + LINK_SIZE;
    break;

    /* Skip over assertive subpatterns. Note that we must increment cc by
    1 + LINK_SIZE at the end, not by OP_length[*cc] because in a recursive
    situation this assertion may be the one that is ultimately being checked
    for having a fixed length, in which case its terminating OP_KET will have
    been temporarily replaced by OP_END. */

    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Skip over things that don't match chars */

    /* These four are stepped over by their table length plus the value held
    in cc[1] (presumably the length of their name argument). */

    case OP_MARK:
    case OP_PRUNE_ARG:
    case OP_SKIP_ARG:
    case OP_THEN_ARG:
    cc += cc[1] + PRIV(OP_lengths)[*cc];
    break;

    /* Items that add nothing to the length and whose size comes straight from
    the opcode length table. */

    case OP_CALLOUT:
    case OP_CIRC:
    case OP_CIRCM:
    case OP_CLOSE:
    case OP_COMMIT:
    case OP_CREF:
    case OP_FALSE:
    case OP_TRUE:
    case OP_DNCREF:
    case OP_DNRREF:
    case OP_DOLL:
    case OP_DOLLM:
    case OP_EOD:
    case OP_EODN:
    case OP_FAIL:
    case OP_NOT_WORD_BOUNDARY:
    case OP_PRUNE:
    case OP_REVERSE:
    case OP_RREF:
    case OP_SET_SOM:
    case OP_SKIP:
    case OP_SOD:
    case OP_SOM:
    case OP_THEN:
    case OP_WORD_BOUNDARY:
    cc += PRIV(OP_lengths)[*cc];
    break;

    /* A string callout is stepped over using the value stored in the item
    itself at offset 1 + 2*LINK_SIZE. */

    case OP_CALLOUT_STR:
    cc += GET(cc, 1 + 2*LINK_SIZE);
    break;

    /* Handle literal characters */

    case OP_CHAR:
    case OP_CHARI:
    case OP_NOT:
    case OP_NOTI:
    branchlength++;
    cc += 2;
#ifdef SUPPORT_UNICODE
    /* cc[-1] is the first code unit of the character just passed; in UTF mode
    it tells us how many extra code units follow. */
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
    break;

    /* Handle exact repetitions. The count is already in characters, but we
    need to skip over a multibyte character in UTF8 mode.  */

    case OP_EXACT:
    case OP_EXACTI:
    case OP_NOTEXACT:
    case OP_NOTEXACTI:
    branchlength += GET2(cc,1);
    cc += 2 + IMM2_SIZE;
#ifdef SUPPORT_UNICODE
    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
#endif
    break;

    /* An exact repeat of a character type. When the type is OP_PROP or
    OP_NOTPROP, two further code units follow it and are skipped as well. */

    case OP_TYPEEXACT:
    branchlength += GET2(cc,1);
    if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP)
      cc += 2;
    cc += 1 + IMM2_SIZE + 1;
    break;

    /* Handle single-char matchers */

    case OP_PROP:
    case OP_NOTPROP:
    cc += 2;      /* Two operand units; the opcode itself is counted below */
    /* Fall through */

    case OP_HSPACE:
    case OP_VSPACE:
    case OP_NOT_HSPACE:
    case OP_NOT_VSPACE:
    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    branchlength++;
    cc++;
    break;

    /* The single-byte matcher isn't allowed. This only happens in UTF-8 or
    UTF-16 mode; otherwise \C is coded as OP_ALLANY. */

    case OP_ANYBYTE:
    return FFL_BACKSLASHC;

    /* Check a class for variable quantification */

    case OP_CLASS:
    case OP_NCLASS:
#ifdef SUPPORT_WIDE_CHARS
    case OP_XCLASS:
    /* The original code caused an unsigned overflow in 64 bit systems,
    so now we use a conditional statement. */
    if (op == OP_XCLASS)
      cc += GET(cc, 1);
    else
      cc += PRIV(OP_lengths)[OP_CLASS];
#else
    cc += PRIV(OP_lengths)[OP_CLASS];
#endif

    /* cc now addresses whatever follows the class, which may be a repeat. */

    switch (*cc)
      {
      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRPLUS:
      case OP_CRMINPLUS:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      case OP_CRPOSSTAR:
      case OP_CRPOSPLUS:
      case OP_CRPOSQUERY:
      goto ISNOTFIXED;

      /* A range is of fixed length only when minimum and maximum agree. */

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      case OP_CRPOSRANGE:
      if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) goto ISNOTFIXED;
      branchlength += GET2(cc,1);
      cc += 1 + 2 * IMM2_SIZE;
      break;

      /* Not a repeat: the class counts as one character. cc is left alone so
      that the main loop handles this opcode next. */

      default:
      branchlength++;
      }
    break;

    /* Anything else is variable length */

    case OP_ANYNL:
    case OP_BRAMINZERO:
    case OP_BRAPOS:
    case OP_BRAPOSZERO:
    case OP_BRAZERO:
    case OP_CBRAPOS:
    case OP_EXTUNI:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_KETRPOS:
    case OP_MINPLUS:
    case OP_MINPLUSI:
    case OP_MINQUERY:
    case OP_MINQUERYI:
    case OP_MINSTAR:
    case OP_MINSTARI:
    case OP_MINUPTO:
    case OP_MINUPTOI:
    case OP_NOTMINPLUS:
    case OP_NOTMINPLUSI:
    case OP_NOTMINQUERY:
    case OP_NOTMINQUERYI:
    case OP_NOTMINSTAR:
    case OP_NOTMINSTARI:
    case OP_NOTMINUPTO:
    case OP_NOTMINUPTOI:
    case OP_NOTPLUS:
    case OP_NOTPLUSI:
    case OP_NOTPOSPLUS:
    case OP_NOTPOSPLUSI:
    case OP_NOTPOSQUERY:
    case OP_NOTPOSQUERYI:
    case OP_NOTPOSSTAR:
    case OP_NOTPOSSTARI:
    case OP_NOTPOSUPTO:
    case OP_NOTPOSUPTOI:
    case OP_NOTQUERY:
    case OP_NOTQUERYI:
    case OP_NOTSTAR:
    case OP_NOTSTARI:
    case OP_NOTUPTO:
    case OP_NOTUPTOI:
    case OP_PLUS:
    case OP_PLUSI:
    case OP_POSPLUS:
    case OP_POSPLUSI:
    case OP_POSQUERY:
    case OP_POSQUERYI:
    case OP_POSSTAR:
    case OP_POSSTARI:
    case OP_POSUPTO:
    case OP_POSUPTOI:
    case OP_QUERY:
    case OP_QUERYI:
    case OP_REF:
    case OP_REFI:
    case OP_DNREF:
    case OP_DNREFI:
    case OP_SBRA:
    case OP_SBRAPOS:
    case OP_SCBRA:
    case OP_SCBRAPOS:
    case OP_SCOND:
    case OP_SKIPZERO:
    case OP_STAR:
    case OP_STARI:
    case OP_TYPEMINPLUS:
    case OP_TYPEMINQUERY:
    case OP_TYPEMINSTAR:
    case OP_TYPEMINUPTO:
    case OP_TYPEPLUS:
    case OP_TYPEPOSPLUS:
    case OP_TYPEPOSQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSUPTO:
    case OP_TYPEQUERY:
    case OP_TYPESTAR:
    case OP_TYPEUPTO:
    case OP_UPTO:
    case OP_UPTOI:
    goto ISNOTFIXED;

    /* Catch unrecognized opcodes so that when new ones are added they
    are not forgotten, as has happened in the past. */

    default:
    return FFL_UNKNOWNOP;
    }
  }
/* Control never gets here except by goto. */

/* Common exit for "not fixed length": for a capturing group the negative
result is stored in cb->groupinfo before returning. */

ISNOTFIXED:
if (group > 0)
  {
  groupinfo |= GI_NOT_FIXED_LENGTH;
  cb->groupinfo[group] = groupinfo;
  }
return FFL_NOTFIXED;
}
1269
1270
1271
1272/*************************************************
1273*      Find first significant op code            *
1274*************************************************/
1275
1276/* This is called by several functions that scan a compiled expression looking
1277for a fixed first character, or an anchoring op code etc. It skips over things
1278that do not influence this. For some calls, it makes sense to skip negative
1279forward and all backward assertions, and also the \b assertion; for others it
1280does not.
1281
1282Arguments:
1283  code         pointer to the start of the group
1284  skipassert   TRUE if certain assertions are to be skipped
1285
1286Returns:       pointer to the first significant opcode
1287*/
1288
1289static const PCRE2_UCHAR*
1290first_significant_code(PCRE2_SPTR code, BOOL skipassert)
1291{
1292for (;;)
1293  {
1294  switch ((int)*code)
1295    {
1296    case OP_ASSERT_NOT:
1297    case OP_ASSERTBACK:
1298    case OP_ASSERTBACK_NOT:
1299    if (!skipassert) return code;
1300    do code += GET(code, 1); while (*code == OP_ALT);
1301    code += PRIV(OP_lengths)[*code];
1302    break;
1303
1304    case OP_WORD_BOUNDARY:
1305    case OP_NOT_WORD_BOUNDARY:
1306    if (!skipassert) return code;
1307    /* Fall through */
1308
1309    case OP_CALLOUT:
1310    case OP_CREF:
1311    case OP_DNCREF:
1312    case OP_RREF:
1313    case OP_DNRREF:
1314    case OP_FALSE:
1315    case OP_TRUE:
1316    code += PRIV(OP_lengths)[*code];
1317    break;
1318
1319    case OP_CALLOUT_STR:
1320    code += GET(code, 1 + 2*LINK_SIZE);
1321    break;
1322
1323    default:
1324    return code;
1325    }
1326  }
1327/* Control never reaches here */
1328}
1329
1330
1331
1332/*************************************************
1333*    Scan compiled branch for non-emptiness      *
1334*************************************************/
1335
1336/* This function scans through a branch of a compiled pattern to see whether it
1337can match the empty string. It is called at the end of compiling to check the
1338entire pattern, and from compile_branch() when checking for an unlimited repeat
1339of a group that can match nothing. In the latter case it is called only when
1340doing the real compile, not during the pre-compile that measures the size of
1341the compiled pattern.
1342
1343Note that first_significant_code() skips over backward and negative forward
1344assertions when its final argument is TRUE. If we hit an unclosed bracket, we
1345return "empty" - this means we've struck an inner bracket whose current branch
1346will already have been scanned.
1347
1348Arguments:
1349  code        points to start of search
1350  endcode     points to where to stop
1351  utf         TRUE if in UTF mode
1352  cb          compile data
1353  atend       TRUE if being called to check an entire pattern
1354  recurses    chain of recurse_check to catch mutual recursion
1355  countptr    pointer to count to catch over-complicated pattern
1356
1357Returns:      0 if what is matched cannot be empty
1358              1 if what is matched could be empty
1359             -1 if the pattern is too complicated
1360*/
1361
#define CBE_NOTEMPTY          0    /* What is matched cannot be empty */
#define CBE_EMPTY             1    /* What is matched could be empty */
#define CBE_TOOCOMPLICATED  (-1)   /* The pattern is too complicated */
1365
1366
1367static int
1368could_be_empty_branch(PCRE2_SPTR code, PCRE2_SPTR endcode, BOOL utf,
1369  compile_block *cb, BOOL atend, recurse_check *recurses, int *countptr)
1370{
1371uint32_t group = 0;
1372uint32_t groupinfo = 0;
1373register PCRE2_UCHAR c;
1374recurse_check this_recurse;
1375
1376/* If what we are checking has already been set as "could be empty", we know
1377the answer. */
1378
1379if (*code >= OP_SBRA && *code <= OP_SCOND) return CBE_EMPTY;
1380
1381/* If this is a capturing group, we may have the answer cached, but we can only
1382use this information if there are no (?| groups in the pattern, because
1383otherwise group numbers are not unique. */
1384
1385if ((cb->external_flags & PCRE2_DUPCAPUSED) == 0 &&
1386    (*code == OP_CBRA || *code == OP_CBRAPOS))
1387  {
1388  group = GET2(code, 1 + LINK_SIZE);
1389  groupinfo = cb->groupinfo[group];
1390  if ((groupinfo & GI_SET_COULD_BE_EMPTY) != 0)
1391    return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
1392  }
1393
1394/* A large and/or complex regex can take too long to process. We have to assume
1395it can match an empty string. This can happen more often when (?| groups are
1396present in the pattern and the caching is disabled. Setting the cap at 1100
1397allows the test for more than 1023 capturing patterns to work. */
1398
1399if ((*countptr)++ > 1100) return CBE_TOOCOMPLICATED;
1400
1401/* Scan the opcodes for this branch. */
1402
1403for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
1404     code < endcode;
1405     code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
1406  {
1407  PCRE2_SPTR ccode;
1408
1409  c = *code;
1410
1411  /* Skip over forward assertions; the other assertions are skipped by
1412  first_significant_code() with a TRUE final argument. */
1413
1414  if (c == OP_ASSERT)
1415    {
1416    do code += GET(code, 1); while (*code == OP_ALT);
1417    c = *code;
1418    continue;
1419    }
1420
1421  /* For a recursion/subroutine call we can scan the recursion when this
1422  function is called at the end, to check a complete pattern. Before then,
1423  recursions just have the group number as their argument and in any case may
1424  be forward references. In that situation, we return CBE_EMPTY, just in case.
1425  It means that unlimited repeats of groups that contain recursions are always
1426  treated as "could be empty" - which just adds a bit more processing time
1427  because of the runtime check. */
1428
1429  if (c == OP_RECURSE)
1430    {
1431    PCRE2_SPTR scode, endgroup;
1432    BOOL empty_branch;
1433
1434    if (!atend) goto ISTRUE;
1435    scode = cb->start_code + GET(code, 1);
1436    endgroup = scode;
1437
1438    /* We need to detect whether this is a recursive call, as otherwise there
1439    will be an infinite loop. If it is a recursion, just skip over it. Simple
1440    recursions are easily detected. For mutual recursions we keep a chain on
1441    the stack. */
1442
1443    do endgroup += GET(endgroup, 1); while (*endgroup == OP_ALT);
1444    if (code >= scode && code <= endgroup) continue;  /* Simple recursion */
1445    else
1446      {
1447      recurse_check *r = recurses;
1448      for (r = recurses; r != NULL; r = r->prev)
1449        if (r->group == scode) break;
1450      if (r != NULL) continue;   /* Mutual recursion */
1451      }
1452
1453    /* Scan the referenced group, remembering it on the stack chain to detect
1454    mutual recursions. */
1455
1456    empty_branch = FALSE;
1457    this_recurse.prev = recurses;
1458    this_recurse.group = scode;
1459
1460    do
1461      {
1462      int rc = could_be_empty_branch(scode, endcode, utf, cb, atend,
1463        &this_recurse, countptr);
1464      if (rc < 0) return rc;
1465      if (rc > 0)
1466        {
1467        empty_branch = TRUE;
1468        break;
1469        }
1470      scode += GET(scode, 1);
1471      }
1472    while (*scode == OP_ALT);
1473
1474    if (!empty_branch) goto ISFALSE;  /* All branches are non-empty */
1475    continue;
1476    }
1477
1478  /* Groups with zero repeats can of course be empty; skip them. */
1479
1480  if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
1481      c == OP_BRAPOSZERO)
1482    {
1483    code += PRIV(OP_lengths)[c];
1484    do code += GET(code, 1); while (*code == OP_ALT);
1485    c = *code;
1486    continue;
1487    }
1488
1489  /* A nested group that is already marked as "could be empty" can just be
1490  skipped. */
1491
1492  if (c == OP_SBRA  || c == OP_SBRAPOS ||
1493      c == OP_SCBRA || c == OP_SCBRAPOS)
1494    {
1495    do code += GET(code, 1); while (*code == OP_ALT);
1496    c = *code;
1497    continue;
1498    }
1499
1500  /* For other groups, scan the branches. */
1501
1502  if (c == OP_BRA  || c == OP_BRAPOS ||
1503      c == OP_CBRA || c == OP_CBRAPOS ||
1504      c == OP_ONCE || c == OP_ONCE_NC ||
1505      c == OP_COND || c == OP_SCOND)
1506    {
1507    BOOL empty_branch;
1508    if (GET(code, 1) == 0) goto ISTRUE;    /* Hit unclosed bracket */
1509
1510    /* If a conditional group has only one branch, there is a second, implied,
1511    empty branch, so just skip over the conditional, because it could be empty.
1512    Otherwise, scan the individual branches of the group. */
1513
1514    if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1515      code += GET(code, 1);
1516    else
1517      {
1518      empty_branch = FALSE;
1519      do
1520        {
1521        if (!empty_branch)
1522          {
1523          int rc = could_be_empty_branch(code, endcode, utf, cb, atend,
1524            recurses, countptr);
1525          if (rc < 0) return rc;
1526          if (rc > 0) empty_branch = TRUE;
1527          }
1528        code += GET(code, 1);
1529        }
1530      while (*code == OP_ALT);
1531      if (!empty_branch) goto ISFALSE;   /* All branches are non-empty */
1532      }
1533
1534    c = *code;
1535    continue;
1536    }
1537
1538  /* Handle the other opcodes */
1539
1540  switch (c)
1541    {
1542    /* Check for quantifiers after a class. XCLASS is used for classes that
1543    cannot be represented just by a bit map. This includes negated single
1544    high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
1545    actual length is stored in the compiled code, so we must update "code"
1546    here. */
1547
1548#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1549    case OP_XCLASS:
1550    ccode = code += GET(code, 1);
1551    goto CHECK_CLASS_REPEAT;
1552#endif
1553
1554    case OP_CLASS:
1555    case OP_NCLASS:
1556    ccode = code + PRIV(OP_lengths)[OP_CLASS];
1557
1558#if defined SUPPORT_UNICODE || PCRE2_CODE_UNIT_WIDTH != 8
1559    CHECK_CLASS_REPEAT:
1560#endif
1561
1562    switch (*ccode)
1563      {
1564      case OP_CRSTAR:            /* These could be empty; continue */
1565      case OP_CRMINSTAR:
1566      case OP_CRQUERY:
1567      case OP_CRMINQUERY:
1568      case OP_CRPOSSTAR:
1569      case OP_CRPOSQUERY:
1570      break;
1571
1572      default:                   /* Non-repeat => class must match */
1573      case OP_CRPLUS:            /* These repeats aren't empty */
1574      case OP_CRMINPLUS:
1575      case OP_CRPOSPLUS:
1576      goto ISFALSE;
1577
1578      case OP_CRRANGE:
1579      case OP_CRMINRANGE:
1580      case OP_CRPOSRANGE:
1581      if (GET2(ccode, 1) > 0) goto ISFALSE;  /* Minimum > 0 */
1582      break;
1583      }
1584    break;
1585
1586    /* Opcodes that must match a character */
1587
1588    case OP_ANY:
1589    case OP_ALLANY:
1590    case OP_ANYBYTE:
1591
1592    case OP_PROP:
1593    case OP_NOTPROP:
1594    case OP_ANYNL:
1595
1596    case OP_NOT_HSPACE:
1597    case OP_HSPACE:
1598    case OP_NOT_VSPACE:
1599    case OP_VSPACE:
1600    case OP_EXTUNI:
1601
1602    case OP_NOT_DIGIT:
1603    case OP_DIGIT:
1604    case OP_NOT_WHITESPACE:
1605    case OP_WHITESPACE:
1606    case OP_NOT_WORDCHAR:
1607    case OP_WORDCHAR:
1608
1609    case OP_CHAR:
1610    case OP_CHARI:
1611    case OP_NOT:
1612    case OP_NOTI:
1613
1614    case OP_PLUS:
1615    case OP_PLUSI:
1616    case OP_MINPLUS:
1617    case OP_MINPLUSI:
1618
1619    case OP_NOTPLUS:
1620    case OP_NOTPLUSI:
1621    case OP_NOTMINPLUS:
1622    case OP_NOTMINPLUSI:
1623
1624    case OP_POSPLUS:
1625    case OP_POSPLUSI:
1626    case OP_NOTPOSPLUS:
1627    case OP_NOTPOSPLUSI:
1628
1629    case OP_EXACT:
1630    case OP_EXACTI:
1631    case OP_NOTEXACT:
1632    case OP_NOTEXACTI:
1633
1634    case OP_TYPEPLUS:
1635    case OP_TYPEMINPLUS:
1636    case OP_TYPEPOSPLUS:
1637    case OP_TYPEEXACT:
1638    goto ISFALSE;
1639
1640    /* These are going to continue, as they may be empty, but we have to
1641    fudge the length for the \p and \P cases. */
1642
1643    case OP_TYPESTAR:
1644    case OP_TYPEMINSTAR:
1645    case OP_TYPEPOSSTAR:
1646    case OP_TYPEQUERY:
1647    case OP_TYPEMINQUERY:
1648    case OP_TYPEPOSQUERY:
1649    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650    break;
1651
1652    /* Same for these */
1653
1654    case OP_TYPEUPTO:
1655    case OP_TYPEMINUPTO:
1656    case OP_TYPEPOSUPTO:
1657    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1658      code += 2;
1659    break;
1660
1661    /* End of branch */
1662
1663    case OP_KET:
1664    case OP_KETRMAX:
1665    case OP_KETRMIN:
1666    case OP_KETRPOS:
1667    case OP_ALT:
1668    goto ISTRUE;
1669
1670    /* In UTF-8 or UTF-16 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY,
1671    POSQUERY, UPTO, MINUPTO, and POSUPTO and their caseless and negative
1672    versions may be followed by a multibyte character. */
1673
1674#ifdef MAYBE_UTF_MULTI
1675    case OP_STAR:
1676    case OP_STARI:
1677    case OP_NOTSTAR:
1678    case OP_NOTSTARI:
1679
1680    case OP_MINSTAR:
1681    case OP_MINSTARI:
1682    case OP_NOTMINSTAR:
1683    case OP_NOTMINSTARI:
1684
1685    case OP_POSSTAR:
1686    case OP_POSSTARI:
1687    case OP_NOTPOSSTAR:
1688    case OP_NOTPOSSTARI:
1689
1690    case OP_QUERY:
1691    case OP_QUERYI:
1692    case OP_NOTQUERY:
1693    case OP_NOTQUERYI:
1694
1695    case OP_MINQUERY:
1696    case OP_MINQUERYI:
1697    case OP_NOTMINQUERY:
1698    case OP_NOTMINQUERYI:
1699
1700    case OP_POSQUERY:
1701    case OP_POSQUERYI:
1702    case OP_NOTPOSQUERY:
1703    case OP_NOTPOSQUERYI:
1704    if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
1705    break;
1706
1707    case OP_UPTO:
1708    case OP_UPTOI:
1709    case OP_NOTUPTO:
1710    case OP_NOTUPTOI:
1711
1712    case OP_MINUPTO:
1713    case OP_MINUPTOI:
1714    case OP_NOTMINUPTO:
1715    case OP_NOTMINUPTOI:
1716
1717    case OP_POSUPTO:
1718    case OP_POSUPTOI:
1719    case OP_NOTPOSUPTO:
1720    case OP_NOTPOSUPTOI:
1721    if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
1722    break;
1723#endif  /* MAYBE_UTF_MULTI */
1724
1725    /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
1726    string. */
1727
1728    case OP_MARK:
1729    case OP_PRUNE_ARG:
1730    case OP_SKIP_ARG:
1731    case OP_THEN_ARG:
1732    code += code[1];
1733    break;
1734
1735    /* None of the remaining opcodes are required to match a character. */
1736
1737    default:
1738    break;
1739    }
1740  }
1741
1742ISTRUE:
1743groupinfo |= GI_COULD_BE_EMPTY;
1744
1745ISFALSE:
1746if (group > 0) cb->groupinfo[group] = groupinfo | GI_SET_COULD_BE_EMPTY;
1747
1748return ((groupinfo & GI_COULD_BE_EMPTY) != 0)? CBE_EMPTY : CBE_NOTEMPTY;
1749}
1750
1751
1752
1753/*************************************************
1754*            Check for counted repeat            *
1755*************************************************/
1756
1757/* This function is called when a '{' is encountered in a place where it might
1758start a quantifier. It looks ahead to see if it really is a quantifier, that
1759is, one of the forms {ddd} {ddd,} or {ddd,ddd} where the ddds are digits.
1760
1761Argument:   pointer to the first char after '{'
1762Returns:    TRUE or FALSE
1763*/
1764
1765static BOOL
1766is_counted_repeat(PCRE2_SPTR p)
1767{
1768if (!IS_DIGIT(*p)) return FALSE;
1769p++;
1770while (IS_DIGIT(*p)) p++;
1771if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1772
1773if (*p++ != CHAR_COMMA) return FALSE;
1774if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
1775
1776if (!IS_DIGIT(*p)) return FALSE;
1777p++;
1778while (IS_DIGIT(*p)) p++;
1779
1780return (*p == CHAR_RIGHT_CURLY_BRACKET);
1781}
1782
1783
1784
1785/*************************************************
1786*            Handle escapes                      *
1787*************************************************/
1788
1789/* This function is called when a \ has been encountered. It either returns a
1790positive value for a simple escape such as \d, or 0 for a data character, which
1791is placed in chptr. A backreference to group n is returned as negative n. On
1792entry, ptr is pointing at the \. On exit, it points the final code unit of the
1793escape sequence.
1794
1795This function is also called from pcre2_substitute() to handle escape sequences
1796in replacement strings. In this case, the cb argument is NULL, and only
1797sequences that define a data character are recognised. The isclass argument is
1798not relevant, but the options argument is the final value of the compiled
1799pattern's options.
1800
1801There is one "trick" case: when a sequence such as [[:>:]] or \s in UCP mode is
1802processed, it is replaced by a nested alternative sequence. If this contains a
1803backslash (which is usually does), ptrend does not point to its end - it still
1804points to the end of the whole pattern. However, we can detect this case
1805because cb->nestptr[0] will be non-NULL. The nested sequences are all zero-
1806terminated and there are only ever two levels of nesting.
1807
1808Arguments:
1809  ptrptr         points to the input position pointer
1810  ptrend         points to the end of the input
1811  chptr          points to a returned data character
1812  errorcodeptr   points to the errorcode variable (containing zero)
1813  options        the current options bits
1814  isclass        TRUE if inside a character class
1815  cb             compile data block
1816
1817Returns:         zero => a data character
1818                 positive => a special escape sequence
1819                 negative => a back reference
1820                 on error, errorcodeptr is set non-zero
1821*/
1822
1823int
1824PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1825  int *errorcodeptr, uint32_t options, BOOL isclass, compile_block *cb)
1826{
1827BOOL utf = (options & PCRE2_UTF) != 0;
1828PCRE2_SPTR ptr = *ptrptr + 1;
1829register uint32_t c, cc;
1830int escape = 0;
1831int i;
1832
1833/* Find the end of a nested insert. */
1834
1835if (cb != NULL && cb->nestptr[0] != NULL)
1836  ptrend = ptr + PRIV(strlen)(ptr);
1837
1838/* If backslash is at the end of the string, it's an error. */
1839
1840if (ptr >= ptrend)
1841  {
1842  *errorcodeptr = ERR1;
1843  return 0;
1844  }
1845
1846GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1847ptr--;                          /* Set pointer back to the last code unit */
1848
1849/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1850value test saves a memory lookup for code points outside the alphanumeric
1851range. Otherwise, do a table lookup. A non-zero result is something that can be
1852returned immediately. Otherwise further processing is required. */
1853
1854if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1855
1856else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1857  {
1858  if (i > 0) c = (uint32_t)i; else  /* Positive is a data character */
1859    {
1860    escape = -i;                    /* Else return a special escape */
1861    if (escape == ESC_P || escape == ESC_p || escape == ESC_X)
1862      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1863    }
1864  }
1865
1866/* Escapes that need further processing, including those that are unknown.
1867When called from pcre2_substitute(), only \c, \o, and \x are recognized (and \u
1868when BSUX is set). */
1869
1870else
1871  {
1872  PCRE2_SPTR oldptr;
1873  BOOL braced, negated, overflow;
1874  unsigned int s;
1875
1876  /* Filter calls from pcre2_substitute(). */
1877
1878  if (cb == NULL && c != CHAR_c && c != CHAR_o && c != CHAR_x &&
1879      (c != CHAR_u || (options & PCRE2_ALT_BSUX) != 0))
1880    {
1881    *errorcodeptr = ERR3;
1882    return 0;
1883    }
1884
1885  switch (c)
1886    {
1887    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1888    error. */
1889
1890    case CHAR_l:
1891    case CHAR_L:
1892    *errorcodeptr = ERR37;
1893    break;
1894
1895    /* \u is unrecognized when PCRE2_ALT_BSUX is not set. When it is treated
1896    specially, \u must be followed by four hex digits. Otherwise it is a
1897    lowercase u letter. */
1898
1899    case CHAR_u:
1900    if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37; else
1901      {
1902      uint32_t xc;
1903      if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1904      if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1905      cc = (cc << 4) | xc;
1906      if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1907      cc = (cc << 4) | xc;
1908      if ((xc = XDIGIT(ptr[4])) == 0xff) break;  /* Not a hex digit */
1909      c = (cc << 4) | xc;
1910      ptr += 4;
1911      if (utf)
1912        {
1913        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1914          else if (c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1915        }
1916      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1917      }
1918    break;
1919
1920    case CHAR_U:
1921    /* \U is unrecognized unless PCRE2_ALT_BSUX is set, in which case it is an
1922    upper case letter. */
1923    if ((options & PCRE2_ALT_BSUX) == 0) *errorcodeptr = ERR37;
1924    break;
1925
1926    /* In a character class, \g is just a literal "g". Outside a character
1927    class, \g must be followed by one of a number of specific things:
1928
1929    (1) A number, either plain or braced. If positive, it is an absolute
1930    backreference. If negative, it is a relative backreference. This is a Perl
1931    5.10 feature.
1932
1933    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1934    is part of Perl's movement towards a unified syntax for back references. As
1935    this is synonymous with \k{name}, we fudge it up by pretending it really
1936    was \k.
1937
1938    (3) For Oniguruma compatibility we also support \g followed by a name or a
1939    number either in angle brackets or in single quotes. However, these are
1940    (possibly recursive) subroutine calls, _not_ backreferences. Just return
1941    the ESC_g code (cf \k). */
1942
1943    case CHAR_g:
1944    if (isclass) break;
1945    if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
1946      {
1947      escape = ESC_g;
1948      break;
1949      }
1950
1951    /* Handle the Perl-compatible cases */
1952
1953    if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1954      {
1955      PCRE2_SPTR p;
1956      for (p = ptr+2; *p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
1957        if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
1958      if (*p != CHAR_NULL && *p != CHAR_RIGHT_CURLY_BRACKET)
1959        {
1960        escape = ESC_k;
1961        break;
1962        }
1963      braced = TRUE;
1964      ptr++;
1965      }
1966    else braced = FALSE;
1967
1968    if (ptr[1] == CHAR_MINUS)
1969      {
1970      negated = TRUE;
1971      ptr++;
1972      }
1973    else negated = FALSE;
1974
1975    /* The integer range is limited by the machine's int representation. */
1976    s = 0;
1977    overflow = FALSE;
1978    while (IS_DIGIT(ptr[1]))
1979      {
1980      if (s > INT_MAX / 10 - 1) /* Integer overflow */
1981        {
1982        overflow = TRUE;
1983        break;
1984        }
1985      s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
1986      }
1987    if (overflow) /* Integer overflow */
1988      {
1989      while (IS_DIGIT(ptr[1])) ptr++;
1990      *errorcodeptr = ERR61;
1991      break;
1992      }
1993
1994    if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
1995      {
1996      *errorcodeptr = ERR57;
1997      break;
1998      }
1999
2000    if (s == 0)
2001      {
2002      *errorcodeptr = ERR58;
2003      break;
2004      }
2005
2006    if (negated)
2007      {
2008      if (s > cb->bracount)
2009        {
2010        *errorcodeptr = ERR15;
2011        break;
2012        }
2013      s = cb->bracount - (s - 1);
2014      }
2015
2016    escape = -(int)s;
2017    break;
2018
2019    /* The handling of escape sequences consisting of a string of digits
2020    starting with one that is not zero is not straightforward. Perl has changed
2021    over the years. Nowadays \g{} for backreferences and \o{} for octal are
2022    recommended to avoid the ambiguities in the old syntax.
2023
2024    Outside a character class, the digits are read as a decimal number. If the
2025    number is less than 10, or if there are that many previous extracting left
2026    brackets, it is a back reference. Otherwise, up to three octal digits are
2027    read to form an escaped character code. Thus \123 is likely to be octal 123
2028    (cf \0123, which is octal 012 followed by the literal 3).
2029
2030    Inside a character class, \ followed by a digit is always either a literal
2031    8 or 9 or an octal number. */
2032
2033    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
2034    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
2035
2036    if (!isclass)
2037      {
2038      oldptr = ptr;
2039      /* The integer range is limited by the machine's int representation. */
2040      s = c - CHAR_0;
2041      overflow = FALSE;
2042      while (IS_DIGIT(ptr[1]))
2043        {
2044        if (s > INT_MAX / 10 - 1) /* Integer overflow */
2045          {
2046          overflow = TRUE;
2047          break;
2048          }
2049        s = s * 10 + (unsigned int)(*(++ptr) - CHAR_0);
2050        }
2051      if (overflow) /* Integer overflow */
2052        {
2053        while (IS_DIGIT(ptr[1])) ptr++;
2054        *errorcodeptr = ERR61;
2055        break;
2056        }
2057
2058      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
2059      are octal escapes if there are not that many previous captures. */
2060
2061      if (s < 10 || *oldptr >= CHAR_8 || s <= cb->bracount)
2062        {
2063        escape = -(int)s;     /* Indicates a back reference */
2064        break;
2065        }
2066      ptr = oldptr;      /* Put the pointer back and fall through */
2067      }
2068
2069    /* Handle a digit following \ when the number is not a back reference, or
2070    we are within a character class. If the first digit is 8 or 9, Perl used to
2071    generate a binary zero byte and then treat the digit as a following
2072    literal. At least by Perl 5.18 this changed so as not to insert the binary
2073    zero. */
2074
2075    if ((c = *ptr) >= CHAR_8) break;
2076
2077    /* Fall through with a digit less than 8 */
2078
2079    /* \0 always starts an octal number, but we may drop through to here with a
2080    larger first octal digit. The original code used just to take the least
2081    significant 8 bits of octal numbers (I think this is what early Perls used
2082    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
2083    but no more than 3 octal digits. */
2084
2085    case CHAR_0:
2086    c -= CHAR_0;
2087    while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
2088        c = c * 8 + *(++ptr) - CHAR_0;
2089#if PCRE2_CODE_UNIT_WIDTH == 8
2090    if (!utf && c > 0xff) *errorcodeptr = ERR51;
2091#endif
2092    break;
2093
2094    /* \o is a relatively new Perl feature, supporting a more general way of
2095    specifying character codes in octal. The only supported form is \o{ddd}. */
2096
2097    case CHAR_o:
2098    if (ptr[1] != CHAR_LEFT_CURLY_BRACKET) *errorcodeptr = ERR55; else
2099    if (ptr[2] == CHAR_RIGHT_CURLY_BRACKET) *errorcodeptr = ERR78; else
2100      {
2101      ptr += 2;
2102      c = 0;
2103      overflow = FALSE;
2104      while (*ptr >= CHAR_0 && *ptr <= CHAR_7)
2105        {
2106        cc = *ptr++;
2107        if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
2108#if PCRE2_CODE_UNIT_WIDTH == 32
2109        if (c >= 0x20000000l) { overflow = TRUE; break; }
2110#endif
2111        c = (c << 3) + (cc - CHAR_0);
2112#if PCRE2_CODE_UNIT_WIDTH == 8
2113        if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
2114#elif PCRE2_CODE_UNIT_WIDTH == 16
2115        if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
2116#elif PCRE2_CODE_UNIT_WIDTH == 32
2117        if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2118#endif
2119        }
2120      if (overflow)
2121        {
2122        while (*ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2123        *errorcodeptr = ERR34;
2124        }
2125      else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2126        {
2127        if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
2128        }
2129      else *errorcodeptr = ERR64;
2130      }
2131    break;
2132
2133    /* \x is complicated. When PCRE2_ALT_BSUX is set, \x must be followed by
2134    two hexadecimal digits. Otherwise it is a lowercase x letter. */
2135
2136    case CHAR_x:
2137    if ((options & PCRE2_ALT_BSUX) != 0)
2138      {
2139      uint32_t xc;
2140      if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2141      if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
2142      c = (cc << 4) | xc;
2143      ptr += 2;
2144      }    /* End PCRE2_ALT_BSUX handling */
2145
2146    /* Handle \x in Perl's style. \x{ddd} is a character number which can be
2147    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2148    digits. If not, { used to be treated as a data character. However, Perl
2149    seems to read hex digits up to the first non-such, and ignore the rest, so
2150    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2151    now gives an error. */
2152
2153    else
2154      {
2155      if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
2156        {
2157        ptr += 2;
2158        if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2159          {
2160          *errorcodeptr = ERR78;
2161          break;
2162          }
2163        c = 0;
2164        overflow = FALSE;
2165
2166        while ((cc = XDIGIT(*ptr)) != 0xff)
2167          {
2168          ptr++;
2169          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2170#if PCRE2_CODE_UNIT_WIDTH == 32
2171          if (c >= 0x10000000l) { overflow = TRUE; break; }
2172#endif
2173          c = (c << 4) | cc;
2174          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2175            {
2176            overflow = TRUE;
2177            break;
2178            }
2179          }
2180
2181        if (overflow)
2182          {
2183          while (XDIGIT(*ptr) != 0xff) ptr++;
2184          *errorcodeptr = ERR34;
2185          }
2186        else if (*ptr == CHAR_RIGHT_CURLY_BRACKET)
2187          {
2188          if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
2189          }
2190
2191        /* If the sequence of hex digits does not end with '}', give an error.
2192        We used just to recognize this construct and fall through to the normal
2193        \x handling, but nowadays Perl gives an error, which seems much more
2194        sensible, so we do too. */
2195
2196        else *errorcodeptr = ERR67;
2197        }   /* End of \x{} processing */
2198
2199      /* Read a single-byte hex-defined char (up to two hex digits after \x) */
2200
2201      else
2202        {
2203        c = 0;
2204        if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2205        ptr++;
2206        c = cc;
2207        if ((cc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2208        ptr++;
2209        c = (c << 4) | cc;
2210        }     /* End of \xdd handling */
2211      }       /* End of Perl-style \x handling */
2212    break;
2213
2214    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2215    ASCII (or Unicode) environment, an error is given if the character
2216    following \c is not a printable ASCII character. Otherwise, the following
2217    character is upper-cased if it is a letter, and after that the 0x40 bit is
2218    flipped. The result is the value of the escape.
2219
2220    In an EBCDIC environment the handling of \c is compatible with the
2221    specification in the perlebcdic document. The following character must be
2222    a letter or one of small number of special characters. These provide a
2223    means of defining the character values 0-31.
2224
2225    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2226    the EBCDIC value of 'c' explicitly. */
2227
2228#if defined EBCDIC && 'a' != 0x81
2229    case 0x83:
2230#else
2231    case CHAR_c:
2232#endif
2233
2234    c = *(++ptr);
2235    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2236    if (c == CHAR_NULL && ptr >= ptrend)
2237      {
2238      *errorcodeptr = ERR2;
2239      break;
2240      }
2241
2242    /* Handle \c in an ASCII/Unicode environment. */
2243
2244#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2245    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2246      {
2247      *errorcodeptr = ERR68;
2248      break;
2249      }
2250    c ^= 0x40;
2251
2252    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2253    255 (0xff) or 95 (0x5f) if other character suggest we are using th POSIX-BC
2254    encoding. (This is the way Perl indicates that it handles \c?.) The other
2255    valid sequences correspond to a list of specific characters. */
2256
2257#else
2258    if (c == CHAR_QUESTION_MARK)
2259      c = ('\\' == 188 && '`' == 74)? 0x5f : 0xff;
2260    else
2261      {
2262      for (i = 0; i < 32; i++)
2263        {
2264        if (c == ebcdic_escape_c[i]) break;
2265        }
2266      if (i < 32) c = i; else *errorcodeptr = ERR68;
2267      }
2268#endif  /* EBCDIC */
2269
2270    break;
2271
2272    /* Any other alphanumeric following \ is an error. Perl gives an error only
2273    if in warning mode, but PCRE doesn't have a warning mode. */
2274
2275    default:
2276    *errorcodeptr = ERR3;
2277    break;
2278    }
2279  }
2280
2281/* Perl supports \N{name} for character names, as well as plain \N for "not
2282newline". PCRE does not support \N{name}. However, it does support
2283quantification such as \N{2,3}. */
2284
2285if (escape == ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
2286     !is_counted_repeat(ptr+2))
2287  *errorcodeptr = ERR37;
2288
2289/* If PCRE2_UCP is set, we change the values for \d etc. */
2290
2291if ((options & PCRE2_UCP) != 0 && escape >= ESC_D && escape <= ESC_w)
2292  escape += (ESC_DU - ESC_D);
2293
2294/* Set the pointer to the final character before returning. */
2295
2296*ptrptr = ptr;
2297*chptr = c;
2298return escape;
2299}
2300
2301
2302
2303#ifdef SUPPORT_UNICODE
2304/*************************************************
2305*               Handle \P and \p                 *
2306*************************************************/
2307
2308/* This function is called after \P or \p has been encountered, provided that
2309PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2310contents of ptrptr are pointing at the P or p. On exit, it is left pointing at
2311the final code unit of the escape sequence.
2312
2313Arguments:
2314  ptrptr         the pattern position pointer
2315  negptr         a boolean that is set TRUE for negation else FALSE
2316  ptypeptr       an unsigned int that is set to the type value
2317  pdataptr       an unsigned int that is set to the detailed property value
2318  errorcodeptr   the error code variable
2319  cb             the compile data
2320
2321Returns:         TRUE if the type value was found, or FALSE for an invalid type
2322*/
2323
2324static BOOL
2325get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, unsigned int *ptypeptr,
2326  unsigned int *pdataptr, int *errorcodeptr, compile_block *cb)
2327{
2328register PCRE2_UCHAR c;
2329size_t i, bot, top;
2330PCRE2_SPTR ptr = *ptrptr;
2331PCRE2_UCHAR name[32];
2332
2333*negptr = FALSE;
2334c = *(++ptr);
2335
2336/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2337negation. */
2338
2339if (c == CHAR_LEFT_CURLY_BRACKET)
2340  {
2341  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
2342    {
2343    *negptr = TRUE;
2344    ptr++;
2345    }
2346  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2347    {
2348    c = *(++ptr);
2349    if (c == CHAR_NULL) goto ERROR_RETURN;
2350    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2351    name[i] = c;
2352    }
2353  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2354  name[i] = 0;
2355  }
2356
2357/* Otherwise there is just one following character, which must be an ASCII
2358letter. */
2359
2360else if (MAX_255(c) && (cb->ctypes[c] & ctype_letter) != 0)
2361  {
2362  name[0] = c;
2363  name[1] = 0;
2364  }
2365else goto ERROR_RETURN;
2366
2367*ptrptr = ptr;
2368
2369/* Search for a recognized property name using binary chop. */
2370
2371bot = 0;
2372top = PRIV(utt_size);
2373
2374while (bot < top)
2375  {
2376  int r;
2377  i = (bot + top) >> 1;
2378  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2379  if (r == 0)
2380    {
2381    *ptypeptr = PRIV(utt)[i].type;
2382    *pdataptr = PRIV(utt)[i].value;
2383    return TRUE;
2384    }
2385  if (r > 0) bot = i + 1; else top = i;
2386  }
2387*errorcodeptr = ERR47;   /* Unrecognized name */
2388return FALSE;
2389
2390ERROR_RETURN:            /* Malformed \P or \p */
2391*errorcodeptr = ERR46;
2392*ptrptr = ptr;
2393return FALSE;
2394}
2395#endif
2396
2397
2398
2399/*************************************************
2400*         Read repeat counts                     *
2401*************************************************/
2402
2403/* Read an item of the form {n,m} and return the values. This is called only
2404after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
2405so the syntax is guaranteed to be correct, but we need to check the values.
2406
2407Arguments:
2408  p              pointer to first char after '{'
2409  minp           pointer to int for min
2410  maxp           pointer to int for max
2411                 returned as -1 if no max
2412  errorcodeptr   points to error code variable
2413
2414Returns:         pointer to '}' on success;
2415                 current ptr on error, with errorcodeptr set non-zero
2416*/
2417
2418static PCRE2_SPTR
2419read_repeat_counts(PCRE2_SPTR p, int *minp, int *maxp, int *errorcodeptr)
2420{
2421int min = 0;
2422int max = -1;
2423
2424while (IS_DIGIT(*p))
2425  {
2426  min = min * 10 + (int)(*p++ - CHAR_0);
2427  if (min > 65535)
2428    {
2429    *errorcodeptr = ERR5;
2430    return p;
2431    }
2432  }
2433
2434if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
2435  {
2436  if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
2437    {
2438    max = 0;
2439    while(IS_DIGIT(*p))
2440      {
2441      max = max * 10 + (int)(*p++ - CHAR_0);
2442      if (max > 65535)
2443        {
2444        *errorcodeptr = ERR5;
2445        return p;
2446        }
2447      }
2448    if (max < min)
2449      {
2450      *errorcodeptr = ERR4;
2451      return p;
2452      }
2453    }
2454  }
2455
2456*minp = min;
2457*maxp = max;
2458return p;
2459}
2460
2461
2462
2463/*************************************************
2464*   Scan compiled regex for recursion reference  *
2465*************************************************/
2466
2467/* This function scans through a compiled pattern until it finds an instance of
2468OP_RECURSE.
2469
2470Arguments:
2471  code        points to start of expression
2472  utf         TRUE in UTF mode
2473
2474Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2475*/
2476
2477static PCRE2_SPTR
2478find_recurse(PCRE2_SPTR code, BOOL utf)
2479{
2480for (;;)
2481  {
2482  register PCRE2_UCHAR c = *code;
2483  if (c == OP_END) return NULL;
2484  if (c == OP_RECURSE) return code;
2485
2486  /* XCLASS is used for classes that cannot be represented just by a bit map.
2487  This includes negated single high-valued characters. CALLOUT_STR is used for
2488  callouts with string arguments. In both cases the length in the table is
2489  zero; the actual length is stored in the compiled code. */
2490
2491  if (c == OP_XCLASS) code += GET(code, 1);
2492    else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
2493
2494  /* Otherwise, we can get the item's length from the table, except that for
2495  repeated character types, we have to test for \p and \P, which have an extra
2496  two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2497  must add in its length. */
2498
2499  else
2500    {
2501    switch(c)
2502      {
2503      case OP_TYPESTAR:
2504      case OP_TYPEMINSTAR:
2505      case OP_TYPEPLUS:
2506      case OP_TYPEMINPLUS:
2507      case OP_TYPEQUERY:
2508      case OP_TYPEMINQUERY:
2509      case OP_TYPEPOSSTAR:
2510      case OP_TYPEPOSPLUS:
2511      case OP_TYPEPOSQUERY:
2512      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2513      break;
2514
2515      case OP_TYPEPOSUPTO:
2516      case OP_TYPEUPTO:
2517      case OP_TYPEMINUPTO:
2518      case OP_TYPEEXACT:
2519      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
2520        code += 2;
2521      break;
2522
2523      case OP_MARK:
2524      case OP_PRUNE_ARG:
2525      case OP_SKIP_ARG:
2526      case OP_THEN_ARG:
2527      code += code[1];
2528      break;
2529      }
2530
2531    /* Add in the fixed length from the table */
2532
2533    code += PRIV(OP_lengths)[c];
2534
2535    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
2536    be followed by a multi-unit character. The length in the table is a
2537    minimum, so we have to arrange to skip the extra units. */
2538
2539#ifdef MAYBE_UTF_MULTI
2540    if (utf) switch(c)
2541      {
2542      case OP_CHAR:
2543      case OP_CHARI:
2544      case OP_NOT:
2545      case OP_NOTI:
2546      case OP_EXACT:
2547      case OP_EXACTI:
2548      case OP_NOTEXACT:
2549      case OP_NOTEXACTI:
2550      case OP_UPTO:
2551      case OP_UPTOI:
2552      case OP_NOTUPTO:
2553      case OP_NOTUPTOI:
2554      case OP_MINUPTO:
2555      case OP_MINUPTOI:
2556      case OP_NOTMINUPTO:
2557      case OP_NOTMINUPTOI:
2558      case OP_POSUPTO:
2559      case OP_POSUPTOI:
2560      case OP_NOTPOSUPTO:
2561      case OP_NOTPOSUPTOI:
2562      case OP_STAR:
2563      case OP_STARI:
2564      case OP_NOTSTAR:
2565      case OP_NOTSTARI:
2566      case OP_MINSTAR:
2567      case OP_MINSTARI:
2568      case OP_NOTMINSTAR:
2569      case OP_NOTMINSTARI:
2570      case OP_POSSTAR:
2571      case OP_POSSTARI:
2572      case OP_NOTPOSSTAR:
2573      case OP_NOTPOSSTARI:
2574      case OP_PLUS:
2575      case OP_PLUSI:
2576      case OP_NOTPLUS:
2577      case OP_NOTPLUSI:
2578      case OP_MINPLUS:
2579      case OP_MINPLUSI:
2580      case OP_NOTMINPLUS:
2581      case OP_NOTMINPLUSI:
2582      case OP_POSPLUS:
2583      case OP_POSPLUSI:
2584      case OP_NOTPOSPLUS:
2585      case OP_NOTPOSPLUSI:
2586      case OP_QUERY:
2587      case OP_QUERYI:
2588      case OP_NOTQUERY:
2589      case OP_NOTQUERYI:
2590      case OP_MINQUERY:
2591      case OP_MINQUERYI:
2592      case OP_NOTMINQUERY:
2593      case OP_NOTMINQUERYI:
2594      case OP_POSQUERY:
2595      case OP_POSQUERYI:
2596      case OP_NOTPOSQUERY:
2597      case OP_NOTPOSQUERYI:
2598      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2599      break;
2600      }
2601#else
2602    (void)(utf);  /* Keep compiler happy by referencing function argument */
2603#endif  /* MAYBE_UTF_MULTI */
2604    }
2605  }
2606}
2607
2608
2609
2610/*************************************************
2611*           Check for POSIX class syntax         *
2612*************************************************/
2613
2614/* This function is called when the sequence "[:" or "[." or "[=" is
2615encountered in a character class. It checks whether this is followed by a
2616sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2617reach an unescaped ']' without the special preceding character, return FALSE.
2618
2619Originally, this function only recognized a sequence of letters between the
2620terminators, but it seems that Perl recognizes any sequence of characters,
2621though of course unknown POSIX names are subsequently rejected. Perl gives an
2622"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2623didn't consider this to be a POSIX class. Likewise for [:1234:].
2624
2625The problem in trying to be exactly like Perl is in the handling of escapes. We
2626have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2627class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2628below handles the special cases \\ and \], but does not try to do any other
2629escape processing. This makes it different from Perl for cases such as
2630[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2631not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2632when Perl does, I think.
2633
2634A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2635It seems that the appearance of a nested POSIX class supersedes an apparent
2636external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2637a digit. This is handled by returning FALSE if the start of a new group with
2638the same terminator is encountered, since the next closing sequence must close
2639the nested group, not the outer one.
2640
2641In Perl, unescaped square brackets may also appear as part of class names. For
2642example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2643[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2644seem right at all. PCRE does not allow closing square brackets in POSIX class
2645names.
2646
2647Arguments:
2648  ptr      pointer to the initial [
2649  endptr   where to return a pointer to the terminating ':', '.', or '='
2650
2651Returns:   TRUE or FALSE
2652*/
2653
2654static BOOL
2655check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR *endptr)
2656{
2657PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2658terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2659
2660for (++ptr; *ptr != CHAR_NULL; ptr++)
2661  {
2662  if (*ptr == CHAR_BACKSLASH &&
2663      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2664    ptr++;
2665  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2666            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2667  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2668    {
2669    *endptr = ptr;
2670    return TRUE;
2671    }
2672  }
2673
2674return FALSE;
2675}
2676
2677
2678
2679/*************************************************
2680*          Check POSIX class name                *
2681*************************************************/
2682
2683/* This function is called to check the name given in a POSIX-style class entry
2684such as [:alnum:].
2685
2686Arguments:
2687  ptr        points to the first letter
2688  len        the length of the name
2689
2690Returns:     a value representing the name, or -1 if unknown
2691*/
2692
2693static int
2694check_posix_name(PCRE2_SPTR ptr, int len)
2695{
2696const char *pn = posix_names;
2697register int yield = 0;
2698while (posix_name_lengths[yield] != 0)
2699  {
2700  if (len == posix_name_lengths[yield] &&
2701    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2702  pn += posix_name_lengths[yield] + 1;
2703  yield++;
2704  }
2705return -1;
2706}
2707
2708
2709
2710#ifdef SUPPORT_UNICODE
2711/*************************************************
2712*           Get othercase range                  *
2713*************************************************/
2714
/* This function is passed the start and end of a class range in UTF mode. It
2716searches up the characters, looking for ranges of characters in the "other"
2717case. Each call returns the next one, updating the start address. A character
2718with multiple other cases is returned on its own with a special return value.
2719
2720Arguments:
2721  cptr        points to starting character value; updated
2722  d           end value
2723  ocptr       where to put start of othercase range
2724  odptr       where to put end of othercase range
2725
2726Yield:        -1 when no more
2727               0 when a range is returned
2728              >0 the CASESET offset for char with multiple other cases
2729                in this case, ocptr contains the original
2730*/
2731
static int
get_othercase_range(uint32_t *cptr, uint32_t d, uint32_t *ocptr,
  uint32_t *odptr)
{
uint32_t ch = *cptr;
uint32_t first_oc = 0;
uint32_t expected;

/* Step 1: walk forward to the first character that has any other case. A
character that belongs to a caseless set is handed back on its own, with the
input position moved just past it. */

while (ch <= d)
  {
  unsigned int caseset = UCD_CASESET(ch);

  if (caseset != 0)
    {
    *ocptr = ch;        /* Character that has the set */
    *cptr = ch + 1;     /* Rest of input range */
    return (int)caseset;
    }

  first_oc = UCD_OTHERCASE(ch);
  if (first_oc != ch) break;
  ch++;
  }

if (ch > d) return -1;  /* Nothing with an other case before the end */

/* Step 2: ch has exactly one other case. Keep going while each following
character has no caseless set and its other case is the next value in
sequence, or until the input range runs out. */

*ocptr = first_oc;
expected = first_oc + 1;

for (ch++; ch <= d; ch++, expected++)
  {
  if (UCD_CASESET(ch) != 0) break;
  if (UCD_OTHERCASE(ch) != expected) break;
  }

*odptr = expected - 1;  /* Last character of the othercase range */
*cptr = ch;             /* Where the next call should resume */
return 0;
}
2772#endif  /* SUPPORT_UNICODE */
2773
2774
2775
2776/*************************************************
2777*        Add a character or range to a class     *
2778*************************************************/
2779
2780/* This function packages up the logic of adding a character or range of
2781characters to a class. The character values in the arguments will be within the
2782valid values for the current mode (8-bit, 16-bit, UTF, etc). This function is
2783mutually recursive with the function immediately below.
2784
2785Arguments:
2786  classbits     the bit map for characters < 256
2787  uchardptr     points to the pointer for extra data
2788  options       the options word
2789  cb            compile data
2790  start         start of range character
2791  end           end of range character
2792
2793Returns:        the number of < 256 characters added
2794                the pointer to extra data is updated
2795*/
2796
static unsigned int
add_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
  compile_block *cb, uint32_t start, uint32_t end)
{
uint32_t c;

/* classbits_end is the last value in the range that is handled via the bit
map; it is never greater than 255. */

uint32_t classbits_end = (end <= 0xff ? end : 0xff);

/* n8 is incremented every time a bit is set, both for the original range and
for other-case values, so the same character can be counted more than once. */

unsigned int n8 = 0;

/* If caseless matching is required, scan the range and process alternate
cases. In Unicode, there are 8-bit characters that have alternate cases that
are greater than 255 and vice-versa. Sometimes we can just extend the original
range. */

if ((options & PCRE2_CASELESS) != 0)
  {
#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    int rc;
    uint32_t oc, od;

    options &= ~PCRE2_CASELESS;   /* Remove for recursive calls */
    c = start;

    /* Each call to get_othercase_range() advances c through the range and
    yields either a single character with a caseless set (rc > 0, character in
    oc) or a range oc-od of other-case characters (rc == 0). */

    while ((rc = get_othercase_range(&c, end, &oc, &od)) >= 0)
      {
      /* Handle a single character that has more than one other case. */

      if (rc > 0) n8 += add_list_to_class(classbits, uchardptr, options, cb,
        PRIV(ucd_caseless_sets) + rc, oc);

      /* Do nothing if the other case range is within the original range. */

      else if (oc >= start && od <= end) continue;

      /* Extend the original range if there is overlap, noting that if oc < c, we
      can't have od > end because a subrange is always shorter than the basic
      range. Otherwise, use a recursive call to add the additional range. */

      else if (oc < start && od >= start - 1) start = oc; /* Extend downwards */
      else if (od > end && oc <= end + 1)
        {
        end = od;       /* Extend upwards */
        if (end > classbits_end) classbits_end = (end <= 0xff ? end : 0xff);
        }
      else n8 += add_to_class(classbits, uchardptr, options, cb, oc, od);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Not UTF mode. Only characters that fit in the bit map are processed: for
  each one, set the bit that the cb->fcc table gives for it (presumably its
  other case - the table is set up elsewhere). When SUPPORT_UNICODE is defined,
  this loop is the body of the "else" above. */

  for (c = start; c <= classbits_end; c++)
    {
    SETBIT(classbits, cb->fcc[c]);
    n8++;
    }
  }

/* Now handle the original range. Adjust the final value according to the bit
length - this means that the same lists of (e.g.) horizontal spaces can be used
in all cases. */

if ((options & PCRE2_UTF) == 0 && end > MAX_NON_UTF_CHAR)
  end = MAX_NON_UTF_CHAR;

/* Use the bitmap for characters < 256. Otherwise use extra data. */

for (c = start; c <= classbits_end; c++)
  {
  /* Regardless of start, c will always be <= 255. */
  SETBIT(classbits, c);
  n8++;
  }

#ifdef SUPPORT_WIDE_CHARS

/* Everything up to 255 has been dealt with by the bit map, so only the part of
the range above 255 (if any) needs to be written as extra data. */

if (start <= 0xff) start = 0xff + 1;

if (end >= start)
  {
  PCRE2_UCHAR *uchardata = *uchardptr;

#ifdef SUPPORT_UNICODE
  if ((options & PCRE2_UTF) != 0)
    {
    /* In UTF mode the values are written in UTF-encoded form, as XCL_RANGE
    followed by both ends, or XCL_SINGLE followed by the one character. */

    if (start < end)
      {
      *uchardata++ = XCL_RANGE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      uchardata += PRIV(ord2utf)(end, uchardata);
      }
    else if (start == end)
      {
      *uchardata++ = XCL_SINGLE;
      uchardata += PRIV(ord2utf)(start, uchardata);
      }
    }
  else
#endif  /* SUPPORT_UNICODE */

  /* Without UTF support, character values are constrained by the bit length,
  and can only be > 256 for 16-bit and 32-bit libraries. In the 8-bit library
  nothing is written; the empty block just completes the "else" above when
  SUPPORT_UNICODE is defined. */

#if PCRE2_CODE_UNIT_WIDTH == 8
    {}
#else
  if (start < end)
    {
    *uchardata++ = XCL_RANGE;
    *uchardata++ = start;
    *uchardata++ = end;
    }
  else if (start == end)
    {
    *uchardata++ = XCL_SINGLE;
    *uchardata++ = start;
    }
#endif
  *uchardptr = uchardata;   /* Update extra data pointer */
  }
#else
  (void)uchardptr;          /* Avoid compiler warning */
#endif /* SUPPORT_WIDE_CHARS */

return n8;    /* Number of 8-bit characters */
}
2924
2925
2926
2927/*************************************************
2928*        Add a list of characters to a class     *
2929*************************************************/
2930
2931/* This function is used for adding a list of case-equivalent characters to a
2932class, and also for adding a list of horizontal or vertical whitespace. If the
2933list is in order (which it should be), ranges of characters are detected and
2934handled appropriately. This function is mutually recursive with the function
2935above.
2936
2937Arguments:
2938  classbits     the bit map for characters < 256
2939  uchardptr     points to the pointer for extra data
2940  options       the options word
2941  cb            contains pointers to tables etc.
2942  p             points to row of 32-bit values, terminated by NOTACHAR
2943  except        character to omit; this is used when adding lists of
2944                  case-equivalent characters to avoid including the one we
2945                  already know about
2946
2947Returns:        the number of < 256 characters added
2948                the pointer to extra data is updated
2949*/
2950
2951static unsigned int
2952add_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr, uint32_t options,
2953  compile_block *cb, const uint32_t *p, unsigned int except)
2954{
2955unsigned int n8 = 0;
2956while (p[0] < NOTACHAR)
2957  {
2958  unsigned int n = 0;
2959  if (p[0] != except)
2960    {
2961    while(p[n+1] == p[0] + n + 1) n++;
2962    n8 += add_to_class(classbits, uchardptr, options, cb, p[0], p[n]);
2963    }
2964  p += n + 1;
2965  }
2966return n8;
2967}
2968
2969
2970
2971/*************************************************
2972*    Add characters not in a list to a class     *
2973*************************************************/
2974
2975/* This function is used for adding the complement of a list of horizontal or
2976vertical whitespace to a class. The list must be in order.
2977
2978Arguments:
2979  classbits     the bit map for characters < 256
2980  uchardptr     points to the pointer for extra data
2981  options       the options word
2982  cb            contains pointers to tables etc.
2983  p             points to row of 32-bit values, terminated by NOTACHAR
2984
2985Returns:        the number of < 256 characters added
2986                the pointer to extra data is updated
2987*/
2988
2989static unsigned int
2990add_not_list_to_class(uint8_t *classbits, PCRE2_UCHAR **uchardptr,
2991  uint32_t options, compile_block *cb, const uint32_t *p)
2992{
2993BOOL utf = (options & PCRE2_UTF) != 0;
2994unsigned int n8 = 0;
2995if (p[0] > 0)
2996  n8 += add_to_class(classbits, uchardptr, options, cb, 0, p[0] - 1);
2997while (p[0] < NOTACHAR)
2998  {
2999  while (p[1] == p[0] + 1) p++;
3000  n8 += add_to_class(classbits, uchardptr, options, cb, p[0] + 1,
3001    (p[1] == NOTACHAR) ? (utf ? 0x10ffffu : 0xffffffffu) : p[1] - 1);
3002  p++;
3003  }
3004return n8;
3005}
3006
3007
3008
3009/*************************************************
3010*       Process (*VERB) name for escapes         *
3011*************************************************/
3012
3013/* This function is called when the PCRE2_ALT_VERBNAMES option is set, to
3014process the characters in a verb's name argument. It is called twice, once with
3015codeptr == NULL, to find out the length of the processed name, and again to put
3016the name into memory.
3017
3018Arguments:
3019  ptrptr        pointer to the input pointer
3020  codeptr       pointer to the compiled code pointer
3021  errorcodeptr  pointer to the error code
3022  options       the options bits
3023  utf           TRUE if processing UTF
3024  cb            compile data block
3025
3026Returns:        length of the processed name, or < 0 on error
3027*/
3028
3029static int
3030process_verb_name(PCRE2_SPTR *ptrptr, PCRE2_UCHAR **codeptr, int *errorcodeptr,
3031  uint32_t options, BOOL utf, compile_block *cb)
3032{
3033int32_t arglen = 0;
3034BOOL inescq = FALSE;
3035PCRE2_SPTR ptr = *ptrptr;
3036PCRE2_UCHAR *code = (codeptr == NULL)? NULL : *codeptr;
3037
3038for (; ptr < cb->end_pattern; ptr++)
3039  {
3040  uint32_t x = *ptr;
3041
3042  /* Skip over literals */
3043
3044  if (inescq)
3045    {
3046    if (x == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3047      {
3048      inescq = FALSE;
3049      ptr++;;
3050      continue;
3051      }
3052    }
3053
3054  else  /* Not a literal character */
3055    {
3056    if (x == CHAR_RIGHT_PARENTHESIS) break;
3057
3058    /* Skip over comments and whitespace in extended mode. */
3059
3060    if ((options & PCRE2_EXTENDED) != 0)
3061      {
3062      PCRE2_SPTR wscptr = ptr;
3063      while (MAX_255(x) && (cb->ctypes[x] & ctype_space) != 0) x = *(++ptr);
3064      if (x == CHAR_NUMBER_SIGN)
3065        {
3066        ptr++;
3067        while (*ptr != CHAR_NULL || ptr < cb->end_pattern)
3068          {
3069          if (IS_NEWLINE(ptr))       /* For non-fixed-length newline cases, */
3070            {                        /* IS_NEWLINE sets cb->nllen. */
3071            ptr += cb->nllen;
3072            break;
3073            }
3074          ptr++;
3075#ifdef SUPPORT_UNICODE
3076          if (utf) FORWARDCHAR(ptr);
3077#endif
3078          }
3079        }
3080
3081      /* If we have skipped any characters, restart the loop. */
3082
3083      if (ptr > wscptr)
3084        {
3085        ptr--;
3086        continue;
3087        }
3088      }
3089
3090    /* Process escapes */
3091
3092    if (x == '\\')
3093      {
3094      int rc;
3095      *errorcodeptr = 0;
3096      rc = PRIV(check_escape)(&ptr, cb->end_pattern, &x, errorcodeptr, options,
3097        FALSE, cb);
3098      *ptrptr = ptr;   /* For possible error */
3099      if (*errorcodeptr != 0) return -1;
3100      if (rc != 0)
3101        {
3102        if (rc == ESC_Q)
3103          {
3104          inescq = TRUE;
3105          continue;
3106          }
3107        if (rc == ESC_E) continue;
3108        *errorcodeptr = ERR40;
3109        return -1;
3110        }
3111      }
3112    }
3113
3114  /* We have the next character in the name. */
3115
3116#ifdef SUPPORT_UNICODE
3117  if (utf)
3118    {
3119    if (code == NULL)   /* Just want the length */
3120      {
3121#if PCRE2_CODE_UNIT_WIDTH == 8
3122      int i;
3123      for (i = 0; i < PRIV(utf8_table1_size); i++)
3124        if ((int)x <= PRIV(utf8_table1)[i]) break;
3125      arglen += i;
3126#elif PCRE2_CODE_UNIT_WIDTH == 16
3127      if (x > 0xffff) arglen++;
3128#endif
3129      }
3130    else
3131      {
3132      PCRE2_UCHAR cbuff[8];
3133      x = PRIV(ord2utf)(x, cbuff);
3134      memcpy(code, cbuff, CU2BYTES(x));
3135      code += x;
3136      }
3137    }
3138  else
3139#endif  /* SUPPORT_UNICODE */
3140
3141  /* Not UTF */
3142    {
3143    if (code != NULL) *code++ = (PCRE2_UCHAR)x;
3144    }
3145
3146  arglen++;
3147
3148  if ((unsigned int)arglen > MAX_MARK)
3149    {
3150    *errorcodeptr = ERR76;
3151    *ptrptr = ptr;
3152    return -1;
3153    }
3154  }
3155
3156/* Update the pointers before returning. */
3157
3158*ptrptr = ptr;
3159if (codeptr != NULL) *codeptr = code;
3160return arglen;
3161}
3162
3163
3164
3165/*************************************************
3166*          Macro for the next two functions      *
3167*************************************************/
3168
/* This macro is used by both scan_for_captures() and compile_branch(), and
also for reading *VERB names. It expands to a fragment of code that steps ptr
over the characters of a name, leaving the number of characters in namelen.
The count is kept as the scan proceeds and checked at every step, so a name
that is too long is diagnosed without any risk of integer overflow: the given
error code is stored in the given variable and control passes to FAILED.

The expansion is two statements, not one, and it relies on ptr, namelen, cb,
and a FAILED label being available where it is used. */

#define READ_NAME(ctype, errcode, errvar)                    \
  namelen = 0;                                               \
  while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype) != 0)   \
    {                                                        \
    ptr++;                                                   \
    if (++namelen > MAX_NAME_SIZE)                           \
      {                                                      \
      errvar = errcode;                                      \
      goto FAILED;                                           \
      }                                                      \
    }
3187
3188
3189
3190/*************************************************
3191*      Scan regex to identify named groups       *
3192*************************************************/
3193
3194/* This function is called first of all, to scan for named capturing groups so
3195that information about them is fully available to both the compiling scans.
3196It skips over everything except parenthesized items.
3197
3198Arguments:
3199  ptrptr   points to pointer to the start of the pattern
3200  options  compiling dynamic options
3201  cb       pointer to the compile data block
3202
3203Returns:   zero on success or a non-zero error code, with pointer updated
3204*/
3205
/* One entry in the stack of nest blocks that scan_for_captures() keeps in the
compile workspace. A block is created for each (?| group and for each group
that starts with option letters. */

typedef struct nest_save {
  uint16_t  nest_depth;    /* Parenthesis nesting depth the block applies to */
  uint16_t  reset_group;   /* cb->bracount at the start of a (?| group; zero
                              for an options block */
  uint16_t  max_group;     /* Starts the same as reset_group; presumably the
                              highest group number seen - confirm in the rest
                              of scan_for_captures() */
  uint16_t  flags;         /* NSF_xxx bits, as defined below */
} nest_save;

#define NSF_RESET    0x0001u  /* Block is for a (?| group */
#define NSF_EXTENDED 0x0002u  /* PCRE2_EXTENDED was set when block was made */
#define NSF_DUPNAMES 0x0004u  /* PCRE2_DUPNAMES was set when block was made */
3216
3217static int scan_for_captures(PCRE2_SPTR *ptrptr, uint32_t options,
3218  compile_block *cb)
3219{
3220uint32_t c;
3221uint32_t delimiter;
3222uint32_t set, unset, *optset;
3223uint32_t skiptoket = 0;
3224uint16_t nest_depth = 0;
3225int errorcode = 0;
3226int escape;
3227int namelen;
3228int i;
3229BOOL inescq = FALSE;
3230BOOL isdupname;
3231BOOL utf = (options & PCRE2_UTF) != 0;
3232BOOL negate_class;
3233PCRE2_SPTR name;
3234PCRE2_SPTR start;
3235PCRE2_SPTR ptr = *ptrptr;
3236named_group *ng;
3237nest_save *top_nest = NULL;
3238nest_save *end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3239
3240/* The size of the nest_save structure might not be a factor of the size of the
3241workspace. Therefore we must round down end_nests so as to correctly avoid
3242creating a nest_save that spans the end of the workspace. */
3243
3244end_nests = (nest_save *)((char *)end_nests -
3245  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3246
3247/* Now scan the pattern */
3248
3249for (; ptr < cb->end_pattern; ptr++)
3250  {
3251  c = *ptr;
3252
3253  /* Parenthesized groups set skiptoket when all following characters up to the
3254  next closing parenthesis must be ignored. The parenthesis itself must be
3255  processed (to end the nested parenthesized item). */
3256
3257  if (skiptoket != 0)
3258    {
3259    if (c != CHAR_RIGHT_PARENTHESIS) continue;
3260    skiptoket = 0;
3261    }
3262
3263  /* Skip over literals */
3264
3265  if (inescq)
3266    {
3267    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3268      {
3269      inescq = FALSE;
3270      ptr++;
3271      }
3272    continue;
3273    }
3274
3275  /* Skip over # comments and whitespace in extended mode. */
3276
3277  if ((options & PCRE2_EXTENDED) != 0)
3278    {
3279    PCRE2_SPTR wscptr = ptr;
3280    while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
3281    if (c == CHAR_NUMBER_SIGN)
3282      {
3283      ptr++;
3284      while (ptr < cb->end_pattern)
3285        {
3286        if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
3287          {                          /* IS_NEWLINE sets cb->nllen. */
3288          ptr += cb->nllen;
3289          break;
3290          }
3291        ptr++;
3292#ifdef SUPPORT_UNICODE
3293        if (utf) FORWARDCHAR(ptr);
3294#endif
3295        }
3296      }
3297
3298    /* If we skipped any characters, restart the loop. Otherwise, we didn't see
3299    a comment. */
3300
3301    if (ptr > wscptr)
3302      {
3303      ptr--;
3304      continue;
3305      }
3306    }
3307
3308  /* Process the next pattern item. */
3309
3310  switch(c)
3311    {
3312    default:              /* Most characters are just skipped */
3313    break;
3314
3315    /* Skip escapes except for \Q */
3316
3317    case CHAR_BACKSLASH:
3318    errorcode = 0;
3319    escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode, options,
3320      FALSE, cb);
3321    if (errorcode != 0) goto FAILED;
3322    if (escape == ESC_Q) inescq = TRUE;
3323    break;
3324
3325    /* Skip a character class. The syntax is complicated so we have to
3326    replicate some of what happens when a class is processed for real. */
3327
3328    case CHAR_LEFT_SQUARE_BRACKET:
3329    if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0 ||
3330        PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
3331      {
3332      ptr += 6;
3333      break;
3334      }
3335
3336    /* If the first character is '^', set the negation flag (not actually used
3337    here, except to recognize only one ^) and skip it. If the first few
3338    characters (either before or after ^) are \Q\E or \E we skip them too. This
3339    makes for compatibility with Perl. */
3340
3341    negate_class = FALSE;
3342    for (;;)
3343      {
3344      c = *(++ptr);   /* First character in class */
3345      if (c == CHAR_BACKSLASH)
3346        {
3347        if (ptr[1] == CHAR_E)
3348          ptr++;
3349        else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3350          ptr += 3;
3351        else
3352          break;
3353        }
3354      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3355        negate_class = TRUE;
3356      else break;
3357      }
3358
3359    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3360        (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
3361      break;
3362
3363    /* Loop for the contents of the class */
3364
3365    for (;;)
3366      {
3367      PCRE2_SPTR tempptr;
3368
3369      if (c == CHAR_NULL && ptr >= cb->end_pattern)
3370        {
3371        errorcode = ERR6;  /* Missing terminating ']' */
3372        goto FAILED;
3373        }
3374
3375#ifdef SUPPORT_UNICODE
3376      if (utf && HAS_EXTRALEN(c))
3377        {                           /* Braces are required because the */
3378        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3379        }
3380#endif
3381
3382      /* Inside \Q...\E everything is literal except \E */
3383
3384      if (inescq)
3385        {
3386        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3387          {
3388          inescq = FALSE;                   /* Reset literal state */
3389          ptr++;                            /* Skip the 'E' */
3390          }
3391        goto CONTINUE_CLASS;
3392        }
3393
3394      /* Skip POSIX class names. */
3395      if (c == CHAR_LEFT_SQUARE_BRACKET &&
3396          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3397           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3398        {
3399        ptr = tempptr + 1;
3400        }
3401      else if (c == CHAR_BACKSLASH)
3402        {
3403        errorcode = 0;
3404        escape = PRIV(check_escape)(&ptr, cb->end_pattern, &c, &errorcode,
3405          options, TRUE, cb);
3406        if (errorcode != 0) goto FAILED;
3407        if (escape == ESC_Q) inescq = TRUE;
3408        }
3409
3410      CONTINUE_CLASS:
3411      c = *(++ptr);
3412      if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
3413      }     /* End of class-processing loop */
3414    break;
3415
3416    /* This is the real work of this function - handling parentheses. */
3417
3418    case CHAR_LEFT_PARENTHESIS:
3419    nest_depth++;
3420
3421    if (ptr[1] != CHAR_QUESTION_MARK)
3422      {
3423      if (ptr[1] != CHAR_ASTERISK)
3424        {
3425        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0) cb->bracount++;
3426        }
3427
3428      /* (*something) - skip over a name, and then just skip to closing ket
3429      unless PCRE2_ALT_VERBNAMES is set, in which case we have to process
3430      escapes in the string after a verb name terminated by a colon. */
3431
3432      else
3433        {
3434        ptr += 2;
3435        while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) ptr++;
3436        if (*ptr == CHAR_COLON && (options & PCRE2_ALT_VERBNAMES) != 0)
3437          {
3438          ptr++;
3439          if (process_verb_name(&ptr, NULL, &errorcode, options, utf, cb) < 0)
3440            goto FAILED;
3441          }
3442        else
3443          {
3444          while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
3445            ptr++;
3446          }
3447        nest_depth--;
3448        }
3449      }
3450
3451    /* Handle (?...) groups */
3452
3453    else switch(ptr[2])
3454      {
3455      default:
3456      ptr += 2;
3457      if (ptr[0] == CHAR_R ||                           /* (?R) */
3458          ptr[0] == CHAR_NUMBER_SIGN ||                 /* (?#) */
3459          IS_DIGIT(ptr[0]) ||                           /* (?n) */
3460          (ptr[0] == CHAR_MINUS && IS_DIGIT(ptr[1])))   /* (?-n) */
3461        {
3462        skiptoket = ptr[0];
3463        break;
3464        }
3465
3466      /* Handle (?| and (?imsxJU: which are the only other valid forms. Both
3467      need a new block on the nest stack. */
3468
3469      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
3470      else if (++top_nest >= end_nests)
3471        {
3472        errorcode = ERR84;
3473        goto FAILED;
3474        }
3475      top_nest->nest_depth = nest_depth;
3476      top_nest->flags = 0;
3477      if ((options & PCRE2_EXTENDED) != 0) top_nest->flags |= NSF_EXTENDED;
3478      if ((options & PCRE2_DUPNAMES) != 0) top_nest->flags |= NSF_DUPNAMES;
3479
3480      if (*ptr == CHAR_VERTICAL_LINE)
3481        {
3482        top_nest->reset_group = (uint16_t)cb->bracount;
3483        top_nest->max_group = (uint16_t)cb->bracount;
3484        top_nest->flags |= NSF_RESET;
3485        cb->external_flags |= PCRE2_DUPCAPUSED;
3486        break;
3487        }
3488
3489      /* Scan options */
3490
3491      top_nest->reset_group = 0;
3492      top_nest->max_group = 0;
3493
3494      set = unset = 0;
3495      optset = &set;
3496
3497      /* Need only track (?x: and (?J: at this stage */
3498
3499      while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
3500        {
3501        switch (*ptr++)
3502          {
3503          case CHAR_MINUS: optset = &unset; break;
3504
3505          case CHAR_x: *optset |= PCRE2_EXTENDED; break;
3506
3507          case CHAR_J:
3508          *optset |= PCRE2_DUPNAMES;
3509          cb->external_flags |= PCRE2_JCHANGED;
3510          break;
3511
3512          case CHAR_i:
3513          case CHAR_m:
3514          case CHAR_s:
3515          case CHAR_U:
3516          break;
3517
3518          default:
3519          errorcode = ERR11;
3520          ptr--;    /* Correct the offset */
3521          goto FAILED;
3522          }
3523        }
3524
3525      options = (options | set) & (~unset);
3526
3527      /* If the options ended with ')' this is not the start of a nested
3528      group with option changes, so the options change at this level. If the
3529      previous level set up a nest block, discard the one we have just created.
3530      Otherwise adjust it for the previous level. */
3531
3532      if (*ptr == CHAR_RIGHT_PARENTHESIS)
3533        {
3534        nest_depth--;
3535        if (top_nest > (nest_save *)(cb->start_workspace) &&
3536            (top_nest-1)->nest_depth == nest_depth) top_nest --;
3537        else top_nest->nest_depth = nest_depth;
3538        }
3539      break;
3540
3541      /* Skip over a numerical or string argument for a callout. */
3542
3543      case CHAR_C:
3544      ptr += 2;
3545      if (ptr[1] == CHAR_RIGHT_PARENTHESIS) break;
3546      if (IS_DIGIT(ptr[1]))
3547        {
3548        while (IS_DIGIT(ptr[1])) ptr++;
3549        }
3550
3551      /* Handle a string argument */
3552
3553      else
3554        {
3555        ptr++;
3556        delimiter = 0;
3557        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
3558          {
3559          if (*ptr == PRIV(callout_start_delims)[i])
3560            {
3561            delimiter = PRIV(callout_end_delims)[i];
3562            break;
3563            }
3564          }
3565
3566        if (delimiter == 0)
3567          {
3568          errorcode = ERR82;
3569          goto FAILED;
3570          }
3571
3572        start = ptr;
3573        do
3574          {
3575          if (++ptr >= cb->end_pattern)
3576            {
3577            errorcode = ERR81;
3578            ptr = start;   /* To give a more useful message */
3579            goto FAILED;
3580            }
3581          if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
3582          }
3583        while (ptr[0] != delimiter);
3584        }
3585
3586      /* Check terminating ) */
3587
3588      if (ptr[1] != CHAR_RIGHT_PARENTHESIS)
3589        {
3590        errorcode = ERR39;
3591        ptr++;
3592        goto FAILED;
3593        }
3594      break;
3595
3596      /* Conditional group */
3597
3598      case CHAR_LEFT_PARENTHESIS:
3599      if (ptr[3] != CHAR_QUESTION_MARK)   /* Not assertion or callout */
3600        {
3601        nest_depth++;
3602        ptr += 2;
3603        break;
3604        }
3605
3606      /* Must be an assertion or a callout */
3607
3608      switch(ptr[4])
3609       {
3610       case CHAR_LESS_THAN_SIGN:
3611       if (ptr[5] != CHAR_EXCLAMATION_MARK && ptr[5] != CHAR_EQUALS_SIGN)
3612         goto MISSING_ASSERTION;
3613       /* Fall through */
3614
3615       case CHAR_C:
3616       case CHAR_EXCLAMATION_MARK:
3617       case CHAR_EQUALS_SIGN:
3618       ptr++;
3619       break;
3620
3621       default:
3622       MISSING_ASSERTION:
3623       ptr += 3;            /* To improve error message */
3624       errorcode = ERR28;
3625       goto FAILED;
3626       }
3627      break;
3628
3629      case CHAR_COLON:
3630      case CHAR_GREATER_THAN_SIGN:
3631      case CHAR_EQUALS_SIGN:
3632      case CHAR_EXCLAMATION_MARK:
3633      case CHAR_AMPERSAND:
3634      case CHAR_PLUS:
3635      ptr += 2;
3636      break;
3637
3638      case CHAR_P:
3639      if (ptr[3] != CHAR_LESS_THAN_SIGN)
3640        {
3641        ptr += 3;
3642        break;
3643        }
3644      ptr++;
3645      c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
3646      goto DEFINE_NAME;
3647
3648      case CHAR_LESS_THAN_SIGN:
3649      if (ptr[3] == CHAR_EQUALS_SIGN || ptr[3] == CHAR_EXCLAMATION_MARK)
3650        {
3651        ptr += 3;
3652        break;
3653        }
3654      c = CHAR_GREATER_THAN_SIGN;   /* Terminator */
3655      goto DEFINE_NAME;
3656
3657      case CHAR_APOSTROPHE:
3658      c = CHAR_APOSTROPHE;    /* Terminator */
3659
3660      DEFINE_NAME:
3661      name = ptr = ptr + 3;
3662
3663      if (*ptr == c)          /* Empty name */
3664        {
3665        errorcode = ERR62;
3666        goto FAILED;
3667        }
3668
3669      if (IS_DIGIT(*ptr))
3670        {
3671        errorcode = ERR44;   /* Group name must start with non-digit */
3672        goto FAILED;
3673        }
3674
3675      if (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) == 0)
3676        {
3677        errorcode = ERR24;
3678        goto FAILED;
3679        }
3680
3681      /* Advance ptr, set namelen and check its length. */
3682      READ_NAME(ctype_word, ERR48, errorcode);
3683
3684      if (*ptr != c)
3685        {
3686        errorcode = ERR42;
3687        goto FAILED;
3688        }
3689
3690      if (cb->names_found >= MAX_NAME_COUNT)
3691        {
3692        errorcode = ERR49;
3693        goto FAILED;
3694        }
3695
3696      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
3697        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
3698
3699      /* We have a valid name for this capturing group. */
3700
3701      cb->bracount++;
3702
3703      /* Scan the list to check for duplicates. For duplicate names, if the
3704      number is the same, break the loop, which causes the name to be
3705      discarded; otherwise, if DUPNAMES is not set, give an error.
3706      If it is set, allow the name with a different number, but continue
3707      scanning in case this is a duplicate with the same number. For
3708      non-duplicate names, give an error if the number is duplicated. */
3709
3710      isdupname = FALSE;
3711      ng = cb->named_groups;
3712      for (i = 0; i < cb->names_found; i++, ng++)
3713        {
3714        if (namelen == ng->length &&
3715            PRIV(strncmp)(name, ng->name, (size_t)namelen) == 0)
3716          {
3717          if (ng->number == cb->bracount) break;
3718          if ((options & PCRE2_DUPNAMES) == 0)
3719            {
3720            errorcode = ERR43;
3721            goto FAILED;
3722            }
3723          isdupname = ng->isdup = TRUE;     /* Mark as a duplicate */
3724          cb->dupnames = TRUE;              /* Duplicate names exist */
3725          }
3726        else if (ng->number == cb->bracount)
3727          {
3728          errorcode = ERR65;
3729          goto FAILED;
3730          }
3731        }
3732
3733      if (i < cb->names_found) break;   /* Ignore duplicate with same number */
3734
3735      /* Increase the list size if necessary */
3736
3737      if (cb->names_found >= cb->named_group_list_size)
3738        {
3739        uint32_t newsize = cb->named_group_list_size * 2;
3740        named_group *newspace =
3741          cb->cx->memctl.malloc(newsize * sizeof(named_group),
3742          cb->cx->memctl.memory_data);
3743        if (newspace == NULL)
3744          {
3745          errorcode = ERR21;
3746          goto FAILED;
3747          }
3748
3749        memcpy(newspace, cb->named_groups,
3750          cb->named_group_list_size * sizeof(named_group));
3751        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
3752          cb->cx->memctl.free((void *)cb->named_groups,
3753          cb->cx->memctl.memory_data);
3754        cb->named_groups = newspace;
3755        cb->named_group_list_size = newsize;
3756        }
3757
3758      /* Add this name to the list */
3759
3760      cb->named_groups[cb->names_found].name = name;
3761      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
3762      cb->named_groups[cb->names_found].number = cb->bracount;
3763      cb->named_groups[cb->names_found].isdup = (uint16_t)isdupname;
3764      cb->names_found++;
3765      break;
3766      }        /* End of (? switch */
3767    break;     /* End of ( handling */
3768
3769    /* At an alternation, reset the capture count if we are in a (?| group. */
3770
3771    case CHAR_VERTICAL_LINE:
3772    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
3773        (top_nest->flags & NSF_RESET) != 0)
3774      {
3775      if (cb->bracount > top_nest->max_group)
3776        top_nest->max_group = (uint16_t)cb->bracount;
3777      cb->bracount = top_nest->reset_group;
3778      }
3779    break;
3780
3781    /* At a right parenthesis, reset the capture count to the maximum if we
3782    are in a (?| group and/or reset the extended option. */
3783
3784    case CHAR_RIGHT_PARENTHESIS:
3785    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
3786      {
3787      if ((top_nest->flags & NSF_RESET) != 0 &&
3788          top_nest->max_group > cb->bracount)
3789        cb->bracount = top_nest->max_group;
3790      if ((top_nest->flags & NSF_EXTENDED) != 0) options |= PCRE2_EXTENDED;
3791        else options &= ~PCRE2_EXTENDED;
3792      if ((top_nest->flags & NSF_DUPNAMES) != 0) options |= PCRE2_DUPNAMES;
3793        else options &= ~PCRE2_DUPNAMES;
3794      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
3795        else top_nest--;
3796      }
3797    if (nest_depth == 0)    /* Unmatched closing parenthesis */
3798      {
3799      errorcode = ERR22;
3800      goto FAILED;
3801      }
3802    nest_depth--;
3803    break;
3804    }
3805  }
3806
3807if (nest_depth == 0)
3808  {
3809  cb->final_bracount = cb->bracount;
3810  return 0;
3811  }
3812
3813/* We give a special error for a missing closing parenthesis after (?# because
3814it might otherwise be hard to see where the missing character is. */
3815
3816errorcode = (skiptoket == CHAR_NUMBER_SIGN)? ERR18 : ERR14;
3817
3818FAILED:
3819*ptrptr = ptr;
3820return errorcode;
3821}
3822
3823
3824
3825/*************************************************
3826*           Compile one branch                   *
3827*************************************************/
3828
3829/* Scan the pattern, compiling it into a vector. If the options are
3830changed during the branch, the pointer is used to change the external options
3831bits. This function is used during the pre-compile phase when we are trying
3832to find out the amount of memory needed, as well as during the real compile
3833phase. The value of lengthptr distinguishes the two phases.
3834
3835Arguments:
3836  optionsptr        pointer to the option bits
3837  codeptr           points to the pointer to the current code point
3838  ptrptr            points to the current pattern pointer
3839  errorcodeptr      points to error code variable
3840  firstcuptr        place to put the first required code unit
3841  firstcuflagsptr   place to put the first code unit flags, or a negative number
3842  reqcuptr          place to put the last required code unit
3843  reqcuflagsptr     place to put the last required code unit flags, or a negative number
3844  bcptr             points to current branch chain
3845  cond_depth        conditional nesting depth
3846  cb                contains pointers to tables etc.
3847  lengthptr         NULL during the real compile phase
3848                    points to length accumulator during pre-compile phase
3849
3850Returns:            TRUE on success
3851                    FALSE, with *errorcodeptr set non-zero on error
3852*/
3853
3854static BOOL
3855compile_branch(uint32_t *optionsptr, PCRE2_UCHAR **codeptr,
3856  PCRE2_SPTR *ptrptr, int *errorcodeptr,
3857  uint32_t *firstcuptr, int32_t *firstcuflagsptr,
3858  uint32_t *reqcuptr, int32_t *reqcuflagsptr,
3859  branch_chain *bcptr, int cond_depth,
3860  compile_block *cb, size_t *lengthptr)
3861{
3862int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3863int bravalue = 0;
3864uint32_t greedy_default, greedy_non_default;
3865uint32_t repeat_type, op_type;
3866uint32_t options = *optionsptr;               /* May change dynamically */
3867uint32_t firstcu, reqcu;
3868int32_t firstcuflags, reqcuflags;
3869uint32_t zeroreqcu, zerofirstcu;
3870int32_t zeroreqcuflags, zerofirstcuflags;
3871int32_t req_caseopt, reqvary, tempreqvary;
3872int after_manual_callout = 0;
3873int escape;
3874size_t length_prevgroup = 0;
3875register uint32_t c;
3876register PCRE2_UCHAR *code = *codeptr;
3877PCRE2_UCHAR *last_code = code;
3878PCRE2_UCHAR *orig_code = code;
3879PCRE2_UCHAR *tempcode;
3880BOOL inescq = FALSE;
3881BOOL groupsetfirstcu = FALSE;
3882PCRE2_SPTR ptr = *ptrptr;
3883PCRE2_SPTR tempptr;
3884PCRE2_UCHAR *previous = NULL;
3885PCRE2_UCHAR *previous_callout = NULL;
3886uint8_t classbits[32];
3887
3888/* We can fish out the UTF setting once and for all into a BOOL, but we must
3889not do this for other options (e.g. PCRE2_EXTENDED) because they may change
3890dynamically as we process the pattern. */
3891
3892#ifdef SUPPORT_UNICODE
3893BOOL utf = (options & PCRE2_UTF) != 0;
3894#if PCRE2_CODE_UNIT_WIDTH != 32
3895PCRE2_UCHAR utf_units[6];      /* For setting up multi-cu chars */
3896#endif
3897
3898#else  /* No UTF support */
3899BOOL utf = FALSE;
3900#endif
3901
3902/* Helper variables for OP_XCLASS opcode (for characters > 255). We define
3903class_uchardata always so that it can be passed to add_to_class() always,
3904though it will not be used in non-UTF 8-bit cases. This avoids having to supply
3905alternative calls for the different cases. */
3906
3907PCRE2_UCHAR *class_uchardata;
3908#ifdef SUPPORT_WIDE_CHARS
3909BOOL xclass;
3910PCRE2_UCHAR *class_uchardata_base;
3911#endif
3912
3913/* Set up the default and non-default settings for greediness */
3914
3915greedy_default = ((options & PCRE2_UNGREEDY) != 0);
3916greedy_non_default = greedy_default ^ 1;
3917
3918/* Initialize no first unit, no required unit. REQ_UNSET means "no char
3919matching encountered yet". It gets changed to REQ_NONE if we hit something that
3920matches a non-fixed first unit; reqcu just remains unset if we never find one.
3921
3922When we hit a repeat whose minimum is zero, we may have to adjust these values
3923to take the zero repeat into account. This is implemented by setting them to
3924zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
3925item types that can be repeated set these backoff variables appropriately. */
3926
3927firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
3928firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
3929
3930/* The variable req_caseopt contains either the REQ_CASELESS value or zero,
3931according to the current setting of the caseless flag. The REQ_CASELESS value
3932leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
3933to record the case status of the value. This is used only for ASCII characters.
3934*/
3935
3936req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
3937
3938/* Switch on next character until the end of the branch */
3939
3940for (;; ptr++)
3941  {
3942  BOOL negate_class;
3943  BOOL should_flip_negation;
3944  BOOL match_all_or_no_wide_chars;
3945  BOOL possessive_quantifier;
3946  BOOL is_quantifier;
3947  BOOL is_recurse;
3948  BOOL is_dupname;
3949  BOOL reset_bracount;
3950  int class_has_8bitchar;
3951  int class_one_char;
3952#ifdef SUPPORT_WIDE_CHARS
3953  BOOL xclass_has_prop;
3954#endif
3955  int recno;                               /* Must be signed */
3956  int refsign;                             /* Must be signed */
3957  int terminator;                          /* Must be signed */
3958  unsigned int mclength;
3959  unsigned int tempbracount;
3960  uint32_t ec;
3961  uint32_t newoptions;
3962  uint32_t skipunits;
3963  uint32_t subreqcu, subfirstcu;
3964  int32_t subreqcuflags, subfirstcuflags;  /* Must be signed */
3965  PCRE2_UCHAR mcbuffer[8];
3966
3967  /* Come here to restart the loop. */
3968
3969  REDO_LOOP:
3970
3971  /* Get next character in the pattern */
3972
3973  c = *ptr;
3974
3975  /* If we are at the end of a nested substitution, revert to the outer level
3976  string. Nesting only happens one or two levels deep, and the inserted string
3977  is always zero terminated. */
3978
3979  if (c == CHAR_NULL && cb->nestptr[0] != NULL)
3980    {
3981    ptr = cb->nestptr[0];
3982    cb->nestptr[0] = cb->nestptr[1];
3983    cb->nestptr[1] = NULL;
3984    c = *ptr;
3985    }
3986
3987  /* If we are in the pre-compile phase, accumulate the length used for the
3988  previous cycle of this loop. */
3989
3990  if (lengthptr != NULL)
3991    {
3992    if (code > cb->start_workspace + cb->workspace_size -
3993        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3994      {
3995      *errorcodeptr = (code >= cb->start_workspace + cb->workspace_size)?
3996        ERR52 : ERR86;
3997      goto FAILED;
3998      }
3999
4000    /* There is at least one situation where code goes backwards: this is the
4001    case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
4002    the class is simply eliminated. However, it is created first, so we have to
4003    allow memory for it. Therefore, don't ever reduce the length at this point.
4004    */
4005
4006    if (code < last_code) code = last_code;
4007
4008    /* Paranoid check for integer overflow */
4009
4010    if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
4011      {
4012      *errorcodeptr = ERR20;
4013      goto FAILED;
4014      }
4015    *lengthptr += (size_t)(code - last_code);
4016
4017    /* If "previous" is set and it is not at the start of the work space, move
4018    it back to there, in order to avoid filling up the work space. Otherwise,
4019    if "previous" is NULL, reset the current code pointer to the start. */
4020
4021    if (previous != NULL)
4022      {
4023      if (previous > orig_code)
4024        {
4025        memmove(orig_code, previous, (size_t)CU2BYTES(code - previous));
4026        code -= previous - orig_code;
4027        previous = orig_code;
4028        }
4029      }
4030    else code = orig_code;
4031
4032    /* Remember where this code item starts so we can pick up the length
4033    next time round. */
4034
4035    last_code = code;
4036    }
4037
4038  /* Before doing anything else we must handle all the special items that do
4039  nothing, and which may come between an item and its quantifier. Otherwise,
4040  when auto-callouts are enabled, a callout gets incorrectly inserted before
4041  the quantifier is recognized. After recognizing a "do nothing" item, restart
4042  the loop in case another one follows. */
4043
4044  /* If c is not NULL we are not at the end of the pattern. If it is NULL, we
4045  may still be in the pattern with a NULL data item. In these cases, if we are
4046  in \Q...\E, check for the \E that ends the literal string; if not, we have a
4047  literal character. If not in \Q...\E, an isolated \E is ignored. */
4048
4049  if (c != CHAR_NULL || ptr < cb->end_pattern)
4050    {
4051    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4052      {
4053      inescq = FALSE;
4054      ptr++;
4055      continue;
4056      }
4057    else if (inescq)   /* Literal character */
4058      {
4059      if (previous_callout != NULL)
4060        {
4061        if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
4062          complete_callout(previous_callout, ptr, cb);
4063        previous_callout = NULL;
4064        }
4065      if ((options & PCRE2_AUTO_CALLOUT) != 0)
4066        {
4067        previous_callout = code;
4068        code = auto_callout(code, ptr, cb);
4069        }
4070      goto NORMAL_CHAR;
4071      }
4072
4073    /* Check for the start of a \Q...\E sequence. We must do this here rather
4074    than later in case it is immediately followed by \E, which turns it into a
4075    "do nothing" sequence. */
4076
4077    if (c == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4078      {
4079      inescq = TRUE;
4080      ptr++;
4081      continue;
4082      }
4083    }
4084
4085  /* In extended mode, skip white space and #-comments that end at newline. */
4086
4087  if ((options & PCRE2_EXTENDED) != 0)
4088    {
4089    PCRE2_SPTR wscptr = ptr;
4090    while (MAX_255(c) && (cb->ctypes[c] & ctype_space) != 0) c = *(++ptr);
4091    if (c == CHAR_NUMBER_SIGN)
4092      {
4093      ptr++;
4094      while (ptr < cb->end_pattern)
4095        {
4096        if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
4097          {                          /* IS_NEWLINE sets cb->nllen. */
4098          ptr += cb->nllen;
4099          break;
4100          }
4101        ptr++;
4102#ifdef SUPPORT_UNICODE
4103        if (utf) FORWARDCHAR(ptr);
4104#endif
4105        }
4106      }
4107
4108    /* If we skipped any characters, restart the loop. Otherwise, we didn't see
4109    a comment. */
4110
4111    if (ptr > wscptr) goto REDO_LOOP;
4112    }
4113
4114  /* Skip over (?# comments. */
4115
4116  if (c == CHAR_LEFT_PARENTHESIS && ptr[1] == CHAR_QUESTION_MARK &&
4117      ptr[2] == CHAR_NUMBER_SIGN)
4118    {
4119    ptr += 3;
4120    while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4121    if (*ptr != CHAR_RIGHT_PARENTHESIS)
4122      {
4123      *errorcodeptr = ERR18;
4124      goto FAILED;
4125      }
4126    continue;
4127    }
4128
4129  /* End of processing "do nothing" items. See if the next thing is a
4130  quantifier. */
4131
4132  is_quantifier =
4133    c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
4134     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
4135
4136  /* Fill in length of a previous callout and create an auto callout if
4137  required, except when the next thing is a quantifier or when processing a
4138  property substitution string for \w etc in UCP mode. */
4139
4140  if (!is_quantifier && cb->nestptr[0] == NULL)
4141    {
4142    if (previous_callout != NULL && after_manual_callout-- <= 0)
4143      {
4144      if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
4145        complete_callout(previous_callout, ptr, cb);
4146      previous_callout = NULL;
4147      }
4148
4149    if ((options & PCRE2_AUTO_CALLOUT) != 0)
4150      {
4151      previous_callout = code;
4152      code = auto_callout(code, ptr, cb);
4153      }
4154    }
4155
4156  /* Process the next pattern item. */
4157
4158  switch(c)
4159    {
4160    /* ===================================================================*/
4161    /* The branch terminates at string end or | or ) */
4162
4163    case CHAR_NULL:
4164    if (ptr < cb->end_pattern) goto NORMAL_CHAR;   /* Zero data character */
4165    /* Fall through */
4166
4167    case CHAR_VERTICAL_LINE:
4168    case CHAR_RIGHT_PARENTHESIS:
4169    *firstcuptr = firstcu;
4170    *firstcuflagsptr = firstcuflags;
4171    *reqcuptr = reqcu;
4172    *reqcuflagsptr = reqcuflags;
4173    *codeptr = code;
4174    *ptrptr = ptr;
4175    if (lengthptr != NULL)
4176      {
4177      if (OFLOW_MAX - *lengthptr < (size_t)(code - last_code))
4178        {
4179        *errorcodeptr = ERR20;
4180        goto FAILED;
4181        }
4182      *lengthptr += (size_t)(code - last_code);  /* To include callout length */
4183      }
4184    return TRUE;
4185
4186
4187    /* ===================================================================*/
4188    /* Handle single-character metacharacters. In multiline mode, ^ disables
4189    the setting of any following char as a first character. */
4190
4191    case CHAR_CIRCUMFLEX_ACCENT:
4192    previous = NULL;
4193    if ((options & PCRE2_MULTILINE) != 0)
4194      {
4195      if (firstcuflags == REQ_UNSET)
4196        zerofirstcuflags = firstcuflags = REQ_NONE;
4197      *code++ = OP_CIRCM;
4198      }
4199    else *code++ = OP_CIRC;
4200    break;
4201
4202    case CHAR_DOLLAR_SIGN:
4203    previous = NULL;
4204    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
4205    break;
4206
4207    /* There can never be a first char if '.' is first, whatever happens about
4208    repeats. The value of reqcu doesn't change either. */
4209
4210    case CHAR_DOT:
4211    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4212    zerofirstcu = firstcu;
4213    zerofirstcuflags = firstcuflags;
4214    zeroreqcu = reqcu;
4215    zeroreqcuflags = reqcuflags;
4216    previous = code;
4217    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
4218    break;
4219
4220
4221    /* ===================================================================*/
4222    /* Character classes. If the included characters are all < 256, we build a
4223    32-byte bitmap of the permitted characters, except in the special case
4224    where there is only one such character. For negated classes, we build the
4225    map as usual, then invert it at the end. However, we use a different opcode
4226    so that data characters > 255 can be handled correctly.
4227
4228    If the class contains characters outside the 0-255 range, a different
4229    opcode is compiled. It may optionally have a bit map for characters < 256,
4230    but those above are explicitly listed afterwards. A flag byte tells
4231    whether the bitmap is present, and whether this is a negated class or not.
4232
4233    An isolated ']' character is not treated specially, so is just another data
4234    character. In earlier versions of PCRE that used the original API there was
4235    a "JavaScript compatibility mode" in which it gave an error. However,
4236    JavaScript itself has changed in this respect so there is no longer any
4237    need for this special handling.
4238
4239    In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
4240    used for "start of word" and "end of word". As these are otherwise illegal
4241    sequences, we don't break anything by recognizing them. They are replaced
4242    by \b(?=\w) and \b(?<=\w) respectively. This can only happen at the top
4243    nesting level, as no other inserted sequences will contain these oddities.
4244    Sequences like [a[:<:]] are erroneous and are handled by the normal code
4245    below. */
4246
4247    case CHAR_LEFT_SQUARE_BRACKET:
4248    if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_STARTWORD, 6) == 0)
4249      {
4250      cb->nestptr[0] = ptr + 7;
4251      ptr = sub_start_of_word;
4252      goto REDO_LOOP;
4253      }
4254
4255    if (PRIV(strncmp_c8)(ptr+1, STRING_WEIRD_ENDWORD, 6) == 0)
4256      {
4257      cb->nestptr[0] = ptr + 7;
4258      ptr = sub_end_of_word;
4259      goto REDO_LOOP;
4260      }
4261
4262    /* Handle a real character class. */
4263
4264    previous = code;
4265
4266    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
4267    they are encountered at the top level, so we'll do that too. */
4268
4269    if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4270         ptr[1] == CHAR_EQUALS_SIGN) &&
4271        check_posix_syntax(ptr, &tempptr))
4272      {
4273      *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR12 : ERR13;
4274      goto FAILED;
4275      }
4276
4277    /* If the first character is '^', set the negation flag and skip it. Also,
4278    if the first few characters (either before or after ^) are \Q\E or \E we
4279    skip them too. This makes for compatibility with Perl. */
4280
4281    negate_class = FALSE;
4282    for (;;)
4283      {
4284      c = *(++ptr);
4285      if (c == CHAR_BACKSLASH)
4286        {
4287        if (ptr[1] == CHAR_E)
4288          ptr++;
4289        else if (PRIV(strncmp_c8)(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4290          ptr += 3;
4291        else
4292          break;
4293        }
4294      else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4295        negate_class = TRUE;
4296      else break;
4297      }
4298
4299    /* Empty classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set. Otherwise,
4300    an initial ']' is taken as a data character -- the code below handles
4301    that. When empty classes are allowed, [] must always fail, so generate
4302    OP_FAIL, whereas [^] must match any character, so generate OP_ALLANY. */
4303
4304    if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4305        (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0)
4306      {
4307      *code++ = negate_class? OP_ALLANY : OP_FAIL;
4308      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4309      zerofirstcu = firstcu;
4310      zerofirstcuflags = firstcuflags;
4311      break;
4312      }
4313
4314    /* If a non-extended class contains a negative special such as \S, we need
4315    to flip the negation flag at the end, so that support for characters > 255
4316    works correctly (they are all included in the class). An extended class may
4317    need to insert specific matching or non-matching code for wide characters.
4318    */
4319
4320    should_flip_negation = match_all_or_no_wide_chars = FALSE;
4321
4322    /* Extended class (xclass) will be used when characters > 255
4323    might match. */
4324
4325#ifdef SUPPORT_WIDE_CHARS
4326    xclass = FALSE;
4327    class_uchardata = code + LINK_SIZE + 2;   /* For XCLASS items */
4328    class_uchardata_base = class_uchardata;   /* Save the start */
4329#endif
4330
4331    /* For optimization purposes, we track some properties of the class:
4332    class_has_8bitchar will be non-zero if the class contains at least one
4333    character with a code point less than 256; class_one_char will be 1 if the
4334    class contains just one character; xclass_has_prop will be TRUE if Unicode
4335    property checks are present in the class. */
4336
4337    class_has_8bitchar = 0;
4338    class_one_char = 0;
4339#ifdef SUPPORT_WIDE_CHARS
4340    xclass_has_prop = FALSE;
4341#endif
4342
4343    /* Initialize the 256-bit (32-byte) bit map to all zeros. We build the map
4344    in a temporary bit of memory, in case the class contains fewer than two
4345    8-bit characters because in that case the compiled code doesn't use the bit
4346    map. */
4347
4348    memset(classbits, 0, 32 * sizeof(uint8_t));
4349
4350    /* Process characters until ] is reached. As the test is at the end of the
4351    loop, an initial ] is taken as a data character. At the start of the loop,
4352    c contains the first code unit of the character. If it is zero, check for
4353    the end of the pattern, to allow binary zero as data. */
4354
4355    for(;;)
4356      {
4357      PCRE2_SPTR oldptr;
4358#ifdef EBCDIC
4359      BOOL range_is_literal = TRUE;
4360#endif
4361
4362      if (c == CHAR_NULL && ptr >= cb->end_pattern)
4363        {
4364        *errorcodeptr = ERR6;  /* Missing terminating ']' */
4365        goto FAILED;
4366        }
4367
4368#ifdef SUPPORT_UNICODE
4369      if (utf && HAS_EXTRALEN(c))
4370        {                           /* Braces are required because the */
4371        GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
4372        }
4373#endif
4374
4375      /* Inside \Q...\E everything is literal except \E */
4376
4377      if (inescq)
4378        {
4379        if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
4380          {
4381          inescq = FALSE;                   /* Reset literal state */
4382          ptr++;                            /* Skip the 'E' */
4383          goto CONTINUE_CLASS;              /* Carry on with next char */
4384          }
4385        goto CHECK_RANGE;                   /* Could be range if \E follows */
4386        }
4387
4388      /* Handle POSIX class names. Perl allows a negation extension of the
4389      form [:^name:]. A square bracket that doesn't match the syntax is
4390      treated as a literal. We also recognize the POSIX constructions
4391      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4392      5.6 and 5.8 do. */
4393
4394      if (c == CHAR_LEFT_SQUARE_BRACKET &&
4395          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4396           ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
4397        {
4398        BOOL local_negate = FALSE;
4399        int posix_class, taboffset, tabopt;
4400        register const uint8_t *cbits = cb->cbits;
4401        uint8_t pbits[32];
4402
4403        if (ptr[1] != CHAR_COLON)
4404          {
4405          *errorcodeptr = ERR13;
4406          goto FAILED;
4407          }
4408
4409        ptr += 2;
4410        if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
4411          {
4412          local_negate = TRUE;
4413          should_flip_negation = TRUE;  /* Note negative special */
4414          ptr++;
4415          }
4416
4417        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4418        if (posix_class < 0)
4419          {
4420          *errorcodeptr = ERR30;
4421          goto FAILED;
4422          }
4423
4424        /* If matching is caseless, upper and lower are converted to
4425        alpha. This relies on the fact that the class table starts with
4426        alpha, lower, upper as the first 3 entries. */
4427
4428        if ((options & PCRE2_CASELESS) != 0 && posix_class <= 2)
4429          posix_class = 0;
4430
4431        /* When PCRE2_UCP is set, some of the POSIX classes are converted to
4432        different escape sequences that use Unicode properties \p or \P. Others
4433        that are not available via \p or \P generate XCL_PROP/XCL_NOTPROP
4434        directly. UCP support is not available unless UTF support is.*/
4435
4436#ifdef SUPPORT_UNICODE
4437        if ((options & PCRE2_UCP) != 0)
4438          {
4439          unsigned int ptype = 0;
4440          int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
4441
4442          /* The posix_substitutes table specifies which POSIX classes can be
4443          converted to \p or \P items. This can only happen at top nesting
4444          level, as there will never be a POSIX class in a string that is
4445          substituted for something else. */
4446
4447          if (posix_substitutes[pc] != NULL)
4448            {
4449            cb->nestptr[0] = tempptr + 1;
4450            ptr = posix_substitutes[pc] - 1;
4451            goto CONTINUE_CLASS;
4452            }
4453
4454          /* There are three other classes that generate special property calls
4455          that are recognized only in an XCLASS. */
4456
4457          else switch(posix_class)
4458            {
4459            case PC_GRAPH:
4460            ptype = PT_PXGRAPH;
4461            /* Fall through */
4462            case PC_PRINT:
4463            if (ptype == 0) ptype = PT_PXPRINT;
4464            /* Fall through */
4465            case PC_PUNCT:
4466            if (ptype == 0) ptype = PT_PXPUNCT;
4467            *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
4468            *class_uchardata++ = (PCRE2_UCHAR)ptype;
4469            *class_uchardata++ = 0;
4470            xclass_has_prop = TRUE;
4471            ptr = tempptr + 1;
4472            goto CONTINUE_CLASS;
4473
4474            /* For the other POSIX classes (ascii, xdigit) we are going to fall
4475            through to the non-UCP case and build a bit map for characters with
4476            code points less than 256. However, if we are in a negated POSIX
4477            class, characters with code points greater than 255 must either all
4478            match or all not match, depending on whether the whole class is not
4479            or is negated. For example, for [[:^ascii:]... they must all match,
4480            whereas for [^[:^xdigit:]... they must not.
4481
4482            In the special case where there are no xclass items, this is
4483            automatically handled by the use of OP_CLASS or OP_NCLASS, but an
4484            explicit range is needed for OP_XCLASS. Setting a flag here causes
4485            the range to be generated later when it is known that OP_XCLASS is
4486            required. */
4487
4488            default:
4489            match_all_or_no_wide_chars |= local_negate;
4490            break;
4491            }
4492          }
4493#endif  /* SUPPORT_UNICODE */
4494
4495        /* In the non-UCP case, or when UCP makes no difference, we build the
4496        bit map for the POSIX class in a chunk of local store because we may be
4497        adding and subtracting from it, and we don't want to subtract bits that
4498        may be in the main map already. At the end we or the result into the
4499        bit map that is being built. */
4500
4501        posix_class *= 3;
4502
4503        /* Copy in the first table (always present) */
4504
4505        memcpy(pbits, cbits + posix_class_maps[posix_class],
4506          32 * sizeof(uint8_t));
4507
4508        /* If there is a second table, add or remove it as required. */
4509
4510        taboffset = posix_class_maps[posix_class + 1];
4511        tabopt = posix_class_maps[posix_class + 2];
4512
4513        if (taboffset >= 0)
4514          {
4515          if (tabopt >= 0)
4516            for (c = 0; c < 32; c++) pbits[c] |= cbits[(int)c + taboffset];
4517          else
4518            for (c = 0; c < 32; c++) pbits[c] &= ~cbits[(int)c + taboffset];
4519          }
4520
4521        /* Now see if we need to remove any special characters. An option
4522        value of 1 removes vertical space and 2 removes underscore. */
4523
4524        if (tabopt < 0) tabopt = -tabopt;
4525        if (tabopt == 1) pbits[1] &= ~0x3c;
4526          else if (tabopt == 2) pbits[11] &= 0x7f;
4527
4528        /* Add the POSIX table or its complement into the main table that is
4529        being built and we are done. */
4530
4531        if (local_negate)
4532          for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
4533        else
4534          for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
4535
4536        ptr = tempptr + 1;
4537        /* Every class contains at least one < 256 character. */
4538        class_has_8bitchar = 1;
4539        /* Every class contains at least two characters. */
4540        class_one_char = 2;
4541        goto CONTINUE_CLASS;    /* End of POSIX syntax handling */
4542        }
4543
4544      /* Backslash may introduce a single character, or it may introduce one
4545      of the specials, which just set a flag. The sequence \b is a special
4546      case. Inside a class (and only there) it is treated as backspace. We
4547      assume that other escapes have more than one character in them, so
4548      speculatively set both class_has_8bitchar and class_one_char bigger
4549      than one. Unrecognized escapes fall through and are faulted. */
4550
4551      if (c == CHAR_BACKSLASH)
4552        {
4553        escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
4554          options, TRUE, cb);
4555        if (*errorcodeptr != 0) goto FAILED;
4556        if (escape == 0)    /* Escaped single char */
4557          {
4558          c = ec;
4559#ifdef EBCDIC
4560          range_is_literal = FALSE;
4561#endif
4562          }
4563        else if (escape == ESC_b) c = CHAR_BS; /* \b is backspace in a class */
4564        else if (escape == ESC_N)          /* \N is not supported in a class */
4565          {
4566          *errorcodeptr = ERR71;
4567          goto FAILED;
4568          }
4569        else if (escape == ESC_Q)            /* Handle start of quoted string */
4570          {
4571          if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4572            {
4573            ptr += 2; /* avoid empty string */
4574            }
4575          else inescq = TRUE;
4576          goto CONTINUE_CLASS;
4577          }
4578        else if (escape == ESC_E) goto CONTINUE_CLASS;  /* Ignore orphan \E */
4579
4580        else  /* Handle \d-type escapes */
4581          {
4582          register const uint8_t *cbits = cb->cbits;
4583          /* Every class contains at least two < 256 characters. */
4584          class_has_8bitchar++;
4585          /* Every class contains at least two characters. */
4586          class_one_char += 2;
4587
4588          switch (escape)
4589            {
4590#ifdef SUPPORT_UNICODE
4591            case ESC_du:     /* These are the values given for \d etc */
4592            case ESC_DU:     /* when PCRE2_UCP is set. We replace the */
4593            case ESC_wu:     /* escape sequence with an appropriate \p */
4594            case ESC_WU:     /* or \P to test Unicode properties instead */
4595            case ESC_su:     /* of the default ASCII testing. This might be */
4596            case ESC_SU:     /* a 2nd-level nesting for [[:<:]] or [[:>:]]. */
4597            cb->nestptr[1] = cb->nestptr[0];
4598            cb->nestptr[0] = ptr;
4599            ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
4600            class_has_8bitchar--;                /* Undo! */
4601            break;
4602#endif
4603            case ESC_d:
4604            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4605            break;
4606
4607            case ESC_D:
4608            should_flip_negation = TRUE;
4609            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4610            break;
4611
4612            case ESC_w:
4613            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4614            break;
4615
4616            case ESC_W:
4617            should_flip_negation = TRUE;
4618            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4619            break;
4620
4621            /* Perl 5.004 onwards omitted VT from \s, but restored it at Perl
4622            5.18. Before PCRE 8.34, we had to preserve the VT bit if it was
4623            previously set by something earlier in the character class.
4624            Luckily, the value of CHAR_VT is 0x0b in both ASCII and EBCDIC, so
4625            we could just adjust the appropriate bit. From PCRE 8.34 we no
4626            longer treat \s and \S specially. */
4627
4628            case ESC_s:
4629            for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4630            break;
4631
4632            case ESC_S:
4633            should_flip_negation = TRUE;
4634            for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4635            break;
4636
4637            /* The rest apply in both UCP and non-UCP cases. */
4638
4639            case ESC_h:
4640            (void)add_list_to_class(classbits, &class_uchardata, options, cb,
4641              PRIV(hspace_list), NOTACHAR);
4642            break;
4643
4644            case ESC_H:
4645            (void)add_not_list_to_class(classbits, &class_uchardata, options,
4646              cb, PRIV(hspace_list));
4647            break;
4648
4649            case ESC_v:
4650            (void)add_list_to_class(classbits, &class_uchardata, options, cb,
4651              PRIV(vspace_list), NOTACHAR);
4652            break;
4653
4654            case ESC_V:
4655            (void)add_not_list_to_class(classbits, &class_uchardata, options,
4656              cb, PRIV(vspace_list));
4657            break;
4658
4659            case ESC_p:
4660            case ESC_P:
4661#ifdef SUPPORT_UNICODE
4662              {
4663              BOOL negated;
4664              unsigned int ptype = 0, pdata = 0;
4665              if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
4666                goto FAILED;
4667              *class_uchardata++ = ((escape == ESC_p) != negated)?
4668                XCL_PROP : XCL_NOTPROP;
4669              *class_uchardata++ = ptype;
4670              *class_uchardata++ = pdata;
4671              xclass_has_prop = TRUE;
4672              class_has_8bitchar--;                /* Undo! */
4673              }
4674            break;
4675#else
4676            *errorcodeptr = ERR45;
4677            goto FAILED;
4678#endif
4679            /* Unrecognized escapes are faulted. */
4680
4681            default:
4682            *errorcodeptr = ERR7;
4683            goto FAILED;
4684            }
4685
4686          /* Handled \d-type escape */
4687
4688          goto CONTINUE_CLASS;
4689          }
4690
4691        /* Control gets here if the escape just defined a single character.
4692        This is in c and may be greater than 256. */
4693
4694        escape = 0;
4695        }   /* End of backslash handling */
4696
4697      /* A character may be followed by '-' to form a range. However, Perl does
4698      not permit ']' to be the end of the range. A '-' character at the end is
4699      treated as a literal. Perl ignores orphaned \E sequences entirely. The
4700      code for handling \Q and \E is messy. */
4701
4702      CHECK_RANGE:
4703      while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4704        {
4705        inescq = FALSE;
4706        ptr += 2;
4707        }
4708      oldptr = ptr;
4709
4710      /* Remember if \r or \n were explicitly used */
4711
4712      if (c == CHAR_CR || c == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
4713
4714      /* Check for range */
4715
4716      if (!inescq && ptr[1] == CHAR_MINUS)
4717        {
4718        uint32_t d;
4719        ptr += 2;
4720        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4721
4722        /* If we hit \Q (not followed by \E) at this point, go into escaped
4723        mode. */
4724
4725        while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4726          {
4727          ptr += 2;
4728          if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4729            { ptr += 2; continue; }
4730          inescq = TRUE;
4731          break;
4732          }
4733
4734        /* Minus (hyphen) at the end of a class is treated as a literal, so put
4735        back the pointer and jump to handle the character that preceded it. */
4736
4737        if (*ptr == CHAR_NULL || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4738          {
4739          ptr = oldptr;
4740          goto CLASS_SINGLE_CHARACTER;
4741          }
4742
4743        /* Otherwise, we have a potential range; pick up the next character */
4744
4745#ifdef SUPPORT_UNICODE
4746        if (utf)
4747          {                           /* Braces are required because the */
4748          GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4749          }
4750        else
4751#endif
4752        d = *ptr;  /* Not UTF mode */
4753
4754        /* The second part of a range can be a single-character escape
4755        sequence, but not any of the other escapes. Perl treats a hyphen as a
4756        literal in such circumstances. However, in Perl's warning mode, a
4757        warning is given, so PCRE now faults it as it is almost certainly a
4758        mistake on the user's part. */
4759
4760        if (!inescq)
4761          {
4762          if (d == CHAR_BACKSLASH)
4763            {
4764            int descape;
4765            descape = PRIV(check_escape)(&ptr, cb->end_pattern, &d,
4766              errorcodeptr, options, TRUE, cb);
4767            if (*errorcodeptr != 0) goto FAILED;
4768#ifdef EBCDIC
4769            range_is_literal = FALSE;
4770#endif
4771            /* 0 means a character was put into d; \b is backspace; any other
4772            special causes an error. */
4773
4774            if (descape != 0)
4775              {
4776              if (descape == ESC_b) d = CHAR_BS; else
4777                {
4778                *errorcodeptr = ERR50;
4779                goto FAILED;
4780                }
4781              }
4782            }
4783
4784          /* A hyphen followed by a POSIX class is treated in the same way. */
4785
4786          else if (d == CHAR_LEFT_SQUARE_BRACKET &&
4787                   (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
4788                    ptr[1] == CHAR_EQUALS_SIGN) &&
4789                   check_posix_syntax(ptr, &tempptr))
4790            {
4791            *errorcodeptr = ERR50;
4792            goto FAILED;
4793            }
4794          }
4795
4796        /* Check that the two values are in the correct order. Optimize
4797        one-character ranges. */
4798
4799        if (d < c)
4800          {
4801          *errorcodeptr = ERR8;
4802          goto FAILED;
4803          }
4804        if (d == c) goto CLASS_SINGLE_CHARACTER;  /* A few lines below */
4805
4806        /* We have found a character range, so single character optimizations
4807        cannot be done anymore. Any value greater than 1 indicates that there
4808        is more than one character. */
4809
4810        class_one_char = 2;
4811
4812        /* Remember an explicit \r or \n, and add the range to the class. */
4813
4814        if (d == CHAR_CR || d == CHAR_NL) cb->external_flags |= PCRE2_HASCRORLF;
4815
4816        /* In an EBCDIC environment, Perl treats alphabetic ranges specially
4817        because there are holes in the encoding, and simply using the range A-Z
4818        (for example) would include the characters in the holes. This applies
4819        only to literal ranges; [\xC1-\xE9] is different to [A-Z]. */
4820
4821#ifdef EBCDIC
4822        if (range_is_literal &&
4823             (cb->ctypes[c] & ctype_letter) != 0 &&
4824             (cb->ctypes[d] & ctype_letter) != 0 &&
4825             (c <= CHAR_z) == (d <= CHAR_z))
4826          {
4827          uint32_t uc = (c <= CHAR_z)? 0 : 64;
4828          uint32_t C = c - uc;
4829          uint32_t D = d - uc;
4830
4831          if (C <= CHAR_i)
4832            {
4833            class_has_8bitchar +=
4834              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4835                ((D < CHAR_i)? D : CHAR_i) + uc);
4836            C = CHAR_j;
4837            }
4838
4839          if (C <= D && C <= CHAR_r)
4840            {
4841            class_has_8bitchar +=
4842              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4843                ((D < CHAR_r)? D : CHAR_r) + uc);
4844            C = CHAR_s;
4845            }
4846
4847          if (C <= D)
4848            {
4849            class_has_8bitchar +=
4850              add_to_class(classbits, &class_uchardata, options, cb, C + uc,
4851                D + uc);
4852            }
4853          }
4854        else
4855#endif
4856        class_has_8bitchar +=
4857          add_to_class(classbits, &class_uchardata, options, cb, c, d);
4858        goto CONTINUE_CLASS;   /* Go get the next char in the class */
4859        }
4860
4861      /* Handle a single character - we can get here for a normal non-escape
4862      char, or after \ that introduces a single character or for an apparent
4863      range that isn't. Only the value 1 matters for class_one_char, so don't
4864      increase it if it is already 2 or more ... just in case there's a class
4865      with a zillion characters in it. */
4866
4867      CLASS_SINGLE_CHARACTER:
4868      if (class_one_char < 2) class_one_char++;
4869
4870      /* If class_one_char is 1 and xclass_has_prop is false, we have the first
4871      single character in the class, and there have been no prior ranges, or
4872      XCLASS items generated by escapes. If this is the final character in the
4873      class, we can optimize by turning the item into a 1-character OP_CHAR[I]
4874      if it's positive, or OP_NOT[I] if it's negative. In the positive case, it
4875      can cause firstcu to be set. Otherwise, there can be no first char if
4876      this item is first, whatever repeat count may follow. In the case of
4877      reqcu, save the previous value for reinstating. */
4878
4879      if (!inescq &&
4880#ifdef SUPPORT_UNICODE
4881          !xclass_has_prop &&
4882#endif
4883          class_one_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4884        {
4885        ptr++;
4886        zeroreqcu = reqcu;
4887        zeroreqcuflags = reqcuflags;
4888
4889        if (negate_class)
4890          {
4891#ifdef SUPPORT_UNICODE
4892          int d;
4893#endif
4894          if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4895          zerofirstcu = firstcu;
4896          zerofirstcuflags = firstcuflags;
4897
4898          /* For caseless UTF mode, check whether this character has more than
4899          one other case. If so, generate a special OP_NOTPROP item instead of
4900          OP_NOTI. */
4901
4902#ifdef SUPPORT_UNICODE
4903          if (utf && (options & PCRE2_CASELESS) != 0 &&
4904              (d = UCD_CASESET(c)) != 0)
4905            {
4906            *code++ = OP_NOTPROP;
4907            *code++ = PT_CLIST;
4908            *code++ = d;
4909            }
4910          else
4911#endif
4912          /* Char has only one other case, or UCP not available */
4913
4914            {
4915            *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
4916            code += PUTCHAR(c, code);
4917            }
4918
4919          /* We are finished with this character class */
4920
4921          goto END_CLASS;
4922          }
4923
4924        /* For a single, positive character, get the value into mcbuffer, and
4925        then we can handle this with the normal one-character code. */
4926
4927        mclength = PUTCHAR(c, mcbuffer);
4928        goto ONE_CHAR;
4929        }       /* End of 1-char optimization */
4930
4931      /* There is more than one character in the class, or an XCLASS item
4932      has been generated. Add this character to the class. */
4933
4934      class_has_8bitchar +=
4935        add_to_class(classbits, &class_uchardata, options, cb, c, c);
4936
4937      /* Continue to the next character in the class. Closing square bracket
4938      not within \Q..\E ends the class. A NULL character terminates a
4939      nested substitution string, but may be a data character in the main
4940      pattern (tested at the start of this loop). */
4941
4942      CONTINUE_CLASS:
4943      c = *(++ptr);
4944      if (c == CHAR_NULL && cb->nestptr[0] != NULL)
4945        {
4946        ptr = cb->nestptr[0];
4947        cb->nestptr[0] = cb->nestptr[1];
4948        cb->nestptr[1] = NULL;
4949        c = *(++ptr);
4950        }
4951
4952#ifdef SUPPORT_WIDE_CHARS
4953      /* If any wide characters have been encountered, set xclass = TRUE. Then,
4954      in the pre-compile phase, accumulate the length of the wide characters
4955      and reset the pointer. This is so that very large classes that contain a
4956      zillion wide characters do not overwrite the work space (which is on the
4957      stack). */
4958
4959      if (class_uchardata > class_uchardata_base)
4960        {
4961        xclass = TRUE;
4962        if (lengthptr != NULL)
4963          {
4964          *lengthptr += class_uchardata - class_uchardata_base;
4965          class_uchardata = class_uchardata_base;
4966          }
4967        }
4968#endif
4969      /* An unescaped ] ends the class */
4970
4971      if (c == CHAR_RIGHT_SQUARE_BRACKET && !inescq) break;
4972      }   /* End of main class-processing loop */
4973
4974    /* If this is the first thing in the branch, there can be no first char
4975    setting, whatever the repeat count. Any reqcu setting must remain
4976    unchanged after any kind of repeat. */
4977
4978    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
4979    zerofirstcu = firstcu;
4980    zerofirstcuflags = firstcuflags;
4981    zeroreqcu = reqcu;
4982    zeroreqcuflags = reqcuflags;
4983
4984    /* If there are characters with values > 255, or Unicode property settings
4985    (\p or \P), we have to compile an extended class, with its own opcode,
4986    unless there were no property settings and there was a negated special such
4987    as \S in the class, and PCRE2_UCP is not set, because in that case all
4988    characters > 255 are in or not in the class, so any that were explicitly
4989    given as well can be ignored.
4990
4991    In the UCP case, if certain negated POSIX classes ([:^ascii:] or
4992    [:^xdigit:]) were present in a class, we either have to match or not match
4993    all wide characters (depending on whether the whole class is or is not
4994    negated). This requirement is indicated by match_all_or_no_wide_chars being
4995    true. We do this by including an explicit range, which works in both cases.
4996
4997    If, when generating an xclass, there are no characters < 256, we can omit
4998    the bitmap in the actual compiled code. */
4999
5000#ifdef SUPPORT_WIDE_CHARS
5001#ifdef SUPPORT_UNICODE
5002    if (xclass && (xclass_has_prop || !should_flip_negation ||
5003         (options & PCRE2_UCP) != 0))
5004#elif PCRE2_CODE_UNIT_WIDTH != 8
5005    if (xclass && (xclass_has_prop || !should_flip_negation))
5006#endif
5007      {
5008      if (match_all_or_no_wide_chars)
5009        {
5010        *class_uchardata++ = XCL_RANGE;
5011        class_uchardata += PRIV(ord2utf)(0x100, class_uchardata);
5012        class_uchardata += PRIV(ord2utf)(MAX_UTF_CODE_POINT, class_uchardata);
5013        }
5014      *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
5015      *code++ = OP_XCLASS;
5016      code += LINK_SIZE;
5017      *code = negate_class? XCL_NOT:0;
5018      if (xclass_has_prop) *code |= XCL_HASPROP;
5019
5020      /* If the map is required, move up the extra data to make room for it;
5021      otherwise just move the code pointer to the end of the extra data. */
5022
5023      if (class_has_8bitchar > 0)
5024        {
5025        *code++ |= XCL_MAP;
5026        memmove(code + (32 / sizeof(PCRE2_UCHAR)), code,
5027          CU2BYTES(class_uchardata - code));
5028        if (negate_class && !xclass_has_prop)
5029          for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5030        memcpy(code, classbits, 32);
5031        code = class_uchardata + (32 / sizeof(PCRE2_UCHAR));
5032        }
5033      else code = class_uchardata;
5034
5035      /* Now fill in the complete length of the item */
5036
5037      PUT(previous, 1, (int)(code - previous));
5038      break;   /* End of class handling */
5039      }
5040#endif
5041
5042    /* If there are no characters > 255, or they are all to be included or
5043    excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
5044    whole class was negated and whether there were negative specials such as \S
5045    (non-UCP) in the class. Then copy the 32-byte map into the code vector,
5046    negating it if necessary. */
5047
5048    *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
5049    if (lengthptr == NULL)    /* Save time in the pre-compile phase */
5050      {
5051      if (negate_class)
5052        for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
5053      memcpy(code, classbits, 32);
5054      }
5055    code += 32 / sizeof(PCRE2_UCHAR);
5056
5057    END_CLASS:
5058    break;
5059
5060
5061    /* ===================================================================*/
5062    /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
5063    has been tested above. */
5064
5065    case CHAR_LEFT_CURLY_BRACKET:
5066    if (!is_quantifier) goto NORMAL_CHAR;
5067    ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
5068    if (*errorcodeptr != 0) goto FAILED;
5069    goto REPEAT;
5070
5071    case CHAR_ASTERISK:
5072    repeat_min = 0;
5073    repeat_max = -1;
5074    goto REPEAT;
5075
5076    case CHAR_PLUS:
5077    repeat_min = 1;
5078    repeat_max = -1;
5079    goto REPEAT;
5080
5081    case CHAR_QUESTION_MARK:
5082    repeat_min = 0;
5083    repeat_max = 1;
5084
5085    REPEAT:
5086    if (previous == NULL)
5087      {
5088      *errorcodeptr = ERR9;
5089      goto FAILED;
5090      }
5091
5092    if (repeat_min == 0)
5093      {
5094      firstcu = zerofirstcu;    /* Adjust for zero repeat */
5095      firstcuflags = zerofirstcuflags;
5096      reqcu = zeroreqcu;        /* Ditto */
5097      reqcuflags = zeroreqcuflags;
5098      }
5099
5100    /* Remember whether this is a variable length repeat */
5101
5102    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
5103
5104    op_type = 0;                    /* Default single-char op codes */
5105    possessive_quantifier = FALSE;  /* Default not possessive quantifier */
5106
5107    /* Save start of previous item, in case we have to move it up in order to
5108    insert something before it. */
5109
5110    tempcode = previous;
5111
5112    /* Before checking for a possessive quantifier, we must skip over
5113    whitespace and comments in extended mode because Perl allows white space at
5114    this point. */
5115
5116    if ((options & PCRE2_EXTENDED) != 0)
5117      {
5118      ptr++;
5119      for (;;)
5120        {
5121        while (MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_space) != 0) ptr++;
5122        if (*ptr != CHAR_NUMBER_SIGN) break;
5123        ptr++;
5124        while (ptr < cb->end_pattern)
5125          {
5126          if (IS_NEWLINE(ptr))         /* For non-fixed-length newline cases, */
5127            {                        /* IS_NEWLINE sets cb->nllen. */
5128            ptr += cb->nllen;
5129            break;
5130            }
5131          ptr++;
5132#ifdef SUPPORT_UNICODE
5133          if (utf) FORWARDCHAR(ptr);
5134#endif
5135          }           /* Loop for comment characters */
5136        }             /* Loop for multiple comments */
5137      ptr--;          /* Last code unit of previous character. */
5138      }
5139
5140    /* If the next character is '+', we have a possessive quantifier. This
5141    implies greediness, whatever the setting of the PCRE2_UNGREEDY option.
5142    If the next character is '?' this is a minimizing repeat, by default,
5143    but if PCRE2_UNGREEDY is set, it works the other way round. We change the
5144    repeat type to the non-default. */
5145
5146    if (ptr[1] == CHAR_PLUS)
5147      {
5148      repeat_type = 0;                  /* Force greedy */
5149      possessive_quantifier = TRUE;
5150      ptr++;
5151      }
5152    else if (ptr[1] == CHAR_QUESTION_MARK)
5153      {
5154      repeat_type = greedy_non_default;
5155      ptr++;
5156      }
5157    else repeat_type = greedy_default;
5158
5159    /* If the repeat is {1} we can ignore it. */
5160
5161    if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
5162
5163    /* If previous was a recursion call, wrap it in atomic brackets so that
5164    previous becomes the atomic group. All recursions were so wrapped in the
5165    past, but it no longer happens for non-repeated recursions. In fact, the
5166    repeated ones could be re-implemented independently so as not to need this,
5167    but for the moment we rely on the code for repeating groups. */
5168
5169    if (*previous == OP_RECURSE)
5170      {
5171      memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(1 + LINK_SIZE));
5172      *previous = OP_ONCE;
5173      PUT(previous, 1, 2 + 2*LINK_SIZE);
5174      previous[2 + 2*LINK_SIZE] = OP_KET;
5175      PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
5176      code += 2 + 2 * LINK_SIZE;
5177      length_prevgroup = 3 + 3*LINK_SIZE;
5178      }
5179
5180    /* Now handle repetition for the different types of item. */
5181
5182    /* If previous was a character or negated character match, abolish the item
5183    and generate a repeat item instead. If a char item has a minimum of more
5184    than one, ensure that it is set in reqcu - it might not be if a sequence
5185    such as x{3} is the first thing in a branch because the x will have gone
5186    into firstcu instead.  */
5187
5188    if (*previous == OP_CHAR || *previous == OP_CHARI
5189        || *previous == OP_NOT || *previous == OP_NOTI)
5190      {
5191      switch (*previous)
5192        {
5193        default: /* Make compiler happy. */
5194        case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
5195        case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
5196        case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
5197        case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
5198        }
5199
5200      /* Deal with UTF characters that take up more than one code unit. It's
5201      easier to write this out separately than try to macrify it. Use c to
5202      hold the length of the character in code units, plus UTF_LENGTH to flag
5203      that it's a length rather than a small character. */
5204
5205#ifdef MAYBE_UTF_MULTI
5206      if (utf && NOT_FIRSTCU(code[-1]))
5207        {
5208        PCRE2_UCHAR *lastchar = code - 1;
5209        BACKCHAR(lastchar);
5210        c = (int)(code - lastchar);               /* Length of UTF character */
5211        memcpy(utf_units, lastchar, CU2BYTES(c)); /* Save the char */
5212        c |= UTF_LENGTH;                          /* Flag c as a length */
5213        }
5214      else
5215#endif  /* MAYBE_UTF_MULTI */
5216
5217      /* Handle the case of a single character - either with no UTF support, or
5218      with UTF disabled, or for a single-code-unit UTF character. */
5219        {
5220        c = code[-1];
5221        if (*previous <= OP_CHARI && repeat_min > 1)
5222          {
5223          reqcu = c;
5224          reqcuflags = req_caseopt | cb->req_varyopt;
5225          }
5226        }
5227
5228      goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
5229      }
5230
5231    /* If previous was a character type match (\d or similar), abolish it and
5232    create a suitable repeat item. The code is shared with single-character
5233    repeats by setting op_type to add a suitable offset into repeat_type. Note
5234    that the Unicode property types will be present only when SUPPORT_UNICODE is
5235    defined, but we don't wrap the little bits of code here because it just
5236    makes it horribly messy. */
5237
5238    else if (*previous < OP_EODN)
5239      {
5240      PCRE2_UCHAR *oldcode;
5241      int prop_type, prop_value;
5242      op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
5243      c = *previous;                        /* Save previous opcode */
5244      if (c == OP_PROP || c == OP_NOTPROP)
5245        {
5246        prop_type = previous[1];
5247        prop_value = previous[2];
5248        }
5249      else
5250        {
5251        /* Come here from just above with a character in c */
5252        OUTPUT_SINGLE_REPEAT:
5253        prop_type = prop_value = -1;
5254        }
5255
5256      /* At this point we either have prop_type == prop_value == -1 and either
5257      a code point or a character type that is not OP_[NOT]PROP in c, or we
5258      have OP_[NOT]PROP in c and prop_type/prop_value not negative. */
5259
5260      oldcode = code;                   /* Save where we were */
5261      code = previous;                  /* Usually overwrite previous item */
5262
5263      /* If the maximum is zero then the minimum must also be zero; Perl allows
5264      this case, so we do too - by simply omitting the item altogether. */
5265
5266      if (repeat_max == 0) goto END_REPEAT;
5267
5268      /* Combine the op_type with the repeat_type */
5269
5270      repeat_type += op_type;
5271
5272      /* A minimum of zero is handled either as the special case * or ?, or as
5273      an UPTO, with the maximum given. */
5274
5275      if (repeat_min == 0)
5276        {
5277        if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
5278          else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
5279        else
5280          {
5281          *code++ = OP_UPTO + repeat_type;
5282          PUT2INC(code, 0, repeat_max);
5283          }
5284        }
5285
5286      /* A repeat minimum of 1 is optimized into some special cases. If the
5287      maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
5288      left in place and, if the maximum is greater than 1, we use OP_UPTO with
5289      one less than the maximum. */
5290
5291      else if (repeat_min == 1)
5292        {
5293        if (repeat_max == -1)
5294          *code++ = OP_PLUS + repeat_type;
5295        else
5296          {
5297          code = oldcode;                 /* Leave previous item in place */
5298          if (repeat_max == 1) goto END_REPEAT;
5299          *code++ = OP_UPTO + repeat_type;
5300          PUT2INC(code, 0, repeat_max - 1);
5301          }
5302        }
5303
5304      /* The case {n,n} is just an EXACT, while the general case {n,m} is
5305      handled as an EXACT followed by an UPTO or STAR or QUERY. */
5306
5307      else
5308        {
5309        *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
5310        PUT2INC(code, 0, repeat_min);
5311
5312        /* Unless repeat_max equals repeat_min, fill in the data for EXACT, and
5313        then generate the second opcode. In UTF mode, multi-code-unit
5314        characters have their length in c, with the UTF_LENGTH bit as a flag,
5315        and the code units in utf_units. For a repeated Unicode property match,
5316        there are two extra values that define the required property, and c
5317        never has the UTF_LENGTH bit set. */
5318
5319        if (repeat_max != repeat_min)
5320          {
5321#ifdef MAYBE_UTF_MULTI
5322          if (utf && (c & UTF_LENGTH) != 0)
5323            {
5324            memcpy(code, utf_units, CU2BYTES(c & 7));
5325            code += c & 7;
5326            }
5327          else
5328#endif  /* MAYBE_UTF_MULTI */
5329            {
5330            *code++ = c;
5331            if (prop_type >= 0)
5332              {
5333              *code++ = prop_type;
5334              *code++ = prop_value;
5335              }
5336            }
5337
5338          /* Now set up the following opcode */
5339
5340          if (repeat_max < 0) *code++ = OP_STAR + repeat_type; else
5341            {
5342            repeat_max -= repeat_min;
5343            if (repeat_max == 1)
5344              {
5345              *code++ = OP_QUERY + repeat_type;
5346              }
5347            else
5348              {
5349              *code++ = OP_UPTO + repeat_type;
5350              PUT2INC(code, 0, repeat_max);
5351              }
5352            }
5353          }
5354        }
5355
5356      /* Fill in the character or character type for the final opcode. */
5357
5358#ifdef MAYBE_UTF_MULTI
5359      if (utf && (c & UTF_LENGTH) != 0)
5360        {
5361        memcpy(code, utf_units, CU2BYTES(c & 7));
5362        code += c & 7;
5363        }
5364      else
5365#endif  /* MAYBE_UTF_MULTI */
5366        {
5367        *code++ = c;
5368        if (prop_type >= 0)
5369          {
5370          *code++ = prop_type;
5371          *code++ = prop_value;
5372          }
5373        }
5374      }
5375
5376    /* If previous was a character class or a back reference, we put the repeat
5377    stuff after it, but just skip the item if the repeat was {0,0}. */
5378
5379    else if (*previous == OP_CLASS || *previous == OP_NCLASS ||
5380#ifdef SUPPORT_WIDE_CHARS
5381             *previous == OP_XCLASS ||
5382#endif
5383             *previous == OP_REF   || *previous == OP_REFI ||
5384             *previous == OP_DNREF || *previous == OP_DNREFI)
5385      {
5386      if (repeat_max == 0)
5387        {
5388        code = previous;
5389        goto END_REPEAT;
5390        }
5391
5392      if (repeat_min == 0 && repeat_max == -1)
5393        *code++ = OP_CRSTAR + repeat_type;
5394      else if (repeat_min == 1 && repeat_max == -1)
5395        *code++ = OP_CRPLUS + repeat_type;
5396      else if (repeat_min == 0 && repeat_max == 1)
5397        *code++ = OP_CRQUERY + repeat_type;
5398      else
5399        {
5400        *code++ = OP_CRRANGE + repeat_type;
5401        PUT2INC(code, 0, repeat_min);
5402        if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5403        PUT2INC(code, 0, repeat_max);
5404        }
5405      }
5406
5407    /* If previous was a bracket group, we may have to replicate it in certain
5408    cases. Note that at this point we can encounter only the "basic" bracket
5409    opcodes such as BRA and CBRA, as this is the place where they get converted
5410    into the more special varieties such as BRAPOS and SBRA. A test for >=
5411    OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5412    ASSERTBACK_NOT, ONCE, ONCE_NC, BRA, BRAPOS, CBRA, CBRAPOS, and COND.
5413    Originally, PCRE did not allow repetition of assertions, but now it does,
5414    for Perl compatibility. */
5415
5416    else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5417      {
5418      register int i;
5419      int len = (int)(code - previous);
5420      PCRE2_UCHAR *bralink = NULL;
5421      PCRE2_UCHAR *brazeroptr = NULL;
5422
5423      /* Repeating a DEFINE group (or any group where the condition is always
5424      FALSE and there is only one branch) is pointless, but Perl allows the
5425      syntax, so we just ignore the repeat. */
5426
5427      if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
5428          previous[GET(previous, 1)] != OP_ALT)
5429        goto END_REPEAT;
5430
5431      /* There is no sense in actually repeating assertions. The only potential
5432      use of repetition is in cases when the assertion is optional. Therefore,
5433      if the minimum is greater than zero, just ignore the repeat. If the
5434      maximum is not zero or one, set it to 1. */
5435
5436      if (*previous < OP_ONCE)    /* Assertion */
5437        {
5438        if (repeat_min > 0) goto END_REPEAT;
5439        if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5440        }
5441
5442      /* The case of a zero minimum is special because of the need to stick
5443      OP_BRAZERO in front of it, and because the group appears once in the
5444      data, whereas in other cases it appears the minimum number of times. For
5445      this reason, it is simplest to treat this case separately, as otherwise
5446      the code gets far too messy. There are several special subcases when the
5447      minimum is zero. */
5448
5449      if (repeat_min == 0)
5450        {
5451        /* If the maximum is also zero, we used to just omit the group from the
5452        output altogether, like this:
5453
5454        ** if (repeat_max == 0)
5455        **   {
5456        **   code = previous;
5457        **   goto END_REPEAT;
5458        **   }
5459
5460        However, that fails when a group or a subgroup within it is referenced
5461        as a subroutine from elsewhere in the pattern, so now we stick in
5462        OP_SKIPZERO in front of it so that it is skipped on execution. As we
5463        don't have a list of which groups are referenced, we cannot do this
5464        selectively.
5465
5466        If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5467        and do no more at this point. */
5468
5469        if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5470          {
5471          memmove(previous + 1, previous, CU2BYTES(len));
5472          code++;
5473          if (repeat_max == 0)
5474            {
5475            *previous++ = OP_SKIPZERO;
5476            goto END_REPEAT;
5477            }
5478          brazeroptr = previous;    /* Save for possessive optimizing */
5479          *previous++ = OP_BRAZERO + repeat_type;
5480          }
5481
5482        /* If the maximum is greater than 1 and limited, we have to replicate
5483        in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5484        The first one has to be handled carefully because it's the original
5485        copy, which has to be moved up. The remainder can be handled by code
5486        that is common with the non-zero minimum case below. We have to
5487        adjust the value of repeat_max, since one less copy is required. */
5488
5489        else
5490          {
5491          int offset;
5492          memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
5493          code += 2 + LINK_SIZE;
5494          *previous++ = OP_BRAZERO + repeat_type;
5495          *previous++ = OP_BRA;
5496
5497          /* We chain together the bracket offset fields that have to be
5498          filled in later when the ends of the brackets are reached. */
5499
5500          offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5501          bralink = previous;
5502          PUTINC(previous, 0, offset);
5503          }
5504
5505        repeat_max--;
5506        }
5507
5508      /* If the minimum is greater than zero, replicate the group as many
5509      times as necessary, and adjust the maximum to the number of subsequent
5510      copies that we need. */
5511
5512      else
5513        {
5514        if (repeat_min > 1)
5515          {
5516          /* In the pre-compile phase, we don't actually do the replication. We
5517          just adjust the length as if we had. Do some paranoid checks for
5518          potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5519          integer type when available, otherwise double. */
5520
5521          if (lengthptr != NULL)
5522            {
5523            size_t delta = (repeat_min - 1)*length_prevgroup;
5524            if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5525                  (INT64_OR_DOUBLE)length_prevgroup >
5526                    (INT64_OR_DOUBLE)INT_MAX ||
5527                OFLOW_MAX - *lengthptr < delta)
5528              {
5529              *errorcodeptr = ERR20;
5530              goto FAILED;
5531              }
5532            *lengthptr += delta;
5533            }
5534
5535          /* This is compiling for real. If there is a set first byte for
5536          the group, and we have not yet set a "required byte", set it. */
5537
5538          else
5539            {
5540            if (groupsetfirstcu && reqcuflags < 0)
5541              {
5542              reqcu = firstcu;
5543              reqcuflags = firstcuflags;
5544              }
5545            for (i = 1; i < repeat_min; i++)
5546              {
5547              memcpy(code, previous, CU2BYTES(len));
5548              code += len;
5549              }
5550            }
5551          }
5552
5553        if (repeat_max > 0) repeat_max -= repeat_min;
5554        }
5555
5556      /* This code is common to both the zero and non-zero minimum cases. If
5557      the maximum is limited, it replicates the group in a nested fashion,
5558      remembering the bracket starts on a stack. In the case of a zero minimum,
5559      the first one was set up above. In all cases the repeat_max now specifies
5560      the number of additional copies needed. Again, we must remember to
5561      replicate entries on the forward reference list. */
5562
5563      if (repeat_max >= 0)
5564        {
5565        /* In the pre-compile phase, we don't actually do the replication. We
5566        just adjust the length as if we had. For each repetition we must add 1
5567        to the length for BRAZERO and for all but the last repetition we must
5568        add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5569        paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5570        a 64-bit integer type when available, otherwise double. */
5571
5572        if (lengthptr != NULL && repeat_max > 0)
5573          {
5574          size_t delta = repeat_max*(length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5575                      2 - 2*LINK_SIZE;   /* Last one doesn't nest */
5576          if ((INT64_OR_DOUBLE)repeat_max *
5577                (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5578                  > (INT64_OR_DOUBLE)INT_MAX ||
5579              OFLOW_MAX - *lengthptr < delta)
5580            {
5581            *errorcodeptr = ERR20;
5582            goto FAILED;
5583            }
5584          *lengthptr += delta;
5585          }
5586
5587        /* This is compiling for real */
5588
5589        else for (i = repeat_max - 1; i >= 0; i--)
5590          {
5591          *code++ = OP_BRAZERO + repeat_type;
5592
5593          /* All but the final copy start a new nesting, maintaining the
5594          chain of brackets outstanding. */
5595
5596          if (i != 0)
5597            {
5598            int offset;
5599            *code++ = OP_BRA;
5600            offset = (bralink == NULL)? 0 : (int)(code - bralink);
5601            bralink = code;
5602            PUTINC(code, 0, offset);
5603            }
5604
5605          memcpy(code, previous, CU2BYTES(len));
5606          code += len;
5607          }
5608
5609        /* Now chain through the pending brackets, and fill in their length
5610        fields (which are holding the chain links pro tem). */
5611
5612        while (bralink != NULL)
5613          {
5614          int oldlinkoffset;
5615          int offset = (int)(code - bralink + 1);
5616          PCRE2_UCHAR *bra = code - offset;
5617          oldlinkoffset = GET(bra, 1);
5618          bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5619          *code++ = OP_KET;
5620          PUTINC(code, 0, offset);
5621          PUT(bra, 1, offset);
5622          }
5623        }
5624
5625      /* If the maximum is unlimited, set a repeater in the final copy. For
5626      ONCE brackets, that's all we need to do. However, possessively repeated
5627      ONCE brackets can be converted into non-capturing brackets, as the
5628      behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5629      deal with possessive ONCEs specially.
5630
5631      Otherwise, when we are doing the actual compile phase, check to see
5632      whether this group is one that could match an empty string. If so,
5633      convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5634      that runtime checking can be done. [This check is also applied to ONCE
5635      groups at runtime, but in a different way.]
5636
5637      Then, if the quantifier was possessive and the bracket is not a
5638      conditional, we convert the BRA code to the POS form, and the KET code to
5639      KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5640      subpattern at both the start and at the end.) The use of special opcodes
5641      makes it possible to reduce greatly the stack usage in pcre2_match(). If
5642      the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5643
5644      Then, if the minimum number of matches is 1 or 0, cancel the possessive
5645      flag so that the default action below, of wrapping everything inside
5646      atomic brackets, does not happen. When the minimum is greater than 1,
5647      there will be earlier copies of the group, and so we still have to wrap
5648      the whole thing. */
5649
5650      else
5651        {
5652        PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
5653        PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
5654
5655        /* Convert possessive ONCE brackets to non-capturing */
5656
5657        if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5658            possessive_quantifier) *bracode = OP_BRA;
5659
5660        /* For non-possessive ONCE brackets, all we need to do is to
5661        set the KET. */
5662
5663        if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5664          *ketcode = OP_KETRMAX + repeat_type;
5665
5666        /* Handle non-ONCE brackets and possessive ONCEs (which have been
5667        converted to non-capturing above). */
5668
5669        else
5670          {
5671          /* In the compile phase, check whether the group could match an empty
5672          string. */
5673
5674          if (lengthptr == NULL)
5675            {
5676            PCRE2_UCHAR *scode = bracode;
5677            do
5678              {
5679              int count = 0;
5680              int rc = could_be_empty_branch(scode, ketcode, utf, cb, FALSE,
5681                NULL, &count);
5682              if (rc < 0)
5683                {
5684                *errorcodeptr = ERR86;
5685                goto FAILED;
5686                }
5687              if (rc > 0)
5688                {
5689                *bracode += OP_SBRA - OP_BRA;
5690                break;
5691                }
5692              scode += GET(scode, 1);
5693              }
5694            while (*scode == OP_ALT);
5695
5696            /* A conditional group with only one branch has an implicit empty
5697            alternative branch. */
5698
5699            if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
5700              *bracode = OP_SCOND;
5701            }
5702
5703          /* Handle possessive quantifiers. */
5704
5705          if (possessive_quantifier)
5706            {
5707            /* For COND brackets, we wrap the whole thing in a possessively
5708            repeated non-capturing bracket, because we have not invented POS
5709            versions of the COND opcodes. */
5710
5711            if (*bracode == OP_COND || *bracode == OP_SCOND)
5712              {
5713              int nlen = (int)(code - bracode);
5714              memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
5715              code += 1 + LINK_SIZE;
5716              nlen += 1 + LINK_SIZE;
5717              *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
5718              *code++ = OP_KETRPOS;
5719              PUTINC(code, 0, nlen);
5720              PUT(bracode, 1, nlen);
5721              }
5722
5723            /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5724
5725            else
5726              {
5727              *bracode += 1;              /* Switch to xxxPOS opcodes */
5728              *ketcode = OP_KETRPOS;
5729              }
5730
5731            /* If the minimum is zero, mark it as possessive, then unset the
5732            possessive flag when the minimum is 0 or 1. */
5733
5734            if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5735            if (repeat_min < 2) possessive_quantifier = FALSE;
5736            }
5737
5738          /* Non-possessive quantifier */
5739
5740          else *ketcode = OP_KETRMAX + repeat_type;
5741          }
5742        }
5743      }
5744
5745    /* If previous is OP_FAIL, it was generated by an empty class []
5746    (PCRE2_ALLOW_EMPTY_CLASS is set). The other ways in which OP_FAIL can be
5747    generated, that is by (*FAIL) or (?!), set previous to NULL, which gives a
5748    "nothing to repeat" error above. We can just ignore the repeat in empty
5749    class case. */
5750
5751    else if (*previous == OP_FAIL) goto END_REPEAT;
5752
5753    /* Else there's some kind of shambles */
5754
5755    else
5756      {
5757      *errorcodeptr = ERR10;
5758      goto FAILED;
5759      }
5760
5761    /* If the character following a repeat is '+', possessive_quantifier is
5762    TRUE. For some opcodes, there are special alternative opcodes for this
5763    case. For anything else, we wrap the entire repeated item inside OP_ONCE
5764    brackets. Logically, the '+' notation is just syntactic sugar, taken from
5765    Sun's Java package, but the special opcodes can optimize it.
5766
5767    Some (but not all) possessively repeated subpatterns have already been
5768    completely handled in the code just above. For them, possessive_quantifier
5769    is always FALSE at this stage. Note that the repeated item starts at
5770    tempcode, not at previous, which might be the first part of a string whose
5771    (former) last char we repeated. */
5772
5773    if (possessive_quantifier)
5774      {
5775      int len;
5776
5777      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
5778      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
5779      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
5780      remains is greater than zero, there's a further opcode that can be
5781      handled. If not, do nothing, leaving the EXACT alone. */
5782
5783      switch(*tempcode)
5784        {
5785        case OP_TYPEEXACT:
5786        tempcode += PRIV(OP_lengths)[*tempcode] +
5787          ((tempcode[1 + IMM2_SIZE] == OP_PROP
5788          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5789        break;
5790
5791        /* CHAR opcodes are used for exacts whose count is 1. */
5792
5793        case OP_CHAR:
5794        case OP_CHARI:
5795        case OP_NOT:
5796        case OP_NOTI:
5797        case OP_EXACT:
5798        case OP_EXACTI:
5799        case OP_NOTEXACT:
5800        case OP_NOTEXACTI:
5801        tempcode += PRIV(OP_lengths)[*tempcode];
5802#ifdef SUPPORT_UNICODE
5803        if (utf && HAS_EXTRALEN(tempcode[-1]))
5804          tempcode += GET_EXTRALEN(tempcode[-1]);
5805#endif
5806        break;
5807
5808        /* For the class opcodes, the repeat operator appears at the end;
5809        adjust tempcode to point to it. */
5810
5811        case OP_CLASS:
5812        case OP_NCLASS:
5813        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
5814        break;
5815
5816#ifdef SUPPORT_WIDE_CHARS
5817        case OP_XCLASS:
5818        tempcode += GET(tempcode, 1);
5819        break;
5820#endif
5821        }
5822
5823      /* If tempcode is equal to code (which points to the end of the repeated
5824      item), it means we have skipped an EXACT item but there is no following
5825      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
5826      all other cases, tempcode will be pointing to the repeat opcode, and will
5827      be less than code, so the value of len will be greater than 0. */
5828
5829      len = (int)(code - tempcode);
5830      if (len > 0)
5831        {
5832        unsigned int repcode = *tempcode;
5833
5834        /* There is a table for possessifying opcodes, all of which are less
5835        than OP_CALLOUT. A zero entry means there is no possessified version.
5836        */
5837
5838        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
5839          *tempcode = opcode_possessify[repcode];
5840
5841        /* For opcodes without a special possessified version, wrap the item in
5842        ONCE brackets. */
5843
5844        else
5845          {
5846          memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
5847          code += 1 + LINK_SIZE;
5848          len += 1 + LINK_SIZE;
5849          tempcode[0] = OP_ONCE;
5850          *code++ = OP_KET;
5851          PUTINC(code, 0, len);
5852          PUT(tempcode, 1, len);
5853          }
5854        }
5855      }
5856
5857    /* In all cases we no longer have a previous item. We also set the
5858    "follows varying string" flag for subsequently encountered reqcus if
5859    it isn't already set and we have just passed a varying length item. */
5860
5861    END_REPEAT:
5862    previous = NULL;
5863    cb->req_varyopt |= reqvary;
5864    break;
5865
5866
5867    /* ===================================================================*/
5868    /* Start of nested parenthesized sub-expression, or lookahead or lookbehind
5869    or option setting or condition or all the other extended parenthesis forms.
5870    We must save the current high-water-mark for the forward reference list so
5871    that we know where they start for this group. However, because the list may
5872    be extended when there are very many forward references (usually the result
5873    of a replicated inner group), we must use an offset rather than an absolute
5874    address. Note that (?# comments are dealt with at the top of the loop;
5875    they do not get this far. */
5876
5877    case CHAR_LEFT_PARENTHESIS:
5878    ptr++;
5879
5880    /* Deal with various "verbs" that can be introduced by '*'. */
5881
5882    if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5883         || (MAX_255(ptr[1]) && ((cb->ctypes[ptr[1]] & ctype_letter) != 0))))
5884      {
5885      int i, namelen;
5886      int arglen = 0;
5887      const char *vn = verbnames;
5888      PCRE2_SPTR name = ptr + 1;
5889      PCRE2_SPTR arg = NULL;
5890      previous = NULL;
5891      ptr++;
5892
5893      /* Increment ptr, set namelen, check length */
5894
5895      READ_NAME(ctype_letter, ERR60, *errorcodeptr);
5896
5897      /* It appears that Perl allows any characters whatsoever, other than
5898      a closing parenthesis, to appear in arguments, so we no longer insist on
5899      letters, digits, and underscores. Perl does not, however, do any
5900      interpretation within arguments, and has no means of including a closing
5901      parenthesis. PCRE supports escape processing but only when it is
5902      requested by an option. Note that check_escape() will not return values
5903      greater than the code unit maximum when not in UTF mode. */
5904
5905      if (*ptr == CHAR_COLON)
5906        {
5907        arg = ++ptr;
5908
5909        if ((options & PCRE2_ALT_VERBNAMES) == 0)
5910          {
5911          arglen = 0;
5912          while (ptr < cb->end_pattern && *ptr != CHAR_RIGHT_PARENTHESIS)
5913            {
5914            ptr++;                                /* Check length as we go */
5915            arglen++;                             /* along, to avoid the   */
5916            if ((unsigned int)arglen > MAX_MARK)  /* possibility of overflow. */
5917              {
5918              *errorcodeptr = ERR76;
5919              goto FAILED;
5920              }
5921            }
5922          }
5923        else
5924          {
5925          /* The length check is in process_verb_name() */
5926          arglen = process_verb_name(&ptr, NULL, errorcodeptr, options,
5927            utf, cb);
5928          if (arglen < 0) goto FAILED;
5929          }
5930        }
5931
5932      if (*ptr != CHAR_RIGHT_PARENTHESIS)
5933        {
5934        *errorcodeptr = ERR60;
5935        goto FAILED;
5936        }
5937
5938      /* Scan the table of verb names */
5939
5940      for (i = 0; i < verbcount; i++)
5941        {
5942        if (namelen == verbs[i].len &&
5943            PRIV(strncmp_c8)(name, vn, namelen) == 0)
5944          {
5945          int setverb;
5946
5947          /* Check for open captures before ACCEPT and convert it to
5948          ASSERT_ACCEPT if in an assertion. */
5949
5950          if (verbs[i].op == OP_ACCEPT)
5951            {
5952            open_capitem *oc;
5953            if (arglen != 0)
5954              {
5955              *errorcodeptr = ERR59;
5956              goto FAILED;
5957              }
5958            cb->had_accept = TRUE;
5959
5960            /* In the first pass, just accumulate the length required;
5961            otherwise hitting (*ACCEPT) inside many nested parentheses can
5962            cause workspace overflow. */
5963
5964            for (oc = cb->open_caps; oc != NULL; oc = oc->next)
5965              {
5966              if (lengthptr != NULL)
5967                {
5968                *lengthptr += CU2BYTES(1) + IMM2_SIZE;
5969                }
5970              else
5971                {
5972                *code++ = OP_CLOSE;
5973                PUT2INC(code, 0, oc->number);
5974                }
5975              }
5976            setverb = *code++ =
5977              (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5978
5979            /* Do not set firstcu after *ACCEPT */
5980            if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
5981            }
5982
5983          /* Handle other cases with/without an argument */
5984
5985          else if (arglen == 0)    /* There is no argument */
5986            {
5987            if (verbs[i].op < 0)   /* Argument is mandatory */
5988              {
5989              *errorcodeptr = ERR66;
5990              goto FAILED;
5991              }
5992            setverb = *code++ = verbs[i].op;
5993            }
5994
5995          else                        /* An argument is present */
5996            {
5997            if (verbs[i].op_arg < 0)  /* Argument is forbidden */
5998              {
5999              *errorcodeptr = ERR59;
6000              goto FAILED;
6001              }
6002            setverb = *code++ = verbs[i].op_arg;
6003
6004            /* Arguments can be very long, especially in 16- and 32-bit modes,
6005            and can overflow the workspace in the first pass. Instead of
6006            putting the argument into memory, we just update the length counter
6007            and set up an empty argument. */
6008
6009            if (lengthptr != NULL)
6010              {
6011              *lengthptr += arglen;
6012              *code++ = 0;
6013              }
6014            else
6015              {
6016              *code++ = arglen;
6017              if ((options & PCRE2_ALT_VERBNAMES) != 0)
6018                {
6019                PCRE2_UCHAR *memcode = code;  /* code is "register" */
6020                (void)process_verb_name(&arg, &memcode, errorcodeptr, options,
6021                  utf, cb);
6022                code = memcode;
6023                }
6024              else   /* No argument processing */
6025                {
6026                memcpy(code, arg, CU2BYTES(arglen));
6027                code += arglen;
6028                }
6029              }
6030
6031            *code++ = 0;
6032            }
6033
6034          switch (setverb)
6035            {
6036            case OP_THEN:
6037            case OP_THEN_ARG:
6038            cb->external_flags |= PCRE2_HASTHEN;
6039            break;
6040
6041            case OP_PRUNE:
6042            case OP_PRUNE_ARG:
6043            case OP_SKIP:
6044            case OP_SKIP_ARG:
6045            cb->had_pruneorskip = TRUE;
6046            break;
6047            }
6048
6049          break;  /* Found verb, exit loop */
6050          }
6051
6052        vn += verbs[i].len + 1;
6053        }
6054
6055      if (i < verbcount) continue;    /* Successfully handled a verb */
6056      *errorcodeptr = ERR60;          /* Verb not recognized */
6057      goto FAILED;
6058      }
6059
6060    /* Initialization for "real" parentheses */
6061
6062    newoptions = options;
6063    skipunits = 0;
6064    bravalue = OP_CBRA;
6065    reset_bracount = FALSE;
6066
6067    /* Deal with the extended parentheses; all are introduced by '?', and the
6068    appearance of any of them means that this is not a capturing group. */
6069
6070    if (*ptr == CHAR_QUESTION_MARK)
6071      {
6072      int i, count;
6073      int namelen;                /* Must be signed */
6074      uint32_t index;
6075      uint32_t set, unset, *optset;
6076      named_group *ng;
6077      PCRE2_SPTR name;
6078      PCRE2_UCHAR *slot;
6079
6080      switch (*(++ptr))
6081        {
6082        /* ------------------------------------------------------------ */
6083        case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
6084        reset_bracount = TRUE;
6085        /* Fall through */
6086
6087        /* ------------------------------------------------------------ */
6088        case CHAR_COLON:          /* Non-capturing bracket */
6089        bravalue = OP_BRA;
6090        ptr++;
6091        break;
6092
6093        /* ------------------------------------------------------------ */
6094        case CHAR_LEFT_PARENTHESIS:
6095        bravalue = OP_COND;       /* Conditional group */
6096        tempptr = ptr;
6097
6098        /* A condition can be an assertion, a number (referring to a numbered
6099        group's having been set), a name (referring to a named group), or 'R',
6100        referring to recursion. R<digits> and R&name are also permitted for
6101        recursion tests.
6102
6103        There are ways of testing a named group: (?(name)) is used by Python;
6104        Perl 5.10 onwards uses (?(<name>) or (?('name')).
6105
6106        There is one unfortunate ambiguity, caused by history. 'R' can be the
6107        recursive thing or the name 'R' (and similarly for 'R' followed by
6108        digits). We look for a name first; if not found, we try the other case.
6109
6110        For compatibility with auto-callouts, we allow a callout to be
6111        specified before a condition that is an assertion. First, check for the
6112        syntax of a callout; if found, adjust the temporary pointer that is
6113        used to check for an assertion condition. That's all that is needed! */
6114
6115        if (ptr[1] == CHAR_QUESTION_MARK && ptr[2] == CHAR_C)
6116          {
6117          if (IS_DIGIT(ptr[3]) || ptr[3] == CHAR_RIGHT_PARENTHESIS)
6118            {
6119            for (i = 3;; i++) if (!IS_DIGIT(ptr[i])) break;
6120            if (ptr[i] == CHAR_RIGHT_PARENTHESIS)
6121              tempptr += i + 1;
6122            }
6123          else
6124            {
6125            uint32_t delimiter = 0;
6126            for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
6127              {
6128              if (ptr[3] == PRIV(callout_start_delims)[i])
6129                {
6130                delimiter = PRIV(callout_end_delims)[i];
6131                break;
6132                }
6133              }
6134            if (delimiter != 0)
6135              {
6136              for (i = 4; ptr + i < cb->end_pattern; i++)
6137                {
6138                if (ptr[i] == delimiter)
6139                  {
6140                  if (ptr[i+1] == delimiter) i++;
6141                  else
6142                    {
6143                    if (ptr[i+1] == CHAR_RIGHT_PARENTHESIS) tempptr += i + 2;
6144                    break;
6145                    }
6146                  }
6147                }
6148              }
6149            }
6150
6151          /* tempptr should now be pointing to the opening parenthesis of the
6152          assertion condition. */
6153
6154          if (*tempptr != CHAR_LEFT_PARENTHESIS)
6155            {
6156            *errorcodeptr = ERR28;
6157            goto FAILED;
6158            }
6159          }
6160
6161        /* For conditions that are assertions, check the syntax, and then exit
6162        the switch. This will take control down to where bracketed groups
6163        are processed. The assertion will be handled as part of the group,
6164        but we need to identify this case because the conditional assertion may
6165        not be quantified. */
6166
6167        if (tempptr[1] == CHAR_QUESTION_MARK &&
6168              (tempptr[2] == CHAR_EQUALS_SIGN ||
6169               tempptr[2] == CHAR_EXCLAMATION_MARK ||
6170                 (tempptr[2] == CHAR_LESS_THAN_SIGN &&
6171                   (tempptr[3] == CHAR_EQUALS_SIGN ||
6172                    tempptr[3] == CHAR_EXCLAMATION_MARK))))
6173          {
6174          cb->iscondassert = TRUE;
6175          break;
6176          }
6177
6178        /* Other conditions use OP_CREF/OP_DNCREF/OP_RREF/OP_DNRREF, and all
6179        need to skip at least 1+IMM2_SIZE bytes at the start of the group. */
6180
6181        code[1+LINK_SIZE] = OP_CREF;
6182        skipunits = 1+IMM2_SIZE;
6183        refsign = -1;     /* => not a number */
6184        namelen = -1;     /* => not a name; must set to avoid warning */
6185        name = NULL;      /* Always set to avoid warning */
6186        recno = 0;        /* Always set to avoid warning */
6187
6188        /* Point at character after (?( */
6189
6190        ptr++;
6191
6192        /* Check for (?(VERSION[>]=n.m), which is a facility whereby indirect
6193        users of PCRE2 via an application can discover which release of PCRE2
6194        is being used. */
6195
6196        if (PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
6197            ptr[7] != CHAR_RIGHT_PARENTHESIS)
6198          {
6199          BOOL ge = FALSE;
6200          int major = 0;
6201          int minor = 0;
6202
6203          ptr += 7;
6204          if (*ptr == CHAR_GREATER_THAN_SIGN)
6205            {
6206            ge = TRUE;
6207            ptr++;
6208            }
6209
6210          /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
6211          references its argument twice. */
6212
6213          if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
6214            {
6215            *errorcodeptr = ERR79;
6216            goto FAILED;
6217            }
6218
6219          while (IS_DIGIT(*ptr)) major = major * 10 + *ptr++ - '0';
6220          if (*ptr == CHAR_DOT)
6221            {
6222            ptr++;
6223            while (IS_DIGIT(*ptr)) minor = minor * 10 + *ptr++ - '0';
6224            if (minor < 10) minor *= 10;
6225            }
6226
6227          if (*ptr != CHAR_RIGHT_PARENTHESIS || minor > 99)
6228            {
6229            *errorcodeptr = ERR79;
6230            goto FAILED;
6231            }
6232
6233          if (ge)
6234            code[1+LINK_SIZE] = ((PCRE2_MAJOR > major) ||
6235              (PCRE2_MAJOR == major && PCRE2_MINOR >= minor))?
6236                OP_TRUE : OP_FALSE;
6237          else
6238            code[1+LINK_SIZE] = (PCRE2_MAJOR == major && PCRE2_MINOR == minor)?
6239              OP_TRUE : OP_FALSE;
6240
6241          ptr++;
6242          skipunits = 1;
6243          break;  /* End of condition processing */
6244          }
6245
6246        /* Check for a test for recursion in a named group. */
6247
6248        if (*ptr == CHAR_R && ptr[1] == CHAR_AMPERSAND)
6249          {
6250          terminator = -1;
6251          ptr += 2;
6252          code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
6253          }
6254
6255        /* Check for a test for a named group's having been set, using the Perl
6256        syntax (?(<name>) or (?('name'), and also allow for the original PCRE
6257        syntax of (?(name) or for (?(+n), (?(-n), and just (?(n). */
6258
6259        else if (*ptr == CHAR_LESS_THAN_SIGN)
6260          {
6261          terminator = CHAR_GREATER_THAN_SIGN;
6262          ptr++;
6263          }
6264        else if (*ptr == CHAR_APOSTROPHE)
6265          {
6266          terminator = CHAR_APOSTROPHE;
6267          ptr++;
6268          }
6269        else
6270          {
6271          terminator = CHAR_NULL;
6272          if (*ptr == CHAR_MINUS || *ptr == CHAR_PLUS) refsign = *ptr++;
6273            else if (IS_DIGIT(*ptr)) refsign = 0;
6274          }
6275
6276        /* Handle a number */
6277
6278        if (refsign >= 0)
6279          {
6280          while (IS_DIGIT(*ptr))
6281            {
6282            if (recno > INT_MAX / 10 - 1)  /* Integer overflow */
6283              {
6284              while (IS_DIGIT(*ptr)) ptr++;
6285              *errorcodeptr = ERR61;
6286              goto FAILED;
6287              }
6288            recno = recno * 10 + (int)(*ptr - CHAR_0);
6289            ptr++;
6290            }
6291          }
6292
6293        /* Otherwise we expect to read a name; anything else is an error. When
6294        the referenced name is one of a number of duplicates, a different
6295        opcode is used and it needs more memory. Unfortunately we cannot tell
6296        whether this is the case in the first pass, so we have to allow for
6297        more memory always. In the second pass, the addition to skipunits
6298        happens later. */
6299
6300        else
6301          {
6302          if (IS_DIGIT(*ptr))
6303            {
6304            *errorcodeptr = ERR44;  /* Group name must start with non-digit */
6305            goto FAILED;
6306            }
6307          if (!MAX_255(*ptr) || (cb->ctypes[*ptr] & ctype_word) == 0)
6308            {
6309            *errorcodeptr = ERR28;   /* Assertion expected */
6310            goto FAILED;
6311            }
6312          name = ptr;
6313          /* Increment ptr, set namelen, check length */
6314          READ_NAME(ctype_word, ERR48, *errorcodeptr);
6315          if (lengthptr != NULL) skipunits += IMM2_SIZE;
6316          }
6317
6318        /* Check the terminator */
6319
6320        if ((terminator > 0 && *ptr++ != (PCRE2_UCHAR)terminator) ||
6321            *ptr++ != CHAR_RIGHT_PARENTHESIS)
6322          {
6323          ptr--;                  /* Error offset */
6324          *errorcodeptr = ERR26;  /* Malformed number or name */
6325          goto FAILED;
6326          }
6327
6328        /* Do no further checking in the pre-compile phase. */
6329
6330        if (lengthptr != NULL) break;
6331
6332        /* In the real compile we do the work of looking for the actual
6333        reference. If refsign is not negative, it means we have a number in
6334        recno. */
6335
6336        if (refsign >= 0)
6337          {
6338          if (recno <= 0)
6339            {
6340            *errorcodeptr = ERR35;
6341            goto FAILED;
6342            }
6343          if (refsign != 0) recno = (refsign == CHAR_MINUS)?
6344            (cb->bracount + 1) - recno : recno + cb->bracount;
6345          if (recno <= 0 || (uint32_t)recno > cb->final_bracount)
6346            {
6347            *errorcodeptr = ERR15;
6348            goto FAILED;
6349            }
6350          PUT2(code, 2+LINK_SIZE, recno);
6351          if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6352          break;
6353          }
6354
6355        /* Otherwise look for the name. */
6356
6357        slot = cb->name_table;
6358        for (i = 0; i < cb->names_found; i++)
6359          {
6360          if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0) break;
6361          slot += cb->name_entry_size;
6362          }
6363
6364        /* Found the named subpattern. If the name is duplicated, add one to
6365        the opcode to change CREF/RREF into DNCREF/DNRREF and insert
6366        appropriate data values. Otherwise, just insert the unique subpattern
6367        number. */
6368
6369        if (i < cb->names_found)
6370          {
6371          int offset = i;            /* Offset of first name found */
6372
6373          count = 0;
6374          for (;;)
6375            {
6376            recno = GET2(slot, 0);   /* Number for last found */
6377            if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6378            count++;
6379            if (++i >= cb->names_found) break;
6380            slot += cb->name_entry_size;
6381            if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) != 0 ||
6382              (slot+IMM2_SIZE)[namelen] != 0) break;
6383            }
6384
6385          if (count > 1)
6386            {
6387            PUT2(code, 2+LINK_SIZE, offset);
6388            PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
6389            skipunits += IMM2_SIZE;
6390            code[1+LINK_SIZE]++;
6391            }
6392          else  /* Not a duplicated name */
6393            {
6394            PUT2(code, 2+LINK_SIZE, recno);
6395            }
6396          }
6397
6398        /* If terminator == CHAR_NULL it means that the name followed directly
6399        after the opening parenthesis [e.g. (?(abc)...] and in this case there
6400        are some further alternatives to try. For the cases where terminator !=
6401        CHAR_NULL [things like (?(<name>... or (?('name')... or (?(R&name)... ]
6402        we have now checked all the possibilities, so give an error. */
6403
6404        else if (terminator != CHAR_NULL)
6405          {
6406          *errorcodeptr = ERR15;
6407          goto FAILED;
6408          }
6409
6410        /* Check for (?(R) for recursion. Allow digits after R to specify a
6411        specific group number. */
6412
6413        else if (*name == CHAR_R)
6414          {
6415          recno = 0;
6416          for (i = 1; i < namelen; i++)
6417            {
6418            if (!IS_DIGIT(name[i]))
6419              {
6420              *errorcodeptr = ERR15;        /* Non-existent subpattern */
6421              goto FAILED;
6422              }
6423            if (recno > INT_MAX / 10 - 1)   /* Integer overflow */
6424              {
6425              *errorcodeptr = ERR61;
6426              goto FAILED;
6427              }
6428            recno = recno * 10 + name[i] - CHAR_0;
6429            }
6430          if (recno == 0) recno = RREF_ANY;
6431          code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
6432          PUT2(code, 2+LINK_SIZE, recno);
6433          }
6434
6435        /* Similarly, check for the (?(DEFINE) "condition", which is always
6436        false. During compilation we set OP_DEFINE to distinguish this from
6437        other OP_FALSE conditions so that it can be checked for having only one
6438        branch, but after that the opcode is changed to OP_FALSE. */
6439
6440        else if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
6441          {
6442          code[1+LINK_SIZE] = OP_DEFINE;
6443          skipunits = 1;
6444          }
6445
6446        /* Reference to an unidentified subpattern. */
6447
6448        else
6449          {
6450          *errorcodeptr = ERR15;
6451          goto FAILED;
6452          }
6453        break;
6454
6455
6456        /* ------------------------------------------------------------ */
6457        case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
6458        bravalue = OP_ASSERT;
6459        cb->assert_depth += 1;
6460        ptr++;
6461        break;
6462
6463        /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6464        thing to do, but Perl allows all assertions to be quantified, and when
6465        they contain capturing parentheses there may be a potential use for
6466        this feature. Not that that applies to a quantified (?!) but we allow
6467        it for uniformity. */
6468
6469        /* ------------------------------------------------------------ */
6470        case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
6471        ptr++;
6472        if (*ptr == CHAR_RIGHT_PARENTHESIS && ptr[1] != CHAR_ASTERISK &&
6473             ptr[1] != CHAR_PLUS && ptr[1] != CHAR_QUESTION_MARK &&
6474            (ptr[1] != CHAR_LEFT_CURLY_BRACKET || !is_counted_repeat(ptr+2)))
6475          {
6476          *code++ = OP_FAIL;
6477          previous = NULL;
6478          continue;
6479          }
6480        bravalue = OP_ASSERT_NOT;
6481        cb->assert_depth += 1;
6482        break;
6483
6484
6485        /* ------------------------------------------------------------ */
6486        case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
6487        switch (ptr[1])
6488          {
6489          case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
6490          bravalue = OP_ASSERTBACK;
6491          cb->assert_depth += 1;
6492          ptr += 2;
6493          break;
6494
6495          case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
6496          bravalue = OP_ASSERTBACK_NOT;
6497          cb->assert_depth += 1;
6498          ptr += 2;
6499          break;
6500
6501          /* Must be a name definition - as the syntax was checked in the
6502          pre-pass, we can assume here that it is valid. Skip over the name
6503          and go to handle the numbered group. */
6504
6505          default:
6506          while (*(++ptr) != CHAR_GREATER_THAN_SIGN);
6507          ptr++;
6508          goto NUMBERED_GROUP;
6509          }
6510        break;
6511
6512
6513        /* ------------------------------------------------------------ */
6514        case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
6515        bravalue = OP_ONCE;
6516        ptr++;
6517        break;
6518
6519
6520        /* ------------------------------------------------------------ */
6521        case CHAR_C:                 /* Callout */
6522        previous_callout = code;     /* Save for later completion */
6523        after_manual_callout = 1;    /* Skip one item before completing */
6524        ptr++;                       /* Character after (?C */
6525
6526        /* A callout may have a string argument, delimited by one of a fixed
6527        number of characters, or an undelimited numerical argument, or no
6528        argument, which is the same as (?C0). Different opcodes are used for
6529        the two cases. */
6530
6531        if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
6532          {
6533          uint32_t delimiter = 0;
6534
6535          for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
6536            {
6537            if (*ptr == PRIV(callout_start_delims)[i])
6538              {
6539              delimiter = PRIV(callout_end_delims)[i];
6540              break;
6541              }
6542            }
6543
6544          if (delimiter == 0)
6545            {
6546            *errorcodeptr = ERR82;
6547            goto FAILED;
6548            }
6549
6550          /* During the pre-compile phase, we parse the string and update the
6551          length. There is no need to generate any code. (In fact, the string
6552          has already been parsed in the pre-pass that looks for named
6553          parentheses, but it does no harm to leave this code in.) */
6554
6555          if (lengthptr != NULL)     /* Only check the string */
6556            {
6557            PCRE2_SPTR start = ptr;
6558            do
6559              {
6560              if (++ptr >= cb->end_pattern)
6561                {
6562                *errorcodeptr = ERR81;
6563                ptr = start;   /* To give a more useful message */
6564                goto FAILED;
6565                }
6566              if (ptr[0] == delimiter && ptr[1] == delimiter) ptr += 2;
6567              }
6568            while (ptr[0] != delimiter);
6569
6570            /* Start points to the opening delimiter, ptr points to the
6571            closing delimiter. We must allow for including the delimiter and
6572            for the terminating zero. Any doubled delimiters within the string
6573            make this an overestimate, but it is not worth bothering about. */
6574
6575            (*lengthptr) += (ptr - start) + 2 + (1 + 4*LINK_SIZE);
6576            }
6577
6578          /* In the real compile we can copy the string, knowing that it is
6579          syntactically OK. The starting delimiter is included so that the
6580          client can discover it if they want. We also pass the start offset to
6581          help a script language give better error messages. */
6582
6583          else
6584            {
6585            PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
6586            *callout_string++ = *ptr++;
6587            PUT(code, 1 + 3*LINK_SIZE, (int)(ptr - cb->start_pattern)); /* Start offset */
6588            for(;;)
6589              {
6590              if (*ptr == delimiter)
6591                {
6592                if (ptr[1] == delimiter) ptr++; else break;
6593                }
6594              *callout_string++ = *ptr++;
6595              }
6596            *callout_string++ = CHAR_NULL;
6597            code[0] = OP_CALLOUT_STR;
6598            PUT(code, 1, (int)(ptr + 2 - cb->start_pattern)); /* Next offset */
6599            PUT(code, 1 + LINK_SIZE, 0);      /* Default length */
6600            PUT(code, 1 + 2*LINK_SIZE,        /* Compute size */
6601                (int)(callout_string - code));
6602            code = callout_string;
6603            }
6604
6605          /* Advance to what should be the closing parenthesis, which is
6606          checked below. */
6607
6608          ptr++;
6609          }
6610
6611        /* Handle a callout with an optional numerical argument, which must be
6612        less than or equal to 255. A missing argument gives 0. */
6613
6614        else
6615          {
6616          int n = 0;
6617          code[0] = OP_CALLOUT;     /* Numerical callout */
6618          while (IS_DIGIT(*ptr))
6619            {
6620            n = n * 10 + *ptr++ - CHAR_0;
6621            if (n > 255)
6622              {
6623              *errorcodeptr = ERR38;
6624              goto FAILED;
6625              }
6626            }
6627          PUT(code, 1, (int)(ptr - cb->start_pattern + 1));  /* Next offset */
6628          PUT(code, 1 + LINK_SIZE, 0);                    /* Default length */
6629          code[1 + 2*LINK_SIZE] = n;                      /* Callout number */
6630          code += PRIV(OP_lengths)[OP_CALLOUT];
6631          }
6632
6633        /* Both formats must have a closing parenthesis */
6634
6635        if (*ptr != CHAR_RIGHT_PARENTHESIS)
6636          {
6637          *errorcodeptr = ERR39;
6638          goto FAILED;
6639          }
6640
6641        /* Callouts cannot be quantified. */
6642
6643        previous = NULL;
6644        continue;
6645
6646
6647        /* ------------------------------------------------------------ */
6648        case CHAR_P:              /* Python-style named subpattern handling */
6649        if (*(++ptr) == CHAR_EQUALS_SIGN ||
6650            *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6651          {
6652          is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6653          terminator = CHAR_RIGHT_PARENTHESIS;
6654          goto NAMED_REF_OR_RECURSE;
6655          }
6656        else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6657          {
6658          *errorcodeptr = ERR41;
6659          goto FAILED;
6660          }
6661        /* Fall through to handle (?P< as (?< is handled */
6662
6663
6664        /* ------------------------------------------------------------ */
6665        case CHAR_APOSTROPHE:   /* Define a name - note fall through above */
6666
6667        /* The syntax was checked and the list of names was set up in the
6668        pre-pass, so there is nothing to be done now except to skip over the
6669        name. */
6670
6671        terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6672                  CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6673        while (*(++ptr) != (unsigned int)terminator);
6674        ptr++;
6675        goto NUMBERED_GROUP;      /* Set up numbered group */
6676
6677
6678        /* ------------------------------------------------------------ */
6679        case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
6680        terminator = CHAR_RIGHT_PARENTHESIS;
6681        is_recurse = TRUE;
6682        /* Fall through */
6683
6684        /* We come here from the Python syntax above that handles both
6685        references (?P=name) and recursion (?P>name), as well as falling
6686        through from the Perl recursion syntax (?&name). We also come here from
6687        the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6688        .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6689
6690        NAMED_REF_OR_RECURSE:
6691        name = ++ptr;
6692        if (IS_DIGIT(*ptr))
6693          {
6694          *errorcodeptr = ERR44;   /* Group name must start with non-digit */
6695          goto FAILED;
6696          }
6697        /* Increment ptr, set namelen, check length */
6698        READ_NAME(ctype_word, ERR48, *errorcodeptr);
6699
6700        /* In the pre-compile phase, do a syntax check. */
6701
6702        if (lengthptr != NULL)
6703          {
6704          if (namelen == 0)
6705            {
6706            *errorcodeptr = ERR62;
6707            goto FAILED;
6708            }
6709          if (*ptr != (PCRE2_UCHAR)terminator)
6710            {
6711            *errorcodeptr = ERR42;
6712            goto FAILED;
6713            }
6714          }
6715
6716        /* Scan the list of names generated in the pre-pass in order to get
6717        a number and whether or not this name is duplicated. */
6718
6719        recno = 0;
6720        is_dupname = FALSE;
6721        ng = cb->named_groups;
6722
6723        for (i = 0; i < cb->names_found; i++, ng++)
6724          {
6725          if (namelen == ng->length &&
6726              PRIV(strncmp)(name, ng->name, namelen) == 0)
6727            {
6728            open_capitem *oc;
6729            is_dupname = ng->isdup;
6730            recno = ng->number;
6731
6732            /* For a recursion, that's all that is needed. We can now go to the
6733            code that handles numerical recursion. */
6734
6735            if (is_recurse) goto HANDLE_RECURSION;
6736
6737            /* For a back reference, update the back reference map and the
6738            maximum back reference. Then for each group we must check to see if
6739            it is recursive, that is, it is inside the group that it
6740            references. A flag is set so that the group can be made atomic. */
6741
6742            cb->backref_map |= (recno < 32)? (1u << recno) : 1;
6743            if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
6744
6745            for (oc = cb->open_caps; oc != NULL; oc = oc->next)
6746              {
6747              if (oc->number == recno)
6748                {
6749                oc->flag = TRUE;
6750                break;
6751                }
6752              }
6753            }
6754          }
6755
6756        /* If the name was not found we have a bad reference. */
6757
6758        if (recno == 0)
6759          {
6760          *errorcodeptr = ERR15;
6761          goto FAILED;
6762          }
6763
6764        /* If a back reference name is not duplicated, we can handle it as a
6765        numerical reference. */
6766
6767        if (!is_dupname) goto HANDLE_REFERENCE;
6768
6769        /* If a back reference name is duplicated, we generate a different
6770        opcode to a numerical back reference. In the second pass we must search
6771        for the index and count in the final name table. */
6772
6773        count = 0;
6774        index = 0;
6775
6776        if (lengthptr == NULL)
6777          {
6778          slot = cb->name_table;
6779          for (i = 0; i < cb->names_found; i++)
6780            {
6781            if (PRIV(strncmp)(name, slot+IMM2_SIZE, namelen) == 0 &&
6782                slot[IMM2_SIZE+namelen] == 0)
6783              {
6784              if (count == 0) index = i;
6785              count++;
6786              }
6787            slot += cb->name_entry_size;
6788            }
6789
6790          if (count == 0)
6791            {
6792            *errorcodeptr = ERR15;
6793            goto FAILED;
6794            }
6795          }
6796
6797        if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6798        previous = code;
6799        *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
6800        PUT2INC(code, 0, index);
6801        PUT2INC(code, 0, count);
6802        continue;  /* End of back ref handling */
6803
6804
6805        /* ------------------------------------------------------------ */
6806        case CHAR_R:              /* Recursion, same as (?0) */
6807        recno = 0;
6808        if (*(++ptr) != CHAR_RIGHT_PARENTHESIS)
6809          {
6810          *errorcodeptr = ERR29;
6811          goto FAILED;
6812          }
6813        goto HANDLE_RECURSION;
6814
6815
6816        /* ------------------------------------------------------------ */
6817        case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
6818        case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6819        case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6820          {
6821          terminator = CHAR_RIGHT_PARENTHESIS;
6822
6823          /* Come here from the \g<...> and \g'...' code (Oniguruma
6824          compatibility). However, the syntax has been checked to ensure that
6825          the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6826          be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6827          ever be taken. */
6828
6829          HANDLE_NUMERICAL_RECURSION:
6830
6831          if ((refsign = *ptr) == CHAR_PLUS)
6832            {
6833            ptr++;
6834            if (!IS_DIGIT(*ptr))
6835              {
6836              *errorcodeptr = ERR63;
6837              goto FAILED;
6838              }
6839            }
6840          else if (refsign == CHAR_MINUS)
6841            {
6842            if (!IS_DIGIT(ptr[1]))
6843              goto OTHER_CHAR_AFTER_QUERY;
6844            ptr++;
6845            }
6846
6847          recno = 0;
6848          while (IS_DIGIT(*ptr))
6849            {
6850            if (recno > INT_MAX / 10 - 1) /* Integer overflow */
6851              {
6852              while (IS_DIGIT(*ptr)) ptr++;
6853              *errorcodeptr = ERR61;
6854              goto FAILED;
6855              }
6856            recno = recno * 10 + *ptr++ - CHAR_0;
6857            }
6858
6859          if (*ptr != (PCRE2_UCHAR)terminator)
6860            {
6861            *errorcodeptr = ERR29;
6862            goto FAILED;
6863            }
6864
6865          if (refsign == CHAR_MINUS)
6866            {
6867            if (recno == 0)
6868              {
6869              *errorcodeptr = ERR58;
6870              goto FAILED;
6871              }
6872            recno = (int)(cb->bracount + 1) - recno;
6873            if (recno <= 0)
6874              {
6875              *errorcodeptr = ERR15;
6876              goto FAILED;
6877              }
6878            }
6879          else if (refsign == CHAR_PLUS)
6880            {
6881            if (recno == 0)
6882              {
6883              *errorcodeptr = ERR58;
6884              goto FAILED;
6885              }
6886            recno += cb->bracount;
6887            }
6888
6889          if ((uint32_t)recno > cb->final_bracount)
6890            {
6891            *errorcodeptr = ERR15;
6892            goto FAILED;
6893            }
6894
6895          /* Come here from code above that handles a named recursion.
6896          We insert the number of the called group after OP_RECURSE. At the
6897          end of compiling the pattern is scanned and these numbers are
6898          replaced by offsets within the pattern. It is done like this to avoid
6899          problems with forward references and adjusting offsets when groups
6900          are duplicated and moved (as discovered in previous implementations).
6901          Note that a recursion does not have a set first character (relevant
6902          if it is repeated, because it will then be wrapped with ONCE
6903          brackets). */
6904
6905          HANDLE_RECURSION:
6906          previous = code;
6907          *code = OP_RECURSE;
6908          PUT(code, 1, recno);
6909          code += 1 + LINK_SIZE;
6910          groupsetfirstcu = FALSE;
6911          cb->had_recurse = TRUE;
6912          }
6913
6914        /* Can't determine a first byte now */
6915
6916        if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6917        continue;
6918
6919
6920        /* ------------------------------------------------------------ */
6921        default:              /* Other characters: check option setting */
6922        OTHER_CHAR_AFTER_QUERY:
6923        set = unset = 0;
6924        optset = &set;
6925
6926        while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6927          {
6928          switch (*ptr++)
6929            {
6930            case CHAR_MINUS: optset = &unset; break;
6931
6932            case CHAR_J:    /* Record that it changed in the external options */
6933            *optset |= PCRE2_DUPNAMES;
6934            cb->external_flags |= PCRE2_JCHANGED;
6935            break;
6936
6937            case CHAR_i: *optset |= PCRE2_CASELESS; break;
6938            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
6939            case CHAR_s: *optset |= PCRE2_DOTALL; break;
6940            case CHAR_x: *optset |= PCRE2_EXTENDED; break;
6941            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
6942
6943            default:  *errorcodeptr = ERR11;
6944                      ptr--;    /* Correct the offset */
6945                      goto FAILED;
6946            }
6947          }
6948
6949        /* Set up the changed option bits, but don't change anything yet. */
6950
6951        newoptions = (options | set) & (~unset);
6952
6953        /* If the options ended with ')' this is not the start of a nested
6954        group with option changes, so the options change at this level. They
6955        must also be passed back for use in subsequent branches. Reset the
6956        greedy defaults and the case value for firstcu and reqcu. */
6957
6958        if (*ptr == CHAR_RIGHT_PARENTHESIS)
6959          {
6960          *optionsptr = options = newoptions;
6961          greedy_default = ((newoptions & PCRE2_UNGREEDY) != 0);
6962          greedy_non_default = greedy_default ^ 1;
6963          req_caseopt = ((newoptions & PCRE2_CASELESS) != 0)? REQ_CASELESS:0;
6964          previous = NULL;       /* This item can't be repeated */
6965          continue;              /* It is complete */
6966          }
6967
6968        /* If the options ended with ':' we are heading into a nested group
6969        with possible change of options. Such groups are non-capturing and are
6970        not assertions of any kind. All we need to do is skip over the ':';
6971        the newoptions value is handled below. */
6972
6973        bravalue = OP_BRA;
6974        ptr++;
6975        }     /* End of switch for character following (? */
6976      }       /* End of (? handling */
6977
6978    /* Opening parenthesis not followed by '*' or '?'. If PCRE2_NO_AUTO_CAPTURE
6979    is set, all unadorned brackets become non-capturing and behave like (?:...)
6980    brackets. */
6981
6982    else if ((options & PCRE2_NO_AUTO_CAPTURE) != 0)
6983      {
6984      bravalue = OP_BRA;
6985      }
6986
6987    /* Else we have a capturing group. */
6988
6989    else
6990      {
6991      NUMBERED_GROUP:
6992      cb->bracount += 1;
6993      PUT2(code, 1+LINK_SIZE, cb->bracount);
6994      skipunits = IMM2_SIZE;
6995      }
6996
6997    /* Process nested bracketed regex. First check for parentheses nested too
6998    deeply. */
6999
7000    if ((cb->parens_depth += 1) > (int)(cb->cx->parens_nest_limit))
7001      {
7002      *errorcodeptr = ERR19;
7003      goto FAILED;
7004      }
7005
7006    /* All assertions used not to be repeatable, but this was changed for Perl
7007    compatibility. All kinds can now be repeated except for assertions that are
7008    conditions (Perl also forbids these to be repeated). We copy code into a
7009    non-register variable (tempcode) in order to be able to pass its address
7010    because some compilers complain otherwise. At the start of a conditional
7011    group whose condition is an assertion, cb->iscondassert is set. We unset it
7012    here so as to allow assertions later in the group to be quantified. */
7013
7014    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT &&
7015        cb->iscondassert)
7016      {
7017      previous = NULL;
7018      cb->iscondassert = FALSE;
7019      }
7020    else
7021      {
7022      previous = code;
7023      }
7024
7025    *code = bravalue;
7026    tempcode = code;
7027    tempreqvary = cb->req_varyopt;        /* Save value before bracket */
7028    tempbracount = cb->bracount;          /* Save value before bracket */
7029    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
7030
7031    if (!compile_regex(
7032         newoptions,                      /* The complete new option state */
7033         &tempcode,                       /* Where to put code (updated) */
7034         &ptr,                            /* Input pointer (updated) */
7035         errorcodeptr,                    /* Where to put an error message */
7036         (bravalue == OP_ASSERTBACK ||
7037          bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
7038         reset_bracount,                  /* True if (?| group */
7039         skipunits,                       /* Skip over bracket number */
7040         cond_depth +
7041           ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
7042         &subfirstcu,                     /* For possible first char */
7043         &subfirstcuflags,
7044         &subreqcu,                       /* For possible last char */
7045         &subreqcuflags,
7046         bcptr,                           /* Current branch chain */
7047         cb,                              /* Compile data block */
7048         (lengthptr == NULL)? NULL :      /* Actual compile phase */
7049           &length_prevgroup              /* Pre-compile phase */
7050         ))
7051      goto FAILED;
7052
7053    cb->parens_depth -= 1;
7054
7055    /* If this was an atomic group and there are no capturing groups within it,
7056    generate OP_ONCE_NC instead of OP_ONCE. */
7057
7058    if (bravalue == OP_ONCE && cb->bracount <= tempbracount)
7059      *code = OP_ONCE_NC;
7060
7061    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
7062      cb->assert_depth -= 1;
7063
7064    /* At the end of compiling, code is still pointing to the start of the
7065    group, while tempcode has been updated to point past the end of the group.
7066    The pattern pointer (ptr) is on the bracket.
7067
7068    If this is a conditional bracket, check that there are no more than
7069    two branches in the group, or just one if it's a DEFINE group. We do this
7070    in the real compile phase, not in the pre-pass, where the whole group may
7071    not be available. */
7072
7073    if (bravalue == OP_COND && lengthptr == NULL)
7074      {
7075      PCRE2_UCHAR *tc = code;
7076      int condcount = 0;
7077
7078      do {
7079         condcount++;
7080         tc += GET(tc,1);
7081         }
7082      while (*tc != OP_KET);
7083
7084      /* A DEFINE group is never obeyed inline (the "condition" is always
7085      false). It must have only one branch. Having checked this, change the
7086      opcode to OP_FALSE. */
7087
7088      if (code[LINK_SIZE+1] == OP_DEFINE)
7089        {
7090        if (condcount > 1)
7091          {
7092          *errorcodeptr = ERR54;
7093          goto FAILED;
7094          }
7095        code[LINK_SIZE+1] = OP_FALSE;
7096        bravalue = OP_DEFINE;   /* Just a flag to suppress char handling below */
7097        }
7098
7099      /* A "normal" conditional group. If there is just one branch, we must not
7100      make use of its firstcu or reqcu, because this is equivalent to an
7101      empty second branch. */
7102
7103      else
7104        {
7105        if (condcount > 2)
7106          {
7107          *errorcodeptr = ERR27;
7108          goto FAILED;
7109          }
7110        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7111        }
7112      }
7113
7114    /* At the end of a group, it's an error if we hit end of pattern or
7115    any non-closing parenthesis. This check also happens in the pre-scan,
7116    so should not trigger here, but leave this code as an insurance. */
7117
7118    if (*ptr != CHAR_RIGHT_PARENTHESIS)
7119      {
7120      *errorcodeptr = ERR14;
7121      goto FAILED;
7122      }
7123
7124    /* In the pre-compile phase, update the length by the length of the group,
7125    less the brackets at either end. Then reduce the compiled code to just a
7126    set of non-capturing brackets so that it doesn't use much memory if it is
7127    duplicated by a quantifier.*/
7128
7129    if (lengthptr != NULL)
7130      {
7131      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7132        {
7133        *errorcodeptr = ERR20;
7134        goto FAILED;
7135        }
7136      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7137      code++;   /* This already contains bravalue */
7138      PUTINC(code, 0, 1 + LINK_SIZE);
7139      *code++ = OP_KET;
7140      PUTINC(code, 0, 1 + LINK_SIZE);
7141      break;    /* No need to waste time with special character handling */
7142      }
7143
7144    /* Otherwise update the main code pointer to the end of the group. */
7145
7146    code = tempcode;
7147
7148    /* For a DEFINE group, required and first character settings are not
7149    relevant. */
7150
7151    if (bravalue == OP_DEFINE) break;
7152
7153    /* Handle updating of the required and first characters for other types of
7154    group. Update for normal brackets of all kinds, and conditions with two
7155    branches (see code above). If the bracket is followed by a quantifier with
7156    zero repeat, we have to back off. Hence the definition of zeroreqcu and
7157    zerofirstcu outside the main loop so that they can be accessed for the
7158    back off. */
7159
7160    zeroreqcu = reqcu;
7161    zeroreqcuflags = reqcuflags;
7162    zerofirstcu = firstcu;
7163    zerofirstcuflags = firstcuflags;
7164    groupsetfirstcu = FALSE;
7165
7166    if (bravalue >= OP_ONCE)
7167      {
7168      /* If we have not yet set a firstcu in this branch, take it from the
7169      subpattern, remembering that it was set here so that a repeat of more
7170      than one can replicate it as reqcu if necessary. If the subpattern has
7171      no firstcu, set "none" for the whole branch. In both cases, a zero
7172      repeat forces firstcu to "none". */
7173
7174      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7175        {
7176        if (subfirstcuflags >= 0)
7177          {
7178          firstcu = subfirstcu;
7179          firstcuflags = subfirstcuflags;
7180          groupsetfirstcu = TRUE;
7181          }
7182        else firstcuflags = REQ_NONE;
7183        zerofirstcuflags = REQ_NONE;
7184        }
7185
7186      /* If firstcu was previously set, convert the subpattern's firstcu
7187      into reqcu if there wasn't one, using the vary flag that was in
7188      existence beforehand. */
7189
7190      else if (subfirstcuflags >= 0 && subreqcuflags < 0)
7191        {
7192        subreqcu = subfirstcu;
7193        subreqcuflags = subfirstcuflags | tempreqvary;
7194        }
7195
7196      /* If the subpattern set a required byte (or set a first byte that isn't
7197      really the first byte - see above), set it. */
7198
7199      if (subreqcuflags >= 0)
7200        {
7201        reqcu = subreqcu;
7202        reqcuflags = subreqcuflags;
7203        }
7204      }
7205
7206    /* For a forward assertion, we take the reqcu, if set. This can be
7207    helpful if the pattern that follows the assertion doesn't set a different
7208    char. For example, it's useful for /(?=abcde).+/. We can't set firstcu
7209    for an assertion, however, because it leads to incorrect results for patterns
7210    such as /(?=a)a.+/ when the "real" "a" would then become a reqcu instead
7211    of a firstcu. This is overcome by a scan at the end if there's no
7212    firstcu, looking for an asserted first char. */
7213
7214    else if (bravalue == OP_ASSERT && subreqcuflags >= 0)
7215      {
7216      reqcu = subreqcu;
7217      reqcuflags = subreqcuflags;
7218      }
7219    break;     /* End of processing '(' */
7220
7221
7222    /* ===================================================================*/
7223    /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
7224    are arranged to be the negation of the corresponding OP_values in the
7225    default case when PCRE2_UCP is not set. For the back references, the values
7226    are the negated reference number. Only back references and those types
7227    that consume a character may be repeated. We can test for values between
7228    ESC_b and ESC_Z for the latter; this may have to change if any new ones are
7229    ever created.
7230
7231    Note: \Q and \E are handled at the start of the character-processing loop,
7232    not here. */
7233
7234    case CHAR_BACKSLASH:
7235    tempptr = ptr;
7236    escape = PRIV(check_escape)(&ptr, cb->end_pattern, &ec, errorcodeptr,
7237      options, FALSE, cb);
7238    if (*errorcodeptr != 0) goto FAILED;
7239
7240    if (escape == 0)                  /* The escape coded a single character */
7241      c = ec;
7242    else
7243      {
7244      /* For metasequences that actually match a character, we disable the
7245      setting of a first character if it hasn't already been set. */
7246
7247      if (firstcuflags == REQ_UNSET && escape > ESC_b && escape < ESC_Z)
7248        firstcuflags = REQ_NONE;
7249
7250      /* Set values to reset to if this is followed by a zero repeat. */
7251
7252      zerofirstcu = firstcu;
7253      zerofirstcuflags = firstcuflags;
7254      zeroreqcu = reqcu;
7255      zeroreqcuflags = reqcuflags;
7256
7257      /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
7258      is a subroutine call by number (Oniguruma syntax). In fact, the value
7259      ESC_g is returned only for these cases. So we don't need to check for <
7260      or ' if the value is ESC_g. For the Perl syntax \g{n} the value is
7261      -n, and for the Perl syntax \g{name} the result is ESC_k (as
7262      that is a synonym for a named back reference). */
7263
7264      if (escape == ESC_g)
7265        {
7266        PCRE2_SPTR p;
7267        uint32_t cf;
7268
7269        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7270          CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
7271
7272        /* These two statements stop the compiler from warning about possibly
7273        unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
7274        fact, because we do the check for a number below, the paths that
7275        would actually be in error are never taken. */
7276
7277        skipunits = 0;
7278        reset_bracount = FALSE;
7279
7280        /* If it's not a signed or unsigned number, treat it as a name. */
7281
7282        cf = ptr[1];
7283        if (cf != CHAR_PLUS && cf != CHAR_MINUS && !IS_DIGIT(cf))
7284          {
7285          is_recurse = TRUE;
7286          goto NAMED_REF_OR_RECURSE;
7287          }
7288
7289        /* Signed or unsigned number (cf = ptr[1]) is known to be plus or minus
7290        or a digit. */
7291
7292        p = ptr + 2;
7293        while (IS_DIGIT(*p)) p++;
7294        if (*p != (PCRE2_UCHAR)terminator)
7295          {
7296          *errorcodeptr = ERR57;
7297          goto FAILED;
7298          }
7299        ptr++;
7300        goto HANDLE_NUMERICAL_RECURSION;
7301        }
7302
7303      /* \k<name> or \k'name' is a back reference by name (Perl syntax).
7304      We also support \k{name} (.NET syntax).  */
7305
7306      if (escape == ESC_k)
7307        {
7308        if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
7309          ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
7310          {
7311          *errorcodeptr = ERR69;
7312          goto FAILED;
7313          }
7314        is_recurse = FALSE;
7315        terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
7316          CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
7317          CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
7318        goto NAMED_REF_OR_RECURSE;
7319        }
7320
7321      /* Back references are handled specially; must disable firstcu if
7322      not set to cope with cases like (?=(\w+))\1: which would otherwise set
7323      ':' later. */
7324
7325      if (escape < 0)
7326        {
7327        open_capitem *oc;
7328        recno = -escape;
7329
7330        /* Come here from named backref handling when the reference is to a
7331        single group (i.e. not to a duplicated name). */
7332
7333        HANDLE_REFERENCE:
7334        if (recno > (int)cb->final_bracount)
7335          {
7336          *errorcodeptr = ERR15;
7337          goto FAILED;
7338          }
7339        if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7340        previous = code;
7341        *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
7342        PUT2INC(code, 0, recno);
7343        cb->backref_map |= (recno < 32)? (1u << recno) : 1;
7344        if ((uint32_t)recno > cb->top_backref) cb->top_backref = recno;
7345
7346        /* Check to see if this back reference is recursive, that is, it
7347        is inside the group that it references. A flag is set so that the
7348        group can be made atomic. */
7349
7350        for (oc = cb->open_caps; oc != NULL; oc = oc->next)
7351          {
7352          if (oc->number == recno)
7353            {
7354            oc->flag = TRUE;
7355            break;
7356            }
7357          }
7358        }
7359
7360      /* So are Unicode property matches, if supported. */
7361
7362#ifdef SUPPORT_UNICODE
7363      else if (escape == ESC_P || escape == ESC_p)
7364        {
7365        BOOL negated;
7366        unsigned int ptype = 0, pdata = 0;
7367        if (!get_ucp(&ptr, &negated, &ptype, &pdata, errorcodeptr, cb))
7368          goto FAILED;
7369        previous = code;
7370        *code++ = ((escape == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
7371        *code++ = ptype;
7372        *code++ = pdata;
7373        }
7374#else
7375
7376      /* If Unicode properties are not supported, \X, \P, and \p are not
7377      allowed. */
7378
7379      else if (escape == ESC_X || escape == ESC_P || escape == ESC_p)
7380        {
7381        *errorcodeptr = ERR45;
7382        goto FAILED;
7383        }
7384#endif
7385
7386      /* The use of \C can be locked out. */
7387
7388#ifdef NEVER_BACKSLASH_C
7389      else if (escape == ESC_C)
7390        {
7391        *errorcodeptr = ERR85;
7392        goto FAILED;
7393        }
7394#else
7395      else if (escape == ESC_C && (options & PCRE2_NEVER_BACKSLASH_C) != 0)
7396        {
7397        *errorcodeptr = ERR83;
7398        goto FAILED;
7399        }
7400#endif
7401
7402      /* For the rest (including \X when Unicode properties are supported), we
7403      can obtain the OP value by negating the escape value in the default
7404      situation when PCRE2_UCP is not set. When it *is* set, we substitute
7405      Unicode property tests. Note that \b and \B do a one-character
7406      lookbehind, and \A also behaves as if it does. */
7407
7408      else
7409        {
7410        if (escape == ESC_C) cb->external_flags |= PCRE2_HASBKC; /* Record */
7411        if ((escape == ESC_b || escape == ESC_B || escape == ESC_A) &&
7412             cb->max_lookbehind == 0)
7413          cb->max_lookbehind = 1;
7414#ifdef SUPPORT_UNICODE
7415        if (escape >= ESC_DU && escape <= ESC_wu)
7416          {
7417          cb->nestptr[1] = cb->nestptr[0];         /* Back up if at 2nd level */
7418          cb->nestptr[0] = ptr + 1;                /* Where to resume */
7419          ptr = substitutes[escape - ESC_DU] - 1;  /* Just before substitute */
7420          }
7421        else
7422#endif
7423        /* In non-UTF mode, and for both 32-bit modes, we turn \C into
7424        OP_ALLANY instead of OP_ANYBYTE so that it works in DFA mode and in
7425        lookbehinds. */
7426
7427          {
7428          previous = (escape > ESC_b && escape < ESC_Z)? code : NULL;
7429#if PCRE2_CODE_UNIT_WIDTH == 32
7430          *code++ = (escape == ESC_C)? OP_ALLANY : escape;
7431#else
7432          *code++ = (!utf && escape == ESC_C)? OP_ALLANY : escape;
7433#endif
7434          }
7435        }
7436      continue;
7437      }
7438
7439    /* We have a data character whose value is in c. In UTF-8 mode it may have
7440    a value > 127. We set its representation in the length/buffer, and then
7441    handle it as a data character. */
7442
7443    mclength = PUTCHAR(c, mcbuffer);
7444    goto ONE_CHAR;
7445
7446
7447    /* ===================================================================*/
7448    /* Handle a literal character. It is guaranteed not to be whitespace or #
7449    when the extended flag is set. If we are in a UTF mode, it may be a
7450    multi-unit literal character. */
7451
7452    default:
7453    NORMAL_CHAR:
7454    mclength = 1;
7455    mcbuffer[0] = c;
7456
7457#ifdef SUPPORT_UNICODE
7458    if (utf && HAS_EXTRALEN(c))
7459      ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
7460#endif
7461
7462    /* At this point we have the character's bytes in mcbuffer, and the length
7463    in mclength. When not in UTF mode, the length is always 1. */
7464
7465    ONE_CHAR:
7466    previous = code;
7467
7468    /* For caseless UTF mode, check whether this character has more than one
7469    other case. If so, generate a special OP_PROP item instead of OP_CHARI. */
7470
7471#ifdef SUPPORT_UNICODE
7472    if (utf && (options & PCRE2_CASELESS) != 0)
7473      {
7474      GETCHAR(c, mcbuffer);
7475      if ((c = UCD_CASESET(c)) != 0)
7476        {
7477        *code++ = OP_PROP;
7478        *code++ = PT_CLIST;
7479        *code++ = c;
7480        if (firstcuflags == REQ_UNSET)
7481          firstcuflags = zerofirstcuflags = REQ_NONE;
7482        break;
7483        }
7484      }
7485#endif
7486
7487    /* Caseful matches, or not one of the multicase characters. */
7488
7489    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
7490    for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
7491
7492    /* Remember if \r or \n were seen */
7493
7494    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
7495      cb->external_flags |= PCRE2_HASCRORLF;
7496
7497    /* Set the first and required bytes appropriately. If no previous first
7498    byte, set it from this character, but revert to none on a zero repeat.
7499    Otherwise, leave the firstcu value alone, and don't change it on a zero
7500    repeat. */
7501
7502    if (firstcuflags == REQ_UNSET)
7503      {
7504      zerofirstcuflags = REQ_NONE;
7505      zeroreqcu = reqcu;
7506      zeroreqcuflags = reqcuflags;
7507
7508      /* If the character is more than one byte long, we can set firstcu
7509      only if it is not to be matched caselessly. */
7510
7511      if (mclength == 1 || req_caseopt == 0)
7512        {
7513        firstcu = mcbuffer[0] | req_caseopt;
7514        firstcu = mcbuffer[0];
7515        firstcuflags = req_caseopt;
7516
7517        if (mclength != 1)
7518          {
7519          reqcu = code[-1];
7520          reqcuflags = cb->req_varyopt;
7521          }
7522        }
7523      else firstcuflags = reqcuflags = REQ_NONE;
7524      }
7525
7526    /* firstcu was previously set; we can set reqcu only if the length is
7527    1 or the matching is caseful. */
7528
7529    else
7530      {
7531      zerofirstcu = firstcu;
7532      zerofirstcuflags = firstcuflags;
7533      zeroreqcu = reqcu;
7534      zeroreqcuflags = reqcuflags;
7535      if (mclength == 1 || req_caseopt == 0)
7536        {
7537        reqcu = code[-1];
7538        reqcuflags = req_caseopt | cb->req_varyopt;
7539        }
7540      }
7541
7542    break;            /* End of literal character handling */
7543    }
7544  }                   /* end of big loop */
7545
7546/* Control never reaches here by falling through, only by a goto for all the
7547error states. Pass back the position in the pattern so that it can be displayed
7548to the user for diagnosing the error. */
7549
7550FAILED:
7551*ptrptr = ptr;
7552return FALSE;
7553}
7554
7555
7556
7557/*************************************************
7558*   Compile regex: a sequence of alternatives    *
7559*************************************************/
7560
7561/* On entry, ptr is pointing past the bracket character, but on return it
7562points to the closing bracket, or vertical bar, or end of string. The code
7563variable is pointing at the byte into which the BRA operator has been stored.
7564This function is used during the pre-compile phase when we are trying to find
7565out the amount of memory needed, as well as during the real compile phase. The
7566value of lengthptr distinguishes the two phases.
7567
7568Arguments:
7569  options           option bits, including any changes for this subpattern
7570  codeptr           -> the address of the current code pointer
7571  ptrptr            -> the address of the current pattern pointer
7572  errorcodeptr      -> pointer to error code variable
7573  lookbehind        TRUE if this is a lookbehind assertion
7574  reset_bracount    TRUE to reset the count for each branch
7575  skipunits         skip this many code units at start (for brackets and OP_COND)
7576  cond_depth        depth of nesting for conditional subpatterns
7577  firstcuptr        place to put the first required code unit
7578  firstcuflagsptr   place to put the first code unit flags, or a negative number
7579  reqcuptr          place to put the last required code unit
7580  reqcuflagsptr     place to put the last required code unit flags, or a negative number
7581  bcptr             pointer to the chain of currently open branches
7582  cb                points to the data block with tables pointers etc.
7583  lengthptr         NULL during the real compile phase
7584                    points to length accumulator during pre-compile phase
7585
7586Returns:            TRUE on success
7587*/
7588
7589static BOOL
7590compile_regex(uint32_t options, PCRE2_UCHAR **codeptr, PCRE2_SPTR *ptrptr,
7591  int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, uint32_t skipunits,
7592  int cond_depth, uint32_t *firstcuptr, int32_t *firstcuflagsptr,
7593  uint32_t *reqcuptr, int32_t *reqcuflagsptr, branch_chain *bcptr,
7594  compile_block *cb, size_t *lengthptr)
7595{
7596PCRE2_SPTR ptr = *ptrptr;
7597PCRE2_UCHAR *code = *codeptr;
7598PCRE2_UCHAR *last_branch = code;
7599PCRE2_UCHAR *start_bracket = code;
7600PCRE2_UCHAR *reverse_count = NULL;
7601open_capitem capitem;
7602int capnumber = 0;
7603uint32_t firstcu, reqcu;
7604int32_t firstcuflags, reqcuflags;
7605uint32_t branchfirstcu, branchreqcu;
7606int32_t branchfirstcuflags, branchreqcuflags;
7607size_t length;
7608unsigned int orig_bracount;
7609unsigned int max_bracount;
7610branch_chain bc;
7611
7612/* If set, call the external function that checks for stack availability. */
7613
7614if (cb->cx->stack_guard != NULL &&
7615    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
7616  {
7617  *errorcodeptr= ERR33;
7618  return FALSE;
7619  }
7620
7621/* Miscellaneous initialization */
7622
7623bc.outer = bcptr;
7624bc.current_branch = code;
7625
7626firstcu = reqcu = 0;
7627firstcuflags = reqcuflags = REQ_UNSET;
7628
7629/* Accumulate the length for use in the pre-compile phase. Start with the
7630length of the BRA and KET and any extra code units that are required at the
7631beginning. We accumulate in a local variable to save frequent testing of
7632lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
7633start and end of each alternative, because compiled items are discarded during
7634the pre-compile phase so that the work space is not exceeded. */
7635
7636length = 2 + 2*LINK_SIZE + skipunits;
7637
7638/* WARNING: If the above line is changed for any reason, you must also change
7639the code that abstracts option settings at the start of the pattern and makes
7640them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7641pre-compile phase to find out whether or not anything has yet been compiled.
7642
7643If this is a capturing subpattern, add to the chain of open capturing items
7644so that we can detect them if (*ACCEPT) is encountered. This is also used to
7645detect groups that contain recursive back references to themselves. Note that
7646only OP_CBRA need be tested here; changing this opcode to one of its variants,
7647e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7648
7649if (*code == OP_CBRA)
7650  {
7651  capnumber = GET2(code, 1 + LINK_SIZE);
7652  capitem.number = capnumber;
7653  capitem.next = cb->open_caps;
7654  capitem.flag = FALSE;
7655  cb->open_caps = &capitem;
7656  }
7657
7658/* Offset is set zero to mark that this bracket is still open */
7659
7660PUT(code, 1, 0);
7661code += 1 + LINK_SIZE + skipunits;
7662
7663/* Loop for each alternative branch */
7664
7665orig_bracount = max_bracount = cb->bracount;
7666
7667for (;;)
7668  {
7669  /* For a (?| group, reset the capturing bracket count so that each branch
7670  uses the same numbers. */
7671
7672  if (reset_bracount) cb->bracount = orig_bracount;
7673
7674  /* Set up dummy OP_REVERSE if lookbehind assertion */
7675
7676  if (lookbehind)
7677    {
7678    *code++ = OP_REVERSE;
7679    reverse_count = code;
7680    PUTINC(code, 0, 0);
7681    length += 1 + LINK_SIZE;
7682    }
7683
7684  /* Now compile the branch; in the pre-compile phase its length gets added
7685  into the length. */
7686
7687  if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstcu,
7688        &branchfirstcuflags, &branchreqcu, &branchreqcuflags, &bc,
7689        cond_depth, cb, (lengthptr == NULL)? NULL : &length))
7690    {
7691    *ptrptr = ptr;
7692    return FALSE;
7693    }
7694
7695  /* Keep the highest bracket count in case (?| was used and some branch
7696  has fewer than the rest. */
7697
7698  if (cb->bracount > max_bracount) max_bracount = cb->bracount;
7699
7700  /* In the real compile phase, there is some post-processing to be done. */
7701
7702  if (lengthptr == NULL)
7703    {
7704    /* If this is the first branch, the firstcu and reqcu values for the
7705    branch become the values for the regex. */
7706
7707    if (*last_branch != OP_ALT)
7708      {
7709      firstcu = branchfirstcu;
7710      firstcuflags = branchfirstcuflags;
7711      reqcu = branchreqcu;
7712      reqcuflags = branchreqcuflags;
7713      }
7714
7715    /* If this is not the first branch, the first char and reqcu have to
7716    match the values from all the previous branches, except that if the
7717    previous value for reqcu didn't have REQ_VARY set, it can still match,
7718    and we set REQ_VARY for the regex. */
7719
7720    else
7721      {
7722      /* If we previously had a firstcu, but it doesn't match the new branch,
7723      we have to abandon the firstcu for the regex, but if there was
7724      previously no reqcu, it takes on the value of the old firstcu. */
7725
7726      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
7727        {
7728        if (firstcuflags >= 0)
7729          {
7730          if (reqcuflags < 0)
7731            {
7732            reqcu = firstcu;
7733            reqcuflags = firstcuflags;
7734            }
7735          }
7736        firstcuflags = REQ_NONE;
7737        }
7738
7739      /* If we (now or from before) have no firstcu, a firstcu from the
7740      branch becomes a reqcu if there isn't a branch reqcu. */
7741
7742      if (firstcuflags < 0 && branchfirstcuflags >= 0 &&
7743          branchreqcuflags < 0)
7744        {
7745        branchreqcu = branchfirstcu;
7746        branchreqcuflags = branchfirstcuflags;
7747        }
7748
7749      /* Now ensure that the reqcus match */
7750
7751      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
7752          reqcu != branchreqcu)
7753        reqcuflags = REQ_NONE;
7754      else
7755        {
7756        reqcu = branchreqcu;
7757        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY */
7758        }
7759      }
7760
7761    /* If lookbehind, check that this branch matches a fixed-length string, and
7762    put the length into the OP_REVERSE item. Temporarily mark the end of the
7763    branch with OP_END. If the branch contains OP_RECURSE, the result is
7764    FFL_LATER (a negative value) because there may be forward references that
7765    we can't check here. Set a flag to cause another lookbehind check at the
7766    end. Why not do it all at the end? Because common errors can be picked up
7767    here and the offset of the problem can be shown. */
7768
7769    if (lookbehind)
7770      {
7771      int fixed_length;
7772      int count = 0;
7773      *code = OP_END;
7774      fixed_length = find_fixedlength(last_branch,  (options & PCRE2_UTF) != 0,
7775        FALSE, cb, NULL, &count);
7776      if (fixed_length == FFL_LATER)
7777        {
7778        cb->check_lookbehind = TRUE;
7779        }
7780      else if (fixed_length < 0)
7781        {
7782        *errorcodeptr = fixed_length_errors[-fixed_length];
7783        *ptrptr = ptr;
7784        return FALSE;
7785        }
7786      else
7787        {
7788        if (fixed_length > cb->max_lookbehind)
7789          cb->max_lookbehind = fixed_length;
7790        PUT(reverse_count, 0, fixed_length);
7791        }
7792      }
7793    }
7794
7795  /* Reached end of expression, either ')' or end of pattern. In the real
7796  compile phase, go back through the alternative branches and reverse the chain
7797  of offsets, with the field in the BRA item now becoming an offset to the
7798  first alternative. If there are no alternatives, it points to the end of the
7799  group. The length in the terminating ket is always the length of the whole
7800  bracketed item. Return leaving the pointer at the terminating char. */
7801
7802  if (*ptr != CHAR_VERTICAL_LINE)
7803    {
7804    if (lengthptr == NULL)
7805      {
7806      size_t branch_length = code - last_branch;
7807      do
7808        {
7809        size_t prev_length = GET(last_branch, 1);
7810        PUT(last_branch, 1, branch_length);
7811        branch_length = prev_length;
7812        last_branch -= branch_length;
7813        }
7814      while (branch_length > 0);
7815      }
7816
7817    /* Fill in the ket */
7818
7819    *code = OP_KET;
7820    PUT(code, 1, (int)(code - start_bracket));
7821    code += 1 + LINK_SIZE;
7822
7823    /* If it was a capturing subpattern, check to see if it contained any
7824    recursive back references. If so, we must wrap it in atomic brackets. In
7825    any event, remove the block from the chain. */
7826
7827    if (capnumber > 0)
7828      {
7829      if (cb->open_caps->flag)
7830        {
7831        memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7832          CU2BYTES(code - start_bracket));
7833        *start_bracket = OP_ONCE;
7834        code += 1 + LINK_SIZE;
7835        PUT(start_bracket, 1, (int)(code - start_bracket));
7836        *code = OP_KET;
7837        PUT(code, 1, (int)(code - start_bracket));
7838        code += 1 + LINK_SIZE;
7839        length += 2 + 2*LINK_SIZE;
7840        }
7841      cb->open_caps = cb->open_caps->next;
7842      }
7843
7844    /* Retain the highest bracket number, in case resetting was used. */
7845
7846    cb->bracount = max_bracount;
7847
7848    /* Set values to pass back */
7849
7850    *codeptr = code;
7851    *ptrptr = ptr;
7852    *firstcuptr = firstcu;
7853    *firstcuflagsptr = firstcuflags;
7854    *reqcuptr = reqcu;
7855    *reqcuflagsptr = reqcuflags;
7856    if (lengthptr != NULL)
7857      {
7858      if (OFLOW_MAX - *lengthptr < length)
7859        {
7860        *errorcodeptr = ERR20;
7861        return FALSE;
7862        }
7863      *lengthptr += length;
7864      }
7865    return TRUE;
7866    }
7867
7868  /* Another branch follows. In the pre-compile phase, we can move the code
7869  pointer back to where it was for the start of the first branch. (That is,
7870  pretend that each branch is the only one.)
7871
7872  In the real compile phase, insert an ALT node. Its length field points back
7873  to the previous branch while the bracket remains open. At the end the chain
7874  is reversed. It's done like this so that the start of the bracket has a
7875  zero offset until it is closed, making it possible to detect recursion. */
7876
7877  if (lengthptr != NULL)
7878    {
7879    code = *codeptr + 1 + LINK_SIZE + skipunits;
7880    length += 1 + LINK_SIZE;
7881    }
7882  else
7883    {
7884    *code = OP_ALT;
7885    PUT(code, 1, (int)(code - last_branch));
7886    bc.current_branch = last_branch = code;
7887    code += 1 + LINK_SIZE;
7888    }
7889
7890  /* Advance past the vertical bar */
7891
7892  ptr++;
7893  }
7894/* Control never reaches here */
7895}
7896
7897
7898
7899/*************************************************
7900*          Check for anchored pattern            *
7901*************************************************/
7902
7903/* Try to find out if this is an anchored regular expression. Consider each
7904alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7905all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7906it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7907be found, because ^ generates OP_CIRCM in that mode.
7908
7909We can also consider a regex to be anchored if OP_SOM starts all its branches.
7910This is the code for \G, which means "match at start of match position, taking
7911into account the match offset".
7912
7913A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7914because that will try the rest of the pattern at all possible matching points,
7915so there is no point trying again.... er ....
7916
7917.... except when the .* appears inside capturing parentheses, and there is a
7918subsequent back reference to those parentheses. We haven't enough information
7919to catch that case precisely.
7920
7921At first, the best we could do was to detect when .* was in capturing brackets
7922and the highest back reference was greater than or equal to that level.
7923However, by keeping a bitmap of the first 31 back references, we can catch some
7924of the more common cases more precisely.
7925
7926... A second exception is when the .* appears inside an atomic group, because
7927this prevents the number of characters it matches from being adjusted.
7928
7929Arguments:
7930  code           points to start of the compiled pattern
7931  bracket_map    a bitmap of which brackets we are inside while testing; this
7932                   handles up to substring 31; after that we just have to take
7933                   the less precise approach
7934  cb             points to the compile data block
7935  atomcount      atomic group level
7936
7937Returns:     TRUE or FALSE
7938*/
7939
7940static BOOL
7941is_anchored(register PCRE2_SPTR code, unsigned int bracket_map,
7942  compile_block *cb, int atomcount)
7943{
7944do {
7945   PCRE2_SPTR scode = first_significant_code(
7946     code + PRIV(OP_lengths)[*code], FALSE);
7947   register int op = *scode;
7948
7949   /* Non-capturing brackets */
7950
7951   if (op == OP_BRA  || op == OP_BRAPOS ||
7952       op == OP_SBRA || op == OP_SBRAPOS)
7953     {
7954     if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
7955     }
7956
7957   /* Capturing brackets */
7958
7959   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7960            op == OP_SCBRA || op == OP_SCBRAPOS)
7961     {
7962     int n = GET2(scode, 1+LINK_SIZE);
7963     int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7964     if (!is_anchored(scode, new_map, cb, atomcount)) return FALSE;
7965     }
7966
7967   /* Positive forward assertions and conditions */
7968
7969   else if (op == OP_ASSERT || op == OP_COND)
7970     {
7971     if (!is_anchored(scode, bracket_map, cb, atomcount)) return FALSE;
7972     }
7973
7974   /* Atomic groups */
7975
7976   else if (op == OP_ONCE || op == OP_ONCE_NC)
7977     {
7978     if (!is_anchored(scode, bracket_map, cb, atomcount + 1))
7979       return FALSE;
7980     }
7981
7982   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7983   it isn't in brackets that are or may be referenced or inside an atomic
7984   group. There is also an option that disables auto-anchoring. */
7985
7986   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7987             op == OP_TYPEPOSSTAR))
7988     {
7989     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
7990         atomcount > 0 || cb->had_pruneorskip ||
7991         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
7992       return FALSE;
7993     }
7994
7995   /* Check for explicit anchoring */
7996
7997   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7998
7999   code += GET(code, 1);
8000   }
8001while (*code == OP_ALT);   /* Loop for each alternative */
8002return TRUE;
8003}
8004
8005
8006
8007/*************************************************
8008*         Check for starting with ^ or .*        *
8009*************************************************/
8010
8011/* This is called to find out if every branch starts with ^ or .* so that
8012"first char" processing can be done to speed things up in multiline
8013matching and for non-DOTALL patterns that start with .* (which must start at
8014the beginning or after \n). As in the case of is_anchored() (see above), we
8015have to take account of back references to capturing brackets that contain .*
8016because in that case we can't make the assumption. Also, the appearance of .*
8017inside atomic brackets or in a pattern that contains *PRUNE or *SKIP does not
8018count, because once again the assumption no longer holds.
8019
8020Arguments:
8021  code           points to start of the compiled pattern or a group
8022  bracket_map    a bitmap of which brackets we are inside while testing; this
8023                   handles up to substring 31; after that we just have to take
8024                   the less precise approach
8025  cb             points to the compile data
8026  atomcount      atomic group level
8027
8028Returns:         TRUE or FALSE
8029*/
8030
8031static BOOL
8032is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8033  int atomcount)
8034{
8035do {
8036   PCRE2_SPTR scode = first_significant_code(
8037     code + PRIV(OP_lengths)[*code], FALSE);
8038   register int op = *scode;
8039
8040   /* If we are at the start of a conditional assertion group, *both* the
8041   conditional assertion *and* what follows the condition must satisfy the test
8042   for start of line. Other kinds of condition fail. Note that there may be an
8043   auto-callout at the start of a condition. */
8044
8045   if (op == OP_COND)
8046     {
8047     scode += 1 + LINK_SIZE;
8048
8049     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8050       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8051
8052     switch (*scode)
8053       {
8054       case OP_CREF:
8055       case OP_DNCREF:
8056       case OP_RREF:
8057       case OP_DNRREF:
8058       case OP_FAIL:
8059       case OP_FALSE:
8060       case OP_TRUE:
8061       return FALSE;
8062
8063       default:     /* Assertion */
8064       if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8065       do scode += GET(scode, 1); while (*scode == OP_ALT);
8066       scode += 1 + LINK_SIZE;
8067       break;
8068       }
8069     scode = first_significant_code(scode, FALSE);
8070     op = *scode;
8071     }
8072
8073   /* Non-capturing brackets */
8074
8075   if (op == OP_BRA  || op == OP_BRAPOS ||
8076       op == OP_SBRA || op == OP_SBRAPOS)
8077     {
8078     if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8079     }
8080
8081   /* Capturing brackets */
8082
8083   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8084            op == OP_SCBRA || op == OP_SCBRAPOS)
8085     {
8086     int n = GET2(scode, 1+LINK_SIZE);
8087     int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8088     if (!is_startline(scode, new_map, cb, atomcount)) return FALSE;
8089     }
8090
8091   /* Positive forward assertions */
8092
8093   else if (op == OP_ASSERT)
8094     {
8095     if (!is_startline(scode, bracket_map, cb, atomcount)) return FALSE;
8096     }
8097
8098   /* Atomic brackets */
8099
8100   else if (op == OP_ONCE || op == OP_ONCE_NC)
8101     {
8102     if (!is_startline(scode, bracket_map, cb, atomcount + 1)) return FALSE;
8103     }
8104
8105   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8106   brackets that may be referenced, as long as the pattern does not contain
8107   *PRUNE or *SKIP, because these break the feature. Consider, for example,
8108   /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab", i.e. not at the
8109   start of a line. There is also an option that disables this optimization. */
8110
8111   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8112     {
8113     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8114         atomcount > 0 || cb->had_pruneorskip ||
8115         (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
8116       return FALSE;
8117     }
8118
8119   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8120   in particular that this includes atomic brackets OP_ONCE and OP_ONCE_NC
8121   because the number of characters matched by .* cannot be adjusted inside
8122   them. */
8123
8124   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8125
8126   /* Move on to the next alternative */
8127
8128   code += GET(code, 1);
8129   }
8130while (*code == OP_ALT);  /* Loop for each alternative */
8131return TRUE;
8132}
8133
8134
8135
8136/*************************************************
8137*    Check for asserted fixed first code unit    *
8138*************************************************/
8139
8140/* During compilation, the "first code unit" settings from forward assertions
8141are discarded, because they can cause conflicts with actual literals that
8142follow. However, if we end up without a first code unit setting for an
8143unanchored pattern, it is worth scanning the regex to see if there is an
8144initial asserted first code unit. If all branches start with the same asserted
8145code unit, or with a non-conditional bracket all of whose alternatives start
8146with the same asserted code unit (recurse ad lib), then we return that code
8147unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
8148REQ_NONE in the flags.
8149
8150Arguments:
8151  code       points to start of compiled pattern
8152  flags      points to the first code unit flags
8153  inassert   TRUE if in an assertion
8154
8155Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
8156*/
8157
8158static uint32_t
8159find_firstassertedcu(PCRE2_SPTR code, int32_t *flags, BOOL inassert)
8160{
8161register uint32_t c = 0;
8162int cflags = REQ_NONE;
8163
8164*flags = REQ_NONE;
8165do {
8166   uint32_t d;
8167   int dflags;
8168   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
8169             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
8170   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
8171   register PCRE2_UCHAR op = *scode;
8172
8173   switch(op)
8174     {
8175     default:
8176     return 0;
8177
8178     case OP_BRA:
8179     case OP_BRAPOS:
8180     case OP_CBRA:
8181     case OP_SCBRA:
8182     case OP_CBRAPOS:
8183     case OP_SCBRAPOS:
8184     case OP_ASSERT:
8185     case OP_ONCE:
8186     case OP_ONCE_NC:
8187     d = find_firstassertedcu(scode, &dflags, op == OP_ASSERT);
8188     if (dflags < 0)
8189       return 0;
8190     if (cflags < 0) { c = d; cflags = dflags; }
8191       else if (c != d || cflags != dflags) return 0;
8192     break;
8193
8194     case OP_EXACT:
8195     scode += IMM2_SIZE;
8196     /* Fall through */
8197
8198     case OP_CHAR:
8199     case OP_PLUS:
8200     case OP_MINPLUS:
8201     case OP_POSPLUS:
8202     if (!inassert) return 0;
8203     if (cflags < 0) { c = scode[1]; cflags = 0; }
8204       else if (c != scode[1]) return 0;
8205     break;
8206
8207     case OP_EXACTI:
8208     scode += IMM2_SIZE;
8209     /* Fall through */
8210
8211     case OP_CHARI:
8212     case OP_PLUSI:
8213     case OP_MINPLUSI:
8214     case OP_POSPLUSI:
8215     if (!inassert) return 0;
8216     if (cflags < 0) { c = scode[1]; cflags = REQ_CASELESS; }
8217       else if (c != scode[1]) return 0;
8218     break;
8219     }
8220
8221   code += GET(code, 1);
8222   }
8223while (*code == OP_ALT);
8224
8225*flags = cflags;
8226return c;
8227}
8228
8229
8230
8231/*************************************************
8232*     Add an entry to the name/number table      *
8233*************************************************/
8234
8235/* This function is called between compiling passes to add an entry to the
8236name/number table, maintaining alphabetical order. Checking for permitted
8237and forbidden duplicates has already been done.
8238
8239Arguments:
8240  cb           the compile data block
8241  name         the name to add
8242  length       the length of the name
8243  groupno      the group number
8244
8245Returns:       nothing
8246*/
8247
8248static void
8249add_name_to_table(compile_block *cb, PCRE2_SPTR name, int length,
8250  unsigned int groupno)
8251{
8252int i;
8253PCRE2_UCHAR *slot = cb->name_table;
8254
8255for (i = 0; i < cb->names_found; i++)
8256  {
8257  int crc = memcmp(name, slot+IMM2_SIZE, CU2BYTES(length));
8258  if (crc == 0 && slot[IMM2_SIZE+length] != 0)
8259    crc = -1; /* Current name is a substring */
8260
8261  /* Make space in the table and break the loop for an earlier name. For a
8262  duplicate or later name, carry on. We do this for duplicates so that in the
8263  simple case (when ?(| is not used) they are in order of their numbers. In all
8264  cases they are in the order in which they appear in the pattern. */
8265
8266  if (crc < 0)
8267    {
8268    memmove(slot + cb->name_entry_size, slot,
8269      CU2BYTES((cb->names_found - i) * cb->name_entry_size));
8270    break;
8271    }
8272
8273  /* Continue the loop for a later or duplicate name */
8274
8275  slot += cb->name_entry_size;
8276  }
8277
8278PUT2(slot, 0, groupno);
8279memcpy(slot + IMM2_SIZE, name, CU2BYTES(length));
8280cb->names_found++;
8281
8282/* Add a terminating zero and fill the rest of the slot with zeroes so that
8283the memory is all initialized. Otherwise valgrind moans about uninitialized
8284memory when saving serialized compiled patterns. */
8285
8286memset(slot + IMM2_SIZE + length, 0,
8287  CU2BYTES(cb->name_entry_size - length - IMM2_SIZE));
8288}
8289
8290
8291
8292/*************************************************
8293*     External function to compile a pattern     *
8294*************************************************/
8295
8296/* This function reads a regular expression in the form of a string and returns
8297a pointer to a block of store holding a compiled version of the expression.
8298
8299Arguments:
8300  pattern       the regular expression
8301  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
8302  options       option bits
8303  errorptr      pointer to errorcode
8304  erroroffset   pointer to error offset
8305  ccontext      points to a compile context or is NULL
8306
8307Returns:        pointer to compiled data block, or NULL on error,
8308                with errorcode and erroroffset set
8309*/
8310
8311PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
8312pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
8313   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
8314{
8315BOOL utf;                               /* Set TRUE for UTF mode */
8316pcre2_real_code *re = NULL;             /* What we will return */
8317compile_block cb;                       /* "Static" compile-time data */
8318const uint8_t *tables;                  /* Char tables base pointer */
8319
8320PCRE2_UCHAR *code;                      /* Current pointer in compiled code */
8321PCRE2_SPTR codestart;                   /* Start of compiled code */
8322PCRE2_SPTR ptr;                         /* Current pointer in pattern */
8323
8324size_t length = 1;                      /* Allow or final END opcode */
8325size_t usedlength;                      /* Actual length used */
8326size_t re_blocksize;                    /* Size of memory block */
8327
8328int32_t firstcuflags, reqcuflags;       /* Type of first/req code unit */
8329uint32_t firstcu, reqcu;                /* Value of first/req code unit */
8330uint32_t setflags = 0;                  /* NL and BSR set flags */
8331
8332uint32_t skipatstart;                   /* When checking (*UTF) etc */
8333uint32_t limit_match = UINT32_MAX;      /* Unset match limits */
8334uint32_t limit_recursion = UINT32_MAX;
8335
8336int newline = 0;                        /* Unset; can be set by the pattern */
8337int bsr = 0;                            /* Unset; can be set by the pattern */
8338int errorcode = 0;                      /* Initialize to avoid compiler warn */
8339
8340/* Comments at the head of this file explain about these variables. */
8341
8342PCRE2_UCHAR *copied_pattern = NULL;
8343PCRE2_UCHAR stack_copied_pattern[COPIED_PATTERN_SIZE];
8344named_group named_groups[NAMED_GROUP_LIST_SIZE];
8345
8346/* The workspace is used in different ways in the different compiling phases.
8347It needs to be 16-bit aligned for the preliminary group scan, and 32-bit
8348aligned for the group information cache. */
8349
8350uint32_t c32workspace[C32_WORK_SIZE];
8351PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c32workspace;
8352
8353
8354/* -------------- Check arguments and set up the pattern ----------------- */
8355
8356/* There must be error code and offset pointers. */
8357
8358if (errorptr == NULL || erroroffset == NULL) return NULL;
8359*errorptr = ERR0;
8360*erroroffset = 0;
8361
8362/* There must be a pattern! */
8363
8364if (pattern == NULL)
8365  {
8366  *errorptr = ERR16;
8367  return NULL;
8368  }
8369
8370/* Check that all undefined public option bits are zero. */
8371
8372if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
8373  {
8374  *errorptr = ERR17;
8375  return NULL;
8376  }
8377
8378/* A NULL compile context means "use a default context" */
8379
8380if (ccontext == NULL)
8381  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
8382
8383/* A zero-terminated pattern is indicated by the special length value
8384PCRE2_ZERO_TERMINATED. Otherwise, we make a copy of the pattern and add a zero,
8385to ensure that it is always possible to look one code unit beyond the end of
8386the pattern's characters. In both cases, check that the pattern is overlong. */
8387
8388if (patlen == PCRE2_ZERO_TERMINATED)
8389  {
8390  patlen = PRIV(strlen)(pattern);
8391  if (patlen > ccontext->max_pattern_length)
8392    {
8393    *errorptr = ERR88;
8394    return NULL;
8395    }
8396  }
8397else
8398  {
8399  if (patlen > ccontext->max_pattern_length)
8400    {
8401    *errorptr = ERR88;
8402    return NULL;
8403    }
8404  if (patlen < COPIED_PATTERN_SIZE)
8405    copied_pattern = stack_copied_pattern;
8406  else
8407    {
8408    copied_pattern = ccontext->memctl.malloc(CU2BYTES(patlen + 1),
8409      ccontext->memctl.memory_data);
8410    if (copied_pattern == NULL)
8411      {
8412      *errorptr = ERR21;
8413      return NULL;
8414      }
8415    }
8416  memcpy(copied_pattern, pattern, CU2BYTES(patlen));
8417  copied_pattern[patlen] = 0;
8418  pattern = copied_pattern;
8419  }
8420
8421/* ------------ Initialize the "static" compile data -------------- */
8422
8423tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
8424
8425cb.lcc = tables + lcc_offset;          /* Individual */
8426cb.fcc = tables + fcc_offset;          /*   character */
8427cb.cbits = tables + cbits_offset;      /*      tables */
8428cb.ctypes = tables + ctypes_offset;
8429
8430cb.assert_depth = 0;
8431cb.bracount = cb.final_bracount = 0;
8432cb.cx = ccontext;
8433cb.dupnames = FALSE;
8434cb.end_pattern = pattern + patlen;
8435cb.nestptr[0] = cb.nestptr[1] = NULL;
8436cb.external_flags = 0;
8437cb.external_options = options;
8438cb.groupinfo = c32workspace;
8439cb.had_recurse = FALSE;
8440cb.iscondassert = FALSE;
8441cb.max_lookbehind = 0;
8442cb.name_entry_size = 0;
8443cb.name_table = NULL;
8444cb.named_groups = named_groups;
8445cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
8446cb.names_found = 0;
8447cb.open_caps = NULL;
8448cb.parens_depth = 0;
8449cb.req_varyopt = 0;
8450cb.start_code = cworkspace;
8451cb.start_pattern = pattern;
8452cb.start_workspace = cworkspace;
8453cb.workspace_size = COMPILE_WORK_SIZE;
8454
8455/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
8456references to help in deciding whether (.*) can be treated as anchored or not.
8457*/
8458
8459cb.top_backref = 0;
8460cb.backref_map = 0;
8461
8462/* --------------- Start looking at the pattern --------------- */
8463
8464/* Check for global one-time option settings at the start of the pattern, and
8465remember the offset to the actual regex. */
8466
8467ptr = pattern;
8468skipatstart = 0;
8469
8470while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
8471       ptr[skipatstart+1] == CHAR_ASTERISK)
8472  {
8473  unsigned int i;
8474  for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
8475    {
8476    pso *p = pso_list + i;
8477
8478    if (PRIV(strncmp_c8)(ptr+skipatstart+2, (char *)(p->name), p->length) == 0)
8479      {
8480      uint32_t c, pp;
8481
8482      skipatstart += p->length + 2;
8483      switch(p->type)
8484        {
8485        case PSO_OPT:
8486        cb.external_options |= p->value;
8487        break;
8488
8489        case PSO_FLG:
8490        setflags |= p->value;
8491        break;
8492
8493        case PSO_NL:
8494        newline = p->value;
8495        setflags |= PCRE2_NL_SET;
8496        break;
8497
8498        case PSO_BSR:
8499        bsr = p->value;
8500        setflags |= PCRE2_BSR_SET;
8501        break;
8502
8503        case PSO_LIMM:
8504        case PSO_LIMR:
8505        c = 0;
8506        pp = skipatstart;
8507        if (!IS_DIGIT(ptr[pp]))
8508          {
8509          errorcode = ERR60;
8510          ptr += pp;
8511          goto HAD_ERROR;
8512          }
8513        while (IS_DIGIT(ptr[pp]))
8514          {
8515          if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
8516          c = c*10 + (ptr[pp++] - CHAR_0);
8517          }
8518        if (ptr[pp++] != CHAR_RIGHT_PARENTHESIS)
8519          {
8520          errorcode = ERR60;
8521          ptr += pp;
8522          goto HAD_ERROR;
8523          }
8524        if (p->type == PSO_LIMM) limit_match = c;
8525          else limit_recursion = c;
8526        skipatstart += pp - skipatstart;
8527        break;
8528        }
8529      break;   /* Out of the table scan loop */
8530      }
8531    }
8532  if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
8533  }
8534
8535/* End of pattern-start options; advance to start of real regex. */
8536
8537ptr += skipatstart;
8538
8539/* Can't support UTF or UCP unless PCRE2 has been compiled with UTF support. */
8540
8541#ifndef SUPPORT_UNICODE
8542if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
8543  {
8544  errorcode = ERR32;
8545  goto HAD_ERROR;
8546  }
8547#endif
8548
8549/* Check UTF. We have the original options in 'options', with that value as
8550modified by (*UTF) etc in cb->external_options. */
8551
8552utf = (cb.external_options & PCRE2_UTF) != 0;
8553if (utf)
8554  {
8555  if ((options & PCRE2_NEVER_UTF) != 0)
8556    {
8557    errorcode = ERR74;
8558    goto HAD_ERROR;
8559    }
8560  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
8561       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
8562    goto HAD_UTF_ERROR;
8563  }
8564
8565/* Check UCP lockout. */
8566
8567if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) ==
8568    (PCRE2_UCP|PCRE2_NEVER_UCP))
8569  {
8570  errorcode = ERR75;
8571  goto HAD_ERROR;
8572  }
8573
8574/* Process the BSR setting. */
8575
8576if (bsr == 0) bsr = ccontext->bsr_convention;
8577
8578/* Process the newline setting. */
8579
8580if (newline == 0) newline = ccontext->newline_convention;
8581cb.nltype = NLTYPE_FIXED;
8582switch(newline)
8583  {
8584  case PCRE2_NEWLINE_CR:
8585  cb.nllen = 1;
8586  cb.nl[0] = CHAR_CR;
8587  break;
8588
8589  case PCRE2_NEWLINE_LF:
8590  cb.nllen = 1;
8591  cb.nl[0] = CHAR_NL;
8592  break;
8593
8594  case PCRE2_NEWLINE_CRLF:
8595  cb.nllen = 2;
8596  cb.nl[0] = CHAR_CR;
8597  cb.nl[1] = CHAR_NL;
8598  break;
8599
8600  case PCRE2_NEWLINE_ANY:
8601  cb.nltype = NLTYPE_ANY;
8602  break;
8603
8604  case PCRE2_NEWLINE_ANYCRLF:
8605  cb.nltype = NLTYPE_ANYCRLF;
8606  break;
8607
8608  default:
8609  errorcode = ERR56;
8610  goto HAD_ERROR;
8611  }
8612
8613/* Before we do anything else, do a pre-scan of the pattern in order to
8614discover the named groups and their numerical equivalents, so that this
8615information is always available for the remaining processing. */
8616
8617errorcode = scan_for_captures(&ptr, cb.external_options, &cb);
8618if (errorcode != 0) goto HAD_ERROR;
8619
8620/* For obscure debugging this code can be enabled. */
8621
8622#if 0
8623  {
8624  int i;
8625  named_group *ng = cb.named_groups;
8626  fprintf(stderr, "+++Captures: %d\n", cb.final_bracount);
8627  for (i = 0; i < cb.names_found; i++, ng++)
8628    {
8629    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
8630    }
8631  }
8632#endif
8633
8634/* Reset current bracket count to zero and current pointer to the start of the
8635pattern. */
8636
8637cb.bracount = 0;
8638ptr = pattern + skipatstart;
8639
8640/* Pretend to compile the pattern while actually just accumulating the amount
8641of memory required in the 'length' variable. This behaviour is triggered by
8642passing a non-NULL final argument to compile_regex(). We pass a block of
8643workspace (cworkspace) for it to compile parts of the pattern into; the
8644compiled code is discarded when it is no longer needed, so hopefully this
8645workspace will never overflow, though there is a test for its doing so.
8646
8647On error, errorcode will be set non-zero, so we don't need to look at the
8648result of the function. The initial options have been put into the cb block so
8649that they can be changed if an option setting is found within the regex right
8650at the beginning. Bringing initial option settings outside can help speed up
8651starting point checks. We still have to pass a separate options variable (the
8652first argument) because that may change as the pattern is processed. */
8653
8654code = cworkspace;
8655*code = OP_BRA;
8656
8657(void)compile_regex(cb.external_options, &code, &ptr, &errorcode, FALSE,
8658  FALSE, 0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
8659  &cb, &length);
8660
8661if (errorcode != 0) goto HAD_ERROR;
8662if (length > MAX_PATTERN_SIZE)
8663  {
8664  errorcode = ERR20;
8665  goto HAD_ERROR;
8666  }
8667
8668/* Compute the size of, and then get and initialize, the data block for storing
8669the compiled pattern and names table. Integer overflow should no longer be
8670possible because nowadays we limit the maximum value of cb.names_found and
8671cb.name_entry_size. */
8672
8673re_blocksize = sizeof(pcre2_real_code) +
8674  CU2BYTES(length + cb.names_found * cb.name_entry_size);
8675re = (pcre2_real_code *)
8676  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
8677if (re == NULL)
8678  {
8679  errorcode = ERR21;
8680  goto HAD_ERROR;
8681  }
8682
8683re->memctl = ccontext->memctl;
8684re->tables = tables;
8685re->executable_jit = NULL;
8686memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
8687re->blocksize = re_blocksize;
8688re->magic_number = MAGIC_NUMBER;
8689re->compile_options = options;
8690re->overall_options = cb.external_options;
8691re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
8692re->limit_match = limit_match;
8693re->limit_recursion = limit_recursion;
8694re->first_codeunit = 0;
8695re->last_codeunit = 0;
8696re->bsr_convention = bsr;
8697re->newline_convention = newline;
8698re->max_lookbehind = 0;
8699re->minlength = 0;
8700re->top_bracket = 0;
8701re->top_backref = 0;
8702re->name_entry_size = cb.name_entry_size;
8703re->name_count = cb.names_found;
8704
8705/* The basic block is immediately followed by the name table, and the compiled
8706code follows after that. */
8707
8708codestart = (PCRE2_SPTR)((uint8_t *)re + sizeof(pcre2_real_code)) +
8709  re->name_entry_size * re->name_count;
8710
8711/* Workspace is needed to remember information about numbered groups: whether a
8712group can match an empty string and what its fixed length is. This is done to
8713avoid the possibility of recursive references causing very long compile times
8714when checking these features. Unnumbered groups do not have this exposure since
8715they cannot be referenced. We use an indexed vector for this purpose. If there
8716are sufficiently few groups, it can be the c32workspace vector, as set up
8717above. Otherwise we have to get/free a special vector. The vector must be
8718initialized to zero. */
8719
8720if (cb.final_bracount >= C32_WORK_SIZE)
8721  {
8722  cb.groupinfo = ccontext->memctl.malloc(
8723    (cb.final_bracount + 1)*sizeof(uint32_t), ccontext->memctl.memory_data);
8724  if (cb.groupinfo == NULL)
8725    {
8726    errorcode = ERR21;
8727    goto HAD_ERROR;
8728    }
8729  }
8730memset(cb.groupinfo, 0, (cb.final_bracount + 1) * sizeof(uint32_t));
8731
8732/* Update the compile data block for the actual compile. The starting points of
8733the name/number translation table and of the code are passed around in the
8734compile data block. The start/end pattern and initial options are already set
8735from the pre-compile phase, as is the name_entry_size field. Reset the bracket
8736count and the names_found field. */
8737
8738cb.parens_depth = 0;
8739cb.assert_depth = 0;
8740cb.bracount = 0;
8741cb.max_lookbehind = 0;
8742cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
8743cb.start_code = codestart;
8744cb.iscondassert = FALSE;
8745cb.req_varyopt = 0;
8746cb.had_accept = FALSE;
8747cb.had_pruneorskip = FALSE;
8748cb.check_lookbehind = FALSE;
8749cb.open_caps = NULL;
8750
8751/* If any named groups were found, create the name/number table from the list
8752created in the pre-pass. */
8753
8754if (cb.names_found > 0)
8755  {
8756  int i = cb.names_found;
8757  named_group *ng = cb.named_groups;
8758  cb.names_found = 0;
8759  for (; i > 0; i--, ng++)
8760    add_name_to_table(&cb, ng->name, ng->length, ng->number);
8761  }
8762
8763/* Set up a starting, non-extracting bracket, then compile the expression. On
8764error, errorcode will be set non-zero, so we don't need to look at the result
8765of the function here. */
8766
8767ptr = pattern + skipatstart;
8768code = (PCRE2_UCHAR *)codestart;
8769*code = OP_BRA;
8770(void)compile_regex(re->overall_options, &code, &ptr, &errorcode, FALSE, FALSE,
8771   0, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, &cb, NULL);
8772
8773re->top_bracket = cb.bracount;
8774re->top_backref = cb.top_backref;
8775re->max_lookbehind = cb.max_lookbehind;
8776
8777if (cb.had_accept)
8778  {
8779  reqcu = 0;              /* Must disable after (*ACCEPT) */
8780  reqcuflags = REQ_NONE;
8781  }
8782
8783/* Fill in the final opcode and check for disastrous overflow. If no overflow,
8784but the estimated length exceeds the really used length, adjust the value of
8785re->blocksize, and if valgrind support is configured, mark the extra allocated
8786memory as unaddressable, so that any out-of-bound reads can be detected. */
8787
8788*code++ = OP_END;
8789usedlength = code - codestart;
8790if (usedlength > length) errorcode = ERR23; else
8791  {
8792  re->blocksize -= CU2BYTES(length - usedlength);
8793#ifdef SUPPORT_VALGRIND
8794  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
8795#endif
8796  }
8797
8798/* Scan the pattern for recursion/subroutine calls and convert the group
8799numbers into offsets. Maintain a small cache so that repeated groups containing
8800recursions are efficiently handled. */
8801
8802#define RSCAN_CACHE_SIZE 8
8803
8804if (errorcode == 0 && cb.had_recurse)
8805  {
8806  PCRE2_UCHAR *rcode;
8807  PCRE2_SPTR rgroup;
8808  int ccount = 0;
8809  int start = RSCAN_CACHE_SIZE;
8810  recurse_cache rc[RSCAN_CACHE_SIZE];
8811
8812  for (rcode = (PCRE2_UCHAR *)find_recurse(codestart, utf);
8813       rcode != NULL;
8814       rcode = (PCRE2_UCHAR *)find_recurse(rcode + 1 + LINK_SIZE, utf))
8815    {
8816    int i, p, recno;
8817
8818    recno = (int)GET(rcode, 1);
8819    if (recno == 0) rgroup = codestart; else
8820      {
8821      PCRE2_SPTR search_from = codestart;
8822      rgroup = NULL;
8823      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
8824        {
8825        if (recno == rc[p].recno)
8826          {
8827          rgroup = rc[p].group;
8828          break;
8829          }
8830
8831        /* Group n+1 must always start to the right of group n, so we can save
8832        search time below when the new group number is greater than any of the
8833        previously found groups. */
8834
8835        if (recno > rc[p].recno) search_from = rc[p].group;
8836        }
8837
8838      if (rgroup == NULL)
8839        {
8840        rgroup = PRIV(find_bracket)(search_from, utf, recno);
8841        if (rgroup == NULL)
8842          {
8843          errorcode = ERR53;
8844          break;
8845          }
8846        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
8847        rc[start].recno = recno;
8848        rc[start].group = rgroup;
8849        if (ccount < RSCAN_CACHE_SIZE) ccount++;
8850        }
8851      }
8852
8853    PUT(rcode, 1, rgroup - codestart);
8854    }
8855  }
8856
8857/* In rare debugging situations we sometimes need to look at the compiled code
8858at this stage. */
8859
8860#ifdef CALL_PRINTINT
8861pcre2_printint(re, stderr, TRUE);
8862fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
8863#endif
8864
8865/* After a successful compile, give an error if there's a back reference to a
8866non-existent capturing subpattern. Then, unless disabled, check whether any
8867single character iterators can be auto-possessified. The function overwrites
8868the appropriate opcode values, so the type of the pointer must be cast. NOTE:
8869the intermediate variable "temp" is used in this code because at least one
8870compiler gives a warning about loss of "const" attribute if the cast
8871(PCRE2_UCHAR *)codestart is used directly in the function call. */
8872
8873if (errorcode == 0)
8874  {
8875  if (re->top_backref > re->top_bracket) errorcode = ERR15;
8876  else if ((re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
8877    {
8878    PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
8879    if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80;
8880    }
8881  }
8882
8883/* If there were any lookbehind assertions that contained OP_RECURSE
8884(recursions or subroutine calls), a flag is set for them to be checked here,
8885because they may contain forward references. Actual recursions cannot be fixed
8886length, but subroutine calls can. It is done like this so that those without
8887OP_RECURSE that are not fixed length get a diagnostic with a useful offset. The
8888exceptional ones forgo this. We scan the pattern to check that they are fixed
8889length, and set their lengths. */
8890
8891if (errorcode == 0 && cb.check_lookbehind)
8892  {
8893  PCRE2_UCHAR *cc = (PCRE2_UCHAR *)codestart;
8894
8895  /* Loop, searching for OP_REVERSE items, and process those that do not have
8896  their length set. (Actually, it will also re-process any that have a length
8897  of zero, but that is a pathological case, and it does no harm.) When we find
8898  one, we temporarily terminate the branch it is in while we scan it. Note that
8899  calling find_bracket() with a negative group number returns a pointer to the
8900  OP_REVERSE item, not the actual lookbehind. */
8901
8902  for (cc = (PCRE2_UCHAR *)PRIV(find_bracket)(codestart, utf, -1);
8903       cc != NULL;
8904       cc = (PCRE2_UCHAR *)PRIV(find_bracket)(cc, utf, -1))
8905    {
8906    if (GET(cc, 1) == 0)
8907      {
8908      int fixed_length;
8909      int count = 0;
8910      PCRE2_UCHAR *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8911      int end_op = *be;
8912      *be = OP_END;
8913      fixed_length = find_fixedlength(cc, utf, TRUE, &cb, NULL, &count);
8914      *be = end_op;
8915      if (fixed_length < 0)
8916        {
8917        errorcode = fixed_length_errors[-fixed_length];
8918        break;
8919        }
8920      if (fixed_length > cb.max_lookbehind) cb.max_lookbehind = fixed_length;
8921      PUT(cc, 1, fixed_length);
8922      }
8923    cc += 1 + LINK_SIZE;
8924    }
8925
8926  /* The previous value of the maximum lookbehind was transferred to the
8927  compiled regex block above. We could have updated this value in the loop
8928  above, but keep the two values in step, just in case some later code below
8929  uses the cb value. */
8930
8931  re->max_lookbehind = cb.max_lookbehind;
8932  }
8933
8934/* Failed to compile, or error while post-processing. Earlier errors get here
8935via the dreaded goto. */
8936
8937if (errorcode != 0)
8938  {
8939  HAD_ERROR:
8940  *erroroffset = (int)(ptr - pattern);
8941  HAD_UTF_ERROR:
8942  *errorptr = errorcode;
8943  pcre2_code_free(re);
8944  re = NULL;
8945  goto EXIT;
8946  }
8947
8948/* Successful compile. If the anchored option was not passed, set it if
8949we can determine that the pattern is anchored by virtue of ^ characters or \A
8950or anything else, such as starting with non-atomic .* when DOTALL is set and
8951there are no occurrences of *PRUNE or *SKIP (though there is an option to
8952disable this case). */
8953
8954if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
8955     is_anchored(codestart, 0, &cb, 0))
8956  re->overall_options |= PCRE2_ANCHORED;
8957
8958/* If the pattern is still not anchored and we do not have a first code unit,
8959see if there is one that is asserted (these are not saved during the compile
8960because they can cause conflicts with actual literals that follow). This code
8961need not be obeyed if PCRE2_NO_START_OPTIMIZE is set, as the data it would
8962create will not be used. */
8963
8964if ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0)
8965  {
8966  if (firstcuflags < 0)
8967    firstcu = find_firstassertedcu(codestart, &firstcuflags, FALSE);
8968
8969  /* Save the data for a first code unit. */
8970
8971  if (firstcuflags >= 0)
8972    {
8973    re->first_codeunit = firstcu;
8974    re->flags |= PCRE2_FIRSTSET;
8975
8976    /* Handle caseless first code units. */
8977
8978    if ((firstcuflags & REQ_CASELESS) != 0)
8979      {
8980      if (firstcu < 128 || (!utf && firstcu < 255))
8981        {
8982        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
8983        }
8984
8985      /* The first code unit is >= 128 in UTF mode, or >= 255 otherwise. In
8986      8-bit UTF mode, codepoints in the range 128-255 are introductory code
8987      points and cannot have another case. In 16-bit and 32-bit modes, we can
8988      check wide characters when UTF (and therefore UCP) is supported. */
8989
8990#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
8991      else if (firstcu <= MAX_UTF_CODE_POINT &&
8992               UCD_OTHERCASE(firstcu) != firstcu)
8993        re->flags |= PCRE2_FIRSTCASELESS;
8994#endif
8995      }
8996    }
8997
8998  /* When there is no first code unit, see if we can set the PCRE2_STARTLINE
8999  flag. This is helpful for multiline matches when all branches start with ^
9000  and also when all branches start with non-atomic .* for non-DOTALL matches
9001  when *PRUNE and SKIP are not present. (There is an option that disables this
9002  case.) */
9003
9004  else if (is_startline(codestart, 0, &cb, 0)) re->flags |= PCRE2_STARTLINE;
9005  }
9006
9007/* Handle the "required code unit", if one is set. In the case of an anchored
9008pattern, do this only if it follows a variable length item in the pattern.
9009Again, skip this if PCRE2_NO_START_OPTIMIZE is set. */
9010
9011if (reqcuflags >= 0 &&
9012     ((re->overall_options & (PCRE2_ANCHORED|PCRE2_NO_START_OPTIMIZE)) == 0 ||
9013      (reqcuflags & REQ_VARY) != 0))
9014  {
9015  re->last_codeunit = reqcu;
9016  re->flags |= PCRE2_LASTSET;
9017
9018  /* Handle caseless required code units as for first code units (above). */
9019
9020  if ((reqcuflags & REQ_CASELESS) != 0)
9021    {
9022    if (reqcu < 128 || (!utf && reqcu < 255))
9023      {
9024      if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
9025      }
9026#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
9027    else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu)
9028      re->flags |= PCRE2_LASTCASELESS;
9029#endif
9030    }
9031  }
9032
9033/* Check for a pattern that can match an empty string, so that this information
9034can be provided to applications. */
9035
9036do
9037  {
9038  int count = 0;
9039  int rc = could_be_empty_branch(codestart, code, utf, &cb, TRUE, NULL, &count);
9040  if (rc < 0)
9041    {
9042    errorcode = ERR86;
9043    goto HAD_ERROR;
9044    }
9045  if (rc > 0)
9046    {
9047    re->flags |= PCRE2_MATCH_EMPTY;
9048    break;
9049    }
9050  codestart += GET(codestart, 1);
9051  }
9052while (*codestart == OP_ALT);
9053
9054/* Finally, unless PCRE2_NO_START_OPTIMIZE is set, study the compiled pattern
9055to set up information such as a bitmap of starting code units and a minimum
9056matching length. */
9057
9058if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
9059    PRIV(study)(re) != 0)
9060  {
9061  errorcode = ERR31;
9062  goto HAD_ERROR;
9063  }
9064
9065/* Control ends up here in all cases. If memory was obtained for a
9066zero-terminated copy of the pattern, remember to free it before returning. Also
9067free the list of named groups if a larger one had to be obtained, and likewise
9068the group information vector. */
9069
9070EXIT:
9071if (copied_pattern != stack_copied_pattern)
9072  ccontext->memctl.free(copied_pattern, ccontext->memctl.memory_data);
9073if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
9074  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
9075if (cb.groupinfo != c32workspace)
9076  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
9077
9078return re;    /* Will be NULL after an error */
9079}
9080
9081/* End of pcre2_compile.c */
9082