1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language (but see
7below for why this module is different).
8
9                       Written by Philip Hazel
10           Copyright (c) 1997-2014 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains the external function pcre_dfa_exec(), which is an
42alternative matching function that uses a sort of DFA algorithm (not a true
43FSM). This is NOT Perl-compatible, but it has advantages in certain
44applications. */
45
46
47/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48the performance of his patterns greatly. I could not use it as it stood, as it
49was not thread safe, and made assumptions about pattern sizes. Also, it caused
50test 7 to loop, and test 9 to crash with a segfault.
51
52The issue is the check for duplicate states, which is done by a simple linear
53search up the state list. (Grep for "duplicate" below to find the code.) For
54many patterns, there will never be many states active at one time, so a simple
55linear search is fine. In patterns that have many active states, it might be a
56bottleneck. The suggested code used an indexing scheme to remember which states
57had previously been used for each character, and avoided the linear search when
58it knew there was no chance of a duplicate. This was implemented when adding
59states to the state lists.
60
61I wrote some thread-safe, not-limited code to try something similar at the time
62of checking for duplicates (instead of when adding states), using index vectors
63on the stack. It did give a 13% improvement with one specially constructed
64pattern for certain subject strings, but on other strings and on many of the
65simpler patterns in the test suite it did worse. The major problem, I think,
66was the extra time to initialize the index. This had to be done for each call
67of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68only once - I suspect this was the cause of the problems with the tests.)
69
70Overall, I concluded that the gains in some cases did not outweigh the losses
71in others, so I abandoned this code. */
72
73
74
75#ifdef HAVE_CONFIG_H
76#include "config.h"
77#endif
78
/* Aliases that map generic names onto identifiers used in this module. The
expansions name things that exist inside internal_dfa_exec(): its "md"
parameter and its "start_subject" / "end_subject" local variables.
NOTE(review): the consumers are presumably macros supplied by pcre_internal.h,
which is included just below - confirm there. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
82
83#include "pcre_internal.h"
84
85
/* A run of spaces used to indent debugging output. The DPRINTF calls in this
file print it with a "%.*s" conversion whose precision is rlevel*2-2, so only
that many spaces are written, giving deeper indentation at deeper recursion
levels. */

#define SP "                   "
89
90
91/*************************************************
92*      Code parameters and static tables         *
93*************************************************/
94
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes.

In internal_dfa_exec(), when the opcode is at or beyond OP_TYPESTAR and has an
inline operand, the operand decides which of these offsets (if any) is added
to the local opcode value before the main switch is entered. */

#define OP_PROP_EXTRA       300   /* Operand is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA     320   /* Operand is OP_EXTUNI */
#define OP_ANYNL_EXTRA      340   /* Operand is OP_ANYNL */
#define OP_HSPACE_EXTRA     360   /* Operand is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA     380   /* Operand is OP_VSPACE or OP_NOT_VSPACE */
105
106
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
the character is to be found; a zero entry means that nothing is loaded for
that opcode.

The table is indexed by opcode value, one entry per opcode. Its size is checked
against OP_TABLE_LENGTH at compile time by a pair of dummy case labels at the
top of the main switch in internal_dfa_exec(), so adding or removing an entry
without a matching opcode change will not compile.

***NOTE*** If the start of this table is modified, the three tables that
follow must also be modified. */

static const pcre_uint8 coptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
  0, 0,                          /* \P, \p                                 */
  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
  0,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
  1+IMM2_SIZE,                   /* exact                                  */
  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
  1+IMM2_SIZE,                   /* exact I                                */
  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
  1+IMM2_SIZE,                   /* NOT exact                              */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
  1+IMM2_SIZE,                   /* NOT exact I                            */
  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
  1+IMM2_SIZE,                   /* Type exact                             */
  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
  0, 0,                          /* CRRANGE, CRMINRANGE                    */
  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  0,                             /* CLASS                                  */
  0,                             /* NCLASS                                 */
  0,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0, 0,                          /* ONCE, ONCE_NC                          */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0,                             /* DEF                                    */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  0, 0                           /* CLOSE, SKIPZERO                        */
};
186
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached: in internal_dfa_exec(), when there is no current
character (clen == 0) and the entry for the state's opcode is non-zero, the
could_continue flag is set for use when testing for a partial match.

Like coptable, this table is indexed by opcode value, and its size is checked
against OP_TABLE_LENGTH at compile time by the dummy case labels at the top of
the main switch in internal_dfa_exec().

***NOTE*** If the start of this table is modified, the two tables that follow
must also be modified. */

static const pcre_uint8 poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0, 0,                          /* ONCE, ONCE_NC                          */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0,                             /* DEF                                    */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  0, 0                           /* CLOSE, SKIPZERO                        */
};
258
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w. Both are indexed by opcode value and cover only the first fourteen
opcodes, in the same order as the start of coptable and poptable.

This first table holds, for each of the six type opcodes, the ctype bit that
is relevant to it; the negated and the positive form of a type share the same
bit. NOTE(review): the expression that consumes these tables is not in this
part of the file; presumably this value is used as a mask on a character's
ctypes entry - confirm at the point of use. */

static const pcre_uint8 toptable1[] = {
  0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
  ctype_digit, ctype_digit,       /* \D, \d */
  ctype_space, ctype_space,       /* \S, \s */
  ctype_word,  ctype_word,        /* \W, \w */
  0, 0                            /* OP_ANY, OP_ALLANY */
};
269
/* Companion to toptable1, indexed by the same opcode values. For the three
negated types (\D, \S, \W) the entry repeats the ctype bit held in toptable1;
for the positive types (\d, \s, \w) it is zero; for OP_ANY and OP_ALLANY it is
1. NOTE(review): presumably this value is combined (XORed) with the masked
ctypes bits to invert the sense of the test - confirm at the point of use. */

static const pcre_uint8 toptable2[] = {
  0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
  ctype_digit, 0,                 /* \D, \d */
  ctype_space, 0,                 /* \S, \s */
  ctype_word,  0,                 /* \W, \w */
  1, 1                            /* OP_ANY, OP_ALLANY */
};
277
278
/* One entry in a state list: the data describing a single active path through
the match tree. Every member has to be an int, because these blocks are laid
out directly inside the caller-supplied workspace, which is a vector of ints. */

struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
};

typedef struct stateblock stateblock;

/* The number of workspace ints that one stateblock occupies. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock) / sizeof(int)))
291
292
#ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each code unit that is
printable is written as itself; every other code unit is written as a \x{..}
hexadecimal escape.

Arguments:
  p            points to string
  length       number of code units (pcre_uchar items) to print
  f            where to print

Returns:       nothing
*/

static void
pchars(const pcre_uchar *p, int length, FILE *f)
{
pcre_uint32 c;
while (length-- > 0)
  {
  c = *(p++);

  /* isprint() is defined only for values that are representable as an
  unsigned char (or EOF). A code unit can be wider than that when pcre_uchar
  is a 16-bit or 32-bit type, so check the range before calling it. */

  if (c < 256 && isprint((int)c))
    fprintf(f, "%c", (int)c);
  else
    fprintf(f, "\\x{%02x}", (unsigned int)c);
  }
}
#endif
321
322
323
324/*************************************************
325*    Execute a Regular Expression - DFA engine   *
326*************************************************/
327
328/* This internal function applies a compiled pattern to a subject string,
329starting at a given point, using a DFA engine. This function is called from the
330external one, possibly multiple times if the pattern is not anchored. The
331function calls itself recursively for some kinds of subpattern.
332
333Arguments:
334  md                the match_data block with fixed information
335  this_start_code   the opening bracket of this subexpression's code
336  current_subject   where we currently are in the subject string
337  start_offset      start offset in the subject string
338  offsets           vector to contain the matching string offsets
339  offsetcount       size of same
340  workspace         vector of workspace
341  wscount           size of same
342  rlevel            function call recursion level
343
344Returns:            > 0 => number of match offset pairs placed in offsets
345                    = 0 => offsets overflowed; longest matches are present
346                     -1 => failed to match
347                   < -1 => some kind of unexpected problem
348
349The following macros are used for adding states to the two state vectors (one
350for the current character, one for the following character). */
351
/* All four macros expand to an if/else statement that refers to local
variables of internal_dfa_exec() (the counts, the next-free pointers, wscount
and rlevel), so they can be used only inside that function. When the list is
already full, the else branch returns PCRE_ERROR_DFA_WSSIZE from the enclosing
function. Note that the count is incremented even on the failing path, and
that x, y and z are each evaluated more than once when debugging output is
enabled. */

/* ADD_ACTIVE: append a state with opcode offset x and repeat count y to the
active list (the one for the current character). The data field of the new
entry is not written. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* ADD_ACTIVE_DATA: as ADD_ACTIVE, but also sets the data field to z. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* ADD_NEW: append a state with opcode offset x and repeat count y to the new
list (the one for the following character). The data field of the new entry is
not written. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* ADD_NEW_DATA: as ADD_NEW, but also sets the data field to z. The debugging
output additionally shows the source line of the call. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
394
395/* And now, here is the code */
396
397static int
398internal_dfa_exec(
399  dfa_match_data *md,
400  const pcre_uchar *this_start_code,
401  const pcre_uchar *current_subject,
402  int start_offset,
403  int *offsets,
404  int offsetcount,
405  int *workspace,
406  int wscount,
407  int  rlevel)
408{
409stateblock *active_states, *new_states, *temp_states;
410stateblock *next_active_state, *next_new_state;
411
412const pcre_uint8 *ctypes, *lcc, *fcc;
413const pcre_uchar *ptr;
414const pcre_uchar *end_code, *first_op;
415
416dfa_recursion_info new_recursive;
417
418int active_count, new_count, match_count;
419
420/* Some fields in the md block are frequently referenced, so we load them into
421independent variables in the hope that this will perform better. */
422
423const pcre_uchar *start_subject = md->start_subject;
424const pcre_uchar *end_subject = md->end_subject;
425const pcre_uchar *start_code = md->start_code;
426
427#ifdef SUPPORT_UTF
428BOOL utf = (md->poptions & PCRE_UTF8) != 0;
429#else
430BOOL utf = FALSE;
431#endif
432
433BOOL reset_could_continue = FALSE;
434
435rlevel++;
436offsetcount &= (-2);
437
438wscount -= 2;
439wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
440          (2 * INTS_PER_STATEBLOCK);
441
442DPRINTF(("\n%.*s---------------------\n"
443  "%.*sCall to internal_dfa_exec f=%d\n",
444  rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
445
446ctypes = md->tables + ctypes_offset;
447lcc = md->tables + lcc_offset;
448fcc = md->tables + fcc_offset;
449
450match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
451
452active_states = (stateblock *)(workspace + 2);
453next_new_state = new_states = active_states + wscount;
454new_count = 0;
455
456first_op = this_start_code + 1 + LINK_SIZE +
457  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
458    *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
459    ? IMM2_SIZE:0);
460
461/* The first thing in any (sub) pattern is a bracket of some sort. Push all
462the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
464matching internal ket rather than at the end.
465
466If the first opcode in the first alternative is OP_REVERSE, we are dealing with
467a backward assertion. In that case, we have to find out the maximum amount to
468move back, and set up each alternative appropriately. */
469
470if (*first_op == OP_REVERSE)
471  {
472  int max_back = 0;
473  int gone_back;
474
475  end_code = this_start_code;
476  do
477    {
478    int back = GET(end_code, 2+LINK_SIZE);
479    if (back > max_back) max_back = back;
480    end_code += GET(end_code, 1);
481    }
482  while (*end_code == OP_ALT);
483
484  /* If we can't go back the amount required for the longest lookbehind
485  pattern, go back as far as we can; some alternatives may still be viable. */
486
487#ifdef SUPPORT_UTF
488  /* In character mode we have to step back character by character */
489
490  if (utf)
491    {
492    for (gone_back = 0; gone_back < max_back; gone_back++)
493      {
494      if (current_subject <= start_subject) break;
495      current_subject--;
496      ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
497      }
498    }
499  else
500#endif
501
502  /* In byte-mode we can do this quickly. */
503
504    {
505    gone_back = (current_subject - max_back < start_subject)?
506      (int)(current_subject - start_subject) : max_back;
507    current_subject -= gone_back;
508    }
509
510  /* Save the earliest consulted character */
511
512  if (current_subject < md->start_used_ptr)
513    md->start_used_ptr = current_subject;
514
515  /* Now we can process the individual branches. */
516
517  end_code = this_start_code;
518  do
519    {
520    int back = GET(end_code, 2+LINK_SIZE);
521    if (back <= gone_back)
522      {
523      int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
524      ADD_NEW_DATA(-bstate, 0, gone_back - back);
525      }
526    end_code += GET(end_code, 1);
527    }
528  while (*end_code == OP_ALT);
529 }
530
531/* This is the code for a "normal" subpattern (not a backward assertion). The
532start of a whole pattern is always one of these. If we are at the top level,
533we may be asked to restart matching from the same point that we reached for a
534previous partial match. We still have to scan through the top-level branches to
535find the end state. */
536
537else
538  {
539  end_code = this_start_code;
540
541  /* Restarting */
542
543  if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
544    {
545    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
546    new_count = workspace[1];
547    if (!workspace[0])
548      memcpy(new_states, active_states, new_count * sizeof(stateblock));
549    }
550
551  /* Not restarting */
552
553  else
554    {
555    int length = 1 + LINK_SIZE +
556      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
557        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
558        ? IMM2_SIZE:0);
559    do
560      {
561      ADD_NEW((int)(end_code - start_code + length), 0);
562      end_code += GET(end_code, 1);
563      length = 1 + LINK_SIZE;
564      }
565    while (*end_code == OP_ALT);
566    }
567  }
568
569workspace[0] = 0;    /* Bit indicating which vector is current */
570
571DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
572
573/* Loop for scanning the subject */
574
575ptr = current_subject;
576for (;;)
577  {
578  int i, j;
579  int clen, dlen;
580  pcre_uint32 c, d;
581  int forced_fail = 0;
582  BOOL partial_newline = FALSE;
583  BOOL could_continue = reset_could_continue;
584  reset_could_continue = FALSE;
585
586  /* Make the new state list into the active state list and empty the
587  new state list. */
588
589  temp_states = active_states;
590  active_states = new_states;
591  new_states = temp_states;
592  active_count = new_count;
593  new_count = 0;
594
595  workspace[0] ^= 1;              /* Remember for the restarting feature */
596  workspace[1] = active_count;
597
598#ifdef PCRE_DEBUG
599  printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
600  pchars(ptr, STRLEN_UC(ptr), stdout);
601  printf("\"\n");
602
603  printf("%.*sActive states: ", rlevel*2-2, SP);
604  for (i = 0; i < active_count; i++)
605    printf("%d/%d ", active_states[i].offset, active_states[i].count);
606  printf("\n");
607#endif
608
609  /* Set the pointers for adding new states */
610
611  next_active_state = active_states + active_count;
612  next_new_state = new_states;
613
614  /* Load the current character from the subject outside the loop, as many
615  different states may want to look at it, and we assume that at least one
616  will. */
617
618  if (ptr < end_subject)
619    {
620    clen = 1;        /* Number of data items in the character */
621#ifdef SUPPORT_UTF
622    GETCHARLENTEST(c, ptr, clen);
623#else
624    c = *ptr;
625#endif  /* SUPPORT_UTF */
626    }
627  else
628    {
629    clen = 0;        /* This indicates the end of the subject */
630    c = NOTACHAR;    /* This value should never actually be used */
631    }
632
633  /* Scan up the active states and act on each one. The result of an action
634  may be to add more states to the currently active list (e.g. on hitting a
635  parenthesis) or it may be to put states on the new list, for considering
636  when we move the character pointer on. */
637
638  for (i = 0; i < active_count; i++)
639    {
640    stateblock *current_state = active_states + i;
641    BOOL caseless = FALSE;
642    const pcre_uchar *code;
643    int state_offset = current_state->offset;
644    int codevalue, rrc;
645    int count;
646
647#ifdef PCRE_DEBUG
648    printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
649    if (clen == 0) printf("EOL\n");
650      else if (c > 32 && c < 127) printf("'%c'\n", c);
651        else printf("0x%02x\n", c);
652#endif
653
654    /* A negative offset is a special case meaning "hold off going to this
655    (negated) state until the number of characters in the data field have
656    been skipped". If the could_continue flag was passed over from a previous
    state, arrange for it to be passed on. */
658
659    if (state_offset < 0)
660      {
661      if (current_state->data > 0)
662        {
663        DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
664        ADD_NEW_DATA(state_offset, current_state->count,
665          current_state->data - 1);
666        if (could_continue) reset_could_continue = TRUE;
667        continue;
668        }
669      else
670        {
671        current_state->offset = state_offset = -state_offset;
672        }
673      }
674
675    /* Check for a duplicate state with the same count, and skip if found.
676    See the note at the head of this module about the possibility of improving
677    performance here. */
678
679    for (j = 0; j < i; j++)
680      {
681      if (active_states[j].offset == state_offset &&
682          active_states[j].count == current_state->count)
683        {
684        DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
685        goto NEXT_ACTIVE_STATE;
686        }
687      }
688
689    /* The state offset is the offset to the opcode */
690
691    code = start_code + state_offset;
692    codevalue = *code;
693
694    /* If this opcode inspects a character, but we are at the end of the
695    subject, remember the fact for use when testing for a partial match. */
696
697    if (clen == 0 && poptable[codevalue] != 0)
698      could_continue = TRUE;
699
700    /* If this opcode is followed by an inline character, load it. It is
701    tempting to test for the presence of a subject character here, but that
702    is wrong, because sometimes zero repetitions of the subject are
703    permitted.
704
705    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
706    argument that is not a data character - but is always one byte long because
707    the values are small. We have to take special action to deal with  \P, \p,
708    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
709    these ones to new opcodes. */
710
711    if (coptable[codevalue] > 0)
712      {
713      dlen = 1;
714#ifdef SUPPORT_UTF
715      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
716#endif  /* SUPPORT_UTF */
717      d = code[coptable[codevalue]];
718      if (codevalue >= OP_TYPESTAR)
719        {
720        switch(d)
721          {
722          case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
723          case OP_NOTPROP:
724          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
725          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
726          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
727          case OP_NOT_HSPACE:
728          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
729          case OP_NOT_VSPACE:
730          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
731          default: break;
732          }
733        }
734      }
735    else
736      {
737      dlen = 0;         /* Not strictly necessary, but compilers moan */
738      d = NOTACHAR;     /* if these variables are not set. */
739      }
740
741
742    /* Now process the individual opcodes */
743
744    switch (codevalue)
745      {
746/* ========================================================================== */
747      /* These cases are never obeyed. This is a fudge that causes a compile-
748      time error if the vectors coptable or poptable, which are indexed by
749      opcode, are not the correct length. It seems to be the only way to do
750      such a check at compile time, as the sizeof() operator does not work
751      in the C preprocessor. */
752
753      case OP_TABLE_LENGTH:
754      case OP_TABLE_LENGTH +
755        ((sizeof(coptable) == OP_TABLE_LENGTH) &&
756         (sizeof(poptable) == OP_TABLE_LENGTH)):
757      break;
758
759/* ========================================================================== */
760      /* Reached a closing bracket. If not at the end of the pattern, carry
761      on with the next opcode. For repeating opcodes, also add the repeat
762      state. Note that KETRPOS will always be encountered at the end of the
763      subpattern, because the possessive subpattern repeats are always handled
764      using recursive calls. Thus, it never adds any new states.
765
766      At the end of the (sub)pattern, unless we have an empty string and
767      PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
768      start of the subject, save the match data, shifting up all previous
769      matches so we always have the longest first. */
770
771      case OP_KET:
772      case OP_KETRMIN:
773      case OP_KETRMAX:
774      case OP_KETRPOS:
775      if (code != end_code)
776        {
777        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
778        if (codevalue != OP_KET)
779          {
780          ADD_ACTIVE(state_offset - GET(code, 1), 0);
781          }
782        }
783      else
784        {
785        if (ptr > current_subject ||
786            ((md->moptions & PCRE_NOTEMPTY) == 0 &&
787              ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
788                current_subject > start_subject + md->start_offset)))
789          {
790          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
791            else if (match_count > 0 && ++match_count * 2 > offsetcount)
792              match_count = 0;
793          count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
794          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
795          if (offsetcount >= 2)
796            {
797            offsets[0] = (int)(current_subject - start_subject);
798            offsets[1] = (int)(ptr - start_subject);
799            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
800              offsets[1] - offsets[0], (char *)current_subject));
801            }
802          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
803            {
804            DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
805              "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
806              match_count, rlevel*2-2, SP));
807            return match_count;
808            }
809          }
810        }
811      break;
812
813/* ========================================================================== */
814      /* These opcodes add to the current list of states without looking
815      at the current character. */
816
817      /*-----------------------------------------------------------------*/
818      case OP_ALT:
819      do { code += GET(code, 1); } while (*code == OP_ALT);
820      ADD_ACTIVE((int)(code - start_code), 0);
821      break;
822
823      /*-----------------------------------------------------------------*/
824      case OP_BRA:
825      case OP_SBRA:
826      do
827        {
828        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
829        code += GET(code, 1);
830        }
831      while (*code == OP_ALT);
832      break;
833
834      /*-----------------------------------------------------------------*/
835      case OP_CBRA:
836      case OP_SCBRA:
837      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
838      code += GET(code, 1);
839      while (*code == OP_ALT)
840        {
841        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
842        code += GET(code, 1);
843        }
844      break;
845
846      /*-----------------------------------------------------------------*/
847      case OP_BRAZERO:
848      case OP_BRAMINZERO:
849      ADD_ACTIVE(state_offset + 1, 0);
850      code += 1 + GET(code, 2);
851      while (*code == OP_ALT) code += GET(code, 1);
852      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
853      break;
854
855      /*-----------------------------------------------------------------*/
856      case OP_SKIPZERO:
857      code += 1 + GET(code, 2);
858      while (*code == OP_ALT) code += GET(code, 1);
859      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
860      break;
861
862      /*-----------------------------------------------------------------*/
863      case OP_CIRC:
864      if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
865        { ADD_ACTIVE(state_offset + 1, 0); }
866      break;
867
868      /*-----------------------------------------------------------------*/
869      case OP_CIRCM:
870      if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
871          (ptr != end_subject && WAS_NEWLINE(ptr)))
872        { ADD_ACTIVE(state_offset + 1, 0); }
873      break;
874
875      /*-----------------------------------------------------------------*/
876      case OP_EOD:
877      if (ptr >= end_subject)
878        {
879        if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
880          could_continue = TRUE;
881        else { ADD_ACTIVE(state_offset + 1, 0); }
882        }
883      break;
884
885      /*-----------------------------------------------------------------*/
886      case OP_SOD:
887      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
888      break;
889
890      /*-----------------------------------------------------------------*/
891      case OP_SOM:
892      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
893      break;
894
895
896/* ========================================================================== */
897      /* These opcodes inspect the next subject character, and sometimes
898      the previous one as well, but do not have an argument. The variable
899      clen contains the length of the current character and is zero if we are
900      at the end of the subject. */
901
902      /*-----------------------------------------------------------------*/
903      case OP_ANY:
904      if (clen > 0 && !IS_NEWLINE(ptr))
905        {
906        if (ptr + 1 >= md->end_subject &&
907            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
908            NLBLOCK->nltype == NLTYPE_FIXED &&
909            NLBLOCK->nllen == 2 &&
910            c == NLBLOCK->nl[0])
911          {
912          could_continue = partial_newline = TRUE;
913          }
914        else
915          {
916          ADD_NEW(state_offset + 1, 0);
917          }
918        }
919      break;
920
921      /*-----------------------------------------------------------------*/
922      case OP_ALLANY:
923      if (clen > 0)
924        { ADD_NEW(state_offset + 1, 0); }
925      break;
926
927      /*-----------------------------------------------------------------*/
928      case OP_EODN:
929      if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
930        could_continue = TRUE;
931      else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
932        { ADD_ACTIVE(state_offset + 1, 0); }
933      break;
934
935      /*-----------------------------------------------------------------*/
936      case OP_DOLL:
937      if ((md->moptions & PCRE_NOTEOL) == 0)
938        {
939        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
940          could_continue = TRUE;
941        else if (clen == 0 ||
942            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
943               (ptr == end_subject - md->nllen)
944            ))
945          { ADD_ACTIVE(state_offset + 1, 0); }
946        else if (ptr + 1 >= md->end_subject &&
947                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
948                 NLBLOCK->nltype == NLTYPE_FIXED &&
949                 NLBLOCK->nllen == 2 &&
950                 c == NLBLOCK->nl[0])
951          {
952          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
953            {
954            reset_could_continue = TRUE;
955            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
956            }
957          else could_continue = partial_newline = TRUE;
958          }
959        }
960      break;
961
962      /*-----------------------------------------------------------------*/
963      case OP_DOLLM:
964      if ((md->moptions & PCRE_NOTEOL) == 0)
965        {
966        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
967          could_continue = TRUE;
968        else if (clen == 0 ||
969            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
970          { ADD_ACTIVE(state_offset + 1, 0); }
971        else if (ptr + 1 >= md->end_subject &&
972                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
973                 NLBLOCK->nltype == NLTYPE_FIXED &&
974                 NLBLOCK->nllen == 2 &&
975                 c == NLBLOCK->nl[0])
976          {
977          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
978            {
979            reset_could_continue = TRUE;
980            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
981            }
982          else could_continue = partial_newline = TRUE;
983          }
984        }
985      else if (IS_NEWLINE(ptr))
986        { ADD_ACTIVE(state_offset + 1, 0); }
987      break;
988
989      /*-----------------------------------------------------------------*/
990
991      case OP_DIGIT:
992      case OP_WHITESPACE:
993      case OP_WORDCHAR:
994      if (clen > 0 && c < 256 &&
995            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
996        { ADD_NEW(state_offset + 1, 0); }
997      break;
998
999      /*-----------------------------------------------------------------*/
1000      case OP_NOT_DIGIT:
1001      case OP_NOT_WHITESPACE:
1002      case OP_NOT_WORDCHAR:
1003      if (clen > 0 && (c >= 256 ||
1004            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1005        { ADD_NEW(state_offset + 1, 0); }
1006      break;
1007
1008      /*-----------------------------------------------------------------*/
1009      case OP_WORD_BOUNDARY:
1010      case OP_NOT_WORD_BOUNDARY:
1011        {
1012        int left_word, right_word;
1013
1014        if (ptr > start_subject)
1015          {
1016          const pcre_uchar *temp = ptr - 1;
1017          if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1018#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1019          if (utf) { BACKCHAR(temp); }
1020#endif
1021          GETCHARTEST(d, temp);
1022#ifdef SUPPORT_UCP
1023          if ((md->poptions & PCRE_UCP) != 0)
1024            {
1025            if (d == '_') left_word = TRUE; else
1026              {
1027              int cat = UCD_CATEGORY(d);
1028              left_word = (cat == ucp_L || cat == ucp_N);
1029              }
1030            }
1031          else
1032#endif
1033          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1034          }
1035        else left_word = FALSE;
1036
1037        if (clen > 0)
1038          {
1039#ifdef SUPPORT_UCP
1040          if ((md->poptions & PCRE_UCP) != 0)
1041            {
1042            if (c == '_') right_word = TRUE; else
1043              {
1044              int cat = UCD_CATEGORY(c);
1045              right_word = (cat == ucp_L || cat == ucp_N);
1046              }
1047            }
1048          else
1049#endif
1050          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1051          }
1052        else right_word = FALSE;
1053
1054        if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1055          { ADD_ACTIVE(state_offset + 1, 0); }
1056        }
1057      break;
1058
1059
1060      /*-----------------------------------------------------------------*/
1061      /* Check the next character by Unicode property. We will get here only
1062      if the support is in the binary; otherwise a compile-time error occurs.
1063      */
1064
1065#ifdef SUPPORT_UCP
1066      case OP_PROP:
1067      case OP_NOTPROP:
1068      if (clen > 0)
1069        {
1070        BOOL OK;
1071        const pcre_uint32 *cp;
1072        const ucd_record * prop = GET_UCD(c);
1073        switch(code[1])
1074          {
1075          case PT_ANY:
1076          OK = TRUE;
1077          break;
1078
1079          case PT_LAMP:
1080          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1081               prop->chartype == ucp_Lt;
1082          break;
1083
1084          case PT_GC:
1085          OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1086          break;
1087
1088          case PT_PC:
1089          OK = prop->chartype == code[2];
1090          break;
1091
1092          case PT_SC:
1093          OK = prop->script == code[2];
1094          break;
1095
1096          /* These are specials for combination cases. */
1097
1098          case PT_ALNUM:
1099          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1100               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1101          break;
1102
1103          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1104          which means that Perl space and POSIX space are now identical. PCRE
1105          was changed at release 8.34. */
1106
1107          case PT_SPACE:    /* Perl space */
1108          case PT_PXSPACE:  /* POSIX space */
1109          switch(c)
1110            {
1111            HSPACE_CASES:
1112            VSPACE_CASES:
1113            OK = TRUE;
1114            break;
1115
1116            default:
1117            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1118            break;
1119            }
1120          break;
1121
1122          case PT_WORD:
1123          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1124               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1125               c == CHAR_UNDERSCORE;
1126          break;
1127
1128          case PT_CLIST:
1129          cp = PRIV(ucd_caseless_sets) + code[2];
1130          for (;;)
1131            {
1132            if (c < *cp) { OK = FALSE; break; }
1133            if (c == *cp++) { OK = TRUE; break; }
1134            }
1135          break;
1136
1137          case PT_UCNC:
1138          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1139               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1140               c >= 0xe000;
1141          break;
1142
1143          /* Should never occur, but keep compilers from grumbling. */
1144
1145          default:
1146          OK = codevalue != OP_PROP;
1147          break;
1148          }
1149
1150        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1151        }
1152      break;
1153#endif
1154
1155
1156
1157/* ========================================================================== */
1158      /* These opcodes likewise inspect the subject character, but have an
1159      argument that is not a data character. It is one of these opcodes:
1160      OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1161      OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1162
1163      case OP_TYPEPLUS:
1164      case OP_TYPEMINPLUS:
1165      case OP_TYPEPOSPLUS:
1166      count = current_state->count;  /* Already matched */
1167      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1168      if (clen > 0)
1169        {
1170        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1171            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1172            NLBLOCK->nltype == NLTYPE_FIXED &&
1173            NLBLOCK->nllen == 2 &&
1174            c == NLBLOCK->nl[0])
1175          {
1176          could_continue = partial_newline = TRUE;
1177          }
1178        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1179            (c < 256 &&
1180              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1181              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1182          {
1183          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1184            {
1185            active_count--;            /* Remove non-match possibility */
1186            next_active_state--;
1187            }
1188          count++;
1189          ADD_NEW(state_offset, count);
1190          }
1191        }
1192      break;
1193
1194      /*-----------------------------------------------------------------*/
1195      case OP_TYPEQUERY:
1196      case OP_TYPEMINQUERY:
1197      case OP_TYPEPOSQUERY:
1198      ADD_ACTIVE(state_offset + 2, 0);
1199      if (clen > 0)
1200        {
1201        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1202            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1203            NLBLOCK->nltype == NLTYPE_FIXED &&
1204            NLBLOCK->nllen == 2 &&
1205            c == NLBLOCK->nl[0])
1206          {
1207          could_continue = partial_newline = TRUE;
1208          }
1209        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1210            (c < 256 &&
1211              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1212              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1213          {
1214          if (codevalue == OP_TYPEPOSQUERY)
1215            {
1216            active_count--;            /* Remove non-match possibility */
1217            next_active_state--;
1218            }
1219          ADD_NEW(state_offset + 2, 0);
1220          }
1221        }
1222      break;
1223
1224      /*-----------------------------------------------------------------*/
1225      case OP_TYPESTAR:
1226      case OP_TYPEMINSTAR:
1227      case OP_TYPEPOSSTAR:
1228      ADD_ACTIVE(state_offset + 2, 0);
1229      if (clen > 0)
1230        {
1231        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1232            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1233            NLBLOCK->nltype == NLTYPE_FIXED &&
1234            NLBLOCK->nllen == 2 &&
1235            c == NLBLOCK->nl[0])
1236          {
1237          could_continue = partial_newline = TRUE;
1238          }
1239        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1240            (c < 256 &&
1241              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1242              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1243          {
1244          if (codevalue == OP_TYPEPOSSTAR)
1245            {
1246            active_count--;            /* Remove non-match possibility */
1247            next_active_state--;
1248            }
1249          ADD_NEW(state_offset, 0);
1250          }
1251        }
1252      break;
1253
1254      /*-----------------------------------------------------------------*/
1255      case OP_TYPEEXACT:
1256      count = current_state->count;  /* Number already matched */
1257      if (clen > 0)
1258        {
1259        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1260            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1261            NLBLOCK->nltype == NLTYPE_FIXED &&
1262            NLBLOCK->nllen == 2 &&
1263            c == NLBLOCK->nl[0])
1264          {
1265          could_continue = partial_newline = TRUE;
1266          }
1267        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1268            (c < 256 &&
1269              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1270              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1271          {
1272          if (++count >= (int)GET2(code, 1))
1273            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1274          else
1275            { ADD_NEW(state_offset, count); }
1276          }
1277        }
1278      break;
1279
1280      /*-----------------------------------------------------------------*/
1281      case OP_TYPEUPTO:
1282      case OP_TYPEMINUPTO:
1283      case OP_TYPEPOSUPTO:
1284      ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1285      count = current_state->count;  /* Number already matched */
1286      if (clen > 0)
1287        {
1288        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1289            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1290            NLBLOCK->nltype == NLTYPE_FIXED &&
1291            NLBLOCK->nllen == 2 &&
1292            c == NLBLOCK->nl[0])
1293          {
1294          could_continue = partial_newline = TRUE;
1295          }
1296        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1297            (c < 256 &&
1298              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1299              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1300          {
1301          if (codevalue == OP_TYPEPOSUPTO)
1302            {
1303            active_count--;           /* Remove non-match possibility */
1304            next_active_state--;
1305            }
1306          if (++count >= (int)GET2(code, 1))
1307            { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1308          else
1309            { ADD_NEW(state_offset, count); }
1310          }
1311        }
1312      break;
1313
1314/* ========================================================================== */
1315      /* These are virtual opcodes that are used when something like
1316      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1317      OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1318      code above fast for the other cases. The argument is in the d variable. */
1319
1320#ifdef SUPPORT_UCP
1321      case OP_PROP_EXTRA + OP_TYPEPLUS:
1322      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1323      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1324      count = current_state->count;           /* Already matched */
1325      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1326      if (clen > 0)
1327        {
1328        BOOL OK;
1329        const pcre_uint32 *cp;
1330        const ucd_record * prop = GET_UCD(c);
1331        switch(code[2])
1332          {
1333          case PT_ANY:
1334          OK = TRUE;
1335          break;
1336
1337          case PT_LAMP:
1338          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1339            prop->chartype == ucp_Lt;
1340          break;
1341
1342          case PT_GC:
1343          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1344          break;
1345
1346          case PT_PC:
1347          OK = prop->chartype == code[3];
1348          break;
1349
1350          case PT_SC:
1351          OK = prop->script == code[3];
1352          break;
1353
1354          /* These are specials for combination cases. */
1355
1356          case PT_ALNUM:
1357          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1358               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1359          break;
1360
1361          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1362          which means that Perl space and POSIX space are now identical. PCRE
1363          was changed at release 8.34. */
1364
1365          case PT_SPACE:    /* Perl space */
1366          case PT_PXSPACE:  /* POSIX space */
1367          switch(c)
1368            {
1369            HSPACE_CASES:
1370            VSPACE_CASES:
1371            OK = TRUE;
1372            break;
1373
1374            default:
1375            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1376            break;
1377            }
1378          break;
1379
1380          case PT_WORD:
1381          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1382               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1383               c == CHAR_UNDERSCORE;
1384          break;
1385
1386          case PT_CLIST:
1387          cp = PRIV(ucd_caseless_sets) + code[3];
1388          for (;;)
1389            {
1390            if (c < *cp) { OK = FALSE; break; }
1391            if (c == *cp++) { OK = TRUE; break; }
1392            }
1393          break;
1394
1395          case PT_UCNC:
1396          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1397               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1398               c >= 0xe000;
1399          break;
1400
1401          /* Should never occur, but keep compilers from grumbling. */
1402
1403          default:
1404          OK = codevalue != OP_PROP;
1405          break;
1406          }
1407
1408        if (OK == (d == OP_PROP))
1409          {
1410          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1411            {
1412            active_count--;           /* Remove non-match possibility */
1413            next_active_state--;
1414            }
1415          count++;
1416          ADD_NEW(state_offset, count);
1417          }
1418        }
1419      break;
1420
1421      /*-----------------------------------------------------------------*/
1422      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1423      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1424      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1425      count = current_state->count;  /* Already matched */
1426      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1427      if (clen > 0)
1428        {
1429        int lgb, rgb;
1430        const pcre_uchar *nptr = ptr + clen;
1431        int ncount = 0;
1432        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1433          {
1434          active_count--;           /* Remove non-match possibility */
1435          next_active_state--;
1436          }
1437        lgb = UCD_GRAPHBREAK(c);
1438        while (nptr < end_subject)
1439          {
1440          dlen = 1;
1441          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1442          rgb = UCD_GRAPHBREAK(d);
1443          if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1444          ncount++;
1445          lgb = rgb;
1446          nptr += dlen;
1447          }
1448        count++;
1449        ADD_NEW_DATA(-state_offset, count, ncount);
1450        }
1451      break;
1452#endif
1453
1454      /*-----------------------------------------------------------------*/
1455      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1456      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1457      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1458      count = current_state->count;  /* Already matched */
1459      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1460      if (clen > 0)
1461        {
1462        int ncount = 0;
1463        switch (c)
1464          {
1465          case CHAR_VT:
1466          case CHAR_FF:
1467          case CHAR_NEL:
1468#ifndef EBCDIC
1469          case 0x2028:
1470          case 0x2029:
1471#endif  /* Not EBCDIC */
1472          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1473          goto ANYNL01;
1474
1475          case CHAR_CR:
1476          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1477          /* Fall through */
1478
1479          ANYNL01:
1480          case CHAR_LF:
1481          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1482            {
1483            active_count--;           /* Remove non-match possibility */
1484            next_active_state--;
1485            }
1486          count++;
1487          ADD_NEW_DATA(-state_offset, count, ncount);
1488          break;
1489
1490          default:
1491          break;
1492          }
1493        }
1494      break;
1495
1496      /*-----------------------------------------------------------------*/
1497      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1498      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1499      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1500      count = current_state->count;  /* Already matched */
1501      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1502      if (clen > 0)
1503        {
1504        BOOL OK;
1505        switch (c)
1506          {
1507          VSPACE_CASES:
1508          OK = TRUE;
1509          break;
1510
1511          default:
1512          OK = FALSE;
1513          break;
1514          }
1515
1516        if (OK == (d == OP_VSPACE))
1517          {
1518          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1519            {
1520            active_count--;           /* Remove non-match possibility */
1521            next_active_state--;
1522            }
1523          count++;
1524          ADD_NEW_DATA(-state_offset, count, 0);
1525          }
1526        }
1527      break;
1528
1529      /*-----------------------------------------------------------------*/
1530      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1531      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1532      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1533      count = current_state->count;  /* Already matched */
1534      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535      if (clen > 0)
1536        {
1537        BOOL OK;
1538        switch (c)
1539          {
1540          HSPACE_CASES:
1541          OK = TRUE;
1542          break;
1543
1544          default:
1545          OK = FALSE;
1546          break;
1547          }
1548
1549        if (OK == (d == OP_HSPACE))
1550          {
1551          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1552            {
1553            active_count--;           /* Remove non-match possibility */
1554            next_active_state--;
1555            }
1556          count++;
1557          ADD_NEW_DATA(-state_offset, count, 0);
1558          }
1559        }
1560      break;
1561
1562      /*-----------------------------------------------------------------*/
1563#ifdef SUPPORT_UCP
1564      case OP_PROP_EXTRA + OP_TYPEQUERY:
1565      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1566      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1567      count = 4;
1568      goto QS1;
1569
1570      case OP_PROP_EXTRA + OP_TYPESTAR:
1571      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1572      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1573      count = 0;
1574
1575      QS1:
1576
1577      ADD_ACTIVE(state_offset + 4, 0);
1578      if (clen > 0)
1579        {
1580        BOOL OK;
1581        const pcre_uint32 *cp;
1582        const ucd_record * prop = GET_UCD(c);
1583        switch(code[2])
1584          {
1585          case PT_ANY:
1586          OK = TRUE;
1587          break;
1588
1589          case PT_LAMP:
1590          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1591            prop->chartype == ucp_Lt;
1592          break;
1593
1594          case PT_GC:
1595          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1596          break;
1597
1598          case PT_PC:
1599          OK = prop->chartype == code[3];
1600          break;
1601
1602          case PT_SC:
1603          OK = prop->script == code[3];
1604          break;
1605
1606          /* These are specials for combination cases. */
1607
1608          case PT_ALNUM:
1609          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1610               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1611          break;
1612
1613          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1614          which means that Perl space and POSIX space are now identical. PCRE
1615          was changed at release 8.34. */
1616
1617          case PT_SPACE:    /* Perl space */
1618          case PT_PXSPACE:  /* POSIX space */
1619          switch(c)
1620            {
1621            HSPACE_CASES:
1622            VSPACE_CASES:
1623            OK = TRUE;
1624            break;
1625
1626            default:
1627            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1628            break;
1629            }
1630          break;
1631
1632          case PT_WORD:
1633          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1634               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1635               c == CHAR_UNDERSCORE;
1636          break;
1637
1638          case PT_CLIST:
1639          cp = PRIV(ucd_caseless_sets) + code[3];
1640          for (;;)
1641            {
1642            if (c < *cp) { OK = FALSE; break; }
1643            if (c == *cp++) { OK = TRUE; break; }
1644            }
1645          break;
1646
1647          case PT_UCNC:
1648          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1649               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1650               c >= 0xe000;
1651          break;
1652
1653          /* Should never occur, but keep compilers from grumbling. */
1654
1655          default:
1656          OK = codevalue != OP_PROP;
1657          break;
1658          }
1659
1660        if (OK == (d == OP_PROP))
1661          {
1662          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1663              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1664            {
1665            active_count--;           /* Remove non-match possibility */
1666            next_active_state--;
1667            }
1668          ADD_NEW(state_offset + count, 0);
1669          }
1670        }
1671      break;
1672
1673      /*-----------------------------------------------------------------*/
1674      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1675      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1676      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1677      count = 2;
1678      goto QS2;
1679
1680      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1681      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1682      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1683      count = 0;
1684
1685      QS2:
1686
1687      ADD_ACTIVE(state_offset + 2, 0);
1688      if (clen > 0)
1689        {
1690        int lgb, rgb;
1691        const pcre_uchar *nptr = ptr + clen;
1692        int ncount = 0;
1693        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1694            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1695          {
1696          active_count--;           /* Remove non-match possibility */
1697          next_active_state--;
1698          }
1699        lgb = UCD_GRAPHBREAK(c);
1700        while (nptr < end_subject)
1701          {
1702          dlen = 1;
1703          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1704          rgb = UCD_GRAPHBREAK(d);
1705          if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1706          ncount++;
1707          lgb = rgb;
1708          nptr += dlen;
1709          }
1710        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1711        }
1712      break;
1713#endif
1714
1715      /*-----------------------------------------------------------------*/
1716      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1717      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1718      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1719      count = 2;
1720      goto QS3;
1721
1722      case OP_ANYNL_EXTRA + OP_TYPESTAR:
1723      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1724      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1725      count = 0;
1726
1727      QS3:
1728      ADD_ACTIVE(state_offset + 2, 0);
1729      if (clen > 0)
1730        {
1731        int ncount = 0;
1732        switch (c)
1733          {
1734          case CHAR_VT:
1735          case CHAR_FF:
1736          case CHAR_NEL:
1737#ifndef EBCDIC
1738          case 0x2028:
1739          case 0x2029:
1740#endif  /* Not EBCDIC */
1741          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1742          goto ANYNL02;
1743
1744          case CHAR_CR:
1745          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1746          /* Fall through */
1747
1748          ANYNL02:
1749          case CHAR_LF:
1750          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1751              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1752            {
1753            active_count--;           /* Remove non-match possibility */
1754            next_active_state--;
1755            }
1756          ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1757          break;
1758
1759          default:
1760          break;
1761          }
1762        }
1763      break;
1764
1765      /*-----------------------------------------------------------------*/
1766      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1767      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1768      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1769      count = 2;
1770      goto QS4;
1771
1772      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1773      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1774      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1775      count = 0;
1776
1777      QS4:
1778      ADD_ACTIVE(state_offset + 2, 0);
1779      if (clen > 0)
1780        {
1781        BOOL OK;
1782        switch (c)
1783          {
1784          VSPACE_CASES:
1785          OK = TRUE;
1786          break;
1787
1788          default:
1789          OK = FALSE;
1790          break;
1791          }
1792        if (OK == (d == OP_VSPACE))
1793          {
1794          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1795              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1796            {
1797            active_count--;           /* Remove non-match possibility */
1798            next_active_state--;
1799            }
1800          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1801          }
1802        }
1803      break;
1804
1805      /*-----------------------------------------------------------------*/
1806      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1807      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1808      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1809      count = 2;
1810      goto QS5;
1811
1812      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1813      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1814      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1815      count = 0;
1816
1817      QS5:
1818      ADD_ACTIVE(state_offset + 2, 0);
1819      if (clen > 0)
1820        {
1821        BOOL OK;
1822        switch (c)
1823          {
1824          HSPACE_CASES:
1825          OK = TRUE;
1826          break;
1827
1828          default:
1829          OK = FALSE;
1830          break;
1831          }
1832
1833        if (OK == (d == OP_HSPACE))
1834          {
1835          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1836              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1837            {
1838            active_count--;           /* Remove non-match possibility */
1839            next_active_state--;
1840            }
1841          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1842          }
1843        }
1844      break;
1845
1846      /*-----------------------------------------------------------------*/
1847#ifdef SUPPORT_UCP
1848      case OP_PROP_EXTRA + OP_TYPEEXACT:
1849      case OP_PROP_EXTRA + OP_TYPEUPTO:
1850      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1851      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1852      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1853        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1854      count = current_state->count;  /* Number already matched */
1855      if (clen > 0)
1856        {
1857        BOOL OK;
1858        const pcre_uint32 *cp;
1859        const ucd_record * prop = GET_UCD(c);
1860        switch(code[1 + IMM2_SIZE + 1])
1861          {
1862          case PT_ANY:
1863          OK = TRUE;
1864          break;
1865
1866          case PT_LAMP:
1867          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1868            prop->chartype == ucp_Lt;
1869          break;
1870
1871          case PT_GC:
1872          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1873          break;
1874
1875          case PT_PC:
1876          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1877          break;
1878
1879          case PT_SC:
1880          OK = prop->script == code[1 + IMM2_SIZE + 2];
1881          break;
1882
1883          /* These are specials for combination cases. */
1884
1885          case PT_ALNUM:
1886          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1887               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1888          break;
1889
1890          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1891          which means that Perl space and POSIX space are now identical. PCRE
1892          was changed at release 8.34. */
1893
1894          case PT_SPACE:    /* Perl space */
1895          case PT_PXSPACE:  /* POSIX space */
1896          switch(c)
1897            {
1898            HSPACE_CASES:
1899            VSPACE_CASES:
1900            OK = TRUE;
1901            break;
1902
1903            default:
1904            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1905            break;
1906            }
1907          break;
1908
1909          case PT_WORD:
1910          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1911               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1912               c == CHAR_UNDERSCORE;
1913          break;
1914
1915          case PT_CLIST:
1916          cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1917          for (;;)
1918            {
1919            if (c < *cp) { OK = FALSE; break; }
1920            if (c == *cp++) { OK = TRUE; break; }
1921            }
1922          break;
1923
1924          case PT_UCNC:
1925          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1926               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1927               c >= 0xe000;
1928          break;
1929
1930          /* Should never occur, but keep compilers from grumbling. */
1931
1932          default:
1933          OK = codevalue != OP_PROP;
1934          break;
1935          }
1936
1937        if (OK == (d == OP_PROP))
1938          {
1939          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1940            {
1941            active_count--;           /* Remove non-match possibility */
1942            next_active_state--;
1943            }
1944          if (++count >= (int)GET2(code, 1))
1945            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1946          else
1947            { ADD_NEW(state_offset, count); }
1948          }
1949        }
1950      break;
1951
1952      /*-----------------------------------------------------------------*/
1953      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1954      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1955      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1956      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1957      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1958        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1959      count = current_state->count;  /* Number already matched */
1960      if (clen > 0)
1961        {
1962        int lgb, rgb;
1963        const pcre_uchar *nptr = ptr + clen;
1964        int ncount = 0;
1965        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1966          {
1967          active_count--;           /* Remove non-match possibility */
1968          next_active_state--;
1969          }
1970        lgb = UCD_GRAPHBREAK(c);
1971        while (nptr < end_subject)
1972          {
1973          dlen = 1;
1974          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1975          rgb = UCD_GRAPHBREAK(d);
1976          if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1977          ncount++;
1978          lgb = rgb;
1979          nptr += dlen;
1980          }
1981        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1982            reset_could_continue = TRUE;
1983        if (++count >= (int)GET2(code, 1))
1984          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1985        else
1986          { ADD_NEW_DATA(-state_offset, count, ncount); }
1987        }
1988      break;
1989#endif
1990
1991      /*-----------------------------------------------------------------*/
1992      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1993      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1994      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1995      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1996      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1997        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1998      count = current_state->count;  /* Number already matched */
1999      if (clen > 0)
2000        {
2001        int ncount = 0;
2002        switch (c)
2003          {
2004          case CHAR_VT:
2005          case CHAR_FF:
2006          case CHAR_NEL:
2007#ifndef EBCDIC
2008          case 0x2028:
2009          case 0x2029:
2010#endif  /* Not EBCDIC */
2011          if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2012          goto ANYNL03;
2013
2014          case CHAR_CR:
2015          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2016          /* Fall through */
2017
2018          ANYNL03:
2019          case CHAR_LF:
2020          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2021            {
2022            active_count--;           /* Remove non-match possibility */
2023            next_active_state--;
2024            }
2025          if (++count >= (int)GET2(code, 1))
2026            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2027          else
2028            { ADD_NEW_DATA(-state_offset, count, ncount); }
2029          break;
2030
2031          default:
2032          break;
2033          }
2034        }
2035      break;
2036
2037      /*-----------------------------------------------------------------*/
2038      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2039      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2040      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2041      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2042      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2043        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2044      count = current_state->count;  /* Number already matched */
2045      if (clen > 0)
2046        {
2047        BOOL OK;
2048        switch (c)
2049          {
2050          VSPACE_CASES:
2051          OK = TRUE;
2052          break;
2053
2054          default:
2055          OK = FALSE;
2056          }
2057
2058        if (OK == (d == OP_VSPACE))
2059          {
2060          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2061            {
2062            active_count--;           /* Remove non-match possibility */
2063            next_active_state--;
2064            }
2065          if (++count >= (int)GET2(code, 1))
2066            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2067          else
2068            { ADD_NEW_DATA(-state_offset, count, 0); }
2069          }
2070        }
2071      break;
2072
2073      /*-----------------------------------------------------------------*/
2074      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2075      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2076      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2077      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2078      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2079        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2080      count = current_state->count;  /* Number already matched */
2081      if (clen > 0)
2082        {
2083        BOOL OK;
2084        switch (c)
2085          {
2086          HSPACE_CASES:
2087          OK = TRUE;
2088          break;
2089
2090          default:
2091          OK = FALSE;
2092          break;
2093          }
2094
2095        if (OK == (d == OP_HSPACE))
2096          {
2097          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2098            {
2099            active_count--;           /* Remove non-match possibility */
2100            next_active_state--;
2101            }
2102          if (++count >= (int)GET2(code, 1))
2103            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2104          else
2105            { ADD_NEW_DATA(-state_offset, count, 0); }
2106          }
2107        }
2108      break;
2109
2110/* ========================================================================== */
2111      /* These opcodes are followed by a character that is usually compared
2112      to the current subject character; it is loaded into d. We still get
2113      here even if there is no subject character, because in some cases zero
2114      repetitions are permitted. */
2115
2116      /*-----------------------------------------------------------------*/
2117      case OP_CHAR:
2118      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2119      break;
2120
2121      /*-----------------------------------------------------------------*/
2122      case OP_CHARI:
2123      if (clen == 0) break;
2124
2125#ifdef SUPPORT_UTF
2126      if (utf)
2127        {
2128        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2129          {
2130          unsigned int othercase;
2131          if (c < 128)
2132            othercase = fcc[c];
2133          else
2134            /* If we have Unicode property support, we can use it to test the
2135            other case of the character. */
2136#ifdef SUPPORT_UCP
2137            othercase = UCD_OTHERCASE(c);
2138#else
2139            othercase = NOTACHAR;
2140#endif
2141
2142          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2143          }
2144        }
2145      else
2146#endif  /* SUPPORT_UTF */
2147      /* Not UTF mode */
2148        {
2149        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2150          { ADD_NEW(state_offset + 2, 0); }
2151        }
2152      break;
2153
2154
2155#ifdef SUPPORT_UCP
2156      /*-----------------------------------------------------------------*/
2157      /* This is a tricky one because it can match more than one character.
2158      Find out how many characters to skip, and then set up a negative state
2159      to wait for them to pass before continuing. */
2160
2161      case OP_EXTUNI:
2162      if (clen > 0)
2163        {
2164        int lgb, rgb;
2165        const pcre_uchar *nptr = ptr + clen;
2166        int ncount = 0;
2167        lgb = UCD_GRAPHBREAK(c);
2168        while (nptr < end_subject)
2169          {
2170          dlen = 1;
2171          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2172          rgb = UCD_GRAPHBREAK(d);
2173          if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2174          ncount++;
2175          lgb = rgb;
2176          nptr += dlen;
2177          }
2178        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2179            reset_could_continue = TRUE;
2180        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2181        }
2182      break;
2183#endif
2184
2185      /*-----------------------------------------------------------------*/
2186      /* This is tricky like EXTUNI because it too can match more than one
2187      character (when CR is followed by LF). In this case, set up a negative
2188      state to wait for one character to pass before continuing. A final CR
2189      matches alone, and sets reset_could_continue under PCRE_PARTIAL_HARD. */
2190      case OP_ANYNL:
2191      if (clen > 0) switch(c)
2192        {
2193        case CHAR_VT:
2194        case CHAR_FF:
2195        case CHAR_NEL:
2196#ifndef EBCDIC
2197        case 0x2028:
2198        case 0x2029:
2199#endif  /* Not EBCDIC */
2200        if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2201        /* Fall through: without PCRE_BSR_ANYCRLF these match like LF */
2202        case CHAR_LF:
2203        ADD_NEW(state_offset + 1, 0);
2204        break;
2205
2206        case CHAR_CR:
2207        if (ptr + 1 >= end_subject)
2208          {
2209          ADD_NEW(state_offset + 1, 0);
2210          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2211            reset_could_continue = TRUE;
2212          }
2213        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2214          {
2215          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2216          }
2217        else
2218          {
2219          ADD_NEW(state_offset + 1, 0);
2220          }
2221        break;
2222        }
2223      break;
2224
2225      /*-----------------------------------------------------------------*/
2226      case OP_NOT_VSPACE:
2227      if (clen > 0) switch(c)
2228        {
2229        VSPACE_CASES:
2230        break;
2231
2232        default:
2233        ADD_NEW(state_offset + 1, 0);
2234        break;
2235        }
2236      break;
2237
2238      /*-----------------------------------------------------------------*/
2239      case OP_VSPACE:
2240      if (clen > 0) switch(c)
2241        {
2242        VSPACE_CASES:
2243        ADD_NEW(state_offset + 1, 0);
2244        break;
2245
2246        default:
2247        break;
2248        }
2249      break;
2250
2251      /*-----------------------------------------------------------------*/
2252      case OP_NOT_HSPACE:
2253      if (clen > 0) switch(c)
2254        {
2255        HSPACE_CASES:
2256        break;
2257
2258        default:
2259        ADD_NEW(state_offset + 1, 0);
2260        break;
2261        }
2262      break;
2263
2264      /*-----------------------------------------------------------------*/
2265      case OP_HSPACE:
2266      if (clen > 0) switch(c)
2267        {
2268        HSPACE_CASES:
2269        ADD_NEW(state_offset + 1, 0);
2270        break;
2271
2272        default:
2273        break;
2274        }
2275      break;
2276
2277      /*-----------------------------------------------------------------*/
2278      /* Match a negated single character casefully. */
2279
2280      case OP_NOT:
2281      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2282      break;
2283
2284      /*-----------------------------------------------------------------*/
2285      /* Match a negated single character caselessly. otherd starts as NOTACHAR
2286      (as in the repeat cases below) so a UTF build without UCP never reads it unset. */
2287      case OP_NOTI:
2288      if (clen > 0)
2289        {
2290        pcre_uint32 otherd = NOTACHAR;
2291#ifdef SUPPORT_UTF
2292        if (utf && d >= 128)
2293          {
2294#ifdef SUPPORT_UCP
2295          otherd = UCD_OTHERCASE(d);
2296#endif  /* SUPPORT_UCP */
2297          }
2298        else
2299#endif  /* SUPPORT_UTF */
2300        otherd = TABLE_GET(d, fcc, d);
2301        if (c != d && c != otherd)
2302          { ADD_NEW(state_offset + dlen + 1, 0); }
2303        }
2304      break;
2305
2306      /*-----------------------------------------------------------------*/
2307      case OP_PLUSI:
2308      case OP_MINPLUSI:
2309      case OP_POSPLUSI:
2310      case OP_NOTPLUSI:
2311      case OP_NOTMINPLUSI:
2312      case OP_NOTPOSPLUSI:
2313      caseless = TRUE;
2314      codevalue -= OP_STARI - OP_STAR;
2315
2316      /* Fall through */
2317      case OP_PLUS:
2318      case OP_MINPLUS:
2319      case OP_POSPLUS:
2320      case OP_NOTPLUS:
2321      case OP_NOTMINPLUS:
2322      case OP_NOTPOSPLUS:
2323      count = current_state->count;  /* Already matched */
2324      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2325      if (clen > 0)
2326        {
2327        pcre_uint32 otherd = NOTACHAR;
2328        if (caseless)
2329          {
2330#ifdef SUPPORT_UTF
2331          if (utf && d >= 128)
2332            {
2333#ifdef SUPPORT_UCP
2334            otherd = UCD_OTHERCASE(d);
2335#endif  /* SUPPORT_UCP */
2336            }
2337          else
2338#endif  /* SUPPORT_UTF */
2339          otherd = TABLE_GET(d, fcc, d);
2340          }
2341        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2342          {
2343          if (count > 0 &&
2344              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2345            {
2346            active_count--;             /* Remove non-match possibility */
2347            next_active_state--;
2348            }
2349          count++;
2350          ADD_NEW(state_offset, count);
2351          }
2352        }
2353      break;
2354
2355      /*-----------------------------------------------------------------*/
2356      case OP_QUERYI:
2357      case OP_MINQUERYI:
2358      case OP_POSQUERYI:
2359      case OP_NOTQUERYI:
2360      case OP_NOTMINQUERYI:
2361      case OP_NOTPOSQUERYI:
2362      caseless = TRUE;
2363      codevalue -= OP_STARI - OP_STAR;
2364      /* Fall through */
2365      case OP_QUERY:
2366      case OP_MINQUERY:
2367      case OP_POSQUERY:
2368      case OP_NOTQUERY:
2369      case OP_NOTMINQUERY:
2370      case OP_NOTPOSQUERY:
2371      ADD_ACTIVE(state_offset + dlen + 1, 0);
2372      if (clen > 0)
2373        {
2374        pcre_uint32 otherd = NOTACHAR;
2375        if (caseless)
2376          {
2377#ifdef SUPPORT_UTF
2378          if (utf && d >= 128)
2379            {
2380#ifdef SUPPORT_UCP
2381            otherd = UCD_OTHERCASE(d);
2382#endif  /* SUPPORT_UCP */
2383            }
2384          else
2385#endif  /* SUPPORT_UTF */
2386          otherd = TABLE_GET(d, fcc, d);
2387          }
2388        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389          {
2390          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2391            {
2392            active_count--;            /* Remove non-match possibility */
2393            next_active_state--;
2394            }
2395          ADD_NEW(state_offset + dlen + 1, 0);
2396          }
2397        }
2398      break;
2399
2400      /*-----------------------------------------------------------------*/
2401      case OP_STARI:
2402      case OP_MINSTARI:
2403      case OP_POSSTARI:
2404      case OP_NOTSTARI:
2405      case OP_NOTMINSTARI:
2406      case OP_NOTPOSSTARI:
2407      caseless = TRUE;
2408      codevalue -= OP_STARI - OP_STAR;
2409      /* Fall through */
2410      case OP_STAR:
2411      case OP_MINSTAR:
2412      case OP_POSSTAR:
2413      case OP_NOTSTAR:
2414      case OP_NOTMINSTAR:
2415      case OP_NOTPOSSTAR:
2416      ADD_ACTIVE(state_offset + dlen + 1, 0);
2417      if (clen > 0)
2418        {
2419        pcre_uint32 otherd = NOTACHAR;
2420        if (caseless)
2421          {
2422#ifdef SUPPORT_UTF
2423          if (utf && d >= 128)
2424            {
2425#ifdef SUPPORT_UCP
2426            otherd = UCD_OTHERCASE(d);
2427#endif  /* SUPPORT_UCP */
2428            }
2429          else
2430#endif  /* SUPPORT_UTF */
2431          otherd = TABLE_GET(d, fcc, d);
2432          }
2433        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2434          {
2435          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2436            {
2437            active_count--;            /* Remove non-match possibility */
2438            next_active_state--;
2439            }
2440          ADD_NEW(state_offset, 0);
2441          }
2442        }
2443      break;
2444
2445      /*-----------------------------------------------------------------*/
2446      case OP_EXACTI:
2447      case OP_NOTEXACTI:
2448      caseless = TRUE;
2449      codevalue -= OP_STARI - OP_STAR;
2450      /* Fall through */
2451      case OP_EXACT:
2452      case OP_NOTEXACT:
2453      count = current_state->count;  /* Number already matched */
2454      if (clen > 0)
2455        {
2456        pcre_uint32 otherd = NOTACHAR;
2457        if (caseless)
2458          {
2459#ifdef SUPPORT_UTF
2460          if (utf && d >= 128)
2461            {
2462#ifdef SUPPORT_UCP
2463            otherd = UCD_OTHERCASE(d);
2464#endif  /* SUPPORT_UCP */
2465            }
2466          else
2467#endif  /* SUPPORT_UTF */
2468          otherd = TABLE_GET(d, fcc, d);
2469          }
2470        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2471          {
2472          if (++count >= (int)GET2(code, 1))
2473            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2474          else
2475            { ADD_NEW(state_offset, count); }
2476          }
2477        }
2478      break;
2479
2480      /*-----------------------------------------------------------------*/
2481      case OP_UPTOI:
2482      case OP_MINUPTOI:
2483      case OP_POSUPTOI:
2484      case OP_NOTUPTOI:
2485      case OP_NOTMINUPTOI:
2486      case OP_NOTPOSUPTOI:
2487      caseless = TRUE;
2488      codevalue -= OP_STARI - OP_STAR;
2489      /* Fall through */
2490      case OP_UPTO:
2491      case OP_MINUPTO:
2492      case OP_POSUPTO:
2493      case OP_NOTUPTO:
2494      case OP_NOTMINUPTO:
2495      case OP_NOTPOSUPTO:
2496      ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2497      count = current_state->count;  /* Number already matched */
2498      if (clen > 0)
2499        {
2500        pcre_uint32 otherd = NOTACHAR;
2501        if (caseless)
2502          {
2503#ifdef SUPPORT_UTF
2504          if (utf && d >= 128)
2505            {
2506#ifdef SUPPORT_UCP
2507            otherd = UCD_OTHERCASE(d);
2508#endif  /* SUPPORT_UCP */
2509            }
2510          else
2511#endif  /* SUPPORT_UTF */
2512          otherd = TABLE_GET(d, fcc, d);
2513          }
2514        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2515          {
2516          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2517            {
2518            active_count--;             /* Remove non-match possibility */
2519            next_active_state--;
2520            }
2521          if (++count >= (int)GET2(code, 1))
2522            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2523          else
2524            { ADD_NEW(state_offset, count); }
2525          }
2526        }
2527      break;
2528
2529
2530/* ========================================================================== */
2531      /* These are the class-handling opcodes */
2532
2533      case OP_CLASS:
2534      case OP_NCLASS:
2535      case OP_XCLASS:
2536        {
2537        BOOL isinclass = FALSE;
2538        int next_state_offset;
2539        const pcre_uchar *ecode;
2540
2541        /* For a simple class, there is always just a 32-byte table, and we
2542        can set isinclass from it. */
2543
2544        if (codevalue != OP_XCLASS)
2545          {
2546          ecode = code + 1 + (32 / sizeof(pcre_uchar));
2547          if (clen > 0)
2548            {
2549            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2550              ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2551            }
2552          }
2553
2554        /* An extended class may have a table or a list of single characters,
2555        ranges, or both, and it may be positive or negative. There's a
2556        function that sorts all this out. */
2557
2558        else
2559         {
2560         ecode = code + GET(code, 1);
2561         if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2562         }
2563
2564        /* At this point, isinclass is set for all kinds of class, and ecode
2565        points to the byte after the end of the class. If there is a
2566        quantifier, this is where it will be. */
2567
2568        next_state_offset = (int)(ecode - start_code);
2569
2570        switch (*ecode)
2571          {
2572          case OP_CRSTAR:
2573          case OP_CRMINSTAR:
2574          case OP_CRPOSSTAR:
2575          ADD_ACTIVE(next_state_offset + 1, 0);
2576          if (isinclass)
2577            {
2578            if (*ecode == OP_CRPOSSTAR)
2579              {
2580              active_count--;           /* Remove non-match possibility */
2581              next_active_state--;
2582              }
2583            ADD_NEW(state_offset, 0);
2584            }
2585          break;
2586
2587          case OP_CRPLUS:
2588          case OP_CRMINPLUS:
2589          case OP_CRPOSPLUS:
2590          count = current_state->count;  /* Already matched */
2591          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2592          if (isinclass)
2593            {
2594            if (count > 0 && *ecode == OP_CRPOSPLUS)
2595              {
2596              active_count--;           /* Remove non-match possibility */
2597              next_active_state--;
2598              }
2599            count++;
2600            ADD_NEW(state_offset, count);
2601            }
2602          break;
2603
2604          case OP_CRQUERY:
2605          case OP_CRMINQUERY:
2606          case OP_CRPOSQUERY:
2607          ADD_ACTIVE(next_state_offset + 1, 0);
2608          if (isinclass)
2609            {
2610            if (*ecode == OP_CRPOSQUERY)
2611              {
2612              active_count--;           /* Remove non-match possibility */
2613              next_active_state--;
2614              }
2615            ADD_NEW(next_state_offset + 1, 0);
2616            }
2617          break;
2618
2619          case OP_CRRANGE:
2620          case OP_CRMINRANGE:
2621          case OP_CRPOSRANGE:
2622          count = current_state->count;  /* Already matched */
2623          if (count >= (int)GET2(ecode, 1))
2624            { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2625          if (isinclass)
2626            {
2627            int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2628            if (*ecode == OP_CRPOSRANGE)
2629              {
2630              active_count--;           /* Remove non-match possibility */
2631              next_active_state--;
2632              }
2633            if (++count >= max && max != 0)   /* Max 0 => no limit */
2634              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2635            else
2636              { ADD_NEW(state_offset, count); }
2637            }
2638          break;
2639
2640          default:
2641          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2642          break;
2643          }
2644        }
2645      break;
2646
2647/* ========================================================================== */
2648      /* These are the opcodes for fancy brackets of various kinds. We have
2649      to use recursion in order to handle them. The "always failing" assertion
2650      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2651      though the other "backtracking verbs" are not supported. */
2652
2653      case OP_FAIL:
2654      forced_fail++;    /* Count FAILs for multiple states */
2655      break;
2656
2657      case OP_ASSERT:
2658      case OP_ASSERT_NOT:
2659      case OP_ASSERTBACK:
2660      case OP_ASSERTBACK_NOT:
2661        {
2662        int rc;
2663        int local_offsets[2];
2664        int local_workspace[1000];
2665        const pcre_uchar *endasscode = code + GET(code, 1);
2666
2667        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2668
2669        rc = internal_dfa_exec(
2670          md,                                   /* static match data */
2671          code,                                 /* this subexpression's code */
2672          ptr,                                  /* where we currently are */
2673          (int)(ptr - start_subject),           /* start offset */
2674          local_offsets,                        /* offset vector */
2675          sizeof(local_offsets)/sizeof(int),    /* size of same */
2676          local_workspace,                      /* workspace vector */
2677          sizeof(local_workspace)/sizeof(int),  /* size of same */
2678          rlevel);                              /* function recursion level */
2679
2680        if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2682            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2683        }
2684      break;
2685
2686      /*-----------------------------------------------------------------*/
2687      case OP_COND:
2688      case OP_SCOND:
2689        {
2690        int local_offsets[1000];
2691        int local_workspace[1000];
2692        int codelink = GET(code, 1);
2693        int condcode;
2694
2695        /* Because of the way auto-callout works during compile, a callout item
2696        is inserted between OP_COND and an assertion condition. This does not
2697        happen for the other conditions. */
2698
2699        if (code[LINK_SIZE+1] == OP_CALLOUT)
2700          {
2701          rrc = 0;
2702          if (PUBL(callout) != NULL)
2703            {
2704            PUBL(callout_block) cb;
2705            cb.version          = 1;   /* Version 1 of the callout block */
2706            cb.callout_number   = code[LINK_SIZE+2];
2707            cb.offset_vector    = offsets;
2708#if defined COMPILE_PCRE8
2709            cb.subject          = (PCRE_SPTR)start_subject;
2710#elif defined COMPILE_PCRE16
2711            cb.subject          = (PCRE_SPTR16)start_subject;
2712#elif defined COMPILE_PCRE32
2713            cb.subject          = (PCRE_SPTR32)start_subject;
2714#endif
2715            cb.subject_length   = (int)(end_subject - start_subject);
2716            cb.start_match      = (int)(current_subject - start_subject);
2717            cb.current_position = (int)(ptr - start_subject);
2718            cb.pattern_position = GET(code, LINK_SIZE + 3);
2719            cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2720            cb.capture_top      = 1;
2721            cb.capture_last     = -1;
2722            cb.callout_data     = md->callout_data;
2723            cb.mark             = NULL;   /* No (*MARK) support */
2724            if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2725            }
2726          if (rrc > 0) break;                      /* Fail this thread */
2727          code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2728          }
2729
2730        condcode = code[LINK_SIZE+1];
2731
2732        /* Back reference conditions and duplicate named recursion conditions
2733        are not supported */
2734
2735        if (condcode == OP_CREF || condcode == OP_DNCREF ||
2736            condcode == OP_DNRREF)
2737          return PCRE_ERROR_DFA_UCOND;
2738
2739        /* The DEFINE condition is always false */
2740
2741        if (condcode == OP_DEF)
2742          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2743
2744        /* The only supported version of OP_RREF is for the value RREF_ANY,
2745        which means "test if in any recursion". We can't test for specifically
2746        recursed groups. */
2747
2748        else if (condcode == OP_RREF)
2749          {
2750          int value = GET2(code, LINK_SIZE + 2);
2751          if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2752          if (md->recursive != NULL)
2753            { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2754          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2755          }
2756
2757        /* Otherwise, the condition is an assertion */
2758
2759        else
2760          {
2761          int rc;
2762          const pcre_uchar *asscode = code + LINK_SIZE + 1;
2763          const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2764
2765          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2766
2767          rc = internal_dfa_exec(
2768            md,                                   /* fixed match data */
2769            asscode,                              /* this subexpression's code */
2770            ptr,                                  /* where we currently are */
2771            (int)(ptr - start_subject),           /* start offset */
2772            local_offsets,                        /* offset vector */
2773            sizeof(local_offsets)/sizeof(int),    /* size of same */
2774            local_workspace,                      /* workspace vector */
2775            sizeof(local_workspace)/sizeof(int),  /* size of same */
2776            rlevel);                              /* function recursion level */
2777
2778          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2779          if ((rc >= 0) ==
2780                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2781            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2782          else
2783            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2784          }
2785        }
2786      break;
2787
2788      /*-----------------------------------------------------------------*/
2789      case OP_RECURSE:
2790        {
2791        dfa_recursion_info *ri;
2792        int local_offsets[1000];
2793        int local_workspace[1000];
2794        const pcre_uchar *callpat = start_code + GET(code, 1);
2795        int recno = (callpat == md->start_code)? 0 :
2796          GET2(callpat, 1 + LINK_SIZE);
2797        int rc;
2798
2799        DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2800
2801        /* Check for repeating a recursion without advancing the subject
2802        pointer. This should catch convoluted mutual recursions. (Some simple
2803        cases are caught at compile time.) */
2804
2805        for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2806          if (recno == ri->group_num && ptr == ri->subject_position)
2807            return PCRE_ERROR_RECURSELOOP;
2808
2809        /* Remember this recursion and where we started it so as to
2810        catch infinite loops. */
2811
2812        new_recursive.group_num = recno;
2813        new_recursive.subject_position = ptr;
2814        new_recursive.prevrec = md->recursive;
2815        md->recursive = &new_recursive;
2816
2817        rc = internal_dfa_exec(
2818          md,                                   /* fixed match data */
2819          callpat,                              /* this subexpression's code */
2820          ptr,                                  /* where we currently are */
2821          (int)(ptr - start_subject),           /* start offset */
2822          local_offsets,                        /* offset vector */
2823          sizeof(local_offsets)/sizeof(int),    /* size of same */
2824          local_workspace,                      /* workspace vector */
2825          sizeof(local_workspace)/sizeof(int),  /* size of same */
2826          rlevel);                              /* function recursion level */
2827
2828        md->recursive = new_recursive.prevrec;  /* Done this recursion */
2829
2830        DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2831          rc));
2832
2833        /* Ran out of internal offsets */
2834
2835        if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2836
2837        /* For each successful matched substring, set up the next state with a
2838        count of characters to skip before trying it. Note that the count is in
2839        characters, not bytes. */
2840
2841        if (rc > 0)
2842          {
2843          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2844            {
2845            int charcount = local_offsets[rc+1] - local_offsets[rc];
2846#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2847            if (utf)
2848              {
2849              const pcre_uchar *p = start_subject + local_offsets[rc];
2850              const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2851              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2852              }
2853#endif
2854            if (charcount > 0)
2855              {
2856              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2857              }
2858            else
2859              {
2860              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2861              }
2862            }
2863          }
2864        else if (rc != PCRE_ERROR_NOMATCH) return rc;
2865        }
2866      break;
2867
2868      /*-----------------------------------------------------------------*/
2869      case OP_BRAPOS:
2870      case OP_SBRAPOS:
2871      case OP_CBRAPOS:
2872      case OP_SCBRAPOS:
2873      case OP_BRAPOSZERO:
2874        {
2875        int charcount, matched_count;
2876        const pcre_uchar *local_ptr = ptr;
2877        BOOL allow_zero;
2878
2879        if (codevalue == OP_BRAPOSZERO)
2880          {
2881          allow_zero = TRUE;
2882          codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2883          }
2884        else allow_zero = FALSE;
2885
2886        /* Loop to match the subpattern as many times as possible as if it were
2887        a complete pattern. */
2888
2889        for (matched_count = 0;; matched_count++)
2890          {
2891          int local_offsets[2];
2892          int local_workspace[1000];
2893
2894          int rc = internal_dfa_exec(
2895            md,                                   /* fixed match data */
2896            code,                                 /* this subexpression's code */
2897            local_ptr,                            /* where we currently are */
2898            (int)(ptr - start_subject),           /* start offset */
2899            local_offsets,                        /* offset vector */
2900            sizeof(local_offsets)/sizeof(int),    /* size of same */
2901            local_workspace,                      /* workspace vector */
2902            sizeof(local_workspace)/sizeof(int),  /* size of same */
2903            rlevel);                              /* function recursion level */
2904
2905          /* Failed to match */
2906
2907          if (rc < 0)
2908            {
2909            if (rc != PCRE_ERROR_NOMATCH) return rc;
2910            break;
2911            }
2912
2913          /* Matched: break the loop if zero characters matched. */
2914
2915          charcount = local_offsets[1] - local_offsets[0];
2916          if (charcount == 0) break;
2917          local_ptr += charcount;    /* Advance temporary position ptr */
2918          }
2919
2920        /* At this point we have matched the subpattern matched_count
2921        times, and local_ptr is pointing to the character after the end of the
2922        last match. */
2923
2924        if (matched_count > 0 || allow_zero)
2925          {
2926          const pcre_uchar *end_subpattern = code;
2927          int next_state_offset;
2928
2929          do { end_subpattern += GET(end_subpattern, 1); }
2930            while (*end_subpattern == OP_ALT);
2931          next_state_offset =
2932            (int)(end_subpattern - start_code + LINK_SIZE + 1);
2933
2934          /* Optimization: if there are no more active states, and there
2935          are no new states yet set up, then skip over the subject string
2936          right here, to save looping. Otherwise, set up the new state to swing
2937          into action when the end of the matched substring is reached. */
2938
2939          if (i + 1 >= active_count && new_count == 0)
2940            {
2941            ptr = local_ptr;
2942            clen = 0;
2943            ADD_NEW(next_state_offset, 0);
2944            }
2945          else
2946            {
2947            const pcre_uchar *p = ptr;
2948            const pcre_uchar *pp = local_ptr;
2949            charcount = (int)(pp - p);
2950#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2951            if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2952#endif
2953            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2954            }
2955          }
2956        }
2957      break;
2958
2959      /*-----------------------------------------------------------------*/
2960      case OP_ONCE:
2961      case OP_ONCE_NC:
2962        {
2963        int local_offsets[2];
2964        int local_workspace[1000];
2965
2966        int rc = internal_dfa_exec(
2967          md,                                   /* fixed match data */
2968          code,                                 /* this subexpression's code */
2969          ptr,                                  /* where we currently are */
2970          (int)(ptr - start_subject),           /* start offset */
2971          local_offsets,                        /* offset vector */
2972          sizeof(local_offsets)/sizeof(int),    /* size of same */
2973          local_workspace,                      /* workspace vector */
2974          sizeof(local_workspace)/sizeof(int),  /* size of same */
2975          rlevel);                              /* function recursion level */
2976
2977        if (rc >= 0)
2978          {
2979          const pcre_uchar *end_subpattern = code;
2980          int charcount = local_offsets[1] - local_offsets[0];
2981          int next_state_offset, repeat_state_offset;
2982
2983          do { end_subpattern += GET(end_subpattern, 1); }
2984            while (*end_subpattern == OP_ALT);
2985          next_state_offset =
2986            (int)(end_subpattern - start_code + LINK_SIZE + 1);
2987
2988          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2989          arrange for the repeat state also to be added to the relevant list.
2990          Calculate the offset, or set -1 for no repeat. */
2991
2992          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2993                                 *end_subpattern == OP_KETRMIN)?
2994            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2995
2996          /* If we have matched an empty string, add the next state at the
2997          current character pointer. This is important so that the duplicate
2998          checking kicks in, which is what breaks infinite loops that match an
2999          empty string. */
3000
3001          if (charcount == 0)
3002            {
3003            ADD_ACTIVE(next_state_offset, 0);
3004            }
3005
3006          /* Optimization: if there are no more active states, and there
3007          are no new states yet set up, then skip over the subject string
3008          right here, to save looping. Otherwise, set up the new state to swing
3009          into action when the end of the matched substring is reached. */
3010
3011          else if (i + 1 >= active_count && new_count == 0)
3012            {
3013            ptr += charcount;
3014            clen = 0;
3015            ADD_NEW(next_state_offset, 0);
3016
3017            /* If we are adding a repeat state at the new character position,
3018            we must fudge things so that it is the only current state.
3019            Otherwise, it might be a duplicate of one we processed before, and
3020            that would cause it to be skipped. */
3021
3022            if (repeat_state_offset >= 0)
3023              {
3024              next_active_state = active_states;
3025              active_count = 0;
3026              i = -1;
3027              ADD_ACTIVE(repeat_state_offset, 0);
3028              }
3029            }
3030          else
3031            {
3032#if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3033            if (utf)
3034              {
3035              const pcre_uchar *p = start_subject + local_offsets[0];
3036              const pcre_uchar *pp = start_subject + local_offsets[1];
3037              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3038              }
3039#endif
3040            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3041            if (repeat_state_offset >= 0)
3042              { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3043            }
3044          }
3045        else if (rc != PCRE_ERROR_NOMATCH) return rc;
3046        }
3047      break;
3048
3049
3050/* ========================================================================== */
3051      /* Handle callouts */
3052
3053      case OP_CALLOUT:
3054      rrc = 0;
3055      if (PUBL(callout) != NULL)
3056        {
3057        PUBL(callout_block) cb;
3058        cb.version          = 1;   /* Version 1 of the callout block */
3059        cb.callout_number   = code[1];
3060        cb.offset_vector    = offsets;
3061#if defined COMPILE_PCRE8
3062        cb.subject          = (PCRE_SPTR)start_subject;
3063#elif defined COMPILE_PCRE16
3064        cb.subject          = (PCRE_SPTR16)start_subject;
3065#elif defined COMPILE_PCRE32
3066        cb.subject          = (PCRE_SPTR32)start_subject;
3067#endif
3068        cb.subject_length   = (int)(end_subject - start_subject);
3069        cb.start_match      = (int)(current_subject - start_subject);
3070        cb.current_position = (int)(ptr - start_subject);
3071        cb.pattern_position = GET(code, 2);
3072        cb.next_item_length = GET(code, 2 + LINK_SIZE);
3073        cb.capture_top      = 1;
3074        cb.capture_last     = -1;
3075        cb.callout_data     = md->callout_data;
3076        cb.mark             = NULL;   /* No (*MARK) support */
3077        if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
3078        }
3079      if (rrc == 0)
3080        { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3081      break;
3082
3083
3084/* ========================================================================== */
3085      default:        /* Unsupported opcode */
3086      return PCRE_ERROR_DFA_UITEM;
3087      }
3088
3089    NEXT_ACTIVE_STATE: continue;
3090
3091    }      /* End of loop scanning active states */
3092
3093  /* We have finished the processing at the current subject character. If no
3094  new states have been set for the next character, we have found all the
3095  matches that we are going to find. If we are at the top level and partial
3096  matching has been requested, check for appropriate conditions.
3097
3098  The "forced_fail" variable counts the number of (*F) encountered for the
3099  character. If it is equal to the original active_count (saved in
3100  workspace[1]) it means that (*F) was found on every active state. In this
3101  case we don't want to give a partial match.
3102
3103  The "could_continue" variable is true if a state could have continued but
3104  for the fact that the end of the subject was reached. */
3105
3106  if (new_count <= 0)
3107    {
3108    if (rlevel == 1 &&                               /* Top level, and */
3109        could_continue &&                            /* Some could go on, and */
3110        forced_fail != workspace[1] &&               /* Not all forced fail & */
3111        (                                            /* either... */
3112        (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3113        ||                                           /* or... */
3114        ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3115         match_count < 0)                            /* no matches */
3116        ) &&                                         /* And... */
3117        (
3118        partial_newline ||                           /* Either partial NL */
3119          (                                          /* or ... */
3120          ptr >= end_subject &&                /* End of subject and */
3121          ptr > md->start_used_ptr)            /* Inspected non-empty string */
3122          )
3123        )
3124      match_count = PCRE_ERROR_PARTIAL;
3125    DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3126      "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3127      rlevel*2-2, SP));
3128    break;        /* In effect, "return", but see the comment below */
3129    }
3130
3131  /* One or more states are active for the next character. */
3132
3133  ptr += clen;    /* Advance to next subject character */
3134  }               /* Loop to move along the subject string */
3135
3136/* Control gets here from "break" a few lines above. We do it this way because
3137if we use "return" above, we have compiler trouble. Some compilers warn if
3138there's nothing here because they think the function doesn't return a value. On
3139the other hand, if we put a dummy statement here, some more clever compilers
3140complain that it can't be reached. Sigh. */
3141
3142return match_count;
3143}
3144
3145
3146
3147
3148/*************************************************
3149*    Execute a Regular Expression - DFA engine   *
3150*************************************************/
3151
3152/* This external function applies a compiled re to a subject string using a DFA
3153engine. This function calls the internal function multiple times if the pattern
3154is not anchored.
3155
3156Arguments:
3157  argument_re     points to the compiled expression
3158  extra_data      points to extra data or is NULL
3159  subject         points to the subject string
3160  length          length of subject string (may contain binary zeros)
3161  start_offset    where to start in the subject string
3162  options         option bits
3163  offsets         vector of match offsets
3164  offsetcount     size of same
3165  workspace       workspace vector
3166  wscount         size of same
3167
3168Returns:          > 0 => number of match offset pairs placed in offsets
3169                  = 0 => offsets overflowed; longest matches are present
3170                   -1 => failed to match
3171                 < -1 => some kind of unexpected problem
3172*/
3173
3174#if defined COMPILE_PCRE8
3175PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3176pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3177  const char *subject, int length, int start_offset, int options, int *offsets,
3178  int offsetcount, int *workspace, int wscount)
3179#elif defined COMPILE_PCRE16
3180PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3181pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3182  PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3183  int offsetcount, int *workspace, int wscount)
3184#elif defined COMPILE_PCRE32
3185PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3186pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3187  PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3188  int offsetcount, int *workspace, int wscount)
3189#endif
3190{
3191REAL_PCRE *re = (REAL_PCRE *)argument_re;
3192dfa_match_data match_block;
3193dfa_match_data *md = &match_block;
3194BOOL utf, anchored, startline, firstline;
3195const pcre_uchar *current_subject, *end_subject;
3196const pcre_study_data *study = NULL;
3197
3198const pcre_uchar *req_char_ptr;
3199const pcre_uint8 *start_bits = NULL;
3200BOOL has_first_char = FALSE;
3201BOOL has_req_char = FALSE;
3202pcre_uchar first_char = 0;
3203pcre_uchar first_char2 = 0;
3204pcre_uchar req_char = 0;
3205pcre_uchar req_char2 = 0;
3206int newline;
3207
3208/* Plausibility checks */
3209
3210if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3211if (re == NULL || subject == NULL || workspace == NULL ||
3212   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3213if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3214if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3215if (length < 0) return PCRE_ERROR_BADLENGTH;
3216if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3217
3218/* Check that the first field in the block is the magic number. If it is not,
3219return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3220REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3221means that the pattern is likely compiled with different endianness. */
3222
3223if (re->magic_number != MAGIC_NUMBER)
3224  return re->magic_number == REVERSED_MAGIC_NUMBER?
3225    PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3226if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3227
3228/* If restarting after a partial match, do some sanity checks on the contents
3229of the workspace. */
3230
3231if ((options & PCRE_DFA_RESTART) != 0)
3232  {
3233  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3234    workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3235      return PCRE_ERROR_DFA_BADRESTART;
3236  }
3237
3238/* Set up study, callout, and table data */
3239
3240md->tables = re->tables;
3241md->callout_data = NULL;
3242
3243if (extra_data != NULL)
3244  {
3245  unsigned long int flags = extra_data->flags;
3246  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3247    study = (const pcre_study_data *)extra_data->study_data;
3248  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3249  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3250    return PCRE_ERROR_DFA_UMLIMIT;
3251  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3252    md->callout_data = extra_data->callout_data;
3253  if ((flags & PCRE_EXTRA_TABLES) != 0)
3254    md->tables = extra_data->tables;
3255  }
3256
3257/* Set some local values */
3258
3259current_subject = (const pcre_uchar *)subject + start_offset;
3260end_subject = (const pcre_uchar *)subject + length;
3261req_char_ptr = current_subject - 1;
3262
3263#ifdef SUPPORT_UTF
3264/* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3265utf = (re->options & PCRE_UTF8) != 0;
3266#else
3267utf = FALSE;
3268#endif
3269
3270anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3271  (re->options & PCRE_ANCHORED) != 0;
3272
3273/* The remaining fixed data for passing around. */
3274
3275md->start_code = (const pcre_uchar *)argument_re +
3276    re->name_table_offset + re->name_count * re->name_entry_size;
3277md->start_subject = (const pcre_uchar *)subject;
3278md->end_subject = end_subject;
3279md->start_offset = start_offset;
3280md->moptions = options;
3281md->poptions = re->options;
3282
3283/* If the BSR option is not set at match time, copy what was set
3284at compile time. */
3285
3286if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3287  {
3288  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3289    md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3290#ifdef BSR_ANYCRLF
3291  else md->moptions |= PCRE_BSR_ANYCRLF;
3292#endif
3293  }
3294
3295/* Handle different types of newline. The three bits give eight cases. If
3296nothing is set at run time, whatever was used at compile time applies. */
3297
3298switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3299         PCRE_NEWLINE_BITS)
3300  {
3301  case 0: newline = NEWLINE; break;   /* Compile-time default */
3302  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3303  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3304  case PCRE_NEWLINE_CR+
3305       PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3306  case PCRE_NEWLINE_ANY: newline = -1; break;
3307  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3308  default: return PCRE_ERROR_BADNEWLINE;
3309  }
3310
3311if (newline == -2)
3312  {
3313  md->nltype = NLTYPE_ANYCRLF;
3314  }
3315else if (newline < 0)
3316  {
3317  md->nltype = NLTYPE_ANY;
3318  }
3319else
3320  {
3321  md->nltype = NLTYPE_FIXED;
3322  if (newline > 255)
3323    {
3324    md->nllen = 2;
3325    md->nl[0] = (newline >> 8) & 255;
3326    md->nl[1] = newline & 255;
3327    }
3328  else
3329    {
3330    md->nllen = 1;
3331    md->nl[0] = newline;
3332    }
3333  }
3334
3335/* Check a UTF string if required. On failure, if offsetcount is at least 2,
3336the error offset and reason code are returned in offsets[0] and offsets[1]. */
3337
3338#ifdef SUPPORT_UTF
3339if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3340  {
3341  int erroroffset;
3342  int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3343  if (errorcode != 0)
3344    {
3345    if (offsetcount >= 2)
3346      {
3347      offsets[0] = erroroffset;
3348      offsets[1] = errorcode;
3349      }
3350#if defined COMPILE_PCRE8
3351    return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3352      PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3353#elif defined COMPILE_PCRE16
3354    return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3355      PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3356#elif defined COMPILE_PCRE32
3357    return PCRE_ERROR_BADUTF32;
3358#endif
3359    }
3360#if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3361  if (start_offset > 0 && start_offset < length &&
3362        NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3363    return PCRE_ERROR_BADUTF8_OFFSET;
3364#endif
3365  }
3366#endif
3367
3368/* If the exec call supplied NULL for tables, use the inbuilt ones. This
3369is a feature that makes it possible to save compiled regexes and re-use them
3370in other programs later. */
3371
3372if (md->tables == NULL) md->tables = PRIV(default_tables);
3373
3374/* The "must be at the start of a line" flags are used in a loop when finding
3375where to start. */
3376
3377startline = (re->flags & PCRE_STARTLINE) != 0;
3378firstline = (re->options & PCRE_FIRSTLINE) != 0;
3379
3380/* Set up the first character to match, if available. The first_char value is
3381never set for an anchored regular expression, but the anchoring may be forced
3382at run time, so we have to test for anchoring. The first char may be unset for
3383an unanchored pattern, of course. If there's no first char and the pattern was
3384studied, there may be a bitmap of possible first characters. */
3385
3386if (!anchored)
3387  {
3388  if ((re->flags & PCRE_FIRSTSET) != 0)
3389    {
3390    has_first_char = TRUE;
3391    first_char = first_char2 = (pcre_uchar)(re->first_char);
3392    if ((re->flags & PCRE_FCH_CASELESS) != 0)
3393      {
3394      first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3395#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3396      if (utf && first_char > 127)
3397        first_char2 = UCD_OTHERCASE(first_char);
3398#endif
3399      }
3400    }
3401  else
3402    {
3403    if (!startline && study != NULL &&
3404         (study->flags & PCRE_STUDY_MAPPED) != 0)
3405      start_bits = study->start_bits;
3406    }
3407  }
3408
3409/* For anchored or unanchored matches, there may be a "last known required
3410character" set. */
3411
3412if ((re->flags & PCRE_REQCHSET) != 0)
3413  {
3414  has_req_char = TRUE;
3415  req_char = req_char2 = (pcre_uchar)(re->req_char);
3416  if ((re->flags & PCRE_RCH_CASELESS) != 0)
3417    {
3418    req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3419#if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3420    if (utf && req_char > 127)
3421      req_char2 = UCD_OTHERCASE(req_char);
3422#endif
3423    }
3424  }
3425
3426/* Call the main matching function, looping for a non-anchored regex after a
3427failed match. If not restarting, perform certain optimizations at the start of
3428a match. */
3429
3430for (;;)
3431  {
3432  int rc;
3433
3434  if ((options & PCRE_DFA_RESTART) == 0)
3435    {
3436    const pcre_uchar *save_end_subject = end_subject;
3437
3438    /* If firstline is TRUE, the start of the match is constrained to the first
3439    line of a multiline string. Implement this by temporarily adjusting
3440    end_subject so that we stop scanning at a newline. If the match fails at
3441    the newline, later code breaks this loop. */
3442
3443    if (firstline)
3444      {
3445      PCRE_PUCHAR t = current_subject;
3446#ifdef SUPPORT_UTF
3447      if (utf)
3448        {
3449        while (t < md->end_subject && !IS_NEWLINE(t))
3450          {
3451          t++;
3452          ACROSSCHAR(t < end_subject, *t, t++);
3453          }
3454        }
3455      else
3456#endif
3457      while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3458      end_subject = t;
3459      }
3460
3461    /* There are some optimizations that avoid running the match if a known
3462    starting point is not found. However, there is an option that disables
3463    these, for testing and for ensuring that all callouts do actually occur.
3464    The option can be set in the regex by (*NO_START_OPT) or passed in
3465    match-time options. */
3466
3467    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3468      {
3469      /* Advance to a known first pcre_uchar (i.e. data item) */
3470
3471      if (has_first_char)
3472        {
3473        if (first_char != first_char2)
3474          {
3475          pcre_uchar csc;
3476          while (current_subject < end_subject &&
3477                 (csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
3478            current_subject++;
3479          }
3480        else
3481          while (current_subject < end_subject &&
3482                 UCHAR21TEST(current_subject) != first_char)
3483            current_subject++;
3484        }
3485
3486      /* Or to just after a linebreak for a multiline match if possible */
3487
3488      else if (startline)
3489        {
3490        if (current_subject > md->start_subject + start_offset)
3491          {
3492#ifdef SUPPORT_UTF
3493          if (utf)
3494            {
3495            while (current_subject < end_subject &&
3496                   !WAS_NEWLINE(current_subject))
3497              {
3498              current_subject++;
3499              ACROSSCHAR(current_subject < end_subject, *current_subject,
3500                current_subject++);
3501              }
3502            }
3503          else
3504#endif
3505          while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3506            current_subject++;
3507
3508          /* If we have just passed a CR and the newline option is ANY or
3509          ANYCRLF, and we are now at a LF, advance the match position by one
3510          more character. */
3511
3512          if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
3513               (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3514               current_subject < end_subject &&
3515               UCHAR21TEST(current_subject) == CHAR_NL)
3516            current_subject++;
3517          }
3518        }
3519
3520      /* Advance to a non-unique first pcre_uchar after study */
3521
3522      else if (start_bits != NULL)
3523        {
3524        while (current_subject < end_subject)
3525          {
3526          register pcre_uint32 c = UCHAR21TEST(current_subject);
3527#ifndef COMPILE_PCRE8
3528          if (c > 255) c = 255;
3529#endif
3530          if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3531          current_subject++;
3532          }
3533        }
3534      }
3535
3536    /* Restore fudged end_subject */
3537
3538    end_subject = save_end_subject;
3539
3540    /* The following two optimizations are disabled for partial matching or if
3541    disabling is explicitly requested (and of course, by the test above, this
3542    code is not obeyed when restarting after a partial match). */
3543
3544    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3545        (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3546      {
3547      /* If the pattern was studied, a minimum subject length may be set. This
3548      is only a lower bound; there may be no string of that length that matches
3549      the pattern. Although the value is, strictly, in characters, we treat it
3550      as in pcre_uchar units to avoid spending too much time in this
3551      optimization. */
3552
3553      if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3554          (pcre_uint32)(end_subject - current_subject) < study->minlength)
3555        return PCRE_ERROR_NOMATCH;
3556
3557      /* If req_char is set, we know that that pcre_uchar must appear in the
3558      subject for the match to succeed. If the first pcre_uchar is set,
3559      req_char must be later in the subject; otherwise the test starts at the
3560      match point. This optimization can save a huge amount of work in patterns
3561      with nested unlimited repeats that aren't going to match. Writing
3562      separate code for cased/caseless versions makes it go faster, as does
3563      using an autoincrement and backing off on a match.
3564
3565      HOWEVER: when the subject string is very, very long, searching to its end
3566      can take a long time, and give bad performance on quite ordinary
3567      patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3568      string... so we don't do this when the string is sufficiently long. */
3569
3570      if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3571        {
3572        register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3573
3574        /* We don't need to repeat the search if we haven't yet reached the
3575        place we found it at last time. */
3576
3577        if (p > req_char_ptr)
3578          {
3579          if (req_char != req_char2)
3580            {
3581            while (p < end_subject)
3582              {
3583              register pcre_uint32 pp = UCHAR21INCTEST(p);
3584              if (pp == req_char || pp == req_char2) { p--; break; }
3585              }
3586            }
3587          else
3588            {
3589            while (p < end_subject)
3590              {
3591              if (UCHAR21INCTEST(p) == req_char) { p--; break; }
3592              }
3593            }
3594
3595          /* If we can't find the required pcre_uchar, break the matching loop,
3596          which will cause a return of PCRE_ERROR_NOMATCH. */
3597
3598          if (p >= end_subject) break;
3599
3600          /* If we have found the required pcre_uchar, save the point where we
3601          found it, so that we don't search again next time round the loop if
3602          the start hasn't passed this point yet. */
3603
3604          req_char_ptr = p;
3605          }
3606        }
3607      }
3608    }   /* End of optimizations that are done when not restarting */
3609
3610  /* OK, now we can do the business */
3611
3612  md->start_used_ptr = current_subject;
3613  md->recursive = NULL;
3614
3615  rc = internal_dfa_exec(
3616    md,                                /* fixed match data */
3617    md->start_code,                    /* this subexpression's code */
3618    current_subject,                   /* where we currently are */
3619    start_offset,                      /* start offset in subject */
3620    offsets,                           /* offset vector */
3621    offsetcount,                       /* size of same */
3622    workspace,                         /* workspace vector */
3623    wscount,                           /* size of same */
3624    0);                                /* function recurse level */
3625
3626  /* Anything other than "no match" means we are done, always; otherwise, carry
3627  on only if not anchored. */
3628
3629  if (rc != PCRE_ERROR_NOMATCH || anchored)
3630    {
3631    if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3632      {
3633      offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3634      offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3635      if (offsetcount > 2)
3636        offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3637      }
3638    return rc;
3639    }
3640
3641  /* Advance to the next subject character unless we are at the end of a line
3642  and firstline is set. */
3643
3644  if (firstline && IS_NEWLINE(current_subject)) break;
3645  current_subject++;
3646#ifdef SUPPORT_UTF
3647  if (utf)
3648    {
3649    ACROSSCHAR(current_subject < end_subject, *current_subject,
3650      current_subject++);
3651    }
3652#endif
3653  if (current_subject > end_subject) break;
3654
3655  /* If we have just passed a CR and we are now at a LF, and the pattern does
3656  not contain any explicit matches for \r or \n, and the newline option is CRLF
3657  or ANY or ANYCRLF, advance the match position by one more character. */
3658
3659  if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
3660      current_subject < end_subject &&
3661      UCHAR21TEST(current_subject) == CHAR_NL &&
3662      (re->flags & PCRE_HASCRORLF) == 0 &&
3663        (md->nltype == NLTYPE_ANY ||
3664         md->nltype == NLTYPE_ANYCRLF ||
3665         md->nllen == 2))
3666    current_subject++;
3667
3668  }   /* "Bumpalong" loop */
3669
3670return PCRE_ERROR_NOMATCH;
3671}
3672
3673/* End of pcre_dfa_exec.c */
3674