1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9     Original API code Copyright (c) 1997-2012 University of Cambridge
10         New API code Copyright (c) 2016 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42/* This module contains the external function pcre2_dfa_match(), which is an
43alternative matching function that uses a sort of DFA algorithm (not a true
44FSM). This is NOT Perl-compatible, but it has advantages in certain
45applications. */
46
47
48/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49the performance of his patterns greatly. I could not use it as it stood, as it
50was not thread safe, and made assumptions about pattern sizes. Also, it caused
51test 7 to loop, and test 9 to crash with a segfault.
52
53The issue is the check for duplicate states, which is done by a simple linear
54search up the state list. (Grep for "duplicate" below to find the code.) For
55many patterns, there will never be many states active at one time, so a simple
56linear search is fine. In patterns that have many active states, it might be a
57bottleneck. The suggested code used an indexing scheme to remember which states
58had previously been used for each character, and avoided the linear search when
59it knew there was no chance of a duplicate. This was implemented when adding
60states to the state lists.
61
62I wrote some thread-safe, not-limited code to try something similar at the time
63of checking for duplicates (instead of when adding states), using index vectors
64on the stack. It did give a 13% improvement with one specially constructed
65pattern for certain subject strings, but on other strings and on many of the
66simpler patterns in the test suite it did worse. The major problem, I think,
67was the extra time to initialize the index. This had to be done for each call
68of internal_dfa_match(). (The supplied patch used a static vector, initialized
69only once - I suspect this was the cause of the problems with the tests.)
70
71Overall, I concluded that the gains in some cases did not outweigh the losses
72in others, so I abandoned this code. */
73
74
75#ifdef HAVE_CONFIG_H
76#include "config.h"
77#endif
78
79#define NLBLOCK mb             /* Block containing newline information */
80#define PSSTART start_subject  /* Field containing processed string start */
81#define PSEND   end_subject    /* Field containing processed string end */
82
83#include "pcre2_internal.h"
84
/* Mask made by OR-ing together a fixed set of PCRE2 match-time option bits.
NOTE(review): the place where this mask is tested is outside the visible part
of the file; presumably it is used to reject unsupported option bits passed to
pcre2_dfa_match() - confirm against the caller. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED | PCRE2_NOTBOL | PCRE2_NOTEOL | PCRE2_NOTEMPTY | \
   PCRE2_NOTEMPTY_ATSTART | PCRE2_NO_UTF_CHECK | PCRE2_PARTIAL_HARD | \
   PCRE2_PARTIAL_SOFT | PCRE2_DFA_SHORTEST | PCRE2_DFA_RESTART)
89
90
91/*************************************************
92*      Code parameters and static tables         *
93*************************************************/
94
/* Offsets that are added to OP_TYPESTAR and its companion opcodes in order to
turn them into different, private opcode values under special conditions. The
blocks are spaced 20 apart, which should be enough. The resulting values are
never stored in compiled code, so they need not fit below 256; they are placed
well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
105
106
107/* This table identifies those opcodes that are followed immediately by a
108character that is to be tested in some way. This makes it possible to
109centralize the loading of these characters. In the case of Type * etc, the
110"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111small value. Non-zero values in the table are the offsets from the opcode where
112the character is to be found. ***NOTE*** If the start of this table is
113modified, the three tables that follow must also be modified. */
114
115static const uint8_t coptable[] = {
116  0,                             /* End                                    */
117  0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
118  0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
119  0, 0, 0,                       /* Any, AllAny, Anybyte                   */
120  0, 0,                          /* \P, \p                                 */
121  0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
122  0,                             /* \X                                     */
123  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
124  1,                             /* Char                                   */
125  1,                             /* Chari                                  */
126  1,                             /* not                                    */
127  1,                             /* noti                                   */
128  /* Positive single-char repeats                                          */
129  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
130  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
131  1+IMM2_SIZE,                   /* exact                                  */
132  1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
133  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
134  1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
135  1+IMM2_SIZE,                   /* exact I                                */
136  1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
137  /* Negative single-char repeats - only for chars < 256                   */
138  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
139  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
140  1+IMM2_SIZE,                   /* NOT exact                              */
141  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
142  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
143  1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
144  1+IMM2_SIZE,                   /* NOT exact I                            */
145  1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
146  /* Positive type repeats                                                 */
147  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
148  1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
149  1+IMM2_SIZE,                   /* Type exact                             */
150  1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
151  /* Character class & ref repeats                                         */
152  0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
153  0, 0,                          /* CRRANGE, CRMINRANGE                    */
154  0, 0, 0, 0,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
155  0,                             /* CLASS                                  */
156  0,                             /* NCLASS                                 */
157  0,                             /* XCLASS - variable length               */
158  0,                             /* REF                                    */
159  0,                             /* REFI                                   */
160  0,                             /* DNREF                                  */
161  0,                             /* DNREFI                                 */
162  0,                             /* RECURSE                                */
163  0,                             /* CALLOUT                                */
164  0,                             /* CALLOUT_STR                            */
165  0,                             /* Alt                                    */
166  0,                             /* Ket                                    */
167  0,                             /* KetRmax                                */
168  0,                             /* KetRmin                                */
169  0,                             /* KetRpos                                */
170  0,                             /* Reverse                                */
171  0,                             /* Assert                                 */
172  0,                             /* Assert not                             */
173  0,                             /* Assert behind                          */
174  0,                             /* Assert behind not                      */
175  0, 0,                          /* ONCE, ONCE_NC                          */
176  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
177  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
178  0, 0,                          /* CREF, DNCREF                           */
179  0, 0,                          /* RREF, DNRREF                           */
180  0, 0,                          /* FALSE, TRUE                            */
181  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
182  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
183  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
184  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
185  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
186};
187
/* Table indexed by opcode. A non-zero entry marks an opcode that inspects a
character. The matcher consults it when the end of the subject has been
reached, so that it can remember that a character could have been inspected
there (needed when testing for a partial match).

***NOTE*** If the start of this table is modified, the two tables that follow
must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */

  /* Positive single-char repeats */

  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */

  /* Negative single-char repeats - only for chars < 256 */

  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */

  /* Positive type repeats */

  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */

  /* Character class & ref repeats */

  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */

  /* Nothing from here on is marked as inspecting a character */

  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0,                             /* Reverse                                */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0, 0,                          /* ONCE, ONCE_NC                          */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
  0, 0, 0                        /* CLOSE, SKIPZERO, DEFINE                */
};
260
261/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
262and \w */
263
264static const uint8_t toptable1[] = {
265  0, 0, 0, 0, 0, 0,
266  ctype_digit, ctype_digit,
267  ctype_space, ctype_space,
268  ctype_word,  ctype_word,
269  0, 0                            /* OP_ANY, OP_ALLANY */
270};
271
272static const uint8_t toptable2[] = {
273  0, 0, 0, 0, 0, 0,
274  ctype_digit, 0,
275  ctype_space, 0,
276  ctype_word,  0,
277  1, 1                            /* OP_ANY, OP_ALLANY */
278};
279
280
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which these
structures are laid out in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The whole replacement list is
parenthesized so that the macro behaves as a single operand wherever it is
used. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
293
294
295
296/*************************************************
297*     Match a Regular Expression - DFA engine    *
298*************************************************/
299
300/* This internal function applies a compiled pattern to a subject string,
301starting at a given point, using a DFA engine. This function is called from the
302external one, possibly multiple times if the pattern is not anchored. The
303function calls itself recursively for some kinds of subpattern.
304
305Arguments:
306  mb                the match_data block with fixed information
307  this_start_code   the opening bracket of this subexpression's code
308  current_subject   where we currently are in the subject string
309  start_offset      start offset in the subject string
310  offsets           vector to contain the matching string offsets
311  offsetcount       size of same
312  workspace         vector of workspace
313  wscount           size of same
314  rlevel            function call recursion level
315
316Returns:            > 0 => number of match offset pairs placed in offsets
317                    = 0 => offsets overflowed; longest matches are present
318                     -1 => failed to match
319                   < -1 => some kind of unexpected problem
320
321The following macros are used for adding states to the two state vectors (one
322for the current character, one for the following character). */
323
/* Append a state to the list for the current character. x is stored in the
offset field and y in the count field of the slot addressed by
next_active_state, which is then advanced; the data field is not written. The
macro relies on active_count, wscount and next_active_state being in scope.
active_count is incremented whether or not there is room. If there is no room
(the previous active_count was not less than wscount) the enclosing function
returns PCRE2_ERROR_DFA_WSSIZE. The do/while wrapper makes each use a single
statement that must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
332
/* Append a state, including its extra data, to the list for the current
character. x, y and z are stored in the offset, count and data fields of the
slot addressed by next_active_state, which is then advanced. The macro relies
on active_count, wscount and next_active_state being in scope. active_count is
incremented whether or not there is room. If there is no room (the previous
active_count was not less than wscount) the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. The do/while wrapper makes each use a single statement
that must be followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
342
/* Append a state to the list for the following character. x is stored in the
offset field and y in the count field of the slot addressed by next_new_state,
which is then advanced; the data field is not written. The macro relies on
new_count, wscount and next_new_state being in scope. new_count is incremented
whether or not there is room. If there is no room (the previous new_count was
not less than wscount) the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
The do/while wrapper makes each use a single statement that must be followed
by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
351
/* Append a state, including its extra data, to the list for the following
character. x, y and z are stored in the offset, count and data fields of the
slot addressed by next_new_state, which is then advanced. The macro relies on
new_count, wscount and next_new_state being in scope. new_count is incremented
whether or not there is room. If there is no room (the previous new_count was
not less than wscount) the enclosing function returns PCRE2_ERROR_DFA_WSSIZE.
The do/while wrapper makes each use a single statement that must be followed
by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
361
362/* And now, here is the code */
363
364static int
365internal_dfa_match(
366  dfa_match_block *mb,
367  PCRE2_SPTR this_start_code,
368  PCRE2_SPTR current_subject,
369  PCRE2_SIZE start_offset,
370  PCRE2_SIZE *offsets,
371  uint32_t offsetcount,
372  int *workspace,
373  int wscount,
374  int  rlevel)
375{
376stateblock *active_states, *new_states, *temp_states;
377stateblock *next_active_state, *next_new_state;
378
379const uint8_t *ctypes, *lcc, *fcc;
380PCRE2_SPTR ptr;
381PCRE2_SPTR end_code;
382PCRE2_SPTR first_op;
383
384dfa_recursion_info new_recursive;
385
386int active_count, new_count, match_count;
387
388/* Some fields in the mb block are frequently referenced, so we load them into
389independent variables in the hope that this will perform better. */
390
391PCRE2_SPTR start_subject = mb->start_subject;
392PCRE2_SPTR end_subject = mb->end_subject;
393PCRE2_SPTR start_code = mb->start_code;
394
395#ifdef SUPPORT_UNICODE
396BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
397#else
398BOOL utf = FALSE;
399#endif
400
401BOOL reset_could_continue = FALSE;
402
403rlevel++;
404offsetcount &= (uint32_t)(-2);  /* Round down */
405
406wscount -= 2;
407wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
408          (2 * INTS_PER_STATEBLOCK);
409
410ctypes = mb->tables + ctypes_offset;
411lcc = mb->tables + lcc_offset;
412fcc = mb->tables + fcc_offset;
413
414match_count = PCRE2_ERROR_NOMATCH;   /* A negative number */
415
416active_states = (stateblock *)(workspace + 2);
417next_new_state = new_states = active_states + wscount;
418new_count = 0;
419
420first_op = this_start_code + 1 + LINK_SIZE +
421  ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
422    *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
423    ? IMM2_SIZE:0);
424
425/* The first thing in any (sub) pattern is a bracket of some sort. Push all
426the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
428matching internal ket rather than at the end.
429
430If the first opcode in the first alternative is OP_REVERSE, we are dealing with
431a backward assertion. In that case, we have to find out the maximum amount to
432move back, and set up each alternative appropriately. */
433
434if (*first_op == OP_REVERSE)
435  {
436  size_t max_back = 0;
437  size_t gone_back;
438
439  end_code = this_start_code;
440  do
441    {
442    size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
443    if (back > max_back) max_back = back;
444    end_code += GET(end_code, 1);
445    }
446  while (*end_code == OP_ALT);
447
448  /* If we can't go back the amount required for the longest lookbehind
449  pattern, go back as far as we can; some alternatives may still be viable. */
450
451#ifdef SUPPORT_UNICODE
452  /* In character mode we have to step back character by character */
453
454  if (utf)
455    {
456    for (gone_back = 0; gone_back < max_back; gone_back++)
457      {
458      if (current_subject <= start_subject) break;
459      current_subject--;
460      ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
461      }
462    }
463  else
464#endif
465
466  /* In byte-mode we can do this quickly. */
467
468    {
469    size_t current_offset = (size_t)(current_subject - start_subject);
470    gone_back = (current_offset < max_back)? current_offset : max_back;
471    current_subject -= gone_back;
472    }
473
474  /* Save the earliest consulted character */
475
476  if (current_subject < mb->start_used_ptr)
477    mb->start_used_ptr = current_subject;
478
479  /* Now we can process the individual branches. */
480
481  end_code = this_start_code;
482  do
483    {
484    size_t back = (size_t)GET(end_code, 2+LINK_SIZE);
485    if (back <= gone_back)
486      {
487      int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
488      ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
489      }
490    end_code += GET(end_code, 1);
491    }
492  while (*end_code == OP_ALT);
493 }
494
495/* This is the code for a "normal" subpattern (not a backward assertion). The
496start of a whole pattern is always one of these. If we are at the top level,
497we may be asked to restart matching from the same point that we reached for a
498previous partial match. We still have to scan through the top-level branches to
499find the end state. */
500
501else
502  {
503  end_code = this_start_code;
504
505  /* Restarting */
506
507  if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
508    {
509    do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
510    new_count = workspace[1];
511    if (!workspace[0])
512      memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
513    }
514
515  /* Not restarting */
516
517  else
518    {
519    int length = 1 + LINK_SIZE +
520      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
521        *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
522        ? IMM2_SIZE:0);
523    do
524      {
525      ADD_NEW((int)(end_code - start_code + length), 0);
526      end_code += GET(end_code, 1);
527      length = 1 + LINK_SIZE;
528      }
529    while (*end_code == OP_ALT);
530    }
531  }
532
533workspace[0] = 0;    /* Bit indicating which vector is current */
534
535/* Loop for scanning the subject */
536
537ptr = current_subject;
538for (;;)
539  {
540  int i, j;
541  int clen, dlen;
542  uint32_t c, d;
543  int forced_fail = 0;
544  BOOL partial_newline = FALSE;
545  BOOL could_continue = reset_could_continue;
546  reset_could_continue = FALSE;
547
548  if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
549
550  /* Make the new state list into the active state list and empty the
551  new state list. */
552
553  temp_states = active_states;
554  active_states = new_states;
555  new_states = temp_states;
556  active_count = new_count;
557  new_count = 0;
558
559  workspace[0] ^= 1;              /* Remember for the restarting feature */
560  workspace[1] = active_count;
561
562  /* Set the pointers for adding new states */
563
564  next_active_state = active_states + active_count;
565  next_new_state = new_states;
566
567  /* Load the current character from the subject outside the loop, as many
568  different states may want to look at it, and we assume that at least one
569  will. */
570
571  if (ptr < end_subject)
572    {
573    clen = 1;        /* Number of data items in the character */
574#ifdef SUPPORT_UNICODE
575    GETCHARLENTEST(c, ptr, clen);
576#else
577    c = *ptr;
578#endif  /* SUPPORT_UNICODE */
579    }
580  else
581    {
582    clen = 0;        /* This indicates the end of the subject */
583    c = NOTACHAR;    /* This value should never actually be used */
584    }
585
586  /* Scan up the active states and act on each one. The result of an action
587  may be to add more states to the currently active list (e.g. on hitting a
588  parenthesis) or it may be to put states on the new list, for considering
589  when we move the character pointer on. */
590
591  for (i = 0; i < active_count; i++)
592    {
593    stateblock *current_state = active_states + i;
594    BOOL caseless = FALSE;
595    PCRE2_SPTR code;
596    uint32_t codevalue;
597    int state_offset = current_state->offset;
598    int rrc;
599    int count;
600
601    /* A negative offset is a special case meaning "hold off going to this
602    (negated) state until the number of characters in the data field have
603    been skipped". If the could_continue flag was passed over from a previous
    state, arrange for it to be passed on. */
605
606    if (state_offset < 0)
607      {
608      if (current_state->data > 0)
609        {
610        ADD_NEW_DATA(state_offset, current_state->count,
611          current_state->data - 1);
612        if (could_continue) reset_could_continue = TRUE;
613        continue;
614        }
615      else
616        {
617        current_state->offset = state_offset = -state_offset;
618        }
619      }
620
621    /* Check for a duplicate state with the same count, and skip if found.
622    See the note at the head of this module about the possibility of improving
623    performance here. */
624
625    for (j = 0; j < i; j++)
626      {
627      if (active_states[j].offset == state_offset &&
628          active_states[j].count == current_state->count)
629        goto NEXT_ACTIVE_STATE;
630      }
631
632    /* The state offset is the offset to the opcode */
633
634    code = start_code + state_offset;
635    codevalue = *code;
636
637    /* If this opcode inspects a character, but we are at the end of the
638    subject, remember the fact for use when testing for a partial match. */
639
640    if (clen == 0 && poptable[codevalue] != 0)
641      could_continue = TRUE;
642
643    /* If this opcode is followed by an inline character, load it. It is
644    tempting to test for the presence of a subject character here, but that
645    is wrong, because sometimes zero repetitions of the subject are
646    permitted.
647
648    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
649    argument that is not a data character - but is always one byte long because
650    the values are small. We have to take special action to deal with  \P, \p,
651    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
652    these ones to new opcodes. */
653
654    if (coptable[codevalue] > 0)
655      {
656      dlen = 1;
657#ifdef SUPPORT_UNICODE
658      if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
659#endif  /* SUPPORT_UNICODE */
660      d = code[coptable[codevalue]];
661      if (codevalue >= OP_TYPESTAR)
662        {
663        switch(d)
664          {
665          case OP_ANYBYTE: return PCRE2_ERROR_DFA_UITEM;
666          case OP_NOTPROP:
667          case OP_PROP: codevalue += OP_PROP_EXTRA; break;
668          case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
669          case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
670          case OP_NOT_HSPACE:
671          case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
672          case OP_NOT_VSPACE:
673          case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
674          default: break;
675          }
676        }
677      }
678    else
679      {
680      dlen = 0;         /* Not strictly necessary, but compilers moan */
681      d = NOTACHAR;     /* if these variables are not set. */
682      }
683
684
685    /* Now process the individual opcodes */
686
687    switch (codevalue)
688      {
689/* ========================================================================== */
690      /* These cases are never obeyed. This is a fudge that causes a compile-
691      time error if the vectors coptable or poptable, which are indexed by
692      opcode, are not the correct length. It seems to be the only way to do
693      such a check at compile time, as the sizeof() operator does not work
694      in the C preprocessor. */
695
696      case OP_TABLE_LENGTH:
697      case OP_TABLE_LENGTH +
698        ((sizeof(coptable) == OP_TABLE_LENGTH) &&
699         (sizeof(poptable) == OP_TABLE_LENGTH)):
700      break;
701
702/* ========================================================================== */
703      /* Reached a closing bracket. If not at the end of the pattern, carry
704      on with the next opcode. For repeating opcodes, also add the repeat
705      state. Note that KETRPOS will always be encountered at the end of the
706      subpattern, because the possessive subpattern repeats are always handled
707      using recursive calls. Thus, it never adds any new states.
708
709      At the end of the (sub)pattern, unless we have an empty string and
710      PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
711      start of the subject, save the match data, shifting up all previous
712      matches so we always have the longest first. */
713
714      case OP_KET:
715      case OP_KETRMIN:
716      case OP_KETRMAX:
717      case OP_KETRPOS:
718      if (code != end_code)
719        {
720        ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
721        if (codevalue != OP_KET)
722          {
723          ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
724          }
725        }
726      else
727        {
728        if (ptr > current_subject ||
729            ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
730              ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
731                current_subject > start_subject + mb->start_offset)))
732          {
733          if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
734            else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
735              match_count = 0;
736          count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
737          if (count > 0) memmove(offsets + 2, offsets,
738            (size_t)count * sizeof(PCRE2_SIZE));
739          if (offsetcount >= 2)
740            {
741            offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
742            offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
743            }
744          if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
745          }
746        }
747      break;
748
749/* ========================================================================== */
750      /* These opcodes add to the current list of states without looking
751      at the current character. */
752
753      /*-----------------------------------------------------------------*/
754      case OP_ALT:
755      do { code += GET(code, 1); } while (*code == OP_ALT);
756      ADD_ACTIVE((int)(code - start_code), 0);
757      break;
758
759      /*-----------------------------------------------------------------*/
760      case OP_BRA:
761      case OP_SBRA:
762      do
763        {
764        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
765        code += GET(code, 1);
766        }
767      while (*code == OP_ALT);
768      break;
769
770      /*-----------------------------------------------------------------*/
771      case OP_CBRA:
772      case OP_SCBRA:
773      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
774      code += GET(code, 1);
775      while (*code == OP_ALT)
776        {
777        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
778        code += GET(code, 1);
779        }
780      break;
781
782      /*-----------------------------------------------------------------*/
783      case OP_BRAZERO:
784      case OP_BRAMINZERO:
785      ADD_ACTIVE(state_offset + 1, 0);
786      code += 1 + GET(code, 2);
787      while (*code == OP_ALT) code += GET(code, 1);
788      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
789      break;
790
791      /*-----------------------------------------------------------------*/
792      case OP_SKIPZERO:
793      code += 1 + GET(code, 2);
794      while (*code == OP_ALT) code += GET(code, 1);
795      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
796      break;
797
798      /*-----------------------------------------------------------------*/
799      case OP_CIRC:
800      if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
801        { ADD_ACTIVE(state_offset + 1, 0); }
802      break;
803
804      /*-----------------------------------------------------------------*/
805      case OP_CIRCM:
806      if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
807          ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
808            && WAS_NEWLINE(ptr)))
809        { ADD_ACTIVE(state_offset + 1, 0); }
810      break;
811
812      /*-----------------------------------------------------------------*/
813      case OP_EOD:
814      if (ptr >= end_subject)
815        {
816        if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
817          could_continue = TRUE;
818        else { ADD_ACTIVE(state_offset + 1, 0); }
819        }
820      break;
821
822      /*-----------------------------------------------------------------*/
823      case OP_SOD:
824      if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
825      break;
826
827      /*-----------------------------------------------------------------*/
828      case OP_SOM:
829      if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
830      break;
831
832
833/* ========================================================================== */
834      /* These opcodes inspect the next subject character, and sometimes
835      the previous one as well, but do not have an argument. The variable
836      clen contains the length of the current character and is zero if we are
837      at the end of the subject. */
838
839      /*-----------------------------------------------------------------*/
840      case OP_ANY:
841      if (clen > 0 && !IS_NEWLINE(ptr))
842        {
843        if (ptr + 1 >= mb->end_subject &&
844            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
845            NLBLOCK->nltype == NLTYPE_FIXED &&
846            NLBLOCK->nllen == 2 &&
847            c == NLBLOCK->nl[0])
848          {
849          could_continue = partial_newline = TRUE;
850          }
851        else
852          {
853          ADD_NEW(state_offset + 1, 0);
854          }
855        }
856      break;
857
858      /*-----------------------------------------------------------------*/
859      case OP_ALLANY:
860      if (clen > 0)
861        { ADD_NEW(state_offset + 1, 0); }
862      break;
863
864      /*-----------------------------------------------------------------*/
865      case OP_EODN:
866      if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
867        could_continue = TRUE;
868      else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
869        { ADD_ACTIVE(state_offset + 1, 0); }
870      break;
871
872      /*-----------------------------------------------------------------*/
873      case OP_DOLL:
874      if ((mb->moptions & PCRE2_NOTEOL) == 0)
875        {
876        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
877          could_continue = TRUE;
878        else if (clen == 0 ||
879            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
880               (ptr == end_subject - mb->nllen)
881            ))
882          { ADD_ACTIVE(state_offset + 1, 0); }
883        else if (ptr + 1 >= mb->end_subject &&
884                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
885                 NLBLOCK->nltype == NLTYPE_FIXED &&
886                 NLBLOCK->nllen == 2 &&
887                 c == NLBLOCK->nl[0])
888          {
889          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
890            {
891            reset_could_continue = TRUE;
892            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
893            }
894          else could_continue = partial_newline = TRUE;
895          }
896        }
897      break;
898
899      /*-----------------------------------------------------------------*/
900      case OP_DOLLM:
901      if ((mb->moptions & PCRE2_NOTEOL) == 0)
902        {
903        if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
904          could_continue = TRUE;
905        else if (clen == 0 ||
906            ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
907          { ADD_ACTIVE(state_offset + 1, 0); }
908        else if (ptr + 1 >= mb->end_subject &&
909                 (mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0 &&
910                 NLBLOCK->nltype == NLTYPE_FIXED &&
911                 NLBLOCK->nllen == 2 &&
912                 c == NLBLOCK->nl[0])
913          {
914          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
915            {
916            reset_could_continue = TRUE;
917            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
918            }
919          else could_continue = partial_newline = TRUE;
920          }
921        }
922      else if (IS_NEWLINE(ptr))
923        { ADD_ACTIVE(state_offset + 1, 0); }
924      break;
925
926      /*-----------------------------------------------------------------*/
927
928      case OP_DIGIT:
929      case OP_WHITESPACE:
930      case OP_WORDCHAR:
931      if (clen > 0 && c < 256 &&
932            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
933        { ADD_NEW(state_offset + 1, 0); }
934      break;
935
936      /*-----------------------------------------------------------------*/
937      case OP_NOT_DIGIT:
938      case OP_NOT_WHITESPACE:
939      case OP_NOT_WORDCHAR:
940      if (clen > 0 && (c >= 256 ||
941            ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
942        { ADD_NEW(state_offset + 1, 0); }
943      break;
944
945      /*-----------------------------------------------------------------*/
946      case OP_WORD_BOUNDARY:
947      case OP_NOT_WORD_BOUNDARY:
948        {
949        int left_word, right_word;
950
951        if (ptr > start_subject)
952          {
953          PCRE2_SPTR temp = ptr - 1;
954          if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
955#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
956          if (utf) { BACKCHAR(temp); }
957#endif
958          GETCHARTEST(d, temp);
959#ifdef SUPPORT_UNICODE
960          if ((mb->poptions & PCRE2_UCP) != 0)
961            {
962            if (d == '_') left_word = TRUE; else
963              {
964              uint32_t cat = UCD_CATEGORY(d);
965              left_word = (cat == ucp_L || cat == ucp_N);
966              }
967            }
968          else
969#endif
970          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
971          }
972        else left_word = FALSE;
973
974        if (clen > 0)
975          {
976          if (ptr >= mb->last_used_ptr)
977            {
978            PCRE2_SPTR temp = ptr + 1;
979#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
980            if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
981#endif
982            mb->last_used_ptr = temp;
983            }
984#ifdef SUPPORT_UNICODE
985          if ((mb->poptions & PCRE2_UCP) != 0)
986            {
987            if (c == '_') right_word = TRUE; else
988              {
989              uint32_t cat = UCD_CATEGORY(c);
990              right_word = (cat == ucp_L || cat == ucp_N);
991              }
992            }
993          else
994#endif
995          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
996          }
997        else right_word = FALSE;
998
999        if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1000          { ADD_ACTIVE(state_offset + 1, 0); }
1001        }
1002      break;
1003
1004
1005      /*-----------------------------------------------------------------*/
1006      /* Check the next character by Unicode property. We will get here only
1007      if the support is in the binary; otherwise a compile-time error occurs.
1008      */
1009
1010#ifdef SUPPORT_UNICODE
1011      case OP_PROP:
1012      case OP_NOTPROP:
1013      if (clen > 0)
1014        {
1015        BOOL OK;
1016        const uint32_t *cp;
1017        const ucd_record * prop = GET_UCD(c);
1018        switch(code[1])
1019          {
1020          case PT_ANY:
1021          OK = TRUE;
1022          break;
1023
1024          case PT_LAMP:
1025          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1026               prop->chartype == ucp_Lt;
1027          break;
1028
1029          case PT_GC:
1030          OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1031          break;
1032
1033          case PT_PC:
1034          OK = prop->chartype == code[2];
1035          break;
1036
1037          case PT_SC:
1038          OK = prop->script == code[2];
1039          break;
1040
1041          /* These are specials for combination cases. */
1042
1043          case PT_ALNUM:
1044          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1045               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1046          break;
1047
1048          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1049          which means that Perl space and POSIX space are now identical. PCRE
1050          was changed at release 8.34. */
1051
1052          case PT_SPACE:    /* Perl space */
1053          case PT_PXSPACE:  /* POSIX space */
1054          switch(c)
1055            {
1056            HSPACE_CASES:
1057            VSPACE_CASES:
1058            OK = TRUE;
1059            break;
1060
1061            default:
1062            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1063            break;
1064            }
1065          break;
1066
1067          case PT_WORD:
1068          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1069               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1070               c == CHAR_UNDERSCORE;
1071          break;
1072
1073          case PT_CLIST:
1074          cp = PRIV(ucd_caseless_sets) + code[2];
1075          for (;;)
1076            {
1077            if (c < *cp) { OK = FALSE; break; }
1078            if (c == *cp++) { OK = TRUE; break; }
1079            }
1080          break;
1081
1082          case PT_UCNC:
1083          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1084               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1085               c >= 0xe000;
1086          break;
1087
1088          /* Should never occur, but keep compilers from grumbling. */
1089
1090          default:
1091          OK = codevalue != OP_PROP;
1092          break;
1093          }
1094
1095        if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1096        }
1097      break;
1098#endif
1099
1100
1101
1102/* ========================================================================== */
1103      /* These opcodes likewise inspect the subject character, but have an
1104      argument that is not a data character. It is one of: OP_ANY, OP_ALLANY,
1105      OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
1106      OP_NOT_WORDCHAR. The value is loaded into d. */
1107
1108      case OP_TYPEPLUS:
1109      case OP_TYPEMINPLUS:
1110      case OP_TYPEPOSPLUS:
1111      count = current_state->count;  /* Already matched */
1112      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113      if (clen > 0)
1114        {
1115        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1116            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1117            NLBLOCK->nltype == NLTYPE_FIXED &&
1118            NLBLOCK->nllen == 2 &&
1119            c == NLBLOCK->nl[0])
1120          {
1121          could_continue = partial_newline = TRUE;
1122          }
1123        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1124            (c < 256 &&
1125              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1126              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1127          {
1128          if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1129            {
1130            active_count--;            /* Remove non-match possibility */
1131            next_active_state--;
1132            }
1133          count++;
1134          ADD_NEW(state_offset, count);
1135          }
1136        }
1137      break;
1138
1139      /*-----------------------------------------------------------------*/
1140      case OP_TYPEQUERY:
1141      case OP_TYPEMINQUERY:
1142      case OP_TYPEPOSQUERY:
1143      ADD_ACTIVE(state_offset + 2, 0);
1144      if (clen > 0)
1145        {
1146        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1147            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1148            NLBLOCK->nltype == NLTYPE_FIXED &&
1149            NLBLOCK->nllen == 2 &&
1150            c == NLBLOCK->nl[0])
1151          {
1152          could_continue = partial_newline = TRUE;
1153          }
1154        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155            (c < 256 &&
1156              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1157              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158          {
1159          if (codevalue == OP_TYPEPOSQUERY)
1160            {
1161            active_count--;            /* Remove non-match possibility */
1162            next_active_state--;
1163            }
1164          ADD_NEW(state_offset + 2, 0);
1165          }
1166        }
1167      break;
1168
1169      /*-----------------------------------------------------------------*/
1170      case OP_TYPESTAR:
1171      case OP_TYPEMINSTAR:
1172      case OP_TYPEPOSSTAR:
1173      ADD_ACTIVE(state_offset + 2, 0);
1174      if (clen > 0)
1175        {
1176        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1177            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1178            NLBLOCK->nltype == NLTYPE_FIXED &&
1179            NLBLOCK->nllen == 2 &&
1180            c == NLBLOCK->nl[0])
1181          {
1182          could_continue = partial_newline = TRUE;
1183          }
1184        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1185            (c < 256 &&
1186              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1187              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1188          {
1189          if (codevalue == OP_TYPEPOSSTAR)
1190            {
1191            active_count--;            /* Remove non-match possibility */
1192            next_active_state--;
1193            }
1194          ADD_NEW(state_offset, 0);
1195          }
1196        }
1197      break;
1198
1199      /*-----------------------------------------------------------------*/
1200      case OP_TYPEEXACT:
1201      count = current_state->count;  /* Number already matched */
1202      if (clen > 0)
1203        {
1204        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1205            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1206            NLBLOCK->nltype == NLTYPE_FIXED &&
1207            NLBLOCK->nllen == 2 &&
1208            c == NLBLOCK->nl[0])
1209          {
1210          could_continue = partial_newline = TRUE;
1211          }
1212        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1213            (c < 256 &&
1214              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1215              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1216          {
1217          if (++count >= (int)GET2(code, 1))
1218            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1219          else
1220            { ADD_NEW(state_offset, count); }
1221          }
1222        }
1223      break;
1224
1225      /*-----------------------------------------------------------------*/
1226      case OP_TYPEUPTO:
1227      case OP_TYPEMINUPTO:
1228      case OP_TYPEPOSUPTO:
1229      ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1230      count = current_state->count;  /* Number already matched */
1231      if (clen > 0)
1232        {
1233        if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1234            (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1235            NLBLOCK->nltype == NLTYPE_FIXED &&
1236            NLBLOCK->nllen == 2 &&
1237            c == NLBLOCK->nl[0])
1238          {
1239          could_continue = partial_newline = TRUE;
1240          }
1241        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1242            (c < 256 &&
1243              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1244              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1245          {
1246          if (codevalue == OP_TYPEPOSUPTO)
1247            {
1248            active_count--;           /* Remove non-match possibility */
1249            next_active_state--;
1250            }
1251          if (++count >= (int)GET2(code, 1))
1252            { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1253          else
1254            { ADD_NEW(state_offset, count); }
1255          }
1256        }
1257      break;
1258
1259/* ========================================================================== */
1260      /* These are virtual opcodes that are used when something like
1261      OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE,
1262      OP_NOT_HSPACE, OP_VSPACE, or OP_NOT_VSPACE as its argument. It keeps the
1263      code above fast for the other cases. The argument is in the d variable. */
1264
1265#ifdef SUPPORT_UNICODE
1266      case OP_PROP_EXTRA + OP_TYPEPLUS:
1267      case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1268      case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1269      count = current_state->count;           /* Already matched */
1270      if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1271      if (clen > 0)
1272        {
1273        BOOL OK;
1274        const uint32_t *cp;
1275        const ucd_record * prop = GET_UCD(c);
1276        switch(code[2])
1277          {
1278          case PT_ANY:
1279          OK = TRUE;
1280          break;
1281
1282          case PT_LAMP:
1283          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1284            prop->chartype == ucp_Lt;
1285          break;
1286
1287          case PT_GC:
1288          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1289          break;
1290
1291          case PT_PC:
1292          OK = prop->chartype == code[3];
1293          break;
1294
1295          case PT_SC:
1296          OK = prop->script == code[3];
1297          break;
1298
1299          /* These are specials for combination cases. */
1300
1301          case PT_ALNUM:
1302          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1303               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1304          break;
1305
1306          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1307          which means that Perl space and POSIX space are now identical. PCRE
1308          was changed at release 8.34. */
1309
1310          case PT_SPACE:    /* Perl space */
1311          case PT_PXSPACE:  /* POSIX space */
1312          switch(c)
1313            {
1314            HSPACE_CASES:
1315            VSPACE_CASES:
1316            OK = TRUE;
1317            break;
1318
1319            default:
1320            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1321            break;
1322            }
1323          break;
1324
1325          case PT_WORD:
1326          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1327               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1328               c == CHAR_UNDERSCORE;
1329          break;
1330
1331          case PT_CLIST:
1332          cp = PRIV(ucd_caseless_sets) + code[3];
1333          for (;;)
1334            {
1335            if (c < *cp) { OK = FALSE; break; }
1336            if (c == *cp++) { OK = TRUE; break; }
1337            }
1338          break;
1339
1340          case PT_UCNC:
1341          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1342               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1343               c >= 0xe000;
1344          break;
1345
1346          /* Should never occur, but keep compilers from grumbling. */
1347
1348          default:
1349          OK = codevalue != OP_PROP;
1350          break;
1351          }
1352
1353        if (OK == (d == OP_PROP))
1354          {
1355          if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1356            {
1357            active_count--;           /* Remove non-match possibility */
1358            next_active_state--;
1359            }
1360          count++;
1361          ADD_NEW(state_offset, count);
1362          }
1363        }
1364      break;
1365
1366      /*-----------------------------------------------------------------*/
1367      case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1368      case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1369      case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1370      count = current_state->count;  /* Already matched */
1371      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1372      if (clen > 0)
1373        {
1374        uint32_t lgb, rgb;
1375        PCRE2_SPTR nptr = ptr + clen;
1376        int ncount = 0;
1377        if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1378          {
1379          active_count--;           /* Remove non-match possibility */
1380          next_active_state--;
1381          }
1382        lgb = UCD_GRAPHBREAK(c);
1383        while (nptr < end_subject)
1384          {
1385          dlen = 1;
1386          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1387          rgb = UCD_GRAPHBREAK(d);
1388          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1389          ncount++;
1390          lgb = rgb;
1391          nptr += dlen;
1392          }
1393        count++;
1394        ADD_NEW_DATA(-state_offset, count, ncount);
1395        }
1396      break;
1397#endif
1398
1399      /*-----------------------------------------------------------------*/
1400      case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1401      case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1402      case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1403      count = current_state->count;  /* Already matched */
1404      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1405      if (clen > 0)
1406        {
1407        int ncount = 0;
1408        switch (c)
1409          {
1410          case CHAR_VT:
1411          case CHAR_FF:
1412          case CHAR_NEL:
1413#ifndef EBCDIC
1414          case 0x2028:
1415          case 0x2029:
1416#endif  /* Not EBCDIC */
1417          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1418          goto ANYNL01;
1419
1420          case CHAR_CR:
1421          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1422          /* Fall through */
1423
1424          ANYNL01:
1425          case CHAR_LF:
1426          if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1427            {
1428            active_count--;           /* Remove non-match possibility */
1429            next_active_state--;
1430            }
1431          count++;
1432          ADD_NEW_DATA(-state_offset, count, ncount);
1433          break;
1434
1435          default:
1436          break;
1437          }
1438        }
1439      break;
1440
1441      /*-----------------------------------------------------------------*/
1442      case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1443      case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1444      case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1445      count = current_state->count;  /* Already matched */
1446      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1447      if (clen > 0)
1448        {
1449        BOOL OK;
1450        switch (c)
1451          {
1452          VSPACE_CASES:
1453          OK = TRUE;
1454          break;
1455
1456          default:
1457          OK = FALSE;
1458          break;
1459          }
1460
1461        if (OK == (d == OP_VSPACE))
1462          {
1463          if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464            {
1465            active_count--;           /* Remove non-match possibility */
1466            next_active_state--;
1467            }
1468          count++;
1469          ADD_NEW_DATA(-state_offset, count, 0);
1470          }
1471        }
1472      break;
1473
1474      /*-----------------------------------------------------------------*/
1475      case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476      case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477      case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478      count = current_state->count;  /* Already matched */
1479      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480      if (clen > 0)
1481        {
1482        BOOL OK;
1483        switch (c)
1484          {
1485          HSPACE_CASES:
1486          OK = TRUE;
1487          break;
1488
1489          default:
1490          OK = FALSE;
1491          break;
1492          }
1493
1494        if (OK == (d == OP_HSPACE))
1495          {
1496          if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1497            {
1498            active_count--;           /* Remove non-match possibility */
1499            next_active_state--;
1500            }
1501          count++;
1502          ADD_NEW_DATA(-state_offset, count, 0);
1503          }
1504        }
1505      break;
1506
1507      /*-----------------------------------------------------------------*/
1508#ifdef SUPPORT_UNICODE
1509      case OP_PROP_EXTRA + OP_TYPEQUERY:
1510      case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1511      case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1512      count = 4;
1513      goto QS1;
1514
1515      case OP_PROP_EXTRA + OP_TYPESTAR:
1516      case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1517      case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1518      count = 0;
1519
1520      QS1:
1521
1522      ADD_ACTIVE(state_offset + 4, 0);
1523      if (clen > 0)
1524        {
1525        BOOL OK;
1526        const uint32_t *cp;
1527        const ucd_record * prop = GET_UCD(c);
1528        switch(code[2])
1529          {
1530          case PT_ANY:
1531          OK = TRUE;
1532          break;
1533
1534          case PT_LAMP:
1535          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1536            prop->chartype == ucp_Lt;
1537          break;
1538
1539          case PT_GC:
1540          OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1541          break;
1542
1543          case PT_PC:
1544          OK = prop->chartype == code[3];
1545          break;
1546
1547          case PT_SC:
1548          OK = prop->script == code[3];
1549          break;
1550
1551          /* These are specials for combination cases. */
1552
1553          case PT_ALNUM:
1554          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1555               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1556          break;
1557
1558          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1559          which means that Perl space and POSIX space are now identical. PCRE
1560          was changed at release 8.34. */
1561
1562          case PT_SPACE:    /* Perl space */
1563          case PT_PXSPACE:  /* POSIX space */
1564          switch(c)
1565            {
1566            HSPACE_CASES:
1567            VSPACE_CASES:
1568            OK = TRUE;
1569            break;
1570
1571            default:
1572            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1573            break;
1574            }
1575          break;
1576
1577          case PT_WORD:
1578          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1579               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1580               c == CHAR_UNDERSCORE;
1581          break;
1582
1583          case PT_CLIST:
1584          cp = PRIV(ucd_caseless_sets) + code[3];
1585          for (;;)
1586            {
1587            if (c < *cp) { OK = FALSE; break; }
1588            if (c == *cp++) { OK = TRUE; break; }
1589            }
1590          break;
1591
1592          case PT_UCNC:
1593          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1594               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1595               c >= 0xe000;
1596          break;
1597
1598          /* Should never occur, but keep compilers from grumbling. */
1599
1600          default:
1601          OK = codevalue != OP_PROP;
1602          break;
1603          }
1604
1605        if (OK == (d == OP_PROP))
1606          {
1607          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1608              codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1609            {
1610            active_count--;           /* Remove non-match possibility */
1611            next_active_state--;
1612            }
1613          ADD_NEW(state_offset + count, 0);
1614          }
1615        }
1616      break;
1617
1618      /*-----------------------------------------------------------------*/
1619      case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1620      case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1621      case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1622      count = 2;
1623      goto QS2;
1624
1625      case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1626      case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1627      case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1628      count = 0;
1629
1630      QS2:
1631
1632      ADD_ACTIVE(state_offset + 2, 0);
1633      if (clen > 0)
1634        {
1635        uint32_t lgb, rgb;
1636        PCRE2_SPTR nptr = ptr + clen;
1637        int ncount = 0;
1638        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1639            codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1640          {
1641          active_count--;           /* Remove non-match possibility */
1642          next_active_state--;
1643          }
1644        lgb = UCD_GRAPHBREAK(c);
1645        while (nptr < end_subject)
1646          {
1647          dlen = 1;
1648          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1649          rgb = UCD_GRAPHBREAK(d);
1650          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1651          ncount++;
1652          lgb = rgb;
1653          nptr += dlen;
1654          }
1655        ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1656        }
1657      break;
1658#endif
1659
1660      /*-----------------------------------------------------------------*/
1661      case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1662      case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1663      case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1664      count = 2;
1665      goto QS3;
1666
1667      case OP_ANYNL_EXTRA + OP_TYPESTAR:
1668      case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1669      case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1670      count = 0;
1671
1672      QS3:
1673      ADD_ACTIVE(state_offset + 2, 0);
1674      if (clen > 0)
1675        {
1676        int ncount = 0;
1677        switch (c)
1678          {
1679          case CHAR_VT:
1680          case CHAR_FF:
1681          case CHAR_NEL:
1682#ifndef EBCDIC
1683          case 0x2028:
1684          case 0x2029:
1685#endif  /* Not EBCDIC */
1686          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1687          goto ANYNL02;
1688
1689          case CHAR_CR:
1690          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1691          /* Fall through */
1692
1693          ANYNL02:
1694          case CHAR_LF:
1695          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1696              codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1697            {
1698            active_count--;           /* Remove non-match possibility */
1699            next_active_state--;
1700            }
1701          ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1702          break;
1703
1704          default:
1705          break;
1706          }
1707        }
1708      break;
1709
1710      /*-----------------------------------------------------------------*/
1711      case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1712      case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1713      case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1714      count = 2;
1715      goto QS4;
1716
1717      case OP_VSPACE_EXTRA + OP_TYPESTAR:
1718      case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1719      case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1720      count = 0;
1721
1722      QS4:
1723      ADD_ACTIVE(state_offset + 2, 0);
1724      if (clen > 0)
1725        {
1726        BOOL OK;
1727        switch (c)
1728          {
1729          VSPACE_CASES:
1730          OK = TRUE;
1731          break;
1732
1733          default:
1734          OK = FALSE;
1735          break;
1736          }
1737        if (OK == (d == OP_VSPACE))
1738          {
1739          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1740              codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1741            {
1742            active_count--;           /* Remove non-match possibility */
1743            next_active_state--;
1744            }
1745          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1746          }
1747        }
1748      break;
1749
1750      /*-----------------------------------------------------------------*/
1751      case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1752      case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1753      case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1754      count = 2;
1755      goto QS5;
1756
1757      case OP_HSPACE_EXTRA + OP_TYPESTAR:
1758      case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1759      case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1760      count = 0;
1761
1762      QS5:
1763      ADD_ACTIVE(state_offset + 2, 0);
1764      if (clen > 0)
1765        {
1766        BOOL OK;
1767        switch (c)
1768          {
1769          HSPACE_CASES:
1770          OK = TRUE;
1771          break;
1772
1773          default:
1774          OK = FALSE;
1775          break;
1776          }
1777
1778        if (OK == (d == OP_HSPACE))
1779          {
1780          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1781              codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1782            {
1783            active_count--;           /* Remove non-match possibility */
1784            next_active_state--;
1785            }
1786          ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1787          }
1788        }
1789      break;
1790
1791      /*-----------------------------------------------------------------*/
1792#ifdef SUPPORT_UNICODE
1793      case OP_PROP_EXTRA + OP_TYPEEXACT:
1794      case OP_PROP_EXTRA + OP_TYPEUPTO:
1795      case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1796      case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1797      if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1798        { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1799      count = current_state->count;  /* Number already matched */
1800      if (clen > 0)
1801        {
1802        BOOL OK;
1803        const uint32_t *cp;
1804        const ucd_record * prop = GET_UCD(c);
1805        switch(code[1 + IMM2_SIZE + 1])
1806          {
1807          case PT_ANY:
1808          OK = TRUE;
1809          break;
1810
1811          case PT_LAMP:
1812          OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1813            prop->chartype == ucp_Lt;
1814          break;
1815
1816          case PT_GC:
1817          OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1818          break;
1819
1820          case PT_PC:
1821          OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1822          break;
1823
1824          case PT_SC:
1825          OK = prop->script == code[1 + IMM2_SIZE + 2];
1826          break;
1827
1828          /* These are specials for combination cases. */
1829
1830          case PT_ALNUM:
1831          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1832               PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1833          break;
1834
1835          /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1836          which means that Perl space and POSIX space are now identical. PCRE
1837          was changed at release 8.34. */
1838
1839          case PT_SPACE:    /* Perl space */
1840          case PT_PXSPACE:  /* POSIX space */
1841          switch(c)
1842            {
1843            HSPACE_CASES:
1844            VSPACE_CASES:
1845            OK = TRUE;
1846            break;
1847
1848            default:
1849            OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1850            break;
1851            }
1852          break;
1853
1854          case PT_WORD:
1855          OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1856               PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1857               c == CHAR_UNDERSCORE;
1858          break;
1859
1860          case PT_CLIST:
1861          cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1862          for (;;)
1863            {
1864            if (c < *cp) { OK = FALSE; break; }
1865            if (c == *cp++) { OK = TRUE; break; }
1866            }
1867          break;
1868
1869          case PT_UCNC:
1870          OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1871               c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1872               c >= 0xe000;
1873          break;
1874
1875          /* Should never occur, but keep compilers from grumbling. */
1876
1877          default:
1878          OK = codevalue != OP_PROP;
1879          break;
1880          }
1881
1882        if (OK == (d == OP_PROP))
1883          {
1884          if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1885            {
1886            active_count--;           /* Remove non-match possibility */
1887            next_active_state--;
1888            }
1889          if (++count >= (int)GET2(code, 1))
1890            { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1891          else
1892            { ADD_NEW(state_offset, count); }
1893          }
1894        }
1895      break;
1896
1897      /*-----------------------------------------------------------------*/
1898      case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1899      case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1900      case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1901      case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1902      if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1903        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1904      count = current_state->count;  /* Number already matched */
1905      if (clen > 0)
1906        {
1907        uint32_t lgb, rgb;
1908        PCRE2_SPTR nptr = ptr + clen;
1909        int ncount = 0;
1910        if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1911          {
1912          active_count--;           /* Remove non-match possibility */
1913          next_active_state--;
1914          }
1915        lgb = UCD_GRAPHBREAK(c);
1916        while (nptr < end_subject)
1917          {
1918          dlen = 1;
1919          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1920          rgb = UCD_GRAPHBREAK(d);
1921          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
1922          ncount++;
1923          lgb = rgb;
1924          nptr += dlen;
1925          }
1926        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1927            reset_could_continue = TRUE;
1928        if (++count >= (int)GET2(code, 1))
1929          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1930        else
1931          { ADD_NEW_DATA(-state_offset, count, ncount); }
1932        }
1933      break;
1934#endif
1935
1936      /*-----------------------------------------------------------------*/
1937      case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1938      case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1939      case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1940      case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1941      if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1942        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1943      count = current_state->count;  /* Number already matched */
1944      if (clen > 0)
1945        {
1946        int ncount = 0;
1947        switch (c)
1948          {
1949          case CHAR_VT:
1950          case CHAR_FF:
1951          case CHAR_NEL:
1952#ifndef EBCDIC
1953          case 0x2028:
1954          case 0x2029:
1955#endif  /* Not EBCDIC */
1956          if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1957          goto ANYNL03;
1958
1959          case CHAR_CR:
1960          if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1961          /* Fall through */
1962
1963          ANYNL03:
1964          case CHAR_LF:
1965          if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1966            {
1967            active_count--;           /* Remove non-match possibility */
1968            next_active_state--;
1969            }
1970          if (++count >= (int)GET2(code, 1))
1971            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1972          else
1973            { ADD_NEW_DATA(-state_offset, count, ncount); }
1974          break;
1975
1976          default:
1977          break;
1978          }
1979        }
1980      break;
1981
1982      /*-----------------------------------------------------------------*/
1983      case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1984      case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1985      case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1986      case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1987      if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1988        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1989      count = current_state->count;  /* Number already matched */
1990      if (clen > 0)
1991        {
1992        BOOL OK;
1993        switch (c)
1994          {
1995          VSPACE_CASES:
1996          OK = TRUE;
1997          break;
1998
1999          default:
2000          OK = FALSE;
2001          }
2002
2003        if (OK == (d == OP_VSPACE))
2004          {
2005          if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2006            {
2007            active_count--;           /* Remove non-match possibility */
2008            next_active_state--;
2009            }
2010          if (++count >= (int)GET2(code, 1))
2011            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2012          else
2013            { ADD_NEW_DATA(-state_offset, count, 0); }
2014          }
2015        }
2016      break;
2017
2018      /*-----------------------------------------------------------------*/
2019      case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2020      case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2021      case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2022      case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2023      if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2024        { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2025      count = current_state->count;  /* Number already matched */
2026      if (clen > 0)
2027        {
2028        BOOL OK;
2029        switch (c)
2030          {
2031          HSPACE_CASES:
2032          OK = TRUE;
2033          break;
2034
2035          default:
2036          OK = FALSE;
2037          break;
2038          }
2039
2040        if (OK == (d == OP_HSPACE))
2041          {
2042          if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2043            {
2044            active_count--;           /* Remove non-match possibility */
2045            next_active_state--;
2046            }
2047          if (++count >= (int)GET2(code, 1))
2048            { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2049          else
2050            { ADD_NEW_DATA(-state_offset, count, 0); }
2051          }
2052        }
2053      break;
2054
2055/* ========================================================================== */
2056      /* These opcodes are followed by a character that is usually compared
2057      to the current subject character; it is loaded into d. We still get
2058      here even if there is no subject character, because in some cases zero
2059      repetitions are permitted. */
2060
2061      /*-----------------------------------------------------------------*/
2062      case OP_CHAR:
2063      if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2064      break;
2065
2066      /*-----------------------------------------------------------------*/
2067      case OP_CHARI:
2068      if (clen == 0) break;
2069
2070#ifdef SUPPORT_UNICODE
2071      if (utf)
2072        {
2073        if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2074          {
2075          unsigned int othercase;
2076          if (c < 128)
2077            othercase = fcc[c];
2078          else
2079            othercase = UCD_OTHERCASE(c);
2080          if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2081          }
2082        }
2083      else
2084#endif  /* SUPPORT_UNICODE */
2085      /* Not UTF mode */
2086        {
2087        if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2088          { ADD_NEW(state_offset + 2, 0); }
2089        }
2090      break;
2091
2092
2093#ifdef SUPPORT_UNICODE
2094      /*-----------------------------------------------------------------*/
2095      /* This is a tricky one because it can match more than one character.
2096      Find out how many characters to skip, and then set up a negative state
2097      to wait for them to pass before continuing. */
2098
2099      case OP_EXTUNI:
2100      if (clen > 0)
2101        {
2102        uint32_t lgb, rgb;
2103        PCRE2_SPTR nptr = ptr + clen;
2104        int ncount = 0;
2105        lgb = UCD_GRAPHBREAK(c);
2106        while (nptr < end_subject)
2107          {
2108          dlen = 1;
2109          if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2110          rgb = UCD_GRAPHBREAK(d);
2111          if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break;
2112          ncount++;
2113          lgb = rgb;
2114          nptr += dlen;
2115          }
2116        if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2117            reset_could_continue = TRUE;
2118        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2119        }
2120      break;
2121#endif
2122
2123      /*-----------------------------------------------------------------*/
2124      /* This is tricky, like EXTUNI, because it too can match more than one
2125      character (when CR is followed by LF). In this case, set up a negative
2126      state to wait for one character to pass before continuing. */
2127
2128      case OP_ANYNL:
2129      if (clen > 0) switch(c)
2130        {
2131        case CHAR_VT:
2132        case CHAR_FF:
2133        case CHAR_NEL:
2134#ifndef EBCDIC
2135        case 0x2028:
2136        case 0x2029:
2137#endif  /* Not EBCDIC */
2138        if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2139
2140        case CHAR_LF:
2141        ADD_NEW(state_offset + 1, 0);
2142        break;
2143
2144        case CHAR_CR:
2145        if (ptr + 1 >= end_subject)
2146          {
2147          ADD_NEW(state_offset + 1, 0);
2148          if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2149            reset_could_continue = TRUE;
2150          }
2151        else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2152          {
2153          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2154          }
2155        else
2156          {
2157          ADD_NEW(state_offset + 1, 0);
2158          }
2159        break;
2160        }
2161      break;
2162
2163      /*-----------------------------------------------------------------*/
2164      case OP_NOT_VSPACE:
2165      if (clen > 0) switch(c)
2166        {
2167        VSPACE_CASES:
2168        break;
2169
2170        default:
2171        ADD_NEW(state_offset + 1, 0);
2172        break;
2173        }
2174      break;
2175
2176      /*-----------------------------------------------------------------*/
2177      case OP_VSPACE:
2178      if (clen > 0) switch(c)
2179        {
2180        VSPACE_CASES:
2181        ADD_NEW(state_offset + 1, 0);
2182        break;
2183
2184        default:
2185        break;
2186        }
2187      break;
2188
2189      /*-----------------------------------------------------------------*/
2190      case OP_NOT_HSPACE:
2191      if (clen > 0) switch(c)
2192        {
2193        HSPACE_CASES:
2194        break;
2195
2196        default:
2197        ADD_NEW(state_offset + 1, 0);
2198        break;
2199        }
2200      break;
2201
2202      /*-----------------------------------------------------------------*/
2203      case OP_HSPACE:
2204      if (clen > 0) switch(c)
2205        {
2206        HSPACE_CASES:
2207        ADD_NEW(state_offset + 1, 0);
2208        break;
2209
2210        default:
2211        break;
2212        }
2213      break;
2214
2215      /*-----------------------------------------------------------------*/
2216      /* Match a negated single character casefully. */
2217
2218      case OP_NOT:
2219      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2220      break;
2221
2222      /*-----------------------------------------------------------------*/
2223      /* Match a negated single character caselessly. */
2224
2225      case OP_NOTI:
2226      if (clen > 0)
2227        {
2228        unsigned int otherd;
2229#ifdef SUPPORT_UNICODE
2230        if (utf && d >= 128)
2231          otherd = UCD_OTHERCASE(d);
2232        else
2233#endif  /* SUPPORT_UNICODE */
2234        otherd = TABLE_GET(d, fcc, d);
2235        if (c != d && c != otherd)
2236          { ADD_NEW(state_offset + dlen + 1, 0); }
2237        }
2238      break;
2239
2240      /*-----------------------------------------------------------------*/
2241      case OP_PLUSI:
2242      case OP_MINPLUSI:
2243      case OP_POSPLUSI:
2244      case OP_NOTPLUSI:
2245      case OP_NOTMINPLUSI:
2246      case OP_NOTPOSPLUSI:
2247      caseless = TRUE;
2248      codevalue -= OP_STARI - OP_STAR;
2249
2250      /* Fall through */
2251      case OP_PLUS:
2252      case OP_MINPLUS:
2253      case OP_POSPLUS:
2254      case OP_NOTPLUS:
2255      case OP_NOTMINPLUS:
2256      case OP_NOTPOSPLUS:
2257      count = current_state->count;  /* Already matched */
2258      if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2259      if (clen > 0)
2260        {
2261        uint32_t otherd = NOTACHAR;
2262        if (caseless)
2263          {
2264#ifdef SUPPORT_UNICODE
2265          if (utf && d >= 128)
2266            otherd = UCD_OTHERCASE(d);
2267          else
2268#endif  /* SUPPORT_UNICODE */
2269          otherd = TABLE_GET(d, fcc, d);
2270          }
2271        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2272          {
2273          if (count > 0 &&
2274              (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2275            {
2276            active_count--;             /* Remove non-match possibility */
2277            next_active_state--;
2278            }
2279          count++;
2280          ADD_NEW(state_offset, count);
2281          }
2282        }
2283      break;
2284
2285      /*-----------------------------------------------------------------*/
2286      case OP_QUERYI:
2287      case OP_MINQUERYI:
2288      case OP_POSQUERYI:
2289      case OP_NOTQUERYI:
2290      case OP_NOTMINQUERYI:
2291      case OP_NOTPOSQUERYI:
2292      caseless = TRUE;
2293      codevalue -= OP_STARI - OP_STAR;
2294      /* Fall through */
2295      case OP_QUERY:
2296      case OP_MINQUERY:
2297      case OP_POSQUERY:
2298      case OP_NOTQUERY:
2299      case OP_NOTMINQUERY:
2300      case OP_NOTPOSQUERY:
2301      ADD_ACTIVE(state_offset + dlen + 1, 0);
2302      if (clen > 0)
2303        {
2304        uint32_t otherd = NOTACHAR;
2305        if (caseless)
2306          {
2307#ifdef SUPPORT_UNICODE
2308          if (utf && d >= 128)
2309            otherd = UCD_OTHERCASE(d);
2310          else
2311#endif  /* SUPPORT_UNICODE */
2312          otherd = TABLE_GET(d, fcc, d);
2313          }
2314        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2315          {
2316          if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2317            {
2318            active_count--;            /* Remove non-match possibility */
2319            next_active_state--;
2320            }
2321          ADD_NEW(state_offset + dlen + 1, 0);
2322          }
2323        }
2324      break;
2325
2326      /*-----------------------------------------------------------------*/
2327      case OP_STARI:
2328      case OP_MINSTARI:
2329      case OP_POSSTARI:
2330      case OP_NOTSTARI:
2331      case OP_NOTMINSTARI:
2332      case OP_NOTPOSSTARI:
2333      caseless = TRUE;
2334      codevalue -= OP_STARI - OP_STAR;
2335      /* Fall through */
2336      case OP_STAR:
2337      case OP_MINSTAR:
2338      case OP_POSSTAR:
2339      case OP_NOTSTAR:
2340      case OP_NOTMINSTAR:
2341      case OP_NOTPOSSTAR:
2342      ADD_ACTIVE(state_offset + dlen + 1, 0);
2343      if (clen > 0)
2344        {
2345        uint32_t otherd = NOTACHAR;
2346        if (caseless)
2347          {
2348#ifdef SUPPORT_UNICODE
2349          if (utf && d >= 128)
2350            otherd = UCD_OTHERCASE(d);
2351          else
2352#endif  /* SUPPORT_UNICODE */
2353          otherd = TABLE_GET(d, fcc, d);
2354          }
2355        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2356          {
2357          if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2358            {
2359            active_count--;            /* Remove non-match possibility */
2360            next_active_state--;
2361            }
2362          ADD_NEW(state_offset, 0);
2363          }
2364        }
2365      break;
2366
2367      /*-----------------------------------------------------------------*/
2368      case OP_EXACTI:
2369      case OP_NOTEXACTI:
2370      caseless = TRUE;
2371      codevalue -= OP_STARI - OP_STAR;
2372      /* Fall through */
2373      case OP_EXACT:
2374      case OP_NOTEXACT:
2375      count = current_state->count;  /* Number already matched */
2376      if (clen > 0)
2377        {
2378        uint32_t otherd = NOTACHAR;
2379        if (caseless)
2380          {
2381#ifdef SUPPORT_UNICODE
2382          if (utf && d >= 128)
2383            otherd = UCD_OTHERCASE(d);
2384          else
2385#endif  /* SUPPORT_UNICODE */
2386          otherd = TABLE_GET(d, fcc, d);
2387          }
2388        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389          {
2390          if (++count >= (int)GET2(code, 1))
2391            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2392          else
2393            { ADD_NEW(state_offset, count); }
2394          }
2395        }
2396      break;
2397
2398      /*-----------------------------------------------------------------*/
2399      case OP_UPTOI:
2400      case OP_MINUPTOI:
2401      case OP_POSUPTOI:
2402      case OP_NOTUPTOI:
2403      case OP_NOTMINUPTOI:
2404      case OP_NOTPOSUPTOI:
2405      caseless = TRUE;
2406      codevalue -= OP_STARI - OP_STAR;
2407      /* Fall through */
2408      case OP_UPTO:
2409      case OP_MINUPTO:
2410      case OP_POSUPTO:
2411      case OP_NOTUPTO:
2412      case OP_NOTMINUPTO:
2413      case OP_NOTPOSUPTO:
2414      ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2415      count = current_state->count;  /* Number already matched */
2416      if (clen > 0)
2417        {
2418        uint32_t otherd = NOTACHAR;
2419        if (caseless)
2420          {
2421#ifdef SUPPORT_UNICODE
2422          if (utf && d >= 128)
2423            otherd = UCD_OTHERCASE(d);
2424          else
2425#endif  /* SUPPORT_UNICODE */
2426          otherd = TABLE_GET(d, fcc, d);
2427          }
2428        if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2429          {
2430          if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2431            {
2432            active_count--;             /* Remove non-match possibility */
2433            next_active_state--;
2434            }
2435          if (++count >= (int)GET2(code, 1))
2436            { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2437          else
2438            { ADD_NEW(state_offset, count); }
2439          }
2440        }
2441      break;
2442
2443
2444/* ========================================================================== */
2445      /* These are the class-handling opcodes */
2446
2447      case OP_CLASS:
2448      case OP_NCLASS:
2449      case OP_XCLASS:
2450        {
2451        BOOL isinclass = FALSE;
2452        int next_state_offset;
2453        PCRE2_SPTR ecode;
2454
2455        /* For a simple class, there is always just a 32-byte table, and we
2456        can set isinclass from it. */
2457
2458        if (codevalue != OP_XCLASS)
2459          {
2460          ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2461          if (clen > 0)
2462            {
2463            isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2464              ((((uint8_t *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2465            }
2466          }
2467
2468        /* An extended class may have a table or a list of single characters,
2469        ranges, or both, and it may be positive or negative. There's a
2470        function that sorts all this out. */
2471
2472        else
2473         {
2474         ecode = code + GET(code, 1);
2475         if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2476         }
2477
2478        /* At this point, isinclass is set for all kinds of class, and ecode
2479        points to the byte after the end of the class. If there is a
2480        quantifier, this is where it will be. */
2481
2482        next_state_offset = (int)(ecode - start_code);
2483
2484        switch (*ecode)
2485          {
2486          case OP_CRSTAR:
2487          case OP_CRMINSTAR:
2488          case OP_CRPOSSTAR:
2489          ADD_ACTIVE(next_state_offset + 1, 0);
2490          if (isinclass)
2491            {
2492            if (*ecode == OP_CRPOSSTAR)
2493              {
2494              active_count--;           /* Remove non-match possibility */
2495              next_active_state--;
2496              }
2497            ADD_NEW(state_offset, 0);
2498            }
2499          break;
2500
2501          case OP_CRPLUS:
2502          case OP_CRMINPLUS:
2503          case OP_CRPOSPLUS:
2504          count = current_state->count;  /* Already matched */
2505          if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2506          if (isinclass)
2507            {
2508            if (count > 0 && *ecode == OP_CRPOSPLUS)
2509              {
2510              active_count--;           /* Remove non-match possibility */
2511              next_active_state--;
2512              }
2513            count++;
2514            ADD_NEW(state_offset, count);
2515            }
2516          break;
2517
2518          case OP_CRQUERY:
2519          case OP_CRMINQUERY:
2520          case OP_CRPOSQUERY:
2521          ADD_ACTIVE(next_state_offset + 1, 0);
2522          if (isinclass)
2523            {
2524            if (*ecode == OP_CRPOSQUERY)
2525              {
2526              active_count--;           /* Remove non-match possibility */
2527              next_active_state--;
2528              }
2529            ADD_NEW(next_state_offset + 1, 0);
2530            }
2531          break;
2532
2533          case OP_CRRANGE:
2534          case OP_CRMINRANGE:
2535          case OP_CRPOSRANGE:
2536          count = current_state->count;  /* Already matched */
2537          if (count >= (int)GET2(ecode, 1))
2538            { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2539          if (isinclass)
2540            {
2541            int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2542            if (*ecode == OP_CRPOSRANGE)
2543              {
2544              active_count--;           /* Remove non-match possibility */
2545              next_active_state--;
2546              }
2547            if (++count >= max && max != 0)   /* Max 0 => no limit */
2548              { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2549            else
2550              { ADD_NEW(state_offset, count); }
2551            }
2552          break;
2553
2554          default:
2555          if (isinclass) { ADD_NEW(next_state_offset, 0); }
2556          break;
2557          }
2558        }
2559      break;
2560
2561/* ========================================================================== */
2562      /* These are the opcodes for fancy brackets of various kinds. We have
2563      to use recursion in order to handle them. The "always failing" assertion
2564      (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2565      though the other "backtracking verbs" are not supported. */
2566
2567      case OP_FAIL:
2568      forced_fail++;    /* Count FAILs for multiple states */
2569      break;
2570
2571      case OP_ASSERT:
2572      case OP_ASSERT_NOT:
2573      case OP_ASSERTBACK:
2574      case OP_ASSERTBACK_NOT:
2575        {
2576        PCRE2_SPTR endasscode = code + GET(code, 1);
2577        PCRE2_SIZE local_offsets[2];
2578        int rc;
2579        int local_workspace[1000];
2580
2581        while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2582
2583        rc = internal_dfa_match(
2584          mb,                                   /* static match data */
2585          code,                                 /* this subexpression's code */
2586          ptr,                                  /* where we currently are */
2587          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2588          local_offsets,                        /* offset vector */
2589          sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2590          local_workspace,                      /* workspace vector */
2591          sizeof(local_workspace)/sizeof(int),  /* size of same */
2592          rlevel);                              /* function recursion level */
2593
2594        if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2595        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2596            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2597        }
2598      break;
2599
2600      /*-----------------------------------------------------------------*/
2601      case OP_COND:
2602      case OP_SCOND:
2603        {
2604        PCRE2_SIZE local_offsets[1000];
2605        int local_workspace[1000];
2606        int codelink = (int)GET(code, 1);
2607        PCRE2_UCHAR condcode;
2608
2609        /* Because of the way auto-callout works during compile, a callout item
2610        is inserted between OP_COND and an assertion condition. This does not
2611        happen for the other conditions. */
2612
2613        if (code[LINK_SIZE + 1] == OP_CALLOUT
2614            || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2615          {
2616          PCRE2_SIZE callout_length = (code[LINK_SIZE + 1] == OP_CALLOUT)?
2617            (PCRE2_SIZE)PRIV(OP_lengths)[OP_CALLOUT] :
2618            (PCRE2_SIZE)GET(code, 2 + 3*LINK_SIZE);
2619
2620          rrc = 0;
2621          if (mb->callout != NULL)
2622            {
2623            pcre2_callout_block cb;
2624            cb.version          = 1;
2625            cb.capture_top      = 1;
2626            cb.capture_last     = 0;
2627            cb.offset_vector    = offsets;
2628            cb.mark             = NULL;   /* No (*MARK) support */
2629            cb.subject          = start_subject;
2630            cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
2631            cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
2632            cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
2633            cb.pattern_position = GET(code, LINK_SIZE + 2);
2634            cb.next_item_length = GET(code, LINK_SIZE + 2 + LINK_SIZE);
2635
2636            if (code[LINK_SIZE + 1] == OP_CALLOUT)
2637              {
2638              cb.callout_number = code[2 + 3*LINK_SIZE];
2639              cb.callout_string_offset = 0;
2640              cb.callout_string = NULL;
2641              cb.callout_string_length = 0;
2642              }
2643            else
2644              {
2645              cb.callout_number = 0;
2646              cb.callout_string_offset = GET(code, 2 + 4*LINK_SIZE);
2647              cb.callout_string = code + (2 + 5*LINK_SIZE) + 1;
2648              cb.callout_string_length =
2649                callout_length - (1 + 4*LINK_SIZE) - 2;
2650              }
2651
2652            if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
2653              return rrc;   /* Abandon */
2654            }
2655          if (rrc > 0) break;                      /* Fail this thread */
2656          code += callout_length;                  /* Skip callout data */
2657          }
2658
2659        condcode = code[LINK_SIZE+1];
2660
2661        /* Back reference conditions and duplicate named recursion conditions
2662        are not supported */
2663
2664        if (condcode == OP_CREF || condcode == OP_DNCREF ||
2665            condcode == OP_DNRREF)
2666          return PCRE2_ERROR_DFA_UCOND;
2667
2668        /* The DEFINE condition is always false, and the assertion (?!) is
2669        converted to OP_FAIL. */
2670
2671        if (condcode == OP_FALSE || condcode == OP_FAIL)
2672          { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2673
2674        /* There is also an always-true condition */
2675
2676        else if (condcode == OP_TRUE)
2677          { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2678
2679        /* The only supported version of OP_RREF is for the value RREF_ANY,
2680        which means "test if in any recursion". We can't test for specifically
2681        recursed groups. */
2682
2683        else if (condcode == OP_RREF)
2684          {
2685          unsigned int value = GET2(code, LINK_SIZE + 2);
2686          if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2687          if (mb->recursive != NULL)
2688            { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2689          else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2690          }
2691
2692        /* Otherwise, the condition is an assertion */
2693
2694        else
2695          {
2696          int rc;
2697          PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2698          PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2699
2700          while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2701
2702          rc = internal_dfa_match(
2703            mb,                                   /* fixed match data */
2704            asscode,                              /* this subexpression's code */
2705            ptr,                                  /* where we currently are */
2706            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2707            local_offsets,                        /* offset vector */
2708            sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2709            local_workspace,                      /* workspace vector */
2710            sizeof(local_workspace)/sizeof(int),  /* size of same */
2711            rlevel);                              /* function recursion level */
2712
2713          if (rc == PCRE2_ERROR_DFA_UITEM) return rc;
2714          if ((rc >= 0) ==
2715                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2716            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2717          else
2718            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2719          }
2720        }
2721      break;
2722
2723      /*-----------------------------------------------------------------*/
2724      case OP_RECURSE:
2725        {
2726        dfa_recursion_info *ri;
2727        PCRE2_SIZE local_offsets[1000];
2728        int local_workspace[1000];
2729        PCRE2_SPTR callpat = start_code + GET(code, 1);
2730        uint32_t recno = (callpat == mb->start_code)? 0 :
2731          GET2(callpat, 1 + LINK_SIZE);
2732        int rc;
2733
2734        /* Check for repeating a recursion without advancing the subject
2735        pointer. This should catch convoluted mutual recursions. (Some simple
2736        cases are caught at compile time.) */
2737
2738        for (ri = mb->recursive; ri != NULL; ri = ri->prevrec)
2739          if (recno == ri->group_num && ptr == ri->subject_position)
2740            return PCRE2_ERROR_RECURSELOOP;
2741
2742        /* Remember this recursion and where we started it so as to
2743        catch infinite loops. */
2744
2745        new_recursive.group_num = recno;
2746        new_recursive.subject_position = ptr;
2747        new_recursive.prevrec = mb->recursive;
2748        mb->recursive = &new_recursive;
2749
2750        rc = internal_dfa_match(
2751          mb,                                   /* fixed match data */
2752          callpat,                              /* this subexpression's code */
2753          ptr,                                  /* where we currently are */
2754          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2755          local_offsets,                        /* offset vector */
2756          sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2757          local_workspace,                      /* workspace vector */
2758          sizeof(local_workspace)/sizeof(int),  /* size of same */
2759          rlevel);                              /* function recursion level */
2760
2761        mb->recursive = new_recursive.prevrec;  /* Done this recursion */
2762
2763        /* Ran out of internal offsets */
2764
2765        if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2766
2767        /* For each successful matched substring, set up the next state with a
2768        count of characters to skip before trying it. Note that the count is in
2769        characters, not bytes. */
2770
2771        if (rc > 0)
2772          {
2773          for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2774            {
2775            PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
2776#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2777            if (utf)
2778              {
2779              PCRE2_SPTR p = start_subject + local_offsets[rc];
2780              PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
2781              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2782              }
2783#endif
2784            if (charcount > 0)
2785              {
2786              ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
2787                (int)(charcount - 1));
2788              }
2789            else
2790              {
2791              ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2792              }
2793            }
2794          }
2795        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2796        }
2797      break;
2798
2799      /*-----------------------------------------------------------------*/
2800      case OP_BRAPOS:
2801      case OP_SBRAPOS:
2802      case OP_CBRAPOS:
2803      case OP_SCBRAPOS:
2804      case OP_BRAPOSZERO:
2805        {
2806        PCRE2_SIZE charcount, matched_count;
2807        PCRE2_SPTR local_ptr = ptr;
2808        BOOL allow_zero;
2809
2810        if (codevalue == OP_BRAPOSZERO)
2811          {
2812          allow_zero = TRUE;
2813          codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2814          }
2815        else allow_zero = FALSE;
2816
2817        /* Loop to match the subpattern as many times as possible as if it were
2818        a complete pattern. */
2819
2820        for (matched_count = 0;; matched_count++)
2821          {
2822          PCRE2_SIZE local_offsets[2];
2823          int local_workspace[1000];
2824
2825          int rc = internal_dfa_match(
2826            mb,                                   /* fixed match data */
2827            code,                                 /* this subexpression's code */
2828            local_ptr,                            /* where we currently are */
2829            (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2830            local_offsets,                        /* offset vector */
2831            sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2832            local_workspace,                      /* workspace vector */
2833            sizeof(local_workspace)/sizeof(int),  /* size of same */
2834            rlevel);                              /* function recursion level */
2835
2836          /* Failed to match */
2837
2838          if (rc < 0)
2839            {
2840            if (rc != PCRE2_ERROR_NOMATCH) return rc;
2841            break;
2842            }
2843
2844          /* Matched: break the loop if zero characters matched. */
2845
2846          charcount = local_offsets[1] - local_offsets[0];
2847          if (charcount == 0) break;
2848          local_ptr += charcount;    /* Advance temporary position ptr */
2849          }
2850
2851        /* At this point we have matched the subpattern matched_count
2852        times, and local_ptr is pointing to the character after the end of the
2853        last match. */
2854
2855        if (matched_count > 0 || allow_zero)
2856          {
2857          PCRE2_SPTR end_subpattern = code;
2858          int next_state_offset;
2859
2860          do { end_subpattern += GET(end_subpattern, 1); }
2861            while (*end_subpattern == OP_ALT);
2862          next_state_offset =
2863            (int)(end_subpattern - start_code + LINK_SIZE + 1);
2864
2865          /* Optimization: if there are no more active states, and there
2866          are no new states yet set up, then skip over the subject string
2867          right here, to save looping. Otherwise, set up the new state to swing
2868          into action when the end of the matched substring is reached. */
2869
2870          if (i + 1 >= active_count && new_count == 0)
2871            {
2872            ptr = local_ptr;
2873            clen = 0;
2874            ADD_NEW(next_state_offset, 0);
2875            }
2876          else
2877            {
2878            PCRE2_SPTR p = ptr;
2879            PCRE2_SPTR pp = local_ptr;
2880            charcount = (PCRE2_SIZE)(pp - p);
2881#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2882            if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2883#endif
2884            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
2885            }
2886          }
2887        }
2888      break;
2889
2890      /*-----------------------------------------------------------------*/
2891      case OP_ONCE:
2892      case OP_ONCE_NC:
2893        {
2894        PCRE2_SIZE local_offsets[2];
2895        int local_workspace[1000];
2896
2897        int rc = internal_dfa_match(
2898          mb,                                   /* fixed match data */
2899          code,                                 /* this subexpression's code */
2900          ptr,                                  /* where we currently are */
2901          (PCRE2_SIZE)(ptr - start_subject),    /* start offset */
2902          local_offsets,                        /* offset vector */
2903          sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
2904          local_workspace,                      /* workspace vector */
2905          sizeof(local_workspace)/sizeof(int),  /* size of same */
2906          rlevel);                              /* function recursion level */
2907
2908        if (rc >= 0)
2909          {
2910          PCRE2_SPTR end_subpattern = code;
2911          PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
2912          int next_state_offset, repeat_state_offset;
2913
2914          do { end_subpattern += GET(end_subpattern, 1); }
2915            while (*end_subpattern == OP_ALT);
2916          next_state_offset =
2917            (int)(end_subpattern - start_code + LINK_SIZE + 1);
2918
2919          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2920          arrange for the repeat state also to be added to the relevant list.
2921          Calculate the offset, or set -1 for no repeat. */
2922
2923          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2924                                 *end_subpattern == OP_KETRMIN)?
2925            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2926
2927          /* If we have matched an empty string, add the next state at the
2928          current character pointer. This is important so that the duplicate
2929          checking kicks in, which is what breaks infinite loops that match an
2930          empty string. */
2931
2932          if (charcount == 0)
2933            {
2934            ADD_ACTIVE(next_state_offset, 0);
2935            }
2936
2937          /* Optimization: if there are no more active states, and there
2938          are no new states yet set up, then skip over the subject string
2939          right here, to save looping. Otherwise, set up the new state to swing
2940          into action when the end of the matched substring is reached. */
2941
2942          else if (i + 1 >= active_count && new_count == 0)
2943            {
2944            ptr += charcount;
2945            clen = 0;
2946            ADD_NEW(next_state_offset, 0);
2947
2948            /* If we are adding a repeat state at the new character position,
2949            we must fudge things so that it is the only current state.
2950            Otherwise, it might be a duplicate of one we processed before, and
2951            that would cause it to be skipped. */
2952
2953            if (repeat_state_offset >= 0)
2954              {
2955              next_active_state = active_states;
2956              active_count = 0;
2957              i = -1;
2958              ADD_ACTIVE(repeat_state_offset, 0);
2959              }
2960            }
2961          else
2962            {
2963#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2964            if (utf)
2965              {
2966              PCRE2_SPTR p = start_subject + local_offsets[0];
2967              PCRE2_SPTR pp = start_subject + local_offsets[1];
2968              while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
2969              }
2970#endif
2971            ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
2972            if (repeat_state_offset >= 0)
2973              { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
2974            }
2975          }
2976        else if (rc != PCRE2_ERROR_NOMATCH) return rc;
2977        }
2978      break;
2979
2980
2981/* ========================================================================== */
2982      /* Handle callouts */
2983
2984      case OP_CALLOUT:
2985      case OP_CALLOUT_STR:
2986        {
2987        unsigned int callout_length = (*code == OP_CALLOUT)
2988            ? PRIV(OP_lengths)[OP_CALLOUT] : GET(code, 1 + 2*LINK_SIZE);
2989        rrc = 0;
2990
2991        if (mb->callout != NULL)
2992          {
2993          pcre2_callout_block cb;
2994          cb.version          = 1;
2995          cb.capture_top      = 1;
2996          cb.capture_last     = 0;
2997          cb.offset_vector    = offsets;
2998          cb.mark             = NULL;   /* No (*MARK) support */
2999          cb.subject          = start_subject;
3000          cb.subject_length   = (PCRE2_SIZE)(end_subject - start_subject);
3001          cb.start_match      = (PCRE2_SIZE)(current_subject - start_subject);
3002          cb.current_position = (PCRE2_SIZE)(ptr - start_subject);
3003          cb.pattern_position = GET(code, 1);
3004          cb.next_item_length = GET(code, 1 + LINK_SIZE);
3005
3006          if (*code == OP_CALLOUT)
3007            {
3008            cb.callout_number = code[1 + 2*LINK_SIZE];
3009            cb.callout_string_offset = 0;
3010            cb.callout_string = NULL;
3011            cb.callout_string_length = 0;
3012            }
3013          else
3014            {
3015            cb.callout_number = 0;
3016            cb.callout_string_offset = GET(code, 1 + 3*LINK_SIZE);
3017            cb.callout_string = code + (1 + 4*LINK_SIZE) + 1;
3018            cb.callout_string_length =
3019              callout_length - (1 + 4*LINK_SIZE) - 2;
3020            }
3021
3022          if ((rrc = (mb->callout)(&cb, mb->callout_data)) < 0)
3023            return rrc;   /* Abandon */
3024          }
3025        if (rrc == 0)
3026          { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3027        }
3028      break;
3029
3030
3031/* ========================================================================== */
3032      default:        /* Unsupported opcode */
3033      return PCRE2_ERROR_DFA_UITEM;
3034      }
3035
3036    NEXT_ACTIVE_STATE: continue;
3037
3038    }      /* End of loop scanning active states */
3039
3040  /* We have finished the processing at the current subject character. If no
3041  new states have been set for the next character, we have found all the
3042  matches that we are going to find. If we are at the top level and partial
3043  matching has been requested, check for appropriate conditions.
3044
3045  The "forced_fail" variable counts the number of (*F) encountered for the
3046  character. If it is equal to the original active_count (saved in
3047  workspace[1]) it means that (*F) was found on every active state. In this
3048  case we don't want to give a partial match.
3049
3050  The "could_continue" variable is true if a state could have continued but
3051  for the fact that the end of the subject was reached. */
3052
3053  if (new_count <= 0)
3054    {
3055    if (rlevel == 1 &&                               /* Top level, and */
3056        could_continue &&                            /* Some could go on, and */
3057        forced_fail != workspace[1] &&               /* Not all forced fail & */
3058        (                                            /* either... */
3059        (mb->moptions & PCRE2_PARTIAL_HARD) != 0      /* Hard partial */
3060        ||                                           /* or... */
3061        ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3062         match_count < 0)                            /* no matches */
3063        ) &&                                         /* And... */
3064        (
3065        partial_newline ||                           /* Either partial NL */
3066          (                                          /* or ... */
3067          ptr >= end_subject &&                /* End of subject and */
3068          ptr > mb->start_used_ptr)            /* Inspected non-empty string */
3069          )
3070        )
3071      match_count = PCRE2_ERROR_PARTIAL;
3072    break;        /* In effect, "return", but see the comment below */
3073    }
3074
3075  /* One or more states are active for the next character. */
3076
3077  ptr += clen;    /* Advance to next subject character */
3078  }               /* Loop to move along the subject string */
3079
3080/* Control gets here from "break" a few lines above. We do it this way because
3081if we use "return" above, we have compiler trouble. Some compilers warn if
3082there's nothing here because they think the function doesn't return a value. On
3083the other hand, if we put a dummy statement here, some more clever compilers
3084complain that it can't be reached. Sigh. */
3085
3086return match_count;
3087}
3088
3089
3090
3091/*************************************************
3092*     Match a pattern using the DFA algorithm    *
3093*************************************************/
3094
3095/* This function matches a compiled pattern to a subject string, using the
3096alternate matching algorithm that finds all matches at once.
3097
3098Arguments:
3099  code          points to the compiled pattern
3100  subject       subject string
3101  length        length of subject string
3102  start_offset  where to start matching in the subject
3103  options       option bits
3104  match_data    points to a match data structure
3105  mcontext      points to a match context
3106  workspace     pointer to workspace
3107  wscount       size of workspace
3108
3109Returns:        > 0 => number of match offset pairs placed in offsets
3110                = 0 => offsets overflowed; longest matches are present
3111                 -1 => failed to match
3112               < -1 => some kind of unexpected problem
3113*/
3114
3115PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
3116pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
3117  PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3118  pcre2_match_context *mcontext, int *workspace, size_t wscount)
3119{
3120const pcre2_real_code *re = (const pcre2_real_code *)code;
3121
3122PCRE2_SPTR start_match;
3123PCRE2_SPTR end_subject;
3124PCRE2_SPTR bumpalong_limit;
3125PCRE2_SPTR req_cu_ptr;
3126
3127BOOL utf, anchored, startline, firstline;
3128
3129BOOL has_first_cu = FALSE;
3130BOOL has_req_cu = FALSE;
3131PCRE2_UCHAR first_cu = 0;
3132PCRE2_UCHAR first_cu2 = 0;
3133PCRE2_UCHAR req_cu = 0;
3134PCRE2_UCHAR req_cu2 = 0;
3135
3136const uint8_t *start_bits = NULL;
3137
3138/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3139is used below, and it expects NLBLOCK to be defined as a pointer. */
3140
3141dfa_match_block actual_match_block;
3142dfa_match_block *mb = &actual_match_block;
3143
3144/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
3145subject string. */
3146
3147if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject);
3148
3149/* Plausibility checks */
3150
3151if ((options & ~PUBLIC_DFA_MATCH_OPTIONS) != 0) return PCRE2_ERROR_BADOPTION;
3152if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3153  return PCRE2_ERROR_NULL;
3154if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3155if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3156
3157/* Check that the first field in the block is the magic number. If it is not,
3158return with PCRE2_ERROR_BADMAGIC. */
3159
3160if (re->magic_number != MAGIC_NUMBER) return PCRE2_ERROR_BADMAGIC;
3161
3162/* Check the code unit width. */
3163
3164if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3165  return PCRE2_ERROR_BADMODE;
3166
3167/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3168options variable for this function. Users of PCRE2 who are not calling the
3169function directly would like to have a way of setting these flags, in the same
3170way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3171constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3172(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3173transferred to the options for this function. The bits are guaranteed to be
3174adjacent, but do not have the same values. This bit of Boolean trickery assumes
3175that the match-time bits are not more significant than the flag bits. If by
3176accident this is not the case, a compile-time division by zero error will
3177occur. */
3178
3179#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3180#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3181options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3182#undef FF
3183#undef OO
3184
3185/* If restarting after a partial match, do some sanity checks on the contents
3186of the workspace. */
3187
3188if ((options & PCRE2_DFA_RESTART) != 0)
3189  {
3190  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3191    workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3192      return PCRE2_ERROR_DFA_BADRESTART;
3193  }
3194
3195/* Set some local values */
3196
3197utf = (re->overall_options & PCRE2_UTF) != 0;
3198start_match = subject + start_offset;
3199end_subject = subject + length;
3200req_cu_ptr = start_match - 1;
3201anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3202  (re->overall_options & PCRE2_ANCHORED) != 0;
3203
3204/* The "must be at the start of a line" flags are used in a loop when finding
3205where to start. */
3206
3207startline = (re->flags & PCRE2_STARTLINE) != 0;
3208firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;
3209bumpalong_limit = end_subject;
3210
3211/* Get data from the match context, if present, and fill in the fields in the
3212match block. It is an error to set an offset limit without setting the flag at
3213compile time. */
3214
3215if (mcontext == NULL)
3216  {
3217  mb->callout = NULL;
3218  mb->memctl = re->memctl;
3219  }
3220else
3221  {
3222  if (mcontext->offset_limit != PCRE2_UNSET)
3223    {
3224    if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3225      return PCRE2_ERROR_BADOFFSETLIMIT;
3226    bumpalong_limit = subject + mcontext->offset_limit;
3227    }
3228  mb->callout = mcontext->callout;
3229  mb->callout_data = mcontext->callout_data;
3230  mb->memctl = mcontext->memctl;
3231  }
3232
3233mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3234  re->name_count * re->name_entry_size;
3235mb->tables = re->tables;
3236mb->start_subject = subject;
3237mb->end_subject = end_subject;
3238mb->start_offset = start_offset;
3239mb->moptions = options;
3240mb->poptions = re->overall_options;
3241
3242/* Process the \R and newline settings. */
3243
3244mb->bsr_convention = re->bsr_convention;
3245mb->nltype = NLTYPE_FIXED;
3246switch(re->newline_convention)
3247  {
3248  case PCRE2_NEWLINE_CR:
3249  mb->nllen = 1;
3250  mb->nl[0] = CHAR_CR;
3251  break;
3252
3253  case PCRE2_NEWLINE_LF:
3254  mb->nllen = 1;
3255  mb->nl[0] = CHAR_NL;
3256  break;
3257
3258  case PCRE2_NEWLINE_CRLF:
3259  mb->nllen = 2;
3260  mb->nl[0] = CHAR_CR;
3261  mb->nl[1] = CHAR_NL;
3262  break;
3263
3264  case PCRE2_NEWLINE_ANY:
3265  mb->nltype = NLTYPE_ANY;
3266  break;
3267
3268  case PCRE2_NEWLINE_ANYCRLF:
3269  mb->nltype = NLTYPE_ANYCRLF;
3270  break;
3271
3272  default: return PCRE2_ERROR_INTERNAL;
3273  }
3274
3275/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3276we must also check that a starting offset does not point into the middle of a
3277multiunit character. We check only the portion of the subject that is going to
3278be inspected during matching - from the offset minus the maximum lookbehind
3279to the given length. This saves time when a small part of a large subject is
3280being matched by the use of a starting offset. Note that the maximum lookbehind
3281is a number of characters, not code units. */
3282
3283#ifdef SUPPORT_UNICODE
3284if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3285  {
3286  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */
3287
3288  if (start_offset > 0)
3289    {
3290#if PCRE2_CODE_UNIT_WIDTH != 32
3291    unsigned int i;
3292    if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3293      return PCRE2_ERROR_BADUTFOFFSET;
3294    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3295      {
3296      check_subject--;
3297      while (check_subject > subject &&
3298#if PCRE2_CODE_UNIT_WIDTH == 8
3299      (*check_subject & 0xc0) == 0x80)
3300#else  /* 16-bit */
3301      (*check_subject & 0xfc00) == 0xdc00)
3302#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3303        check_subject--;
3304      }
3305#else   /* In the 32-bit library, one code unit equals one character. */
3306    check_subject -= re->max_lookbehind;
3307    if (check_subject < subject) check_subject = subject;
3308#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */
3309    }
3310
3311  /* Validate the relevant portion of the subject. After an error, adjust the
3312  offset to be an absolute offset in the whole string. */
3313
3314  match_data->rc = PRIV(valid_utf)(check_subject,
3315    length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3316  if (match_data->rc != 0)
3317    {
3318    match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3319    return match_data->rc;
3320    }
3321  }
3322#endif  /* SUPPORT_UNICODE */
3323
3324/* Set up the first code unit to match, if available. The first_codeunit value
3325is never set for an anchored regular expression, but the anchoring may be
3326forced at run time, so we have to test for anchoring. The first code unit may
3327be unset for an unanchored pattern, of course. If there's no first code unit
3328there may be a bitmap of possible first characters. */
3329
3330if (!anchored)
3331  {
3332  if ((re->flags & PCRE2_FIRSTSET) != 0)
3333    {
3334    has_first_cu = TRUE;
3335    first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3336    if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3337      {
3338      first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3339#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3340      if (utf && first_cu > 127)
3341        first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3342#endif
3343      }
3344    }
3345  else
3346    if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3347      start_bits = re->start_bitmap;
3348  }
3349
3350/* For anchored or unanchored matches, there may be a "last known required
3351character" set. */
3352
3353if ((re->flags & PCRE2_LASTSET) != 0)
3354  {
3355  has_req_cu = TRUE;
3356  req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3357  if ((re->flags & PCRE2_LASTCASELESS) != 0)
3358    {
3359    req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3360#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
3361    if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3362#endif
3363    }
3364  }
3365
3366/* Fill in fields that are always returned in the match data. */
3367
3368match_data->code = re;
3369match_data->subject = subject;
3370match_data->mark = NULL;
3371match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3372
3373/* Call the main matching function, looping for a non-anchored regex after a
3374failed match. If not restarting, perform certain optimizations at the start of
3375a match. */
3376
3377for (;;)
3378  {
3379  int rc;
3380
3381  /* ----------------- Start of match optimizations ---------------- */
3382
3383  /* There are some optimizations that avoid running the match if a known
3384  starting point is not found, or if a known later code unit is not present.
3385  However, there is an option (settable at compile time) that disables
3386  these, for testing and for ensuring that all callouts do actually occur.
3387  The optimizations must also be avoided when restarting a DFA match. */
3388
3389  if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3390      (options & PCRE2_DFA_RESTART) == 0)
3391    {
3392    PCRE2_SPTR save_end_subject = end_subject;
3393
3394    /* If firstline is TRUE, the start of the match is constrained to the first
3395    line of a multiline string. That is, the match must be before or at the
3396    first newline. Implement this by temporarily adjusting end_subject so that
3397    we stop the optimization scans at a newline. If the match fails at the
3398    newline, later code breaks this loop. */
3399
3400    if (firstline)
3401      {
3402      PCRE2_SPTR t = start_match;
3403#ifdef SUPPORT_UNICODE
3404      if (utf)
3405        {
3406        while (t < mb->end_subject && !IS_NEWLINE(t))
3407          {
3408          t++;
3409          ACROSSCHAR(t < end_subject, *t, t++);
3410          }
3411        }
3412      else
3413#endif
3414      while (t < mb->end_subject && !IS_NEWLINE(t)) t++;
3415      end_subject = t;
3416      }
3417
3418    /* Advance to a unique first code unit if there is one. */
3419
3420    if (has_first_cu)
3421      {
3422      PCRE2_UCHAR smc;
3423      if (first_cu != first_cu2)
3424        while (start_match < end_subject &&
3425          (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2)
3426          start_match++;
3427      else
3428        while (start_match < end_subject && UCHAR21TEST(start_match) != first_cu)
3429          start_match++;
3430      }
3431
3432    /* Or to just after a linebreak for a multiline match */
3433
3434    else if (startline)
3435      {
3436      if (start_match > mb->start_subject + start_offset)
3437        {
3438#ifdef SUPPORT_UNICODE
3439        if (utf)
3440          {
3441          while (start_match < end_subject && !WAS_NEWLINE(start_match))
3442            {
3443            start_match++;
3444            ACROSSCHAR(start_match < end_subject, *start_match,
3445              start_match++);
3446            }
3447          }
3448        else
3449#endif
3450        while (start_match < end_subject && !WAS_NEWLINE(start_match))
3451          start_match++;
3452
3453        /* If we have just passed a CR and the newline option is ANY or
3454        ANYCRLF, and we are now at a LF, advance the match position by one more
3455        code unit. */
3456
3457        if (start_match[-1] == CHAR_CR &&
3458             (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3459             start_match < end_subject &&
3460             UCHAR21TEST(start_match) == CHAR_NL)
3461          start_match++;
3462        }
3463      }
3464
3465    /* Or to a non-unique first code unit if any have been identified. The
3466    bitmap contains only 256 bits. When code units are 16 or 32 bits wide, all
3467    code units greater than 254 set the 255 bit. */
3468
3469    else if (start_bits != NULL)
3470      {
3471      while (start_match < end_subject)
3472        {
3473        register uint32_t c = UCHAR21TEST(start_match);
3474#if PCRE2_CODE_UNIT_WIDTH != 8
3475        if (c > 255) c = 255;
3476#endif
3477        if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3478        start_match++;
3479        }
3480      }
3481
3482    /* Restore fudged end_subject */
3483
3484    end_subject = save_end_subject;
3485
3486    /* The following two optimizations are disabled for partial matching. */
3487
3488    if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0)
3489      {
3490      /* The minimum matching length is only a lower bound; there may be no
3491      string of exactly that length that matches the pattern. Although the
3492      value is, strictly, in characters, we treat it as code units to avoid
3493      spending too much time in this optimization. */
3494
3495      if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH;
3496
3497      /* If req_cu is set, we know that that code unit must appear in the
3498      subject for the match to succeed. If the first code unit is set, req_cu
3499      must be later in the subject; otherwise the test starts at the match
3500      point. This optimization can save a huge amount of backtracking in
3501      patterns with nested unlimited repeats that aren't going to match.
3502      Writing separate code for cased/caseless versions makes it go faster, as
3503      does using an autoincrement and backing off on a match.
3504
3505      HOWEVER: when the subject string is very, very long, searching to its end
3506      can take a long time, and give bad performance on quite ordinary
3507      patterns. This showed up when somebody was matching something like
3508      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3509      sufficiently long. */
3510
3511      if (has_req_cu && end_subject - start_match < REQ_CU_MAX)
3512        {
3513        register PCRE2_SPTR p = start_match + (has_first_cu? 1:0);
3514
3515        /* We don't need to repeat the search if we haven't yet reached the
3516        place we found it at last time. */
3517
3518        if (p > req_cu_ptr)
3519          {
3520          if (req_cu != req_cu2)
3521            {
3522            while (p < end_subject)
3523              {
3524              register uint32_t pp = UCHAR21INCTEST(p);
3525              if (pp == req_cu || pp == req_cu2) { p--; break; }
3526              }
3527            }
3528          else
3529            {
3530            while (p < end_subject)
3531              {
3532              if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3533              }
3534            }
3535
3536          /* If we can't find the required code unit, break the matching loop,
3537          forcing a match failure. */
3538
3539          if (p >= end_subject) break;
3540
3541          /* If we have found the required code unit, save the point where we
3542          found it, so that we don't search again next time round the loop if
3543          the start hasn't passed this code unit yet. */
3544
3545          req_cu_ptr = p;
3546          }
3547        }
3548      }
3549    }
3550
3551  /* ------------ End of start of match optimizations ------------ */
3552
3553  /* Give no match if we have passed the bumpalong limit. */
3554
3555  if (start_match > bumpalong_limit) break;
3556
3557  /* OK, now we can do the business */
3558
3559  mb->start_used_ptr = start_match;
3560  mb->last_used_ptr = start_match;
3561  mb->recursive = NULL;
3562
3563  rc = internal_dfa_match(
3564    mb,                           /* fixed match data */
3565    mb->start_code,               /* this subexpression's code */
3566    start_match,                  /* where we currently are */
3567    start_offset,                 /* start offset in subject */
3568    match_data->ovector,          /* offset vector */
3569    (uint32_t)match_data->oveccount * 2,  /* actual size of same */
3570    workspace,                    /* workspace vector */
3571    (int)wscount,                 /* size of same */
3572    0);                           /* function recurse level */
3573
3574  /* Anything other than "no match" means we are done, always; otherwise, carry
3575  on only if not anchored. */
3576
3577  if (rc != PCRE2_ERROR_NOMATCH || anchored)
3578    {
3579    if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
3580      {
3581      match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
3582      match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
3583      }
3584    match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
3585    match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
3586    match_data->startchar = (PCRE2_SIZE)(start_match - subject);
3587    match_data->rc = rc;
3588    return rc;
3589    }
3590
3591  /* Advance to the next subject character unless we are at the end of a line
3592  and firstline is set. */
3593
3594  if (firstline && IS_NEWLINE(start_match)) break;
3595  start_match++;
3596#ifdef SUPPORT_UNICODE
3597  if (utf)
3598    {
3599    ACROSSCHAR(start_match < end_subject, *start_match,
3600      start_match++);
3601    }
3602#endif
3603  if (start_match > end_subject) break;
3604
3605  /* If we have just passed a CR and we are now at a LF, and the pattern does
3606  not contain any explicit matches for \r or \n, and the newline option is CRLF
3607  or ANY or ANYCRLF, advance the match position by one more character. */
3608
3609  if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
3610      start_match < end_subject &&
3611      UCHAR21TEST(start_match) == CHAR_NL &&
3612      (re->flags & PCRE2_HASCRORLF) == 0 &&
3613        (mb->nltype == NLTYPE_ANY ||
3614         mb->nltype == NLTYPE_ANYCRLF ||
3615         mb->nllen == 2))
3616    start_match++;
3617
3618  }   /* "Bumpalong" loop */
3619
3620
3621return PCRE2_ERROR_NOMATCH;
3622}
3623
3624/* End of pcre2_dfa_match.c */
3625