1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9     Original API code Copyright (c) 1997-2012 University of Cambridge
10         New API code Copyright (c) 2016 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains functions for scanning a compiled pattern and
42collecting data (e.g. minimum matching length). */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49
50#include "pcre2_internal.h"
51
52
53/* Set a bit in the starting code unit bit map. */
54
55#define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
56
57/* Returns from set_start_bits() */
58
59enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
60
61
62/*************************************************
63*   Find the minimum subject length for a group  *
64*************************************************/
65
66/* Scan a parenthesized group and compute the minimum length of subject that
67is needed to match it. This is a lower bound; it does not mean there is a
68string of that length that matches. In UTF mode, the result is in characters
69rather than code units. The field in a compiled pattern for storing the minimum
70length is 16-bits long (on the grounds that anything longer than that is
71pathological), so we give up when we reach that amount. This also means that
72integer overflow for really crazy patterns cannot happen.
73
74Arguments:
75  re              compiled pattern block
76  code            pointer to start of group (the bracket)
77  startcode       pointer to start of the whole pattern's code
78  utf             UTF flag
79  recurses        chain of recurse_check to catch mutual recursion
80  countptr        pointer to call count (to catch over complexity)
81
82Returns:   the minimum length
83           -1 \C in UTF-8 mode
84              or (*ACCEPT)
85              or pattern too complicated
86              or back reference to duplicate name/number
87           -2 internal error (missing capturing bracket)
88           -3 internal error (opcode not listed)
89*/
90
91static int
92find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
93  PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
94{
95int length = -1;
96int prev_cap_recno = -1;
97int prev_cap_d = 0;
98int prev_recurse_recno = -1;
99int prev_recurse_d = 0;
100uint32_t once_fudge = 0;
101BOOL had_recurse = FALSE;
102BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
103recurse_check this_recurse;
104register int branchlength = 0;
105register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
106
107/* If this is a "could be empty" group, its minimum length is 0. */
108
109if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
110
111/* Skip over capturing bracket number */
112
113if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
114
115/* A large and/or complex regex can take too long to process. */
116
117if ((*countptr)++ > 1000) return -1;
118
119/* Scan along the opcodes for this branch. If we get to the end of the branch,
120check the length against that of the other branches. If the accumulated length
121passes 16-bits, stop. */
122
123for (;;)
124  {
125  int d, min, recno;
126  PCRE2_UCHAR *cs, *ce;
127  register PCRE2_UCHAR op = *cc;
128
129  if (branchlength >= UINT16_MAX) return UINT16_MAX;
130
131  switch (op)
132    {
133    case OP_COND:
134    case OP_SCOND:
135
136    /* If there is only one branch in a condition, the implied branch has zero
137    length, so we don't add anything. This covers the DEFINE "condition"
138    automatically. If there are two branches we can treat it the same as any
139    other non-capturing subpattern. */
140
141    cs = cc + GET(cc, 1);
142    if (*cs != OP_ALT)
143      {
144      cc = cs + 1 + LINK_SIZE;
145      break;
146      }
147    goto PROCESS_NON_CAPTURE;
148
149    /* There's a special case of OP_ONCE, when it is wrapped round an
150    OP_RECURSE. We'd like to process the latter at this level so that
151    remembering the value works for repeated cases. So we do nothing, but
152    set a fudge value to skip over the OP_KET after the recurse. */
153
154    case OP_ONCE:
155    if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
156      {
157      once_fudge = 1 + LINK_SIZE;
158      cc += 1 + LINK_SIZE;
159      break;
160      }
161    /* Fall through */
162
163    case OP_ONCE_NC:
164    case OP_BRA:
165    case OP_SBRA:
166    case OP_BRAPOS:
167    case OP_SBRAPOS:
168    PROCESS_NON_CAPTURE:
169    d = find_minlength(re, cc, startcode, utf, recurses, countptr);
170    if (d < 0) return d;
171    branchlength += d;
172    do cc += GET(cc, 1); while (*cc == OP_ALT);
173    cc += 1 + LINK_SIZE;
174    break;
175
176    /* To save time for repeated capturing subpatterns, we remember the
177    length of the previous one. Unfortunately we can't do the same for
178    the unnumbered ones above. Nor can we do this if (?| is present in the
179    pattern because captures with the same number are not then identical. */
180
181    case OP_CBRA:
182    case OP_SCBRA:
183    case OP_CBRAPOS:
184    case OP_SCBRAPOS:
185    recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
186    if (recno != prev_cap_recno)
187      {
188      prev_cap_recno = recno;
189      prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
190      if (prev_cap_d < 0) return prev_cap_d;
191      }
192    branchlength += prev_cap_d;
193    do cc += GET(cc, 1); while (*cc == OP_ALT);
194    cc += 1 + LINK_SIZE;
195    break;
196
197    /* ACCEPT makes things far too complicated; we have to give up. */
198
199    case OP_ACCEPT:
200    case OP_ASSERT_ACCEPT:
201    return -1;
202
203    /* Reached end of a branch; if it's a ket it is the end of a nested
204    call. If it's ALT it is an alternation in a nested call. If it is END it's
205    the end of the outer call. All can be handled by the same code. If an
206    ACCEPT was previously encountered, use the length that was in force at that
207    time, and pass back the shortest ACCEPT length. */
208
209    case OP_ALT:
210    case OP_KET:
211    case OP_KETRMAX:
212    case OP_KETRMIN:
213    case OP_KETRPOS:
214    case OP_END:
215    if (length < 0 || (!had_recurse && branchlength < length))
216      length = branchlength;
217    if (op != OP_ALT) return length;
218    cc += 1 + LINK_SIZE;
219    branchlength = 0;
220    had_recurse = FALSE;
221    break;
222
223    /* Skip over assertive subpatterns */
224
225    case OP_ASSERT:
226    case OP_ASSERT_NOT:
227    case OP_ASSERTBACK:
228    case OP_ASSERTBACK_NOT:
229    do cc += GET(cc, 1); while (*cc == OP_ALT);
230    /* Fall through */
231
232    /* Skip over things that don't match chars */
233
234    case OP_REVERSE:
235    case OP_CREF:
236    case OP_DNCREF:
237    case OP_RREF:
238    case OP_DNRREF:
239    case OP_FALSE:
240    case OP_TRUE:
241    case OP_CALLOUT:
242    case OP_SOD:
243    case OP_SOM:
244    case OP_EOD:
245    case OP_EODN:
246    case OP_CIRC:
247    case OP_CIRCM:
248    case OP_DOLL:
249    case OP_DOLLM:
250    case OP_NOT_WORD_BOUNDARY:
251    case OP_WORD_BOUNDARY:
252    cc += PRIV(OP_lengths)[*cc];
253    break;
254
255    case OP_CALLOUT_STR:
256    cc += GET(cc, 1 + 2*LINK_SIZE);
257    break;
258
259    /* Skip over a subpattern that has a {0} or {0,x} quantifier */
260
261    case OP_BRAZERO:
262    case OP_BRAMINZERO:
263    case OP_BRAPOSZERO:
264    case OP_SKIPZERO:
265    cc += PRIV(OP_lengths)[*cc];
266    do cc += GET(cc, 1); while (*cc == OP_ALT);
267    cc += 1 + LINK_SIZE;
268    break;
269
270    /* Handle literal characters and + repetitions */
271
272    case OP_CHAR:
273    case OP_CHARI:
274    case OP_NOT:
275    case OP_NOTI:
276    case OP_PLUS:
277    case OP_PLUSI:
278    case OP_MINPLUS:
279    case OP_MINPLUSI:
280    case OP_POSPLUS:
281    case OP_POSPLUSI:
282    case OP_NOTPLUS:
283    case OP_NOTPLUSI:
284    case OP_NOTMINPLUS:
285    case OP_NOTMINPLUSI:
286    case OP_NOTPOSPLUS:
287    case OP_NOTPOSPLUSI:
288    branchlength++;
289    cc += 2;
290#ifdef SUPPORT_UNICODE
291    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
292#endif
293    break;
294
295    case OP_TYPEPLUS:
296    case OP_TYPEMINPLUS:
297    case OP_TYPEPOSPLUS:
298    branchlength++;
299    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
300    break;
301
302    /* Handle exact repetitions. The count is already in characters, but we
303    may need to skip over a multibyte character in UTF mode.  */
304
305    case OP_EXACT:
306    case OP_EXACTI:
307    case OP_NOTEXACT:
308    case OP_NOTEXACTI:
309    branchlength += GET2(cc,1);
310    cc += 2 + IMM2_SIZE;
311#ifdef SUPPORT_UNICODE
312    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
313#endif
314    break;
315
316    case OP_TYPEEXACT:
317    branchlength += GET2(cc,1);
318    cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
319      || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
320    break;
321
322    /* Handle single-char non-literal matchers */
323
324    case OP_PROP:
325    case OP_NOTPROP:
326    cc += 2;
327    /* Fall through */
328
329    case OP_NOT_DIGIT:
330    case OP_DIGIT:
331    case OP_NOT_WHITESPACE:
332    case OP_WHITESPACE:
333    case OP_NOT_WORDCHAR:
334    case OP_WORDCHAR:
335    case OP_ANY:
336    case OP_ALLANY:
337    case OP_EXTUNI:
338    case OP_HSPACE:
339    case OP_NOT_HSPACE:
340    case OP_VSPACE:
341    case OP_NOT_VSPACE:
342    branchlength++;
343    cc++;
344    break;
345
346    /* "Any newline" might match two characters, but it also might match just
347    one. */
348
349    case OP_ANYNL:
350    branchlength += 1;
351    cc++;
352    break;
353
354    /* The single-byte matcher means we can't proceed in UTF mode. (In
355    non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
356    appear, but leave the code, just in case.) */
357
358    case OP_ANYBYTE:
359#ifdef SUPPORT_UNICODE
360    if (utf) return -1;
361#endif
362    branchlength++;
363    cc++;
364    break;
365
366    /* For repeated character types, we have to test for \p and \P, which have
367    an extra two bytes of parameters. */
368
369    case OP_TYPESTAR:
370    case OP_TYPEMINSTAR:
371    case OP_TYPEQUERY:
372    case OP_TYPEMINQUERY:
373    case OP_TYPEPOSSTAR:
374    case OP_TYPEPOSQUERY:
375    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
376    cc += PRIV(OP_lengths)[op];
377    break;
378
379    case OP_TYPEUPTO:
380    case OP_TYPEMINUPTO:
381    case OP_TYPEPOSUPTO:
382    if (cc[1 + IMM2_SIZE] == OP_PROP
383      || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
384    cc += PRIV(OP_lengths)[op];
385    break;
386
387    /* Check a class for variable quantification */
388
389    case OP_CLASS:
390    case OP_NCLASS:
391#ifdef SUPPORT_WIDE_CHARS
392    case OP_XCLASS:
393    /* The original code caused an unsigned overflow in 64 bit systems,
394    so now we use a conditional statement. */
395    if (op == OP_XCLASS)
396      cc += GET(cc, 1);
397    else
398      cc += PRIV(OP_lengths)[OP_CLASS];
399#else
400    cc += PRIV(OP_lengths)[OP_CLASS];
401#endif
402
403    switch (*cc)
404      {
405      case OP_CRPLUS:
406      case OP_CRMINPLUS:
407      case OP_CRPOSPLUS:
408      branchlength++;
409      /* Fall through */
410
411      case OP_CRSTAR:
412      case OP_CRMINSTAR:
413      case OP_CRQUERY:
414      case OP_CRMINQUERY:
415      case OP_CRPOSSTAR:
416      case OP_CRPOSQUERY:
417      cc++;
418      break;
419
420      case OP_CRRANGE:
421      case OP_CRMINRANGE:
422      case OP_CRPOSRANGE:
423      branchlength += GET2(cc,1);
424      cc += 1 + 2 * IMM2_SIZE;
425      break;
426
427      default:
428      branchlength++;
429      break;
430      }
431    break;
432
433    /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
434    way: we find the minimum length for the subpattern. A recursion
435    (backreference or subroutine) causes an a flag to be set that causes the
436    length of this branch to be ignored. The logic is that a recursion can only
437    make sense if there is another alternative that stops the recursing. That
438    will provide the minimum length (when no recursion happens).
439
440    If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
441    matches an empty string (by default it causes a matching failure), so in
442    that case we must set the minimum length to zero. */
443
444    /* Duplicate named pattern back reference. We cannot reliably find a length
445    for this if duplicate numbers are present in the pattern. */
446
447    case OP_DNREF:
448    case OP_DNREFI:
449    if (dupcapused) return -1;
450    if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
451      {
452      int count = GET2(cc, 1+IMM2_SIZE);
453      PCRE2_UCHAR *slot =
454        (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
455          GET2(cc, 1) * re->name_entry_size;
456
457      d = INT_MAX;
458
459      /* Scan all groups with the same name */
460
461      while (count-- > 0)
462        {
463        ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
464        if (cs == NULL) return -2;
465        do ce += GET(ce, 1); while (*ce == OP_ALT);
466        if (cc > cs && cc < ce)    /* Simple recursion */
467          {
468          d = 0;
469          had_recurse = TRUE;
470          break;
471          }
472        else
473          {
474          recurse_check *r = recurses;
475          for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
476          if (r != NULL)           /* Mutual recursion */
477            {
478            d = 0;
479            had_recurse = TRUE;
480            break;
481            }
482          else
483            {
484            int dd;
485            this_recurse.prev = recurses;
486            this_recurse.group = cs;
487            dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
488            if (dd < d) d = dd;
489            }
490          }
491        slot += re->name_entry_size;
492        }
493      }
494    else d = 0;
495    cc += 1 + 2*IMM2_SIZE;
496    goto REPEAT_BACK_REFERENCE;
497
498    /* Single back reference. We cannot find a length for this if duplicate
499    numbers are present in the pattern. */
500
501    case OP_REF:
502    case OP_REFI:
503    if (dupcapused) return -1;
504    if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
505      {
506      ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
507      if (cs == NULL) return -2;
508      do ce += GET(ce, 1); while (*ce == OP_ALT);
509      if (cc > cs && cc < ce)    /* Simple recursion */
510        {
511        d = 0;
512        had_recurse = TRUE;
513        }
514      else
515        {
516        recurse_check *r = recurses;
517        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
518        if (r != NULL)           /* Mutual recursion */
519          {
520          d = 0;
521          had_recurse = TRUE;
522          }
523        else
524          {
525          this_recurse.prev = recurses;
526          this_recurse.group = cs;
527          d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
528          }
529        }
530      }
531    else d = 0;
532    cc += 1 + IMM2_SIZE;
533
534    /* Handle repeated back references */
535
536    REPEAT_BACK_REFERENCE:
537    switch (*cc)
538      {
539      case OP_CRSTAR:
540      case OP_CRMINSTAR:
541      case OP_CRQUERY:
542      case OP_CRMINQUERY:
543      case OP_CRPOSSTAR:
544      case OP_CRPOSQUERY:
545      min = 0;
546      cc++;
547      break;
548
549      case OP_CRPLUS:
550      case OP_CRMINPLUS:
551      case OP_CRPOSPLUS:
552      min = 1;
553      cc++;
554      break;
555
556      case OP_CRRANGE:
557      case OP_CRMINRANGE:
558      case OP_CRPOSRANGE:
559      min = GET2(cc, 1);
560      cc += 1 + 2 * IMM2_SIZE;
561      break;
562
563      default:
564      min = 1;
565      break;
566      }
567
568     /* Take care not to overflow: (1) min and d are ints, so check that their
569     product is not greater than INT_MAX. (2) branchlength is limited to
570     UINT16_MAX (checked at the top of the loop). */
571
572    if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
573      branchlength = UINT16_MAX;
574    else branchlength += min * d;
575    break;
576
577    /* Recursion always refers to the first occurrence of a subpattern with a
578    given number. Therefore, we can always make use of caching, even when the
579    pattern contains multiple subpatterns with the same number. */
580
581    case OP_RECURSE:
582    cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
583    recno = GET2(cs, 1+LINK_SIZE);
584    if (recno == prev_recurse_recno)
585      {
586      branchlength += prev_recurse_d;
587      }
588    else
589      {
590      do ce += GET(ce, 1); while (*ce == OP_ALT);
591      if (cc > cs && cc < ce)    /* Simple recursion */
592        had_recurse = TRUE;
593      else
594        {
595        recurse_check *r = recurses;
596        for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
597        if (r != NULL)          /* Mutual recursion */
598          had_recurse = TRUE;
599        else
600          {
601          this_recurse.prev = recurses;
602          this_recurse.group = cs;
603          prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
604            countptr);
605          if (prev_recurse_d < 0) return prev_recurse_d;
606          prev_recurse_recno = recno;
607          branchlength += prev_recurse_d;
608          }
609        }
610      }
611    cc += 1 + LINK_SIZE + once_fudge;
612    once_fudge = 0;
613    break;
614
615    /* Anything else does not or need not match a character. We can get the
616    item's length from the table, but for those that can match zero occurrences
617    of a character, we must take special action for UTF-8 characters. As it
618    happens, the "NOT" versions of these opcodes are used at present only for
619    ASCII characters, so they could be omitted from this list. However, in
620    future that may change, so we include them here so as not to leave a
621    gotcha for a future maintainer. */
622
623    case OP_UPTO:
624    case OP_UPTOI:
625    case OP_NOTUPTO:
626    case OP_NOTUPTOI:
627    case OP_MINUPTO:
628    case OP_MINUPTOI:
629    case OP_NOTMINUPTO:
630    case OP_NOTMINUPTOI:
631    case OP_POSUPTO:
632    case OP_POSUPTOI:
633    case OP_NOTPOSUPTO:
634    case OP_NOTPOSUPTOI:
635
636    case OP_STAR:
637    case OP_STARI:
638    case OP_NOTSTAR:
639    case OP_NOTSTARI:
640    case OP_MINSTAR:
641    case OP_MINSTARI:
642    case OP_NOTMINSTAR:
643    case OP_NOTMINSTARI:
644    case OP_POSSTAR:
645    case OP_POSSTARI:
646    case OP_NOTPOSSTAR:
647    case OP_NOTPOSSTARI:
648
649    case OP_QUERY:
650    case OP_QUERYI:
651    case OP_NOTQUERY:
652    case OP_NOTQUERYI:
653    case OP_MINQUERY:
654    case OP_MINQUERYI:
655    case OP_NOTMINQUERY:
656    case OP_NOTMINQUERYI:
657    case OP_POSQUERY:
658    case OP_POSQUERYI:
659    case OP_NOTPOSQUERY:
660    case OP_NOTPOSQUERYI:
661
662    cc += PRIV(OP_lengths)[op];
663#ifdef SUPPORT_UNICODE
664    if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
665#endif
666    break;
667
668    /* Skip these, but we need to add in the name length. */
669
670    case OP_MARK:
671    case OP_PRUNE_ARG:
672    case OP_SKIP_ARG:
673    case OP_THEN_ARG:
674    cc += PRIV(OP_lengths)[op] + cc[1];
675    break;
676
677    /* The remaining opcodes are just skipped over. */
678
679    case OP_CLOSE:
680    case OP_COMMIT:
681    case OP_FAIL:
682    case OP_PRUNE:
683    case OP_SET_SOM:
684    case OP_SKIP:
685    case OP_THEN:
686    cc += PRIV(OP_lengths)[op];
687    break;
688
689    /* This should not occur: we list all opcodes explicitly so that when
690    new ones get added they are properly considered. */
691
692    default:
693    return -3;
694    }
695  }
696/* Control never gets here */
697}
698
699
700
701/*************************************************
702*      Set a bit and maybe its alternate case    *
703*************************************************/
704
705/* Given a character, set its first code unit's bit in the table, and also the
706corresponding bit for the other version of a letter if we are caseless.
707
708Arguments:
709  re            points to the regex block
710  p             points to the first code unit of the character
711  caseless      TRUE if caseless
712  utf           TRUE for UTF mode
713
714Returns:        pointer after the character
715*/
716
717static PCRE2_SPTR
718set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
719{
720uint32_t c = *p++;   /* First code unit */
721(void)utf;           /* Stop compiler warning when UTF not supported */
722
723/* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
7240xff. */
725
726#if PCRE2_CODE_UNIT_WIDTH != 8
727if (c > 0xff) SET_BIT(0xff); else
728#endif
729
730SET_BIT(c);
731
732/* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
733the end of the character, even when caseless. */
734
735#ifdef SUPPORT_UNICODE
736if (utf)
737  {
738#if PCRE2_CODE_UNIT_WIDTH == 8
739  if (c >= 0xc0) GETUTF8INC(c, p);
740#elif PCRE2_CODE_UNIT_WIDTH == 16
741  if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
742#endif
743  }
744#endif  /* SUPPORT_UNICODE */
745
746/* If caseless, handle the other case of the character. */
747
748if (caseless)
749  {
750  if (utf)
751    {
752#if PCRE2_CODE_UNIT_WIDTH == 8
753    PCRE2_UCHAR buff[6];
754    c = UCD_OTHERCASE(c);
755    (void)PRIV(ord2utf)(c, buff);
756    SET_BIT(buff[0]);
757#else  /* 16-bit or 32-bit mode */
758    c = UCD_OTHERCASE(c);
759    if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
760#endif
761    }
762
763  /* Not UTF */
764
765  else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
766  }
767
768return p;
769}
770
771
772
773/*************************************************
774*     Set bits for a positive character type     *
775*************************************************/
776
777/* This function sets starting bits for a character type. In UTF-8 mode, we can
778only do a direct setting for bytes less than 128, as otherwise there can be
779confusion with bytes in the middle of UTF-8 characters. In a "traditional"
780environment, the tables will only recognize ASCII characters anyway, but in at
781least one Windows environment, some higher bytes bits were set in the tables.
782So we deal with that case by considering the UTF-8 encoding.
783
784Arguments:
785  re             the regex block
786  cbit type      the type of character wanted
787  table_limit    32 for non-UTF-8; 16 for UTF-8
788
789Returns:         nothing
790*/
791
792static void
793set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
794{
795register uint32_t c;
796for (c = 0; c < table_limit; c++)
797  re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
798#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
799if (table_limit == 32) return;
800for (c = 128; c < 256; c++)
801  {
802  if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
803    {
804    PCRE2_UCHAR buff[6];
805    (void)PRIV(ord2utf)(c, buff);
806    SET_BIT(buff[0]);
807    }
808  }
809#endif  /* UTF-8 */
810}
811
812
813/*************************************************
814*     Set bits for a negative character type     *
815*************************************************/
816
817/* This function sets starting bits for a negative character type such as \D.
818In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
819otherwise there can be confusion with bytes in the middle of UTF-8 characters.
820Unlike in the positive case, where we can set appropriate starting bits for
821specific high-valued UTF-8 characters, in this case we have to set the bits for
822all high-valued characters. The lowest is 0xc2, but we overkill by starting at
8230xc0 (192) for simplicity.
824
825Arguments:
826  re             the regex block
827  cbit type      the type of character wanted
828  table_limit    32 for non-UTF-8; 16 for UTF-8
829
830Returns:         nothing
831*/
832
833static void
834set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
835{
836register uint32_t c;
837for (c = 0; c < table_limit; c++)
838  re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
839#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
840if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
841#endif
842}
843
844
845
846/*************************************************
847*          Create bitmap of starting bytes       *
848*************************************************/
849
850/* This function scans a compiled unanchored expression recursively and
851attempts to build a bitmap of the set of possible starting code units whose
852values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
853the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
854we pass a value of 16 rather than 32 as the final argument. (See comments in
855those functions for the reason.)
856
857The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
858(a*)b where the group provides some optional starting code units but scanning
859must continue at the outer level to find at least one mandatory code unit. At
860the outermost level, this function fails unless the result is SSB_DONE.
861
862Arguments:
863  re           points to the compiled regex block
864  code         points to an expression
865  utf          TRUE if in UTF mode
866
867Returns:       SSB_FAIL     => Failed to find any starting code units
868               SSB_DONE     => Found mandatory starting code units
869               SSB_CONTINUE => Found optional starting code units
870               SSB_UNKNOWN  => Hit an unrecognized opcode
871*/
872
873static int
874set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
875{
876register uint32_t c;
877int yield = SSB_DONE;
878
879#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
880int table_limit = utf? 16:32;
881#else
882int table_limit = 32;
883#endif
884
885do
886  {
887  BOOL try_next = TRUE;
888  PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
889
890  if (*code == OP_CBRA || *code == OP_SCBRA ||
891      *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
892
893  while (try_next)    /* Loop for items in this branch */
894    {
895    int rc;
896    uint8_t *classmap = NULL;
897
898    switch(*tcode)
899      {
900      /* If we reach something we don't understand, it means a new opcode has
901      been created that hasn't been added to this function. Hopefully this
902      problem will be discovered during testing. */
903
904      default:
905      return SSB_UNKNOWN;
906
907      /* Fail for a valid opcode that implies no starting bits. */
908
909      case OP_ACCEPT:
910      case OP_ASSERT_ACCEPT:
911      case OP_ALLANY:
912      case OP_ANY:
913      case OP_ANYBYTE:
914      case OP_CIRC:
915      case OP_CIRCM:
916      case OP_CLOSE:
917      case OP_COMMIT:
918      case OP_COND:
919      case OP_CREF:
920      case OP_FALSE:
921      case OP_TRUE:
922      case OP_DNCREF:
923      case OP_DNREF:
924      case OP_DNREFI:
925      case OP_DNRREF:
926      case OP_DOLL:
927      case OP_DOLLM:
928      case OP_END:
929      case OP_EOD:
930      case OP_EODN:
931      case OP_EXTUNI:
932      case OP_FAIL:
933      case OP_MARK:
934      case OP_NOT:
935      case OP_NOTEXACT:
936      case OP_NOTEXACTI:
937      case OP_NOTI:
938      case OP_NOTMINPLUS:
939      case OP_NOTMINPLUSI:
940      case OP_NOTMINQUERY:
941      case OP_NOTMINQUERYI:
942      case OP_NOTMINSTAR:
943      case OP_NOTMINSTARI:
944      case OP_NOTMINUPTO:
945      case OP_NOTMINUPTOI:
946      case OP_NOTPLUS:
947      case OP_NOTPLUSI:
948      case OP_NOTPOSPLUS:
949      case OP_NOTPOSPLUSI:
950      case OP_NOTPOSQUERY:
951      case OP_NOTPOSQUERYI:
952      case OP_NOTPOSSTAR:
953      case OP_NOTPOSSTARI:
954      case OP_NOTPOSUPTO:
955      case OP_NOTPOSUPTOI:
956      case OP_NOTPROP:
957      case OP_NOTQUERY:
958      case OP_NOTQUERYI:
959      case OP_NOTSTAR:
960      case OP_NOTSTARI:
961      case OP_NOTUPTO:
962      case OP_NOTUPTOI:
963      case OP_NOT_HSPACE:
964      case OP_NOT_VSPACE:
965      case OP_PRUNE:
966      case OP_PRUNE_ARG:
967      case OP_RECURSE:
968      case OP_REF:
969      case OP_REFI:
970      case OP_REVERSE:
971      case OP_RREF:
972      case OP_SCOND:
973      case OP_SET_SOM:
974      case OP_SKIP:
975      case OP_SKIP_ARG:
976      case OP_SOD:
977      case OP_SOM:
978      case OP_THEN:
979      case OP_THEN_ARG:
980      return SSB_FAIL;
981
982      /* A "real" property test implies no starting bits, but the fake property
983      PT_CLIST identifies a list of characters. These lists are short, as they
984      are used for characters with more than one "other case", so there is no
985      point in recognizing them for OP_NOTPROP. */
986
987      case OP_PROP:
988      if (tcode[1] != PT_CLIST) return SSB_FAIL;
989        {
990        const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
991        while ((c = *p++) < NOTACHAR)
992          {
993#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
994          if (utf)
995            {
996            PCRE2_UCHAR buff[6];
997            (void)PRIV(ord2utf)(c, buff);
998            c = buff[0];
999            }
1000#endif
1001          if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1002          }
1003        }
1004      try_next = FALSE;
1005      break;
1006
1007      /* We can ignore word boundary tests. */
1008
1009      case OP_WORD_BOUNDARY:
1010      case OP_NOT_WORD_BOUNDARY:
1011      tcode++;
1012      break;
1013
1014      /* If we hit a bracket or a positive lookahead assertion, recurse to set
1015      bits from within the subpattern. If it can't find anything, we have to
1016      give up. If it finds some mandatory character(s), we are done for this
1017      branch. Otherwise, carry on scanning after the subpattern. */
1018
1019      case OP_BRA:
1020      case OP_SBRA:
1021      case OP_CBRA:
1022      case OP_SCBRA:
1023      case OP_BRAPOS:
1024      case OP_SBRAPOS:
1025      case OP_CBRAPOS:
1026      case OP_SCBRAPOS:
1027      case OP_ONCE:
1028      case OP_ONCE_NC:
1029      case OP_ASSERT:
1030      rc = set_start_bits(re, tcode, utf);
1031      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1032      if (rc == SSB_DONE) try_next = FALSE; else
1033        {
1034        do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1035        tcode += 1 + LINK_SIZE;
1036        }
1037      break;
1038
1039      /* If we hit ALT or KET, it means we haven't found anything mandatory in
1040      this branch, though we might have found something optional. For ALT, we
1041      continue with the next alternative, but we have to arrange that the final
1042      result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1043      return SSB_CONTINUE: if this is the top level, that indicates failure,
1044      but after a nested subpattern, it causes scanning to continue. */
1045
1046      case OP_ALT:
1047      yield = SSB_CONTINUE;
1048      try_next = FALSE;
1049      break;
1050
1051      case OP_KET:
1052      case OP_KETRMAX:
1053      case OP_KETRMIN:
1054      case OP_KETRPOS:
1055      return SSB_CONTINUE;
1056
1057      /* Skip over callout */
1058
1059      case OP_CALLOUT:
1060      tcode += PRIV(OP_lengths)[OP_CALLOUT];
1061      break;
1062
1063      case OP_CALLOUT_STR:
1064      tcode += GET(tcode, 1 + 2*LINK_SIZE);
1065      break;
1066
1067      /* Skip over lookbehind and negative lookahead assertions */
1068
1069      case OP_ASSERT_NOT:
1070      case OP_ASSERTBACK:
1071      case OP_ASSERTBACK_NOT:
1072      do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1073      tcode += 1 + LINK_SIZE;
1074      break;
1075
1076      /* BRAZERO does the bracket, but carries on. */
1077
1078      case OP_BRAZERO:
1079      case OP_BRAMINZERO:
1080      case OP_BRAPOSZERO:
1081      rc = set_start_bits(re, ++tcode, utf);
1082      if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1083      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1084      tcode += 1 + LINK_SIZE;
1085      break;
1086
1087      /* SKIPZERO skips the bracket. */
1088
1089      case OP_SKIPZERO:
1090      tcode++;
1091      do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1092      tcode += 1 + LINK_SIZE;
1093      break;
1094
1095      /* Single-char * or ? sets the bit and tries the next item */
1096
1097      case OP_STAR:
1098      case OP_MINSTAR:
1099      case OP_POSSTAR:
1100      case OP_QUERY:
1101      case OP_MINQUERY:
1102      case OP_POSQUERY:
1103      tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1104      break;
1105
1106      case OP_STARI:
1107      case OP_MINSTARI:
1108      case OP_POSSTARI:
1109      case OP_QUERYI:
1110      case OP_MINQUERYI:
1111      case OP_POSQUERYI:
1112      tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1113      break;
1114
1115      /* Single-char upto sets the bit and tries the next */
1116
1117      case OP_UPTO:
1118      case OP_MINUPTO:
1119      case OP_POSUPTO:
1120      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1121      break;
1122
1123      case OP_UPTOI:
1124      case OP_MINUPTOI:
1125      case OP_POSUPTOI:
1126      tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1127      break;
1128
1129      /* At least one single char sets the bit and stops */
1130
1131      case OP_EXACT:
1132      tcode += IMM2_SIZE;
1133      /* Fall through */
1134      case OP_CHAR:
1135      case OP_PLUS:
1136      case OP_MINPLUS:
1137      case OP_POSPLUS:
1138      (void)set_table_bit(re, tcode + 1, FALSE, utf);
1139      try_next = FALSE;
1140      break;
1141
1142      case OP_EXACTI:
1143      tcode += IMM2_SIZE;
1144      /* Fall through */
1145      case OP_CHARI:
1146      case OP_PLUSI:
1147      case OP_MINPLUSI:
1148      case OP_POSPLUSI:
1149      (void)set_table_bit(re, tcode + 1, TRUE, utf);
1150      try_next = FALSE;
1151      break;
1152
1153      /* Special spacing and line-terminating items. These recognize specific
1154      lists of characters. The difference between VSPACE and ANYNL is that the
1155      latter can match the two-character CRLF sequence, but that is not
1156      relevant for finding the first character, so their code here is
1157      identical. */
1158
1159      case OP_HSPACE:
1160      SET_BIT(CHAR_HT);
1161      SET_BIT(CHAR_SPACE);
1162
1163      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1164      the bits for 0xA0 and for code units >= 255, independently of UTF. */
1165
1166#if PCRE2_CODE_UNIT_WIDTH != 8
1167      SET_BIT(0xA0);
1168      SET_BIT(0xFF);
1169#else
1170      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1171      units of horizontal space characters. */
1172
1173#ifdef SUPPORT_UNICODE
1174      if (utf)
1175        {
1176        SET_BIT(0xC2);  /* For U+00A0 */
1177        SET_BIT(0xE1);  /* For U+1680, U+180E */
1178        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1179        SET_BIT(0xE3);  /* For U+3000 */
1180        }
1181      else
1182#endif
1183      /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1184      the code is EBCDIC. */
1185        {
1186#ifndef EBCDIC
1187        SET_BIT(0xA0);
1188#endif  /* Not EBCDIC */
1189        }
1190#endif  /* 8-bit support */
1191
1192      try_next = FALSE;
1193      break;
1194
1195      case OP_ANYNL:
1196      case OP_VSPACE:
1197      SET_BIT(CHAR_LF);
1198      SET_BIT(CHAR_VT);
1199      SET_BIT(CHAR_FF);
1200      SET_BIT(CHAR_CR);
1201
1202      /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1203      the bits for NEL and for code units >= 255, independently of UTF. */
1204
1205#if PCRE2_CODE_UNIT_WIDTH != 8
1206      SET_BIT(CHAR_NEL);
1207      SET_BIT(0xFF);
1208#else
1209      /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1210      units of vertical space characters. */
1211
1212#ifdef SUPPORT_UNICODE
1213      if (utf)
1214        {
1215        SET_BIT(0xC2);  /* For U+0085 (NEL) */
1216        SET_BIT(0xE2);  /* For U+2028, U+2029 */
1217        }
1218      else
1219#endif
1220      /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1221        {
1222        SET_BIT(CHAR_NEL);
1223        }
1224#endif  /* 8-bit support */
1225
1226      try_next = FALSE;
1227      break;
1228
1229      /* Single character types set the bits and stop. Note that if PCRE2_UCP
1230      is set, we do not see these op codes because \d etc are converted to
1231      properties. Therefore, these apply in the case when only characters less
1232      than 256 are recognized to match the types. */
1233
1234      case OP_NOT_DIGIT:
1235      set_nottype_bits(re, cbit_digit, table_limit);
1236      try_next = FALSE;
1237      break;
1238
1239      case OP_DIGIT:
1240      set_type_bits(re, cbit_digit, table_limit);
1241      try_next = FALSE;
1242      break;
1243
1244      case OP_NOT_WHITESPACE:
1245      set_nottype_bits(re, cbit_space, table_limit);
1246      try_next = FALSE;
1247      break;
1248
1249      case OP_WHITESPACE:
1250      set_type_bits(re, cbit_space, table_limit);
1251      try_next = FALSE;
1252      break;
1253
1254      case OP_NOT_WORDCHAR:
1255      set_nottype_bits(re, cbit_word, table_limit);
1256      try_next = FALSE;
1257      break;
1258
1259      case OP_WORDCHAR:
1260      set_type_bits(re, cbit_word, table_limit);
1261      try_next = FALSE;
1262      break;
1263
1264      /* One or more character type fudges the pointer and restarts, knowing
1265      it will hit a single character type and stop there. */
1266
1267      case OP_TYPEPLUS:
1268      case OP_TYPEMINPLUS:
1269      case OP_TYPEPOSPLUS:
1270      tcode++;
1271      break;
1272
1273      case OP_TYPEEXACT:
1274      tcode += 1 + IMM2_SIZE;
1275      break;
1276
1277      /* Zero or more repeats of character types set the bits and then
1278      try again. */
1279
1280      case OP_TYPEUPTO:
1281      case OP_TYPEMINUPTO:
1282      case OP_TYPEPOSUPTO:
1283      tcode += IMM2_SIZE;  /* Fall through */
1284
1285      case OP_TYPESTAR:
1286      case OP_TYPEMINSTAR:
1287      case OP_TYPEPOSSTAR:
1288      case OP_TYPEQUERY:
1289      case OP_TYPEMINQUERY:
1290      case OP_TYPEPOSQUERY:
1291      switch(tcode[1])
1292        {
1293        default:
1294        case OP_ANY:
1295        case OP_ALLANY:
1296        return SSB_FAIL;
1297
1298        case OP_HSPACE:
1299        SET_BIT(CHAR_HT);
1300        SET_BIT(CHAR_SPACE);
1301
1302        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1303        the bits for 0xA0 and for code units >= 255, independently of UTF. */
1304
1305#if PCRE2_CODE_UNIT_WIDTH != 8
1306        SET_BIT(0xA0);
1307        SET_BIT(0xFF);
1308#else
1309        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1310        units of horizontal space characters. */
1311
1312#ifdef SUPPORT_UNICODE
1313        if (utf)
1314          {
1315          SET_BIT(0xC2);  /* For U+00A0 */
1316          SET_BIT(0xE1);  /* For U+1680, U+180E */
1317          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1318          SET_BIT(0xE3);  /* For U+3000 */
1319          }
1320        else
1321#endif
1322        /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1323        the code is EBCDIC. */
1324          {
1325#ifndef EBCDIC
1326          SET_BIT(0xA0);
1327#endif  /* Not EBCDIC */
1328          }
1329#endif  /* 8-bit support */
1330        break;
1331
1332        case OP_ANYNL:
1333        case OP_VSPACE:
1334        SET_BIT(CHAR_LF);
1335        SET_BIT(CHAR_VT);
1336        SET_BIT(CHAR_FF);
1337        SET_BIT(CHAR_CR);
1338
1339        /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1340        the bits for NEL and for code units >= 255, independently of UTF. */
1341
1342#if PCRE2_CODE_UNIT_WIDTH != 8
1343        SET_BIT(CHAR_NEL);
1344        SET_BIT(0xFF);
1345#else
1346        /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1347        units of vertical space characters. */
1348
1349#ifdef SUPPORT_UNICODE
1350        if (utf)
1351          {
1352          SET_BIT(0xC2);  /* For U+0085 (NEL) */
1353          SET_BIT(0xE2);  /* For U+2028, U+2029 */
1354          }
1355        else
1356#endif
1357        /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1358          {
1359          SET_BIT(CHAR_NEL);
1360          }
1361#endif  /* 8-bit support */
1362        break;
1363
1364        case OP_NOT_DIGIT:
1365        set_nottype_bits(re, cbit_digit, table_limit);
1366        break;
1367
1368        case OP_DIGIT:
1369        set_type_bits(re, cbit_digit, table_limit);
1370        break;
1371
1372        case OP_NOT_WHITESPACE:
1373        set_nottype_bits(re, cbit_space, table_limit);
1374        break;
1375
1376        case OP_WHITESPACE:
1377        set_type_bits(re, cbit_space, table_limit);
1378        break;
1379
1380        case OP_NOT_WORDCHAR:
1381        set_nottype_bits(re, cbit_word, table_limit);
1382        break;
1383
1384        case OP_WORDCHAR:
1385        set_type_bits(re, cbit_word, table_limit);
1386        break;
1387        }
1388
1389      tcode += 2;
1390      break;
1391
1392      /* Extended class: if there are any property checks, or if this is a
1393      negative XCLASS without a map, give up. If there are no property checks,
1394      there must be wide characters on the XCLASS list, because otherwise an
1395      XCLASS would not have been created. This means that code points >= 255
1396      are always potential starters. */
1397
1398#ifdef SUPPORT_WIDE_CHARS
1399      case OP_XCLASS:
1400      if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1401          (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1402        return SSB_FAIL;
1403
1404      /* We have a positive XCLASS or a negative one without a map. Set up the
1405      map pointer if there is one, and fall through. */
1406
1407      classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1408        (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1409#endif
1410
1411      /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1412      in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1413      because it starts a character with a value > 255. In 8-bit non-UTF mode,
1414      there is no difference between CLASS and NCLASS. In all other wide
1415      character modes, set the 0xFF bit to indicate code units >= 255. */
1416
1417      case OP_NCLASS:
1418#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1419      if (utf)
1420        {
1421        re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1422        memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1423        }
1424#elif PCRE2_CODE_UNIT_WIDTH != 8
1425      SET_BIT(0xFF);                             /* For characters >= 255 */
1426#endif
1427      /* Fall through */
1428
1429      /* Enter here for a positive non-XCLASS. If we have fallen through from
1430      an XCLASS, classmap will already be set; just advance the code pointer.
1431      Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1432
1433      case OP_CLASS:
1434      if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1435        {
1436        classmap = (uint8_t *)(++tcode);
1437        tcode += 32 / sizeof(PCRE2_UCHAR);
1438        }
1439
1440      /* When wide characters are supported, classmap may be NULL. In UTF-8
1441      (sic) mode, the bits in a class bit map correspond to character values,
1442      not to byte values. However, the bit map we are constructing is for byte
1443      values. So we have to do a conversion for characters whose code point is
1444      greater than 127. In fact, there are only two possible starting bytes for
1445      characters in the range 128 - 255. */
1446
1447      if (classmap != NULL)
1448        {
1449#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1450        if (utf)
1451          {
1452          for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1453          for (c = 128; c < 256; c++)
1454            {
1455            if ((classmap[c/8] & (1 << (c&7))) != 0)
1456              {
1457              int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1458              re->start_bitmap[d/8] |= (1 << (d&7));  /* and then skip on to the */
1459              c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1460              }
1461            }
1462          }
1463        else
1464#endif
1465        /* In all modes except UTF-8, the two bit maps are compatible. */
1466
1467          {
1468          for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1469          }
1470        }
1471
1472      /* Act on what follows the class. For a zero minimum repeat, continue;
1473      otherwise stop processing. */
1474
1475      switch (*tcode)
1476        {
1477        case OP_CRSTAR:
1478        case OP_CRMINSTAR:
1479        case OP_CRQUERY:
1480        case OP_CRMINQUERY:
1481        case OP_CRPOSSTAR:
1482        case OP_CRPOSQUERY:
1483        tcode++;
1484        break;
1485
1486        case OP_CRRANGE:
1487        case OP_CRMINRANGE:
1488        case OP_CRPOSRANGE:
1489        if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1490          else try_next = FALSE;
1491        break;
1492
1493        default:
1494        try_next = FALSE;
1495        break;
1496        }
1497      break; /* End of class handling case */
1498      }      /* End of switch for opcodes */
1499    }        /* End of try_next loop */
1500
1501  code += GET(code, 1);   /* Advance to next branch */
1502  }
1503while (*code == OP_ALT);
1504
1505return yield;
1506}
1507
1508
1509
1510/*************************************************
1511*          Study a compiled expression           *
1512*************************************************/
1513
1514/* This function is handed a compiled expression that it must study to produce
1515information that will speed up the matching.
1516
1517Argument:  points to the compiled expression
1518Returns:   0 normally; non-zero should never normally occur
1519           1 unknown opcode in set_start_bits
1520           2 missing capturing bracket
1521           3 unknown opcode in find_minlength
1522*/
1523
1524int
1525PRIV(study)(pcre2_real_code *re)
1526{
1527int min;
1528int count = 0;
1529PCRE2_UCHAR *code;
1530BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1531
1532/* Find start of compiled code */
1533
1534code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1535  re->name_entry_size * re->name_count;
1536
1537/* For an anchored pattern, or an unanchored pattern that has a first code
1538unit, or a multiline pattern that matches only at "line start", there is no
1539point in seeking a list of starting code units. */
1540
1541if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
1542    (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1543  {
1544  int rc = set_start_bits(re, code, utf);
1545  if (rc == SSB_UNKNOWN) return 1;
1546  if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1547  }
1548
1549/* Find the minimum length of subject string. If it can match an empty string,
1550the minimum length is already known. */
1551
1552if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
1553  {
1554  switch(min = find_minlength(re, code, code, utf, NULL, &count))
1555    {
1556    case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1557    break;    /* Leave minlength unchanged (will be zero) */
1558
1559    case -2:
1560    return 2; /* missing capturing bracket */
1561
1562    case -3:
1563    return 3; /* unrecognized opcode */
1564
1565    default:
1566    if (min > UINT16_MAX) min = UINT16_MAX;
1567    re->minlength = min;
1568    break;
1569    }
1570  }
1571
1572return 0;
1573}
1574
1575/* End of pcre2_study.c */
1576