1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9     Original API code Copyright (c) 1997-2012 University of Cambridge
10         New API code Copyright (c) 2016 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains functions that scan a compiled pattern and change
42repeats into possessive repeats where possible. */
43
44
45#ifdef HAVE_CONFIG_H
46#include "config.h"
47#endif
48
49
50#include "pcre2_internal.h"
51
52
53/*************************************************
54*        Tables for auto-possessification        *
55*************************************************/
56
57/* This table is used to check whether auto-possessification is possible
58between adjacent character-type opcodes. The left-hand (repeated) opcode is
59used to select the row, and the right-hand opcode is use to select the column.
60A value of 1 means that auto-possessification is OK. For example, the second
61value in the first row means that \D+\d can be turned into \D++\d.
62
63The Unicode property types (\P and \p) have to be present to fill out the table
64because of what their opcode values are, but the table values should always be
65zero because property types are handled separately in the code. The last four
66columns apply to items that cannot be repeated, so there is no need to have
67rows for them. Note that OP_DIGIT etc. are generated only when PCRE_UCP is
68*not* set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
69
70#define APTROWS (LAST_AUTOTAB_LEFT_OP - FIRST_AUTOTAB_OP + 1)
71#define APTCOLS (LAST_AUTOTAB_RIGHT_OP - FIRST_AUTOTAB_OP + 1)
72
73static const uint8_t autoposstab[APTROWS][APTCOLS] = {
74/* \D \d \S \s \W \w  . .+ \C \P \p \R \H \h \V \v \X \Z \z  $ $M */
75  { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \D */
76  { 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \d */
77  { 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \S */
78  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \s */
79  { 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \W */
80  { 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1 },  /* \w */
81  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .  */
82  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* .+ */
83  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },  /* \C */
84  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \P */
85  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  /* \p */
86  { 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \R */
87  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 },  /* \H */
88  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \h */
89  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0 },  /* \V */
90  { 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0 },  /* \v */
91  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }   /* \X */
92};
93
94#ifdef SUPPORT_UNICODE
95/* This table is used to check whether auto-possessification is possible
96between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP). The
97left-hand (repeated) opcode is used to select the row, and the right-hand
98opcode is used to select the column. The values are as follows:
99
100  0   Always return FALSE (never auto-possessify)
101  1   Character groups are distinct (possessify if both are OP_PROP)
102  2   Check character categories in the same group (general or particular)
103  3   TRUE if the two opcodes are not the same (PROP vs NOTPROP)
104
105  4   Check left general category vs right particular category
106  5   Check right general category vs left particular category
107
108  6   Left alphanum vs right general category
109  7   Left space vs right general category
110  8   Left word vs right general category
111
112  9   Right alphanum vs left general category
113 10   Right space vs left general category
114 11   Right word vs left general category
115
116 12   Left alphanum vs right particular category
117 13   Left space vs right particular category
118 14   Left word vs right particular category
119
120 15   Right alphanum vs left particular category
121 16   Right space vs left particular category
122 17   Right word vs left particular category
123*/
124
125static const uint8_t propposstab[PT_TABSIZE][PT_TABSIZE] = {
126/* ANY LAMP GC  PC  SC ALNUM SPACE PXSPACE WORD CLIST UCNC */
127  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_ANY */
128  { 0,  3,  0,  0,  0,    3,    1,      1,   0,    0,   0 },  /* PT_LAMP */
129  { 0,  0,  2,  4,  0,    9,   10,     10,  11,    0,   0 },  /* PT_GC */
130  { 0,  0,  5,  2,  0,   15,   16,     16,  17,    0,   0 },  /* PT_PC */
131  { 0,  0,  0,  0,  2,    0,    0,      0,   0,    0,   0 },  /* PT_SC */
132  { 0,  3,  6, 12,  0,    3,    1,      1,   0,    0,   0 },  /* PT_ALNUM */
133  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_SPACE */
134  { 0,  1,  7, 13,  0,    1,    3,      3,   1,    0,   0 },  /* PT_PXSPACE */
135  { 0,  0,  8, 14,  0,    0,    1,      1,   3,    0,   0 },  /* PT_WORD */
136  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   0 },  /* PT_CLIST */
137  { 0,  0,  0,  0,  0,    0,    0,      0,   0,    0,   3 }   /* PT_UCNC */
138};
139
140/* This table is used to check whether auto-possessification is possible
141between adjacent Unicode property opcodes (OP_PROP and OP_NOTPROP) when one
142specifies a general category and the other specifies a particular category. The
143row is selected by the general category and the column by the particular
144category. The value is 1 if the particular category is not part of the general
145category. */
146
147static const uint8_t catposstab[7][30] = {
148/* Cc Cf Cn Co Cs Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc Pd Pe Pf Pi Po Ps Sc Sk Sm So Zl Zp Zs */
149  { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* C */
150  { 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* L */
151  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* M */
152  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },  /* N */
153  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },  /* P */
154  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1 },  /* S */
155  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }   /* Z */
156};
157
158/* This table is used when checking ALNUM, (PX)SPACE, SPACE, and WORD against
159a general or particular category. The properties in each row are those
160that apply to the character set in question. Duplication means that a little
161unnecessary work is done when checking, but this keeps things much simpler
162because they can all use the same code. For more details see the comment where
163this table is used.
164
165Note: SPACE and PXSPACE used to be different because Perl excluded VT from
166"space", but from Perl 5.18 it's included, so both categories are treated the
167same here. */
168
169static const uint8_t posspropstab[3][4] = {
170  { ucp_L, ucp_N, ucp_N, ucp_Nl },  /* ALNUM, 3rd and 4th values redundant */
171  { ucp_Z, ucp_Z, ucp_C, ucp_Cc },  /* SPACE and PXSPACE, 2nd value redundant */
172  { ucp_L, ucp_N, ucp_P, ucp_Po }   /* WORD */
173};
174#endif  /* SUPPORT_UNICODE */
175
176
177
178#ifdef SUPPORT_UNICODE
179/*************************************************
180*        Check a character and a property        *
181*************************************************/
182
183/* This function is called by compare_opcodes() when a property item is
184adjacent to a fixed character.
185
186Arguments:
187  c            the character
188  ptype        the property type
189  pdata        the data for the type
190  negated      TRUE if it's a negated property (\P or \p{^)
191
192Returns:       TRUE if auto-possessifying is OK
193*/
194
195static BOOL
196check_char_prop(uint32_t c, unsigned int ptype, unsigned int pdata,
197  BOOL negated)
198{
199const uint32_t *p;
200const ucd_record *prop = GET_UCD(c);
201
202switch(ptype)
203  {
204  case PT_LAMP:
205  return (prop->chartype == ucp_Lu ||
206          prop->chartype == ucp_Ll ||
207          prop->chartype == ucp_Lt) == negated;
208
209  case PT_GC:
210  return (pdata == PRIV(ucp_gentype)[prop->chartype]) == negated;
211
212  case PT_PC:
213  return (pdata == prop->chartype) == negated;
214
215  case PT_SC:
216  return (pdata == prop->script) == negated;
217
218  /* These are specials */
219
220  case PT_ALNUM:
221  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
222          PRIV(ucp_gentype)[prop->chartype] == ucp_N) == negated;
223
224  /* Perl space used to exclude VT, but from Perl 5.18 it is included, which
225  means that Perl space and POSIX space are now identical. PCRE was changed
226  at release 8.34. */
227
228  case PT_SPACE:    /* Perl space */
229  case PT_PXSPACE:  /* POSIX space */
230  switch(c)
231    {
232    HSPACE_CASES:
233    VSPACE_CASES:
234    return negated;
235
236    default:
237    return (PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == negated;
238    }
239  break;  /* Control never reaches here */
240
241  case PT_WORD:
242  return (PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
243          PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
244          c == CHAR_UNDERSCORE) == negated;
245
246  case PT_CLIST:
247  p = PRIV(ucd_caseless_sets) + prop->caseset;
248  for (;;)
249    {
250    if (c < *p) return !negated;
251    if (c == *p++) return negated;
252    }
253  break;  /* Control never reaches here */
254  }
255
256return FALSE;
257}
258#endif  /* SUPPORT_UNICODE */
259
260
261
262/*************************************************
263*        Base opcode of repeated opcodes         *
264*************************************************/
265
266/* Returns the base opcode for repeated single character type opcodes. If the
267opcode is not a repeated character type, it returns with the original value.
268
269Arguments:  c opcode
270Returns:    base opcode for the type
271*/
272
273static PCRE2_UCHAR
274get_repeat_base(PCRE2_UCHAR c)
275{
276return (c > OP_TYPEPOSUPTO)? c :
277       (c >= OP_TYPESTAR)?   OP_TYPESTAR :
278       (c >= OP_NOTSTARI)?   OP_NOTSTARI :
279       (c >= OP_NOTSTAR)?    OP_NOTSTAR :
280       (c >= OP_STARI)?      OP_STARI :
281                             OP_STAR;
282}
283
284
285/*************************************************
286*        Fill the character property list        *
287*************************************************/
288
289/* Checks whether the code points to an opcode that can take part in auto-
290possessification, and if so, fills a list with its properties.
291
292Arguments:
293  code        points to start of expression
294  utf         TRUE if in UTF mode
295  fcc         points to the case-flipping table
296  list        points to output list
297              list[0] will be filled with the opcode
298              list[1] will be non-zero if this opcode
299                can match an empty character string
300              list[2..7] depends on the opcode
301
302Returns:      points to the start of the next opcode if *code is accepted
303              NULL if *code is not accepted
304*/
305
306static PCRE2_SPTR
307get_chr_property_list(PCRE2_SPTR code, BOOL utf, const uint8_t *fcc,
308  uint32_t *list)
309{
310PCRE2_UCHAR c = *code;
311PCRE2_UCHAR base;
312PCRE2_SPTR end;
313uint32_t chr;
314
315#ifdef SUPPORT_UNICODE
316uint32_t *clist_dest;
317const uint32_t *clist_src;
318#else
319(void)utf;    /* Suppress "unused parameter" compiler warning */
320#endif
321
322list[0] = c;
323list[1] = FALSE;
324code++;
325
326if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
327  {
328  base = get_repeat_base(c);
329  c -= (base - OP_STAR);
330
331  if (c == OP_UPTO || c == OP_MINUPTO || c == OP_EXACT || c == OP_POSUPTO)
332    code += IMM2_SIZE;
333
334  list[1] = (c != OP_PLUS && c != OP_MINPLUS && c != OP_EXACT &&
335             c != OP_POSPLUS);
336
337  switch(base)
338    {
339    case OP_STAR:
340    list[0] = OP_CHAR;
341    break;
342
343    case OP_STARI:
344    list[0] = OP_CHARI;
345    break;
346
347    case OP_NOTSTAR:
348    list[0] = OP_NOT;
349    break;
350
351    case OP_NOTSTARI:
352    list[0] = OP_NOTI;
353    break;
354
355    case OP_TYPESTAR:
356    list[0] = *code;
357    code++;
358    break;
359    }
360  c = list[0];
361  }
362
363switch(c)
364  {
365  case OP_NOT_DIGIT:
366  case OP_DIGIT:
367  case OP_NOT_WHITESPACE:
368  case OP_WHITESPACE:
369  case OP_NOT_WORDCHAR:
370  case OP_WORDCHAR:
371  case OP_ANY:
372  case OP_ALLANY:
373  case OP_ANYNL:
374  case OP_NOT_HSPACE:
375  case OP_HSPACE:
376  case OP_NOT_VSPACE:
377  case OP_VSPACE:
378  case OP_EXTUNI:
379  case OP_EODN:
380  case OP_EOD:
381  case OP_DOLL:
382  case OP_DOLLM:
383  return code;
384
385  case OP_CHAR:
386  case OP_NOT:
387  GETCHARINCTEST(chr, code);
388  list[2] = chr;
389  list[3] = NOTACHAR;
390  return code;
391
392  case OP_CHARI:
393  case OP_NOTI:
394  list[0] = (c == OP_CHARI) ? OP_CHAR : OP_NOT;
395  GETCHARINCTEST(chr, code);
396  list[2] = chr;
397
398#ifdef SUPPORT_UNICODE
399  if (chr < 128 || (chr < 256 && !utf))
400    list[3] = fcc[chr];
401  else
402    list[3] = UCD_OTHERCASE(chr);
403#elif defined SUPPORT_WIDE_CHARS
404  list[3] = (chr < 256) ? fcc[chr] : chr;
405#else
406  list[3] = fcc[chr];
407#endif
408
409  /* The othercase might be the same value. */
410
411  if (chr == list[3])
412    list[3] = NOTACHAR;
413  else
414    list[4] = NOTACHAR;
415  return code;
416
417#ifdef SUPPORT_UNICODE
418  case OP_PROP:
419  case OP_NOTPROP:
420  if (code[0] != PT_CLIST)
421    {
422    list[2] = code[0];
423    list[3] = code[1];
424    return code + 2;
425    }
426
427  /* Convert only if we have enough space. */
428
429  clist_src = PRIV(ucd_caseless_sets) + code[1];
430  clist_dest = list + 2;
431  code += 2;
432
433  do {
434     if (clist_dest >= list + 8)
435       {
436       /* Early return if there is not enough space. This should never
437       happen, since all clists are shorter than 5 character now. */
438       list[2] = code[0];
439       list[3] = code[1];
440       return code;
441       }
442     *clist_dest++ = *clist_src;
443     }
444  while(*clist_src++ != NOTACHAR);
445
446  /* All characters are stored. The terminating NOTACHAR is copied from the
447  clist itself. */
448
449  list[0] = (c == OP_PROP) ? OP_CHAR : OP_NOT;
450  return code;
451#endif
452
453  case OP_NCLASS:
454  case OP_CLASS:
455#ifdef SUPPORT_WIDE_CHARS
456  case OP_XCLASS:
457  if (c == OP_XCLASS)
458    end = code + GET(code, 0) - 1;
459  else
460#endif
461    end = code + 32 / sizeof(PCRE2_UCHAR);
462
463  switch(*end)
464    {
465    case OP_CRSTAR:
466    case OP_CRMINSTAR:
467    case OP_CRQUERY:
468    case OP_CRMINQUERY:
469    case OP_CRPOSSTAR:
470    case OP_CRPOSQUERY:
471    list[1] = TRUE;
472    end++;
473    break;
474
475    case OP_CRPLUS:
476    case OP_CRMINPLUS:
477    case OP_CRPOSPLUS:
478    end++;
479    break;
480
481    case OP_CRRANGE:
482    case OP_CRMINRANGE:
483    case OP_CRPOSRANGE:
484    list[1] = (GET2(end, 1) == 0);
485    end += 1 + 2 * IMM2_SIZE;
486    break;
487    }
488  list[2] = (uint32_t)(end - code);
489  return end;
490  }
491return NULL;    /* Opcode not accepted */
492}
493
494
495
496/*************************************************
497*    Scan further character sets for match       *
498*************************************************/
499
500/* Checks whether the base and the current opcode have a common character, in
501which case the base cannot be possessified.
502
503Arguments:
504  code        points to the byte code
505  utf         TRUE in UTF mode
506  cb          compile data block
507  base_list   the data list of the base opcode
508  base_end    the end of the data list
509  rec_limit   points to recursion depth counter
510
511Returns:      TRUE if the auto-possessification is possible
512*/
513
514static BOOL
515compare_opcodes(PCRE2_SPTR code, BOOL utf, const compile_block *cb,
516  const uint32_t *base_list, PCRE2_SPTR base_end, int *rec_limit)
517{
518PCRE2_UCHAR c;
519uint32_t list[8];
520const uint32_t *chr_ptr;
521const uint32_t *ochr_ptr;
522const uint32_t *list_ptr;
523PCRE2_SPTR next_code;
524#ifdef SUPPORT_WIDE_CHARS
525PCRE2_SPTR xclass_flags;
526#endif
527const uint8_t *class_bitset;
528const uint8_t *set1, *set2, *set_end;
529uint32_t chr;
530BOOL accepted, invert_bits;
531BOOL entered_a_group = FALSE;
532
533if (--(*rec_limit) <= 0) return FALSE;  /* Recursion has gone too deep */
534
535/* Note: the base_list[1] contains whether the current opcode has a greedy
536(represented by a non-zero value) quantifier. This is a different from
537other character type lists, which store here that the character iterator
538matches to an empty string (also represented by a non-zero value). */
539
540for(;;)
541  {
542  /* All operations move the code pointer forward.
543  Therefore infinite recursions are not possible. */
544
545  c = *code;
546
547  /* Skip over callouts */
548
549  if (c == OP_CALLOUT)
550    {
551    code += PRIV(OP_lengths)[c];
552    continue;
553    }
554
555  if (c == OP_CALLOUT_STR)
556    {
557    code += GET(code, 1 + 2*LINK_SIZE);
558    continue;
559    }
560
561  if (c == OP_ALT)
562    {
563    do code += GET(code, 1); while (*code == OP_ALT);
564    c = *code;
565    }
566
567  switch(c)
568    {
569    case OP_END:
570    case OP_KETRPOS:
571    /* TRUE only in greedy case. The non-greedy case could be replaced by
572    an OP_EXACT, but it is probably not worth it. (And note that OP_EXACT
573    uses more memory, which we cannot get at this stage.) */
574
575    return base_list[1] != 0;
576
577    case OP_KET:
578    /* If the bracket is capturing, and referenced by an OP_RECURSE, or
579    it is an atomic sub-pattern (assert, once, etc.) the non-greedy case
580    cannot be converted to a possessive form. */
581
582    if (base_list[1] == 0) return FALSE;
583
584    switch(*(code - GET(code, 1)))
585      {
586      case OP_ASSERT:
587      case OP_ASSERT_NOT:
588      case OP_ASSERTBACK:
589      case OP_ASSERTBACK_NOT:
590      case OP_ONCE:
591      case OP_ONCE_NC:
592      /* Atomic sub-patterns and assertions can always auto-possessify their
593      last iterator. However, if the group was entered as a result of checking
594      a previous iterator, this is not possible. */
595
596      return !entered_a_group;
597      }
598
599    code += PRIV(OP_lengths)[c];
600    continue;
601
602    case OP_ONCE:
603    case OP_ONCE_NC:
604    case OP_BRA:
605    case OP_CBRA:
606    next_code = code + GET(code, 1);
607    code += PRIV(OP_lengths)[c];
608
609    while (*next_code == OP_ALT)
610      {
611      if (!compare_opcodes(code, utf, cb, base_list, base_end, rec_limit))
612        return FALSE;
613      code = next_code + 1 + LINK_SIZE;
614      next_code += GET(next_code, 1);
615      }
616
617    entered_a_group = TRUE;
618    continue;
619
620    case OP_BRAZERO:
621    case OP_BRAMINZERO:
622
623    next_code = code + 1;
624    if (*next_code != OP_BRA && *next_code != OP_CBRA
625        && *next_code != OP_ONCE && *next_code != OP_ONCE_NC) return FALSE;
626
627    do next_code += GET(next_code, 1); while (*next_code == OP_ALT);
628
629    /* The bracket content will be checked by the OP_BRA/OP_CBRA case above. */
630
631    next_code += 1 + LINK_SIZE;
632    if (!compare_opcodes(next_code, utf, cb, base_list, base_end, rec_limit))
633      return FALSE;
634
635    code += PRIV(OP_lengths)[c];
636    continue;
637
638    default:
639    break;
640    }
641
642  /* Check for a supported opcode, and load its properties. */
643
644  code = get_chr_property_list(code, utf, cb->fcc, list);
645  if (code == NULL) return FALSE;    /* Unsupported */
646
647  /* If either opcode is a small character list, set pointers for comparing
648  characters from that list with another list, or with a property. */
649
650  if (base_list[0] == OP_CHAR)
651    {
652    chr_ptr = base_list + 2;
653    list_ptr = list;
654    }
655  else if (list[0] == OP_CHAR)
656    {
657    chr_ptr = list + 2;
658    list_ptr = base_list;
659    }
660
661  /* Character bitsets can also be compared to certain opcodes. */
662
663  else if (base_list[0] == OP_CLASS || list[0] == OP_CLASS
664#if PCRE2_CODE_UNIT_WIDTH == 8
665      /* In 8 bit, non-UTF mode, OP_CLASS and OP_NCLASS are the same. */
666      || (!utf && (base_list[0] == OP_NCLASS || list[0] == OP_NCLASS))
667#endif
668      )
669    {
670#if PCRE2_CODE_UNIT_WIDTH == 8
671    if (base_list[0] == OP_CLASS || (!utf && base_list[0] == OP_NCLASS))
672#else
673    if (base_list[0] == OP_CLASS)
674#endif
675      {
676      set1 = (uint8_t *)(base_end - base_list[2]);
677      list_ptr = list;
678      }
679    else
680      {
681      set1 = (uint8_t *)(code - list[2]);
682      list_ptr = base_list;
683      }
684
685    invert_bits = FALSE;
686    switch(list_ptr[0])
687      {
688      case OP_CLASS:
689      case OP_NCLASS:
690      set2 = (uint8_t *)
691        ((list_ptr == list ? code : base_end) - list_ptr[2]);
692      break;
693
694#ifdef SUPPORT_WIDE_CHARS
695      case OP_XCLASS:
696      xclass_flags = (list_ptr == list ? code : base_end) - list_ptr[2] + LINK_SIZE;
697      if ((*xclass_flags & XCL_HASPROP) != 0) return FALSE;
698      if ((*xclass_flags & XCL_MAP) == 0)
699        {
700        /* No bits are set for characters < 256. */
701        if (list[1] == 0) return TRUE;
702        /* Might be an empty repeat. */
703        continue;
704        }
705      set2 = (uint8_t *)(xclass_flags + 1);
706      break;
707#endif
708
709      case OP_NOT_DIGIT:
710      invert_bits = TRUE;
711      /* Fall through */
712      case OP_DIGIT:
713      set2 = (uint8_t *)(cb->cbits + cbit_digit);
714      break;
715
716      case OP_NOT_WHITESPACE:
717      invert_bits = TRUE;
718      /* Fall through */
719      case OP_WHITESPACE:
720      set2 = (uint8_t *)(cb->cbits + cbit_space);
721      break;
722
723      case OP_NOT_WORDCHAR:
724      invert_bits = TRUE;
725      /* Fall through */
726      case OP_WORDCHAR:
727      set2 = (uint8_t *)(cb->cbits + cbit_word);
728      break;
729
730      default:
731      return FALSE;
732      }
733
734    /* Because the bit sets are unaligned bytes, we need to perform byte
735    comparison here. */
736
737    set_end = set1 + 32;
738    if (invert_bits)
739      {
740      do
741        {
742        if ((*set1++ & ~(*set2++)) != 0) return FALSE;
743        }
744      while (set1 < set_end);
745      }
746    else
747      {
748      do
749        {
750        if ((*set1++ & *set2++) != 0) return FALSE;
751        }
752      while (set1 < set_end);
753      }
754
755    if (list[1] == 0) return TRUE;
756    /* Might be an empty repeat. */
757    continue;
758    }
759
760  /* Some property combinations also acceptable. Unicode property opcodes are
761  processed specially; the rest can be handled with a lookup table. */
762
763  else
764    {
765    uint32_t leftop, rightop;
766
767    leftop = base_list[0];
768    rightop = list[0];
769
770#ifdef SUPPORT_UNICODE
771    accepted = FALSE; /* Always set in non-unicode case. */
772    if (leftop == OP_PROP || leftop == OP_NOTPROP)
773      {
774      if (rightop == OP_EOD)
775        accepted = TRUE;
776      else if (rightop == OP_PROP || rightop == OP_NOTPROP)
777        {
778        int n;
779        const uint8_t *p;
780        BOOL same = leftop == rightop;
781        BOOL lisprop = leftop == OP_PROP;
782        BOOL risprop = rightop == OP_PROP;
783        BOOL bothprop = lisprop && risprop;
784
785        /* There's a table that specifies how each combination is to be
786        processed:
787          0   Always return FALSE (never auto-possessify)
788          1   Character groups are distinct (possessify if both are OP_PROP)
789          2   Check character categories in the same group (general or particular)
790          3   Return TRUE if the two opcodes are not the same
791          ... see comments below
792        */
793
794        n = propposstab[base_list[2]][list[2]];
795        switch(n)
796          {
797          case 0: break;
798          case 1: accepted = bothprop; break;
799          case 2: accepted = (base_list[3] == list[3]) != same; break;
800          case 3: accepted = !same; break;
801
802          case 4:  /* Left general category, right particular category */
803          accepted = risprop && catposstab[base_list[3]][list[3]] == same;
804          break;
805
806          case 5:  /* Right general category, left particular category */
807          accepted = lisprop && catposstab[list[3]][base_list[3]] == same;
808          break;
809
810          /* This code is logically tricky. Think hard before fiddling with it.
811          The posspropstab table has four entries per row. Each row relates to
812          one of PCRE's special properties such as ALNUM or SPACE or WORD.
813          Only WORD actually needs all four entries, but using repeats for the
814          others means they can all use the same code below.
815
816          The first two entries in each row are Unicode general categories, and
817          apply always, because all the characters they include are part of the
818          PCRE character set. The third and fourth entries are a general and a
819          particular category, respectively, that include one or more relevant
820          characters. One or the other is used, depending on whether the check
821          is for a general or a particular category. However, in both cases the
822          category contains more characters than the specials that are defined
823          for the property being tested against. Therefore, it cannot be used
824          in a NOTPROP case.
825
826          Example: the row for WORD contains ucp_L, ucp_N, ucp_P, ucp_Po.
827          Underscore is covered by ucp_P or ucp_Po. */
828
829          case 6:  /* Left alphanum vs right general category */
830          case 7:  /* Left space vs right general category */
831          case 8:  /* Left word vs right general category */
832          p = posspropstab[n-6];
833          accepted = risprop && lisprop ==
834            (list[3] != p[0] &&
835             list[3] != p[1] &&
836            (list[3] != p[2] || !lisprop));
837          break;
838
839          case 9:   /* Right alphanum vs left general category */
840          case 10:  /* Right space vs left general category */
841          case 11:  /* Right word vs left general category */
842          p = posspropstab[n-9];
843          accepted = lisprop && risprop ==
844            (base_list[3] != p[0] &&
845             base_list[3] != p[1] &&
846            (base_list[3] != p[2] || !risprop));
847          break;
848
849          case 12:  /* Left alphanum vs right particular category */
850          case 13:  /* Left space vs right particular category */
851          case 14:  /* Left word vs right particular category */
852          p = posspropstab[n-12];
853          accepted = risprop && lisprop ==
854            (catposstab[p[0]][list[3]] &&
855             catposstab[p[1]][list[3]] &&
856            (list[3] != p[3] || !lisprop));
857          break;
858
859          case 15:  /* Right alphanum vs left particular category */
860          case 16:  /* Right space vs left particular category */
861          case 17:  /* Right word vs left particular category */
862          p = posspropstab[n-15];
863          accepted = lisprop && risprop ==
864            (catposstab[p[0]][base_list[3]] &&
865             catposstab[p[1]][base_list[3]] &&
866            (base_list[3] != p[3] || !risprop));
867          break;
868          }
869        }
870      }
871
872    else
873#endif  /* SUPPORT_UNICODE */
874
875    accepted = leftop >= FIRST_AUTOTAB_OP && leftop <= LAST_AUTOTAB_LEFT_OP &&
876           rightop >= FIRST_AUTOTAB_OP && rightop <= LAST_AUTOTAB_RIGHT_OP &&
877           autoposstab[leftop - FIRST_AUTOTAB_OP][rightop - FIRST_AUTOTAB_OP];
878
879    if (!accepted) return FALSE;
880
881    if (list[1] == 0) return TRUE;
882    /* Might be an empty repeat. */
883    continue;
884    }
885
886  /* Control reaches here only if one of the items is a small character list.
887  All characters are checked against the other side. */
888
889  do
890    {
891    chr = *chr_ptr;
892
893    switch(list_ptr[0])
894      {
895      case OP_CHAR:
896      ochr_ptr = list_ptr + 2;
897      do
898        {
899        if (chr == *ochr_ptr) return FALSE;
900        ochr_ptr++;
901        }
902      while(*ochr_ptr != NOTACHAR);
903      break;
904
905      case OP_NOT:
906      ochr_ptr = list_ptr + 2;
907      do
908        {
909        if (chr == *ochr_ptr)
910          break;
911        ochr_ptr++;
912        }
913      while(*ochr_ptr != NOTACHAR);
914      if (*ochr_ptr == NOTACHAR) return FALSE;   /* Not found */
915      break;
916
917      /* Note that OP_DIGIT etc. are generated only when PCRE2_UCP is *not*
918      set. When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
919
920      case OP_DIGIT:
921      if (chr < 256 && (cb->ctypes[chr] & ctype_digit) != 0) return FALSE;
922      break;
923
924      case OP_NOT_DIGIT:
925      if (chr > 255 || (cb->ctypes[chr] & ctype_digit) == 0) return FALSE;
926      break;
927
928      case OP_WHITESPACE:
929      if (chr < 256 && (cb->ctypes[chr] & ctype_space) != 0) return FALSE;
930      break;
931
932      case OP_NOT_WHITESPACE:
933      if (chr > 255 || (cb->ctypes[chr] & ctype_space) == 0) return FALSE;
934      break;
935
936      case OP_WORDCHAR:
937      if (chr < 255 && (cb->ctypes[chr] & ctype_word) != 0) return FALSE;
938      break;
939
940      case OP_NOT_WORDCHAR:
941      if (chr > 255 || (cb->ctypes[chr] & ctype_word) == 0) return FALSE;
942      break;
943
944      case OP_HSPACE:
945      switch(chr)
946        {
947        HSPACE_CASES: return FALSE;
948        default: break;
949        }
950      break;
951
952      case OP_NOT_HSPACE:
953      switch(chr)
954        {
955        HSPACE_CASES: break;
956        default: return FALSE;
957        }
958      break;
959
960      case OP_ANYNL:
961      case OP_VSPACE:
962      switch(chr)
963        {
964        VSPACE_CASES: return FALSE;
965        default: break;
966        }
967      break;
968
969      case OP_NOT_VSPACE:
970      switch(chr)
971        {
972        VSPACE_CASES: break;
973        default: return FALSE;
974        }
975      break;
976
977      case OP_DOLL:
978      case OP_EODN:
979      switch (chr)
980        {
981        case CHAR_CR:
982        case CHAR_LF:
983        case CHAR_VT:
984        case CHAR_FF:
985        case CHAR_NEL:
986#ifndef EBCDIC
987        case 0x2028:
988        case 0x2029:
989#endif  /* Not EBCDIC */
990        return FALSE;
991        }
992      break;
993
994      case OP_EOD:    /* Can always possessify before \z */
995      break;
996
997#ifdef SUPPORT_UNICODE
998      case OP_PROP:
999      case OP_NOTPROP:
1000      if (!check_char_prop(chr, list_ptr[2], list_ptr[3],
1001            list_ptr[0] == OP_NOTPROP))
1002        return FALSE;
1003      break;
1004#endif
1005
1006      case OP_NCLASS:
1007      if (chr > 255) return FALSE;
1008      /* Fall through */
1009
1010      case OP_CLASS:
1011      if (chr > 255) break;
1012      class_bitset = (uint8_t *)
1013        ((list_ptr == list ? code : base_end) - list_ptr[2]);
1014      if ((class_bitset[chr >> 3] & (1 << (chr & 7))) != 0) return FALSE;
1015      break;
1016
1017#ifdef SUPPORT_WIDE_CHARS
1018      case OP_XCLASS:
1019      if (PRIV(xclass)(chr, (list_ptr == list ? code : base_end) -
1020          list_ptr[2] + LINK_SIZE, utf)) return FALSE;
1021      break;
1022#endif
1023
1024      default:
1025      return FALSE;
1026      }
1027
1028    chr_ptr++;
1029    }
1030  while(*chr_ptr != NOTACHAR);
1031
1032  /* At least one character must be matched from this opcode. */
1033
1034  if (list[1] == 0) return TRUE;
1035  }
1036
1037/* Control never reaches here. There used to be a fail-save return FALSE; here,
1038but some compilers complain about an unreachable statement. */
1039}
1040
1041
1042
1043/*************************************************
1044*    Scan compiled regex for auto-possession     *
1045*************************************************/
1046
1047/* Replaces single character iterations with their possessive alternatives
1048if appropriate. This function modifies the compiled opcode! Hitting a
1049non-existant opcode may indicate a bug in PCRE2, but it can also be caused if a
1050bad UTF string was compiled with PCRE2_NO_UTF_CHECK.
1051
1052Arguments:
1053  code        points to start of the byte code
1054  utf         TRUE in UTF mode
1055  cb          compile data block
1056
1057Returns:      0 for success
1058              -1 if a non-existant opcode is encountered
1059*/
1060
1061int
1062PRIV(auto_possessify)(PCRE2_UCHAR *code, BOOL utf, const compile_block *cb)
1063{
1064register PCRE2_UCHAR c;
1065PCRE2_SPTR end;
1066PCRE2_UCHAR *repeat_opcode;
1067uint32_t list[8];
1068int rec_limit;
1069
1070for (;;)
1071  {
1072  c = *code;
1073
1074  if (c > OP_TABLE_LENGTH) return -1;   /* Something gone wrong */
1075
1076  if (c >= OP_STAR && c <= OP_TYPEPOSUPTO)
1077    {
1078    c -= get_repeat_base(c) - OP_STAR;
1079    end = (c <= OP_MINUPTO) ?
1080      get_chr_property_list(code, utf, cb->fcc, list) : NULL;
1081    list[1] = c == OP_STAR || c == OP_PLUS || c == OP_QUERY || c == OP_UPTO;
1082
1083    rec_limit = 1000;
1084    if (end != NULL && compare_opcodes(end, utf, cb, list, end, &rec_limit))
1085      {
1086      switch(c)
1087        {
1088        case OP_STAR:
1089        *code += OP_POSSTAR - OP_STAR;
1090        break;
1091
1092        case OP_MINSTAR:
1093        *code += OP_POSSTAR - OP_MINSTAR;
1094        break;
1095
1096        case OP_PLUS:
1097        *code += OP_POSPLUS - OP_PLUS;
1098        break;
1099
1100        case OP_MINPLUS:
1101        *code += OP_POSPLUS - OP_MINPLUS;
1102        break;
1103
1104        case OP_QUERY:
1105        *code += OP_POSQUERY - OP_QUERY;
1106        break;
1107
1108        case OP_MINQUERY:
1109        *code += OP_POSQUERY - OP_MINQUERY;
1110        break;
1111
1112        case OP_UPTO:
1113        *code += OP_POSUPTO - OP_UPTO;
1114        break;
1115
1116        case OP_MINUPTO:
1117        *code += OP_POSUPTO - OP_MINUPTO;
1118        break;
1119        }
1120      }
1121    c = *code;
1122    }
1123  else if (c == OP_CLASS || c == OP_NCLASS || c == OP_XCLASS)
1124    {
1125#ifdef SUPPORT_WIDE_CHARS
1126    if (c == OP_XCLASS)
1127      repeat_opcode = code + GET(code, 1);
1128    else
1129#endif
1130      repeat_opcode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
1131
1132    c = *repeat_opcode;
1133    if (c >= OP_CRSTAR && c <= OP_CRMINRANGE)
1134      {
1135      /* end must not be NULL. */
1136      end = get_chr_property_list(code, utf, cb->fcc, list);
1137
1138      list[1] = (c & 1) == 0;
1139
1140      rec_limit = 1000;
1141      if (compare_opcodes(end, utf, cb, list, end, &rec_limit))
1142        {
1143        switch (c)
1144          {
1145          case OP_CRSTAR:
1146          case OP_CRMINSTAR:
1147          *repeat_opcode = OP_CRPOSSTAR;
1148          break;
1149
1150          case OP_CRPLUS:
1151          case OP_CRMINPLUS:
1152          *repeat_opcode = OP_CRPOSPLUS;
1153          break;
1154
1155          case OP_CRQUERY:
1156          case OP_CRMINQUERY:
1157          *repeat_opcode = OP_CRPOSQUERY;
1158          break;
1159
1160          case OP_CRRANGE:
1161          case OP_CRMINRANGE:
1162          *repeat_opcode = OP_CRPOSRANGE;
1163          break;
1164          }
1165        }
1166      }
1167    c = *code;
1168    }
1169
1170  switch(c)
1171    {
1172    case OP_END:
1173    return 0;
1174
1175    case OP_TYPESTAR:
1176    case OP_TYPEMINSTAR:
1177    case OP_TYPEPLUS:
1178    case OP_TYPEMINPLUS:
1179    case OP_TYPEQUERY:
1180    case OP_TYPEMINQUERY:
1181    case OP_TYPEPOSSTAR:
1182    case OP_TYPEPOSPLUS:
1183    case OP_TYPEPOSQUERY:
1184    if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1185    break;
1186
1187    case OP_TYPEUPTO:
1188    case OP_TYPEMINUPTO:
1189    case OP_TYPEEXACT:
1190    case OP_TYPEPOSUPTO:
1191    if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
1192      code += 2;
1193    break;
1194
1195    case OP_CALLOUT_STR:
1196    code += GET(code, 1 + 2*LINK_SIZE);
1197    break;
1198
1199#ifdef SUPPORT_WIDE_CHARS
1200    case OP_XCLASS:
1201    code += GET(code, 1);
1202    break;
1203#endif
1204
1205    case OP_MARK:
1206    case OP_PRUNE_ARG:
1207    case OP_SKIP_ARG:
1208    case OP_THEN_ARG:
1209    code += code[1];
1210    break;
1211    }
1212
1213  /* Add in the fixed length from the table */
1214
1215  code += PRIV(OP_lengths)[c];
1216
1217  /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may be
1218  followed by a multi-byte character. The length in the table is a minimum, so
1219  we have to arrange to skip the extra code units. */
1220
1221#ifdef MAYBE_UTF_MULTI
1222  if (utf) switch(c)
1223    {
1224    case OP_CHAR:
1225    case OP_CHARI:
1226    case OP_NOT:
1227    case OP_NOTI:
1228    case OP_STAR:
1229    case OP_MINSTAR:
1230    case OP_PLUS:
1231    case OP_MINPLUS:
1232    case OP_QUERY:
1233    case OP_MINQUERY:
1234    case OP_UPTO:
1235    case OP_MINUPTO:
1236    case OP_EXACT:
1237    case OP_POSSTAR:
1238    case OP_POSPLUS:
1239    case OP_POSQUERY:
1240    case OP_POSUPTO:
1241    case OP_STARI:
1242    case OP_MINSTARI:
1243    case OP_PLUSI:
1244    case OP_MINPLUSI:
1245    case OP_QUERYI:
1246    case OP_MINQUERYI:
1247    case OP_UPTOI:
1248    case OP_MINUPTOI:
1249    case OP_EXACTI:
1250    case OP_POSSTARI:
1251    case OP_POSPLUSI:
1252    case OP_POSQUERYI:
1253    case OP_POSUPTOI:
1254    case OP_NOTSTAR:
1255    case OP_NOTMINSTAR:
1256    case OP_NOTPLUS:
1257    case OP_NOTMINPLUS:
1258    case OP_NOTQUERY:
1259    case OP_NOTMINQUERY:
1260    case OP_NOTUPTO:
1261    case OP_NOTMINUPTO:
1262    case OP_NOTEXACT:
1263    case OP_NOTPOSSTAR:
1264    case OP_NOTPOSPLUS:
1265    case OP_NOTPOSQUERY:
1266    case OP_NOTPOSUPTO:
1267    case OP_NOTSTARI:
1268    case OP_NOTMINSTARI:
1269    case OP_NOTPLUSI:
1270    case OP_NOTMINPLUSI:
1271    case OP_NOTQUERYI:
1272    case OP_NOTMINQUERYI:
1273    case OP_NOTUPTOI:
1274    case OP_NOTMINUPTOI:
1275    case OP_NOTEXACTI:
1276    case OP_NOTPOSSTARI:
1277    case OP_NOTPOSPLUSI:
1278    case OP_NOTPOSQUERYI:
1279    case OP_NOTPOSUPTOI:
1280    if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
1281    break;
1282    }
1283#else
1284  (void)(utf);  /* Keep compiler happy by referencing function argument */
1285#endif  /* SUPPORT_WIDE_CHARS */
1286  }
1287}
1288
1289/* End of pcre2_auto_possess.c */
1290