pcre2_intmodedep.h revision 8b979b2abae173bb836d8e85a842cfd00447d4be
1/*************************************************
2*      Perl-Compatible Regular Expressions       *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8                       Written by Philip Hazel
9     Original API code Copyright (c) 1997-2012 University of Cambridge
10         New API code Copyright (c) 2016 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16    * Redistributions of source code must retain the above copyright notice,
17      this list of conditions and the following disclaimer.
18
19    * Redistributions in binary form must reproduce the above copyright
20      notice, this list of conditions and the following disclaimer in the
21      documentation and/or other materials provided with the distribution.
22
23    * Neither the name of the University of Cambridge nor the names of its
24      contributors may be used to endorse or promote products derived from
25      this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42/* This module contains mode-dependent macro and structure definitions. The
43file is #included by pcre2_internal.h if PCRE2_CODE_UNIT_WIDTH is defined.
44These mode-dependent items are kept in a separate file so that they can also be
45#included multiple times for different code unit widths by pcre2test in order
46to have access to the hidden structures at all supported widths.
47
48Some of the mode-dependent macros are required at different widths for
49different parts of the pcre2test code (in particular, the included
50pcre2_printint.c file). We undefine them here so that they can be re-defined for
51multiple inclusions. Not all of these are used in pcre2test, but it's easier
52just to undefine them all. */
53
/* Remove any earlier definition of the mode-dependent macros so that this file
can be #included again for another code unit width. The list is alphabetical.
Not reset here, although they are defined further down: LINK_SIZE (which the
16-bit and 32-bit sections #undef and redefine), MAYBE_UTF_MULTI,
SUPPORT_WIDE_CHARS, the UCHAR21 macros and the GETUTF16 helper macros.
NOTE(review): a file that includes this header for several widths presumably
has to restore LINK_SIZE itself before each inclusion - confirm in pcre2test. */
54#undef ACROSSCHAR
55#undef BACKCHAR
56#undef BYTES2CU
57#undef CU2BYTES
58#undef FORWARDCHAR
59#undef FORWARDCHARTEST
60#undef GET
61#undef GET2
62#undef GETCHAR
63#undef GETCHARINC
64#undef GETCHARINCTEST
65#undef GETCHARLEN
66#undef GETCHARLENTEST
67#undef GETCHARTEST
68#undef GET_EXTRALEN
69#undef HAS_EXTRALEN
70#undef IMM2_SIZE
71#undef MAX_255
72#undef MAX_MARK
73#undef MAX_PATTERN_SIZE
74#undef MAX_UTF_SINGLE_CU
75#undef NOT_FIRSTCU
76#undef PUT
77#undef PUT2
78#undef PUT2INC
79#undef PUTCHAR
80#undef PUTINC
81#undef TABLE_GET
82
83
84
85/* -------------------------- MACROS ----------------------------- */
86
87/* PCRE keeps offsets in its compiled code as at least 16-bit quantities
88(always stored in big-endian order in 8-bit mode) by default. These are used,
89for example, to link from the start of a subpattern to its alternatives and its
90end. The use of 16 bits per offset limits the size of an 8-bit compiled regex
91to around 64K, which is big enough for almost everybody. However, I received a
92request for an even bigger limit. For this reason, and also to make the code
93easier to maintain, the storing and loading of offsets from the compiled code
94unit string is now handled by the macros that are defined here.
95
96The macros are controlled by the value of LINK_SIZE. This defaults to 2, but
97values of 3 or 4 are also supported. */
98
99/* ------------------- 8-bit support  ------------------ */
100
101#if PCRE2_CODE_UNIT_WIDTH == 8
102
103#if LINK_SIZE == 2
/* Two-byte, big-endian link at a[n] and a[n+1].
PUT(a,n,d) stores the low 16 bits of d and is a single parenthesized
expression; GET(a,n) returns the stored value as an unsigned int. The
arguments a and n may be arbitrary expressions, but a and n are expanded more
than once (and d twice), so they must not have side effects. Each code unit is
widened to unsigned int before it is shifted. */

#define PUT(a,n,d)   \
  ((a)[n] = (PCRE2_UCHAR)((d) >> 8), \
   (a)[(n)+1] = (PCRE2_UCHAR)((d) & 255))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 8) | (unsigned int)(a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 16)
110
111#elif LINK_SIZE == 3
/* Three-byte, big-endian link at a[n] .. a[n+2].
PUT(a,n,d) stores the low 24 bits of d and is a single parenthesized
expression; GET(a,n) returns the stored value as an unsigned int. The
arguments a and n may be arbitrary expressions, but all arguments are expanded
more than once, so they must not have side effects. Each code unit is widened
to unsigned int before it is shifted. */

#define PUT(a,n,d)       \
  ((a)[n] = (PCRE2_UCHAR)((d) >> 16),    \
   (a)[(n)+1] = (PCRE2_UCHAR)((d) >> 8), \
   (a)[(n)+2] = (PCRE2_UCHAR)((d) & 255))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 16) | ((unsigned int)(a)[(n)+1] << 8) | \
   (unsigned int)(a)[(n)+2])
#define MAX_PATTERN_SIZE (1 << 24)
119
120#elif LINK_SIZE == 4
/* Four-byte, big-endian link at a[n] .. a[n+3].
PUT(a,n,d) stores the low 32 bits of d and is a single parenthesized
expression; GET(a,n) returns the stored value as an unsigned int. The
arguments a and n may be arbitrary expressions, but all arguments are expanded
more than once, so they must not have side effects. Each code unit is widened
to unsigned int before it is shifted, so a leading byte of 0x80 or more does
not shift into the sign bit of an int. */

#define PUT(a,n,d)        \
  ((a)[n] = (PCRE2_UCHAR)((d) >> 24),     \
   (a)[(n)+1] = (PCRE2_UCHAR)((d) >> 16), \
   (a)[(n)+2] = (PCRE2_UCHAR)((d) >> 8),  \
   (a)[(n)+3] = (PCRE2_UCHAR)((d) & 255))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 24) | ((unsigned int)(a)[(n)+1] << 16) | \
   ((unsigned int)(a)[(n)+2] << 8) | (unsigned int)(a)[(n)+3])
#define MAX_PATTERN_SIZE (1 << 30)   /* Keep it positive */
129
130#else
131#error LINK_SIZE must be 2, 3, or 4
132#endif
133
134
135/* ------------------- 16-bit support  ------------------ */
136
137#elif PCRE2_CODE_UNIT_WIDTH == 16
138
139#if LINK_SIZE == 2
/* A link that was requested as two bytes fits in one 16-bit code unit, so
LINK_SIZE is redefined to 1 (it now counts code units). PUT and GET access the
single unit a[n]; both are fully parenthesized so that a may be any pointer
expression. GET keeps the type of the array element. */

#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 16)
147
148#elif LINK_SIZE == 3 || LINK_SIZE == 4
/* A link that was requested as three or four bytes is stored in two 16-bit
code units, most significant unit first, so LINK_SIZE is redefined to 2 (it
now counts code units). PUT is a single parenthesized expression; GET returns
an unsigned int and widens each unit before shifting. The arguments a and n
may be arbitrary expressions, but they are expanded more than once and must
not have side effects. */

#undef LINK_SIZE
#define LINK_SIZE 2
#define PUT(a,n,d)   \
  ((a)[n] = (PCRE2_UCHAR)((d) >> 16), \
   (a)[(n)+1] = (PCRE2_UCHAR)((d) & 65535))
#define GET(a,n) \
  (((unsigned int)(a)[n] << 16) | (unsigned int)(a)[(n)+1])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
157
158#else
159#error LINK_SIZE must be 2, 3, or 4
160#endif
161
162
163/* ------------------- 32-bit support  ------------------ */
164
165#elif PCRE2_CODE_UNIT_WIDTH == 32
/* In the 32-bit library every link fits in one code unit whatever LINK_SIZE
was requested, so LINK_SIZE is redefined to 1 (it now counts code units). PUT
and GET access the single unit a[n]; both are fully parenthesized so that a
may be any pointer expression. GET keeps the type of the array element. */

#undef LINK_SIZE
#define LINK_SIZE 1
#define PUT(a,n,d)   \
  ((a)[n] = (d))
#define GET(a,n) \
  ((a)[n])
#define MAX_PATTERN_SIZE (1 << 30)  /* Keep it positive */
173
174#else
175#error Unsupported compiling mode
176#endif
177
178
179/* --------------- Other mode-specific macros ----------------- */
180
181/* PCRE uses some other (at least) 16-bit quantities that do not change when
182the size of offsets changes. These are used for repeat counts and for other
183things such as capturing parenthesis numbers in back references.
184
185Define the number of code units required to hold a 16-bit count/offset, and
186macros to load and store such a value. For reasons that I do not understand,
187the expression in the 8-bit GET2 macro is treated by gcc as a signed
188expression, even when a is declared as unsigned. It seems that any kind of
189arithmetic results in a signed value. Hence the cast. */
190
191#if PCRE2_CODE_UNIT_WIDTH == 8
/* A 16-bit immediate value occupies two code units in 8-bit mode, most
significant byte first. GET2 returns an unsigned int; PUT2 is a single
parenthesized expression. The arguments a and n may be arbitrary expressions,
but they are expanded more than once and must not have side effects. */

#define IMM2_SIZE 2
#define GET2(a,n) (((unsigned int)(a)[n] << 8) | (unsigned int)(a)[(n)+1])
#define PUT2(a,n,d) ((a)[n] = (d) >> 8, (a)[(n)+1] = (d) & 255)
195
196#else  /* Code units are 16 or 32 bits */
/* A 16-bit immediate value fits in one code unit in 16-bit and 32-bit modes.
Both macros are fully parenthesized so that a and d may be arbitrary
expressions and the expansion can be used inside a larger expression. GET2
keeps the type of the array element. */

#define IMM2_SIZE 1
#define GET2(a,n) ((a)[n])
#define PUT2(a,n,d) ((a)[n] = (d))
200#endif
201
202/* Other macros that are different for 8-bit mode. The MAX_255 macro checks
203whether its argument is less than 256. The maximum length of a MARK name must
204fit in one code unit; currently it is set to 255 or 65535. The TABLE_GET macro
205is used to access elements of tables containing exactly 256 items. When code
206points can be greater than 255, a check is needed before accessing these
207tables. */
208
/* 8-bit mode: a code unit can never exceed 255, so MAX_255 ignores its
argument and expands to TRUE, and TABLE_GET indexes the table directly and
ignores its default argument. MAX_MARK is 255. SUPPORT_WIDE_CHARS is defined
only when SUPPORT_UNICODE is. */
209#if PCRE2_CODE_UNIT_WIDTH == 8
210#define MAX_255(c) TRUE
211#define MAX_MARK ((1u << 8) - 1)
212#ifdef SUPPORT_UNICODE
213#define SUPPORT_WIDE_CHARS
214#endif  /* SUPPORT_UNICODE */
215#define TABLE_GET(c, table, default) ((table)[c])
216
/* 16-bit and 32-bit modes: MAX_255 is a real range test, MAX_MARK is 65535,
SUPPORT_WIDE_CHARS is always defined, and TABLE_GET yields the default value
for any c above 255. TABLE_GET expands c twice (once in the test, once as the
index), so c must not have side effects. */
217#else  /* Code units are 16 or 32 bits */
218#define MAX_255(c) ((c) <= 255u)
219#define MAX_MARK ((1u << 16) - 1)
220#define SUPPORT_WIDE_CHARS
221#define TABLE_GET(c, table, default) (MAX_255(c)? ((table)[c]):(default))
222#endif
223
224
225
226/* ----------------- Character-handling macros ----------------- */
227
228/* There is a proposed future special "UTF-21" mode, in which only the lowest
22921 bits of a 32-bit character are interpreted as UTF, with the remaining 11
230high-order bits available to the application for other uses. In preparation for
231the future implementation of this mode, there are macros that load a data item
232and, if in this special mode, mask it to 21 bits. These macros all have names
233starting with UCHAR21. In all other modes, including the normal 32-bit
234library, the macros all have the same simple definitions. When the new mode is
235implemented, it is expected that these definitions will be varied appropriately
236using #ifdef when compiling the library that supports the special mode. */
237
/* Load the code unit that eptr points at. The INC forms also post-increment
eptr, so their argument must be a modifiable lvalue. As defined here the TEST
forms are identical to the non-TEST forms. */
238#define UCHAR21(eptr)        (*(eptr))
239#define UCHAR21TEST(eptr)    (*(eptr))
240#define UCHAR21INC(eptr)     (*(eptr)++)
241#define UCHAR21INCTEST(eptr) (*(eptr)++)
242
243/* When UTF encoding is being used, a character is no longer just a single
244byte in 8-bit mode or a single short in 16-bit mode. The macros for character
245handling generate simple sequences when used in the basic mode, and more
246complicated ones for UTF characters. GETCHARLENTEST and other macros are not
247used when UTF is not supported. To make sure they can never even appear when
248UTF support is omitted, we don't even define them. */
249
250#ifndef SUPPORT_UNICODE
251
/* Without Unicode support every character is exactly one code unit, so the
"get" macros reduce to a plain load. The macros listed only inside comments
are deliberately left undefined in this configuration.

The pointer and value arguments are parenthesized so that expressions such as
"ptr + 1" can be passed. The statement macros keep their trailing semicolon.
GETCHARINC and GETCHARINCTEST post-increment eptr, which must therefore be a
modifiable lvalue. GETCHARLEN never changes len. PUTCHAR stores c at p and
yields 1, the number of code units written. */

/* #define MAX_UTF_SINGLE_CU */
/* #define HAS_EXTRALEN(c) */
/* #define GET_EXTRALEN(c) */
/* #define NOT_FIRSTCU(c) */
#define GETCHAR(c, eptr) c = *(eptr);
#define GETCHARTEST(c, eptr) c = *(eptr);
#define GETCHARINC(c, eptr) c = *(eptr)++;
#define GETCHARINCTEST(c, eptr) c = *(eptr)++;
#define GETCHARLEN(c, eptr, len) c = *(eptr);
#define PUTCHAR(c, p) (*(p) = (c), 1)
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
/* #define FORWARDCHAR(eptr) */
/* #define FORWARDCHARTEST(eptr,end) */
/* #define ACROSSCHAR(condition, eptr, action) */
267
268#else   /* SUPPORT_UNICODE */
269
270/* ------------------- 8-bit support  ------------------ */
271
272#if PCRE2_CODE_UNIT_WIDTH == 8
/* Character handling for the 8-bit library when Unicode is supported.

General notes that apply to the macros below:
  - Pointer, limit and value arguments are parenthesized so that expressions
    such as "ptr + 1" can be passed to the non-advancing macros.
  - The GETCHAR family expands to two statements and ends with a semicolon;
    do not use one as the unbraced body of an if/else or a loop.
  - The ...INC macros, BACKCHAR and the FORWARDCHAR macros modify eptr, which
    must therefore be a modifiable lvalue.
  - The ...TEST macros read a variable called "utf" from the calling scope.
  - GETUTF8, GETUTF8INC, GETUTF8LEN, HASUTF8EXTRALEN and PRIV() are defined
    outside this file. */

#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 127

/* Tests whether the code point needs extra characters to decode. */

#define HAS_EXTRALEN(c) HASUTF8EXTRALEN(c)

/* Returns the additional number of code units if HAS_EXTRALEN(c) is TRUE.
Otherwise the result is undefined. */

#define GET_EXTRALEN(c) (PRIV(utf8_table4)[(c) & 0x3fu])

/* Returns TRUE if the given value is not the first code unit of a UTF
sequence. */

#define NOT_FIRSTCU(c) (((c) & 0xc0u) == 0x80u)

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8(c, eptr);

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if (c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && c >= 0xc0u) GETUTF8INC(c, eptr);

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if (c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know if we are in UTF-8 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && c >= 0xc0u) GETUTF8LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)--

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
stops when the pointer reaches end. */
#define FORWARDCHAR(eptr) while((*(eptr) & 0xc0u) == 0x80u) (eptr)++
#define FORWARDCHARTEST(eptr,end) \
  while((eptr) < (end) && (*(eptr) & 0xc0u) == 0x80u) (eptr)++

/* Same as above, but it allows a fully customizable form: "action" is repeated
while "condition" holds and the second argument is a continuation byte. Note
that the second argument is used as a code unit VALUE - the macro does not
dereference it - so callers pass something like *ptr, not ptr. */
#define ACROSSCHAR(condition, eptr, action) \
  while((condition) && ((eptr) & 0xc0u) == 0x80u) action

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)((c),(p)) : (*(p) = (c), 1))
354
355
356/* ------------------- 16-bit support  ------------------ */
357
358#elif PCRE2_CODE_UNIT_WIDTH == 16
/* Character handling for the 16-bit library when Unicode is supported.

General notes that apply to the macros below:
  - Pointer, limit and value arguments are parenthesized so that expressions
    such as "ptr + 1" can be passed to the non-advancing macros.
  - The GETCHAR family expands to two statements and ends with a semicolon;
    do not use one as the unbraced body of an if/else or a loop.
  - The ...INC macros, BACKCHAR and the FORWARDCHAR macros modify eptr, which
    must therefore be a modifiable lvalue.
  - The ...TEST macros read a variable called "utf" from the calling scope.
  - PRIV() is defined outside this file. */

#define MAYBE_UTF_MULTI          /* UTF chars may use multiple code units */

/* The largest UTF code point that can be encoded as a single code unit. */

#define MAX_UTF_SINGLE_CU 65535

/* Tests whether the code point needs extra characters to decode, that is,
whether c is a high surrogate. */

#define HAS_EXTRALEN(c) (((c) & 0xfc00u) == 0xd800u)

/* Returns the additional number of code units if HAS_EXTRALEN(c) is TRUE.
Otherwise the result is meaningless. */

#define GET_EXTRALEN(c) 1

/* Returns TRUE if the given value is not the first code unit of a UTF
sequence, that is, if it is a low surrogate. */

#define NOT_FIRSTCU(c) (((c) & 0xfc00u) == 0xdc00u)

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer. On entry c holds the high surrogate. */

#define GETUTF16(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, not advancing the pointer. This is called when
we know we are in UTF-16 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr); \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Get the next UTF-16 character, testing for UTF-16 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr); \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, advancing
the pointer. On entry c holds the high surrogate. */

#define GETUTF16INC(c, eptr) \
   { c = (((c & 0x3ffu) << 10) | (*(eptr)++ & 0x3ffu)) + 0x10000u; }

/* Get the next UTF-16 character, advancing the pointer. This is called when we
know we are in UTF-16 mode. */

#define GETCHARINC(c, eptr) \
  c = *(eptr)++; \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Get the next character, testing for UTF-16 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-16 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *(eptr)++; \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16INC(c, eptr);

/* Base macro to pick up the low surrogate of a UTF-16 character, not
advancing the pointer, incrementing the length. */

#define GETUTF16LEN(c, eptr, len) \
   { c = (((c & 0x3ffu) << 10) | ((eptr)[1] & 0x3ffu)) + 0x10000u; (len)++; }

/* Get the next UTF-16 character, not advancing the pointer, incrementing
length if there is a low surrogate. This is called when we know we are in
UTF-16 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *(eptr); \
  if ((c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* Get the next UTF-16 character, testing for UTF-16 mode, not advancing the
pointer, incrementing length if there is a low surrogate. This is called when
we do not know if we are in UTF-16 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  c = *(eptr); \
  if (utf && (c & 0xfc00u) == 0xd800u) GETUTF16LEN(c, eptr, len);

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-16 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-16 only
code.

BACKCHAR, FORWARDCHAR and FORWARDCHARTEST are wrapped in do { } while (0), as
their 32-bit counterparts are, so that the "if" inside cannot capture an "else"
that follows the macro call. */

#define BACKCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)--; } while (0)

/* Same as above, just in the other direction. FORWARDCHARTEST additionally
does nothing when the pointer has reached end. */
#define FORWARDCHAR(eptr) \
  do { if ((*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)
#define FORWARDCHARTEST(eptr,end) \
  do { if ((eptr) < (end) && (*(eptr) & 0xfc00u) == 0xdc00u) (eptr)++; } while (0)

/* Same as above, but it allows a fully customizable form: "action" is executed
once if "condition" holds and the second argument is a low surrogate. Note that
the second argument is used as a code unit VALUE - the macro does not
dereference it - so callers pass something like *ptr, not ptr. This macro is
left as a bare "if" because "action" is arbitrary caller-supplied code. */
#define ACROSSCHAR(condition, eptr, action) \
  if ((condition) && ((eptr) & 0xfc00u) == 0xdc00u) action

/* Deposit a character into memory, returning the number of code units. */

#define PUTCHAR(c, p) ((utf && (c) > MAX_UTF_SINGLE_CU)? \
  PRIV(ord2utf)((c),(p)) : (*(p) = (c), 1))
460
461
462/* ------------------- 32-bit support  ------------------ */
463
464#else
465
466/* These are trivial for the 32-bit library, since all UTF-32 characters fit
467into one PCRE2_UCHAR unit. */
468
/* Character handling for the 32-bit library when Unicode is supported. Every
character is one code unit, so nothing ever has extra length and the
pointer-adjusting macros are no-ops. The GETCHAR family keeps its trailing
semicolon; GETCHARINC and GETCHARINCTEST post-increment eptr, which must be a
modifiable lvalue. None of the macros in this branch reads "utf". */

#define MAX_UTF_SINGLE_CU (0x10ffffu)
#define HAS_EXTRALEN(c) (0)
#define GET_EXTRALEN(c) (0)
#define NOT_FIRSTCU(c) (0)

/* Get the next UTF-32 character, not advancing the pointer. This is called when
we know we are in UTF-32 mode. */

#define GETCHAR(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, testing for UTF-32 mode, and not advancing the
pointer. */

#define GETCHARTEST(c, eptr) \
  c = *(eptr);

/* Get the next UTF-32 character, advancing the pointer. This is called when we
know we are in UTF-32 mode. */

#define GETCHARINC(c, eptr) \
  c = *((eptr)++);

/* Get the next character, testing for UTF-32 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-32 mode. */

#define GETCHARINCTEST(c, eptr) \
  c = *((eptr)++);

/* Get the next UTF-32 character, not advancing the pointer, not incrementing
length (since all UTF-32 is of length 1). This is called when we know we are in
UTF-32 mode. */

#define GETCHARLEN(c, eptr, len) \
  GETCHAR(c, eptr)

/* Get the next UTF-32 character, testing for UTF-32 mode, not advancing the
pointer, not incrementing the length (since all UTF-32 is of length 1).
This is called when we do not know if we are in UTF-32 mode. */

#define GETCHARLENTEST(c, eptr, len) \
  GETCHARTEST(c, eptr)

/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-32 mode - we don't put a test within the
macro because almost all calls are already within a block of UTF-32 only
code.

These are all no-ops since all UTF-32 characters fit into one PCRE2_UCHAR. */

#define BACKCHAR(eptr) do { } while (0)

/* Same as above, just in the other direction. */

#define FORWARDCHAR(eptr) do { } while (0)
#define FORWARDCHARTEST(eptr,end) do { } while (0)

/* Same as above, but it allows a fully customizable form. Here none of the
arguments is evaluated. */

#define ACROSSCHAR(condition, eptr, action) do { } while (0)

/* Deposit a character into memory, returning the number of code units. The
arguments are parenthesized so that p may be a pointer expression. */

#define PUTCHAR(c, p) (*(p) = (c), 1)
533
534#endif  /* UTF-32 character handling */
535#endif  /* SUPPORT_UNICODE */
536
537
538/* Mode-dependent macros that have the same definition in all modes. */
539
/* CU2BYTES converts a count of code units to bytes and BYTES2CU does the
reverse (integer division). PUTINC and PUT2INC store a value with PUT or PUT2
and then advance the pointer a by LINK_SIZE or IMM2_SIZE code units; each is a
single parenthesized expression whose value is the advanced pointer. Because a
is modified it must be a modifiable lvalue. */

#define CU2BYTES(x)     ((x)*((PCRE2_CODE_UNIT_WIDTH/8)))
#define BYTES2CU(x)     ((x)/((PCRE2_CODE_UNIT_WIDTH/8)))
#define PUTINC(a,n,d)   (PUT(a,n,d), (a) += LINK_SIZE)
#define PUT2INC(a,n,d)  (PUT2(a,n,d), (a) += IMM2_SIZE)
544
545
546/* ----------------------- HIDDEN STRUCTURES ----------------------------- */
547
548/* NOTE: All these structures *must* start with a pcre2_memctl structure. The
549code that uses them is simpler because it assumes this. */
550
551/* The real general context structure. At present it holds only data for custom
552memory control. */
553
554typedef struct pcre2_real_general_context {
555  pcre2_memctl memctl;    /* Custom memory control data; the only member */
556} pcre2_real_general_context;
557
558/* The real compile context structure */
559
/* Settings that apply to compiling a pattern. The pcre2_memctl block comes
first, as in every hidden structure. NOTE(review): the per-field descriptions
below are inferred from the field names and types only - confirm against the
code that fills in and reads this structure. */
560typedef struct pcre2_real_compile_context {
561  pcre2_memctl memctl;                   /* Memory control block (first member) */
562  int (*stack_guard)(uint32_t, void *);  /* Guard callback; presumably given stack_guard_data */
563  void *stack_guard_data;                /* Opaque pointer kept alongside stack_guard */
564  const uint8_t *tables;                 /* Presumably the character tables */
565  PCRE2_SIZE max_pattern_length;         /* Presumably the longest pattern accepted */
566  uint16_t bsr_convention;               /* Presumably what \R matches */
567  uint16_t newline_convention;           /* Presumably what counts as a newline */
568  uint32_t parens_nest_limit;            /* Presumably max parenthesis nesting depth */
569} pcre2_real_compile_context;
570
571/* The real match context structure. */
572
/* Settings that apply to matching. The layout depends on the build: the
stack_memctl member exists only when HEAP_MATCH_RECURSE is defined and the two
JIT callback members only when SUPPORT_JIT is defined. NOTE(review): the
per-field descriptions below are inferred from the field names and types only -
confirm against the matching code. */
573typedef struct pcre2_real_match_context {
574  pcre2_memctl memctl;              /* Memory control block (first member) */
575#ifdef HEAP_MATCH_RECURSE
576  pcre2_memctl stack_memctl;        /* Second memory control block (heap recursion builds) */
577#endif
578#ifdef SUPPORT_JIT
579  pcre2_jit_callback jit_callback;  /* JIT callback (JIT builds only) */
580  void *jit_callback_data;          /* Opaque pointer kept alongside jit_callback */
581#endif
582  int    (*callout)(pcre2_callout_block *, void *);  /* Callout function pointer */
583  void    *callout_data;            /* Opaque pointer kept alongside callout */
584  PCRE2_SIZE offset_limit;          /* Presumably a limit on the match start offset */
585  uint32_t match_limit;             /* Presumably the match() call limit */
586  uint32_t recursion_limit;         /* Presumably the recursion depth limit */
587} pcre2_real_match_context;
588
589/* The real compiled code structure. The type for the blocksize field is
590defined specially because it is required in pcre2_serialize_decode() when
591copying the size from possibly unaligned memory into a variable of the same
592type. Use a macro rather than a typedef to avoid compiler warnings when this
593file is included multiple times by pcre2test. LOOKBEHIND_MAX specifies the
594largest lookbehind that is supported. (OP_REVERSE in a pattern has a 16-bit
595argument in 8-bit and 16-bit modes, so we need no more than a 16-bit field
596here.) */
597
/* Both macros are #undef'd first so that a repeated inclusion of this file
redefines them without a diagnostic. CODE_BLOCKSIZE_TYPE is the type of the
blocksize member of pcre2_real_code. */
598#undef  CODE_BLOCKSIZE_TYPE
599#define CODE_BLOCKSIZE_TYPE size_t
600
/* UINT16_MAX is the largest value that the uint16_t max_lookbehind member of
pcre2_real_code can hold. */
601#undef  LOOKBEHIND_MAX
602#define LOOKBEHIND_MAX UINT16_MAX
603
/* Header of a compiled pattern. Members are grouped by type: pointers and the
bitmap first, then the block size, then the 32-bit and finally the 16-bit
members. NOTE(review): the name table and the compiled code presumably follow
this header in the same memory block, whose total size is recorded in
blocksize - confirm in pcre2_compile.c. */
604typedef struct pcre2_real_code {
605  pcre2_memctl memctl;            /* Memory control fields */
606  const uint8_t *tables;          /* The character tables */
607  void    *executable_jit;        /* Pointer to JIT code */
608  uint8_t  start_bitmap[32];      /* Bitmap for starting code unit < 256 */
609  CODE_BLOCKSIZE_TYPE blocksize;  /* Total (bytes) that was malloc-ed */
610  uint32_t magic_number;          /* Paranoid and endianness check */
611  uint32_t compile_options;       /* Options passed to pcre2_compile() */
612  uint32_t overall_options;       /* Options after processing the pattern */
613  uint32_t flags;                 /* Various state flags */
614  uint32_t limit_match;           /* Limit set in the pattern */
615  uint32_t limit_recursion;       /* Limit set in the pattern */
616  uint32_t first_codeunit;        /* Starting code unit */
617  uint32_t last_codeunit;         /* This codeunit must be seen */
618  uint16_t bsr_convention;        /* What \R matches */
619  uint16_t newline_convention;    /* What is a newline? */
620  uint16_t max_lookbehind;        /* Longest lookbehind (characters) */
621  uint16_t minlength;             /* Minimum length of match */
622  uint16_t top_bracket;           /* Highest numbered group */
623  uint16_t top_backref;           /* Highest numbered back reference */
624  uint16_t name_entry_size;       /* Size (code units) of table entries */
625  uint16_t name_count;            /* Number of name entries in the table */
626} pcre2_real_code;
627
628/* The real match data structure. */
629
/* Results of a match. ovector is declared with a single element and is the
last member. NOTE(review): it is presumably the start of a longer vector of
2*oveccount offsets allocated together with this structure - confirm in the
code that creates match data blocks. */
630typedef struct pcre2_real_match_data {
631  pcre2_memctl     memctl;        /* Memory control block (first member) */
632  const pcre2_real_code *code;    /* The pattern used for the match */
633  PCRE2_SPTR       subject;       /* The subject that was matched */
634  PCRE2_SPTR       mark;          /* Pointer to last mark */
635  PCRE2_SIZE       leftchar;      /* Offset to leftmost code unit */
636  PCRE2_SIZE       rightchar;     /* Offset to rightmost code unit */
637  PCRE2_SIZE       startchar;     /* Offset to starting code unit */
638  uint16_t         matchedby;     /* Type of match (normal, JIT, DFA) */
639  uint16_t         oveccount;     /* Number of pairs */
640  int              rc;            /* The return code from the match */
641  PCRE2_SIZE       ovector[1];    /* The first field */
642} pcre2_real_match_data;
643
644
645/* ----------------------- PRIVATE STRUCTURES ----------------------------- */
646
647/* These structures are not needed for pcre2test. */
648
649#ifndef PCRE2_PCRE2TEST
650
651/* Structure for checking for mutual recursion when scanning compiled code. */
652
/* One node of a singly linked chain of groups, linked through prev. */
653typedef struct recurse_check {
654  struct recurse_check *prev;   /* Previous node in the chain */
655  PCRE2_SPTR group;             /* The group this node refers to */
656} recurse_check;
657
658/* Structure for building a cache when filling in recursion offsets. */
659
/* One cache entry pairing a group pointer with a number. */
660typedef struct recurse_cache {
661  PCRE2_SPTR group;   /* Pointer to the group */
662  int recno;          /* Presumably the group's number - confirm in pcre2_compile.c */
663} recurse_cache;
664
665/* Structure for maintaining a chain of pointers to the currently incomplete
666branches, for testing for left recursion while compiling. */
667
/* One node of a chain of incomplete branches, linked through outer. */
668typedef struct branch_chain {
669  struct branch_chain *outer;    /* Enclosing node in the chain */
670  PCRE2_UCHAR *current_branch;   /* The branch currently being compiled at this level */
671} branch_chain;
672
673/* Structure for building a list of named groups during the first pass of
674compiling. */
675
/* One entry in the list of named groups. The name is not copied: the entry
points into the pattern and records the length. isdup holds a truth value
although it is declared as uint16_t. */
676typedef struct named_group {
677  PCRE2_SPTR   name;          /* Points to the name in the pattern */
678  uint32_t     number;        /* Group number */
679  uint16_t     length;        /* Length of the name */
680  uint16_t     isdup;         /* TRUE if a duplicate */
681} named_group;
682
683/* Structure for passing "static" information around between the functions
684doing the compiling, so that they are thread-safe. */
685
/* Working state shared by the compiling functions. Unlike the hidden
structures, this one does not begin with a pcre2_memctl; it points to the
compile context instead. The open_capitem type is not declared in this file.
max_lookbehind is an int here, whereas the field of the same name in
pcre2_real_code is a uint16_t. */
686typedef struct compile_block {
687  pcre2_real_compile_context *cx;  /* Points to the compile context */
688  const uint8_t *lcc;              /* Points to lower casing table */
689  const uint8_t *fcc;              /* Points to case-flipping table */
690  const uint8_t *cbits;            /* Points to character type table */
691  const uint8_t *ctypes;           /* Points to table of type maps */
692  PCRE2_SPTR start_workspace;      /* The start of working space */
693  PCRE2_SPTR start_code;           /* The start of the compiled code */
694  PCRE2_SPTR start_pattern;        /* The start of the pattern */
695  PCRE2_SPTR end_pattern;          /* The end of the pattern */
696  PCRE2_SPTR nestptr[2];           /* Pointer(s) saved for string substitution */
697  PCRE2_UCHAR *name_table;         /* The name/number table */
698  size_t workspace_size;           /* Size of workspace */
699  uint16_t names_found;            /* Number of entries so far */
700  uint16_t name_entry_size;        /* Size of each entry */
701  open_capitem *open_caps;         /* Chain of open capture items */
702  named_group *named_groups;       /* Points to vector in pre-compile */
703  uint32_t named_group_list_size;  /* Number of entries in the list */
704  uint32_t external_options;       /* External (initial) options */
705  uint32_t external_flags;         /* External flag bits to be set */
706  uint32_t bracount;               /* Count of capturing parens as we compile */
707  uint32_t final_bracount;         /* Saved value after first pass */
708  uint32_t *groupinfo;             /* Group info vector */
709  uint32_t top_backref;            /* Maximum back reference */
710  uint32_t backref_map;            /* Bitmap of low back refs */
711  uint32_t nltype;                 /* Newline type */
712  uint32_t nllen;                  /* Newline string length */
713  PCRE2_UCHAR nl[4];               /* Newline string when fixed length */
714  int  max_lookbehind;             /* Maximum lookbehind (characters) */
715  int  parens_depth;               /* Depth of nested parentheses */
716  int  assert_depth;               /* Depth of nested assertions */
717  int  req_varyopt;                /* "After variable item" flag for reqbyte */
718  BOOL had_accept;                 /* (*ACCEPT) encountered */
719  BOOL had_pruneorskip;            /* (*PRUNE) or (*SKIP) encountered */
720  BOOL had_recurse;                /* Had a recursion or subroutine call */
721  BOOL check_lookbehind;           /* Lookbehinds need later checking */
722  BOOL dupnames;                   /* Duplicate names exist */
723  BOOL iscondassert;               /* Next assert is a condition */
724} compile_block;
725
726/* Structure for keeping the properties of the in-memory stack used
727by the JIT matcher. */
728
729typedef struct pcre2_real_jit_stack {
730  pcre2_memctl memctl;   /* Memory control block (first member) */
731  void* stack;           /* Opaque pointer to the stack used by the JIT matcher */
732} pcre2_real_jit_stack;
733
734/* Structure for keeping a chain of heap blocks used for saving ovectors
735during pattern recursion when the ovector is larger than can be saved on
736the system stack. */
737
/* saved_ovec is declared with a single element and is the last member.
NOTE(review): each frame is presumably allocated with room for a whole saved
ovector - confirm in pcre2_match.c. */
738typedef struct ovecsave_frame {
739  struct ovecsave_frame *next;     /* Next frame on free chain */
740  PCRE2_SIZE saved_ovec[1];        /* First vector element */
741} ovecsave_frame;
742
743/* Structure for items in a linked list that represents an explicit recursive
744call within the pattern; used by pcre2_match(). */
745
/* One record per active recursive call, chained through prevrec; the chain
ends with a NULL prevrec. */
746typedef struct recursion_info {
747  struct recursion_info *prevrec;  /* Previous recursion record (or NULL) */
748  unsigned int group_num;          /* Number of group that was called */
749  PCRE2_SIZE *ovec_save;           /* Pointer to saved ovector frame */
750  uint32_t saved_capture_last;     /* Last capture number */
751  PCRE2_SPTR subject_position;     /* Position at start of recursion */
752} recursion_info;
753
754/* A similar structure for pcre2_dfa_match(). */
755
/* One record per active recursive call, chained through prevrec. It has the
same kind of fields as recursion_info, without the saved-ovector and
last-capture members. */
756typedef struct dfa_recursion_info {
757  struct dfa_recursion_info *prevrec;  /* Previous recursion record */
758  PCRE2_SPTR subject_position;         /* Subject position recorded for this recursion */
759  uint32_t group_num;                  /* Number of the group for this record */
760} dfa_recursion_info;
761
762/* Structure for building a chain of data for holding the values of the subject
763pointer at the start of each subpattern, so as to detect when an empty string
764has been matched by a subpattern - to break infinite loops; used by
765pcre2_match(). */
766
/* One node of a chain of saved subject pointers, linked through epb_prev. */
767typedef struct eptrblock {
768  struct eptrblock *epb_prev;    /* Previous node in the chain */
769  PCRE2_SPTR epb_saved_eptr;     /* The saved subject pointer */
770} eptrblock;
771
772/* Structure for passing "static" information around between the functions
773doing traditional NFA matching (pcre2_match() and friends). */
774
/* Working state shared by the matching functions. The layout depends on the
build: stack_memctl and match_frames_base exist only when HEAP_MATCH_RECURSE
is defined. Several members have namesakes in pcre2_real_code and
pcre2_real_match_context (the limits, the callout pair, bsr_convention and the
name table fields); NOTE(review): presumably they are copied from there when a
match starts - confirm in pcre2_match.c. */
775typedef struct match_block {
776  pcre2_memctl memctl;            /* For general use */
777#ifdef HEAP_MATCH_RECURSE
778  pcre2_memctl stack_memctl;      /* For "stack" frames */
779#endif
780  uint32_t match_call_count;      /* As it says */
781  uint32_t match_limit;           /* As it says */
782  uint32_t match_limit_recursion; /* As it says */
783  BOOL hitend;                    /* Hit the end of the subject at some point */
784  BOOL hasthen;                   /* Pattern contains (*THEN) */
785  const uint8_t *lcc;             /* Points to lower casing table */
786  const uint8_t *fcc;             /* Points to case-flipping table */
787  const uint8_t *ctypes;          /* Points to table of type maps */
788  PCRE2_SIZE *ovector;            /* Pointer to the offset vector */
789  PCRE2_SIZE offset_end;          /* One past the end */
790  PCRE2_SIZE offset_max;          /* The maximum usable for return data */
791  PCRE2_SIZE start_offset;        /* The start offset value */
792  PCRE2_SIZE end_offset_top;      /* Highwater mark at end of match */
793  uint16_t partial;               /* PARTIAL options */
794  uint16_t bsr_convention;        /* \R interpretation */
795  uint16_t name_count;            /* Number of names in name table */
796  uint16_t name_entry_size;       /* Size of entry in names table */
797  PCRE2_SPTR name_table;          /* Table of group names */
798  PCRE2_SPTR start_code;          /* For use when recursing */
799  PCRE2_SPTR start_subject;       /* Start of the subject string */
800  PCRE2_SPTR end_subject;         /* End of the subject string */
801  PCRE2_SPTR start_match_ptr;     /* Start of matched string */
802  PCRE2_SPTR end_match_ptr;       /* Subject position at end match */
803  PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
804  PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
805  PCRE2_SPTR mark;                /* Mark pointer to pass back on success */
806  PCRE2_SPTR nomatch_mark;        /* Mark pointer to pass back on failure */
807  PCRE2_SPTR once_target;         /* Where to back up to for atomic groups */
808  uint32_t moptions;              /* Match options */
809  uint32_t poptions;              /* Pattern options */
810  uint32_t capture_last;          /* Most recent capture number + overflow flag */
811  uint32_t skip_arg_count;        /* For counting SKIP_ARGs */
812  uint32_t ignore_skip_arg;       /* For re-run when SKIP arg name not found */
813  uint32_t match_function_type;   /* Set for certain special calls of match() */
814  uint32_t nltype;                /* Newline type */
815  uint32_t nllen;                 /* Newline string length */
816  PCRE2_UCHAR nl[4];              /* Newline string when fixed */
817  eptrblock *eptrchain;           /* Chain of eptrblocks for tail recursions */
818  recursion_info *recursive;      /* Linked list of recursion data */
819  ovecsave_frame *ovecsave_chain; /* Linked list of free ovecsave blocks */
820  void  *callout_data;            /* To pass back to callouts */
821  int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
822#ifdef HEAP_MATCH_RECURSE
823  void  *match_frames_base;       /* For remembering malloc'd frames */
824#endif
825} match_block;
826
827/* A similar structure is used for the same purpose by the DFA matching
828functions. */
829
/* Working state shared by the DFA matching functions. Compared with
match_block it keeps a single tables pointer instead of separate lcc, fcc and
ctypes pointers, and it has no offset vector, mark or limit members. */
830typedef struct dfa_match_block {
831  pcre2_memctl memctl;            /* For general use */
832  PCRE2_SPTR start_code;          /* Start of the compiled pattern */
833  PCRE2_SPTR start_subject ;      /* Start of the subject string */
834  PCRE2_SPTR end_subject;         /* End of subject string */
835  PCRE2_SPTR start_used_ptr;      /* Earliest consulted character */
836  PCRE2_SPTR last_used_ptr;       /* Latest consulted character */
837  const uint8_t *tables;          /* Character tables */
838  PCRE2_SIZE start_offset;        /* The start offset value */
839  uint32_t moptions;              /* Match options */
840  uint32_t poptions;              /* Pattern options */
841  uint32_t nltype;                /* Newline type */
842  uint32_t nllen;                 /* Newline string length */
843  PCRE2_UCHAR nl[4];              /* Newline string when fixed */
844  uint16_t bsr_convention;        /* \R interpretation */
845  void *callout_data;             /* To pass back to callouts */
846  int (*callout)(pcre2_callout_block *,void *);  /* Callout function or NULL */
847  dfa_recursion_info *recursive;  /* Linked list of recursion data */
848} dfa_match_block;
849
850#endif  /* PCRE2_PCRE2TEST */
851
852/* End of pcre2_intmodedep.h */
853