1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef RegexInterpreter_h
27#define RegexInterpreter_h
28
29#include <wtf/Platform.h>
30
31#if ENABLE(YARR)
32
33#include <wtf/unicode/Unicode.h>
34#include "RegexParser.h"
35#include "RegexPattern.h"
36
37namespace JSC { namespace Yarr {
38
39class ByteDisjunction;
40
41struct ByteTerm {
42    enum Type {
43        TypeBodyAlternativeBegin,
44        TypeBodyAlternativeDisjunction,
45        TypeBodyAlternativeEnd,
46        TypeAlternativeBegin,
47        TypeAlternativeDisjunction,
48        TypeAlternativeEnd,
49        TypeSubpatternBegin,
50        TypeSubpatternEnd,
51        TypeAssertionBOL,
52        TypeAssertionEOL,
53        TypeAssertionWordBoundary,
54        TypePatternCharacterOnce,
55        TypePatternCharacterFixed,
56        TypePatternCharacterGreedy,
57        TypePatternCharacterNonGreedy,
58        TypePatternCasedCharacterOnce,
59        TypePatternCasedCharacterFixed,
60        TypePatternCasedCharacterGreedy,
61        TypePatternCasedCharacterNonGreedy,
62        TypeCharacterClass,
63        TypeBackReference,
64        TypeParenthesesSubpattern,
65        TypeParenthesesSubpatternOnceBegin,
66        TypeParenthesesSubpatternOnceEnd,
67        TypeParentheticalAssertionBegin,
68        TypeParentheticalAssertionEnd,
69        TypeCheckInput,
70    } type;
71    bool invertOrCapture;
72    union {
73        struct {
74            union {
75                UChar patternCharacter;
76                struct {
77                    UChar lo;
78                    UChar hi;
79                } casedCharacter;
80                CharacterClass* characterClass;
81                unsigned subpatternId;
82            };
83            union {
84                ByteDisjunction* parenthesesDisjunction;
85                unsigned parenthesesWidth;
86            };
87            QuantifierType quantityType;
88            unsigned quantityCount;
89        } atom;
90        struct {
91            int next;
92            int end;
93        } alternative;
94        unsigned checkInputCount;
95    };
96    unsigned frameLocation;
97    int inputPosition;
98
99    ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
100        : frameLocation(frameLocation)
101    {
102        switch (quantityType) {
103        case QuantifierFixedCount:
104            type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
105            break;
106        case QuantifierGreedy:
107            type = ByteTerm::TypePatternCharacterGreedy;
108            break;
109        case QuantifierNonGreedy:
110            type = ByteTerm::TypePatternCharacterNonGreedy;
111            break;
112        }
113
114        atom.patternCharacter = ch;
115        atom.quantityType = quantityType;
116        atom.quantityCount = quantityCount;
117        inputPosition = inputPos;
118    }
119
120    ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
121        : frameLocation(frameLocation)
122    {
123        switch (quantityType) {
124        case QuantifierFixedCount:
125            type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
126            break;
127        case QuantifierGreedy:
128            type = ByteTerm::TypePatternCasedCharacterGreedy;
129            break;
130        case QuantifierNonGreedy:
131            type = ByteTerm::TypePatternCasedCharacterNonGreedy;
132            break;
133        }
134
135        atom.casedCharacter.lo = lo;
136        atom.casedCharacter.hi = hi;
137        atom.quantityType = quantityType;
138        atom.quantityCount = quantityCount;
139        inputPosition = inputPos;
140    }
141
142    ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
143        : type(ByteTerm::TypeCharacterClass)
144        , invertOrCapture(invert)
145    {
146        atom.characterClass = characterClass;
147        atom.quantityType = QuantifierFixedCount;
148        atom.quantityCount = 1;
149        inputPosition = inputPos;
150    }
151
152    ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos)
153        : type(type)
154        , invertOrCapture(invertOrCapture)
155    {
156        atom.subpatternId = subpatternId;
157        atom.parenthesesDisjunction = parenthesesInfo;
158        atom.quantityType = QuantifierFixedCount;
159        atom.quantityCount = 1;
160        inputPosition = inputPos;
161    }
162
163    ByteTerm(Type type, bool invert = false)
164        : type(type)
165        , invertOrCapture(invert)
166    {
167        atom.quantityType = QuantifierFixedCount;
168        atom.quantityCount = 1;
169    }
170
171    ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos)
172        : type(type)
173        , invertOrCapture(invertOrCapture)
174    {
175        atom.subpatternId = subpatternId;
176        atom.quantityType = QuantifierFixedCount;
177        atom.quantityCount = 1;
178        inputPosition = inputPos;
179    }
180
181    static ByteTerm BOL(int inputPos)
182    {
183        ByteTerm term(TypeAssertionBOL);
184        term.inputPosition = inputPos;
185        return term;
186    }
187
188    static ByteTerm CheckInput(unsigned count)
189    {
190        ByteTerm term(TypeCheckInput);
191        term.checkInputCount = count;
192        return term;
193    }
194
195    static ByteTerm EOL(int inputPos)
196    {
197        ByteTerm term(TypeAssertionEOL);
198        term.inputPosition = inputPos;
199        return term;
200    }
201
202    static ByteTerm WordBoundary(bool invert, int inputPos)
203    {
204        ByteTerm term(TypeAssertionWordBoundary, invert);
205        term.inputPosition = inputPos;
206        return term;
207    }
208
209    static ByteTerm BackReference(unsigned subpatternId, int inputPos)
210    {
211        return ByteTerm(TypeBackReference, subpatternId, false, inputPos);
212    }
213
214    static ByteTerm BodyAlternativeBegin()
215    {
216        ByteTerm term(TypeBodyAlternativeBegin);
217        term.alternative.next = 0;
218        term.alternative.end = 0;
219        return term;
220    }
221
222    static ByteTerm BodyAlternativeDisjunction()
223    {
224        ByteTerm term(TypeBodyAlternativeDisjunction);
225        term.alternative.next = 0;
226        term.alternative.end = 0;
227        return term;
228    }
229
230    static ByteTerm BodyAlternativeEnd()
231    {
232        ByteTerm term(TypeBodyAlternativeEnd);
233        term.alternative.next = 0;
234        term.alternative.end = 0;
235        return term;
236    }
237
238    static ByteTerm AlternativeBegin()
239    {
240        ByteTerm term(TypeAlternativeBegin);
241        term.alternative.next = 0;
242        term.alternative.end = 0;
243        return term;
244    }
245
246    static ByteTerm AlternativeDisjunction()
247    {
248        ByteTerm term(TypeAlternativeDisjunction);
249        term.alternative.next = 0;
250        term.alternative.end = 0;
251        return term;
252    }
253
254    static ByteTerm AlternativeEnd()
255    {
256        ByteTerm term(TypeAlternativeEnd);
257        term.alternative.next = 0;
258        term.alternative.end = 0;
259        return term;
260    }
261
262    static ByteTerm SubpatternBegin()
263    {
264        return ByteTerm(TypeSubpatternBegin);
265    }
266
267    static ByteTerm SubpatternEnd()
268    {
269        return ByteTerm(TypeSubpatternEnd);
270    }
271
272    bool invert()
273    {
274        return invertOrCapture;
275    }
276
277    bool capture()
278    {
279        return invertOrCapture;
280    }
281};
282
283class ByteDisjunction : public FastAllocBase {
284public:
285    ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
286        : m_numSubpatterns(numSubpatterns)
287        , m_frameSize(frameSize)
288    {
289    }
290
291    Vector<ByteTerm> terms;
292    unsigned m_numSubpatterns;
293    unsigned m_frameSize;
294};
295
296struct BytecodePattern : FastAllocBase {
297    BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern)
298        : m_body(body)
299        , m_ignoreCase(pattern.m_ignoreCase)
300        , m_multiline(pattern.m_multiline)
301    {
302        newlineCharacterClass = pattern.newlineCharacterClass();
303        wordcharCharacterClass = pattern.wordcharCharacterClass();
304
305        m_allParenthesesInfo.append(allParenthesesInfo);
306        m_userCharacterClasses.append(pattern.m_userCharacterClasses);
307        // 'Steal' the RegexPattern's CharacterClasses!  We clear its
308        // array, so that it won't delete them on destruction.  We'll
309        // take responsibility for that.
310        pattern.m_userCharacterClasses.clear();
311    }
312
313    ~BytecodePattern()
314    {
315        deleteAllValues(m_allParenthesesInfo);
316        deleteAllValues(m_userCharacterClasses);
317    }
318
319    OwnPtr<ByteDisjunction> m_body;
320    bool m_ignoreCase;
321    bool m_multiline;
322
323    CharacterClass* newlineCharacterClass;
324    CharacterClass* wordcharCharacterClass;
325private:
326    Vector<ByteDisjunction*> m_allParenthesesInfo;
327    Vector<CharacterClass*> m_userCharacterClasses;
328};
329
330BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false);
331int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output);
332
333} } // namespace JSC::Yarr
334
335#endif
336
337#endif // RegexInterpreter_h
338