1/*
2 * Copyright (C) 2009, 2010 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef YarrInterpreter_h
27#define YarrInterpreter_h
28
29#include "YarrPattern.h"
30#include <wtf/PassOwnPtr.h>
31#include <wtf/unicode/Unicode.h>
32
33namespace WTF {
34class BumpPointerAllocator;
35}
36using WTF::BumpPointerAllocator;
37
38namespace JSC { namespace Yarr {
39
40class ByteDisjunction;
41
42struct ByteTerm {
43    enum Type {
44        TypeBodyAlternativeBegin,
45        TypeBodyAlternativeDisjunction,
46        TypeBodyAlternativeEnd,
47        TypeAlternativeBegin,
48        TypeAlternativeDisjunction,
49        TypeAlternativeEnd,
50        TypeSubpatternBegin,
51        TypeSubpatternEnd,
52        TypeAssertionBOL,
53        TypeAssertionEOL,
54        TypeAssertionWordBoundary,
55        TypePatternCharacterOnce,
56        TypePatternCharacterFixed,
57        TypePatternCharacterGreedy,
58        TypePatternCharacterNonGreedy,
59        TypePatternCasedCharacterOnce,
60        TypePatternCasedCharacterFixed,
61        TypePatternCasedCharacterGreedy,
62        TypePatternCasedCharacterNonGreedy,
63        TypeCharacterClass,
64        TypeBackReference,
65        TypeParenthesesSubpattern,
66        TypeParenthesesSubpatternOnceBegin,
67        TypeParenthesesSubpatternOnceEnd,
68        TypeParenthesesSubpatternTerminalBegin,
69        TypeParenthesesSubpatternTerminalEnd,
70        TypeParentheticalAssertionBegin,
71        TypeParentheticalAssertionEnd,
72        TypeCheckInput,
73        TypeUncheckInput,
74    } type;
75    union {
76        struct {
77            union {
78                UChar patternCharacter;
79                struct {
80                    UChar lo;
81                    UChar hi;
82                } casedCharacter;
83                CharacterClass* characterClass;
84                unsigned subpatternId;
85            };
86            union {
87                ByteDisjunction* parenthesesDisjunction;
88                unsigned parenthesesWidth;
89            };
90            QuantifierType quantityType;
91            unsigned quantityCount;
92        } atom;
93        struct {
94            int next;
95            int end;
96            bool onceThrough;
97        } alternative;
98        unsigned checkInputCount;
99    };
100    unsigned frameLocation;
101    bool m_capture : 1;
102    bool m_invert : 1;
103    int inputPosition;
104
105    ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
106        : frameLocation(frameLocation)
107        , m_capture(false)
108        , m_invert(false)
109    {
110        switch (quantityType) {
111        case QuantifierFixedCount:
112            type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed;
113            break;
114        case QuantifierGreedy:
115            type = ByteTerm::TypePatternCharacterGreedy;
116            break;
117        case QuantifierNonGreedy:
118            type = ByteTerm::TypePatternCharacterNonGreedy;
119            break;
120        }
121
122        atom.patternCharacter = ch;
123        atom.quantityType = quantityType;
124        atom.quantityCount = quantityCount;
125        inputPosition = inputPos;
126    }
127
128    ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
129        : frameLocation(frameLocation)
130        , m_capture(false)
131        , m_invert(false)
132    {
133        switch (quantityType) {
134        case QuantifierFixedCount:
135            type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed;
136            break;
137        case QuantifierGreedy:
138            type = ByteTerm::TypePatternCasedCharacterGreedy;
139            break;
140        case QuantifierNonGreedy:
141            type = ByteTerm::TypePatternCasedCharacterNonGreedy;
142            break;
143        }
144
145        atom.casedCharacter.lo = lo;
146        atom.casedCharacter.hi = hi;
147        atom.quantityType = quantityType;
148        atom.quantityCount = quantityCount;
149        inputPosition = inputPos;
150    }
151
152    ByteTerm(CharacterClass* characterClass, bool invert, int inputPos)
153        : type(ByteTerm::TypeCharacterClass)
154        , m_capture(false)
155        , m_invert(invert)
156    {
157        atom.characterClass = characterClass;
158        atom.quantityType = QuantifierFixedCount;
159        atom.quantityCount = 1;
160        inputPosition = inputPos;
161    }
162
163    ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, int inputPos)
164        : type(type)
165        , m_capture(capture)
166        , m_invert(false)
167    {
168        atom.subpatternId = subpatternId;
169        atom.parenthesesDisjunction = parenthesesInfo;
170        atom.quantityType = QuantifierFixedCount;
171        atom.quantityCount = 1;
172        inputPosition = inputPos;
173    }
174
175    ByteTerm(Type type, bool invert = false)
176        : type(type)
177        , m_capture(false)
178        , m_invert(invert)
179    {
180        atom.quantityType = QuantifierFixedCount;
181        atom.quantityCount = 1;
182    }
183
184    ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, int inputPos)
185        : type(type)
186        , m_capture(capture)
187        , m_invert(invert)
188    {
189        atom.subpatternId = subpatternId;
190        atom.quantityType = QuantifierFixedCount;
191        atom.quantityCount = 1;
192        inputPosition = inputPos;
193    }
194
195    static ByteTerm BOL(int inputPos)
196    {
197        ByteTerm term(TypeAssertionBOL);
198        term.inputPosition = inputPos;
199        return term;
200    }
201
202    static ByteTerm CheckInput(unsigned count)
203    {
204        ByteTerm term(TypeCheckInput);
205        term.checkInputCount = count;
206        return term;
207    }
208
209    static ByteTerm UncheckInput(unsigned count)
210    {
211        ByteTerm term(TypeUncheckInput);
212        term.checkInputCount = count;
213        return term;
214    }
215
216    static ByteTerm EOL(int inputPos)
217    {
218        ByteTerm term(TypeAssertionEOL);
219        term.inputPosition = inputPos;
220        return term;
221    }
222
223    static ByteTerm WordBoundary(bool invert, int inputPos)
224    {
225        ByteTerm term(TypeAssertionWordBoundary, invert);
226        term.inputPosition = inputPos;
227        return term;
228    }
229
230    static ByteTerm BackReference(unsigned subpatternId, int inputPos)
231    {
232        return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos);
233    }
234
235    static ByteTerm BodyAlternativeBegin(bool onceThrough)
236    {
237        ByteTerm term(TypeBodyAlternativeBegin);
238        term.alternative.next = 0;
239        term.alternative.end = 0;
240        term.alternative.onceThrough = onceThrough;
241        return term;
242    }
243
244    static ByteTerm BodyAlternativeDisjunction(bool onceThrough)
245    {
246        ByteTerm term(TypeBodyAlternativeDisjunction);
247        term.alternative.next = 0;
248        term.alternative.end = 0;
249        term.alternative.onceThrough = onceThrough;
250        return term;
251    }
252
253    static ByteTerm BodyAlternativeEnd()
254    {
255        ByteTerm term(TypeBodyAlternativeEnd);
256        term.alternative.next = 0;
257        term.alternative.end = 0;
258        term.alternative.onceThrough = false;
259        return term;
260    }
261
262    static ByteTerm AlternativeBegin()
263    {
264        ByteTerm term(TypeAlternativeBegin);
265        term.alternative.next = 0;
266        term.alternative.end = 0;
267        term.alternative.onceThrough = false;
268        return term;
269    }
270
271    static ByteTerm AlternativeDisjunction()
272    {
273        ByteTerm term(TypeAlternativeDisjunction);
274        term.alternative.next = 0;
275        term.alternative.end = 0;
276        term.alternative.onceThrough = false;
277        return term;
278    }
279
280    static ByteTerm AlternativeEnd()
281    {
282        ByteTerm term(TypeAlternativeEnd);
283        term.alternative.next = 0;
284        term.alternative.end = 0;
285        term.alternative.onceThrough = false;
286        return term;
287    }
288
289    static ByteTerm SubpatternBegin()
290    {
291        return ByteTerm(TypeSubpatternBegin);
292    }
293
294    static ByteTerm SubpatternEnd()
295    {
296        return ByteTerm(TypeSubpatternEnd);
297    }
298
299    bool invert()
300    {
301        return m_invert;
302    }
303
304    bool capture()
305    {
306        return m_capture;
307    }
308};
309
310class ByteDisjunction {
311    WTF_MAKE_FAST_ALLOCATED;
312public:
313    ByteDisjunction(unsigned numSubpatterns, unsigned frameSize)
314        : m_numSubpatterns(numSubpatterns)
315        , m_frameSize(frameSize)
316    {
317    }
318
319    Vector<ByteTerm> terms;
320    unsigned m_numSubpatterns;
321    unsigned m_frameSize;
322};
323
324struct BytecodePattern {
325    WTF_MAKE_FAST_ALLOCATED;
326public:
327    BytecodePattern(PassOwnPtr<ByteDisjunction> body, Vector<ByteDisjunction*> allParenthesesInfo, YarrPattern& pattern, BumpPointerAllocator* allocator)
328        : m_body(body)
329        , m_ignoreCase(pattern.m_ignoreCase)
330        , m_multiline(pattern.m_multiline)
331        , m_containsBeginChars(pattern.m_containsBeginChars)
332        , m_allocator(allocator)
333    {
334        newlineCharacterClass = pattern.newlineCharacterClass();
335        wordcharCharacterClass = pattern.wordcharCharacterClass();
336
337        m_allParenthesesInfo.append(allParenthesesInfo);
338        m_userCharacterClasses.append(pattern.m_userCharacterClasses);
339        // 'Steal' the YarrPattern's CharacterClasses!  We clear its
340        // array, so that it won't delete them on destruction.  We'll
341        // take responsibility for that.
342        pattern.m_userCharacterClasses.clear();
343
344        m_beginChars.append(pattern.m_beginChars);
345    }
346
347    ~BytecodePattern()
348    {
349        deleteAllValues(m_allParenthesesInfo);
350        deleteAllValues(m_userCharacterClasses);
351    }
352
353    OwnPtr<ByteDisjunction> m_body;
354    bool m_ignoreCase;
355    bool m_multiline;
356    bool m_containsBeginChars;
357    // Each BytecodePattern is associated with a RegExp, each RegExp is associated
358    // with a JSGlobalData.  Cache a pointer to out JSGlobalData's m_regExpAllocator.
359    BumpPointerAllocator* m_allocator;
360
361    CharacterClass* newlineCharacterClass;
362    CharacterClass* wordcharCharacterClass;
363
364    Vector<BeginChar> m_beginChars;
365
366private:
367    Vector<ByteDisjunction*> m_allParenthesesInfo;
368    Vector<CharacterClass*> m_userCharacterClasses;
369};
370
371} } // namespace JSC::Yarr
372
373#endif // YarrInterpreter_h
374