1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#ifndef YarrPattern_h
28#define YarrPattern_h
29
30#include <runtime/UString.h>
31#include <wtf/Vector.h>
32#include <wtf/unicode/Unicode.h>
33
34namespace JSC { namespace Yarr {
35
36struct PatternDisjunction;
37
38struct CharacterRange {
39    UChar begin;
40    UChar end;
41
42    CharacterRange(UChar begin, UChar end)
43        : begin(begin)
44        , end(end)
45    {
46    }
47};
48
49struct CharacterClassTable : RefCounted<CharacterClassTable> {
50    const char* m_table;
51    bool m_inverted;
52    static PassRefPtr<CharacterClassTable> create(const char* table, bool inverted)
53    {
54        return adoptRef(new CharacterClassTable(table, inverted));
55    }
56
57private:
58    CharacterClassTable(const char* table, bool inverted)
59        : m_table(table)
60        , m_inverted(inverted)
61    {
62    }
63};
64
65struct CharacterClass {
66    WTF_MAKE_FAST_ALLOCATED;
67public:
68    // All CharacterClass instances have to have the full set of matches and ranges,
69    // they may have an optional table for faster lookups (which must match the
70    // specified matches and ranges)
71    CharacterClass(PassRefPtr<CharacterClassTable> table)
72        : m_table(table)
73    {
74    }
75    Vector<UChar> m_matches;
76    Vector<CharacterRange> m_ranges;
77    Vector<UChar> m_matchesUnicode;
78    Vector<CharacterRange> m_rangesUnicode;
79    RefPtr<CharacterClassTable> m_table;
80};
81
82enum QuantifierType {
83    QuantifierFixedCount,
84    QuantifierGreedy,
85    QuantifierNonGreedy,
86};
87
88struct PatternTerm {
89    enum Type {
90        TypeAssertionBOL,
91        TypeAssertionEOL,
92        TypeAssertionWordBoundary,
93        TypePatternCharacter,
94        TypeCharacterClass,
95        TypeBackReference,
96        TypeForwardReference,
97        TypeParenthesesSubpattern,
98        TypeParentheticalAssertion,
99    } type;
100    bool m_capture :1;
101    bool m_invert :1;
102    union {
103        UChar patternCharacter;
104        CharacterClass* characterClass;
105        unsigned backReferenceSubpatternId;
106        struct {
107            PatternDisjunction* disjunction;
108            unsigned subpatternId;
109            unsigned lastSubpatternId;
110            bool isCopy;
111            bool isTerminal;
112        } parentheses;
113    };
114    QuantifierType quantityType;
115    unsigned quantityCount;
116    int inputPosition;
117    unsigned frameLocation;
118
119    PatternTerm(UChar ch)
120        : type(PatternTerm::TypePatternCharacter)
121        , m_capture(false)
122        , m_invert(false)
123    {
124        patternCharacter = ch;
125        quantityType = QuantifierFixedCount;
126        quantityCount = 1;
127    }
128
129    PatternTerm(CharacterClass* charClass, bool invert)
130        : type(PatternTerm::TypeCharacterClass)
131        , m_capture(false)
132        , m_invert(invert)
133    {
134        characterClass = charClass;
135        quantityType = QuantifierFixedCount;
136        quantityCount = 1;
137    }
138
139    PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
140        : type(type)
141        , m_capture(capture)
142        , m_invert(invert)
143    {
144        parentheses.disjunction = disjunction;
145        parentheses.subpatternId = subpatternId;
146        parentheses.isCopy = false;
147        parentheses.isTerminal = false;
148        quantityType = QuantifierFixedCount;
149        quantityCount = 1;
150    }
151
152    PatternTerm(Type type, bool invert = false)
153        : type(type)
154        , m_capture(false)
155        , m_invert(invert)
156    {
157        quantityType = QuantifierFixedCount;
158        quantityCount = 1;
159    }
160
161    PatternTerm(unsigned spatternId)
162        : type(TypeBackReference)
163        , m_capture(false)
164        , m_invert(false)
165    {
166        backReferenceSubpatternId = spatternId;
167        quantityType = QuantifierFixedCount;
168        quantityCount = 1;
169    }
170
171    static PatternTerm ForwardReference()
172    {
173        return PatternTerm(TypeForwardReference);
174    }
175
176    static PatternTerm BOL()
177    {
178        return PatternTerm(TypeAssertionBOL);
179    }
180
181    static PatternTerm EOL()
182    {
183        return PatternTerm(TypeAssertionEOL);
184    }
185
186    static PatternTerm WordBoundary(bool invert)
187    {
188        return PatternTerm(TypeAssertionWordBoundary, invert);
189    }
190
191    bool invert()
192    {
193        return m_invert;
194    }
195
196    bool capture()
197    {
198        return m_capture;
199    }
200
201    void quantify(unsigned count, QuantifierType type)
202    {
203        quantityCount = count;
204        quantityType = type;
205    }
206};
207
208struct PatternAlternative {
209    WTF_MAKE_FAST_ALLOCATED;
210public:
211    PatternAlternative(PatternDisjunction* disjunction)
212        : m_parent(disjunction)
213        , m_onceThrough(false)
214        , m_hasFixedSize(false)
215        , m_startsWithBOL(false)
216        , m_containsBOL(false)
217    {
218    }
219
220    PatternTerm& lastTerm()
221    {
222        ASSERT(m_terms.size());
223        return m_terms[m_terms.size() - 1];
224    }
225
226    void removeLastTerm()
227    {
228        ASSERT(m_terms.size());
229        m_terms.shrink(m_terms.size() - 1);
230    }
231
232    void setOnceThrough()
233    {
234        m_onceThrough = true;
235    }
236
237    bool onceThrough()
238    {
239        return m_onceThrough;
240    }
241
242    Vector<PatternTerm> m_terms;
243    PatternDisjunction* m_parent;
244    unsigned m_minimumSize;
245    bool m_onceThrough : 1;
246    bool m_hasFixedSize : 1;
247    bool m_startsWithBOL : 1;
248    bool m_containsBOL : 1;
249};
250
251struct PatternDisjunction {
252    WTF_MAKE_FAST_ALLOCATED;
253public:
254    PatternDisjunction(PatternAlternative* parent = 0)
255        : m_parent(parent)
256        , m_hasFixedSize(false)
257    {
258    }
259
260    ~PatternDisjunction()
261    {
262        deleteAllValues(m_alternatives);
263    }
264
265    PatternAlternative* addNewAlternative()
266    {
267        PatternAlternative* alternative = new PatternAlternative(this);
268        m_alternatives.append(alternative);
269        return alternative;
270    }
271
272    Vector<PatternAlternative*> m_alternatives;
273    PatternAlternative* m_parent;
274    unsigned m_minimumSize;
275    unsigned m_callFrameSize;
276    bool m_hasFixedSize;
277};
278
279// You probably don't want to be calling these functions directly
280// (please to be calling newlineCharacterClass() et al on your
281// friendly neighborhood YarrPattern instance to get nicely
282// cached copies).
283CharacterClass* newlineCreate();
284CharacterClass* digitsCreate();
285CharacterClass* spacesCreate();
286CharacterClass* wordcharCreate();
287CharacterClass* nondigitsCreate();
288CharacterClass* nonspacesCreate();
289CharacterClass* nonwordcharCreate();
290
291struct TermChain {
292    TermChain(PatternTerm term)
293        : term(term)
294    {}
295
296    PatternTerm term;
297    Vector<TermChain> hotTerms;
298};
299
300struct BeginChar {
301    BeginChar()
302        : value(0)
303        , mask(0)
304    {}
305
306    BeginChar(unsigned value, unsigned mask)
307        : value(value)
308        , mask(mask)
309    {}
310
311    unsigned value;
312    unsigned mask;
313};
314
315struct YarrPattern {
316    YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error);
317
318    ~YarrPattern()
319    {
320        deleteAllValues(m_disjunctions);
321        deleteAllValues(m_userCharacterClasses);
322    }
323
324    void reset()
325    {
326        m_numSubpatterns = 0;
327        m_maxBackReference = 0;
328
329        m_containsBackreferences = false;
330        m_containsBeginChars = false;
331        m_containsBOL = false;
332
333        newlineCached = 0;
334        digitsCached = 0;
335        spacesCached = 0;
336        wordcharCached = 0;
337        nondigitsCached = 0;
338        nonspacesCached = 0;
339        nonwordcharCached = 0;
340
341        deleteAllValues(m_disjunctions);
342        m_disjunctions.clear();
343        deleteAllValues(m_userCharacterClasses);
344        m_userCharacterClasses.clear();
345        m_beginChars.clear();
346    }
347
348    bool containsIllegalBackReference()
349    {
350        return m_maxBackReference > m_numSubpatterns;
351    }
352
353    CharacterClass* newlineCharacterClass()
354    {
355        if (!newlineCached)
356            m_userCharacterClasses.append(newlineCached = newlineCreate());
357        return newlineCached;
358    }
359    CharacterClass* digitsCharacterClass()
360    {
361        if (!digitsCached)
362            m_userCharacterClasses.append(digitsCached = digitsCreate());
363        return digitsCached;
364    }
365    CharacterClass* spacesCharacterClass()
366    {
367        if (!spacesCached)
368            m_userCharacterClasses.append(spacesCached = spacesCreate());
369        return spacesCached;
370    }
371    CharacterClass* wordcharCharacterClass()
372    {
373        if (!wordcharCached)
374            m_userCharacterClasses.append(wordcharCached = wordcharCreate());
375        return wordcharCached;
376    }
377    CharacterClass* nondigitsCharacterClass()
378    {
379        if (!nondigitsCached)
380            m_userCharacterClasses.append(nondigitsCached = nondigitsCreate());
381        return nondigitsCached;
382    }
383    CharacterClass* nonspacesCharacterClass()
384    {
385        if (!nonspacesCached)
386            m_userCharacterClasses.append(nonspacesCached = nonspacesCreate());
387        return nonspacesCached;
388    }
389    CharacterClass* nonwordcharCharacterClass()
390    {
391        if (!nonwordcharCached)
392            m_userCharacterClasses.append(nonwordcharCached = nonwordcharCreate());
393        return nonwordcharCached;
394    }
395
396    bool m_ignoreCase : 1;
397    bool m_multiline : 1;
398    bool m_containsBackreferences : 1;
399    bool m_containsBeginChars : 1;
400    bool m_containsBOL : 1;
401    unsigned m_numSubpatterns;
402    unsigned m_maxBackReference;
403    PatternDisjunction* m_body;
404    Vector<PatternDisjunction*, 4> m_disjunctions;
405    Vector<CharacterClass*> m_userCharacterClasses;
406    Vector<BeginChar> m_beginChars;
407
408private:
409    const char* compile(const UString& patternString);
410
411    CharacterClass* newlineCached;
412    CharacterClass* digitsCached;
413    CharacterClass* spacesCached;
414    CharacterClass* wordcharCached;
415    CharacterClass* nondigitsCached;
416    CharacterClass* nonspacesCached;
417    CharacterClass* nonwordcharCached;
418};
419
420} } // namespace JSC::Yarr
421
422#endif // YarrPattern_h
423