1/*
2 * Copyright (C) 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrPattern.h"
29
30#include "Yarr.h"
31#include "YarrParser.h"
32#include <wtf/Vector.h>
33
34using namespace WTF;
35
36namespace JSC { namespace Yarr {
37
38#include "RegExpJitTables.h"
39
40class CharacterClassConstructor {
41public:
42    CharacterClassConstructor(bool isCaseInsensitive = false)
43        : m_isCaseInsensitive(isCaseInsensitive)
44    {
45    }
46
47    void reset()
48    {
49        m_matches.clear();
50        m_ranges.clear();
51        m_matchesUnicode.clear();
52        m_rangesUnicode.clear();
53    }
54
55    void append(const CharacterClass* other)
56    {
57        for (size_t i = 0; i < other->m_matches.size(); ++i)
58            addSorted(m_matches, other->m_matches[i]);
59        for (size_t i = 0; i < other->m_ranges.size(); ++i)
60            addSortedRange(m_ranges, other->m_ranges[i].begin, other->m_ranges[i].end);
61        for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
62            addSorted(m_matchesUnicode, other->m_matchesUnicode[i]);
63        for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
64            addSortedRange(m_rangesUnicode, other->m_rangesUnicode[i].begin, other->m_rangesUnicode[i].end);
65    }
66
67    void putChar(UChar ch)
68    {
69        if (ch <= 0x7f) {
70            if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
71                addSorted(m_matches, toASCIIUpper(ch));
72                addSorted(m_matches, toASCIILower(ch));
73            } else
74                addSorted(m_matches, ch);
75        } else {
76            UChar upper, lower;
77            if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
78                addSorted(m_matchesUnicode, upper);
79                addSorted(m_matchesUnicode, lower);
80            } else
81                addSorted(m_matchesUnicode, ch);
82        }
83    }
84
85    // returns true if this character has another case, and 'ch' is the upper case form.
86    static inline bool isUnicodeUpper(UChar ch)
87    {
88        return ch != Unicode::toLower(ch);
89    }
90
91    // returns true if this character has another case, and 'ch' is the lower case form.
92    static inline bool isUnicodeLower(UChar ch)
93    {
94        return ch != Unicode::toUpper(ch);
95    }
96
97    void putRange(UChar lo, UChar hi)
98    {
99        if (lo <= 0x7f) {
100            char asciiLo = lo;
101            char asciiHi = std::min(hi, (UChar)0x7f);
102            addSortedRange(m_ranges, lo, asciiHi);
103
104            if (m_isCaseInsensitive) {
105                if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
106                    addSortedRange(m_ranges, std::max(asciiLo, 'A')+('a'-'A'), std::min(asciiHi, 'Z')+('a'-'A'));
107                if ((asciiLo <= 'z') && (asciiHi >= 'a'))
108                    addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
109            }
110        }
111        if (hi >= 0x80) {
112            uint32_t unicodeCurr = std::max(lo, (UChar)0x80);
113            addSortedRange(m_rangesUnicode, unicodeCurr, hi);
114
115            if (m_isCaseInsensitive) {
116                while (unicodeCurr <= hi) {
117                    // If the upper bound of the range (hi) is 0xffff, the increments to
118                    // unicodeCurr in this loop may take it to 0x10000.  This is fine
119                    // (if so we won't re-enter the loop, since the loop condition above
120                    // will definitely fail) - but this does mean we cannot use a UChar
121                    // to represent unicodeCurr, we must use a 32-bit value instead.
122                    ASSERT(unicodeCurr <= 0xffff);
123
124                    if (isUnicodeUpper(unicodeCurr)) {
125                        UChar lowerCaseRangeBegin = Unicode::toLower(unicodeCurr);
126                        UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
127                        while ((++unicodeCurr <= hi) && isUnicodeUpper(unicodeCurr) && (Unicode::toLower(unicodeCurr) == (lowerCaseRangeEnd + 1)))
128                            lowerCaseRangeEnd++;
129                        addSortedRange(m_rangesUnicode, lowerCaseRangeBegin, lowerCaseRangeEnd);
130                    } else if (isUnicodeLower(unicodeCurr)) {
131                        UChar upperCaseRangeBegin = Unicode::toUpper(unicodeCurr);
132                        UChar upperCaseRangeEnd = upperCaseRangeBegin;
133                        while ((++unicodeCurr <= hi) && isUnicodeLower(unicodeCurr) && (Unicode::toUpper(unicodeCurr) == (upperCaseRangeEnd + 1)))
134                            upperCaseRangeEnd++;
135                        addSortedRange(m_rangesUnicode, upperCaseRangeBegin, upperCaseRangeEnd);
136                    } else
137                        ++unicodeCurr;
138                }
139            }
140        }
141    }
142
143    CharacterClass* charClass()
144    {
145        CharacterClass* characterClass = new CharacterClass(0);
146
147        characterClass->m_matches.append(m_matches);
148        characterClass->m_ranges.append(m_ranges);
149        characterClass->m_matchesUnicode.append(m_matchesUnicode);
150        characterClass->m_rangesUnicode.append(m_rangesUnicode);
151
152        reset();
153
154        return characterClass;
155    }
156
157private:
158    void addSorted(Vector<UChar>& matches, UChar ch)
159    {
160        unsigned pos = 0;
161        unsigned range = matches.size();
162
163        // binary chop, find position to insert char.
164        while (range) {
165            unsigned index = range >> 1;
166
167            int val = matches[pos+index] - ch;
168            if (!val)
169                return;
170            else if (val > 0)
171                range = index;
172            else {
173                pos += (index+1);
174                range -= (index+1);
175            }
176        }
177
178        if (pos == matches.size())
179            matches.append(ch);
180        else
181            matches.insert(pos, ch);
182    }
183
184    void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
185    {
186        unsigned end = ranges.size();
187
188        // Simple linear scan - I doubt there are that many ranges anyway...
189        // feel free to fix this with something faster (eg binary chop).
190        for (unsigned i = 0; i < end; ++i) {
191            // does the new range fall before the current position in the array
192            if (hi < ranges[i].begin) {
193                // optional optimization: concatenate appending ranges? - may not be worthwhile.
194                if (hi == (ranges[i].begin - 1)) {
195                    ranges[i].begin = lo;
196                    return;
197                }
198                ranges.insert(i, CharacterRange(lo, hi));
199                return;
200            }
201            // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
202            // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
203            // end of the last range they concatenate, which is just as good.
204            if (lo <= (ranges[i].end + 1)) {
205                // found an intersect! we'll replace this entry in the array.
206                ranges[i].begin = std::min(ranges[i].begin, lo);
207                ranges[i].end = std::max(ranges[i].end, hi);
208
209                // now check if the new range can subsume any subsequent ranges.
210                unsigned next = i+1;
211                // each iteration of the loop we will either remove something from the list, or break the loop.
212                while (next < ranges.size()) {
213                    if (ranges[next].begin <= (ranges[i].end + 1)) {
214                        // the next entry now overlaps / concatenates this one.
215                        ranges[i].end = std::max(ranges[i].end, ranges[next].end);
216                        ranges.remove(next);
217                    } else
218                        break;
219                }
220
221                return;
222            }
223        }
224
225        // CharacterRange comes after all existing ranges.
226        ranges.append(CharacterRange(lo, hi));
227    }
228
229    bool m_isCaseInsensitive;
230
231    Vector<UChar> m_matches;
232    Vector<CharacterRange> m_ranges;
233    Vector<UChar> m_matchesUnicode;
234    Vector<CharacterRange> m_rangesUnicode;
235};
236
237struct BeginCharHelper {
238    BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
239        : m_beginChars(beginChars)
240        , m_isCaseInsensitive(isCaseInsensitive)
241    {}
242
243    void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
244    {
245        if (quantityType == QuantifierFixedCount && quantityCount > 1) {
246            // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
247            beginChar.value |= beginChar.value << 16;
248            beginChar.mask |= beginChar.mask << 16;
249            addCharacter(beginChar);
250        } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
251            // In case of characters with fixed quantifier we should check the next character as well.
252            linkHotTerms(beginChar, hotTerms);
253        else
254            // In case of greedy matching the next character checking is unnecessary therefore we just store
255            // the first character.
256            addCharacter(beginChar);
257    }
258
259    // Merge two following BeginChars in the vector to reduce the number of character checks.
260    void merge(unsigned size)
261    {
262        for (unsigned i = 0; i < size; i++) {
263            BeginChar* curr = &m_beginChars->at(i);
264            BeginChar* next = &m_beginChars->at(i + 1);
265
266            // If the current and the next size of value is different we should skip the merge process
267            // because the 16bit and 32bit values are unmergable.
268            if (curr->value <= 0xFFFF && next->value > 0xFFFF)
269                continue;
270
271            unsigned diff = curr->value ^ next->value;
272
273            curr->mask |= diff;
274            curr->value |= curr->mask;
275
276            m_beginChars->remove(i + 1);
277            size--;
278        }
279    }
280
281private:
282    void addCharacter(BeginChar beginChar)
283    {
284        unsigned pos = 0;
285        unsigned range = m_beginChars->size();
286
287        // binary chop, find position to insert char.
288        while (range) {
289            unsigned index = range >> 1;
290
291            int val = m_beginChars->at(pos+index).value - beginChar.value;
292            if (!val)
293                return;
294            if (val < 0)
295                range = index;
296            else {
297                pos += (index+1);
298                range -= (index+1);
299            }
300        }
301
302        if (pos == m_beginChars->size())
303            m_beginChars->append(beginChar);
304        else
305            m_beginChars->insert(pos, beginChar);
306    }
307
308    // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
309    void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
310    {
311        for (unsigned i = 0; i < hotTerms->size(); i++) {
312            PatternTerm hotTerm = hotTerms->at(i).term;
313            ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
314
315            UChar characterNext = hotTerm.patternCharacter;
316
317            // Append a character to an existing BeginChar object.
318            if (characterNext <= 0x7f) {
319                unsigned mask = 0;
320
321                if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
322                    mask = 32;
323                    characterNext = toASCIILower(characterNext);
324                }
325
326                addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
327            } else {
328                UChar upper, lower;
329                if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
330                    addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
331                    addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
332                } else
333                    addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
334            }
335        }
336    }
337
338    Vector<BeginChar>* m_beginChars;
339    bool m_isCaseInsensitive;
340};
341
342class YarrPatternConstructor {
343public:
344    YarrPatternConstructor(YarrPattern& pattern)
345        : m_pattern(pattern)
346        , m_characterClassConstructor(pattern.m_ignoreCase)
347        , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
348        , m_invertParentheticalAssertion(false)
349    {
350        m_pattern.m_body = new PatternDisjunction();
351        m_alternative = m_pattern.m_body->addNewAlternative();
352        m_pattern.m_disjunctions.append(m_pattern.m_body);
353    }
354
355    ~YarrPatternConstructor()
356    {
357    }
358
359    void reset()
360    {
361        m_pattern.reset();
362        m_characterClassConstructor.reset();
363
364        m_pattern.m_body = new PatternDisjunction();
365        m_alternative = m_pattern.m_body->addNewAlternative();
366        m_pattern.m_disjunctions.append(m_pattern.m_body);
367    }
368
369    void assertionBOL()
370    {
371        if (!m_alternative->m_terms.size() & !m_invertParentheticalAssertion) {
372            m_alternative->m_startsWithBOL = true;
373            m_alternative->m_containsBOL = true;
374            m_pattern.m_containsBOL = true;
375        }
376        m_alternative->m_terms.append(PatternTerm::BOL());
377    }
378    void assertionEOL()
379    {
380        m_alternative->m_terms.append(PatternTerm::EOL());
381    }
382    void assertionWordBoundary(bool invert)
383    {
384        m_alternative->m_terms.append(PatternTerm::WordBoundary(invert));
385    }
386
387    void atomPatternCharacter(UChar ch)
388    {
389        // We handle case-insensitive checking of unicode characters which do have both
390        // cases by handling them as if they were defined using a CharacterClass.
391        if (m_pattern.m_ignoreCase && !isASCII(ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
392            atomCharacterClassBegin();
393            atomCharacterClassAtom(ch);
394            atomCharacterClassEnd();
395        } else
396            m_alternative->m_terms.append(PatternTerm(ch));
397    }
398
399    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
400    {
401        switch (classID) {
402        case DigitClassID:
403            m_alternative->m_terms.append(PatternTerm(m_pattern.digitsCharacterClass(), invert));
404            break;
405        case SpaceClassID:
406            m_alternative->m_terms.append(PatternTerm(m_pattern.spacesCharacterClass(), invert));
407            break;
408        case WordClassID:
409            m_alternative->m_terms.append(PatternTerm(m_pattern.wordcharCharacterClass(), invert));
410            break;
411        case NewlineClassID:
412            m_alternative->m_terms.append(PatternTerm(m_pattern.newlineCharacterClass(), invert));
413            break;
414        }
415    }
416
417    void atomCharacterClassBegin(bool invert = false)
418    {
419        m_invertCharacterClass = invert;
420    }
421
422    void atomCharacterClassAtom(UChar ch)
423    {
424        m_characterClassConstructor.putChar(ch);
425    }
426
427    void atomCharacterClassRange(UChar begin, UChar end)
428    {
429        m_characterClassConstructor.putRange(begin, end);
430    }
431
432    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
433    {
434        ASSERT(classID != NewlineClassID);
435
436        switch (classID) {
437        case DigitClassID:
438            m_characterClassConstructor.append(invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
439            break;
440
441        case SpaceClassID:
442            m_characterClassConstructor.append(invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
443            break;
444
445        case WordClassID:
446            m_characterClassConstructor.append(invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
447            break;
448
449        default:
450            ASSERT_NOT_REACHED();
451        }
452    }
453
454    void atomCharacterClassEnd()
455    {
456        CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
457        m_pattern.m_userCharacterClasses.append(newCharacterClass);
458        m_alternative->m_terms.append(PatternTerm(newCharacterClass, m_invertCharacterClass));
459    }
460
461    void atomParenthesesSubpatternBegin(bool capture = true)
462    {
463        unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
464        if (capture)
465            m_pattern.m_numSubpatterns++;
466
467        PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
468        m_pattern.m_disjunctions.append(parenthesesDisjunction);
469        m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture, false));
470        m_alternative = parenthesesDisjunction->addNewAlternative();
471    }
472
473    void atomParentheticalAssertionBegin(bool invert = false)
474    {
475        PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative);
476        m_pattern.m_disjunctions.append(parenthesesDisjunction);
477        m_alternative->m_terms.append(PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, false, invert));
478        m_alternative = parenthesesDisjunction->addNewAlternative();
479        m_invertParentheticalAssertion = invert;
480    }
481
482    void atomParenthesesEnd()
483    {
484        ASSERT(m_alternative->m_parent);
485        ASSERT(m_alternative->m_parent->m_parent);
486
487        PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
488        m_alternative = m_alternative->m_parent->m_parent;
489
490        PatternTerm& lastTerm = m_alternative->lastTerm();
491
492        unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
493        unsigned numBOLAnchoredAlts = 0;
494        bool containsEmptyAlternative = false;
495
496        for (unsigned i = 0; i < numParenAlternatives; i++) {
497            if (!parenthesesDisjunction->m_alternatives[i]->m_terms.size() && numParenAlternatives > 1) {
498                PatternAlternative* altToRemove = parenthesesDisjunction->m_alternatives[i];
499                parenthesesDisjunction->m_alternatives.remove(i);
500                delete altToRemove;
501                --numParenAlternatives;
502
503                containsEmptyAlternative = true;
504                continue;
505            }
506
507            // Bubble up BOL flags
508            if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
509                numBOLAnchoredAlts++;
510        }
511
512        if (numBOLAnchoredAlts) {
513            m_alternative->m_containsBOL = true;
514            // If all the alternatives in parens start with BOL, then so does this one
515            if (numBOLAnchoredAlts == numParenAlternatives)
516                m_alternative->m_startsWithBOL = true;
517        }
518
519        lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
520        m_invertParentheticalAssertion = false;
521
522        if (containsEmptyAlternative) {
523            // Backup and remove the current disjunction's alternatives.
524            Vector<PatternAlternative*> alternatives;
525            alternatives.append(parenthesesDisjunction->m_alternatives);
526            parenthesesDisjunction->m_alternatives.clear();
527            PatternAlternative* alternative = parenthesesDisjunction->addNewAlternative();
528
529            // Insert a new non-capturing parentheses.
530            unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
531            PatternDisjunction* newDisjunction = new PatternDisjunction(alternative);
532            m_pattern.m_disjunctions.append(newDisjunction);
533            alternative->m_terms.append(PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, newDisjunction, false, false));
534            newDisjunction->m_alternatives.append(alternatives);
535
536            // Set the quantifier of the new parentheses to '?' and set the inherited properties.
537            PatternTerm& disjunctionTerm = alternative->lastTerm();
538            disjunctionTerm.quantify(1, QuantifierGreedy);
539            disjunctionTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
540            alternative->m_containsBOL = m_alternative->m_containsBOL;
541            alternative->m_startsWithBOL = m_alternative->m_startsWithBOL;
542        }
543    }
544
545    void atomBackReference(unsigned subpatternId)
546    {
547        ASSERT(subpatternId);
548        m_pattern.m_containsBackreferences = true;
549        m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
550
551        if (subpatternId > m_pattern.m_numSubpatterns) {
552            m_alternative->m_terms.append(PatternTerm::ForwardReference());
553            return;
554        }
555
556        PatternAlternative* currentAlternative = m_alternative;
557        ASSERT(currentAlternative);
558
559        // Note to self: if we waited until the AST was baked, we could also remove forwards refs
560        while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
561            PatternTerm& term = currentAlternative->lastTerm();
562            ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
563
564            if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
565                m_alternative->m_terms.append(PatternTerm::ForwardReference());
566                return;
567            }
568        }
569
570        m_alternative->m_terms.append(PatternTerm(subpatternId));
571    }
572
573    // deep copy the argument disjunction.  If filterStartsWithBOL is true,
574    // skip alternatives with m_startsWithBOL set true.
575    PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
576    {
577        PatternDisjunction* newDisjunction = 0;
578        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
579            PatternAlternative* alternative = disjunction->m_alternatives[alt];
580            if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
581                if (!newDisjunction) {
582                    newDisjunction = new PatternDisjunction();
583                    newDisjunction->m_parent = disjunction->m_parent;
584                }
585                PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
586                for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
587                    newAlternative->m_terms.append(copyTerm(alternative->m_terms[i], filterStartsWithBOL));
588            }
589        }
590
591        if (newDisjunction)
592            m_pattern.m_disjunctions.append(newDisjunction);
593        return newDisjunction;
594    }
595
596    PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
597    {
598        if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
599            return PatternTerm(term);
600
601        PatternTerm termCopy = term;
602        termCopy.parentheses.disjunction = copyDisjunction(termCopy.parentheses.disjunction, filterStartsWithBOL);
603        return termCopy;
604    }
605
606    void quantifyAtom(unsigned min, unsigned max, bool greedy)
607    {
608        ASSERT(min <= max);
609        ASSERT(m_alternative->m_terms.size());
610
611        if (!max) {
612            m_alternative->removeLastTerm();
613            return;
614        }
615
616        PatternTerm& term = m_alternative->lastTerm();
617        ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
618        ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount));
619
620        // For any assertion with a zero minimum, not matching is valid and has no effect,
621        // remove it.  Otherwise, we need to match as least once, but there is no point
622        // matching more than once, so remove the quantifier.  It is not entirely clear
623        // from the spec whether or not this behavior is correct, but I believe this
624        // matches Firefox. :-/
625        if (term.type == PatternTerm::TypeParentheticalAssertion) {
626            if (!min)
627                m_alternative->removeLastTerm();
628            return;
629        }
630
631        if (min == 0)
632            term.quantify(max, greedy   ? QuantifierGreedy : QuantifierNonGreedy);
633        else if (min == max)
634            term.quantify(min, QuantifierFixedCount);
635        else {
636            term.quantify(min, QuantifierFixedCount);
637            m_alternative->m_terms.append(copyTerm(term));
638            // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
639            m_alternative->lastTerm().quantify((max == quantifyInfinite) ? max : max - min, greedy ? QuantifierGreedy : QuantifierNonGreedy);
640            if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
641                m_alternative->lastTerm().parentheses.isCopy = true;
642        }
643    }
644
645    void disjunction()
646    {
647        m_alternative = m_alternative->m_parent->addNewAlternative();
648    }
649
650    unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
651    {
652        alternative->m_hasFixedSize = true;
653        unsigned currentInputPosition = initialInputPosition;
654
655        for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
656            PatternTerm& term = alternative->m_terms[i];
657
658            switch (term.type) {
659            case PatternTerm::TypeAssertionBOL:
660            case PatternTerm::TypeAssertionEOL:
661            case PatternTerm::TypeAssertionWordBoundary:
662                term.inputPosition = currentInputPosition;
663                break;
664
665            case PatternTerm::TypeBackReference:
666                term.inputPosition = currentInputPosition;
667                term.frameLocation = currentCallFrameSize;
668                currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
669                alternative->m_hasFixedSize = false;
670                break;
671
672            case PatternTerm::TypeForwardReference:
673                break;
674
675            case PatternTerm::TypePatternCharacter:
676                term.inputPosition = currentInputPosition;
677                if (term.quantityType != QuantifierFixedCount) {
678                    term.frameLocation = currentCallFrameSize;
679                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
680                    alternative->m_hasFixedSize = false;
681                } else
682                    currentInputPosition += term.quantityCount;
683                break;
684
685            case PatternTerm::TypeCharacterClass:
686                term.inputPosition = currentInputPosition;
687                if (term.quantityType != QuantifierFixedCount) {
688                    term.frameLocation = currentCallFrameSize;
689                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
690                    alternative->m_hasFixedSize = false;
691                } else
692                    currentInputPosition += term.quantityCount;
693                break;
694
695            case PatternTerm::TypeParenthesesSubpattern:
696                // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
697                term.frameLocation = currentCallFrameSize;
698                if (term.quantityCount == 1 && !term.parentheses.isCopy) {
699                    if (term.quantityType != QuantifierFixedCount)
700                        currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
701                    currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
702                    // If quantity is fixed, then pre-check its minimum size.
703                    if (term.quantityType == QuantifierFixedCount)
704                        currentInputPosition += term.parentheses.disjunction->m_minimumSize;
705                    term.inputPosition = currentInputPosition;
706                } else if (term.parentheses.isTerminal) {
707                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
708                    currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize, currentInputPosition);
709                    term.inputPosition = currentInputPosition;
710                } else {
711                    term.inputPosition = currentInputPosition;
712                    setupDisjunctionOffsets(term.parentheses.disjunction, 0, currentInputPosition);
713                    currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
714                }
715                // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
716                alternative->m_hasFixedSize = false;
717                break;
718
719            case PatternTerm::TypeParentheticalAssertion:
720                term.inputPosition = currentInputPosition;
721                term.frameLocation = currentCallFrameSize;
722                currentCallFrameSize = setupDisjunctionOffsets(term.parentheses.disjunction, currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, currentInputPosition);
723                break;
724            }
725        }
726
727        alternative->m_minimumSize = currentInputPosition - initialInputPosition;
728        return currentCallFrameSize;
729    }
730
731    unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
732    {
733        if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
734            initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
735
736        unsigned minimumInputSize = UINT_MAX;
737        unsigned maximumCallFrameSize = 0;
738        bool hasFixedSize = true;
739
740        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
741            PatternAlternative* alternative = disjunction->m_alternatives[alt];
742            unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, initialCallFrameSize, initialInputPosition);
743            minimumInputSize = min(minimumInputSize, alternative->m_minimumSize);
744            maximumCallFrameSize = max(maximumCallFrameSize, currentAlternativeCallFrameSize);
745            hasFixedSize &= alternative->m_hasFixedSize;
746        }
747
748        ASSERT(minimumInputSize != UINT_MAX);
749        ASSERT(maximumCallFrameSize >= initialCallFrameSize);
750
751        disjunction->m_hasFixedSize = hasFixedSize;
752        disjunction->m_minimumSize = minimumInputSize;
753        disjunction->m_callFrameSize = maximumCallFrameSize;
754        return maximumCallFrameSize;
755    }
756
757    void setupOffsets()
758    {
759        setupDisjunctionOffsets(m_pattern.m_body, 0, 0);
760    }
761
762    // This optimization identifies sets of parentheses that we will never need to backtrack.
763    // In these cases we do not need to store state from prior iterations.
764    // We can presently avoid backtracking for:
765    //   * where the parens are at the end of the regular expression (last term in any of the
766    //     alternatives of the main body disjunction).
767    //   * where the parens are non-capturing, and quantified unbounded greedy (*).
768    //   * where the parens do not contain any capturing subpatterns.
769    void checkForTerminalParentheses()
770    {
771        // This check is much too crude; should be just checking whether the candidate
772        // node contains nested capturing subpatterns, not the whole expression!
773        if (m_pattern.m_numSubpatterns)
774            return;
775
776        Vector<PatternAlternative*>& alternatives = m_pattern.m_body->m_alternatives;
777        for (size_t i = 0; i < alternatives.size(); ++i) {
778            Vector<PatternTerm>& terms = alternatives[i]->m_terms;
779            if (terms.size()) {
780                PatternTerm& term = terms.last();
781                if (term.type == PatternTerm::TypeParenthesesSubpattern
782                    && term.quantityType == QuantifierGreedy
783                    && term.quantityCount == quantifyInfinite
784                    && !term.capture())
785                    term.parentheses.isTerminal = true;
786            }
787        }
788    }
789
790    void optimizeBOL()
791    {
792        // Look for expressions containing beginning of line (^) anchoring and unroll them.
793        // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
794        // This code relies on the parsing code tagging alternatives with m_containsBOL and
795        // m_startsWithBOL and rolling those up to containing alternatives.
796        // At this point, this is only valid for non-multiline expressions.
797        PatternDisjunction* disjunction = m_pattern.m_body;
798
799        if (!m_pattern.m_containsBOL || m_pattern.m_multiline)
800            return;
801
802        PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, true);
803
804        // Set alternatives in disjunction to "onceThrough"
805        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
806            disjunction->m_alternatives[alt]->setOnceThrough();
807
808        if (loopDisjunction) {
809            // Move alternatives from loopDisjunction to disjunction
810            for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
811                disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt]);
812
813            loopDisjunction->m_alternatives.clear();
814        }
815    }
816
817    // This function collects the terms which are potentially matching the first number of depth characters in the result.
818    // If this function returns false then it found at least one term which makes the beginning character
819    // look-up optimization inefficient.
820    bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
821    {
822        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
823            PatternAlternative* alternative = disjunction->m_alternatives[alt];
824
825            if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
826                return false;
827        }
828
829        return true;
830    }
831
832    bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
833    {
834        bool checkNext = true;
835        unsigned numTerms = alternative->m_terms.size();
836
837        while (checkNext && termIndex < numTerms) {
838            PatternTerm term = alternative->m_terms[termIndex];
839            checkNext = false;
840
841            switch (term.type) {
842            case PatternTerm::TypeAssertionBOL:
843            case PatternTerm::TypeAssertionEOL:
844            case PatternTerm::TypeAssertionWordBoundary:
845                return false;
846
847            case PatternTerm::TypeBackReference:
848            case PatternTerm::TypeForwardReference:
849                return false;
850
851            case PatternTerm::TypePatternCharacter:
852                if (termIndex != numTerms - 1) {
853                    beginTerms->append(TermChain(term));
854                    termIndex++;
855                    checkNext = true;
856                } else if (term.quantityType == QuantifierFixedCount) {
857                    beginTerms->append(TermChain(term));
858                    if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
859                        if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1))
860                            return false;
861                }
862
863                break;
864
865            case PatternTerm::TypeCharacterClass:
866                return false;
867
868            case PatternTerm::TypeParentheticalAssertion:
869                if (term.invert())
870                    return false;
871
872            case PatternTerm::TypeParenthesesSubpattern:
873                if (term.quantityType != QuantifierFixedCount) {
874                    if (termIndex == numTerms - 1)
875                        break;
876
877                    termIndex++;
878                    checkNext = true;
879                }
880
881                if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
882                    return false;
883
884                break;
885            }
886        }
887
888        return true;
889    }
890
891    void setupBeginChars()
892    {
893        Vector<TermChain> beginTerms;
894        bool containsFixedCharacter = false;
895
896        if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
897                && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
898            unsigned size = beginTerms.size();
899
900            // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
901            if (!size)
902                return;
903
904            m_pattern.m_containsBeginChars = true;
905
906            for (unsigned i = 0; i < size; i++) {
907                PatternTerm term = beginTerms[i].term;
908
909                // We have just collected PatternCharacter terms, other terms are not allowed.
910                ASSERT(term.type == PatternTerm::TypePatternCharacter);
911
912                if (term.quantityType == QuantifierFixedCount)
913                    containsFixedCharacter = true;
914
915                UChar character = term.patternCharacter;
916                unsigned mask = 0;
917
918                if (character <= 0x7f) {
919                    if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
920                        mask = 32;
921                        character = toASCIILower(character);
922                    }
923
924                    m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
925                } else {
926                    UChar upper, lower;
927                    if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
928                        m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
929                        m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
930                    } else
931                        m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
932                }
933            }
934
935            // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
936            if (!containsFixedCharacter) {
937                m_pattern.m_containsBeginChars = false;
938                return;
939            }
940
941            size = m_pattern.m_beginChars.size();
942
943            if (size > 2)
944                m_beginCharHelper.merge(size - 1);
945            else if (size <= 1)
946                m_pattern.m_containsBeginChars = false;
947        }
948    }
949
950private:
951    YarrPattern& m_pattern;
952    PatternAlternative* m_alternative;
953    CharacterClassConstructor m_characterClassConstructor;
954    BeginCharHelper m_beginCharHelper;
955    bool m_invertCharacterClass;
956    bool m_invertParentheticalAssertion;
957};
958
959const char* YarrPattern::compile(const UString& patternString)
960{
961    YarrPatternConstructor constructor(*this);
962
963    if (const char* error = parse(constructor, patternString))
964        return error;
965
966    // If the pattern contains illegal backreferences reset & reparse.
967    // Quoting Netscape's "What's new in JavaScript 1.2",
968    //      "Note: if the number of left parentheses is less than the number specified
969    //       in \#, the \# is taken as an octal escape as described in the next row."
970    if (containsIllegalBackReference()) {
971        unsigned numSubpatterns = m_numSubpatterns;
972
973        constructor.reset();
974#if !ASSERT_DISABLED
975        const char* error =
976#endif
977            parse(constructor, patternString, numSubpatterns);
978
979        ASSERT(!error);
980        ASSERT(numSubpatterns == m_numSubpatterns);
981    }
982
983    constructor.checkForTerminalParentheses();
984    constructor.optimizeBOL();
985
986    constructor.setupOffsets();
987    constructor.setupBeginChars();
988
989    return 0;
990}
991
992YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, const char** error)
993    : m_ignoreCase(ignoreCase)
994    , m_multiline(multiline)
995    , m_containsBackreferences(false)
996    , m_containsBeginChars(false)
997    , m_containsBOL(false)
998    , m_numSubpatterns(0)
999    , m_maxBackReference(0)
1000    , newlineCached(0)
1001    , digitsCached(0)
1002    , spacesCached(0)
1003    , wordcharCached(0)
1004    , nondigitsCached(0)
1005    , nonspacesCached(0)
1006    , nonwordcharCached(0)
1007{
1008    *error = compile(pattern);
1009}
1010
1011} }
1012