15f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian/*
25f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Copyright (C) 2009 Apple Inc. All rights reserved.
35f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
45f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Redistribution and use in source and binary forms, with or without
55f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * modification, are permitted provided that the following conditions
65f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * are met:
75f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 1. Redistributions of source code must retain the above copyright
85f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    notice, this list of conditions and the following disclaimer.
95f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 2. Redistributions in binary form must reproduce the above copyright
105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    notice, this list of conditions and the following disclaimer in the
115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    documentation and/or other materials provided with the distribution.
125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */
255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2665f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#ifndef YarrParser_h
2765f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#define YarrParser_h
285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
292fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include <runtime/UString.h>
3065f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#include "Yarr.h"
315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian#include <wtf/ASCIICType.h>
325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian#include <wtf/unicode/Unicode.h>
335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qiannamespace JSC { namespace Yarr {
355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3665f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#define REGEXP_ERROR_PREFIX "Invalid regular expression: "
37f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch
// Identifies one of the predefined character classes that the parser can
// report to a delegate through atomBuiltInCharacterClass(). Whether the class
// is inverted is passed separately, so each ID covers both the lower-case and
// the upper-case form of its escape.
//
// The enumerator list deliberately has no trailing comma: that is only valid
// from C++11 onwards, and it keeps this enum consistent with ErrorCode below.
enum BuiltInCharacterClassID {
    DigitClassID,  // Produced by parseEscape() for \d and (inverted) \D.
    SpaceClassID,  // Produced by parseEscape() for \s and (inverted) \S.
    WordClassID,   // Produced by parseEscape() for \w and (inverted) \W.
    NewlineClassID // Not produced by the CharacterClassEscape cases of
                   // parseEscape(); presumably used elsewhere - TODO confirm.
};
445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian// The Parser class should not be used directly - only via the Yarr::parse() method.
465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qiantemplate<class Delegate>
475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qianclass Parser {
485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qianprivate:
495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    template<class FriendDelegate>
505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit);
515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
/*
 * ErrorCode:
 *
 * Result of a parse step, stored in the parser's m_err member.  NoError is
 * pinned to zero because the parser checks for failure with a plain truth
 * test (see ASSERT(!m_err) in parseEscape()).
 */
enum ErrorCode {
    NoError = 0,
    PatternTooLarge,
    QuantifierOutOfOrder,
    QuantifierWithoutAtom,
    MissingParentheses,
    ParenthesesUnmatched,
    ParenthesesTypeInvalid,
    CharacterClassUnmatched,
    // Set when a character class range ends on a character that sorts before
    // the one it started on.
    CharacterClassOutOfOrder,
    // Set when the pattern ends immediately after a backslash.
    EscapeUnterminated,
    // Equals the number of codes listed above, so it has to stay last.
    NumberOfErrorCodes
};
655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * CharacterClassParserDelegate:
685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * The class CharacterClassParserDelegate is used in the parsing of character
705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * classes.  This class handles detection of character ranges.  This class
715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * implements enough of the delegate interface such that it can be passed to
725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * to perform the parsing of escape characters in character sets.
745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    class CharacterClassParserDelegate {
765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    public:
775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            : m_delegate(delegate)
795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            , m_err(err)
80f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            , m_state(Empty)
812daae5fd11344eaa88a0d92b0f6d65f8d2255c00Ben Murdoch            , m_character(0)
825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        {
835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        /*
865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         * begin():
875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         *
885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         * Called at beginning of construction.
895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         */
905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        void begin(bool invert)
915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        {
925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_delegate.atomCharacterClassBegin(invert);
935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        /*
96f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch         * atomPatternCharacter():
975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         *
98f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch         * This method is called either from parseCharacterClass() (for an unescaped
99f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch         * character in a character class), or from parseEscape(). In the former case
100f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch         * the value true will be passed for the argument 'hyphenIsRange', and in this
101f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch         * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
102f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch         * is different to /[a\-z]/).
1035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         */
104f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch        void atomPatternCharacter(UChar ch, bool hyphenIsRange = false)
1055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        {
1065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            switch (m_state) {
107f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case AfterCharacterClass:
108f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // Following a builtin character class we need look out for a hyphen.
109f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
110f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // If we see a hyphen following a charater class then unlike usual
111f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // we'll report it to the delegate immediately, and put ourself into
112f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // a poisoned state. Any following calls to add another character or
113f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // character class will result in an error. (A hypen following a
114f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // character-class is itself valid, but only  at the end of a regex).
115f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                if (hyphenIsRange && ch == '-') {
116f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                    m_delegate.atomCharacterClassAtom('-');
117f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                    m_state = AfterCharacterClassHyphen;
118f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                    return;
119f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                }
120f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // Otherwise just fall through - cached character so treat this as Empty.
121f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch
122f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case Empty:
1235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_character = ch;
124f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_state = CachedCharacter;
125f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                return;
1265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
127f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case CachedCharacter:
128f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                if (hyphenIsRange && ch == '-')
129f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                    m_state = CachedCharacterHyphen;
1305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                else {
1315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    m_delegate.atomCharacterClassAtom(m_character);
1325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    m_character = ch;
1335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                }
134f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                return;
1355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
136f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case CachedCharacterHyphen:
137f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                if (ch < m_character) {
1385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    m_err = CharacterClassOutOfOrder;
139f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                    return;
140f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                }
141f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassRange(m_character, ch);
142f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_state = Empty;
143f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                return;
144d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block
145f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // See coment in atomBuiltInCharacterClass below.
146f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // This too is technically an error, per ECMA-262, and again we
147f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // we chose to allow this.  Note a subtlely here that while we
148f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // diverge from the spec's definition of CharacterRange we do
149f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // remain in compliance with the grammar.  For example, consider
150f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // the expression /[\d-a-z]/.  We comply with the grammar in
151f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // this case by not allowing a-z to be matched as a range.
152f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case AfterCharacterClassHyphen:
153f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom(ch);
154f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_state = Empty;
155f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                return;
156f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            }
157d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block        }
158d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block
159d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block        /*
1605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         * atomBuiltInCharacterClass():
1615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         *
1625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         * Adds a built-in character class, called by parseEscape().
1635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         */
1645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
1655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        {
166f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            switch (m_state) {
167f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case CachedCharacter:
168f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // Flush the currently cached character, then fall through.
169f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom(m_character);
170f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch
171f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case Empty:
172f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case AfterCharacterClass:
173f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_state = AfterCharacterClass;
174f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassBuiltIn(classID, invert);
175f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                return;
176f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch
177f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // If we hit either of these cases, we have an invalid range that
178f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // looks something like /[x-\d]/ or /[\d-\d]/.
179f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // According to ECMA-262 this should be a syntax error, but
180f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // empirical testing shows this to break teh webz.  Instead we
181f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // comply with to the ECMA-262 grammar, and assume the grammar to
182f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // have matched the range correctly, but tweak our interpretation
183f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // of CharacterRange.  Effectively we implicitly handle the hyphen
184f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
185f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case CachedCharacterHyphen:
186f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom(m_character);
187f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom('-');
188f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                // fall through
189f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            case AfterCharacterClassHyphen:
190f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassBuiltIn(classID, invert);
191f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_state = Empty;
192f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                return;
193f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            }
1945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
1955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
1965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        /*
1975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         * end():
1985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         *
1995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         * Called at end of construction.
2005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian         */
2015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        void end()
2025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        {
203f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            if (m_state == CachedCharacter)
204f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom(m_character);
205f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            else if (m_state == CachedCharacterHyphen) {
206f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom(m_character);
207f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                m_delegate.atomCharacterClassAtom('-');
208f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            }
2095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_delegate.atomCharacterClassEnd();
2105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
2115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // parseEscape() should never call these delegate methods when
2135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // invoked with inCharacterClass set.
2145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); }
2155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); }
2165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    private:
2185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        Delegate& m_delegate;
2195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ErrorCode& m_err;
2205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        enum CharacterClassConstructionState {
221f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            Empty,
222f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            CachedCharacter,
223f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            CachedCharacterHyphen,
224f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            AfterCharacterClass,
225f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch            AfterCharacterClassHyphen,
2265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        } m_state;
2275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        UChar m_character;
2285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    };
2295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
/*
 * Parser():
 *
 * Prepares a parse of 'pattern' that reports tokens to 'delegate'.  The
 * constructor is private; the class is meant to be driven through the
 * Yarr::parse() friend function rather than instantiated directly.
 *
 * 'backReferenceLimit' is the largest number that parseEscape() will accept
 * as a back reference when it meets a decimal escape outside a character
 * class; larger numbers are re-parsed as something else.
 *
 * Parsing starts at index 0 with no error recorded and no open parentheses.
 *
 * NOTE(review): m_data is initialised from pattern.characters() - presumably
 * a non-owning pointer into the string's buffer, in which case 'pattern' must
 * outlive the parser.  Confirm against the declaration of m_data.
 */
Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit)
    : m_delegate(delegate)
    , m_backReferenceLimit(backReferenceLimit)
    , m_err(NoError)
    , m_data(pattern.characters())
    , m_size(pattern.length())
    , m_index(0)
    , m_parenthesesNestingDepth(0)
{
}
2405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
2425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parseEscape():
2435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
2445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * Helper for parseTokens() AND parseCharacterClass().
2455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * Unlike the other parser methods, this function does not report tokens
2465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * directly to the member delegate (m_delegate), instead tokens are
2475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * emitted to the delegate provided as an argument.  In the case of atom
2485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * escapes, parseTokens() will call parseEscape() passing m_delegate as
2495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * an argument, and as such the escape will be reported to the delegate.
2505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
2515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * However this method may also be used by parseCharacterClass(), in which
2525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * case a CharacterClassParserDelegate will be passed as the delegate that
2535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * tokens should be added to.  A boolean flag is also provided to indicate
     * whether an escape in a CharacterClass is being parsed (some parsing
2555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * rules change in this context).
2565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
2575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * The boolean value returned by this method indicates whether the token
     * parsed was an atom (outside of a character class \b and \B will be
2595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * interpreted as assertions).
2605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
2615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    template<bool inCharacterClass, class EscapeDelegate>
2625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    bool parseEscape(EscapeDelegate& delegate)
2635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
2645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(!m_err);
2655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(peek() == '\\');
2665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        consume();
2675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (atEndOfPattern()) {
2695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_err = EscapeUnterminated;
2705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            return false;
2715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
2725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        switch (peek()) {
2745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // Assertions
2755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'b':
2765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
2775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (inCharacterClass)
2785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter('\b');
2795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            else {
2805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.assertionWordBoundary(false);
2815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return false;
2825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
2835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
2845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'B':
2855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
2865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (inCharacterClass)
2875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter('B');
2885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            else {
2895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.assertionWordBoundary(true);
2905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return false;
2915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
2925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
2935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
2945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // CharacterClassEscape
2955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'd':
2965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
2975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomBuiltInCharacterClass(DigitClassID, false);
2985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
2995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 's':
3005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomBuiltInCharacterClass(SpaceClassID, false);
3025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'w':
3045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomBuiltInCharacterClass(WordClassID, false);
3065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'D':
3085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomBuiltInCharacterClass(DigitClassID, true);
3105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'S':
3125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomBuiltInCharacterClass(SpaceClassID, true);
3145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'W':
3165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomBuiltInCharacterClass(WordClassID, true);
3185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // DecimalEscape
3215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '1':
3225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '2':
3235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '3':
3245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '4':
3255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '5':
3265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '6':
3275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '7':
3285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '8':
3295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '9': {
3305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
3315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            // First, try to parse this as backreference.
3325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (!inCharacterClass) {
3335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                ParseState state = saveState();
3345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                unsigned backReference = consumeNumber();
3365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                if (backReference <= m_backReferenceLimit) {
3375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    delegate.atomBackReference(backReference);
3385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    break;
3395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                }
3405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                restoreState(state);
3425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
3435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            // Not a backreference, and not octal.
3455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (peek() >= '8') {
3465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter('\\');
3475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
3485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
3495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            // Fall-through to handle this as an octal escape.
3515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
3525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // Octal escape
3545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case '0':
3555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter(consumeOctal());
3565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // ControlEscape
3595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'f':
3605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter('\f');
3625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'n':
3645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter('\n');
3665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'r':
3685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter('\r');
3705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 't':
3725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter('\t');
3745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'v':
3765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter('\v');
3785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // ControlLetter
3815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'c': {
3825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            ParseState state = saveState();
3835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
3845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (!atEndOfPattern()) {
3855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                int control = consume();
3865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
3885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
3895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    delegate.atomPatternCharacter(control & 0x1f);
3905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    break;
3915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                }
3925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
3935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            restoreState(state);
3945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter('\\');
3955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
3965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
3975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
3985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // HexEscape
3995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'x': {
4005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
4015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            int x = tryConsumeHex(2);
4025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (x == -1)
4035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter('x');
4045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            else
4055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter(x);
4065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
4075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
4085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // UnicodeEscape
4105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        case 'u': {
4115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
4125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            int u = tryConsumeHex(4);
4135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (u == -1)
4145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter('u');
4155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            else
4165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                delegate.atomPatternCharacter(u);
4175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            break;
4185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
4195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // IdentityEscape
4215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        default:
4225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            delegate.atomPatternCharacter(consume());
4235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
4245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return true;
4265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
4275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
4295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parseAtomEscape(), parseCharacterClassEscape():
4305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
4315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * These methods alias to parseEscape().
4325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
4335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    bool parseAtomEscape()
4345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
4355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return parseEscape<false>(m_delegate);
4365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
4375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
4385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
4395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        parseEscape<true>(delegate);
4405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
4415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
/*
 * parseCharacterClass():
 *
 * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
 * to an instance of CharacterClassParserDelegate, to describe the character class to the
 * delegate.
 */
4495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void parseCharacterClass()
4505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
4515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(!m_err);
4525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(peek() == '[');
4535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        consume();
4545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err);
4565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        characterClassConstructor.begin(tryConsume('^'));
4585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        while (!atEndOfPattern()) {
4605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            switch (peek()) {
4615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case ']':
4625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
4635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                characterClassConstructor.end();
4645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return;
4655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '\\':
4675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                parseCharacterClassEscape(characterClassConstructor);
4685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
4695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            default:
471f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                characterClassConstructor.atomPatternCharacter(consume(), true);
4725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
4735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (m_err)
4755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return;
4765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
4775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        m_err = CharacterClassUnmatched;
4795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
4805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
4825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parseParenthesesBegin():
4835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
4845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
4855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
4865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void parseParenthesesBegin()
4875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
4885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(!m_err);
4895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(peek() == '(');
4905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        consume();
4915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (tryConsume('?')) {
4935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (atEndOfPattern()) {
4945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_err = ParenthesesTypeInvalid;
4955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return;
4965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
4975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
4985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            switch (consume()) {
4995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case ':':
5005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.atomParenthesesSubpatternBegin(false);
5015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '=':
5045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.atomParentheticalAssertionBegin();
5055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '!':
5085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.atomParentheticalAssertionBegin(true);
5095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            default:
5125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_err = ParenthesesTypeInvalid;
5135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
5145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        } else
5155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_delegate.atomParenthesesSubpatternBegin();
5165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ++m_parenthesesNestingDepth;
5185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
5195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
5215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parseParenthesesEnd():
5225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
5235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
5245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
5255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void parseParenthesesEnd()
5265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
5275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(!m_err);
5285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(peek() == ')');
5295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        consume();
5305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (m_parenthesesNestingDepth > 0)
5325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_delegate.atomParenthesesEnd();
5335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        else
5345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_err = ParenthesesUnmatched;
5355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        --m_parenthesesNestingDepth;
5375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
5385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
5405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parseQuantifier():
5415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
5425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
5435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
5445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
5455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
5465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(!m_err);
5475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(min <= max);
5485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (lastTokenWasAnAtom)
5505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_delegate.quantifyAtom(min, max, !tryConsume('?'));
5515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        else
5525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_err = QuantifierWithoutAtom;
5535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
5545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
/*
 * parseTokens():
 *
 * This method loops over the input pattern reporting tokens to the delegate.
 * The method returns when a parse error is detected, or the end of the pattern
 * is reached.  One piece of state is tracked around the loop, which is whether
 * the last token passed to the delegate was an atom (this is necessary to detect
 * a parse error when a quantifier is provided without an atom to quantify).
 */
5645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void parseTokens()
5655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
5665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        bool lastTokenWasAnAtom = false;
5675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        while (!atEndOfPattern()) {
5695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            switch (peek()) {
5705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '|':
5715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
5725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.disjunction();
5735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
5745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '(':
5775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                parseParenthesesBegin();
5785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
5795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case ')':
5825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                parseParenthesesEnd();
5835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = true;
5845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '^':
5875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
5885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.assertionBOL();
5895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
5905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '$':
5935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
5945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.assertionEOL();
5955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
5965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
5975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
5985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '.':
5995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
6005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
6015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = true;
6025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
6035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '[':
6055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                parseCharacterClass();
6065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = true;
6075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
6085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '\\':
6105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = parseAtomEscape();
6115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
6125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '*':
6145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
615f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
6165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
6175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
6185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '+':
6205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
621f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
6225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
6235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
6245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '?':
6265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
6275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                parseQuantifier(lastTokenWasAnAtom, 0, 1);
6285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = false;
6295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                break;
6305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            case '{': {
6325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                ParseState state = saveState();
6335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                consume();
6355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                if (peekIsDigit()) {
6365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    unsigned min = consumeNumber();
6375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    unsigned max = min;
6385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    if (tryConsume(','))
640f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
6415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    if (tryConsume('}')) {
6435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                        if (min <= max)
6445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                            parseQuantifier(lastTokenWasAnAtom, min, max);
6455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                        else
6465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                            m_err = QuantifierOutOfOrder;
6475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                        lastTokenWasAnAtom = false;
6485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                        break;
6495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                    }
6505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                }
6515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                restoreState(state);
6535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            } // if we did not find a complete quantifer, fall through to the default case.
6545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            default:
6565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                m_delegate.atomPatternCharacter(consume());
6575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                lastTokenWasAnAtom = true;
6585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
6595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (m_err)
6615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return;
6625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
6635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (m_parenthesesNestingDepth > 0)
6655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_err = MissingParentheses;
6665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
6675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    /*
6695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * parse():
6705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     *
67165f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch     * This method calls parseTokens() to parse over the input and converts any
6725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     * error code to a const char* for a result.
6735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian     */
6745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    const char* parse()
6755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
6765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (m_size > MAX_PATTERN_SIZE)
6775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            m_err = PatternTooLarge;
6785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        else
6795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            parseTokens();
6805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(atEndOfPattern() || m_err);
6815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // The order of this array must match the ErrorCode enum.
6835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        static const char* errorMessages[NumberOfErrorCodes] = {
6845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            0, // NoError
68565f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "regular expression too large",
68665f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier",
68765f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "nothing to repeat",
68865f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "missing )",
68965f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "unmatched parentheses",
69065f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "unrecognized character after (?",
69165f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "missing terminating ] for character class",
69265f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "range out of order in character class",
69365f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch            REGEXP_ERROR_PREFIX "\\ at end of pattern"
6945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        };
6955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return errorMessages[m_err];
6975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
6985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
6995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    // Misc helper functions:
7015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    typedef unsigned ParseState;
7035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    ParseState saveState()
7055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return m_index;
7075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    void restoreState(ParseState state)
7105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        m_index = state;
7125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    bool atEndOfPattern()
7155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(m_index <= m_size);
7175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return m_index == m_size;
7185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    int peek()
7215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(m_index < m_size);
7235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return m_data[m_index];
7245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    bool peekIsDigit()
7275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return !atEndOfPattern() && WTF::isASCIIDigit(peek());
7295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned peekDigit()
7325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(peekIsDigit());
7345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return peek() - '0';
7355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    int consume()
7385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(m_index < m_size);
7405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return m_data[m_index++];
7415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned consumeDigit()
7445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(peekIsDigit());
7465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return consume() - '0';
7475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned consumeNumber()
7505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        unsigned n = consumeDigit();
7525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        // check for overflow.
7535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) {
7545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            n = newValue;
7555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            consume();
7565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
7575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return n;
7585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned consumeOctal()
7615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ASSERT(WTF::isASCIIOctalDigit(peek()));
7635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        unsigned n = consumeDigit();
7655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
7665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            n = n * 8 + consumeDigit();
7675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return n;
7685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    bool tryConsume(UChar ch)
7715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        if (atEndOfPattern() || (m_data[m_index] != ch))
7735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            return false;
7745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ++m_index;
7755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return true;
7765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    int tryConsumeHex(int count)
7795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    {
7805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        ParseState state = saveState();
7815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        int n = 0;
7835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        while (count--) {
7845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
7855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                restoreState(state);
7865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian                return -1;
7875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            }
7885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian            n = (n << 4) | WTF::toASCIIHexValue(consume());
7895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        }
7905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian        return n;
7915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    }
7925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
7935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    Delegate& m_delegate; // Delegate object; per the Yarr::parse() comment below it receives the parse callbacks.
7945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned m_backReferenceLimit; // NOTE(review): presumably the backReferenceLimit handed to the constructor by parse() - confirm.
7955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    ErrorCode m_err; // NOTE(review): presumably the error recorded while parsing - confirm against parse().
7965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    const UChar* m_data; // Pattern characters; read via m_data[m_index].
7975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned m_size; // NOTE(review): presumably the number of characters in m_data - confirm.
7985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned m_index; // Index of the next unread character in m_data; advanced as characters are consumed.
7995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    unsigned m_parenthesesNestingDepth; // NOTE(review): presumably the count of currently open parentheses - confirm.
8005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
8015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    // Derived by empirical testing of compile time in PCRE and WREC.
8025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
8035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian};
8045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
8055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian/*
8065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Yarr::parse():
8075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * The parse method is passed a pattern to be parsed and a delegate upon which
8095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * callbacks will be made to record the parsed tokens forming the regex.
8105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Yarr::parse() returns null on success, or a const C string providing an error
8115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * message where a parse error occurs.
8125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * The Delegate must implement the following interface:
8145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void assertionBOL();
8165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void assertionEOL();
8175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void assertionWordBoundary(bool invert);
8185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomPatternCharacter(UChar ch);
8205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
8215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomCharacterClassBegin(bool invert)
8225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomCharacterClassAtom(UChar ch)
8235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomCharacterClassRange(UChar begin, UChar end)
8245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
8255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomCharacterClassEnd()
8265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomParenthesesSubpatternBegin(bool capture = true);
8275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomParentheticalAssertionBegin(bool invert = false);
8285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomParenthesesEnd();
8295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void atomBackReference(unsigned subpatternId);
8305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
8325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *    void disjunction();
8345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * The regular expression is described by a sequence of assertion*() and atom*()
8365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * callbacks to the delegate, describing the terms in the regular expression.
8375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Following an atom a quantifyAtom() call may occur to indicate that the previous
8385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * atom should be quantified.  In the case of atoms described across multiple
8395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * calls (parentheses and character classes) the call to quantifyAtom() will come
8405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * after the call to the atom*End() method, never after atom*Begin().
8415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Character classes may either be described by a single call to
8435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
8445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * In the latter case, ...Begin() will be called, followed by a sequence of
8455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
8465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian *
8475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Sequences of atoms and assertions are broken into alternatives via calls to
8485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
8495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
8505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
8515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * capturing subpattern, this will be the subpatternId associated with these
8525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parentheses, and will also by definition be the lowest subpatternId of these
8535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parentheses and of any nested parentheses.  The atomParenthesesEnd() method
8545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * is passed the subpatternId of the last capturing subexpression nested within
8555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * these parentheses.  In the case of a capturing subpattern with no nested
8565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * capturing subpatterns, the same subpatternId will be passed to the begin and
8575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * end functions.  In the case of non-capturing subpatterns the subpatternId
8585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * passed to the begin method is also the first possible subpatternId that might
8595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * be nested within these parentheses.  If a set of non-capturing parentheses does
8605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * not contain any capturing subpatterns, then the subpatternId passed to begin
8615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * will be greater than the subpatternId passed to end.
8625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */
8635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
8645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qiantemplate<class Delegate>
865f05b935882198ccf7d81675736e3aeb089c5113aBen Murdochconst char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
8665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian{
8675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian    return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
8685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian}
8695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
8705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian} } // namespace JSC::Yarr
8715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian
87265f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#endif // YarrParser_h
873