15f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian/* 25f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Copyright (C) 2009 Apple Inc. All rights reserved. 35f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 45f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Redistribution and use in source and binary forms, with or without 55f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * modification, are permitted provided that the following conditions 65f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * are met: 75f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 1. Redistributions of source code must retain the above copyright 85f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * notice, this list of conditions and the following disclaimer. 95f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 2. Redistributions in binary form must reproduce the above copyright 105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * notice, this list of conditions and the following disclaimer in the 115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * documentation and/or other materials provided with the distribution. 125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR 175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2665f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#ifndef YarrParser_h 2765f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#define YarrParser_h 285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 292fc2651226baac27029e38c9d6ef883fa32084dbSteve Block#include <runtime/UString.h> 3065f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#include "Yarr.h" 315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian#include <wtf/ASCIICType.h> 325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian#include <wtf/unicode/Unicode.h> 335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qiannamespace JSC { namespace Yarr { 355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3665f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch#define REGEXP_ERROR_PREFIX "Invalid regular expression: " 37f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch 385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qianenum BuiltInCharacterClassID { 395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian DigitClassID, 
405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian SpaceClassID, 415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian WordClassID, 425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian NewlineClassID, 435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian}; 445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian// The Parser class should not be used directly - only via the Yarr::parse() method. 465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qiantemplate<class Delegate> 475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qianclass Parser { 485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qianprivate: 495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian template<class FriendDelegate> 505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); 515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian enum ErrorCode { 535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian NoError, 545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian PatternTooLarge, 555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian QuantifierOutOfOrder, 565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian QuantifierWithoutAtom, 575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian MissingParentheses, 585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ParenthesesUnmatched, 595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ParenthesesTypeInvalid, 605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian CharacterClassUnmatched, 615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian CharacterClassOutOfOrder, 625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian EscapeUnterminated, 635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian NumberOfErrorCodes 645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian }; 655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 
675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * CharacterClassParserDelegate: 685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * The class CharacterClassParserDelegate is used in the parsing of character 705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * classes. This class handles detection of character ranges. This class 715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * implements enough of the delegate interface such that it can be passed to 725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused 735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * to perform the parsing of escape characters in character sets. 745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian class CharacterClassParserDelegate { 765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian public: 775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) 785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian : m_delegate(delegate) 795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian , m_err(err) 80f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch , m_state(Empty) 812daae5fd11344eaa88a0d92b0f6d65f8d2255c00Ben Murdoch , m_character(0) 825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * begin(): 875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Called at beginning of construction. 
895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void begin(bool invert) 915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomCharacterClassBegin(invert); 935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 96f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch * atomPatternCharacter(): 975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 98f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch * This method is called either from parseCharacterClass() (for an unescaped 99f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch * character in a character class), or from parseEscape(). In the former case 100f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch * the value true will be passed for the argument 'hyphenIsRange', and in this 101f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ 102f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch * is different to /[a\-z]/). 1035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 104f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch void atomPatternCharacter(UChar ch, bool hyphenIsRange = false) 1055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 1065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian switch (m_state) { 107f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case AfterCharacterClass: 108f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // Following a builtin character class we need look out for a hyphen. 109f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. 
110f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // If we see a hyphen following a charater class then unlike usual 111f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // we'll report it to the delegate immediately, and put ourself into 112f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // a poisoned state. Any following calls to add another character or 113f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // character class will result in an error. (A hypen following a 114f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // character-class is itself valid, but only at the end of a regex). 115f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch if (hyphenIsRange && ch == '-') { 116f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom('-'); 117f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = AfterCharacterClassHyphen; 118f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 119f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch } 120f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // Otherwise just fall through - cached character so treat this as Empty. 
121f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch 122f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case Empty: 1235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_character = ch; 124f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = CachedCharacter; 125f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 1265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 127f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case CachedCharacter: 128f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch if (hyphenIsRange && ch == '-') 129f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = CachedCharacterHyphen; 1305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else { 1315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomCharacterClassAtom(m_character); 1325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_character = ch; 1335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 134f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 1355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 136f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case CachedCharacterHyphen: 137f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch if (ch < m_character) { 1385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = CharacterClassOutOfOrder; 139f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 140f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch } 141f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassRange(m_character, ch); 142f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = Empty; 143f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 144d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block 145f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // See coment in atomBuiltInCharacterClass below. 
146f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // This too is technically an error, per ECMA-262, and again we 147f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // we chose to allow this. Note a subtlely here that while we 148f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // diverge from the spec's definition of CharacterRange we do 149f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // remain in compliance with the grammar. For example, consider 150f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // the expression /[\d-a-z]/. We comply with the grammar in 151f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // this case by not allowing a-z to be matched as a range. 152f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case AfterCharacterClassHyphen: 153f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom(ch); 154f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = Empty; 155f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 156f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch } 157d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block } 158d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block 159d06194330da2bb8da887d2e1adeacb3a5c1504b2Steve Block /* 1605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * atomBuiltInCharacterClass(): 1615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 1625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Adds a built-in character class, called by parseEscape(). 
1635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 1645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) 1655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 166f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch switch (m_state) { 167f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case CachedCharacter: 168f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // Flush the currently cached character, then fall through. 169f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom(m_character); 170f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch 171f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case Empty: 172f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case AfterCharacterClass: 173f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = AfterCharacterClass; 174f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassBuiltIn(classID, invert); 175f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 176f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch 177f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // If we hit either of these cases, we have an invalid range that 178f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // looks something like /[x-\d]/ or /[\d-\d]/. 179f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // According to ECMA-262 this should be a syntax error, but 180f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // empirical testing shows this to break teh webz. Instead we 181f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // comply with to the ECMA-262 grammar, and assume the grammar to 182f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // have matched the range correctly, but tweak our interpretation 183f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // of CharacterRange. 
Effectively we implicitly handle the hyphen 184f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. 185f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case CachedCharacterHyphen: 186f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom(m_character); 187f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom('-'); 188f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch // fall through 189f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch case AfterCharacterClassHyphen: 190f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassBuiltIn(classID, invert); 191f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_state = Empty; 192f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch return; 193f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch } 1945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 1955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 1965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 1975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * end(): 1985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 1995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Called at end of construction. 
2005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 2015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void end() 2025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 203f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch if (m_state == CachedCharacter) 204f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom(m_character); 205f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch else if (m_state == CachedCharacterHyphen) { 206f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom(m_character); 207f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch m_delegate.atomCharacterClassAtom('-'); 208f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch } 2095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomCharacterClassEnd(); 2105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 2115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // parseEscape() should never call these delegate methods when 2135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // invoked with inCharacterClass set. 
2145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } 2155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } 2165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian private: 2185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian Delegate& m_delegate; 2195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ErrorCode& m_err; 2205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian enum CharacterClassConstructionState { 221f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch Empty, 222f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch CachedCharacter, 223f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch CachedCharacterHyphen, 224f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch AfterCharacterClass, 225f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch AfterCharacterClassHyphen, 2265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } m_state; 2275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian UChar m_character; 2285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian }; 2295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) 2315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian : m_delegate(delegate) 2325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian , m_backReferenceLimit(backReferenceLimit) 2335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian , m_err(NoError) 234f486d19d62f1bc33246748b14b14a9dfa617b57fIain Merrick , m_data(pattern.characters()) 235f486d19d62f1bc33246748b14b14a9dfa617b57fIain Merrick , m_size(pattern.length()) 2365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian , m_index(0) 2375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian , m_parenthesesNestingDepth(0) 2385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 2395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 
2405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 2425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseEscape(): 2435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 2445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Helper for parseTokens() AND parseCharacterClass(). 2455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Unlike the other parser methods, this function does not report tokens 2465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * directly to the member delegate (m_delegate), instead tokens are 2475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * emitted to the delegate provided as an argument. In the case of atom 2485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * escapes, parseTokens() will call parseEscape() passing m_delegate as 2495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * an argument, and as such the escape will be reported to the delegate. 2505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 2515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * However this method may also be used by parseCharacterClass(), in which 2525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * case a CharacterClassParserDelegate will be passed as the delegate that 2535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * tokens should be added to. A boolean flag is also provided to indicate 2545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * whether that an escape in a CharacterClass is being parsed (some parsing 2555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * rules change in this context). 2565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 2575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * The boolean value returned by this method indicates whether the token 2585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parsed was an atom (outside of a characted class \b and \B will be 2595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * interpreted as assertions). 
2605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 2615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian template<bool inCharacterClass, class EscapeDelegate> 2625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian bool parseEscape(EscapeDelegate& delegate) 2635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 2645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(!m_err); 2655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(peek() == '\\'); 2665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 2675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (atEndOfPattern()) { 2695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = EscapeUnterminated; 2705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return false; 2715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 2725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian switch (peek()) { 2745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // Assertions 2755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'b': 2765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 2775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (inCharacterClass) 2785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\b'); 2795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else { 2805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.assertionWordBoundary(false); 2815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return false; 2825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 2835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 2845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'B': 2855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 2865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (inCharacterClass) 2875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('B'); 
2885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else { 2895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.assertionWordBoundary(true); 2905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return false; 2915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 2925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 2935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 2945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // CharacterClassEscape 2955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'd': 2965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 2975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBuiltInCharacterClass(DigitClassID, false); 2985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 2995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 's': 3005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBuiltInCharacterClass(SpaceClassID, false); 3025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'w': 3045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBuiltInCharacterClass(WordClassID, false); 3065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'D': 3085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBuiltInCharacterClass(DigitClassID, true); 3105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'S': 3125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBuiltInCharacterClass(SpaceClassID, true); 3145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 
case 'W': 3165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBuiltInCharacterClass(WordClassID, true); 3185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // DecimalEscape 3215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '1': 3225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '2': 3235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '3': 3245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '4': 3255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '5': 3265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '6': 3275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '7': 3285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '8': 3295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '9': { 3305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. 3315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // First, try to parse this as backreference. 
3325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (!inCharacterClass) { 3335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ParseState state = saveState(); 3345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned backReference = consumeNumber(); 3365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (backReference <= m_backReferenceLimit) { 3375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomBackReference(backReference); 3385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian restoreState(state); 3425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // Not a backreference, and not octal. 3455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (peek() >= '8') { 3465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\\'); 3475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // Fall-through to handle this as an octal escape. 
3515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // Octal escape 3545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '0': 3555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter(consumeOctal()); 3565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // ControlEscape 3595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'f': 3605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\f'); 3625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'n': 3645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\n'); 3665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'r': 3685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\r'); 3705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 't': 3725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\t'); 3745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'v': 3765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\v'); 3785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 
3805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // ControlLetter 3815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'c': { 3825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ParseState state = saveState(); 3835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 3845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (!atEndOfPattern()) { 3855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int control = consume(); 3865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. 3885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) { 3895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter(control & 0x1f); 3905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian restoreState(state); 3945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('\\'); 3955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 3965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 3975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 3985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // HexEscape 3995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'x': { 4005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 4015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int x = tryConsumeHex(2); 4025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (x == -1) 4035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('x'); 4045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else 4055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter(x); 
4065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 4075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // UnicodeEscape 4105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case 'u': { 4115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 4125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int u = tryConsumeHex(4); 4135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (u == -1) 4145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter('u'); 4155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else 4165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter(u); 4175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 4185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // IdentityEscape 4215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian default: 4225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian delegate.atomPatternCharacter(consume()); 4235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return true; 4265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 4295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseAtomEscape(), parseCharacterClassEscape(): 4305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 4315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * These methods alias to parseEscape(). 
     */
    // Parses an escape that appears outside a character class, reporting to
    // m_delegate. The result of parseEscape() is passed straight through;
    // parseTokens() stores it as its "last token was an atom" flag.
    bool parseAtomEscape()
    {
        return parseEscape<false>(m_delegate);
    }
    // Parses an escape that appears inside a character class, reporting to the
    // supplied character class delegate. The result of parseEscape() is discarded.
    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
    {
        parseEscape<true>(delegate);
    }

    /*
     * parseCharacterClass():
     *
     * Helper for parseTokens(); calls directly and indirectly (via parseCharacterClassEscape)
     * to an instance of CharacterClassParserDelegate, to describe the character class to the
     * delegate.
4485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 4495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void parseCharacterClass() 4505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 4515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(!m_err); 4525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(peek() == '['); 4535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 4545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); 4565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian characterClassConstructor.begin(tryConsume('^')); 4585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian while (!atEndOfPattern()) { 4605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian switch (peek()) { 4615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case ']': 4625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 4635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian characterClassConstructor.end(); 4645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return; 4655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '\\': 4675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian parseCharacterClassEscape(characterClassConstructor); 4685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 4695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian default: 471f05b935882198ccf7d81675736e3aeb089c5113aBen Murdoch characterClassConstructor.atomPatternCharacter(consume(), true); 4725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (m_err) 4755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return; 
4765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = CharacterClassUnmatched; 4795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 4825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseParenthesesBegin(): 4835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 4845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. 4855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 4865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void parseParenthesesBegin() 4875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 4885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(!m_err); 4895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(peek() == '('); 4905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 4915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (tryConsume('?')) { 4935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (atEndOfPattern()) { 4945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = ParenthesesTypeInvalid; 4955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return; 4965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 4975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 4985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian switch (consume()) { 4995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case ':': 5005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomParenthesesSubpatternBegin(false); 5015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 5025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '=': 5045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 
m_delegate.atomParentheticalAssertionBegin(); 5055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 5065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian case '!': 5085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomParentheticalAssertionBegin(true); 5095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian break; 5105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian default: 5125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = ParenthesesTypeInvalid; 5135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 5145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } else 5155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomParenthesesSubpatternBegin(); 5165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ++m_parenthesesNestingDepth; 5185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 5195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 5215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseParenthesesEnd(): 5225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 5235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). 
5245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 5255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void parseParenthesesEnd() 5265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 5275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(!m_err); 5285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(peek() == ')'); 5295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 5305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (m_parenthesesNestingDepth > 0) 5325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.atomParenthesesEnd(); 5335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else 5345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = ParenthesesUnmatched; 5355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian --m_parenthesesNestingDepth; 5375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 5385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 5405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseQuantifier(): 5415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 5425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. 
5435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 5445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) 5455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 5465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(!m_err); 5475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(min <= max); 5485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (lastTokenWasAnAtom) 5505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_delegate.quantifyAtom(min, max, !tryConsume('?')); 5515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else 5525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = QuantifierWithoutAtom; 5535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 5545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 5555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian /* 5565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * parseTokens(): 5575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 5585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * This method loops over the input pattern reporting tokens to the delegate. 5595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * The method returns when a parse error is detected, or the end of the pattern 5605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * is reached. One piece of state is tracked around the loop, which is whether 5615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * the last token passed to the delegate was an atom (this is necessary to detect 5625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * a parse error when a quantifier provided without an atom to quantify). 
     */
    void parseTokens()
    {
        // Whether the token most recently reported to the delegate can be quantified.
        bool lastTokenWasAnAtom = false;

        while (!atEndOfPattern()) {
            switch (peek()) {
            case '|':
                consume();
                m_delegate.disjunction();
                lastTokenWasAnAtom = false;
                break;

            case '(':
                // An opening parenthesis is not itself quantifiable.
                parseParenthesesBegin();
                lastTokenWasAnAtom = false;
                break;

            case ')':
                // A closed group counts as an atom, so it may be followed by a quantifier.
                parseParenthesesEnd();
                lastTokenWasAnAtom = true;
                break;

            case '^':
                consume();
                m_delegate.assertionBOL();
                lastTokenWasAnAtom = false;
                break;

            case '$':
                consume();
                m_delegate.assertionEOL();
                lastTokenWasAnAtom = false;
                break;

            case '.':
                // '.' is reported as the inverted built-in newline class.
                consume();
                m_delegate.atomBuiltInCharacterClass(NewlineClassID, true);
                lastTokenWasAnAtom = true;
                break;

            case '[':
                parseCharacterClass();
                lastTokenWasAnAtom = true;
                break;

            case '\\':
                // The escape parser reports whether what it emitted was an atom.
                lastTokenWasAnAtom = parseAtomEscape();
                break;

            case '*':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '+':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '?':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, 1);
                lastTokenWasAnAtom = false;
                break;

            case '{': {
                // Try to read "{min}", "{min,}" or "{min,max}". The position is saved
                // first so that an incomplete quantifier can be rewound and the '{'
                // handled as an ordinary pattern character instead.
                ParseState state = saveState();

                consume();
                if (peekIsDigit()) {
                    unsigned min = consumeNumber();
                    unsigned max = min;

                    // "{min,}" has no upper bound; "{min,max}" supplies one.
                    if (tryConsume(','))
                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;

                    if (tryConsume('}')) {
                        if (min <= max)
                            parseQuantifier(lastTokenWasAnAtom, min, max);
                        else
                            m_err = QuantifierOutOfOrder;
                        lastTokenWasAnAtom = false;
                        break;
                    }
                }

                restoreState(state);
            } // if we did not find a complete quantifier, fall through to the default case.

            default:
                // Any other character (or a '{' that did not form a quantifier) is a literal.
                m_delegate.atomPatternCharacter(consume());
                lastTokenWasAnAtom = true;
            }

            // Stop at the first error reported by any of the helpers above.
            if (m_err)
                return;
        }

        // The whole pattern was consumed; any group still open was never closed.
        if (m_parenthesesNestingDepth > 0)
            m_err = MissingParentheses;
    }

    /*
Qian * parse(): 6705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * 67165f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch * This method calls parseTokens() to parse over the input and converts any 6725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian * error code to a const char* for a result. 6735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian */ 6745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian const char* parse() 6755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 6765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (m_size > MAX_PATTERN_SIZE) 6775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_err = PatternTooLarge; 6785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian else 6795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian parseTokens(); 6805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(atEndOfPattern() || m_err); 6815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 6825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // The order of this array must match the ErrorCode enum. 
6835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian static const char* errorMessages[NumberOfErrorCodes] = { 6845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 0, // NoError 68565f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "regular expression too large", 68665f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "numbers out of order in {} quantifier", 68765f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "nothing to repeat", 68865f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "missing )", 68965f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "unmatched parentheses", 69065f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "unrecognized character after (?", 69165f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "missing terminating ] for character class", 69265f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "range out of order in character class", 69365f03d4f644ce73618e5f4f50dd694b26f55ae12Ben Murdoch REGEXP_ERROR_PREFIX "\\ at end of pattern" 6945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian }; 6955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 6965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return errorMessages[m_err]; 6975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 6985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 6995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // Misc helper functions: 7015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7025f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian typedef unsigned ParseState; 7035f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7045f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ParseState saveState() 7055f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7065f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return m_index; 7075f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 
} 7085f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7095f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian void restoreState(ParseState state) 7105f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7115f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian m_index = state; 7125f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7135f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7145f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian bool atEndOfPattern() 7155f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7165f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(m_index <= m_size); 7175f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return m_index == m_size; 7185f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7195f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7205f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int peek() 7215f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7225f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(m_index < m_size); 7235f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return m_data[m_index]; 7245f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7255f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7265f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian bool peekIsDigit() 7275f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7285f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return !atEndOfPattern() && WTF::isASCIIDigit(peek()); 7295f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7305f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7315f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned peekDigit() 7325f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7335f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(peekIsDigit()); 7345f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return peek() - '0'; 7355f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7365f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7375f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int consume() 
7385f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7395f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(m_index < m_size); 7405f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return m_data[m_index++]; 7415f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7425f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7435f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned consumeDigit() 7445f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7455f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(peekIsDigit()); 7465f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return consume() - '0'; 7475f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7485f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7495f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned consumeNumber() 7505f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7515f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned n = consumeDigit(); 7525f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // check for overflow. 
7535f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { 7545f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian n = newValue; 7555f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian consume(); 7565f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7575f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return n; 7585f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7595f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7605f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned consumeOctal() 7615f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7625f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ASSERT(WTF::isASCIIOctalDigit(peek())); 7635f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7645f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned n = consumeDigit(); 7655f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) 7665f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian n = n * 8 + consumeDigit(); 7675f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return n; 7685f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7695f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7705f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian bool tryConsume(UChar ch) 7715f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7725f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (atEndOfPattern() || (m_data[m_index] != ch)) 7735f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return false; 7745f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ++m_index; 7755f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return true; 7765f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7775f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7785f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int tryConsumeHex(int count) 7795f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian { 7805f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ParseState state = saveState(); 
7815f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7825f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian int n = 0; 7835f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian while (count--) { 7845f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { 7855f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian restoreState(state); 7865f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return -1; 7875f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7885f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian n = (n << 4) | WTF::toASCIIHexValue(consume()); 7895f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7905f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian return n; 7915f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian } 7925f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 7935f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian Delegate& m_delegate; 7945f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned m_backReferenceLimit; 7955f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian ErrorCode m_err; 7965f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian const UChar* m_data; 7975f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned m_size; 7985f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned m_index; 7995f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian unsigned m_parenthesesNestingDepth; 8005f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian 8015f1ab04193ad0130ca8204aadaceae083aca9881Feng Qian // Derived by empirical testing of compile time in PCRE and WREC. 
    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
};

/*
 * Yarr::parse():
 *
 * The parse method is passed a pattern to be parsed and a delegate upon which
 * callbacks will be made to record the parsed tokens forming the regex.
 * Yarr::parse() returns null on success, or a const C string providing an error
 * message where a parse error occurs.
 *
 * The Delegate must implement the following interface:
 *
 *    void assertionBOL();
 *    void assertionEOL();
 *    void assertionWordBoundary(bool invert);
 *
 *    void atomPatternCharacter(UChar ch);
 *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 *    void atomCharacterClassBegin(bool invert)
 *    void atomCharacterClassAtom(UChar ch)
 *    void atomCharacterClassRange(UChar begin, UChar end)
 *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 *    void atomCharacterClassEnd()
 *    void atomParenthesesSubpatternBegin(bool capture = true);
 *    void atomParentheticalAssertionBegin(bool invert = false);
 *    void atomParenthesesEnd();
 *    void atomBackReference(unsigned subpatternId);
 *
 *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 *
 *    void disjunction();
 *
 * The regular expression is described by a sequence of assertion*() and atom*()
 * callbacks to the delegate, describing the terms in the regular expression.
 * Following an atom a quantifyAtom() call may occur to indicate that the previous
 * atom should be quantified. In the case of atoms described across multiple
 * calls (parentheses and character classes) the call to quantifyAtom() will come
 * after the call to the atom*End() method, never after atom*Begin().
 *
 * Character classes may either be described by a single call to
 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 * In the latter case, ...Begin() will be called, followed by a sequence of
 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 *
 * Sequences of atoms and assertions are broken into alternatives via calls to
 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
 * capturing subpattern, this will be the subpatternId associated with these
 * parentheses, and will also by definition be the lowest subpatternId of these
 * parentheses and of any nested parentheses. The atomParenthesesEnd() method
 * is passed the subpatternId of the last capturing subexpression nested within
 * these parentheses. In the case of a capturing subpattern with no nested
 * capturing subpatterns, the same subpatternId will be passed to the begin and
 * end functions. In the case of non-capturing subpatterns the subpatternId
 * passed to the begin method is also the first possible subpatternId that might
 * be nested within these parentheses. If a set of non-capturing parentheses does
 * not contain any capturing subpatterns, then the subpatternId passed to begin
 * will be greater than the subpatternId passed to end.
 *
 * NOTE(review): the paragraph above refers to atomParenthesesBegin() and to
 * subpatternId arguments, but the interface listed earlier in this comment has
 * atomParenthesesSubpatternBegin(bool capture) / atomParentheticalAssertionBegin(bool invert)
 * and an atomParenthesesEnd() that takes no arguments. The subpatternId
 * description looks stale - confirm against the delegates.
 */

// Convenience entry point: builds a Parser over the pattern and runs it.
// backReferenceLimit is forwarded to the Parser constructor unchanged.
template<class Delegate>
const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = quantifyInfinite)
{
    return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse();
}

} } // namespace JSC::Yarr

#endif // YarrParser_h