1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#ifndef InputStreamPreprocessor_h
29#define InputStreamPreprocessor_h
30
31#include "platform/text/SegmentedString.h"
32#include "wtf/Noncopyable.h"
33
34namespace blink {
35
36const LChar kEndOfFileMarker = 0;
37
38// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
39template <typename Tokenizer>
40class InputStreamPreprocessor {
41    WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
42public:
43    InputStreamPreprocessor(Tokenizer* tokenizer)
44        : m_tokenizer(tokenizer)
45    {
46        reset();
47    }
48
49    ALWAYS_INLINE UChar nextInputCharacter() const { return m_nextInputCharacter; }
50
51    // Returns whether we succeeded in peeking at the next character.
52    // The only way we can fail to peek is if there are no more
53    // characters in |source| (after collapsing \r\n, etc).
54    ALWAYS_INLINE bool peek(SegmentedString& source)
55    {
56        m_nextInputCharacter = source.currentChar();
57
58        // Every branch in this function is expensive, so we have a
59        // fast-reject branch for characters that don't require special
60        // handling. Please run the parser benchmark whenever you touch
61        // this function. It's very hot.
62        static const UChar specialCharacterMask = '\n' | '\r' | '\0';
63        if (m_nextInputCharacter & ~specialCharacterMask) {
64            m_skipNextNewLine = false;
65            return true;
66        }
67        return processNextInputCharacter(source);
68    }
69
70    // Returns whether there are more characters in |source| after advancing.
71    ALWAYS_INLINE bool advance(SegmentedString& source)
72    {
73        source.advanceAndUpdateLineNumber();
74        if (source.isEmpty())
75            return false;
76        return peek(source);
77    }
78
79    bool skipNextNewLine() const { return m_skipNextNewLine; }
80
81    void reset(bool skipNextNewLine = false)
82    {
83        m_nextInputCharacter = '\0';
84        m_skipNextNewLine = skipNextNewLine;
85    }
86
87private:
88    bool processNextInputCharacter(SegmentedString& source)
89    {
90    ProcessAgain:
91        ASSERT(m_nextInputCharacter == source.currentChar());
92
93        if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
94            m_skipNextNewLine = false;
95            source.advancePastNewlineAndUpdateLineNumber();
96            if (source.isEmpty())
97                return false;
98            m_nextInputCharacter = source.currentChar();
99        }
100        if (m_nextInputCharacter == '\r') {
101            m_nextInputCharacter = '\n';
102            m_skipNextNewLine = true;
103        } else {
104            m_skipNextNewLine = false;
105            // FIXME: The spec indicates that the surrogate pair range as well as
106            // a number of specific character values are parse errors and should be replaced
107            // by the replacement character. We suspect this is a problem with the spec as doing
108            // that filtering breaks surrogate pair handling and causes us not to match Minefield.
109            if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
110                if (m_tokenizer->shouldSkipNullCharacters()) {
111                    source.advancePastNonNewline();
112                    if (source.isEmpty())
113                        return false;
114                    m_nextInputCharacter = source.currentChar();
115                    goto ProcessAgain;
116                }
117                m_nextInputCharacter = 0xFFFD;
118            }
119        }
120        return true;
121    }
122
123    bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
124    {
125        return source.isClosed() && source.length() == 1;
126    }
127
128    Tokenizer* m_tokenizer;
129
130    // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
131    UChar m_nextInputCharacter;
132    bool m_skipNextNewLine;
133};
134
135}
136
137#endif // InputStreamPreprocessor_h
138
139