1/*
2 * Copyright (C) 2011 Google Inc.  All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33#include "core/html/track/vtt/VTTTokenizer.h"
34
35#include "core/xml/parser/MarkupTokenizerInlines.h"
36#include "wtf/text/StringBuilder.h"
37#include "wtf/unicode/CharacterNames.h"
38
39namespace blink {
40
// Opens a tokenizer state inside the state-machine switch. Expands to both a
// `case` label (so the switch can enter the state) and an ordinary label of
// the same name (so WEBVTT_ADVANCE_TO can `goto` it).
#define WEBVTT_BEGIN_STATE(stateName) case stateName: stateName:
// Transitions to |stateName|: records it in `state`, asserts that m_input is
// not empty, advances the input through the stream preprocessor, reloads `cc`
// from the preprocessor and jumps to the state's label. Control never falls
// through past this macro. The expansion site must have `state`, `cc`,
// `m_input` and `m_inputStreamPreprocessor` in scope.
#define WEBVTT_ADVANCE_TO(stateName)                               \
    do {                                                           \
        state = stateName;                                         \
        ASSERT(!m_input.isEmpty());                                \
        m_inputStreamPreprocessor.advance(m_input);                \
        cc = m_inputStreamPreprocessor.nextInputCharacter();       \
        goto stateName;                                            \
    } while (false)
50
51template<unsigned charactersCount>
52ALWAYS_INLINE bool equalLiteral(const StringBuilder& s, const char (&characters)[charactersCount])
53{
54    return WTF::equal(s, reinterpret_cast<const LChar*>(characters), charactersCount - 1);
55}
56
57static void addNewClass(StringBuilder& classes, const StringBuilder& newClass)
58{
59    if (!classes.isEmpty())
60        classes.append(' ');
61    classes.append(newClass);
62}
63
64inline bool emitToken(VTTToken& resultToken, const VTTToken& token)
65{
66    resultToken = token;
67    return true;
68}
69
70inline bool advanceAndEmitToken(SegmentedString& source, VTTToken& resultToken, const VTTToken& token)
71{
72    source.advanceAndUpdateLineNumber();
73    return emitToken(resultToken, token);
74}
75
76VTTTokenizer::VTTTokenizer(const String& input)
77    : m_input(input)
78    , m_inputStreamPreprocessor(this)
79{
80    // Append a EOF marker and close the input "stream".
81    ASSERT(!m_input.isClosed());
82    m_input.append(SegmentedString(String(&kEndOfFileMarker, 1)));
83    m_input.close();
84}
85
86bool VTTTokenizer::nextToken(VTTToken& token)
87{
88    if (m_input.isEmpty() || !m_inputStreamPreprocessor.peek(m_input))
89        return false;
90
91    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
92    if (cc == kEndOfFileMarker) {
93        m_inputStreamPreprocessor.advance(m_input);
94        return false;
95    }
96
97    StringBuilder buffer;
98    StringBuilder result;
99    StringBuilder classes;
100    enum {
101        DataState,
102        EscapeState,
103        TagState,
104        StartTagState,
105        StartTagClassState,
106        StartTagAnnotationState,
107        EndTagState,
108        TimestampTagState,
109    } state = DataState;
110
111    // 4.8.10.13.4 WebVTT cue text tokenizer
112    switch (state) {
113        WEBVTT_BEGIN_STATE(DataState) {
114            if (cc == '&') {
115                buffer.append(static_cast<LChar>(cc));
116                WEBVTT_ADVANCE_TO(EscapeState);
117            } else if (cc == '<') {
118                if (result.isEmpty()) {
119                    WEBVTT_ADVANCE_TO(TagState);
120                } else {
121                    // We don't want to advance input or perform a state transition - just return a (new) token.
122                    // (On the next call to nextToken we will see '<' again, but take the other branch in this if instead.)
123                    return emitToken(token, VTTToken::StringToken(result.toString()));
124                }
125            } else if (cc == kEndOfFileMarker) {
126                return advanceAndEmitToken(m_input, token, VTTToken::StringToken(result.toString()));
127            } else {
128                result.append(cc);
129                WEBVTT_ADVANCE_TO(DataState);
130            }
131        }
132        END_STATE()
133
134        WEBVTT_BEGIN_STATE(EscapeState) {
135            if (cc == ';') {
136                if (equalLiteral(buffer, "&amp")) {
137                    result.append('&');
138                } else if (equalLiteral(buffer, "&lt")) {
139                    result.append('<');
140                } else if (equalLiteral(buffer, "&gt")) {
141                    result.append('>');
142                } else if (equalLiteral(buffer, "&lrm")) {
143                    result.append(leftToRightMark);
144                } else if (equalLiteral(buffer, "&rlm")) {
145                    result.append(rightToLeftMark);
146                } else if (equalLiteral(buffer, "&nbsp")) {
147                    result.append(noBreakSpace);
148                } else {
149                    buffer.append(static_cast<LChar>(cc));
150                    result.append(buffer);
151                }
152                buffer.clear();
153                WEBVTT_ADVANCE_TO(DataState);
154            } else if (isASCIIAlphanumeric(cc)) {
155                buffer.append(static_cast<LChar>(cc));
156                WEBVTT_ADVANCE_TO(EscapeState);
157            } else if (cc == '<') {
158                result.append(buffer);
159                return emitToken(token, VTTToken::StringToken(result.toString()));
160            } else if (cc == kEndOfFileMarker) {
161                result.append(buffer);
162                return advanceAndEmitToken(m_input, token, VTTToken::StringToken(result.toString()));
163            } else {
164                result.append(buffer);
165                buffer.clear();
166
167                if (cc == '&') {
168                    buffer.append(static_cast<LChar>(cc));
169                    WEBVTT_ADVANCE_TO(EscapeState);
170                }
171                result.append(cc);
172                WEBVTT_ADVANCE_TO(DataState);
173            }
174        }
175        END_STATE()
176
177        WEBVTT_BEGIN_STATE(TagState) {
178            if (isTokenizerWhitespace(cc)) {
179                ASSERT(result.isEmpty());
180                WEBVTT_ADVANCE_TO(StartTagAnnotationState);
181            } else if (cc == '.') {
182                ASSERT(result.isEmpty());
183                WEBVTT_ADVANCE_TO(StartTagClassState);
184            } else if (cc == '/') {
185                WEBVTT_ADVANCE_TO(EndTagState);
186            } else if (WTF::isASCIIDigit(cc)) {
187                result.append(cc);
188                WEBVTT_ADVANCE_TO(TimestampTagState);
189            } else if (cc == '>' || cc == kEndOfFileMarker) {
190                ASSERT(result.isEmpty());
191                return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString()));
192            } else {
193                result.append(cc);
194                WEBVTT_ADVANCE_TO(StartTagState);
195            }
196        }
197        END_STATE()
198
199        WEBVTT_BEGIN_STATE(StartTagState) {
200            if (isTokenizerWhitespace(cc)) {
201                WEBVTT_ADVANCE_TO(StartTagAnnotationState);
202            } else if (cc == '.') {
203                WEBVTT_ADVANCE_TO(StartTagClassState);
204            } else if (cc == '>' || cc == kEndOfFileMarker) {
205                return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString()));
206            } else {
207                result.append(cc);
208                WEBVTT_ADVANCE_TO(StartTagState);
209            }
210        }
211        END_STATE()
212
213        WEBVTT_BEGIN_STATE(StartTagClassState) {
214            if (isTokenizerWhitespace(cc)) {
215                addNewClass(classes, buffer);
216                buffer.clear();
217                WEBVTT_ADVANCE_TO(StartTagAnnotationState);
218            } else if (cc == '.') {
219                addNewClass(classes, buffer);
220                buffer.clear();
221                WEBVTT_ADVANCE_TO(StartTagClassState);
222            } else if (cc == '>' || cc == kEndOfFileMarker) {
223                addNewClass(classes, buffer);
224                buffer.clear();
225                return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString(), classes.toAtomicString()));
226            } else {
227                buffer.append(cc);
228                WEBVTT_ADVANCE_TO(StartTagClassState);
229            }
230        }
231        END_STATE()
232
233        WEBVTT_BEGIN_STATE(StartTagAnnotationState) {
234            if (cc == '>' || cc == kEndOfFileMarker) {
235                return advanceAndEmitToken(m_input, token, VTTToken::StartTag(result.toString(), classes.toAtomicString(), buffer.toAtomicString()));
236            }
237            buffer.append(cc);
238            WEBVTT_ADVANCE_TO(StartTagAnnotationState);
239        }
240        END_STATE()
241
242        WEBVTT_BEGIN_STATE(EndTagState) {
243            if (cc == '>' || cc == kEndOfFileMarker)
244                return advanceAndEmitToken(m_input, token, VTTToken::EndTag(result.toString()));
245            result.append(cc);
246            WEBVTT_ADVANCE_TO(EndTagState);
247        }
248        END_STATE()
249
250        WEBVTT_BEGIN_STATE(TimestampTagState) {
251            if (cc == '>' || cc == kEndOfFileMarker)
252                return advanceAndEmitToken(m_input, token, VTTToken::TimestampTag(result.toString()));
253            result.append(cc);
254            WEBVTT_ADVANCE_TO(TimestampTagState);
255        }
256        END_STATE()
257
258    }
259
260    ASSERT_NOT_REACHED();
261    return false;
262}
263
264}
265
266