1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLEntityParser.h"
30
31#include "HTMLEntitySearch.h"
32#include "HTMLEntityTable.h"
33#include <wtf/Vector.h>
34
35using namespace WTF;
36
37namespace WebCore {
38
39namespace {
40
41static const UChar windowsLatin1ExtensionArray[32] = {
42    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
43    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
44    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
45    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
46};
47
48inline UChar adjustEntity(UChar32 value)
49{
50    if ((value & ~0x1F) != 0x0080)
51        return value;
52    return windowsLatin1ExtensionArray[value - 0x80];
53}
54
55inline UChar32 legalEntityFor(UChar32 value)
56{
57    // FIXME: A number of specific entity values generate parse errors.
58    if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
59        return 0xFFFD;
60    if (U_IS_BMP(value))
61        return adjustEntity(value);
62    return value;
63}
64
65inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
66{
67    if (U_IS_BMP(value)) {
68        UChar character = static_cast<UChar>(value);
69        ASSERT(character == value);
70        decodedEntity.append(character);
71        return true;
72    }
73    decodedEntity.append(U16_LEAD(value));
74    decodedEntity.append(U16_TRAIL(value));
75    return true;
76}
77
78inline bool isHexDigit(UChar cc)
79{
80    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
81}
82
83inline bool isAlphaNumeric(UChar cc)
84{
85    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
86}
87
88void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
89{
90    if (consumedCharacters.size() == 1)
91        source.push(consumedCharacters[0]);
92    else if (consumedCharacters.size() == 2) {
93        source.push(consumedCharacters[0]);
94        source.push(consumedCharacters[1]);
95    } else
96        source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
97}
98
99}
100
101bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
102{
103    ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
104    ASSERT(!notEnoughCharacters);
105    ASSERT(decodedEntity.isEmpty());
106
107    enum EntityState {
108        Initial,
109        Number,
110        MaybeHexLowerCaseX,
111        MaybeHexUpperCaseX,
112        Hex,
113        Decimal,
114        Named
115    };
116    EntityState entityState = Initial;
117    UChar32 result = 0;
118    Vector<UChar, 10> consumedCharacters;
119
120    while (!source.isEmpty()) {
121        UChar cc = *source;
122        switch (entityState) {
123        case Initial: {
124            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
125                return false;
126            if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
127                return false;
128            if (cc == '#') {
129                entityState = Number;
130                break;
131            }
132            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
133                entityState = Named;
134                continue;
135            }
136            return false;
137        }
138        case Number: {
139            if (cc == 'x') {
140                entityState = MaybeHexLowerCaseX;
141                break;
142            }
143            if (cc == 'X') {
144                entityState = MaybeHexUpperCaseX;
145                break;
146            }
147            if (cc >= '0' && cc <= '9') {
148                entityState = Decimal;
149                continue;
150            }
151            source.push('#');
152            return false;
153        }
154        case MaybeHexLowerCaseX: {
155            if (isHexDigit(cc)) {
156                entityState = Hex;
157                continue;
158            }
159            source.push('#');
160            source.push('x');
161            return false;
162        }
163        case MaybeHexUpperCaseX: {
164            if (isHexDigit(cc)) {
165                entityState = Hex;
166                continue;
167            }
168            source.push('#');
169            source.push('X');
170            return false;
171        }
172        case Hex: {
173            if (cc >= '0' && cc <= '9')
174                result = result * 16 + cc - '0';
175            else if (cc >= 'a' && cc <= 'f')
176                result = result * 16 + 10 + cc - 'a';
177            else if (cc >= 'A' && cc <= 'F')
178                result = result * 16 + 10 + cc - 'A';
179            else {
180                if (cc == ';')
181                    source.advanceAndASSERT(cc);
182                return convertToUTF16(legalEntityFor(result), decodedEntity);
183            }
184            break;
185        }
186        case Decimal: {
187            if (cc >= '0' && cc <= '9')
188                result = result * 10 + cc - '0';
189            else {
190                if (cc == ';')
191                    source.advanceAndASSERT(cc);
192                return convertToUTF16(legalEntityFor(result), decodedEntity);
193            }
194            break;
195        }
196        case Named: {
197            HTMLEntitySearch entitySearch;
198            while (!source.isEmpty()) {
199                cc = *source;
200                entitySearch.advance(cc);
201                if (!entitySearch.isEntityPrefix())
202                    break;
203                consumedCharacters.append(cc);
204                source.advanceAndASSERT(cc);
205            }
206            notEnoughCharacters = source.isEmpty();
207            if (notEnoughCharacters) {
208                // We can't an entity because there might be a longer entity
209                // that we could match if we had more data.
210                unconsumeCharacters(source, consumedCharacters);
211                return false;
212            }
213            if (!entitySearch.mostRecentMatch()) {
214                ASSERT(!entitySearch.currentValue());
215                unconsumeCharacters(source, consumedCharacters);
216                return false;
217            }
218            if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
219                // We've consumed too many characters.  We need to walk the
220                // source back to the point at which we had consumed an
221                // actual entity.
222                unconsumeCharacters(source, consumedCharacters);
223                consumedCharacters.clear();
224                const int length = entitySearch.mostRecentMatch()->length;
225                const UChar* reference = entitySearch.mostRecentMatch()->entity;
226                for (int i = 0; i < length; ++i) {
227                    cc = *source;
228                    ASSERT_UNUSED(reference, cc == *reference++);
229                    consumedCharacters.append(cc);
230                    source.advanceAndASSERT(cc);
231                    ASSERT(!source.isEmpty());
232                }
233                cc = *source;
234            }
235            if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
236                || !additionalAllowedCharacter
237                || !(isAlphaNumeric(cc) || cc == '=')) {
238                return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
239            }
240            unconsumeCharacters(source, consumedCharacters);
241            return false;
242        }
243        }
244        consumedCharacters.append(cc);
245        source.advanceAndASSERT(cc);
246    }
247    ASSERT(source.isEmpty());
248    notEnoughCharacters = true;
249    unconsumeCharacters(source, consumedCharacters);
250    return false;
251}
252
253UChar decodeNamedEntity(const char* name)
254{
255    HTMLEntitySearch search;
256    while (*name) {
257        search.advance(*name++);
258        if (!search.isEntityPrefix())
259            return 0;
260    }
261    search.advance(';');
262    UChar32 entityValue = search.currentValue();
263    if (U16_LENGTH(entityValue) != 1) {
264        // Callers need to move off this API if the entity table has values
265        // which do no fit in a 16 bit UChar!
266        ASSERT_NOT_REACHED();
267        return 0;
268    }
269    return static_cast<UChar>(entityValue);
270}
271
272} // namespace WebCore
273