1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "core/html/parser/HTMLEntityParser.h"
30
31#include "core/html/parser/HTMLEntitySearch.h"
32#include "core/html/parser/HTMLEntityTable.h"
33#include "wtf/text/StringBuilder.h"
34
35using namespace WTF;
36
37namespace blink {
38
39static const UChar windowsLatin1ExtensionArray[32] = {
40    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
41    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
42    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
43    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
44};
45
46static bool isAlphaNumeric(UChar cc)
47{
48    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
49}
50
51static UChar adjustEntity(UChar32 value)
52{
53    if ((value & ~0x1F) != 0x0080)
54        return value;
55    return windowsLatin1ExtensionArray[value - 0x80];
56}
57
58static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
59{
60    // FIXME: A number of specific entity values generate parse errors.
61    if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
62        decodedEntity.append(0xFFFD);
63        return;
64    }
65    if (U_IS_BMP(c)) {
66        decodedEntity.append(adjustEntity(c));
67        return;
68    }
69    decodedEntity.append(c);
70}
71
72static const UChar32 kInvalidUnicode = -1;
73
74static bool isHexDigit(UChar cc)
75{
76    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
77}
78
79static UChar asHexDigit(UChar cc)
80{
81    if (cc >= '0' && cc <= '9')
82      return cc - '0';
83    if (cc >= 'a' && cc <= 'z')
84      return 10 + cc - 'a';
85    if (cc >= 'A' && cc <= 'Z')
86      return 10 + cc - 'A';
87    ASSERT_NOT_REACHED();
88    return 0;
89}
90
91typedef Vector<UChar, 64> ConsumedCharacterBuffer;
92
93static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
94{
95    if (consumedCharacters.size() == 1)
96        source.push(consumedCharacters[0]);
97    else if (consumedCharacters.size() == 2) {
98        source.push(consumedCharacters[0]);
99        source.push(consumedCharacters[1]);
100    } else
101        source.prepend(SegmentedString(String(consumedCharacters)));
102}
103
104static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
105{
106    ConsumedCharacterBuffer consumedCharacters;
107    HTMLEntitySearch entitySearch;
108    while (!source.isEmpty()) {
109        cc = source.currentChar();
110        entitySearch.advance(cc);
111        if (!entitySearch.isEntityPrefix())
112            break;
113        consumedCharacters.append(cc);
114        source.advanceAndASSERT(cc);
115    }
116    notEnoughCharacters = source.isEmpty();
117    if (notEnoughCharacters) {
118        // We can't decide on an entity because there might be a longer entity
119        // that we could match if we had more data.
120        unconsumeCharacters(source, consumedCharacters);
121        return false;
122    }
123    if (!entitySearch.mostRecentMatch()) {
124        unconsumeCharacters(source, consumedCharacters);
125        return false;
126    }
127    if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
128        // We've consumed too many characters. We need to walk the
129        // source back to the point at which we had consumed an
130        // actual entity.
131        unconsumeCharacters(source, consumedCharacters);
132        consumedCharacters.clear();
133        const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
134        const int length = mostRecent->length;
135        const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
136        for (int i = 0; i < length; ++i) {
137            cc = source.currentChar();
138            ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
139            consumedCharacters.append(cc);
140            source.advanceAndASSERT(cc);
141            ASSERT(!source.isEmpty());
142        }
143        cc = source.currentChar();
144    }
145    if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
146        || !additionalAllowedCharacter
147        || !(isAlphaNumeric(cc) || cc == '=')) {
148        decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
149        if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
150            decodedEntity.append(second);
151        return true;
152    }
153    unconsumeCharacters(source, consumedCharacters);
154    return false;
155}
156
157bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
158{
159    ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
160    ASSERT(!notEnoughCharacters);
161    ASSERT(decodedEntity.isEmpty());
162
163    enum EntityState {
164        Initial,
165        Number,
166        MaybeHexLowerCaseX,
167        MaybeHexUpperCaseX,
168        Hex,
169        Decimal,
170        Named
171    };
172    EntityState entityState = Initial;
173    UChar32 result = 0;
174    ConsumedCharacterBuffer consumedCharacters;
175
176    while (!source.isEmpty()) {
177        UChar cc = source.currentChar();
178        switch (entityState) {
179        case Initial: {
180            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
181                return false;
182            if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
183                return false;
184            if (cc == '#') {
185                entityState = Number;
186                break;
187            }
188            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
189                entityState = Named;
190                continue;
191            }
192            return false;
193        }
194        case Number: {
195            if (cc == 'x') {
196                entityState = MaybeHexLowerCaseX;
197                break;
198            }
199            if (cc == 'X') {
200                entityState = MaybeHexUpperCaseX;
201                break;
202            }
203            if (cc >= '0' && cc <= '9') {
204                entityState = Decimal;
205                continue;
206            }
207            source.push('#');
208            return false;
209        }
210        case MaybeHexLowerCaseX: {
211            if (isHexDigit(cc)) {
212                entityState = Hex;
213                continue;
214            }
215            source.push('#');
216            source.push('x');
217            return false;
218        }
219        case MaybeHexUpperCaseX: {
220            if (isHexDigit(cc)) {
221                entityState = Hex;
222                continue;
223            }
224            source.push('#');
225            source.push('X');
226            return false;
227        }
228        case Hex: {
229            if (isHexDigit(cc)) {
230                if (result != kInvalidUnicode)
231                    result = result * 16 + asHexDigit(cc);
232            } else if (cc == ';') {
233                source.advanceAndASSERT(cc);
234                appendLegalEntityFor(result, decodedEntity);
235                return true;
236            } else {
237                appendLegalEntityFor(result, decodedEntity);
238                return true;
239            }
240            break;
241        }
242        case Decimal: {
243            if (cc >= '0' && cc <= '9') {
244                if (result != kInvalidUnicode)
245                    result = result * 10 + cc - '0';
246            } else if (cc == ';') {
247                source.advanceAndASSERT(cc);
248                appendLegalEntityFor(result, decodedEntity);
249                return true;
250            } else {
251                appendLegalEntityFor(result, decodedEntity);
252                return true;
253            }
254            break;
255        }
256        case Named: {
257            return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
258        }
259        }
260
261        if (result > UCHAR_MAX_VALUE)
262            result = kInvalidUnicode;
263
264        consumedCharacters.append(cc);
265        source.advanceAndASSERT(cc);
266    }
267    ASSERT(source.isEmpty());
268    notEnoughCharacters = true;
269    unconsumeCharacters(source, consumedCharacters);
270    return false;
271}
272
273static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
274{
275    if (U_IS_BMP(value)) {
276        UChar character = static_cast<UChar>(value);
277        ASSERT(character == value);
278        result[0] = character;
279        return 1;
280    }
281
282    result[0] = U16_LEAD(value);
283    result[1] = U16_TRAIL(value);
284    return 2;
285}
286
287size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
288{
289    HTMLEntitySearch search;
290    while (*name) {
291        search.advance(*name++);
292        if (!search.isEntityPrefix())
293            return 0;
294    }
295    search.advance(';');
296    if (!search.isEntityPrefix())
297        return 0;
298
299    size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
300    if (!search.mostRecentMatch()->secondValue)
301        return numberOfCodePoints;
302    return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
303}
304
305} // namespace blink
306