1/* 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 5 6 This library is free software; you can redistribute it and/or 7 modify it under the terms of the GNU Library General Public 8 License as published by the Free Software Foundation; either 9 version 2 of the License, or (at your option) any later version. 10 11 This library is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Library General Public License for more details. 15 16 You should have received a copy of the GNU Library General Public License 17 along with this library; see the file COPYING.LIB. If not, write to 18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 Boston, MA 02110-1301, USA. 20*/ 21 22 23#include "config.h" 24#include "TextResourceDecoder.h" 25 26#include "DOMImplementation.h" 27#include "HTMLNames.h" 28#include "TextCodec.h" 29#include "TextEncoding.h" 30#include "TextEncodingDetector.h" 31#include "TextEncodingRegistry.h" 32#include <wtf/ASCIICType.h> 33#include <wtf/StringExtras.h> 34 35using namespace WTF; 36 37namespace WebCore { 38 39using namespace HTMLNames; 40 41// You might think we should put these find functions elsewhere, perhaps with the 42// similar functions that operate on UChar, but arguably only the decoder has 43// a reason to process strings of char rather than UChar. 44 45static int find(const char* subject, size_t subjectLength, const char* target) 46{ 47 size_t targetLength = strlen(target); 48 if (targetLength > subjectLength) 49 return -1; 50 for (size_t i = 0; i <= subjectLength - targetLength; ++i) { 51 bool match = true; 52 for (size_t j = 0; j < targetLength; ++j) { 53 if (subject[i + j] != target[j]) { 54 match = false; 55 break; 56 } 57 } 58 if (match) 59 return i; 60 } 61 return -1; 62} 63 64static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target) 65{ 66 size_t targetLength = strlen(target); 67 if (targetLength > subjectLength) 68 return -1; 69#ifndef NDEBUG 70 for (size_t i = 0; i < targetLength; ++i) 71 ASSERT(isASCIILower(target[i])); 72#endif 73 for (size_t i = 0; i <= subjectLength - targetLength; ++i) { 74 bool match = true; 75 for (size_t j = 0; j < targetLength; ++j) { 76 if (toASCIILower(subject[i + j]) != target[j]) { 77 match = false; 78 break; 79 } 80 } 81 if (match) 82 return i; 83 } 84 return -1; 85} 86 87static TextEncoding findTextEncoding(const char* encodingName, int length) 88{ 89 Vector<char, 64> buffer(length + 1); 90 memcpy(buffer.data(), encodingName, length); 91 buffer[length] = '\0'; 92 return buffer.data(); 93} 94 95class KanjiCode { 96public: 97 enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 }; 98 static enum Type judge(const char* str, int length); 99 static const int ESC = 0x1b; 100 static const unsigned char sjisMap[256]; 101 static int ISkanji(int code) 102 { 103 if (code >= 0x100) 104 return 0; 105 return sjisMap[code & 0xff] & 1; 106 } 107 static int ISkana(int code) 108 { 109 if (code >= 0x100) 110 return 0; 111 return sjisMap[code & 0xff] & 2; 112 } 113}; 114 115const unsigned char KanjiCode::sjisMap[256] = { 116 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 117 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 119 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 121 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 123 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 125 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 126 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 127 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 128 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 129 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 130 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 131 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 132}; 133 134/* 135 * EUC-JP is 136 * [0xa1 - 0xfe][0xa1 - 0xfe] 137 * 0x8e[0xa1 - 0xfe](SS2) 138 * 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3) 139 * 140 * Shift_Jis is 141 * [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc] 142 * 143 * Shift_Jis Hankaku Kana is 144 * [0xa1 - 0xdf] 145 */ 146 147/* 148 * KanjiCode::judge() is based on judge_jcode() from jvim 149 * http://hp.vector.co.jp/authors/VA003457/vim/ 150 * 151 * Special Thanks to Kenichi Tsuchida 152 */ 153 154enum KanjiCode::Type KanjiCode::judge(const char* str, int size) 155{ 156 enum Type code; 157 int i; 158 int bfr = false; /* Kana Moji */ 159 int bfk = 0; /* EUC Kana */ 160 int sjis = 0; 161 int euc = 0; 162 163 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str); 164 165 code = ASCII; 166 167 i = 0; 168 while (i < size) { 169 if (ptr[i] == ESC && (size - i >= 3)) { 170 if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B') 171 || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) { 172 code = JIS; 173 goto breakBreak; 174 } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@') 175 || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) { 176 code = JIS; 177 goto breakBreak; 178 } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') { 179 code = JIS; 180 i += 3; 181 } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') { 182 code = JIS; 183 i += 3; 184 } else { 185 i++; 186 } 187 bfr = false; 188 bfk = 0; 189 } else { 190 if (ptr[i] < 0x20) { 191 bfr = false; 192 bfk = 0; 193 /* ?? check kudokuten ?? && ?? hiragana ?? */ 194 if ((i >= 2) && (ptr[i - 2] == 0x81) 195 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) { 196 code = SJIS; 197 sjis += 100; /* kudokuten */ 198 } else if ((i >= 2) && (ptr[i - 2] == 0xa1) 199 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) { 200 code = EUC; 201 euc += 100; /* kudokuten */ 202 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) { 203 sjis += 40; /* hiragana */ 204 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) { 205 euc += 40; /* hiragana */ 206 } 207 } else { 208 /* ?? check hiragana or katana ?? */ 209 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) { 210 sjis++; /* hiragana */ 211 } else if ((size - i > 1) && (ptr[i] == 0x83) 212 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) { 213 sjis++; /* katakana */ 214 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) { 215 euc++; /* hiragana */ 216 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) { 217 euc++; /* katakana */ 218 } 219 if (bfr) { 220 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) { 221 code = SJIS; 222 goto breakBreak; 223 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) { 224 code = SJIS; 225 goto breakBreak; 226 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) { 227 code = EUC; 228 goto breakBreak; 229 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) { 230 code = EUC; 231 goto breakBreak; 232 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) { 233 code = SJIS; 234 goto breakBreak; 235 } else if (ptr[i] <= 0x7f) { 236 code = SJIS; 237 goto breakBreak; 238 } else { 239 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) { 240 euc++; /* sjis hankaku kana kigo */ 241 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) { 242 ; /* sjis hankaku kana */ 243 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) { 244 euc++; 245 } else if (0x8e == ptr[i]) { 246 euc++; 247 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) { 248 sjis++; 249 } 250 bfr = false; 251 bfk = 0; 252 } 253 } else if (0x8e == ptr[i]) { 254 if (size - i <= 1) { 255 ; 256 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) { 257 /* EUC KANA or SJIS KANJI */ 258 if (bfk == 1) { 259 euc += 100; 260 } 261 bfk++; 262 i++; 263 } else { 264 /* SJIS only */ 265 code = SJIS; 266 goto breakBreak; 267 } 268 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) { 269 /* SJIS only */ 270 code = SJIS; 271 if ((size - i >= 1) 272 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e) 273 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) { 274 goto breakBreak; 275 } 276 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) { 277 /* EUC only */ 278 code = EUC; 279 if ((size - i >= 1) 280 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) { 281 goto breakBreak; 282 } 283 } else if (ptr[i] <= 0x7f) { 284 ; 285 } else { 286 bfr = true; 287 bfk = 0; 288 } 289 } 290 i++; 291 } 292 } 293 if (code == ASCII) { 294 if (sjis > euc) { 295 code = SJIS; 296 } else if (sjis < euc) { 297 code = EUC; 298 } 299 } 300breakBreak: 301 return (code); 302} 303 304TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType) 305{ 306 if (equalIgnoringCase(mimeType, "text/css")) 307 return CSS; 308 if (equalIgnoringCase(mimeType, "text/html")) 309 return HTML; 310 if (DOMImplementation::isXMLMIMEType(mimeType)) 311 return XML; 312 return PlainText; 313} 314 315const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding) 316{ 317 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII 318 // for text/xml. This matches Firefox. 319 if (contentType == XML) 320 return UTF8Encoding(); 321 if (!specifiedDefaultEncoding.isValid()) 322 return Latin1Encoding(); 323 return specifiedDefaultEncoding; 324} 325 326TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) 327 : m_contentType(determineContentType(mimeType)) 328 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) 329 , m_source(DefaultEncoding) 330 , m_hintEncoding(0) 331 , m_checkedForBOM(false) 332 , m_checkedForCSSCharset(false) 333 , m_checkedForHeadCharset(false) 334 , m_useLenientXMLDecoding(false) 335 , m_sawError(false) 336 , m_usesEncodingDetector(usesEncodingDetector) 337{ 338} 339 340TextResourceDecoder::~TextResourceDecoder() 341{ 342} 343 344void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source) 345{ 346 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 347 if (!encoding.isValid()) 348 return; 349 350 // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR), 351 // treat x-user-defined as windows-1252 (bug 18270) 352 if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0) 353 m_encoding = "windows-1252"; 354 else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset) 355 m_encoding = encoding.closestByteBasedEquivalent(); 356 else 357 m_encoding = encoding; 358 359 m_codec.clear(); 360 m_source = source; 361} 362 363// Returns the position of the encoding string. 364static int findXMLEncoding(const char* str, int len, int& encodingLength) 365{ 366 int pos = find(str, len, "encoding"); 367 if (pos == -1) 368 return -1; 369 pos += 8; 370 371 // Skip spaces and stray control characters. 372 while (pos < len && str[pos] <= ' ') 373 ++pos; 374 375 // Skip equals sign. 376 if (pos >= len || str[pos] != '=') 377 return -1; 378 ++pos; 379 380 // Skip spaces and stray control characters. 381 while (pos < len && str[pos] <= ' ') 382 ++pos; 383 384 // Skip quotation mark. 385 if (pos >= len) 386 return - 1; 387 char quoteMark = str[pos]; 388 if (quoteMark != '"' && quoteMark != '\'') 389 return -1; 390 ++pos; 391 392 // Find the trailing quotation mark. 393 int end = pos; 394 while (end < len && str[end] != quoteMark) 395 ++end; 396 if (end >= len) 397 return -1; 398 399 encodingLength = end - pos; 400 return pos; 401} 402 403// true if there is more to parse 404static inline bool skipWhitespace(const char*& pos, const char* dataEnd) 405{ 406 while (pos < dataEnd && (*pos == '\t' || *pos == ' ')) 407 ++pos; 408 return pos != dataEnd; 409} 410 411size_t TextResourceDecoder::checkForBOM(const char* data, size_t len) 412{ 413 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 414 // We let it override even a user-chosen encoding. 415 ASSERT(!m_checkedForBOM); 416 417 size_t lengthOfBOM = 0; 418 419 size_t bufferLength = m_buffer.size(); 420 421 size_t buf1Len = bufferLength; 422 size_t buf2Len = len; 423 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); 424 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 425 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 426 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 427 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 428 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 429 430 // Check for the BOM. 431 if (c1 == 0xFF && c2 == 0xFE) { 432 if (c3 != 0 || c4 != 0) { 433 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 434 lengthOfBOM = 2; 435 } else { 436 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 437 lengthOfBOM = 4; 438 } 439 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 440 setEncoding(UTF8Encoding(), AutoDetectedEncoding); 441 lengthOfBOM = 3; 442 } else if (c1 == 0xFE && c2 == 0xFF) { 443 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 444 lengthOfBOM = 2; 445 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { 446 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 447 lengthOfBOM = 4; 448 } 449 450 if (lengthOfBOM || bufferLength + len >= 4) 451 m_checkedForBOM = true; 452 453 return lengthOfBOM; 454} 455 456bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer) 457{ 458 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { 459 m_checkedForCSSCharset = true; 460 return true; 461 } 462 463 size_t oldSize = m_buffer.size(); 464 m_buffer.grow(oldSize + len); 465 memcpy(m_buffer.data() + oldSize, data, len); 466 467 movedDataToBuffer = true; 468 469 if (m_buffer.size() > 8) { // strlen("@charset") == 8 470 const char* dataStart = m_buffer.data(); 471 const char* dataEnd = dataStart + m_buffer.size(); 472 473 if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' && 474 dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') { 475 476 dataStart += 8; 477 const char* pos = dataStart; 478 if (!skipWhitespace(pos, dataEnd)) 479 return false; 480 481 if (*pos == '"' || *pos == '\'') { 482 char quotationMark = *pos; 483 ++pos; 484 dataStart = pos; 485 486 while (pos < dataEnd && *pos != quotationMark) 487 ++pos; 488 if (pos == dataEnd) 489 return false; 490 491 int encodingNameLength = pos - dataStart + 1; 492 493 ++pos; 494 if (!skipWhitespace(pos, dataEnd)) 495 return false; 496 497 if (*pos == ';') 498 setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset); 499 } 500 } 501 m_checkedForCSSCharset = true; 502 return true; 503 } 504 return false; 505} 506 507// Other browsers allow comments in the head section, so we need to also. 508// It's important not to look for tags inside the comments. 509static inline void skipComment(const char*& ptr, const char* pEnd) 510{ 511 const char* p = ptr; 512 if (p == pEnd) 513 return; 514 // Allow <!-->; other browsers do. 515 if (*p == '>') { 516 p++; 517 } else { 518 while (p + 2 < pEnd) { 519 if (*p == '-') { 520 // This is the real end of comment, "-->". 521 if (p[1] == '-' && p[2] == '>') { 522 p += 3; 523 break; 524 } 525 // This is the incorrect end of comment that other browsers allow, "--!>". 526 if (p + 3 < pEnd && p[1] == '-' && p[2] == '!' && p[3] == '>') { 527 p += 4; 528 break; 529 } 530 } 531 p++; 532 } 533 } 534 ptr = p; 535} 536 537const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. 538 539bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer) 540{ 541 if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) { 542 m_checkedForHeadCharset = true; 543 return true; 544 } 545 546 // This is not completely efficient, since the function might go 547 // through the HTML head several times. 548 549 size_t oldSize = m_buffer.size(); 550 m_buffer.grow(oldSize + len); 551 memcpy(m_buffer.data() + oldSize, data, len); 552 553 movedDataToBuffer = true; 554 555 const char* ptr = m_buffer.data(); 556 const char* pEnd = ptr + m_buffer.size(); 557 558 // Is there enough data available to check for XML declaration? 559 if (m_buffer.size() < 8) 560 return false; 561 562 // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents. 563 // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case. 564 if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') { 565 const char* xmlDeclarationEnd = ptr; 566 while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>') 567 ++xmlDeclarationEnd; 568 if (xmlDeclarationEnd == pEnd) 569 return false; 570 // No need for +1, because we have an extra "?" to lose at the end of XML declaration. 571 int len; 572 int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len); 573 if (pos != -1) 574 setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader); 575 // continue looking for a charset - it may be specified in an HTTP-Equiv meta 576 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) { 577 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 578 return true; 579 } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') { 580 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 581 return true; 582 } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) { 583 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 584 return true; 585 } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') { 586 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 587 return true; 588 } 589 590 // we still don't have an encoding, and are in the head 591 // the following tags are allowed in <head>: 592 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE 593 594 // We stop scanning when a tag that is not permitted in <head> 595 // is seen, rather when </head> is seen, because that more closely 596 // matches behavior in other browsers; more details in 597 // <http://bugs.webkit.org/show_bug.cgi?id=3590>. 598 599 // Additionally, we ignore things that looks like tags in <title>, <script> and <noscript>; see 600 // <http://bugs.webkit.org/show_bug.cgi?id=4560>, <http://bugs.webkit.org/show_bug.cgi?id=12165> 601 // and <http://bugs.webkit.org/show_bug.cgi?id=12389>. 602 603 // Since many sites have charset declarations after <body> or other tags that are disallowed in <head>, 604 // we don't bail out until we've checked at least bytesToCheckUnconditionally bytes of input. 605 606 AtomicStringImpl* enclosingTagName = 0; 607 bool inHeadSection = true; // Becomes false when </head> or any tag not allowed in head is encountered. 608 609 // the HTTP-EQUIV meta has no effect on XHTML 610 if (m_contentType == XML) 611 return true; 612 613 while (ptr + 3 < pEnd) { // +3 guarantees that "<!--" fits in the buffer - and certainly we aren't going to lose any "charset" that way. 614 if (*ptr == '<') { 615 bool end = false; 616 ptr++; 617 618 // Handle comments. 619 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { 620 ptr += 3; 621 skipComment(ptr, pEnd); 622 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) { 623 // Some pages that test bandwidth from within the browser do it by having 624 // huge comments and measuring the time they take to load. Repeatedly scanning 625 // these comments can take a lot of CPU time. 626 m_checkedForHeadCharset = true; 627 return true; 628 } 629 continue; 630 } 631 632 if (*ptr == '/') { 633 ++ptr; 634 end = true; 635 } 636 637 // Grab the tag name, but mostly ignore namespaces. 638 bool sawNamespace = false; 639 char tagBuffer[20]; 640 int len = 0; 641 while (len < 19) { 642 if (ptr == pEnd) 643 return false; 644 char c = *ptr; 645 if (c == ':') { 646 len = 0; 647 sawNamespace = true; 648 ptr++; 649 continue; 650 } 651 if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) 652 ; 653 else if (c >= 'A' && c <= 'Z') 654 c += 'a' - 'A'; 655 else 656 break; 657 tagBuffer[len++] = c; 658 ptr++; 659 } 660 tagBuffer[len] = 0; 661 AtomicString tag(tagBuffer); 662 663 if (enclosingTagName) { 664 if (end && tag.impl() == enclosingTagName) 665 enclosingTagName = 0; 666 } else { 667 if (tag == titleTag) 668 enclosingTagName = titleTag.localName().impl(); 669 else if (tag == scriptTag) 670 enclosingTagName = scriptTag.localName().impl(); 671 else if (tag == noscriptTag) 672 enclosingTagName = noscriptTag.localName().impl(); 673 } 674 675 // Find where the opening tag ends. 676 const char* tagContentStart = ptr; 677 if (!end) { 678 while (ptr != pEnd && *ptr != '>') { 679 if (*ptr == '\'' || *ptr == '"') { 680 char quoteMark = *ptr; 681 ++ptr; 682 while (ptr != pEnd && *ptr != quoteMark) 683 ++ptr; 684 if (ptr == pEnd) 685 return false; 686 } 687 ++ptr; 688 } 689 if (ptr == pEnd) 690 return false; 691 ++ptr; 692 } 693 694 if (!end && tag == metaTag && !sawNamespace) { 695 const char* str = tagContentStart; 696 int length = ptr - tagContentStart; 697 int pos = 0; 698 while (pos < length) { 699 int charsetPos = findIgnoringCase(str + pos, length - pos, "charset"); 700 if (charsetPos == -1) 701 break; 702 pos += charsetPos + 7; 703 // skip whitespace 704 while (pos < length && str[pos] <= ' ') 705 pos++; 706 if (pos == length) 707 break; 708 if (str[pos++] != '=') 709 continue; 710 while ((pos < length) && 711 (str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')) 712 pos++; 713 714 // end ? 715 if (pos == length) 716 break; 717 int end = pos; 718 while (end < length && 719 str[end] != ' ' && str[end] != '"' && str[end] != '\'' && 720 str[end] != ';' && str[end] != '>') 721 end++; 722 setEncoding(findTextEncoding(str + pos, end - pos), EncodingFromMetaTag); 723 if (m_source == EncodingFromMetaTag) 724 return true; 725 726 if (end >= length || str[end] == '/' || str[end] == '>') 727 break; 728 729 pos = end + 1; 730 } 731 } else { 732 if (!enclosingTagName && tag != scriptTag && tag != noscriptTag && tag != styleTag 733 && tag != linkTag && tag != metaTag && tag != objectTag && tag != titleTag && tag != baseTag 734 && (end || tag != htmlTag) && (end || tag != headTag) && isASCIIAlpha(tagBuffer[0])) { 735 inHeadSection = false; 736 } 737 738 if (ptr - m_buffer.data() >= bytesToCheckUnconditionally && !inHeadSection) { 739 m_checkedForHeadCharset = true; 740 return true; 741 } 742 } 743 } else 744 ++ptr; 745 } 746 return false; 747} 748 749void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len) 750{ 751 switch (KanjiCode::judge(data, len)) { 752 case KanjiCode::JIS: 753 setEncoding("ISO-2022-JP", AutoDetectedEncoding); 754 break; 755 case KanjiCode::EUC: 756 setEncoding("EUC-JP", AutoDetectedEncoding); 757 break; 758 case KanjiCode::SJIS: 759 setEncoding("Shift_JIS", AutoDetectedEncoding); 760 break; 761 case KanjiCode::ASCII: 762 case KanjiCode::UTF16: 763 case KanjiCode::UTF8: 764 break; 765 } 766} 767 768// We use the encoding detector in two cases: 769// 1. Encoding detector is turned ON and no other encoding source is 770// available (that is, it's DefaultEncoding). 771// 2. Encoding detector is turned ON and the encoding is set to 772// the encoding of the parent frame, which is also auto-detected. 773// Note that condition #2 is NOT satisfied unless parent-child frame 774// relationship is compliant to the same-origin policy. If they're from 775// different domains, |m_source| would not be set to EncodingFromParentFrame 776// in the first place. 777bool TextResourceDecoder::shouldAutoDetect() const 778{ 779 // Just checking m_hintEncoding suffices here because it's only set 780 // in setHintEncoding when the source is AutoDetectedEncoding. 781 return m_usesEncodingDetector 782 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); 783} 784 785String TextResourceDecoder::decode(const char* data, size_t len) 786{ 787 size_t lengthOfBOM = 0; 788 if (!m_checkedForBOM) 789 lengthOfBOM = checkForBOM(data, len); 790 791 bool movedDataToBuffer = false; 792 793 if (m_contentType == CSS && !m_checkedForCSSCharset) 794 if (!checkForCSSCharset(data, len, movedDataToBuffer)) 795 return ""; 796 797 if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML 798 if (!checkForHeadCharset(data, len, movedDataToBuffer)) 799 return ""; 800 801 // FIXME: It is wrong to change the encoding downstream after we have already done some decoding. 802 if (shouldAutoDetect()) { 803 if (m_encoding.isJapanese()) 804 detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages. 805 else { 806 TextEncoding detectedEncoding; 807 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) 808 setEncoding(detectedEncoding, AutoDetectedEncoding); 809 } 810 } 811 812 ASSERT(m_encoding.isValid()); 813 814 if (!m_codec) 815 m_codec.set(newTextCodec(m_encoding).release()); 816 817 if (m_buffer.isEmpty()) 818 return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError); 819 820 if (!movedDataToBuffer) { 821 size_t oldSize = m_buffer.size(); 822 m_buffer.grow(oldSize + len); 823 memcpy(m_buffer.data() + oldSize, data, len); 824 } 825 826 String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); 827 m_buffer.clear(); 828 return result; 829} 830 831String TextResourceDecoder::flush() 832{ 833 // If we can not identify the encoding even after a document is completely 834 // loaded, we need to detect the encoding if other conditions for 835 // autodetection is satisfied. 836 if (m_buffer.size() && shouldAutoDetect() 837 && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) { 838 TextEncoding detectedEncoding; 839 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), 840 m_hintEncoding, &detectedEncoding)) 841 setEncoding(detectedEncoding, AutoDetectedEncoding); 842 } 843 844 if (!m_codec) 845 m_codec.set(newTextCodec(m_encoding).release()); 846 847 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError); 848 m_buffer.clear(); 849 m_codec.clear(); 850 m_checkedForBOM = false; // Skip BOM again when re-decoding. 851 return result; 852} 853 854} 855