1/* 2 * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2008, 2009 Google Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are 7 * met: 8 * 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following disclaimer 13 * in the documentation and/or other materials provided with the 14 * distribution. 15 * * Neither the name of Google Inc. nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32#include "config.h" 33 34#if USE(GOOGLEURL) 35#include "KURL.h" 36 37#ifndef NDEBUG 38#include <stdio.h> 39#endif 40 41#include <algorithm> 42 43#include "CString.h" 44#include "StringHash.h" 45#include "NotImplemented.h" 46#include "TextEncoding.h" 47#include <wtf/HashMap.h> 48#include <wtf/Vector.h> 49#include <wtf/StdLibExtras.h> 50 51#include <googleurl/src/url_canon_internal.h> 52#include <googleurl/src/url_util.h> 53 54using WTF::isASCIILower; 55using WTF::toASCIILower; 56using std::binary_search; 57 58namespace WebCore { 59 60// Wraps WebCore's text encoding in a character set converter for the 61// canonicalizer. 62class KURLCharsetConverter : public url_canon::CharsetConverter { 63public: 64 // The encoding parameter may be NULL, but in this case the object must not 65 // be called. 66 KURLCharsetConverter(const TextEncoding* encoding) 67 : m_encoding(encoding) 68 { 69 } 70 71 virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength, 72 url_canon::CanonOutput* output) 73 { 74 CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables); 75 output->Append(encoded.data(), static_cast<int>(encoded.length())); 76 } 77 78private: 79 const TextEncoding* m_encoding; 80}; 81 82// Note that this function must be named differently than the one in KURL.cpp 83// since our unit tests evilly include both files, and their local definition 84// will be ambiguous. 85static inline void assertProtocolIsGood(const char* protocol) 86{ 87#ifndef NDEBUG 88 const char* p = protocol; 89 while (*p) { 90 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 91 ++p; 92 } 93#endif 94} 95 96// Returns the characters for the given string, or a pointer to a static empty 97// string if the input string is NULL. This will always ensure we have a non- 98// NULL character pointer since ReplaceComponents has special meaning for NULL. 99static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str) 100{ 101 static const url_parse::UTF16Char zero = 0; 102 return str.characters() ? 103 reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) : 104 &zero; 105} 106 107static inline bool isUnicodeEncoding(const TextEncoding* encoding) 108{ 109 return encoding->encodingForFormSubmission() == UTF8Encoding(); 110} 111 112static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str) 113{ 114 while (begin != end && *str) { 115 ASSERT(toASCIILower(*str) == *str); 116 if (toASCIILower(*begin++) != *str++) 117 return false; 118 } 119 120 // Both strings are equal (ignoring case) if and only if all of the characters were equal, 121 // and the end of both has been reached. 122 return begin == end && !*str; 123} 124 125static inline bool isSchemeFirstChar(char c) 126{ 127 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 128} 129 130static inline bool isSchemeChar(char c) 131{ 132 return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '*'; 133} 134 135 136// KURLGooglePrivate ----------------------------------------------------------- 137 138KURLGooglePrivate::KURLGooglePrivate() 139 : m_isValid(false) 140 , m_protocolInHTTPFamily(false) 141 , m_utf8IsASCII(true) 142 , m_stringIsValid(false) 143{ 144} 145 146KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid) 147 : m_isValid(isValid) 148 , m_protocolInHTTPFamily(false) 149 , m_parsed(parsed) 150 , m_utf8IsASCII(true) 151 , m_stringIsValid(false) 152{ 153} 154 155// Setters for the data. Using the ASCII version when you know the 156// data is ASCII will be slightly more efficient. The UTF-8 version 157// will always be correct if the caller is unsure. 158void KURLGooglePrivate::setUtf8(const CString& str) 159{ 160 const char* data = str.data(); 161 unsigned dataLength = str.length(); 162 163 // The m_utf8IsASCII must always be correct since the DeprecatedString 164 // getter must create it with the proper constructor. This test can be 165 // removed when DeprecatedString is gone, but it still might be a 166 // performance win. 167 m_utf8IsASCII = true; 168 for (unsigned i = 0; i < dataLength; i++) { 169 if (static_cast<unsigned char>(data[i]) >= 0x80) { 170 m_utf8IsASCII = false; 171 break; 172 } 173 } 174 175 m_utf8 = str; 176 m_stringIsValid = false; 177 initProtocolInHTTPFamily(); 178} 179 180void KURLGooglePrivate::setAscii(const CString& str) 181{ 182 m_utf8 = str; 183 m_utf8IsASCII = true; 184 m_stringIsValid = false; 185 initProtocolInHTTPFamily(); 186} 187 188void KURLGooglePrivate::init(const KURL& base, 189 const String& relative, 190 const TextEncoding* queryEncoding) 191{ 192 init(base, relative.characters(), relative.length(), queryEncoding); 193} 194 195// Note: code mostly duplicated below. 196void KURLGooglePrivate::init(const KURL& base, const char* rel, int relLength, 197 const TextEncoding* queryEncoding) 198{ 199 // As a performance optimization, we do not use the charset converter if 200 // encoding is UTF-8 or other Unicode encodings. Note that this is 201 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be 202 // more efficient with no charset converter object because it 203 // can do UTF-8 internally with no extra copies. 204 205 // We feel free to make the charset converter object every time since it's 206 // just a wrapper around a reference. 207 KURLCharsetConverter charsetConverterObject(queryEncoding); 208 KURLCharsetConverter* charsetConverter = 209 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 : 210 &charsetConverterObject; 211 212 url_canon::RawCanonOutputT<char> output; 213 const CString& baseStr = base.m_url.utf8String(); 214 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(), 215 base.m_url.m_parsed, rel, relLength, 216 charsetConverter, 217 &output, &m_parsed); 218 219 // See FIXME in KURLGooglePrivate in the header. If canonicalization has not 220 // changed the string, we can avoid an extra allocation by using assignment. 221 // 222 // When KURL encounters an error such that the URL is invalid and empty 223 // (for example, resolving a relative URL on a non-hierarchical base), it 224 // will produce an isNull URL, and calling setUtf8 will produce an empty 225 // non-null URL. This is unlikely to affect anything, but we preserve this 226 // just in case. 227 if (m_isValid || output.length()) { 228 // Without ref, the whole url is guaranteed to be ASCII-only. 229 if (m_parsed.ref.is_nonempty()) 230 setUtf8(CString(output.data(), output.length())); 231 else 232 setAscii(CString(output.data(), output.length())); 233 } else { 234 // WebCore expects resolved URLs to be empty rather than NULL. 235 setUtf8(CString("", 0)); 236 } 237} 238 239// Note: code mostly duplicated above. See FIXMEs and comments there. 240void KURLGooglePrivate::init(const KURL& base, const UChar* rel, int relLength, 241 const TextEncoding* queryEncoding) 242{ 243 KURLCharsetConverter charsetConverterObject(queryEncoding); 244 KURLCharsetConverter* charsetConverter = 245 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 : 246 &charsetConverterObject; 247 248 url_canon::RawCanonOutputT<char> output; 249 const CString& baseStr = base.m_url.utf8String(); 250 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(), 251 base.m_url.m_parsed, rel, relLength, 252 charsetConverter, 253 &output, &m_parsed); 254 255 256 if (m_isValid || output.length()) { 257 if (m_parsed.ref.is_nonempty()) 258 setUtf8(CString(output.data(), output.length())); 259 else 260 setAscii(CString(output.data(), output.length())); 261 } else 262 setUtf8(CString("", 0)); 263} 264 265void KURLGooglePrivate::initProtocolInHTTPFamily() 266{ 267 if (!m_isValid) { 268 m_protocolInHTTPFamily = false; 269 return; 270 } 271 272 const char* scheme = m_utf8.data() + m_parsed.scheme.begin; 273 if (m_parsed.scheme.len == 4) 274 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http"); 275 else if (m_parsed.scheme.len == 5) 276 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https"); 277 else 278 m_protocolInHTTPFamily = false; 279} 280 281void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const 282{ 283 dest->m_isValid = m_isValid; 284 dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily; 285 dest->m_parsed = m_parsed; 286 287 // Don't copy the 16-bit string since that will be regenerated as needed. 288 dest->m_utf8 = CString(m_utf8.data(), m_utf8.length()); 289 dest->m_utf8IsASCII = m_utf8IsASCII; 290 dest->m_stringIsValid = false; 291} 292 293String KURLGooglePrivate::componentString(const url_parse::Component& comp) const 294{ 295 if (!m_isValid || comp.len <= 0) { 296 // KURL returns a NULL string if the URL is itself a NULL string, and an 297 // empty string for other nonexistant entities. 298 if (utf8String().isNull()) 299 return String(); 300 return String("", 0); 301 } 302 // begin and len are in terms of bytes which do not match 303 // if string() is UTF-16 and input contains non-ASCII characters. 304 // However, the only part in urlString that can contain non-ASCII 305 // characters is 'ref' at the end of the string. In that case, 306 // begin will always match the actual value and len (in terms of 307 // byte) will be longer than what's needed by 'mid'. However, mid 308 // truncates len to avoid go past the end of a string so that we can 309 // get away withtout doing anything here. 310 return string().substring(comp.begin, comp.len); 311} 312 313void KURLGooglePrivate::replaceComponents(const Replacements& replacements) 314{ 315 url_canon::RawCanonOutputT<char> output; 316 url_parse::Parsed newParsed; 317 318 m_isValid = url_util::ReplaceComponents(utf8String().data(), 319 utf8String().length(), m_parsed, replacements, 0, &output, &newParsed); 320 321 m_parsed = newParsed; 322 if (m_parsed.ref.is_nonempty()) 323 setUtf8(CString(output.data(), output.length())); 324 else 325 setAscii(CString(output.data(), output.length())); 326} 327 328const String& KURLGooglePrivate::string() const 329{ 330 if (!m_stringIsValid) { 331 // Must special case the NULL case, since constructing the 332 // string like we do below will generate an empty rather than 333 // a NULL string. 334 if (m_utf8.isNull()) 335 m_string = String(); 336 else if (m_utf8IsASCII) 337 m_string = String(m_utf8.data(), m_utf8.length()); 338 else 339 m_string = String::fromUTF8(m_utf8.data(), m_utf8.length()); 340 m_stringIsValid = true; 341 } 342 return m_string; 343} 344 345// KURL ------------------------------------------------------------------------ 346 347// Creates with NULL-terminated string input representing an absolute URL. 348// WebCore generally calls this only with hardcoded strings, so the input is 349// ASCII. We treat is as UTF-8 just in case. 350KURL::KURL(ParsedURLStringTag, const char *url) 351{ 352 // FIXME The Mac code checks for beginning with a slash and converting to a 353 // file: URL. We will want to add this as well once we can compile on a 354 // system like that. 355 m_url.init(KURL(), url, strlen(url), 0); 356 357 // The one-argument constructors should never generate a NULL string. 358 // This is a funny quirk of KURL.cpp (probably a bug) which we preserve. 359 if (m_url.utf8String().isNull()) 360 m_url.setAscii(CString("", 0)); 361} 362 363// Initializes with a string representing an absolute URL. No encoding 364// information is specified. This generally happens when a KURL is converted 365// to a string and then converted back. In this case, the URL is already 366// canonical and in proper escaped form so needs no encoding. We treat it was 367// UTF-8 just in case. 368KURL::KURL(ParsedURLStringTag, const String& url) 369{ 370 if (!url.isNull()) 371 m_url.init(KURL(), url, 0); 372 else { 373 // WebCore expects us to preserve the nullness of strings when this 374 // constructor is used. In all other cases, it expects a non-null 375 // empty string, which is what init() will create. 376 m_url.m_isValid = false; 377 m_url.m_protocolInHTTPFamily = false; 378 } 379} 380 381// Constructs a new URL given a base URL and a possibly relative input URL. 382// This assumes UTF-8 encoding. 383KURL::KURL(const KURL& base, const String& relative) 384{ 385 m_url.init(base, relative, 0); 386} 387 388// Constructs a new URL given a base URL and a possibly relative input URL. 389// Any query portion of the relative URL will be encoded in the given encoding. 390KURL::KURL(const KURL& base, 391 const String& relative, 392 const TextEncoding& encoding) 393{ 394 m_url.init(base, relative, &encoding.encodingForFormSubmission()); 395} 396 397KURL::KURL(const CString& canonicalSpec, 398 const url_parse::Parsed& parsed, bool isValid) 399 : m_url(parsed, isValid) 400{ 401 // We know the reference fragment is the only part that can be UTF-8, so 402 // we know it's ASCII when there is no ref. 403 if (parsed.ref.is_nonempty()) 404 m_url.setUtf8(canonicalSpec); 405 else 406 m_url.setAscii(canonicalSpec); 407} 408 409#if PLATFORM(CF) 410KURL::KURL(CFURLRef) 411{ 412 notImplemented(); 413 invalidate(); 414} 415 416CFURLRef KURL::createCFURL() const 417{ 418 notImplemented(); 419 return 0; 420} 421#endif 422 423KURL KURL::copy() const 424{ 425 KURL result = *this; 426 m_url.copyTo(&result.m_url); 427 return result; 428} 429 430bool KURL::isNull() const 431{ 432 return m_url.utf8String().isNull(); 433} 434 435bool KURL::isEmpty() const 436{ 437 return !m_url.utf8String().length(); 438} 439 440bool KURL::isValid() const 441{ 442 return m_url.m_isValid; 443} 444 445bool KURL::hasPort() const 446{ 447 return hostEnd() < pathStart(); 448} 449 450bool KURL::protocolInHTTPFamily() const 451{ 452 return m_url.m_protocolInHTTPFamily; 453} 454 455bool KURL::hasPath() const 456{ 457 // Note that http://www.google.com/" has a path, the path is "/". This can 458 // return false only for invalid or nonstandard URLs. 459 return m_url.m_parsed.path.len >= 0; 460} 461 462// We handle "parameters" separated by a semicolon, while KURL.cpp does not, 463// which can lead to different results in some cases. 464String KURL::lastPathComponent() const 465{ 466 // When the output ends in a slash, WebCore has different expectations than 467 // the GoogleURL library. For "/foo/bar/" the library will return the empty 468 // string, but WebCore wants "bar". 469 url_parse::Component path = m_url.m_parsed.path; 470 if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/') 471 path.len--; 472 473 url_parse::Component file; 474 url_parse::ExtractFileName(m_url.utf8String().data(), path, &file); 475 476 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 477 // a null string when the path is empty, which we duplicate here. 478 if (!file.is_nonempty()) 479 return String(); 480 return m_url.componentString(file); 481} 482 483String KURL::protocol() const 484{ 485 return m_url.componentString(m_url.m_parsed.scheme); 486} 487 488String KURL::host() const 489{ 490 // Note: KURL.cpp unescapes here. 491 return m_url.componentString(m_url.m_parsed.host); 492} 493 494// Returns 0 when there is no port or it is invalid. 495// 496// We treat URL's with out-of-range port numbers as invalid URLs, and they will 497// be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but 498// return 0 from this port() function, so we mirror that behavior here. 499unsigned short KURL::port() const 500{ 501 if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0) 502 return 0; 503 int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port); 504 if (port == url_parse::PORT_UNSPECIFIED) 505 return 0; 506 return static_cast<unsigned short>(port); 507} 508 509// Returns the empty string if there is no password. 510String KURL::pass() const 511{ 512 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 513 // a null string when the password is empty, which we duplicate here. 514 if (!m_url.m_parsed.password.is_nonempty()) 515 return String(); 516 517 // Note: KURL.cpp unescapes here. 518 return m_url.componentString(m_url.m_parsed.password); 519} 520 521// Returns the empty string if there is no username. 522String KURL::user() const 523{ 524 // Note: KURL.cpp unescapes here. 525 return m_url.componentString(m_url.m_parsed.username); 526} 527 528String KURL::fragmentIdentifier() const 529{ 530 // Empty but present refs ("foo.com/bar#") should result in the empty 531 // string, which m_url.componentString will produce. Nonexistant refs should be 532 // the NULL string. 533 if (!m_url.m_parsed.ref.is_valid()) 534 return String(); 535 536 // Note: KURL.cpp unescapes here. 537 return m_url.componentString(m_url.m_parsed.ref); 538} 539 540bool KURL::hasFragmentIdentifier() const 541{ 542 // Note: KURL.cpp unescapes here. 543 // FIXME determine if KURL.cpp agrees about an empty ref 544 return m_url.m_parsed.ref.len >= 0; 545} 546 547String KURL::baseAsString() const 548{ 549 // FIXME: There is probably a more efficient way to do this? 550 return string().left(pathAfterLastSlash()); 551} 552 553String KURL::query() const 554{ 555 if (m_url.m_parsed.query.len >= 0) 556 return m_url.componentString(m_url.m_parsed.query); 557 558 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 559 // an empty string when the query is empty rather than a null (not sure 560 // which is right). 561 // Returns a null if the query is not specified, instead of empty. 562 if (m_url.m_parsed.query.is_valid()) 563 return String("", 0); 564 return String(); 565} 566 567String KURL::path() const 568{ 569 // Note: KURL.cpp unescapes here. 570 return m_url.componentString(m_url.m_parsed.path); 571} 572 573bool KURL::setProtocol(const String& protocol) 574{ 575 KURLGooglePrivate::Replacements replacements; 576 replacements.SetScheme(CharactersOrEmpty(protocol), 577 url_parse::Component(0, protocol.length())); 578 m_url.replaceComponents(replacements); 579 return true; 580} 581 582void KURL::setHost(const String& host) 583{ 584 KURLGooglePrivate::Replacements replacements; 585 replacements.SetHost(CharactersOrEmpty(host), 586 url_parse::Component(0, host.length())); 587 m_url.replaceComponents(replacements); 588} 589 590void KURL::setHostAndPort(const String& s) 591{ 592 String host = s; 593 String port; 594 int hostEnd = s.find(":"); 595 if (hostEnd != -1) { 596 host = s.left(hostEnd); 597 port = s.substring(hostEnd + 1); 598 } 599 600 KURLGooglePrivate::Replacements replacements; 601 // Host can't be removed, so we always set. 602 replacements.SetHost(CharactersOrEmpty(host), 603 url_parse::Component(0, host.length())); 604 605 if (port.isEmpty()) // Port may be removed, so we support clearing. 606 replacements.ClearPort(); 607 else 608 replacements.SetPort(CharactersOrEmpty(port), url_parse::Component(0, port.length())); 609 m_url.replaceComponents(replacements); 610} 611 612void KURL::removePort() 613{ 614 if (hasPort()) { 615 String urlWithoutPort = m_url.string().left(hostEnd()) + m_url.string().substring(pathStart()); 616 m_url.setUtf8(urlWithoutPort.utf8()); 617 } 618} 619 620void KURL::setPort(unsigned short i) 621{ 622 KURLGooglePrivate::Replacements replacements; 623 String portStr; 624 if (i) { 625 portStr = String::number(static_cast<int>(i)); 626 replacements.SetPort( 627 reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()), 628 url_parse::Component(0, portStr.length())); 629 630 } else { 631 // Clear any existing port when it is set to 0. 632 replacements.ClearPort(); 633 } 634 m_url.replaceComponents(replacements); 635} 636 637void KURL::setUser(const String& user) 638{ 639 // This function is commonly called to clear the username, which we 640 // normally don't have, so we optimize this case. 641 if (user.isEmpty() && !m_url.m_parsed.username.is_valid()) 642 return; 643 644 // The canonicalizer will clear any usernames that are empty, so we 645 // don't have to explicitly call ClearUsername() here. 646 KURLGooglePrivate::Replacements replacements; 647 replacements.SetUsername(CharactersOrEmpty(user), 648 url_parse::Component(0, user.length())); 649 m_url.replaceComponents(replacements); 650} 651 652void KURL::setPass(const String& pass) 653{ 654 // This function is commonly called to clear the password, which we 655 // normally don't have, so we optimize this case. 656 if (pass.isEmpty() && !m_url.m_parsed.password.is_valid()) 657 return; 658 659 // The canonicalizer will clear any passwords that are empty, so we 660 // don't have to explicitly call ClearUsername() here. 661 KURLGooglePrivate::Replacements replacements; 662 replacements.SetPassword(CharactersOrEmpty(pass), 663 url_parse::Component(0, pass.length())); 664 m_url.replaceComponents(replacements); 665} 666 667void KURL::setFragmentIdentifier(const String& s) 668{ 669 // This function is commonly called to clear the ref, which we 670 // normally don't have, so we optimize this case. 671 if (s.isNull() && !m_url.m_parsed.ref.is_valid()) 672 return; 673 674 KURLGooglePrivate::Replacements replacements; 675 if (s.isNull()) 676 replacements.ClearRef(); 677 else 678 replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length())); 679 m_url.replaceComponents(replacements); 680} 681 682void KURL::removeFragmentIdentifier() 683{ 684 KURLGooglePrivate::Replacements replacements; 685 replacements.ClearRef(); 686 m_url.replaceComponents(replacements); 687} 688 689void KURL::setQuery(const String& query) 690{ 691 KURLGooglePrivate::Replacements replacements; 692 if (query.isNull()) { 693 // KURL.cpp sets to NULL to clear any query. 694 replacements.ClearQuery(); 695 } else if (query.length() > 0 && query[0] == '?') { 696 // WebCore expects the query string to begin with a question mark, but 697 // GoogleURL doesn't. So we trim off the question mark when setting. 698 replacements.SetQuery(CharactersOrEmpty(query), 699 url_parse::Component(1, query.length() - 1)); 700 } else { 701 // When set with the empty string or something that doesn't begin with 702 // a question mark, KURL.cpp will add a question mark for you. The only 703 // way this isn't compatible is if you call this function with an empty 704 // string. KURL.cpp will leave a '?' with nothing following it in the 705 // URL, whereas we'll clear it. 706 // FIXME We should eliminate this difference. 707 replacements.SetQuery(CharactersOrEmpty(query), 708 url_parse::Component(0, query.length())); 709 } 710 m_url.replaceComponents(replacements); 711} 712 713void KURL::setPath(const String& path) 714{ 715 // Empty paths will be canonicalized to "/", so we don't have to worry 716 // about calling ClearPath(). 717 KURLGooglePrivate::Replacements replacements; 718 replacements.SetPath(CharactersOrEmpty(path), 719 url_parse::Component(0, path.length())); 720 m_url.replaceComponents(replacements); 721} 722 723// On Mac, this just seems to return the same URL, but with "/foo/bar" for 724// file: URLs instead of file:///foo/bar. We don't bother with any of this, 725// at least for now. 726String KURL::prettyURL() const 727{ 728 if (!m_url.m_isValid) 729 return String(); 730 return m_url.string(); 731} 732 733bool protocolIsJavaScript(const String& url) 734{ 735 return protocolIs(url, "javascript"); 736} 737 738// We copied the KURL version here on Dec 4, 2009 while doing a WebKit 739// merge. 740// 741// FIXME Somehow share this with KURL? Like we'd theoretically merge with 742// decodeURLEscapeSequences below? 743bool isDefaultPortForProtocol(unsigned short port, const String& protocol) 744{ 745 if (protocol.isEmpty()) 746 return false; 747 748 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap; 749 DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); 750 if (defaultPorts.isEmpty()) { 751 defaultPorts.set("http", 80); 752 defaultPorts.set("https", 443); 753 defaultPorts.set("ftp", 21); 754 defaultPorts.set("ftps", 990); 755 } 756 return defaultPorts.get(protocol) == port; 757} 758 759// We copied the KURL version here on Dec 4, 2009 while doing a WebKit 760// merge. 761// 762// FIXME Somehow share this with KURL? Like we'd theoretically merge with 763// decodeURLEscapeSequences below? 764bool portAllowed(const KURL& url) 765{ 766 unsigned short port = url.port(); 767 768 // Since most URLs don't have a port, return early for the "no port" case. 769 if (!port) 770 return true; 771 772 // This blocked port list matches the port blocking that Mozilla implements. 773 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. 774 static const unsigned short blockedPortList[] = { 775 1, // tcpmux 776 7, // echo 777 9, // discard 778 11, // systat 779 13, // daytime 780 15, // netstat 781 17, // qotd 782 19, // chargen 783 20, // FTP-data 784 21, // FTP-control 785 22, // SSH 786 23, // telnet 787 25, // SMTP 788 37, // time 789 42, // name 790 43, // nicname 791 53, // domain 792 77, // priv-rjs 793 79, // finger 794 87, // ttylink 795 95, // supdup 796 101, // hostriame 797 102, // iso-tsap 798 103, // gppitnp 799 104, // acr-nema 800 109, // POP2 801 110, // POP3 802 111, // sunrpc 803 113, // auth 804 115, // SFTP 805 117, // uucp-path 806 119, // nntp 807 123, // NTP 808 135, // loc-srv / epmap 809 139, // netbios 810 143, // IMAP2 811 179, // BGP 812 389, // LDAP 813 465, // SMTP+SSL 814 512, // print / exec 815 513, // login 816 514, // shell 817 515, // printer 818 526, // tempo 819 530, // courier 820 531, // Chat 821 532, // netnews 822 540, // UUCP 823 556, // remotefs 824 563, // NNTP+SSL 825 587, // ESMTP 826 601, // syslog-conn 827 636, // LDAP+SSL 828 993, // IMAP+SSL 829 995, // POP3+SSL 830 2049, // NFS 831 3659, // apple-sasl / PasswordServer [Apple addition] 832 4045, // lockd 833 6000, // X11 834 }; 835 const unsigned short* const blockedPortListEnd = blockedPortList + sizeof(blockedPortList) / sizeof(blockedPortList[0]); 836 837#ifndef NDEBUG 838 // The port list must be sorted for binary_search to work. 839 static bool checkedPortList = false; 840 if (!checkedPortList) { 841 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) 842 ASSERT(*p < *(p + 1)); 843 checkedPortList = true; 844 } 845#endif 846 847 // If the port is not in the blocked port list, allow it. 848 if (!binary_search(blockedPortList, blockedPortListEnd, port)) 849 return true; 850 851 // Allow ports 21 and 22 for FTP URLs, as Mozilla does. 852 if ((port == 21 || port == 22) && url.protocolIs("ftp")) 853 return true; 854 855 // Allow any port number in a file URL, since the port number is ignored. 856 if (url.protocolIs("file")) 857 return true; 858 859 return false; 860} 861 862// We copied the KURL version here on Sept 12, 2008 while doing a WebKit 863// merge. 864// 865// FIXME Somehow share this with KURL? Like we'd theoretically merge with 866// decodeURLEscapeSequences below? 867String mimeTypeFromDataURL(const String& url) 868{ 869 ASSERT(protocolIs(url, "data")); 870 int index = url.find(';'); 871 if (index == -1) 872 index = url.find(','); 873 if (index != -1) { 874 int len = index - 5; 875 if (len > 0) 876 return url.substring(5, len); 877 return "text/plain"; // Data URLs with no MIME type are considered text/plain. 878 } 879 return ""; 880} 881 882String decodeURLEscapeSequences(const String& str) 883{ 884 return decodeURLEscapeSequences(str, UTF8Encoding()); 885} 886 887// In KURL.cpp's implementation, this is called by every component getter. 888// It will unescape every character, including NULL. This is scary, and may 889// cause security holes. We never call this function for components, and 890// just return the ASCII versions instead. 891// 892// This function is also used to decode javascript: URLs and as a general 893// purpose unescaping function. 894// 895// FIXME These should be merged to the KURL.cpp implementation. 896String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) 897{ 898 // FIXME We can probably use KURL.cpp's version of this function 899 // without modification. However, I'm concerned about 900 // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old 901 // custom code for now. Using their version will also fix the bug that 902 // we ignore the encoding. 903 // 904 // FIXME b/1350291: This does not get called very often. We just convert 905 // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of 906 // sucks, and we don't use the encoding properly, which will make some 907 // obscure anchor navigations fail. 908 CString cstr = str.utf8(); 909 910 const char* input = cstr.data(); 911 int inputLength = cstr.length(); 912 url_canon::RawCanonOutputT<char> unescaped; 913 for (int i = 0; i < inputLength; i++) { 914 if (input[i] == '%') { 915 unsigned char ch; 916 if (url_canon::DecodeEscaped(input, &i, inputLength, &ch)) 917 unescaped.push_back(ch); 918 else { 919 // Invalid escape sequence, copy the percent literal. 920 unescaped.push_back('%'); 921 } 922 } else { 923 // Regular non-escaped 8-bit character. 924 unescaped.push_back(input[i]); 925 } 926 } 927 928 // Convert that 8-bit to UTF-16. It's not clear IE does this at all to 929 // JavaScript URLs, but Firefox and Safari do. 930 url_canon::RawCanonOutputT<url_parse::UTF16Char> utf16; 931 for (int i = 0; i < unescaped.length(); i++) { 932 unsigned char uch = static_cast<unsigned char>(unescaped.at(i)); 933 if (uch < 0x80) { 934 // Non-UTF-8, just append directly 935 utf16.push_back(uch); 936 } else { 937 // next_ch will point to the last character of the decoded 938 // character. 939 int nextCharacter = i; 940 unsigned codePoint; 941 if (url_canon::ReadUTFChar(unescaped.data(), &nextCharacter, 942 unescaped.length(), &codePoint)) { 943 // Valid UTF-8 character, convert to UTF-16. 944 url_canon::AppendUTF16Value(codePoint, &utf16); 945 i = nextCharacter; 946 } else { 947 // KURL.cpp strips any sequences that are not valid UTF-8. This 948 // sounds scary. Instead, we just keep those invalid code 949 // points and promote to UTF-16. We copy all characters from 950 // the current position to the end of the identified sqeuqnce. 951 while (i < nextCharacter) { 952 utf16.push_back(static_cast<unsigned char>(unescaped.at(i))); 953 i++; 954 } 955 utf16.push_back(static_cast<unsigned char>(unescaped.at(i))); 956 } 957 } 958 } 959 960 return String(reinterpret_cast<UChar*>(utf16.data()), utf16.length()); 961} 962 963bool KURL::protocolIs(const char* protocol) const 964{ 965 assertProtocolIsGood(protocol); 966 967 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. 968 // The free function protocolIsJavaScript() should be used instead. 969 // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript")); 970 971 if (m_url.m_parsed.scheme.len <= 0) 972 return !protocol; 973 return lowerCaseEqualsASCII( 974 m_url.utf8String().data() + m_url.m_parsed.scheme.begin, 975 m_url.utf8String().data() + m_url.m_parsed.scheme.end(), 976 protocol); 977} 978 979bool KURL::isLocalFile() const 980{ 981 return protocolIs("file"); 982} 983 984// This is called to escape a URL string. It is only used externally when 985// constructing mailto: links to set the query section. Since our query setter 986// will automatically do the correct escaping, this function does not have to 987// do any work. 988// 989// There is a possibility that a future called may use this function in other 990// ways, and may expect to get a valid URL string. The dangerous thing we want 991// to protect against here is accidentally getting NULLs in a string that is 992// not supposed to have NULLs. Therefore, we escape NULLs here to prevent this. 993String encodeWithURLEscapeSequences(const String& notEncodedString) 994{ 995 CString utf8 = UTF8Encoding().encode( 996 reinterpret_cast<const UChar*>(notEncodedString.characters()), 997 notEncodedString.length(), 998 URLEncodedEntitiesForUnencodables); 999 const char* input = utf8.data(); 1000 int inputLength = utf8.length(); 1001 1002 Vector<char, 2048> buffer; 1003 for (int i = 0; i < inputLength; i++) { 1004 if (!input[i]) 1005 buffer.append("%00", 3); 1006 else 1007 buffer.append(input[i]); 1008 } 1009 return String(buffer.data(), buffer.size()); 1010} 1011 1012bool KURL::isHierarchical() const 1013{ 1014 if (!m_url.m_parsed.scheme.is_nonempty()) 1015 return false; 1016 return url_util::IsStandard( 1017 &m_url.utf8String().data()[m_url.m_parsed.scheme.begin], 1018 m_url.utf8String().length(), 1019 m_url.m_parsed.scheme); 1020} 1021 1022#ifndef NDEBUG 1023void KURL::print() const 1024{ 1025 printf("%s\n", m_url.utf8String().data()); 1026} 1027#endif 1028 1029void KURL::invalidate() 1030{ 1031 // This is only called from the constructor so resetting the (automatically 1032 // initialized) string and parsed structure would be a waste of time. 1033 m_url.m_isValid = false; 1034 m_url.m_protocolInHTTPFamily = false; 1035} 1036 1037// Equal up to reference fragments, if any. 1038bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) 1039{ 1040 // Compute the length of each URL without its ref. Note that the reference 1041 // begin (if it exists) points to the character *after* the '#', so we need 1042 // to subtract one. 1043 int aLength = a.m_url.utf8String().length(); 1044 if (a.m_url.m_parsed.ref.len >= 0) 1045 aLength = a.m_url.m_parsed.ref.begin - 1; 1046 1047 int bLength = b.m_url.utf8String().length(); 1048 if (b.m_url.m_parsed.ref.len >= 0) 1049 bLength = b.m_url.m_parsed.ref.begin - 1; 1050 1051 return aLength == bLength 1052 && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength); 1053} 1054 1055unsigned KURL::hostStart() const 1056{ 1057 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false); 1058} 1059 1060unsigned KURL::hostEnd() const 1061{ 1062 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true); 1063} 1064 1065unsigned KURL::pathStart() const 1066{ 1067 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); 1068} 1069 1070unsigned KURL::pathEnd() const 1071{ 1072 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true); 1073} 1074 1075unsigned KURL::pathAfterLastSlash() const 1076{ 1077 // When there's no path, ask for what would be the beginning of it. 1078 if (!m_url.m_parsed.path.is_valid()) 1079 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); 1080 1081 url_parse::Component filename; 1082 url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path, 1083 &filename); 1084 return filename.begin; 1085} 1086 1087const KURL& blankURL() 1088{ 1089 static KURL staticBlankURL(ParsedURLString, "about:blank"); 1090 return staticBlankURL; 1091} 1092 1093bool protocolIs(const String& url, const char* protocol) 1094{ 1095 // Do the comparison without making a new string object. 1096 assertProtocolIsGood(protocol); 1097 for (int i = 0; ; ++i) { 1098 if (!protocol[i]) 1099 return url[i] == ':'; 1100 if (toASCIILower(url[i]) != protocol[i]) 1101 return false; 1102 } 1103} 1104 1105inline bool KURL::protocolIs(const String& string, const char* protocol) 1106{ 1107 return WebCore::protocolIs(string, protocol); 1108} 1109 1110bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b) 1111{ 1112 if (a.parsed().scheme.end() != b.parsed().scheme.end()) 1113 return false; 1114 1115 int hostStartA = a.hostStart(); 1116 int hostStartB = b.hostStart(); 1117 if (a.hostEnd() - hostStartA != b.hostEnd() - hostStartB) 1118 return false; 1119 1120 // Check the scheme 1121 for (int i = 0; i < a.parsed().scheme.end(); ++i) 1122 if (a.string()[i] != b.string()[i]) 1123 return false; 1124 1125 // And the host 1126 for (int i = hostStartA; i < static_cast<int>(a.hostEnd()); ++i) 1127 if (a.string()[i] != b.string()[i]) 1128 return false; 1129 1130 if (a.port() != b.port()) 1131 return false; 1132 1133 return true; 1134} 1135 1136} // namespace WebCore 1137 1138#endif // USE(GOOGLEURL) 1139