1/* 2 * Copyright (C) 2004, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2008, 2009, 2011 Google Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are 7 * met: 8 * 9 * * Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * * Redistributions in binary form must reproduce the above 12 * copyright notice, this list of conditions and the following disclaimer 13 * in the documentation and/or other materials provided with the 14 * distribution. 15 * * Neither the name of Google Inc. nor the names of its 16 * contributors may be used to endorse or promote products derived from 17 * this software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 */ 31 32#include "config.h" 33 34#if USE(GOOGLEURL) 35#include "KURL.h" 36 37#ifndef NDEBUG 38#include <stdio.h> 39#endif 40 41#include <algorithm> 42 43#include "NotImplemented.h" 44#include "TextEncoding.h" 45#include <wtf/HashMap.h> 46#include <wtf/Vector.h> 47#include <wtf/StdLibExtras.h> 48#include <wtf/text/CString.h> 49#include <wtf/text/StringHash.h> 50 51#include <googleurl/src/url_util.h> 52 53using WTF::isASCIILower; 54using WTF::toASCIILower; 55using std::binary_search; 56 57namespace WebCore { 58 59static const int maximumValidPortNumber = 0xFFFE; 60static const int invalidPortNumber = 0xFFFF; 61 62// Wraps WebCore's text encoding in a character set converter for the 63// canonicalizer. 64class KURLCharsetConverter : public url_canon::CharsetConverter { 65public: 66 // The encoding parameter may be 0, but in this case the object must not be called. 67 KURLCharsetConverter(const TextEncoding* encoding) 68 : m_encoding(encoding) 69 { 70 } 71 72 virtual void ConvertFromUTF16(const url_parse::UTF16Char* input, int inputLength, 73 url_canon::CanonOutput* output) 74 { 75 CString encoded = m_encoding->encode(input, inputLength, URLEncodedEntitiesForUnencodables); 76 output->Append(encoded.data(), static_cast<int>(encoded.length())); 77 } 78 79private: 80 const TextEncoding* m_encoding; 81}; 82 83// Note that this function must be named differently than the one in KURL.cpp 84// since our unit tests evilly include both files, and their local definition 85// will be ambiguous. 86static inline void assertProtocolIsGood(const char* protocol) 87{ 88#ifndef NDEBUG 89 const char* p = protocol; 90 while (*p) { 91 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 92 ++p; 93 } 94#endif 95} 96 97// Returns the characters for the given string, or a pointer to a static empty 98// string if the input string is null. This will always ensure we have a non- 99// null character pointer since ReplaceComponents has special meaning for null. 100static inline const url_parse::UTF16Char* CharactersOrEmpty(const String& str) 101{ 102 static const url_parse::UTF16Char zero = 0; 103 return str.characters() ? 104 reinterpret_cast<const url_parse::UTF16Char*>(str.characters()) : 105 &zero; 106} 107 108static inline bool isUnicodeEncoding(const TextEncoding* encoding) 109{ 110 return encoding->encodingForFormSubmission() == UTF8Encoding(); 111} 112 113static bool lowerCaseEqualsASCII(const char* begin, const char* end, const char* str) 114{ 115 while (begin != end && *str) { 116 ASSERT(toASCIILower(*str) == *str); 117 if (toASCIILower(*begin++) != *str++) 118 return false; 119 } 120 121 // Both strings are equal (ignoring case) if and only if all of the characters were equal, 122 // and the end of both has been reached. 123 return begin == end && !*str; 124} 125 126static inline bool isSchemeFirstChar(char c) 127{ 128 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 129} 130 131static inline bool isSchemeChar(char c) 132{ 133 return isSchemeFirstChar(c) || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '*'; 134} 135 136bool isValidProtocol(const String& protocol) 137{ 138 // NOTE This is a copy of the function in KURL.cpp. 139 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 140 if (protocol.isEmpty()) 141 return false; 142 if (!isSchemeFirstChar(protocol[0])) 143 return false; 144 unsigned protocolLength = protocol.length(); 145 for (unsigned i = 1; i < protocolLength; i++) { 146 if (!isSchemeChar(protocol[i])) 147 return false; 148 } 149 return true; 150} 151 152 153// KURLGooglePrivate ----------------------------------------------------------- 154 155KURLGooglePrivate::KURLGooglePrivate() 156 : m_isValid(false) 157 , m_protocolInHTTPFamily(false) 158 , m_utf8IsASCII(true) 159 , m_stringIsValid(false) 160{ 161} 162 163KURLGooglePrivate::KURLGooglePrivate(const url_parse::Parsed& parsed, bool isValid) 164 : m_isValid(isValid) 165 , m_protocolInHTTPFamily(false) 166 , m_parsed(parsed) 167 , m_utf8IsASCII(true) 168 , m_stringIsValid(false) 169{ 170} 171 172KURLGooglePrivate::KURLGooglePrivate(WTF::HashTableDeletedValueType) 173 : m_string(WTF::HashTableDeletedValue) 174{ 175} 176 177// Setters for the data. Using the ASCII version when you know the 178// data is ASCII will be slightly more efficient. The UTF-8 version 179// will always be correct if the caller is unsure. 180void KURLGooglePrivate::setUtf8(const CString& str) 181{ 182 const char* data = str.data(); 183 unsigned dataLength = str.length(); 184 185 // The m_utf8IsASCII must always be correct since the DeprecatedString 186 // getter must create it with the proper constructor. This test can be 187 // removed when DeprecatedString is gone, but it still might be a 188 // performance win. 189 m_utf8IsASCII = true; 190 for (unsigned i = 0; i < dataLength; i++) { 191 if (static_cast<unsigned char>(data[i]) >= 0x80) { 192 m_utf8IsASCII = false; 193 break; 194 } 195 } 196 197 m_utf8 = str; 198 m_stringIsValid = false; 199 initProtocolInHTTPFamily(); 200} 201 202void KURLGooglePrivate::setAscii(const CString& str) 203{ 204 m_utf8 = str; 205 m_utf8IsASCII = true; 206 m_stringIsValid = false; 207 initProtocolInHTTPFamily(); 208} 209 210void KURLGooglePrivate::init(const KURL& base, 211 const String& relative, 212 const TextEncoding* queryEncoding) 213{ 214 init(base, relative.characters(), relative.length(), queryEncoding); 215} 216 217template <typename CHAR> 218void KURLGooglePrivate::init(const KURL& base, const CHAR* rel, int relLength, 219 const TextEncoding* queryEncoding) 220{ 221 // As a performance optimization, we do not use the charset converter 222 // if encoding is UTF-8 or other Unicode encodings. Note that this is 223 // per HTML5 2.5.3 (resolving URL). The URL canonicalizer will be more 224 // efficient with no charset converter object because it can do UTF-8 225 // internally with no extra copies. 226 227 // We feel free to make the charset converter object every time since it's 228 // just a wrapper around a reference. 229 KURLCharsetConverter charsetConverterObject(queryEncoding); 230 KURLCharsetConverter* charsetConverter = 231 (!queryEncoding || isUnicodeEncoding(queryEncoding)) ? 0 : 232 &charsetConverterObject; 233 234 url_canon::RawCanonOutputT<char> output; 235 const CString& baseStr = base.m_url.utf8String(); 236 m_isValid = url_util::ResolveRelative(baseStr.data(), baseStr.length(), 237 base.m_url.m_parsed, rel, relLength, 238 charsetConverter, 239 &output, &m_parsed); 240 241 // See FIXME in KURLGooglePrivate in the header. If canonicalization has not 242 // changed the string, we can avoid an extra allocation by using assignment. 243 // 244 // When KURL encounters an error such that the URL is invalid and empty 245 // (for example, resolving a relative URL on a non-hierarchical base), it 246 // will produce an isNull URL, and calling setUtf8 will produce an empty 247 // non-null URL. This is unlikely to affect anything, but we preserve this 248 // just in case. 249 if (m_isValid || output.length()) { 250 // Without ref, the whole url is guaranteed to be ASCII-only. 251 if (m_parsed.ref.is_nonempty()) 252 setUtf8(CString(output.data(), output.length())); 253 else 254 setAscii(CString(output.data(), output.length())); 255 } else { 256 // WebCore expects resolved URLs to be empty rather than null. 257 setUtf8(CString("", 0)); 258 } 259} 260 261void KURLGooglePrivate::initProtocolInHTTPFamily() 262{ 263 if (!m_isValid) { 264 m_protocolInHTTPFamily = false; 265 return; 266 } 267 268 const char* scheme = m_utf8.data() + m_parsed.scheme.begin; 269 if (m_parsed.scheme.len == 4) 270 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 4, "http"); 271 else if (m_parsed.scheme.len == 5) 272 m_protocolInHTTPFamily = lowerCaseEqualsASCII(scheme, scheme + 5, "https"); 273 else 274 m_protocolInHTTPFamily = false; 275} 276 277void KURLGooglePrivate::copyTo(KURLGooglePrivate* dest) const 278{ 279 dest->m_isValid = m_isValid; 280 dest->m_protocolInHTTPFamily = m_protocolInHTTPFamily; 281 dest->m_parsed = m_parsed; 282 283 // Don't copy the 16-bit string since that will be regenerated as needed. 284 dest->m_utf8 = CString(m_utf8.data(), m_utf8.length()); 285 dest->m_utf8IsASCII = m_utf8IsASCII; 286 dest->m_stringIsValid = false; 287} 288 289String KURLGooglePrivate::componentString(const url_parse::Component& comp) const 290{ 291 if (!m_isValid || comp.len <= 0) { 292 // KURL returns a null string if the URL is itself a null string, and an 293 // empty string for other nonexistent entities. 294 if (utf8String().isNull()) 295 return String(); 296 return String("", 0); 297 } 298 // begin and len are in terms of bytes which do not match 299 // if string() is UTF-16 and input contains non-ASCII characters. 300 // However, the only part in urlString that can contain non-ASCII 301 // characters is 'ref' at the end of the string. In that case, 302 // begin will always match the actual value and len (in terms of 303 // byte) will be longer than what's needed by 'mid'. However, mid 304 // truncates len to avoid go past the end of a string so that we can 305 // get away withtout doing anything here. 306 return string().substring(comp.begin, comp.len); 307} 308 309void KURLGooglePrivate::replaceComponents(const Replacements& replacements) 310{ 311 url_canon::RawCanonOutputT<char> output; 312 url_parse::Parsed newParsed; 313 314 m_isValid = url_util::ReplaceComponents(utf8String().data(), 315 utf8String().length(), m_parsed, replacements, 0, &output, &newParsed); 316 317 m_parsed = newParsed; 318 if (m_parsed.ref.is_nonempty()) 319 setUtf8(CString(output.data(), output.length())); 320 else 321 setAscii(CString(output.data(), output.length())); 322} 323 324const String& KURLGooglePrivate::string() const 325{ 326 if (!m_stringIsValid) { 327 // Handle the null case separately. Otherwise, constructing 328 // the string like we do below would generate the empty string, 329 // not the null string. 330 if (m_utf8.isNull()) 331 m_string = String(); 332 else if (m_utf8IsASCII) 333 m_string = String(m_utf8.data(), m_utf8.length()); 334 else 335 m_string = String::fromUTF8(m_utf8.data(), m_utf8.length()); 336 m_stringIsValid = true; 337 } 338 return m_string; 339} 340 341// KURL ------------------------------------------------------------------------ 342 343// Creates with null-terminated string input representing an absolute URL. 344// WebCore generally calls this only with hardcoded strings, so the input is 345// ASCII. We treat it as UTF-8 just in case. 346KURL::KURL(ParsedURLStringTag, const char *url) 347{ 348 // FIXME The Mac code checks for beginning with a slash and converts it to 349 // file: URL. We will want to add this as well once we can compile on a 350 // system like that. 351 m_url.init(KURL(), url, strlen(url), 0); 352 353 // The one-argument constructors should never generate a null string. 354 // This is a funny quirk of KURL.cpp (probably a bug) which we preserve. 355 if (m_url.utf8String().isNull()) 356 m_url.setAscii(CString("", 0)); 357} 358 359// Initializes with a string representing an absolute URL. No encoding 360// information is specified. This generally happens when a KURL is converted 361// to a string and then converted back. In this case, the URL is already 362// canonical and in proper escaped form so needs no encoding. We treat it as 363// UTF-8 just in case. 364KURL::KURL(ParsedURLStringTag, const String& url) 365{ 366 if (!url.isNull()) 367 m_url.init(KURL(), url, 0); 368 else { 369 // WebCore expects us to preserve the nullness of strings when this 370 // constructor is used. In all other cases, it expects a non-null 371 // empty string, which is what init() will create. 372 m_url.m_isValid = false; 373 m_url.m_protocolInHTTPFamily = false; 374 } 375} 376 377// Constructs a new URL given a base URL and a possibly relative input URL. 378// This assumes UTF-8 encoding. 379KURL::KURL(const KURL& base, const String& relative) 380{ 381 m_url.init(base, relative, 0); 382} 383 384// Constructs a new URL given a base URL and a possibly relative input URL. 385// Any query portion of the relative URL will be encoded in the given encoding. 386KURL::KURL(const KURL& base, 387 const String& relative, 388 const TextEncoding& encoding) 389{ 390 m_url.init(base, relative, &encoding.encodingForFormSubmission()); 391} 392 393KURL::KURL(const CString& canonicalSpec, 394 const url_parse::Parsed& parsed, bool isValid) 395 : m_url(parsed, isValid) 396{ 397 // We know the reference fragment is the only part that can be UTF-8, so 398 // we know it's ASCII when there is no ref. 399 if (parsed.ref.is_nonempty()) 400 m_url.setUtf8(canonicalSpec); 401 else 402 m_url.setAscii(canonicalSpec); 403} 404 405#if USE(CF) 406KURL::KURL(CFURLRef) 407{ 408 notImplemented(); 409 invalidate(); 410} 411 412CFURLRef KURL::createCFURL() const 413{ 414 notImplemented(); 415 return 0; 416} 417#endif 418 419KURL KURL::copy() const 420{ 421 KURL result = *this; 422 m_url.copyTo(&result.m_url); 423 return result; 424} 425 426bool KURL::isNull() const 427{ 428 return m_url.utf8String().isNull(); 429} 430 431bool KURL::isEmpty() const 432{ 433 return !m_url.utf8String().length(); 434} 435 436bool KURL::isValid() const 437{ 438 return m_url.m_isValid; 439} 440 441bool KURL::hasPort() const 442{ 443 return hostEnd() < pathStart(); 444} 445 446bool KURL::protocolInHTTPFamily() const 447{ 448 return m_url.m_protocolInHTTPFamily; 449} 450 451bool KURL::hasPath() const 452{ 453 // Note that http://www.google.com/" has a path, the path is "/". This can 454 // return false only for invalid or nonstandard URLs. 455 return m_url.m_parsed.path.len >= 0; 456} 457 458// We handle "parameters" separated by a semicolon, while KURL.cpp does not, 459// which can lead to different results in some cases. 460String KURL::lastPathComponent() const 461{ 462 // When the output ends in a slash, WebCore has different expectations than 463 // the GoogleURL library. For "/foo/bar/" the library will return the empty 464 // string, but WebCore wants "bar". 465 url_parse::Component path = m_url.m_parsed.path; 466 if (path.len > 0 && m_url.utf8String().data()[path.end() - 1] == '/') 467 path.len--; 468 469 url_parse::Component file; 470 url_parse::ExtractFileName(m_url.utf8String().data(), path, &file); 471 472 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 473 // a null string when the path is empty, which we duplicate here. 474 if (!file.is_nonempty()) 475 return String(); 476 return m_url.componentString(file); 477} 478 479String KURL::protocol() const 480{ 481 return m_url.componentString(m_url.m_parsed.scheme); 482} 483 484String KURL::host() const 485{ 486 // Note: KURL.cpp unescapes here. 487 return m_url.componentString(m_url.m_parsed.host); 488} 489 490// Returns 0 when there is no port. 491// 492// We treat URL's with out-of-range port numbers as invalid URLs, and they will 493// be rejected by the canonicalizer. KURL.cpp will allow them in parsing, but 494// return invalidPortNumber from this port() function, so we mirror that behavior here. 495unsigned short KURL::port() const 496{ 497 if (!m_url.m_isValid || m_url.m_parsed.port.len <= 0) 498 return 0; 499 int port = url_parse::ParsePort(m_url.utf8String().data(), m_url.m_parsed.port); 500 ASSERT(port != url_parse::PORT_UNSPECIFIED); // Checked port.len <= 0 before. 501 502 if (port == url_parse::PORT_INVALID || port > maximumValidPortNumber) // Mimic KURL::port() 503 port = invalidPortNumber; 504 505 return static_cast<unsigned short>(port); 506} 507 508// Returns the empty string if there is no password. 509String KURL::pass() const 510{ 511 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 512 // a null string when the password is empty, which we duplicate here. 513 if (!m_url.m_parsed.password.is_nonempty()) 514 return String(); 515 516 // Note: KURL.cpp unescapes here. 517 return m_url.componentString(m_url.m_parsed.password); 518} 519 520// Returns the empty string if there is no username. 521String KURL::user() const 522{ 523 // Note: KURL.cpp unescapes here. 524 return m_url.componentString(m_url.m_parsed.username); 525} 526 527String KURL::fragmentIdentifier() const 528{ 529 // Empty but present refs ("foo.com/bar#") should result in the empty 530 // string, which m_url.componentString will produce. Nonexistent refs 531 // should be the null string. 532 if (!m_url.m_parsed.ref.is_valid()) 533 return String(); 534 535 // Note: KURL.cpp unescapes here. 536 return m_url.componentString(m_url.m_parsed.ref); 537} 538 539bool KURL::hasFragmentIdentifier() const 540{ 541 // Note: KURL.cpp unescapes here. 542 // FIXME determine if KURL.cpp agrees about an empty ref 543 return m_url.m_parsed.ref.len >= 0; 544} 545 546void KURL::copyParsedQueryTo(ParsedURLParameters& parameters) const 547{ 548 String query = m_url.componentString(m_url.m_parsed.query); 549 const UChar* pos = query.characters(); 550 const UChar* end = query.characters() + query.length(); 551 while (pos < end) { 552 const UChar* parameterStart = pos; 553 while (pos < end && *pos != '&') 554 ++pos; 555 const UChar* parameterEnd = pos; 556 if (pos < end) { 557 ASSERT(*pos == '&'); 558 ++pos; 559 } 560 if (parameterStart == parameterEnd) 561 continue; 562 const UChar* nameStart = parameterStart; 563 const UChar* equalSign = parameterStart; 564 while (equalSign < parameterEnd && *equalSign != '=') 565 ++equalSign; 566 if (equalSign == nameStart) 567 continue; 568 String name(nameStart, equalSign - nameStart); 569 String value = equalSign == parameterEnd ? String() : String(equalSign + 1, parameterEnd - equalSign - 1); 570 parameters.set(name, value); 571 } 572} 573 574String KURL::baseAsString() const 575{ 576 // FIXME: There is probably a more efficient way to do this? 577 return string().left(pathAfterLastSlash()); 578} 579 580String KURL::query() const 581{ 582 if (m_url.m_parsed.query.len >= 0) 583 return m_url.componentString(m_url.m_parsed.query); 584 585 // Bug: https://bugs.webkit.org/show_bug.cgi?id=21015 this function returns 586 // an empty string when the query is empty rather than a null (not sure 587 // which is right). 588 // Returns a null if the query is not specified, instead of empty. 589 if (m_url.m_parsed.query.is_valid()) 590 return String("", 0); 591 return String(); 592} 593 594String KURL::path() const 595{ 596 return m_url.componentString(m_url.m_parsed.path); 597} 598 599bool KURL::setProtocol(const String& protocol) 600{ 601 // Firefox and IE remove everything after the first ':'. 602 int separatorPosition = protocol.find(':'); 603 String newProtocol = protocol.substring(0, separatorPosition); 604 605 // If KURL is given an invalid scheme, it returns failure without modifying 606 // the URL at all. This is in contrast to most other setters which modify 607 // the URL and set "m_isValid." 608 url_canon::RawCanonOutputT<char> canonProtocol; 609 url_parse::Component protocolComponent; 610 if (!url_canon::CanonicalizeScheme(newProtocol.characters(), 611 url_parse::Component(0, newProtocol.length()), 612 &canonProtocol, &protocolComponent) 613 || !protocolComponent.is_nonempty()) 614 return false; 615 616 KURLGooglePrivate::Replacements replacements; 617 replacements.SetScheme(CharactersOrEmpty(newProtocol), 618 url_parse::Component(0, newProtocol.length())); 619 m_url.replaceComponents(replacements); 620 621 // isValid could be false but we still return true here. This is because 622 // WebCore or JS scripts can build up a URL by setting individual 623 // components, and a JS exception is based on the return value of this 624 // function. We want to throw the exception and stop the script only when 625 // its trying to set a bad protocol, and not when it maybe just hasn't 626 // finished building up its final scheme. 627 return true; 628} 629 630void KURL::setHost(const String& host) 631{ 632 KURLGooglePrivate::Replacements replacements; 633 replacements.SetHost(CharactersOrEmpty(host), 634 url_parse::Component(0, host.length())); 635 m_url.replaceComponents(replacements); 636} 637 638void KURL::setHostAndPort(const String& s) 639{ 640 String host = s; 641 String port; 642 int hostEnd = s.find(":"); 643 if (hostEnd != -1) { 644 host = s.left(hostEnd); 645 port = s.substring(hostEnd + 1); 646 } 647 648 KURLGooglePrivate::Replacements replacements; 649 // Host can't be removed, so we always set. 650 replacements.SetHost(CharactersOrEmpty(host), 651 url_parse::Component(0, host.length())); 652 653 if (port.isEmpty()) // Port may be removed, so we support clearing. 654 replacements.ClearPort(); 655 else 656 replacements.SetPort(CharactersOrEmpty(port), url_parse::Component(0, port.length())); 657 m_url.replaceComponents(replacements); 658} 659 660void KURL::removePort() 661{ 662 if (hasPort()) { 663 String urlWithoutPort = m_url.string().left(hostEnd()) + m_url.string().substring(pathStart()); 664 m_url.setUtf8(urlWithoutPort.utf8()); 665 } 666} 667 668void KURL::setPort(unsigned short i) 669{ 670 KURLGooglePrivate::Replacements replacements; 671 String portStr; 672 673 portStr = String::number(i); 674 replacements.SetPort( 675 reinterpret_cast<const url_parse::UTF16Char*>(portStr.characters()), 676 url_parse::Component(0, portStr.length())); 677 678 m_url.replaceComponents(replacements); 679} 680 681void KURL::setUser(const String& user) 682{ 683 // This function is commonly called to clear the username, which we 684 // normally don't have, so we optimize this case. 685 if (user.isEmpty() && !m_url.m_parsed.username.is_valid()) 686 return; 687 688 // The canonicalizer will clear any usernames that are empty, so we 689 // don't have to explicitly call ClearUsername() here. 690 KURLGooglePrivate::Replacements replacements; 691 replacements.SetUsername(CharactersOrEmpty(user), 692 url_parse::Component(0, user.length())); 693 m_url.replaceComponents(replacements); 694} 695 696void KURL::setPass(const String& pass) 697{ 698 // This function is commonly called to clear the password, which we 699 // normally don't have, so we optimize this case. 700 if (pass.isEmpty() && !m_url.m_parsed.password.is_valid()) 701 return; 702 703 // The canonicalizer will clear any passwords that are empty, so we 704 // don't have to explicitly call ClearUsername() here. 705 KURLGooglePrivate::Replacements replacements; 706 replacements.SetPassword(CharactersOrEmpty(pass), 707 url_parse::Component(0, pass.length())); 708 m_url.replaceComponents(replacements); 709} 710 711void KURL::setFragmentIdentifier(const String& s) 712{ 713 // This function is commonly called to clear the ref, which we 714 // normally don't have, so we optimize this case. 715 if (s.isNull() && !m_url.m_parsed.ref.is_valid()) 716 return; 717 718 KURLGooglePrivate::Replacements replacements; 719 if (s.isNull()) 720 replacements.ClearRef(); 721 else 722 replacements.SetRef(CharactersOrEmpty(s), url_parse::Component(0, s.length())); 723 m_url.replaceComponents(replacements); 724} 725 726void KURL::removeFragmentIdentifier() 727{ 728 KURLGooglePrivate::Replacements replacements; 729 replacements.ClearRef(); 730 m_url.replaceComponents(replacements); 731} 732 733void KURL::setQuery(const String& query) 734{ 735 KURLGooglePrivate::Replacements replacements; 736 if (query.isNull()) { 737 // KURL.cpp sets to null to clear any query. 738 replacements.ClearQuery(); 739 } else if (query.length() > 0 && query[0] == '?') { 740 // WebCore expects the query string to begin with a question mark, but 741 // GoogleURL doesn't. So we trim off the question mark when setting. 742 replacements.SetQuery(CharactersOrEmpty(query), 743 url_parse::Component(1, query.length() - 1)); 744 } else { 745 // When set with the empty string or something that doesn't begin with 746 // a question mark, KURL.cpp will add a question mark for you. The only 747 // way this isn't compatible is if you call this function with an empty 748 // string. KURL.cpp will leave a '?' with nothing following it in the 749 // URL, whereas we'll clear it. 750 // FIXME We should eliminate this difference. 751 replacements.SetQuery(CharactersOrEmpty(query), 752 url_parse::Component(0, query.length())); 753 } 754 m_url.replaceComponents(replacements); 755} 756 757void KURL::setPath(const String& path) 758{ 759 // Empty paths will be canonicalized to "/", so we don't have to worry 760 // about calling ClearPath(). 761 KURLGooglePrivate::Replacements replacements; 762 replacements.SetPath(CharactersOrEmpty(path), 763 url_parse::Component(0, path.length())); 764 m_url.replaceComponents(replacements); 765} 766 767// On Mac, this just seems to return the same URL, but with "/foo/bar" for 768// file: URLs instead of file:///foo/bar. We don't bother with any of this, 769// at least for now. 770String KURL::deprecatedString() const 771{ 772 if (!m_url.m_isValid) 773 return String(); 774 return m_url.string(); 775} 776 777String decodeURLEscapeSequences(const String& str) 778{ 779 return decodeURLEscapeSequences(str, UTF8Encoding()); 780} 781 782// In KURL.cpp's implementation, this is called by every component getter. 783// It will unescape every character, including '\0'. This is scary, and may 784// cause security holes. We never call this function for components, and 785// just return the ASCII versions instead. 786// 787// This function is also used to decode javascript: URLs and as a general 788// purpose unescaping function. 789// 790// FIXME These should be merged to the KURL.cpp implementation. 791String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) 792{ 793 // FIXME We can probably use KURL.cpp's version of this function 794 // without modification. However, I'm concerned about 795 // https://bugs.webkit.org/show_bug.cgi?id=20559 so am keeping this old 796 // custom code for now. Using their version will also fix the bug that 797 // we ignore the encoding. 798 // 799 // FIXME b/1350291: This does not get called very often. We just convert 800 // first to 8-bit UTF-8, then unescape, then back to 16-bit. This kind of 801 // sucks, and we don't use the encoding properly, which will make some 802 // obscure anchor navigations fail. 803 CString cstr = str.utf8(); 804 805 const char* input = cstr.data(); 806 int inputLength = cstr.length(); 807 808 url_canon::RawCanonOutputT<url_parse::UTF16Char> unescaped; 809 810 url_util::DecodeURLEscapeSequences(input, inputLength, &unescaped); 811 812 return String(reinterpret_cast<UChar*>(unescaped.data()), 813 unescaped.length()); 814} 815 816bool KURL::protocolIs(const char* protocol) const 817{ 818 assertProtocolIsGood(protocol); 819 820 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. 821 // The free function protocolIsJavaScript() should be used instead. 822 // FIXME: Chromium code needs to be fixed for this assert to be enabled. ASSERT(strcmp(protocol, "javascript")); 823 824 if (m_url.m_parsed.scheme.len <= 0) 825 return !protocol; 826 return lowerCaseEqualsASCII( 827 m_url.utf8String().data() + m_url.m_parsed.scheme.begin, 828 m_url.utf8String().data() + m_url.m_parsed.scheme.end(), 829 protocol); 830} 831 832// This is called to escape a URL string. It is only used externally when 833// constructing mailto: links to set the query section. Since our query setter 834// will automatically do the correct escaping, this function does not have to 835// do any work. 836// 837// There is a possibility that a future caller may use this function in other 838// ways, and may expect to get a valid URL string. The dangerous thing we want 839// to protect against here is accidentally getting '\0' characters in a string 840// that is not supposed to have them. Therefore, we escape these characters. 841String encodeWithURLEscapeSequences(const String& notEncodedString) 842{ 843 CString utf8 = UTF8Encoding().encode( 844 reinterpret_cast<const UChar*>(notEncodedString.characters()), 845 notEncodedString.length(), 846 URLEncodedEntitiesForUnencodables); 847 const char* input = utf8.data(); 848 int inputLength = utf8.length(); 849 850 Vector<char, 2048> buffer; 851 for (int i = 0; i < inputLength; i++) { 852 if (!input[i]) 853 buffer.append("%00", 3); 854 else 855 buffer.append(input[i]); 856 } 857 return String(buffer.data(), buffer.size()); 858} 859 860bool KURL::isHierarchical() const 861{ 862 if (!m_url.m_parsed.scheme.is_nonempty()) 863 return false; 864 return url_util::IsStandard( 865 &m_url.utf8String().data()[m_url.m_parsed.scheme.begin], 866 m_url.m_parsed.scheme); 867} 868 869#ifndef NDEBUG 870void KURL::print() const 871{ 872 printf("%s\n", m_url.utf8String().data()); 873} 874#endif 875 876void KURL::invalidate() 877{ 878 // This is only called from the constructor so resetting the (automatically 879 // initialized) string and parsed structure would be a waste of time. 880 m_url.m_isValid = false; 881 m_url.m_protocolInHTTPFamily = false; 882} 883 884// Equal up to reference fragments, if any. 885bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b) 886{ 887 // Compute the length of each URL without its ref. Note that the reference 888 // begin (if it exists) points to the character *after* the '#', so we need 889 // to subtract one. 890 int aLength = a.m_url.utf8String().length(); 891 if (a.m_url.m_parsed.ref.len >= 0) 892 aLength = a.m_url.m_parsed.ref.begin - 1; 893 894 int bLength = b.m_url.utf8String().length(); 895 if (b.m_url.m_parsed.ref.len >= 0) 896 bLength = b.m_url.m_parsed.ref.begin - 1; 897 898 return aLength == bLength 899 && !strncmp(a.m_url.utf8String().data(), b.m_url.utf8String().data(), aLength); 900} 901 902unsigned KURL::hostStart() const 903{ 904 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::HOST, false); 905} 906 907unsigned KURL::hostEnd() const 908{ 909 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PORT, true); 910} 911 912unsigned KURL::pathStart() const 913{ 914 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); 915} 916 917unsigned KURL::pathEnd() const 918{ 919 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::QUERY, true); 920} 921 922unsigned KURL::pathAfterLastSlash() const 923{ 924 // When there's no path, ask for what would be the beginning of it. 925 if (!m_url.m_parsed.path.is_valid()) 926 return m_url.m_parsed.CountCharactersBefore(url_parse::Parsed::PATH, false); 927 928 url_parse::Component filename; 929 url_parse::ExtractFileName(m_url.utf8String().data(), m_url.m_parsed.path, 930 &filename); 931 return filename.begin; 932} 933 934bool protocolIs(const String& url, const char* protocol) 935{ 936 // Do the comparison without making a new string object. 937 assertProtocolIsGood(protocol); 938 939 // Check the scheme like GURL does. 940 return url_util::FindAndCompareScheme(url.characters(), url.length(), 941 protocol, 0); 942} 943 944inline bool KURL::protocolIs(const String& string, const char* protocol) 945{ 946 return WebCore::protocolIs(string, protocol); 947} 948 949bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b) 950{ 951 if (a.parsed().scheme.end() != b.parsed().scheme.end()) 952 return false; 953 954 int hostStartA = a.hostStart(); 955 int hostLengthA = a.hostEnd() - hostStartA; 956 int hostStartB = b.hostStart(); 957 int hostLengthB = b.hostEnd() - b.hostStart(); 958 if (hostLengthA != hostLengthB) 959 return false; 960 961 // Check the scheme 962 for (int i = 0; i < a.parsed().scheme.end(); ++i) 963 if (a.string()[i] != b.string()[i]) 964 return false; 965 966 // And the host 967 for (int i = 0; i < hostLengthA; ++i) 968 if (a.string()[hostStartA + i] != b.string()[hostStartB + i]) 969 return false; 970 971 if (a.port() != b.port()) 972 return false; 973 974 return true; 975} 976 977} // namespace WebCore 978 979#endif // USE(GOOGLEURL) 980