/*
 * Copyright (C) 2004, 2007, 2008 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include "KURL.h"

#include "TextEncoding.h"
#include <wtf/text/CString.h>
#include <wtf/HashMap.h>
#include <wtf/HexNumber.h>
#include <wtf/StdLibExtras.h>
#include <wtf/text/StringHash.h>

#if USE(ICU_UNICODE)
#include <unicode/uidna.h>
#elif USE(QT4_UNICODE)
#include <QUrl>
#elif USE(GLIB_UNICODE)
#include <glib.h>
#include "GOwnPtr.h"
#endif

#include <stdio.h>

using namespace std;
using namespace WTF;

namespace WebCore {

// Scratch buffers with 512 elements of inline capacity, used while building
// and parsing URL strings.
typedef Vector<char, 512> CharBuffer;
typedef Vector<UChar, 512> UCharBuffer;

// KURL::port() returns invalidPortNumber when the port text does not parse or
// exceeds maximumValidPortNumber.
static const unsigned maximumValidPortNumber = 0xFFFE;
static const unsigned invalidPortNumber = 0xFFFF;

#if !USE(GOOGLEURL)

// FIXME: This file makes too much use of the + operator on String.
// We either have to optimize that operator so it doesn't involve
// so many allocations, or change this to use Vector<UChar> instead.

// Bit flags stored per byte value in characterClassTable. A byte may belong
// to several classes at once, so the flags are OR-ed together.
enum URLCharacterClasses {
    // alpha
    SchemeFirstChar = 1 << 0,

    // ( alpha | digit | "+" | "-" | "." )
    SchemeChar = 1 << 1,

    // mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
    // unreserved  = alphanum | mark
    // ( unreserved | escaped | ";" | ":" | "&" | "=" | "+" | "$" | "," )
    UserInfoChar = 1 << 2,

    // alnum | "." | "-" | "%"
    // The above is what the specification says, but we are lenient to
    // match existing practice and also allow:
    // "_"
    HostnameChar = 1 << 3,

    // hexdigit | ":" | "%"
    IPv6Char = 1 << 4,

    // "#" | "?" | "/" | nul
    PathSegmentEndChar = 1 << 5,

    // not allowed in path
    BadChar = 1 << 6
};

// Lookup table indexed by byte value (0-255); each entry is the OR of the
// URLCharacterClasses flags that apply to that byte. Every byte >= 128 is
// marked BadChar only.
static const unsigned char characterClassTable[256] = {
    /* 0 nul */ PathSegmentEndChar,    /* 1 soh */ BadChar,
    /* 2 stx */ BadChar,    /* 3 etx */ BadChar,
    /* 4 eot */ BadChar,    /* 5 enq */ BadChar,    /* 6 ack */ BadChar,    /* 7 bel */ BadChar,
    /* 8 bs */ BadChar,     /* 9 ht */ BadChar,     /* 10 nl */ BadChar,    /* 11 vt */ BadChar,
    /* 12 np */ BadChar,    /* 13 cr */ BadChar,    /* 14 so */ BadChar,    /* 15 si */ BadChar,
    /* 16 dle */ BadChar,   /* 17 dc1 */ BadChar,   /* 18 dc2 */ BadChar,   /* 19 dc3 */ BadChar,
    /* 20 dc4 */ BadChar,   /* 21 nak */ BadChar,   /* 22 syn */ BadChar,   /* 23 etb */ BadChar,
    /* 24 can */ BadChar,   /* 25 em */ BadChar,    /* 26 sub */ BadChar,   /* 27 esc */ BadChar,
    /* 28 fs */ BadChar,    /* 29 gs */ BadChar,    /* 30 rs */ BadChar,    /* 31 us */ BadChar,
    /* 32 sp */ BadChar,    /* 33 ! */ UserInfoChar,
    /* 34 " */ BadChar,     /* 35 # */ PathSegmentEndChar | BadChar,
    /* 36 $ */ UserInfoChar,    /* 37 % */ UserInfoChar | HostnameChar | IPv6Char | BadChar,
    /* 38 & */ UserInfoChar,    /* 39 ' */ UserInfoChar,
    /* 40 ( */ UserInfoChar,    /* 41 ) */ UserInfoChar,
    /* 42 * */ UserInfoChar,    /* 43 + */ SchemeChar | UserInfoChar,
    /* 44 , */ UserInfoChar,
    /* 45 - */ SchemeChar | UserInfoChar | HostnameChar,
    /* 46 . */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 47 / */ PathSegmentEndChar,
    /* 48 0 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 49 1 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 50 2 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 51 3 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 52 4 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 53 5 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 54 6 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 55 7 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 56 8 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 57 9 */ SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 58 : */ UserInfoChar | IPv6Char,    /* 59 ; */ UserInfoChar,
    /* 60 < */ BadChar,    /* 61 = */ UserInfoChar,
    /* 62 > */ BadChar,    /* 63 ? */ PathSegmentEndChar | BadChar,
    /* 64 @ */ 0,
    /* 65 A */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 66 B */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 67 C */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 68 D */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 69 E */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 70 F */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 71 G */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 72 H */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 73 I */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 74 J */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 75 K */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 76 L */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 77 M */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 78 N */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 79 O */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 80 P */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 81 Q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 82 R */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 83 S */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 84 T */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 85 U */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 86 V */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 87 W */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 88 X */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 89 Y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 90 Z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 91 [ */ 0,
    /* 92 \ */ 0,    /* 93 ] */ 0,
    /* 94 ^ */ 0,
    /* 95 _ */ UserInfoChar | HostnameChar,
    /* 96 ` */ 0,
    /* 97 a */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 98 b */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 99 c */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 100 d */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 101 e */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 102 f */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar | IPv6Char,
    /* 103 g */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 104 h */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 105 i */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 106 j */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 107 k */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 108 l */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 109 m */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 110 n */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 111 o */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 112 p */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 113 q */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 114 r */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 115 s */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 116 t */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 117 u */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 118 v */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 119 w */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 120 x */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 121 y */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 122 z */ SchemeFirstChar | SchemeChar | UserInfoChar | HostnameChar,
    /* 123 { */ 0,
    /* 124 | */ 0,   /* 125 } */ 0,   /* 126 ~ */ UserInfoChar,   /* 127 del */ BadChar,
    /* 128 */ BadChar, /* 129 */ BadChar, /* 130 */ BadChar, /* 131 */ BadChar,
    /* 132 */ BadChar, /* 133 */ BadChar, /* 134 */ BadChar, /* 135 */ BadChar,
    /* 136 */ BadChar, /* 137 */ BadChar, /* 138 */ BadChar, /* 139 */ BadChar,
    /* 140 */ BadChar, /* 141 */ BadChar, /* 142 */ BadChar, /* 143 */ BadChar,
    /* 144 */ BadChar, /* 145 */ BadChar, /* 146 */ BadChar, /* 147 */ BadChar,
    /* 148 */ BadChar, /* 149 */ BadChar, /* 150 */ BadChar, /* 151 */ BadChar,
    /* 152 */ BadChar, /* 153 */ BadChar, /* 154 */ BadChar, /* 155 */ BadChar,
    /* 156 */ BadChar, /* 157 */ BadChar, /* 158 */ BadChar, /* 159 */ BadChar,
    /* 160 */ BadChar, /* 161 */ BadChar, /* 162 */ BadChar, /* 163 */ BadChar,
    /* 164 */ BadChar, /* 165 */ BadChar, /* 166 */ BadChar, /* 167 */ BadChar,
    /* 168 */ BadChar, /* 169 */ BadChar, /* 170 */ BadChar, /* 171 */ BadChar,
    /* 172 */ BadChar, /* 173 */ BadChar, /* 174 */ BadChar, /* 175 */ BadChar,
    /* 176 */ BadChar, /* 177 */ BadChar, /* 178 */ BadChar, /* 179 */ BadChar,
    /* 180 */ BadChar, /* 181 */ BadChar, /* 182 */ BadChar, /* 183 */ BadChar,
    /* 184 */ BadChar, /* 185 */ BadChar, /* 186 */ BadChar, /* 187 */ BadChar,
    /* 188 */ BadChar, /* 189 */ BadChar, /* 190 */ BadChar, /* 191 */ BadChar,
    /* 192 */ BadChar, /* 193 */ BadChar, /* 194 */ BadChar, /* 195 */ BadChar,
    /* 196 */ BadChar, /* 197 */ BadChar, /* 198 */ BadChar, /* 199 */ BadChar,
    /* 200 */ BadChar, /* 201 */ BadChar, /* 202 */ BadChar, /* 203 */ BadChar,
    /* 204 */ BadChar, /* 205 */ BadChar, /* 206 */ BadChar, /* 207 */ BadChar,
    /* 208 */ BadChar, /* 209 */ BadChar, /* 210 */ BadChar, /* 211 */ BadChar,
    /* 212 */ BadChar, /* 213 */ BadChar, /* 214 */ BadChar, /* 215 */ BadChar,
    /* 216 */ BadChar, /* 217 */ BadChar, /* 218 */ BadChar, /* 219 */ BadChar,
    /* 220 */ BadChar, /* 221 */ BadChar, /* 222 */ BadChar, /* 223 */ BadChar,
    /* 224 */ BadChar, /* 225 */ BadChar, /* 226 */ BadChar, /* 227 */ BadChar,
    /* 228 */ BadChar, /* 229 */ BadChar, /* 230 */ BadChar, /* 231 */ BadChar,
    /* 232 */ BadChar, /* 233 */ BadChar, /* 234 */ BadChar, /* 235 */ BadChar,
    /* 236 */ BadChar, /* 237 */ BadChar, /* 238 */ BadChar, /* 239 */ BadChar,
    /* 240 */ BadChar, /* 241 */ BadChar, /* 242 */ BadChar, /* 243 */ BadChar,
    /* 244 */ BadChar, /* 245 */ BadChar, /* 246 */ BadChar, /* 247 */ BadChar,
    /* 248 */ BadChar, /* 249 */ BadChar, /* 250 */ BadChar, /* 251 */ BadChar,
    /* 252 */ BadChar, /* 253 */ BadChar, /* 254 */ BadChar, /* 255 */ BadChar
};

// Forward declarations of helpers that KURL::init() calls; their definitions
// are not part of this section of the file.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd);
static void encodeRelativeString(const String& rel, const TextEncoding&, CharBuffer& ouput);
static String substituteBackslashes(const String&);

// Character-class predicates backed by characterClassTable. The char overloads
// cast to unsigned char before indexing so a negative char cannot index out of
// range; the UChar overloads report false for any code unit above 0xff.
static inline bool isSchemeFirstChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeFirstChar; }
static inline bool isSchemeFirstChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeFirstChar); }
static inline bool isSchemeChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & SchemeChar; }
static inline bool isSchemeChar(UChar c) { return c <= 0xff && (characterClassTable[c] & SchemeChar); }
static inline bool isUserInfoChar(unsigned char c) { return characterClassTable[c] & UserInfoChar; }
static inline bool isHostnameChar(unsigned char c) { return characterClassTable[c] & HostnameChar; }
static inline bool isIPv6Char(unsigned char c) { return characterClassTable[c] & IPv6Char; }
static inline bool isPathSegmentEndChar(char c) { return characterClassTable[static_cast<unsigned char>(c)] & PathSegmentEndChar; }
static inline bool isPathSegmentEndChar(UChar c) { return c <= 0xff && (characterClassTable[c] & PathSegmentEndChar); }
static inline bool isBadChar(unsigned char c) { return characterClassTable[c] & BadChar; }

// Returns the numeric value (0-15) of an ASCII hex digit. The caller must pass
// a hex digit; this is only checked by the ASSERT.
static inline int hexDigitValue(UChar c)
{
    ASSERT(isASCIIHexDigit(c));
    if (c < 'A')
        return c - '0';
    return (c - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
}

// Copies the source to the destination, assuming all the source characters are
// ASCII. The destination buffer must be large enough. Null characters are allowed
// in the source string, and no attempt is made to null-terminate the result.
246static void copyASCII(const UChar* src, int length, char* dest) 247{ 248 for (int i = 0; i < length; i++) 249 dest[i] = static_cast<char>(src[i]); 250} 251 252static void appendASCII(const String& base, const char* rel, size_t len, CharBuffer& buffer) 253{ 254 buffer.resize(base.length() + len + 1); 255 copyASCII(base.characters(), base.length(), buffer.data()); 256 memcpy(buffer.data() + base.length(), rel, len); 257 buffer[buffer.size() - 1] = '\0'; 258} 259 260// FIXME: Move to PlatformString.h eventually. 261// Returns the index of the first index in string |s| of any of the characters 262// in |toFind|. |toFind| should be a null-terminated string, all characters up 263// to the null will be searched. Returns int if not found. 264static int findFirstOf(const UChar* s, int sLen, int startPos, const char* toFind) 265{ 266 for (int i = startPos; i < sLen; i++) { 267 const char* cur = toFind; 268 while (*cur) { 269 if (s[i] == *(cur++)) 270 return i; 271 } 272 } 273 return -1; 274} 275 276#ifndef NDEBUG 277static void checkEncodedString(const String& url) 278{ 279 for (unsigned i = 0; i < url.length(); ++i) 280 ASSERT(!(url[i] & ~0x7F)); 281 282 ASSERT(!url.length() || isSchemeFirstChar(url[0])); 283} 284#else 285static inline void checkEncodedString(const String&) 286{ 287} 288#endif 289 290inline bool KURL::protocolIs(const String& string, const char* protocol) 291{ 292 return WebCore::protocolIs(string, protocol); 293} 294 295void KURL::invalidate() 296{ 297 m_isValid = false; 298 m_protocolInHTTPFamily = false; 299 m_schemeEnd = 0; 300 m_userStart = 0; 301 m_userEnd = 0; 302 m_passwordEnd = 0; 303 m_hostEnd = 0; 304 m_portEnd = 0; 305 m_pathEnd = 0; 306 m_pathAfterLastSlash = 0; 307 m_queryEnd = 0; 308 m_fragmentEnd = 0; 309} 310 311KURL::KURL(ParsedURLStringTag, const char* url) 312{ 313 parse(url, 0); 314 ASSERT(url == m_string); 315} 316 317KURL::KURL(ParsedURLStringTag, const String& url) 318{ 319 parse(url); 320 ASSERT(url == m_string); 321} 322 
323KURL::KURL(ParsedURLStringTag, const URLString& url) 324{ 325 parse(url.string()); 326 ASSERT(url.string() == m_string); 327} 328 329KURL::KURL(const KURL& base, const String& relative) 330{ 331 init(base, relative, UTF8Encoding()); 332} 333 334KURL::KURL(const KURL& base, const String& relative, const TextEncoding& encoding) 335{ 336 // For UTF-{7,16,32}, we want to use UTF-8 for the query part as 337 // we do when submitting a form. A form with GET method 338 // has its contents added to a URL as query params and it makes sense 339 // to be consistent. 340 init(base, relative, encoding.encodingForFormSubmission()); 341} 342 343static bool shouldTrimFromURL(unsigned char c) 344{ 345 // Browsers ignore leading/trailing whitespace and control 346 // characters from URLs. Note that c is an *unsigned* char here 347 // so this comparison should only catch control characters. 348 return c <= ' '; 349} 350 351void KURL::init(const KURL& base, const String& relative, const TextEncoding& encoding) 352{ 353 // Allow resolutions with a null or empty base URL, but not with any other invalid one. 354 // FIXME: Is this a good rule? 355 if (!base.m_isValid && !base.isEmpty()) { 356 m_string = relative; 357 invalidate(); 358 return; 359 } 360 361 // For compatibility with Win IE, treat backslashes as if they were slashes, 362 // as long as we're not dealing with javascript: or data: URLs. 
363 String rel = relative; 364 if (rel.contains('\\') && !(protocolIsJavaScript(rel) || protocolIs(rel, "data"))) 365 rel = substituteBackslashes(rel); 366 367 String* originalString = &rel; 368 369 bool allASCII = charactersAreAllASCII(rel.characters(), rel.length()); 370 CharBuffer strBuffer; 371 char* str; 372 size_t len; 373 if (allASCII) { 374 len = rel.length(); 375 strBuffer.resize(len + 1); 376 copyASCII(rel.characters(), len, strBuffer.data()); 377 strBuffer[len] = 0; 378 str = strBuffer.data(); 379 } else { 380 originalString = 0; 381 encodeRelativeString(rel, encoding, strBuffer); 382 str = strBuffer.data(); 383 len = strlen(str); 384 } 385 386 // Get rid of leading whitespace and control characters. 387 while (len && shouldTrimFromURL(*str)) { 388 originalString = 0; 389 str++; 390 --len; 391 } 392 393 // Get rid of trailing whitespace and control characters. 394 while (len && shouldTrimFromURL(str[len - 1])) { 395 originalString = 0; 396 str[--len] = '\0'; 397 } 398 399 // According to the RFC, the reference should be interpreted as an 400 // absolute URI if possible, using the "leftmost, longest" 401 // algorithm. If the URI reference is absolute it will have a 402 // scheme, meaning that it will have a colon before the first 403 // non-scheme element. 404 bool absolute = false; 405 char* p = str; 406 if (isSchemeFirstChar(*p)) { 407 ++p; 408 while (isSchemeChar(*p)) { 409 ++p; 410 } 411 if (*p == ':') { 412 if (p[1] != '/' && equalIgnoringCase(base.protocol(), String(str, p - str)) && base.isHierarchical()) { 413 str = p + 1; 414 originalString = 0; 415 } else 416 absolute = true; 417 } 418 } 419 420 CharBuffer parseBuffer; 421 422 if (absolute) { 423 parse(str, originalString); 424 } else { 425 // If the base is empty or opaque (e.g. data: or javascript:), then the URL is invalid 426 // unless the relative URL is a single fragment. 
427 if (!base.isHierarchical()) { 428 if (str[0] == '#') { 429 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 430 parse(parseBuffer.data(), 0); 431 } else { 432 m_string = relative; 433 invalidate(); 434 } 435 return; 436 } 437 438 switch (str[0]) { 439 case '\0': 440 // The reference is empty, so this is a reference to the same document with any fragment identifier removed. 441 *this = base; 442 removeFragmentIdentifier(); 443 break; 444 case '#': { 445 // must be fragment-only reference 446 appendASCII(base.m_string.left(base.m_queryEnd), str, len, parseBuffer); 447 parse(parseBuffer.data(), 0); 448 break; 449 } 450 case '?': { 451 // query-only reference, special case needed for non-URL results 452 appendASCII(base.m_string.left(base.m_pathEnd), str, len, parseBuffer); 453 parse(parseBuffer.data(), 0); 454 break; 455 } 456 case '/': 457 // must be net-path or absolute-path reference 458 if (str[1] == '/') { 459 // net-path 460 appendASCII(base.m_string.left(base.m_schemeEnd + 1), str, len, parseBuffer); 461 parse(parseBuffer.data(), 0); 462 } else { 463 // abs-path 464 appendASCII(base.m_string.left(base.m_portEnd), str, len, parseBuffer); 465 parse(parseBuffer.data(), 0); 466 } 467 break; 468 default: 469 { 470 // must be relative-path reference 471 472 // Base part plus relative part plus one possible slash added in between plus terminating \0 byte. 
473 parseBuffer.resize(base.m_pathEnd + 1 + len + 1); 474 475 char* bufferPos = parseBuffer.data(); 476 477 // first copy everything before the path from the base 478 unsigned baseLength = base.m_string.length(); 479 const UChar* baseCharacters = base.m_string.characters(); 480 CharBuffer baseStringBuffer(baseLength); 481 copyASCII(baseCharacters, baseLength, baseStringBuffer.data()); 482 const char* baseString = baseStringBuffer.data(); 483 const char* baseStringStart = baseString; 484 const char* pathStart = baseStringStart + base.m_portEnd; 485 while (baseStringStart < pathStart) 486 *bufferPos++ = *baseStringStart++; 487 char* bufferPathStart = bufferPos; 488 489 // now copy the base path 490 const char* baseStringEnd = baseString + base.m_pathEnd; 491 492 // go back to the last slash 493 while (baseStringEnd > baseStringStart && baseStringEnd[-1] != '/') 494 baseStringEnd--; 495 496 if (baseStringEnd == baseStringStart) { 497 // no path in base, add a path separator if necessary 498 if (base.m_schemeEnd + 1 != base.m_pathEnd && *str && *str != '?' && *str != '#') 499 *bufferPos++ = '/'; 500 } else { 501 bufferPos += copyPathRemovingDots(bufferPos, baseStringStart, 0, baseStringEnd - baseStringStart); 502 } 503 504 const char* relStringStart = str; 505 const char* relStringPos = relStringStart; 506 507 while (*relStringPos && *relStringPos != '?' && *relStringPos != '#') { 508 if (relStringPos[0] == '.' && bufferPos[-1] == '/') { 509 if (isPathSegmentEndChar(relStringPos[1])) { 510 // skip over "." segment 511 relStringPos += 1; 512 if (relStringPos[0] == '/') 513 relStringPos++; 514 continue; 515 } else if (relStringPos[1] == '.' && isPathSegmentEndChar(relStringPos[2])) { 516 // skip over ".." segment and rewind the last segment 517 // the RFC leaves it up to the app to decide what to do with excess 518 // ".." segments - we choose to drop them since some web content 519 // relies on this. 
520 relStringPos += 2; 521 if (relStringPos[0] == '/') 522 relStringPos++; 523 if (bufferPos > bufferPathStart + 1) 524 bufferPos--; 525 while (bufferPos > bufferPathStart + 1 && bufferPos[-1] != '/') 526 bufferPos--; 527 continue; 528 } 529 } 530 531 *bufferPos = *relStringPos; 532 relStringPos++; 533 bufferPos++; 534 } 535 536 // all done with the path work, now copy any remainder 537 // of the relative reference; this will also add a null terminator 538 strcpy(bufferPos, relStringPos); 539 540 parse(parseBuffer.data(), 0); 541 542 ASSERT(strlen(parseBuffer.data()) + 1 <= parseBuffer.size()); 543 break; 544 } 545 } 546 } 547} 548 549KURL KURL::copy() const 550{ 551 KURL result = *this; 552 result.m_string = result.m_string.crossThreadString(); 553 return result; 554} 555 556bool KURL::hasPath() const 557{ 558 return m_pathEnd != m_portEnd; 559} 560 561String KURL::lastPathComponent() const 562{ 563 if (!hasPath()) 564 return String(); 565 566 unsigned end = m_pathEnd - 1; 567 if (m_string[end] == '/') 568 --end; 569 570 size_t start = m_string.reverseFind('/', end); 571 if (start < static_cast<unsigned>(m_portEnd)) 572 return String(); 573 ++start; 574 575 return m_string.substring(start, end - start + 1); 576} 577 578String KURL::protocol() const 579{ 580 return m_string.left(m_schemeEnd); 581} 582 583String KURL::host() const 584{ 585 int start = hostStart(); 586 return decodeURLEscapeSequences(m_string.substring(start, m_hostEnd - start)); 587} 588 589unsigned short KURL::port() const 590{ 591 // We return a port of 0 if there is no port specified. This can happen in two situations: 592 // 1) The URL contains no colon after the host name and before the path component of the URL. 593 // 2) The URL contains a colon but there's no port number before the path component of the URL begins. 
594 if (m_hostEnd == m_portEnd || m_hostEnd == m_portEnd - 1) 595 return 0; 596 597 const UChar* stringData = m_string.characters(); 598 bool ok = false; 599 unsigned number = charactersToUIntStrict(stringData + m_hostEnd + 1, m_portEnd - m_hostEnd - 1, &ok); 600 if (!ok || number > maximumValidPortNumber) 601 return invalidPortNumber; 602 return number; 603} 604 605String KURL::pass() const 606{ 607 if (m_passwordEnd == m_userEnd) 608 return String(); 609 610 return decodeURLEscapeSequences(m_string.substring(m_userEnd + 1, m_passwordEnd - m_userEnd - 1)); 611} 612 613String KURL::user() const 614{ 615 return decodeURLEscapeSequences(m_string.substring(m_userStart, m_userEnd - m_userStart)); 616} 617 618String KURL::fragmentIdentifier() const 619{ 620 if (m_fragmentEnd == m_queryEnd) 621 return String(); 622 623 return m_string.substring(m_queryEnd + 1, m_fragmentEnd - (m_queryEnd + 1)); 624} 625 626bool KURL::hasFragmentIdentifier() const 627{ 628 return m_fragmentEnd != m_queryEnd; 629} 630 631void KURL::copyParsedQueryTo(ParsedURLParameters& parameters) const 632{ 633 const UChar* pos = m_string.characters() + m_pathEnd + 1; 634 const UChar* end = m_string.characters() + m_queryEnd; 635 while (pos < end) { 636 const UChar* parameterStart = pos; 637 while (pos < end && *pos != '&') 638 ++pos; 639 const UChar* parameterEnd = pos; 640 if (pos < end) { 641 ASSERT(*pos == '&'); 642 ++pos; 643 } 644 if (parameterStart == parameterEnd) 645 continue; 646 const UChar* nameStart = parameterStart; 647 const UChar* equalSign = parameterStart; 648 while (equalSign < parameterEnd && *equalSign != '=') 649 ++equalSign; 650 if (equalSign == nameStart) 651 continue; 652 String name(nameStart, equalSign - nameStart); 653 String value = equalSign == parameterEnd ? 
String() : String(equalSign + 1, parameterEnd - equalSign - 1); 654 parameters.set(name, value); 655 } 656} 657 658String KURL::baseAsString() const 659{ 660 return m_string.left(m_pathAfterLastSlash); 661} 662 663#ifdef NDEBUG 664 665static inline void assertProtocolIsGood(const char*) 666{ 667} 668 669#else 670 671static void assertProtocolIsGood(const char* protocol) 672{ 673 const char* p = protocol; 674 while (*p) { 675 ASSERT(*p > ' ' && *p < 0x7F && !(*p >= 'A' && *p <= 'Z')); 676 ++p; 677 } 678} 679 680#endif 681 682bool KURL::protocolIs(const char* protocol) const 683{ 684 assertProtocolIsGood(protocol); 685 686 // JavaScript URLs are "valid" and should be executed even if KURL decides they are invalid. 687 // The free function protocolIsJavaScript() should be used instead. 688 ASSERT(!equalIgnoringCase(protocol, String("javascript"))); 689 690 if (!m_isValid) 691 return false; 692 693 // Do the comparison without making a new string object. 694 for (int i = 0; i < m_schemeEnd; ++i) { 695 if (!protocol[i] || toASCIILower(m_string[i]) != protocol[i]) 696 return false; 697 } 698 return !protocol[m_schemeEnd]; // We should have consumed all characters in the argument. 699} 700 701String KURL::query() const 702{ 703 if (m_queryEnd == m_pathEnd) 704 return String(); 705 706 return m_string.substring(m_pathEnd + 1, m_queryEnd - (m_pathEnd + 1)); 707} 708 709String KURL::path() const 710{ 711 return m_string.substring(m_portEnd, m_pathEnd - m_portEnd); 712} 713 714bool KURL::setProtocol(const String& s) 715{ 716 // Firefox and IE remove everything after the first ':'. 
717 size_t separatorPosition = s.find(':'); 718 String newProtocol = s.substring(0, separatorPosition); 719 720 if (!isValidProtocol(newProtocol)) 721 return false; 722 723 if (!m_isValid) { 724 parse(newProtocol + ":" + m_string); 725 return true; 726 } 727 728 parse(newProtocol + m_string.substring(m_schemeEnd)); 729 return true; 730} 731 732void KURL::setHost(const String& s) 733{ 734 if (!m_isValid) 735 return; 736 737 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 738 // and to avoid changing more than just the host. 739 740 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 741 742 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + s + m_string.substring(m_hostEnd)); 743} 744 745void KURL::removePort() 746{ 747 if (m_hostEnd == m_portEnd) 748 return; 749 parse(m_string.left(m_hostEnd) + m_string.substring(m_portEnd)); 750} 751 752void KURL::setPort(unsigned short i) 753{ 754 if (!m_isValid) 755 return; 756 757 bool colonNeeded = m_portEnd == m_hostEnd; 758 int portStart = (colonNeeded ? m_hostEnd : m_hostEnd + 1); 759 760 parse(m_string.left(portStart) + (colonNeeded ? ":" : "") + String::number(i) + m_string.substring(m_portEnd)); 761} 762 763void KURL::setHostAndPort(const String& hostAndPort) 764{ 765 if (!m_isValid) 766 return; 767 768 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 769 // and to avoid changing more than just host and port. 770 771 bool slashSlashNeeded = m_userStart == m_schemeEnd + 1; 772 773 parse(m_string.left(hostStart()) + (slashSlashNeeded ? "//" : "") + hostAndPort + m_string.substring(m_portEnd)); 774} 775 776void KURL::setUser(const String& user) 777{ 778 if (!m_isValid) 779 return; 780 781 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 782 // and to avoid changing more than just the user login. 
783 String u; 784 int end = m_userEnd; 785 if (!user.isEmpty()) { 786 u = user; 787 if (m_userStart == m_schemeEnd + 1) 788 u = "//" + u; 789 // Add '@' if we didn't have one before. 790 if (end == m_hostEnd || (end == m_passwordEnd && m_string[end] != '@')) 791 u.append('@'); 792 } else { 793 // Remove '@' if we now have neither user nor password. 794 if (m_userEnd == m_passwordEnd && end != m_hostEnd && m_string[end] == '@') 795 end += 1; 796 } 797 parse(m_string.left(m_userStart) + u + m_string.substring(end)); 798} 799 800void KURL::setPass(const String& password) 801{ 802 if (!m_isValid) 803 return; 804 805 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations, 806 // and to avoid changing more than just the user password. 807 String p; 808 int end = m_passwordEnd; 809 if (!password.isEmpty()) { 810 p = ":" + password + "@"; 811 if (m_userEnd == m_schemeEnd + 1) 812 p = "//" + p; 813 // Eat the existing '@' since we are going to add our own. 814 if (end != m_hostEnd && m_string[end] == '@') 815 end += 1; 816 } else { 817 // Remove '@' if we now have neither user nor password. 818 if (m_userStart == m_userEnd && end != m_hostEnd && m_string[end] == '@') 819 end += 1; 820 } 821 parse(m_string.left(m_userEnd) + p + m_string.substring(end)); 822} 823 824void KURL::setFragmentIdentifier(const String& s) 825{ 826 if (!m_isValid) 827 return; 828 829 // FIXME: Non-ASCII characters must be encoded and escaped to match parse() expectations. 830 parse(m_string.left(m_queryEnd) + "#" + s); 831} 832 833void KURL::removeFragmentIdentifier() 834{ 835 if (!m_isValid) 836 return; 837 parse(m_string.left(m_queryEnd)); 838} 839 840void KURL::setQuery(const String& query) 841{ 842 if (!m_isValid) 843 return; 844 845 // FIXME: '#' and non-ASCII characters must be encoded and escaped. 846 // Usually, the query is encoded using document encoding, not UTF-8, but we don't have 847 // access to the document in this function. 
848 if ((query.isEmpty() || query[0] != '?') && !query.isNull()) 849 parse(m_string.left(m_pathEnd) + "?" + query + m_string.substring(m_queryEnd)); 850 else 851 parse(m_string.left(m_pathEnd) + query + m_string.substring(m_queryEnd)); 852 853} 854 855void KURL::setPath(const String& s) 856{ 857 if (!m_isValid) 858 return; 859 860 // FIXME: encodeWithURLEscapeSequences does not correctly escape '#' and '?', so fragment and query parts 861 // may be inadvertently affected. 862 String path = s; 863 if (path.isEmpty() || path[0] != '/') 864 path = "/" + path; 865 866 parse(m_string.left(m_portEnd) + encodeWithURLEscapeSequences(path) + m_string.substring(m_pathEnd)); 867} 868 869String KURL::deprecatedString() const 870{ 871 if (!m_isValid) 872 return m_string; 873 874 Vector<UChar> result; 875 876 append(result, protocol()); 877 result.append(':'); 878 879 Vector<UChar> authority; 880 881 if (m_hostEnd != m_passwordEnd) { 882 if (m_userEnd != m_userStart) { 883 append(authority, user()); 884 authority.append('@'); 885 } 886 append(authority, host()); 887 if (hasPort()) { 888 authority.append(':'); 889 append(authority, String::number(port())); 890 } 891 } 892 893 if (!authority.isEmpty()) { 894 result.append('/'); 895 result.append('/'); 896 result.append(authority); 897 } else if (protocolIs("file")) { 898 result.append('/'); 899 result.append('/'); 900 } 901 902 append(result, path()); 903 904 if (m_pathEnd != m_queryEnd) { 905 result.append('?'); 906 append(result, query()); 907 } 908 909 if (m_fragmentEnd != m_queryEnd) { 910 result.append('#'); 911 append(result, fragmentIdentifier()); 912 } 913 914 return String::adopt(result); 915} 916 917String decodeURLEscapeSequences(const String& str) 918{ 919 return decodeURLEscapeSequences(str, UTF8Encoding()); 920} 921 922String decodeURLEscapeSequences(const String& str, const TextEncoding& encoding) 923{ 924 Vector<UChar> result; 925 926 CharBuffer buffer; 927 928 unsigned length = str.length(); 929 unsigned 
decodedPosition = 0; 930 unsigned searchPosition = 0; 931 size_t encodedRunPosition; 932 while ((encodedRunPosition = str.find('%', searchPosition)) != notFound) { 933 // Find the sequence of %-escape codes. 934 unsigned encodedRunEnd = encodedRunPosition; 935 while (length - encodedRunEnd >= 3 936 && str[encodedRunEnd] == '%' 937 && isASCIIHexDigit(str[encodedRunEnd + 1]) 938 && isASCIIHexDigit(str[encodedRunEnd + 2])) 939 encodedRunEnd += 3; 940 searchPosition = encodedRunEnd; 941 if (encodedRunEnd == encodedRunPosition) { 942 ++searchPosition; 943 continue; 944 } 945 946 // Decode the %-escapes into bytes. 947 unsigned runLength = (encodedRunEnd - encodedRunPosition) / 3; 948 buffer.resize(runLength); 949 char* p = buffer.data(); 950 const UChar* q = str.characters() + encodedRunPosition; 951 for (unsigned i = 0; i < runLength; ++i) { 952 *p++ = (hexDigitValue(q[1]) << 4) | hexDigitValue(q[2]); 953 q += 3; 954 } 955 956 // Decode the bytes into Unicode characters. 957 String decoded = (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data()); 958 if (decoded.isEmpty()) 959 continue; 960 961 // Build up the string with what we just skipped and what we just decoded. 962 result.append(str.characters() + decodedPosition, encodedRunPosition - decodedPosition); 963 result.append(decoded.characters(), decoded.length()); 964 decodedPosition = encodedRunEnd; 965 } 966 967 result.append(str.characters() + decodedPosition, length - decodedPosition); 968 969 return String::adopt(result); 970} 971 972// Caution: This function does not bounds check. 
973static void appendEscapedChar(char*& buffer, unsigned char c) 974{ 975 *buffer++ = '%'; 976 placeByteAsHex(c, buffer); 977} 978 979static void appendEscapingBadChars(char*& buffer, const char* strStart, size_t length) 980{ 981 char* p = buffer; 982 983 const char* str = strStart; 984 const char* strEnd = strStart + length; 985 while (str < strEnd) { 986 unsigned char c = *str++; 987 if (isBadChar(c)) { 988 if (c == '%' || c == '?') 989 *p++ = c; 990 else if (c != 0x09 && c != 0x0a && c != 0x0d) 991 appendEscapedChar(p, c); 992 } else 993 *p++ = c; 994 } 995 996 buffer = p; 997} 998 999static void escapeAndAppendNonHierarchicalPart(char*& buffer, const char* strStart, size_t length) 1000{ 1001 char* p = buffer; 1002 1003 const char* str = strStart; 1004 const char* strEnd = strStart + length; 1005 while (str < strEnd) { 1006 unsigned char c = *str++; 1007 // Strip CR, LF and Tab from fragments, per: 1008 // https://bugs.webkit.org/show_bug.cgi?id=8770 1009 if (c == 0x09 || c == 0x0a || c == 0x0d) 1010 continue; 1011 1012 // Chrome and IE allow non-ascii characters in fragments, however doing 1013 // so would hit an ASSERT in checkEncodedString, so for now we don't. 1014 if (c < 0x20 || c >= 127) { 1015 appendEscapedChar(p, c); 1016 continue; 1017 } 1018 *p++ = c; 1019 } 1020 1021 buffer = p; 1022} 1023 1024// copy a path, accounting for "." and ".." 
// segments
//
// Copies src[srcStart, srcEnd) to dst while dropping "." segments and resolving
// ".." segments against what has already been written. A NUL terminator is
// written after the copied path. Returns the number of characters written, not
// counting the terminator. An empty range writes only the terminator.
// Note: the loop reads src[srcEnd] (and one past a trailing '.'), so src must stay
// readable there; KURL::parse() passes a NUL-terminated string.
static int copyPathRemovingDots(char* dst, const char* src, int srcStart, int srcEnd)
{
    char* bufferPathStart = dst;

    // empty path is a special case, and need not have a leading slash
    if (srcStart != srcEnd) {
        const char* baseStringStart = src + srcStart;
        const char* baseStringEnd = src + srcEnd;
        const char* baseStringPos = baseStringStart;

        // this code is unprepared for paths that do not begin with a
        // slash and we should always have one in the source string
        ASSERT(baseStringPos[0] == '/');

        // copy the leading slash into the destination
        *dst = *baseStringPos;
        baseStringPos++;
        dst++;

        while (baseStringPos < baseStringEnd) {
            // A '.' only starts a dot segment when it directly follows a '/' in the output.
            if (baseStringPos[0] == '.' && dst[-1] == '/') {
                if (baseStringPos[1] == '/' || baseStringPos + 1 == baseStringEnd) {
                    // skip over "." segment
                    baseStringPos += 2;
                    continue;
                } else if (baseStringPos[1] == '.' && (baseStringPos[2] == '/' ||
                                       baseStringPos + 2 == baseStringEnd)) {
                    // skip over ".." segment and rewind the last segment
                    // the RFC leaves it up to the app to decide what to do with excess
                    // ".." segments - we choose to drop them since some web content
                    // relies on this.
                    baseStringPos += 3;
                    // Step back over the '/' that ends the previous segment (never over
                    // the leading slash), then back to the '/' before that segment.
                    if (dst > bufferPathStart + 1)
                        dst--;
                    while (dst > bufferPathStart && dst[-1] != '/')
                        dst--;
                    continue;
                }
            }

            *dst = *baseStringPos;
            baseStringPos++;
            dst++;
        }
    }
    *dst = '\0';
    return dst - bufferPathStart;
}

// Returns true if the NUL-terminated string contains a '.' that is immediately
// preceded by '/' or by another '.'. The first character is only ever used as
// the "previous" character. KURL::parse() uses this as a cheap test for whether
// the dot-removal pass is needed.
static inline bool hasSlashDotOrDotDot(const char* str)
{
    const unsigned char* p = reinterpret_cast<const unsigned char*>(str);
    if (!*p)
        return false;
    unsigned char pc = *p; // previous character
    while (unsigned char c = *++p) {
        if (c == '.' && (pc == '/' || pc == '.'))
            return true;
        pc = c;
    }
    return false;
}

// Compares c, with bit 0x20 forced on, against lowercaseLetter. For ASCII letters
// this is a case-insensitive match; every call site in this file passes a
// lowercase letter as the second argument.
static inline bool matchLetter(char c, char lowercaseLetter)
{
    return (c | 0x20) == lowercaseLetter;
}

// Parses a String by copying it into a NUL-terminated char buffer and handing
// that to the const char* overload, together with the original String so that
// it can be reused when parsing does not change the text.
void KURL::parse(const String& string)
{
    checkEncodedString(string);

    CharBuffer buffer(string.length() + 1);
    copyASCII(string.characters(), string.length(), buffer.data());
    buffer[string.length()] = '\0';
    parse(buffer.data(), &string);
}

// Returns true when both lengths are the same and strncmp() finds no difference
// in the first lenA characters. The comparison is case-sensitive.
static inline bool equal(const char* a, size_t lenA, const char* b, size_t lenB)
{
    if (lenA != lenB)
        return false;
    return !strncmp(a, b, lenA);
}

// List of default schemes is taken from google-url:
// http://code.google.com/p/google-url/source/browse/trunk/src/url_canon_stdurl.cc#120
//
// Returns true if the port text is exactly the default port of the scheme:
// ws/80, ftp/21, wss/443, http/80, https/443, gopher/70. Both comparisons are
// exact text matches; KURL::parse() passes the scheme it has already lowercased.
static inline bool isDefaultPortForScheme(const char* port, size_t portLength, const char* scheme, size_t schemeLength)
{
    // This switch is theoretically a performance optimization.  It came over when
    // the code was moved from google-url, but may be removed later.
    switch (schemeLength) {
    case 2:
        return equal("ws", 2, scheme, schemeLength) && equal("80", 2, port, portLength);
    case 3:
        if (equal("ftp", 3, scheme, schemeLength))
            return equal("21", 2, port, portLength);
        if (equal("wss", 3, scheme, schemeLength))
            return equal("443", 3, port, portLength);
        break;
    case 4:
        return equal("http", 4, scheme, schemeLength) && equal("80", 2, port, portLength);
    case 5:
        return equal("https", 5, scheme, schemeLength) && equal("443", 3, port, portLength);
    case 6:
        return equal("gopher", 6, scheme, schemeLength) && equal("70", 2, port, portLength);
    }
    return false;
}

// True when the character at the end of the user name is '@' but the host and
// port range that follows is empty.
static inline bool hostPortIsEmptyButCredentialsArePresent(int hostStart, int portEnd, char userEndChar)
{
    return userEndChar == '@' && hostStart == portEnd;
}

// Returns true for the schemes ws, ftp, wss, http, https and gopher.
// NOTE(review): the comparison is case-sensitive and KURL::parse() passes the raw
// input here, so an upper-case scheme such as "HTTP" does not match - confirm
// whether that is intended.
static bool isNonFileHierarchicalScheme(const char* scheme, size_t schemeLength)
{
    switch (schemeLength) {
    case 2:
        return equal("ws", 2, scheme, schemeLength);
    case 3:
        return equal("ftp", 3, scheme, schemeLength) || equal("wss", 3, scheme, schemeLength);
    case 4:
        return equal("http", 4, scheme, schemeLength);
    case 5:
        return equal("https", 5, scheme, schemeLength);
    case 6:
        return equal("gopher", 6, scheme, schemeLength);
    }
    return false;
}

// Parses a NUL-terminated URL and stores its canonical form in m_string along
// with the offsets of its components (m_schemeEnd ... m_fragmentEnd).
//
// The work happens in two passes:
//   1. Scan `url` and record where scheme, user, password, host, port, path,
//      query and fragment start and end. Any character that does not fit makes
//      the URL invalid: m_string is set to the input text and invalidate() is called.
//   2. Rebuild the URL in a scratch buffer - lowercasing the scheme, omitting a
//      default port, removing dot segments and escaping characters - while
//      recording the component offsets in the rebuilt text.
//
// originalString, when non-null, is the String the characters came from; it is
// stored instead of a fresh copy whenever the rebuilt text is identical to it.
void KURL::parse(const char* url, const String* originalString)
{
    if (!url || url[0] == '\0') {
        // valid URL must be non-empty
        m_string = originalString ? *originalString : url;
        invalidate();
        return;
    }

    if (!isSchemeFirstChar(url[0])) {
        // scheme must start with an alphabetic character
        m_string = originalString ? *originalString : url;
        invalidate();
        return;
    }

    int schemeEnd = 0;
    while (isSchemeChar(url[schemeEnd]))
        schemeEnd++;

    // The scheme must be terminated by ':'.
    if (url[schemeEnd] != ':') {
        m_string = originalString ? *originalString : url;
        invalidate();
        return;
    }

    // Offsets into `url` (the input), filled in by the scan below.
    int userStart = schemeEnd + 1;
    int userEnd;
    int passwordStart;
    int passwordEnd;
    int hostStart;
    int hostEnd;
    int portStart;
    int portEnd;

    bool hierarchical = url[schemeEnd + 1] == '/';
    bool hasSecondSlash = hierarchical && url[schemeEnd + 2] == '/';

    bool isFile = schemeEnd == 4
        && matchLetter(url[0], 'f')
        && matchLetter(url[1], 'i')
        && matchLetter(url[2], 'l')
        && matchLetter(url[3], 'e');

    // "http:" or "https:", in any letter case.
    m_protocolInHTTPFamily = matchLetter(url[0], 'h')
        && matchLetter(url[1], 't')
        && matchLetter(url[2], 't')
        && matchLetter(url[3], 'p')
        && (url[4] == ':' || (matchLetter(url[4], 's') && url[5] == ':'));

    if ((hierarchical && hasSecondSlash) || isNonFileHierarchicalScheme(url, schemeEnd)) {
        // The part after the scheme is either a net_path or an abs_path whose first path segment is empty.
        // Attempt to find an authority.
        // FIXME: Authority characters may be scanned twice, and it would be nice to be faster.

        // Skip the (up to two) slashes that follow the scheme.
        if (hierarchical)
            userStart++;
        if (hasSecondSlash)
            userStart++;
        userEnd = userStart;

        // Scan user-info characters, remembering the first ':' (0 means "none seen").
        int colonPos = 0;
        while (isUserInfoChar(url[userEnd])) {
            if (url[userEnd] == ':' && colonPos == 0)
                colonPos = userEnd;
            userEnd++;
        }

        if (url[userEnd] == '@') {
            // actual end of the userinfo, start on the host
            if (colonPos != 0) {
                passwordEnd = userEnd;
                userEnd = colonPos;
                passwordStart = colonPos + 1;
            } else
                passwordStart = passwordEnd = userEnd;

            hostStart = passwordEnd + 1;
        } else if (url[userEnd] == '[' || isPathSegmentEndChar(url[userEnd])) {
            // hit the end of the authority, must have been no user
            // or looks like an IPv6 hostname
            // either way, try to parse it as a hostname
            userEnd = userStart;
            passwordStart = passwordEnd = userEnd;
            hostStart = userStart;
        } else {
            // invalid character
            m_string = originalString ? *originalString : url;
            invalidate();
            return;
        }

        hostEnd = hostStart;

        // IPV6 IP address
        if (url[hostEnd] == '[') {
            hostEnd++;
            while (isIPv6Char(url[hostEnd]))
                hostEnd++;
            if (url[hostEnd] == ']')
                hostEnd++;
            else {
                // invalid character
                m_string = originalString ? *originalString : url;
                invalidate();
                return;
            }
        } else {
            while (isHostnameChar(url[hostEnd]))
                hostEnd++;
        }

        if (url[hostEnd] == ':') {
            portStart = portEnd = hostEnd + 1;

            // possible start of port
            portEnd = portStart;
            while (isASCIIDigit(url[portEnd]))
                portEnd++;
        } else
            portStart = portEnd = hostEnd;

        // The authority must be followed by a path-segment end character.
        if (!isPathSegmentEndChar(url[portEnd])) {
            // invalid character
            m_string = originalString ? *originalString : url;
            invalidate();
            return;
        }

        if (hostPortIsEmptyButCredentialsArePresent(hostStart, portEnd, url[userEnd])) {
            // in this circumstance, act as if there is an erroneous hostname containing an '@'
            userEnd = userStart;
            hostStart = userEnd;
        }

        if (userStart == portEnd && !m_protocolInHTTPFamily && !isFile) {
            // No authority found, which means that this is not a net_path, but rather an abs_path whose first two
            // path segments are empty. For file, http and https only, an empty authority is allowed.
            userStart -= 2;
            userEnd = userStart;
            passwordStart = userEnd;
            passwordEnd = passwordStart;
            hostStart = passwordEnd;
            hostEnd = hostStart;
            portStart = hostEnd;
            portEnd = hostEnd;
        }
    } else {
        // the part after the scheme must be an opaque_part or an abs_path
        userEnd = userStart;
        passwordStart = passwordEnd = userEnd;
        hostStart = hostEnd = passwordEnd;
        portStart = portEnd = hostEnd;
    }

    // The path runs up to the first '?' or '#', or to the end of the string.
    int pathStart = portEnd;
    int pathEnd = pathStart;
    while (url[pathEnd] && url[pathEnd] != '?' && url[pathEnd] != '#')
        pathEnd++;

    // The query range includes its leading '?' and runs up to '#' or the end.
    int queryStart = pathEnd;
    int queryEnd = queryStart;
    if (url[queryStart] == '?') {
        while (url[queryEnd] && url[queryEnd] != '#')
            queryEnd++;
    }

    // The fragment range excludes its leading '#' and runs to the end.
    int fragmentStart = queryEnd;
    int fragmentEnd = fragmentStart;
    if (url[fragmentStart] == '#') {
        fragmentStart++;
        fragmentEnd = fragmentStart;
        while (url[fragmentEnd])
            fragmentEnd++;
    }

    // assemble it all, remembering the real ranges

    // Three output characters per input character (a %XX escape), plus one.
    Vector<char, 4096> buffer(fragmentEnd * 3 + 1);

    char *p = buffer.data();
    const char *strPtr = url;

    // copy in the scheme
    const char *schemeEndPtr = url + schemeEnd;
    while (strPtr < schemeEndPtr)
        *p++ = toASCIILower(*strPtr++);
    m_schemeEnd = p - buffer.data();

    // True when the whole span from userStart to portEnd is exactly "localhost" (any case).
    bool hostIsLocalHost = portEnd - userStart == 9
        && matchLetter(url[userStart], 'l')
        && matchLetter(url[userStart+1], 'o')
        && matchLetter(url[userStart+2], 'c')
        && matchLetter(url[userStart+3], 'a')
        && matchLetter(url[userStart+4], 'l')
        && matchLetter(url[userStart+5], 'h')
        && matchLetter(url[userStart+6], 'o')
        && matchLetter(url[userStart+7], 's')
        && matchLetter(url[userStart+8], 't');

    // File URLs need a host part unless it is just file:// or file://localhost
    bool degenFilePath = pathStart == pathEnd && (hostStart == hostEnd || hostIsLocalHost);

    bool haveNonHostAuthorityPart = userStart != userEnd || passwordStart != passwordEnd || portStart != portEnd;

    // add ":" after scheme
    *p++ = ':';

    // if we have at least one authority part or a file URL - add "//" and authority
    if (isFile ? !degenFilePath : (haveNonHostAuthorityPart || hostStart != hostEnd)) {
        *p++ = '/';
        *p++ = '/';

        m_userStart = p - buffer.data();

        // copy in the user
        strPtr = url + userStart;
        const char* userEndPtr = url + userEnd;
        while (strPtr < userEndPtr)
            *p++ = *strPtr++;
        m_userEnd = p - buffer.data();

        // copy in the password
        if (passwordEnd != passwordStart) {
            *p++ = ':';
            strPtr = url + passwordStart;
            const char* passwordEndPtr = url + passwordEnd;
            while (strPtr < passwordEndPtr)
                *p++ = *strPtr++;
        }
        m_passwordEnd = p - buffer.data();

        // If we had any user info, add "@"
        if (p - buffer.data() != m_userStart)
            *p++ = '@';

        // copy in the host, except in the case of a file URL with authority="localhost"
        if (!(isFile && hostIsLocalHost && !haveNonHostAuthorityPart)) {
            strPtr = url + hostStart;
            const char* hostEndPtr = url + hostEnd;
            while (strPtr < hostEndPtr)
                *p++ = *strPtr++;
        }
        m_hostEnd = p - buffer.data();

        // Copy in the port if the URL has one (and it's not default).
        if (hostEnd != portStart) {
            const char* portStr = url + portStart;
            size_t portLength = portEnd - portStart;
            // The scheme passed here is the lowercased copy at the start of the output buffer.
            if (portLength && !isDefaultPortForScheme(portStr, portLength, buffer.data(), m_schemeEnd)) {
                *p++ = ':';
                const char* portEndPtr = url + portEnd;
                while (portStr < portEndPtr)
                    *p++ = *portStr++;
            }
        }
        m_portEnd = p - buffer.data();
    } else
        m_userStart = m_userEnd = m_passwordEnd = m_hostEnd = m_portEnd = p - buffer.data();

    // For canonicalization, ensure we have a '/' for no path.
    // Do this only for URL with protocol http or https.
    if (m_protocolInHTTPFamily && pathEnd == pathStart)
        *p++ = '/';

    // add path, escaping bad characters
    if (!hierarchical)
        escapeAndAppendNonHierarchicalPart(p, url + pathStart, pathEnd - pathStart);
    else if (!hasSlashDotOrDotDot(url))
        appendEscapingBadChars(p, url + pathStart, pathEnd - pathStart);
    else {
        // Resolve "." and ".." segments into a temporary buffer first, then escape the result.
        CharBuffer pathBuffer(pathEnd - pathStart + 1);
        size_t length = copyPathRemovingDots(pathBuffer.data(), url, pathStart, pathEnd);
        appendEscapingBadChars(p, pathBuffer.data(), length);
    }

    m_pathEnd = p - buffer.data();

    // Find the position after the last slash in the path, or
    // the position before the path if there are no slashes in it.
    int i;
    for (i = m_pathEnd; i > m_portEnd; --i) {
        if (buffer[i - 1] == '/')
            break;
    }
    m_pathAfterLastSlash = i;

    // add query, escaping bad characters
    appendEscapingBadChars(p, url + queryStart, queryEnd - queryStart);
    m_queryEnd = p - buffer.data();

    // add fragment, escaping bad characters
    if (fragmentEnd != queryEnd) {
        *p++ = '#';
        escapeAndAppendNonHierarchicalPart(p, url + fragmentStart, fragmentEnd - fragmentStart);
    }
    m_fragmentEnd = p - buffer.data();

    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));

    // If we didn't end up actually changing the original string and
    // it was already in a String, reuse it to avoid extra allocation.
    if (originalString && originalString->length() == static_cast<unsigned>(m_fragmentEnd) && strncmp(buffer.data(), url, m_fragmentEnd) == 0)
        m_string = *originalString;
    else
        m_string = String(buffer.data(), m_fragmentEnd);

    m_isValid = true;
}

// Returns true when both URLs have the same m_queryEnd offset and their strings
// agree on every character before that offset, i.e. everything up to and
// including the query.
bool equalIgnoringFragmentIdentifier(const KURL& a, const KURL& b)
{
    if (a.m_queryEnd != b.m_queryEnd)
        return false;
    unsigned queryLength = a.m_queryEnd;
    for (unsigned i = 0; i < queryLength; ++i)
        if (a.string()[i] != b.string()[i])
            return false;
    return true;
}

// Returns true when both URLs have the same scheme text, the same host text and
// the same port() value. The character comparisons are exact.
bool protocolHostAndPortAreEqual(const KURL& a, const KURL& b)
{
    // Cheap length checks first.
    if (a.m_schemeEnd != b.m_schemeEnd)
        return false;

    int hostStartA = a.hostStart();
    int hostLengthA = a.hostEnd() - hostStartA;
    int hostStartB = b.hostStart();
    int hostLengthB = b.hostEnd() - b.hostStart();
    if (hostLengthA != hostLengthB)
        return false;

    // Check the scheme
    for (int i = 0; i < a.m_schemeEnd; ++i)
        if (a.string()[i] != b.string()[i])
            return false;

    // And the host
    for (int i = 0; i < hostLengthA; ++i)
        if (a.string()[hostStartA + i] != b.string()[hostStartB + i])
            return false;

    if (a.port() != b.port())
        return false;

    return true;
}

// Converts the string to UTF-8 and percent-escapes every byte that isBadChar()
// rejects; all other bytes are copied unchanged.
String encodeWithURLEscapeSequences(const String& notEncodedString)
{
    CString asUTF8 = notEncodedString.utf8();

    // Each byte may be written as a three-character escape.
    CharBuffer buffer(asUTF8.length() * 3 + 1);
    char* p = buffer.data();

    const char* str = asUTF8.data();
    const char* strEnd = str + asUTF8.length();
    while (str < strEnd) {
        unsigned char c = *str++;
        if (isBadChar(c))
            appendEscapedChar(p, c);
        else
            *p++ = c;
    }

    ASSERT(p - buffer.data() <= static_cast<int>(buffer.size()));

    return String(buffer.data(), p - buffer.data());
}

// Appends the punycoded hostname identified by the given string
and length to 1529// the output buffer. The result will not be null terminated. 1530static void appendEncodedHostname(UCharBuffer& buffer, const UChar* str, unsigned strLen) 1531{ 1532 // Needs to be big enough to hold an IDN-encoded name. 1533 // For host names bigger than this, we won't do IDN encoding, which is almost certainly OK. 1534 const unsigned hostnameBufferLength = 2048; 1535 1536 if (strLen > hostnameBufferLength || charactersAreAllASCII(str, strLen)) { 1537 buffer.append(str, strLen); 1538 return; 1539 } 1540 1541#if USE(ICU_UNICODE) 1542 UChar hostnameBuffer[hostnameBufferLength]; 1543 UErrorCode error = U_ZERO_ERROR; 1544 int32_t numCharactersConverted = uidna_IDNToASCII(str, strLen, hostnameBuffer, 1545 hostnameBufferLength, UIDNA_ALLOW_UNASSIGNED, 0, &error); 1546 if (error == U_ZERO_ERROR) 1547 buffer.append(hostnameBuffer, numCharactersConverted); 1548#elif USE(QT4_UNICODE) 1549 QByteArray result = QUrl::toAce(String(str, strLen)); 1550 buffer.append(result.constData(), result.length()); 1551#elif USE(GLIB_UNICODE) 1552 GOwnPtr<gchar> utf8Hostname; 1553 GOwnPtr<GError> utf8Err; 1554 utf8Hostname.set(g_utf16_to_utf8(str, strLen, 0, 0, &utf8Err.outPtr())); 1555 if (utf8Err) 1556 return; 1557 1558 GOwnPtr<gchar> encodedHostname; 1559 encodedHostname.set(g_hostname_to_ascii(utf8Hostname.get())); 1560 if (!encodedHostname) 1561 return; 1562 1563 buffer.append(encodedHostname.get(), strlen(encodedHostname.get())); 1564#endif 1565} 1566 1567static void findHostnamesInMailToURL(const UChar* str, int strLen, Vector<pair<int, int> >& nameRanges) 1568{ 1569 // In a mailto: URL, host names come after a '@' character and end with a '>' or ',' or '?' or end of string character. 1570 // Skip quoted strings so that characters in them don't confuse us. 1571 // When we find a '?' character, we are past the part of the URL that contains host names. 
1572 1573 nameRanges.clear(); 1574 1575 int p = 0; 1576 while (1) { 1577 // Find start of host name or of quoted string. 1578 int hostnameOrStringStart = findFirstOf(str, strLen, p, "\"@?"); 1579 if (hostnameOrStringStart == -1) 1580 return; 1581 UChar c = str[hostnameOrStringStart]; 1582 p = hostnameOrStringStart + 1; 1583 1584 if (c == '?') 1585 return; 1586 1587 if (c == '@') { 1588 // Find end of host name. 1589 int hostnameStart = p; 1590 int hostnameEnd = findFirstOf(str, strLen, p, ">,?"); 1591 bool done; 1592 if (hostnameEnd == -1) { 1593 hostnameEnd = strLen; 1594 done = true; 1595 } else { 1596 p = hostnameEnd; 1597 done = false; 1598 } 1599 1600 nameRanges.append(make_pair(hostnameStart, hostnameEnd)); 1601 1602 if (done) 1603 return; 1604 } else { 1605 // Skip quoted string. 1606 ASSERT(c == '"'); 1607 while (1) { 1608 int escapedCharacterOrStringEnd = findFirstOf(str, strLen, p, "\"\\"); 1609 if (escapedCharacterOrStringEnd == -1) 1610 return; 1611 1612 c = str[escapedCharacterOrStringEnd]; 1613 p = escapedCharacterOrStringEnd + 1; 1614 1615 // If we are the end of the string, then break from the string loop back to the host name loop. 1616 if (c == '"') 1617 break; 1618 1619 // Skip escaped character. 1620 ASSERT(c == '\\'); 1621 if (p == strLen) 1622 return; 1623 1624 ++p; 1625 } 1626 } 1627 } 1628} 1629 1630static bool findHostnameInHierarchicalURL(const UChar* str, int strLen, int& startOffset, int& endOffset) 1631{ 1632 // Find the host name in a hierarchical URL. 1633 // It comes after a "://" sequence, with scheme characters preceding, and 1634 // this should be the first colon in the string. 1635 // It ends with the end of the string or a ":" or a path segment ending character. 1636 // If there is a "@" character, the host part is just the part after the "@". 
1637 int separator = findFirstOf(str, strLen, 0, ":"); 1638 if (separator == -1 || separator + 2 >= strLen || 1639 str[separator + 1] != '/' || str[separator + 2] != '/') 1640 return false; 1641 1642 // Check that all characters before the :// are valid scheme characters. 1643 if (!isSchemeFirstChar(str[0])) 1644 return false; 1645 for (int i = 1; i < separator; ++i) { 1646 if (!isSchemeChar(str[i])) 1647 return false; 1648 } 1649 1650 // Start after the separator. 1651 int authorityStart = separator + 3; 1652 1653 // Find terminating character. 1654 int hostnameEnd = strLen; 1655 for (int i = authorityStart; i < strLen; ++i) { 1656 UChar c = str[i]; 1657 if (c == ':' || (isPathSegmentEndChar(c) && c != 0)) { 1658 hostnameEnd = i; 1659 break; 1660 } 1661 } 1662 1663 // Find "@" for the start of the host name. 1664 int userInfoTerminator = findFirstOf(str, strLen, authorityStart, "@"); 1665 int hostnameStart; 1666 if (userInfoTerminator == -1 || userInfoTerminator > hostnameEnd) 1667 hostnameStart = authorityStart; 1668 else 1669 hostnameStart = userInfoTerminator + 1; 1670 1671 startOffset = hostnameStart; 1672 endOffset = hostnameEnd; 1673 return true; 1674} 1675 1676// Converts all hostnames found in the given input to punycode, preserving the 1677// rest of the URL unchanged. The output will NOT be null-terminated. 
1678static void encodeHostnames(const String& str, UCharBuffer& output) 1679{ 1680 output.clear(); 1681 1682 if (protocolIs(str, "mailto")) { 1683 Vector<pair<int, int> > hostnameRanges; 1684 findHostnamesInMailToURL(str.characters(), str.length(), hostnameRanges); 1685 int n = hostnameRanges.size(); 1686 int p = 0; 1687 for (int i = 0; i < n; ++i) { 1688 const pair<int, int>& r = hostnameRanges[i]; 1689 output.append(&str.characters()[p], r.first - p); 1690 appendEncodedHostname(output, &str.characters()[r.first], r.second - r.first); 1691 p = r.second; 1692 } 1693 // This will copy either everything after the last hostname, or the 1694 // whole thing if there is no hostname. 1695 output.append(&str.characters()[p], str.length() - p); 1696 } else { 1697 int hostStart, hostEnd; 1698 if (findHostnameInHierarchicalURL(str.characters(), str.length(), hostStart, hostEnd)) { 1699 output.append(str.characters(), hostStart); // Before hostname. 1700 appendEncodedHostname(output, &str.characters()[hostStart], hostEnd - hostStart); 1701 output.append(&str.characters()[hostEnd], str.length() - hostEnd); // After hostname. 1702 } else { 1703 // No hostname to encode, return the input. 1704 output.append(str.characters(), str.length()); 1705 } 1706 } 1707} 1708 1709static void encodeRelativeString(const String& rel, const TextEncoding& encoding, CharBuffer& output) 1710{ 1711 UCharBuffer s; 1712 encodeHostnames(rel, s); 1713 1714 TextEncoding pathEncoding(UTF8Encoding()); // Path is always encoded as UTF-8; other parts may depend on the scheme. 1715 1716 int pathEnd = -1; 1717 if (encoding != pathEncoding && encoding.isValid() && !protocolIs(rel, "mailto") && !protocolIs(rel, "data") && !protocolIsJavaScript(rel)) { 1718 // Find the first instance of either # or ?, keep pathEnd at -1 otherwise. 
1719 pathEnd = findFirstOf(s.data(), s.size(), 0, "#?"); 1720 } 1721 1722 if (pathEnd == -1) { 1723 CString decoded = pathEncoding.encode(s.data(), s.size(), URLEncodedEntitiesForUnencodables); 1724 output.resize(decoded.length()); 1725 memcpy(output.data(), decoded.data(), decoded.length()); 1726 } else { 1727 CString pathDecoded = pathEncoding.encode(s.data(), pathEnd, URLEncodedEntitiesForUnencodables); 1728 // Unencodable characters in URLs are represented by converting 1729 // them to XML entities and escaping non-alphanumeric characters. 1730 CString otherDecoded = encoding.encode(s.data() + pathEnd, s.size() - pathEnd, URLEncodedEntitiesForUnencodables); 1731 1732 output.resize(pathDecoded.length() + otherDecoded.length()); 1733 memcpy(output.data(), pathDecoded.data(), pathDecoded.length()); 1734 memcpy(output.data() + pathDecoded.length(), otherDecoded.data(), otherDecoded.length()); 1735 } 1736 output.append('\0'); // null-terminate the output. 1737} 1738 1739static String substituteBackslashes(const String& string) 1740{ 1741 size_t questionPos = string.find('?'); 1742 size_t hashPos = string.find('#'); 1743 unsigned pathEnd; 1744 1745 if (hashPos != notFound && (questionPos == notFound || questionPos > hashPos)) 1746 pathEnd = hashPos; 1747 else if (questionPos != notFound) 1748 pathEnd = questionPos; 1749 else 1750 pathEnd = string.length(); 1751 1752 return string.left(pathEnd).replace('\\','/') + string.substring(pathEnd); 1753} 1754 1755bool KURL::isHierarchical() const 1756{ 1757 if (!m_isValid) 1758 return false; 1759 ASSERT(m_string[m_schemeEnd] == ':'); 1760 return m_string[m_schemeEnd + 1] == '/'; 1761} 1762 1763void KURL::copyToBuffer(CharBuffer& buffer) const 1764{ 1765 // FIXME: This throws away the high bytes of all the characters in the string! 1766 // That's fine for a valid URL, which is all ASCII, but not for invalid URLs. 
1767 buffer.resize(m_string.length()); 1768 copyASCII(m_string.characters(), m_string.length(), buffer.data()); 1769} 1770 1771bool protocolIs(const String& url, const char* protocol) 1772{ 1773 // Do the comparison without making a new string object. 1774 assertProtocolIsGood(protocol); 1775 for (int i = 0; ; ++i) { 1776 if (!protocol[i]) 1777 return url[i] == ':'; 1778 if (toASCIILower(url[i]) != protocol[i]) 1779 return false; 1780 } 1781} 1782 1783bool isValidProtocol(const String& protocol) 1784{ 1785 // RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 1786 if (protocol.isEmpty()) 1787 return false; 1788 if (!isSchemeFirstChar(protocol[0])) 1789 return false; 1790 unsigned protocolLength = protocol.length(); 1791 for (unsigned i = 1; i < protocolLength; i++) { 1792 if (!isSchemeChar(protocol[i])) 1793 return false; 1794 } 1795 return true; 1796} 1797 1798#ifndef NDEBUG 1799void KURL::print() const 1800{ 1801 printf("%s\n", m_string.utf8().data()); 1802} 1803#endif 1804 1805#endif // !USE(GOOGLEURL) 1806 1807String KURL::strippedForUseAsReferrer() const 1808{ 1809 KURL referrer(*this); 1810 referrer.setUser(String()); 1811 referrer.setPass(String()); 1812 referrer.removeFragmentIdentifier(); 1813 return referrer.string(); 1814} 1815 1816bool KURL::isLocalFile() const 1817{ 1818 // Including feed here might be a bad idea since drag and drop uses this check 1819 // and including feed would allow feeds to potentially let someone's blog 1820 // read the contents of the clipboard on a drag, even without a drop. 1821 // Likewise with using the FrameLoader::shouldTreatURLAsLocal() function. 
1822 return protocolIs("file"); 1823} 1824 1825bool protocolIsJavaScript(const String& url) 1826{ 1827 return protocolIs(url, "javascript"); 1828} 1829 1830const KURL& blankURL() 1831{ 1832 DEFINE_STATIC_LOCAL(KURL, staticBlankURL, (ParsedURLString, "about:blank")); 1833 return staticBlankURL; 1834} 1835 1836bool isDefaultPortForProtocol(unsigned short port, const String& protocol) 1837{ 1838 if (protocol.isEmpty()) 1839 return false; 1840 1841 typedef HashMap<String, unsigned, CaseFoldingHash> DefaultPortsMap; 1842 DEFINE_STATIC_LOCAL(DefaultPortsMap, defaultPorts, ()); 1843 if (defaultPorts.isEmpty()) { 1844 defaultPorts.set("http", 80); 1845 defaultPorts.set("https", 443); 1846 defaultPorts.set("ftp", 21); 1847 defaultPorts.set("ftps", 990); 1848 } 1849 return defaultPorts.get(protocol) == port; 1850} 1851 1852bool portAllowed(const KURL& url) 1853{ 1854 unsigned short port = url.port(); 1855 1856 // Since most URLs don't have a port, return early for the "no port" case. 1857 if (!port) 1858 return true; 1859 1860 // This blocked port list matches the port blocking that Mozilla implements. 1861 // See http://www.mozilla.org/projects/netlib/PortBanning.html for more information. 
1862 static const unsigned short blockedPortList[] = { 1863 1, // tcpmux 1864 7, // echo 1865 9, // discard 1866 11, // systat 1867 13, // daytime 1868 15, // netstat 1869 17, // qotd 1870 19, // chargen 1871 20, // FTP-data 1872 21, // FTP-control 1873 22, // SSH 1874 23, // telnet 1875 25, // SMTP 1876 37, // time 1877 42, // name 1878 43, // nicname 1879 53, // domain 1880 77, // priv-rjs 1881 79, // finger 1882 87, // ttylink 1883 95, // supdup 1884 101, // hostriame 1885 102, // iso-tsap 1886 103, // gppitnp 1887 104, // acr-nema 1888 109, // POP2 1889 110, // POP3 1890 111, // sunrpc 1891 113, // auth 1892 115, // SFTP 1893 117, // uucp-path 1894 119, // nntp 1895 123, // NTP 1896 135, // loc-srv / epmap 1897 139, // netbios 1898 143, // IMAP2 1899 179, // BGP 1900 389, // LDAP 1901 465, // SMTP+SSL 1902 512, // print / exec 1903 513, // login 1904 514, // shell 1905 515, // printer 1906 526, // tempo 1907 530, // courier 1908 531, // Chat 1909 532, // netnews 1910 540, // UUCP 1911 556, // remotefs 1912 563, // NNTP+SSL 1913 587, // ESMTP 1914 601, // syslog-conn 1915 636, // LDAP+SSL 1916 993, // IMAP+SSL 1917 995, // POP3+SSL 1918 2049, // NFS 1919 3659, // apple-sasl / PasswordServer [Apple addition] 1920 4045, // lockd 1921 6000, // X11 1922 6665, // Alternate IRC [Apple addition] 1923 6666, // Alternate IRC [Apple addition] 1924 6667, // Standard IRC [Apple addition] 1925 6668, // Alternate IRC [Apple addition] 1926 6669, // Alternate IRC [Apple addition] 1927 invalidPortNumber, // Used to block all invalid port numbers 1928 }; 1929 const unsigned short* const blockedPortListEnd = blockedPortList + WTF_ARRAY_LENGTH(blockedPortList); 1930 1931#ifndef NDEBUG 1932 // The port list must be sorted for binary_search to work. 
1933 static bool checkedPortList = false; 1934 if (!checkedPortList) { 1935 for (const unsigned short* p = blockedPortList; p != blockedPortListEnd - 1; ++p) 1936 ASSERT(*p < *(p + 1)); 1937 checkedPortList = true; 1938 } 1939#endif 1940 1941 // If the port is not in the blocked port list, allow it. 1942 if (!binary_search(blockedPortList, blockedPortListEnd, port)) 1943 return true; 1944 1945 // Allow ports 21 and 22 for FTP URLs, as Mozilla does. 1946 if ((port == 21 || port == 22) && url.protocolIs("ftp")) 1947 return true; 1948 1949 // Allow any port number in a file URL, since the port number is ignored. 1950 if (url.protocolIs("file")) 1951 return true; 1952 1953 return false; 1954} 1955 1956String mimeTypeFromDataURL(const String& url) 1957{ 1958 ASSERT(protocolIs(url, "data")); 1959 size_t index = url.find(';'); 1960 if (index == notFound) 1961 index = url.find(','); 1962 if (index != notFound) { 1963 if (index > 5) 1964 return url.substring(5, index - 5); 1965 return "text/plain"; // Data URLs with no MIME type are considered text/plain. 1966 } 1967 return ""; 1968} 1969 1970} 1971