1// Copyright 2007, Google Inc. 2// All rights reserved. 3// 4// Redistribution and use in source and binary forms, with or without 5// modification, are permitted provided that the following conditions are 6// met: 7// 8// * Redistributions of source code must retain the above copyright 9// notice, this list of conditions and the following disclaimer. 10// * Redistributions in binary form must reproduce the above 11// copyright notice, this list of conditions and the following disclaimer 12// in the documentation and/or other materials provided with the 13// distribution. 14// * Neither the name of Google Inc. nor the names of its 15// contributors may be used to endorse or promote products derived from 16// this software without specific prior written permission. 17// 18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 30// Canonicalizers for random bits that aren't big enough for their own files. 31 32#include <string.h> 33 34#include "googleurl/src/url_canon.h" 35#include "googleurl/src/url_canon_internal.h" 36 37namespace url_canon { 38 39namespace { 40 41// Returns true if the given character should be removed from the middle of a 42// URL. 43inline bool IsRemovableURLWhitespace(int ch) { 44 return ch == '\r' || ch == '\n' || ch == '\t'; 45} 46 47// Backend for RemoveURLWhitespace (see declaration in url_canon.h). 48// It sucks that we have to do this, since this takes about 13% of the total URL 49// canonicalization time. 50template<typename CHAR> 51const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len, 52 CanonOutputT<CHAR>* buffer, 53 int* output_len) { 54 // Fast verification that there's nothing that needs removal. This is the 99% 55 // case, so we want it to be fast and don't care about impacting the speed 56 // when we do find whitespace. 57 int found_whitespace = false; 58 for (int i = 0; i < input_len; i++) { 59 if (!IsRemovableURLWhitespace(input[i])) 60 continue; 61 found_whitespace = true; 62 break; 63 } 64 65 if (!found_whitespace) { 66 // Didn't find any whitespace, we don't need to do anything. We can just 67 // return the input as the output. 68 *output_len = input_len; 69 return input; 70 } 71 72 // Remove the whitespace into the new buffer and return it. 73 for (int i = 0; i < input_len; i++) { 74 if (!IsRemovableURLWhitespace(input[i])) 75 buffer->push_back(input[i]); 76 } 77 *output_len = buffer->length(); 78 return buffer->data(); 79} 80 81// Contains the canonical version of each possible input letter in the scheme 82// (basically, lower-cased). The corresponding entry will be 0 if the letter 83// is not allowed in a scheme. 84const char kSchemeCanonical[0x80] = { 85// 00-1f: all are invalid 86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88// ' ' ! " # $ % & ' ( ) * + , - . / 89 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0, 90// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 91 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0 , 0 , 0 , 0 , 0 , 0 , 92// @ A B C D E F G H I J K L M N O 93 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 94// P Q R S T U V W X Y Z [ \ ] ^ _ 95 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0 , 0, 0 , 0, 96// ` a b c d e f g h i j k l m n o 97 0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 98// p q r s t u v w x y z { | } ~ 99 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0 , 0 , 0 , 0 , 0 }; 100 101// This could be a table lookup as well by setting the high bit for each 102// valid character, but it's only called once per URL, and it makes the lookup 103// table easier to read not having extra stuff in it. 104inline bool IsSchemeFirstChar(unsigned char c) { 105 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 106} 107 108template<typename CHAR, typename UCHAR> 109bool DoScheme(const CHAR* spec, 110 const url_parse::Component& scheme, 111 CanonOutput* output, 112 url_parse::Component* out_scheme) { 113 if (scheme.len <= 0) { 114 // Scheme is unspecified or empty, convert to empty by appending a colon. 115 *out_scheme = url_parse::Component(output->length(), 0); 116 output->push_back(':'); 117 return true; 118 } 119 120 // The output scheme starts from the current position. 121 out_scheme->begin = output->length(); 122 123 // Danger: it's important that this code does not strip any characters: it 124 // only emits the canonical version (be it valid or escaped) of each of 125 // the input characters. Stripping would put it out of sync with 126 // url_util::FindAndCompareScheme, which could cause some security checks on 127 // schemes to be incorrect. 128 bool success = true; 129 int end = scheme.end(); 130 for (int i = scheme.begin; i < end; i++) { 131 UCHAR ch = static_cast<UCHAR>(spec[i]); 132 char replacement = 0; 133 if (ch < 0x80) { 134 if (i == scheme.begin) { 135 // Need to do a special check for the first letter of the scheme. 136 if (IsSchemeFirstChar(static_cast<unsigned char>(ch))) 137 replacement = kSchemeCanonical[ch]; 138 } else { 139 replacement = kSchemeCanonical[ch]; 140 } 141 } 142 143 if (replacement) { 144 output->push_back(replacement); 145 } else if (ch == '%') { 146 // Canonicalizing the scheme multiple times should lead to the same 147 // result. Since invalid characters will be escaped, we need to preserve 148 // the percent to avoid multiple escaping. The scheme will be invalid. 149 success = false; 150 output->push_back('%'); 151 } else { 152 // Invalid character, store it but mark this scheme as invalid. 153 success = false; 154 155 // This will escape the output and also handle encoding issues. 156 // Ignore the return value since we already failed. 157 AppendUTF8EscapedChar(spec, &i, end, output); 158 } 159 } 160 161 // The output scheme ends with the the current position, before appending 162 // the colon. 163 out_scheme->len = output->length() - out_scheme->begin; 164 output->push_back(':'); 165 return success; 166} 167 168// The username and password components reference ranges in the corresponding 169// *_spec strings. Typically, these specs will be the same (we're 170// canonicalizing a single source string), but may be different when 171// replacing components. 172template<typename CHAR, typename UCHAR> 173bool DoUserInfo(const CHAR* username_spec, 174 const url_parse::Component& username, 175 const CHAR* password_spec, 176 const url_parse::Component& password, 177 CanonOutput* output, 178 url_parse::Component* out_username, 179 url_parse::Component* out_password) { 180 if (username.len <= 0 && password.len <= 0) { 181 // Common case: no user info. We strip empty username/passwords. 182 *out_username = url_parse::Component(); 183 *out_password = url_parse::Component(); 184 return true; 185 } 186 187 // Write the username. 188 out_username->begin = output->length(); 189 if (username.len > 0) { 190 // This will escape characters not valid for the username. 191 AppendStringOfType(&username_spec[username.begin], username.len, 192 CHAR_USERINFO, output); 193 } 194 out_username->len = output->length() - out_username->begin; 195 196 // When there is a password, we need the separator. Note that we strip 197 // empty but specified passwords. 198 if (password.len > 0) { 199 output->push_back(':'); 200 out_password->begin = output->length(); 201 AppendStringOfType(&password_spec[password.begin], password.len, 202 CHAR_USERINFO, output); 203 out_password->len = output->length() - out_password->begin; 204 } else { 205 *out_password = url_parse::Component(); 206 } 207 208 output->push_back('@'); 209 return true; 210} 211 212// Helper functions for converting port integers to strings. 213inline void WritePortInt(char* output, int output_len, int port) { 214 _itoa_s(port, output, output_len, 10); 215} 216 217// This function will prepend the colon if there will be a port. 218template<typename CHAR, typename UCHAR> 219bool DoPort(const CHAR* spec, 220 const url_parse::Component& port, 221 int default_port_for_scheme, 222 CanonOutput* output, 223 url_parse::Component* out_port) { 224 int port_num = url_parse::ParsePort(spec, port); 225 if (port_num == url_parse::PORT_UNSPECIFIED || 226 port_num == default_port_for_scheme) { 227 *out_port = url_parse::Component(); 228 return true; // Leave port empty. 229 } 230 231 if (port_num == url_parse::PORT_INVALID) { 232 // Invalid port: We'll copy the text from the input so the user can see 233 // what the error was, and mark the URL as invalid by returning false. 234 output->push_back(':'); 235 out_port->begin = output->length(); 236 AppendInvalidNarrowString(spec, port.begin, port.end(), output); 237 out_port->len = output->length() - out_port->begin; 238 return false; 239 } 240 241 // Convert port number back to an integer. Max port value is 5 digits, and 242 // the Parsed::ExtractPort will have made sure the integer is in range. 243 const int buf_size = 6; 244 char buf[buf_size]; 245 WritePortInt(buf, buf_size, port_num); 246 247 // Append the port number to the output, preceeded by a colon. 248 output->push_back(':'); 249 out_port->begin = output->length(); 250 for (int i = 0; i < buf_size && buf[i]; i++) 251 output->push_back(buf[i]); 252 253 out_port->len = output->length() - out_port->begin; 254 return true; 255} 256 257template<typename CHAR, typename UCHAR> 258void DoCanonicalizeRef(const CHAR* spec, 259 const url_parse::Component& ref, 260 CanonOutput* output, 261 url_parse::Component* out_ref) { 262 if (ref.len < 0) { 263 // Common case of no ref. 264 *out_ref = url_parse::Component(); 265 return; 266 } 267 268 // Append the ref separator. Note that we need to do this even when the ref 269 // is empty but present. 270 output->push_back('#'); 271 out_ref->begin = output->length(); 272 273 // Now iterate through all the characters, converting to UTF-8 and validating. 274 int end = ref.end(); 275 for (int i = ref.begin; i < end; i++) { 276 if (spec[i] == 0) { 277 // IE just strips NULLs, so we do too. 278 continue; 279 } else if (static_cast<UCHAR>(spec[i]) < 0x20) { 280 // Unline IE seems to, we escape control characters. This will probably 281 // make the reference fragment unusable on a web page, but people 282 // shouldn't be using control characters in their anchor names. 283 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output); 284 } else if (static_cast<UCHAR>(spec[i]) < 0x80) { 285 // Normal ASCII characters are just appended. 286 output->push_back(static_cast<char>(spec[i])); 287 } else { 288 // Non-ASCII characters are appended unescaped, but only when they are 289 // valid. Invalid Unicode characters are replaced with the "invalid 290 // character" as IE seems to (ReadUTFChar puts the unicode replacement 291 // character in the output on failure for us). 292 unsigned code_point; 293 ReadUTFChar(spec, &i, end, &code_point); 294 AppendUTF8Value(code_point, output); 295 } 296 } 297 298 out_ref->len = output->length() - out_ref->begin; 299} 300 301} // namespace 302 303const char* RemoveURLWhitespace(const char* input, int input_len, 304 CanonOutputT<char>* buffer, 305 int* output_len) { 306 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); 307} 308 309const char16* RemoveURLWhitespace(const char16* input, int input_len, 310 CanonOutputT<char16>* buffer, 311 int* output_len) { 312 return DoRemoveURLWhitespace(input, input_len, buffer, output_len); 313} 314 315char CanonicalSchemeChar(char16 ch) { 316 if (ch >= 0x80) 317 return 0; // Non-ASCII is not supported by schemes. 318 return kSchemeCanonical[ch]; 319} 320 321bool CanonicalizeScheme(const char* spec, 322 const url_parse::Component& scheme, 323 CanonOutput* output, 324 url_parse::Component* out_scheme) { 325 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme); 326} 327 328bool CanonicalizeScheme(const char16* spec, 329 const url_parse::Component& scheme, 330 CanonOutput* output, 331 url_parse::Component* out_scheme) { 332 return DoScheme<char16, char16>(spec, scheme, output, out_scheme); 333} 334 335bool CanonicalizeUserInfo(const char* username_source, 336 const url_parse::Component& username, 337 const char* password_source, 338 const url_parse::Component& password, 339 CanonOutput* output, 340 url_parse::Component* out_username, 341 url_parse::Component* out_password) { 342 return DoUserInfo<char, unsigned char>( 343 username_source, username, password_source, password, 344 output, out_username, out_password); 345} 346 347bool CanonicalizeUserInfo(const char16* username_source, 348 const url_parse::Component& username, 349 const char16* password_source, 350 const url_parse::Component& password, 351 CanonOutput* output, 352 url_parse::Component* out_username, 353 url_parse::Component* out_password) { 354 return DoUserInfo<char16, char16>( 355 username_source, username, password_source, password, 356 output, out_username, out_password); 357} 358 359bool CanonicalizePort(const char* spec, 360 const url_parse::Component& port, 361 int default_port_for_scheme, 362 CanonOutput* output, 363 url_parse::Component* out_port) { 364 return DoPort<char, unsigned char>(spec, port, 365 default_port_for_scheme, 366 output, out_port); 367} 368 369bool CanonicalizePort(const char16* spec, 370 const url_parse::Component& port, 371 int default_port_for_scheme, 372 CanonOutput* output, 373 url_parse::Component* out_port) { 374 return DoPort<char16, char16>(spec, port, default_port_for_scheme, 375 output, out_port); 376} 377 378void CanonicalizeRef(const char* spec, 379 const url_parse::Component& ref, 380 CanonOutput* output, 381 url_parse::Component* out_ref) { 382 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref); 383} 384 385void CanonicalizeRef(const char16* spec, 386 const url_parse::Component& ref, 387 CanonOutput* output, 388 url_parse::Component* out_ref) { 389 DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref); 390} 391 392} // namespace url_canon 393