1// Copyright 2016 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/uri.h" 6 7#include "src/char-predicates-inl.h" 8#include "src/handles.h" 9#include "src/isolate-inl.h" 10#include "src/list.h" 11#include "src/string-search.h" 12 13namespace v8 { 14namespace internal { 15 16namespace { // anonymous namespace for DecodeURI helper functions 17bool IsReservedPredicate(uc16 c) { 18 switch (c) { 19 case '#': 20 case '$': 21 case '&': 22 case '+': 23 case ',': 24 case '/': 25 case ':': 26 case ';': 27 case '=': 28 case '?': 29 case '@': 30 return true; 31 default: 32 return false; 33 } 34} 35 36bool IsReplacementCharacter(const uint8_t* octets, int length) { 37 // The replacement character is at codepoint U+FFFD in the Unicode Specials 38 // table. Its UTF-8 encoding is 0xEF 0xBF 0xBD. 39 if (length != 3 || octets[0] != 0xef || octets[1] != 0xbf || 40 octets[2] != 0xbd) { 41 return false; 42 } 43 return true; 44} 45 46bool DecodeOctets(const uint8_t* octets, int length, List<uc16>* buffer) { 47 size_t cursor = 0; 48 uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor); 49 if (value == unibrow::Utf8::kBadChar && 50 !IsReplacementCharacter(octets, length)) { 51 return false; 52 } 53 54 if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 55 buffer->Add(value); 56 } else { 57 buffer->Add(unibrow::Utf16::LeadSurrogate(value)); 58 buffer->Add(unibrow::Utf16::TrailSurrogate(value)); 59 } 60 return true; 61} 62 63int TwoDigitHex(uc16 character1, uc16 character2) { 64 if (character1 > 'f') return -1; 65 int high = HexValue(character1); 66 if (high == -1) return -1; 67 if (character2 > 'f') return -1; 68 int low = HexValue(character2); 69 if (low == -1) return -1; 70 return (high << 4) + low; 71} 72 73template <typename T> 74void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index, 75 bool is_uri, List<T>* buffer) { 76 if (is_uri && IsReservedPredicate(decoded)) { 77 buffer->Add('%'); 78 uc16 first = uri_content->Get(index + 1); 79 uc16 second = uri_content->Get(index + 2); 80 DCHECK_GT(std::numeric_limits<T>::max(), first); 81 DCHECK_GT(std::numeric_limits<T>::max(), second); 82 83 buffer->Add(first); 84 buffer->Add(second); 85 } else { 86 buffer->Add(decoded); 87 } 88} 89 90bool IntoTwoByte(int index, bool is_uri, int uri_length, 91 String::FlatContent* uri_content, List<uc16>* buffer) { 92 for (int k = index; k < uri_length; k++) { 93 uc16 code = uri_content->Get(k); 94 if (code == '%') { 95 int two_digits; 96 if (k + 2 >= uri_length || 97 (two_digits = TwoDigitHex(uri_content->Get(k + 1), 98 uri_content->Get(k + 2))) < 0) { 99 return false; 100 } 101 k += 2; 102 uc16 decoded = static_cast<uc16>(two_digits); 103 if (decoded > unibrow::Utf8::kMaxOneByteChar) { 104 uint8_t octets[unibrow::Utf8::kMaxEncodedSize]; 105 octets[0] = decoded; 106 107 int number_of_continuation_bytes = 0; 108 while ((decoded << ++number_of_continuation_bytes) & 0x80) { 109 if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) { 110 return false; 111 } 112 if (uri_content->Get(++k) != '%' || 113 (two_digits = TwoDigitHex(uri_content->Get(k + 1), 114 uri_content->Get(k + 2))) < 0) { 115 return false; 116 } 117 k += 2; 118 uc16 continuation_byte = static_cast<uc16>(two_digits); 119 octets[number_of_continuation_bytes] = continuation_byte; 120 } 121 122 if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) { 123 return false; 124 } 125 } else { 126 AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer); 127 } 128 } else { 129 buffer->Add(code); 130 } 131 } 132 return true; 133} 134 135bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri, 136 List<uint8_t>* one_byte_buffer, 137 List<uc16>* two_byte_buffer) { 138 DisallowHeapAllocation no_gc; 139 String::FlatContent uri_content = uri->GetFlatContent(); 140 141 int uri_length = uri->length(); 142 for (int k = 0; k < uri_length; k++) { 143 uc16 code = uri_content.Get(k); 144 if (code == '%') { 145 int two_digits; 146 if (k + 2 >= uri_length || 147 (two_digits = TwoDigitHex(uri_content.Get(k + 1), 148 uri_content.Get(k + 2))) < 0) { 149 return false; 150 } 151 152 uc16 decoded = static_cast<uc16>(two_digits); 153 if (decoded > unibrow::Utf8::kMaxOneByteChar) { 154 return IntoTwoByte(k, is_uri, uri_length, &uri_content, 155 two_byte_buffer); 156 } 157 158 AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer); 159 k += 2; 160 } else { 161 if (code > unibrow::Utf8::kMaxOneByteChar) { 162 return IntoTwoByte(k, is_uri, uri_length, &uri_content, 163 two_byte_buffer); 164 } 165 one_byte_buffer->Add(code); 166 } 167 } 168 return true; 169} 170 171} // anonymous namespace 172 173MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri, 174 bool is_uri) { 175 uri = String::Flatten(uri); 176 List<uint8_t> one_byte_buffer; 177 List<uc16> two_byte_buffer; 178 179 if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) { 180 THROW_NEW_ERROR(isolate, NewURIError(), String); 181 } 182 183 if (two_byte_buffer.is_empty()) { 184 return isolate->factory()->NewStringFromOneByte( 185 one_byte_buffer.ToConstVector()); 186 } 187 188 Handle<SeqTwoByteString> result; 189 ASSIGN_RETURN_ON_EXCEPTION( 190 isolate, result, isolate->factory()->NewRawTwoByteString( 191 one_byte_buffer.length() + two_byte_buffer.length()), 192 String); 193 194 CopyChars(result->GetChars(), one_byte_buffer.ToConstVector().start(), 195 one_byte_buffer.length()); 196 CopyChars(result->GetChars() + one_byte_buffer.length(), 197 two_byte_buffer.ToConstVector().start(), two_byte_buffer.length()); 198 199 return result; 200} 201 202namespace { // anonymous namespace for EncodeURI helper functions 203bool IsUnescapePredicateInUriComponent(uc16 c) { 204 if (IsAlphaNumeric(c)) { 205 return true; 206 } 207 208 switch (c) { 209 case '!': 210 case '\'': 211 case '(': 212 case ')': 213 case '*': 214 case '-': 215 case '.': 216 case '_': 217 case '~': 218 return true; 219 default: 220 return false; 221 } 222} 223 224bool IsUriSeparator(uc16 c) { 225 switch (c) { 226 case '#': 227 case ':': 228 case ';': 229 case '/': 230 case '?': 231 case '$': 232 case '&': 233 case '+': 234 case ',': 235 case '@': 236 case '=': 237 return true; 238 default: 239 return false; 240 } 241} 242 243void AddEncodedOctetToBuffer(uint8_t octet, List<uint8_t>* buffer) { 244 buffer->Add('%'); 245 buffer->Add(HexCharOfValue(octet >> 4)); 246 buffer->Add(HexCharOfValue(octet & 0x0F)); 247} 248 249void EncodeSingle(uc16 c, List<uint8_t>* buffer) { 250 char s[4] = {}; 251 int number_of_bytes; 252 number_of_bytes = 253 unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false); 254 for (int k = 0; k < number_of_bytes; k++) { 255 AddEncodedOctetToBuffer(s[k], buffer); 256 } 257} 258 259void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) { 260 char s[4] = {}; 261 int number_of_bytes = 262 unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2), 263 unibrow::Utf16::kNoPreviousCharacter, false); 264 for (int k = 0; k < number_of_bytes; k++) { 265 AddEncodedOctetToBuffer(s[k], buffer); 266 } 267} 268 269} // anonymous namespace 270 271MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri, 272 bool is_uri) { 273 uri = String::Flatten(uri); 274 int uri_length = uri->length(); 275 List<uint8_t> buffer(uri_length); 276 277 { 278 DisallowHeapAllocation no_gc; 279 String::FlatContent uri_content = uri->GetFlatContent(); 280 281 for (int k = 0; k < uri_length; k++) { 282 uc16 cc1 = uri_content.Get(k); 283 if (unibrow::Utf16::IsLeadSurrogate(cc1)) { 284 k++; 285 if (k < uri_length) { 286 uc16 cc2 = uri->Get(k); 287 if (unibrow::Utf16::IsTrailSurrogate(cc2)) { 288 EncodePair(cc1, cc2, &buffer); 289 continue; 290 } 291 } 292 } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) { 293 if (IsUnescapePredicateInUriComponent(cc1) || 294 (is_uri && IsUriSeparator(cc1))) { 295 buffer.Add(cc1); 296 } else { 297 EncodeSingle(cc1, &buffer); 298 } 299 continue; 300 } 301 302 AllowHeapAllocation allocate_error_and_return; 303 THROW_NEW_ERROR(isolate, NewURIError(), String); 304 } 305 } 306 307 return isolate->factory()->NewStringFromOneByte(buffer.ToConstVector()); 308} 309 310namespace { // Anonymous namespace for Escape and Unescape 311 312template <typename Char> 313int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) { 314 uint16_t character = vector[i]; 315 int32_t hi = 0; 316 int32_t lo = 0; 317 if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' && 318 (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 && 319 (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) { 320 *step = 6; 321 return (hi << 8) + lo; 322 } else if (character == '%' && i <= length - 3 && 323 (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) { 324 *step = 3; 325 return lo; 326 } else { 327 *step = 1; 328 return character; 329 } 330} 331 332template <typename Char> 333MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string, 334 int start_index) { 335 bool one_byte = true; 336 int length = string->length(); 337 338 int unescaped_length = 0; 339 { 340 DisallowHeapAllocation no_allocation; 341 Vector<const Char> vector = string->GetCharVector<Char>(); 342 for (int i = start_index; i < length; unescaped_length++) { 343 int step; 344 if (UnescapeChar(vector, i, length, &step) > 345 String::kMaxOneByteCharCode) { 346 one_byte = false; 347 } 348 i += step; 349 } 350 } 351 352 DCHECK(start_index < length); 353 Handle<String> first_part = 354 isolate->factory()->NewProperSubString(string, 0, start_index); 355 356 int dest_position = 0; 357 Handle<String> second_part; 358 DCHECK(unescaped_length <= String::kMaxLength); 359 if (one_byte) { 360 Handle<SeqOneByteString> dest = isolate->factory() 361 ->NewRawOneByteString(unescaped_length) 362 .ToHandleChecked(); 363 DisallowHeapAllocation no_allocation; 364 Vector<const Char> vector = string->GetCharVector<Char>(); 365 for (int i = start_index; i < length; dest_position++) { 366 int step; 367 dest->SeqOneByteStringSet(dest_position, 368 UnescapeChar(vector, i, length, &step)); 369 i += step; 370 } 371 second_part = dest; 372 } else { 373 Handle<SeqTwoByteString> dest = isolate->factory() 374 ->NewRawTwoByteString(unescaped_length) 375 .ToHandleChecked(); 376 DisallowHeapAllocation no_allocation; 377 Vector<const Char> vector = string->GetCharVector<Char>(); 378 for (int i = start_index; i < length; dest_position++) { 379 int step; 380 dest->SeqTwoByteStringSet(dest_position, 381 UnescapeChar(vector, i, length, &step)); 382 i += step; 383 } 384 second_part = dest; 385 } 386 return isolate->factory()->NewConsString(first_part, second_part); 387} 388 389bool IsNotEscaped(uint16_t c) { 390 if (IsAlphaNumeric(c)) { 391 return true; 392 } 393 // @*_+-./ 394 switch (c) { 395 case '@': 396 case '*': 397 case '_': 398 case '+': 399 case '-': 400 case '.': 401 case '/': 402 return true; 403 default: 404 return false; 405 } 406} 407 408template <typename Char> 409static MaybeHandle<String> UnescapePrivate(Isolate* isolate, 410 Handle<String> source) { 411 int index; 412 { 413 DisallowHeapAllocation no_allocation; 414 StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%")); 415 index = search.Search(source->GetCharVector<Char>(), 0); 416 if (index < 0) return source; 417 } 418 return UnescapeSlow<Char>(isolate, source, index); 419} 420 421template <typename Char> 422static MaybeHandle<String> EscapePrivate(Isolate* isolate, 423 Handle<String> string) { 424 DCHECK(string->IsFlat()); 425 int escaped_length = 0; 426 int length = string->length(); 427 428 { 429 DisallowHeapAllocation no_allocation; 430 Vector<const Char> vector = string->GetCharVector<Char>(); 431 for (int i = 0; i < length; i++) { 432 uint16_t c = vector[i]; 433 if (c >= 256) { 434 escaped_length += 6; 435 } else if (IsNotEscaped(c)) { 436 escaped_length++; 437 } else { 438 escaped_length += 3; 439 } 440 441 // We don't allow strings that are longer than a maximal length. 442 DCHECK(String::kMaxLength < 0x7fffffff - 6); // Cannot overflow. 443 if (escaped_length > String::kMaxLength) break; // Provoke exception. 444 } 445 } 446 447 // No length change implies no change. Return original string if no change. 448 if (escaped_length == length) return string; 449 450 Handle<SeqOneByteString> dest; 451 ASSIGN_RETURN_ON_EXCEPTION( 452 isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length), 453 String); 454 int dest_position = 0; 455 456 { 457 DisallowHeapAllocation no_allocation; 458 Vector<const Char> vector = string->GetCharVector<Char>(); 459 for (int i = 0; i < length; i++) { 460 uint16_t c = vector[i]; 461 if (c >= 256) { 462 dest->SeqOneByteStringSet(dest_position, '%'); 463 dest->SeqOneByteStringSet(dest_position + 1, 'u'); 464 dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12)); 465 dest->SeqOneByteStringSet(dest_position + 3, 466 HexCharOfValue((c >> 8) & 0xf)); 467 dest->SeqOneByteStringSet(dest_position + 4, 468 HexCharOfValue((c >> 4) & 0xf)); 469 dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xf)); 470 dest_position += 6; 471 } else if (IsNotEscaped(c)) { 472 dest->SeqOneByteStringSet(dest_position, c); 473 dest_position++; 474 } else { 475 dest->SeqOneByteStringSet(dest_position, '%'); 476 dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4)); 477 dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xf)); 478 dest_position += 3; 479 } 480 } 481 } 482 483 return dest; 484} 485 486} // Anonymous namespace 487 488MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) { 489 Handle<String> result; 490 string = String::Flatten(string); 491 return string->IsOneByteRepresentationUnderneath() 492 ? EscapePrivate<uint8_t>(isolate, string) 493 : EscapePrivate<uc16>(isolate, string); 494} 495 496MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) { 497 Handle<String> result; 498 string = String::Flatten(string); 499 return string->IsOneByteRepresentationUnderneath() 500 ? UnescapePrivate<uint8_t>(isolate, string) 501 : UnescapePrivate<uc16>(isolate, string); 502} 503 504} // namespace internal 505} // namespace v8 506