1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/uri.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/handles.h"
9#include "src/isolate-inl.h"
10#include "src/list.h"
11#include "src/string-search.h"
12
13namespace v8 {
14namespace internal {
15
16namespace {  // anonymous namespace for DecodeURI helper functions
17bool IsReservedPredicate(uc16 c) {
18  switch (c) {
19    case '#':
20    case '$':
21    case '&':
22    case '+':
23    case ',':
24    case '/':
25    case ':':
26    case ';':
27    case '=':
28    case '?':
29    case '@':
30      return true;
31    default:
32      return false;
33  }
34}
35
// Returns true only when |octets| holds exactly the three bytes
// 0xEF 0xBF 0xBD, i.e. the UTF-8 encoding of the replacement character
// U+FFFD from the Unicode Specials table. |octets| is not read unless
// |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xef && octets[1] == 0xbf &&
         octets[2] == 0xbd;
}
45
46bool DecodeOctets(const uint8_t* octets, int length, List<uc16>* buffer) {
47  size_t cursor = 0;
48  uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
49  if (value == unibrow::Utf8::kBadChar &&
50      !IsReplacementCharacter(octets, length)) {
51    return false;
52  }
53
54  if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
55    buffer->Add(value);
56  } else {
57    buffer->Add(unibrow::Utf16::LeadSurrogate(value));
58    buffer->Add(unibrow::Utf16::TrailSurrogate(value));
59  }
60  return true;
61}
62
63int TwoDigitHex(uc16 character1, uc16 character2) {
64  if (character1 > 'f') return -1;
65  int high = HexValue(character1);
66  if (high == -1) return -1;
67  if (character2 > 'f') return -1;
68  int low = HexValue(character2);
69  if (low == -1) return -1;
70  return (high << 4) + low;
71}
72
73template <typename T>
74void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
75                 bool is_uri, List<T>* buffer) {
76  if (is_uri && IsReservedPredicate(decoded)) {
77    buffer->Add('%');
78    uc16 first = uri_content->Get(index + 1);
79    uc16 second = uri_content->Get(index + 2);
80    DCHECK_GT(std::numeric_limits<T>::max(), first);
81    DCHECK_GT(std::numeric_limits<T>::max(), second);
82
83    buffer->Add(first);
84    buffer->Add(second);
85  } else {
86    buffer->Add(decoded);
87  }
88}
89
90bool IntoTwoByte(int index, bool is_uri, int uri_length,
91                 String::FlatContent* uri_content, List<uc16>* buffer) {
92  for (int k = index; k < uri_length; k++) {
93    uc16 code = uri_content->Get(k);
94    if (code == '%') {
95      int two_digits;
96      if (k + 2 >= uri_length ||
97          (two_digits = TwoDigitHex(uri_content->Get(k + 1),
98                                    uri_content->Get(k + 2))) < 0) {
99        return false;
100      }
101      k += 2;
102      uc16 decoded = static_cast<uc16>(two_digits);
103      if (decoded > unibrow::Utf8::kMaxOneByteChar) {
104        uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
105        octets[0] = decoded;
106
107        int number_of_continuation_bytes = 0;
108        while ((decoded << ++number_of_continuation_bytes) & 0x80) {
109          if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
110            return false;
111          }
112          if (uri_content->Get(++k) != '%' ||
113              (two_digits = TwoDigitHex(uri_content->Get(k + 1),
114                                        uri_content->Get(k + 2))) < 0) {
115            return false;
116          }
117          k += 2;
118          uc16 continuation_byte = static_cast<uc16>(two_digits);
119          octets[number_of_continuation_bytes] = continuation_byte;
120        }
121
122        if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
123          return false;
124        }
125      } else {
126        AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
127      }
128    } else {
129      buffer->Add(code);
130    }
131  }
132  return true;
133}
134
135bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
136                       List<uint8_t>* one_byte_buffer,
137                       List<uc16>* two_byte_buffer) {
138  DisallowHeapAllocation no_gc;
139  String::FlatContent uri_content = uri->GetFlatContent();
140
141  int uri_length = uri->length();
142  for (int k = 0; k < uri_length; k++) {
143    uc16 code = uri_content.Get(k);
144    if (code == '%') {
145      int two_digits;
146      if (k + 2 >= uri_length ||
147          (two_digits = TwoDigitHex(uri_content.Get(k + 1),
148                                    uri_content.Get(k + 2))) < 0) {
149        return false;
150      }
151
152      uc16 decoded = static_cast<uc16>(two_digits);
153      if (decoded > unibrow::Utf8::kMaxOneByteChar) {
154        return IntoTwoByte(k, is_uri, uri_length, &uri_content,
155                           two_byte_buffer);
156      }
157
158      AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
159      k += 2;
160    } else {
161      if (code > unibrow::Utf8::kMaxOneByteChar) {
162        return IntoTwoByte(k, is_uri, uri_length, &uri_content,
163                           two_byte_buffer);
164      }
165      one_byte_buffer->Add(code);
166    }
167  }
168  return true;
169}
170
171}  // anonymous namespace
172
173MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
174                                bool is_uri) {
175  uri = String::Flatten(uri);
176  List<uint8_t> one_byte_buffer;
177  List<uc16> two_byte_buffer;
178
179  if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
180    THROW_NEW_ERROR(isolate, NewURIError(), String);
181  }
182
183  if (two_byte_buffer.is_empty()) {
184    return isolate->factory()->NewStringFromOneByte(
185        one_byte_buffer.ToConstVector());
186  }
187
188  Handle<SeqTwoByteString> result;
189  ASSIGN_RETURN_ON_EXCEPTION(
190      isolate, result, isolate->factory()->NewRawTwoByteString(
191                           one_byte_buffer.length() + two_byte_buffer.length()),
192      String);
193
194  CopyChars(result->GetChars(), one_byte_buffer.ToConstVector().start(),
195            one_byte_buffer.length());
196  CopyChars(result->GetChars() + one_byte_buffer.length(),
197            two_byte_buffer.ToConstVector().start(), two_byte_buffer.length());
198
199  return result;
200}
201
202namespace {  // anonymous namespace for EncodeURI helper functions
203bool IsUnescapePredicateInUriComponent(uc16 c) {
204  if (IsAlphaNumeric(c)) {
205    return true;
206  }
207
208  switch (c) {
209    case '!':
210    case '\'':
211    case '(':
212    case ')':
213    case '*':
214    case '-':
215    case '.':
216    case '_':
217    case '~':
218      return true;
219    default:
220      return false;
221  }
222}
223
224bool IsUriSeparator(uc16 c) {
225  switch (c) {
226    case '#':
227    case ':':
228    case ';':
229    case '/':
230    case '?':
231    case '$':
232    case '&':
233    case '+':
234    case ',':
235    case '@':
236    case '=':
237      return true;
238    default:
239      return false;
240  }
241}
242
243void AddEncodedOctetToBuffer(uint8_t octet, List<uint8_t>* buffer) {
244  buffer->Add('%');
245  buffer->Add(HexCharOfValue(octet >> 4));
246  buffer->Add(HexCharOfValue(octet & 0x0F));
247}
248
249void EncodeSingle(uc16 c, List<uint8_t>* buffer) {
250  char s[4] = {};
251  int number_of_bytes;
252  number_of_bytes =
253      unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
254  for (int k = 0; k < number_of_bytes; k++) {
255    AddEncodedOctetToBuffer(s[k], buffer);
256  }
257}
258
259void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) {
260  char s[4] = {};
261  int number_of_bytes =
262      unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
263                            unibrow::Utf16::kNoPreviousCharacter, false);
264  for (int k = 0; k < number_of_bytes; k++) {
265    AddEncodedOctetToBuffer(s[k], buffer);
266  }
267}
268
269}  // anonymous namespace
270
271MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
272                                bool is_uri) {
273  uri = String::Flatten(uri);
274  int uri_length = uri->length();
275  List<uint8_t> buffer(uri_length);
276
277  {
278    DisallowHeapAllocation no_gc;
279    String::FlatContent uri_content = uri->GetFlatContent();
280
281    for (int k = 0; k < uri_length; k++) {
282      uc16 cc1 = uri_content.Get(k);
283      if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
284        k++;
285        if (k < uri_length) {
286          uc16 cc2 = uri->Get(k);
287          if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
288            EncodePair(cc1, cc2, &buffer);
289            continue;
290          }
291        }
292      } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
293        if (IsUnescapePredicateInUriComponent(cc1) ||
294            (is_uri && IsUriSeparator(cc1))) {
295          buffer.Add(cc1);
296        } else {
297          EncodeSingle(cc1, &buffer);
298        }
299        continue;
300      }
301
302      AllowHeapAllocation allocate_error_and_return;
303      THROW_NEW_ERROR(isolate, NewURIError(), String);
304    }
305  }
306
307  return isolate->factory()->NewStringFromOneByte(buffer.ToConstVector());
308}
309
310namespace {  // Anonymous namespace for Escape and Unescape
311
312template <typename Char>
313int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
314  uint16_t character = vector[i];
315  int32_t hi = 0;
316  int32_t lo = 0;
317  if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
318      (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
319      (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
320    *step = 6;
321    return (hi << 8) + lo;
322  } else if (character == '%' && i <= length - 3 &&
323             (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
324    *step = 3;
325    return lo;
326  } else {
327    *step = 1;
328    return character;
329  }
330}
331
332template <typename Char>
333MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
334                                 int start_index) {
335  bool one_byte = true;
336  int length = string->length();
337
338  int unescaped_length = 0;
339  {
340    DisallowHeapAllocation no_allocation;
341    Vector<const Char> vector = string->GetCharVector<Char>();
342    for (int i = start_index; i < length; unescaped_length++) {
343      int step;
344      if (UnescapeChar(vector, i, length, &step) >
345          String::kMaxOneByteCharCode) {
346        one_byte = false;
347      }
348      i += step;
349    }
350  }
351
352  DCHECK(start_index < length);
353  Handle<String> first_part =
354      isolate->factory()->NewProperSubString(string, 0, start_index);
355
356  int dest_position = 0;
357  Handle<String> second_part;
358  DCHECK(unescaped_length <= String::kMaxLength);
359  if (one_byte) {
360    Handle<SeqOneByteString> dest = isolate->factory()
361                                        ->NewRawOneByteString(unescaped_length)
362                                        .ToHandleChecked();
363    DisallowHeapAllocation no_allocation;
364    Vector<const Char> vector = string->GetCharVector<Char>();
365    for (int i = start_index; i < length; dest_position++) {
366      int step;
367      dest->SeqOneByteStringSet(dest_position,
368                                UnescapeChar(vector, i, length, &step));
369      i += step;
370    }
371    second_part = dest;
372  } else {
373    Handle<SeqTwoByteString> dest = isolate->factory()
374                                        ->NewRawTwoByteString(unescaped_length)
375                                        .ToHandleChecked();
376    DisallowHeapAllocation no_allocation;
377    Vector<const Char> vector = string->GetCharVector<Char>();
378    for (int i = start_index; i < length; dest_position++) {
379      int step;
380      dest->SeqTwoByteStringSet(dest_position,
381                                UnescapeChar(vector, i, length, &step));
382      i += step;
383    }
384    second_part = dest;
385  }
386  return isolate->factory()->NewConsString(first_part, second_part);
387}
388
389bool IsNotEscaped(uint16_t c) {
390  if (IsAlphaNumeric(c)) {
391    return true;
392  }
393  //  @*_+-./
394  switch (c) {
395    case '@':
396    case '*':
397    case '_':
398    case '+':
399    case '-':
400    case '.':
401    case '/':
402      return true;
403    default:
404      return false;
405  }
406}
407
408template <typename Char>
409static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
410                                           Handle<String> source) {
411  int index;
412  {
413    DisallowHeapAllocation no_allocation;
414    StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
415    index = search.Search(source->GetCharVector<Char>(), 0);
416    if (index < 0) return source;
417  }
418  return UnescapeSlow<Char>(isolate, source, index);
419}
420
421template <typename Char>
422static MaybeHandle<String> EscapePrivate(Isolate* isolate,
423                                         Handle<String> string) {
424  DCHECK(string->IsFlat());
425  int escaped_length = 0;
426  int length = string->length();
427
428  {
429    DisallowHeapAllocation no_allocation;
430    Vector<const Char> vector = string->GetCharVector<Char>();
431    for (int i = 0; i < length; i++) {
432      uint16_t c = vector[i];
433      if (c >= 256) {
434        escaped_length += 6;
435      } else if (IsNotEscaped(c)) {
436        escaped_length++;
437      } else {
438        escaped_length += 3;
439      }
440
441      // We don't allow strings that are longer than a maximal length.
442      DCHECK(String::kMaxLength < 0x7fffffff - 6);     // Cannot overflow.
443      if (escaped_length > String::kMaxLength) break;  // Provoke exception.
444    }
445  }
446
447  // No length change implies no change.  Return original string if no change.
448  if (escaped_length == length) return string;
449
450  Handle<SeqOneByteString> dest;
451  ASSIGN_RETURN_ON_EXCEPTION(
452      isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
453      String);
454  int dest_position = 0;
455
456  {
457    DisallowHeapAllocation no_allocation;
458    Vector<const Char> vector = string->GetCharVector<Char>();
459    for (int i = 0; i < length; i++) {
460      uint16_t c = vector[i];
461      if (c >= 256) {
462        dest->SeqOneByteStringSet(dest_position, '%');
463        dest->SeqOneByteStringSet(dest_position + 1, 'u');
464        dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
465        dest->SeqOneByteStringSet(dest_position + 3,
466                                  HexCharOfValue((c >> 8) & 0xf));
467        dest->SeqOneByteStringSet(dest_position + 4,
468                                  HexCharOfValue((c >> 4) & 0xf));
469        dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xf));
470        dest_position += 6;
471      } else if (IsNotEscaped(c)) {
472        dest->SeqOneByteStringSet(dest_position, c);
473        dest_position++;
474      } else {
475        dest->SeqOneByteStringSet(dest_position, '%');
476        dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
477        dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xf));
478        dest_position += 3;
479      }
480    }
481  }
482
483  return dest;
484}
485
486}  // Anonymous namespace
487
488MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
489  Handle<String> result;
490  string = String::Flatten(string);
491  return string->IsOneByteRepresentationUnderneath()
492             ? EscapePrivate<uint8_t>(isolate, string)
493             : EscapePrivate<uc16>(isolate, string);
494}
495
496MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
497  Handle<String> result;
498  string = String::Flatten(string);
499  return string->IsOneByteRepresentationUnderneath()
500             ? UnescapePrivate<uint8_t>(isolate, string)
501             : UnescapePrivate<uc16>(isolate, string);
502}
503
504}  // namespace internal
505}  // namespace v8
506