1/*
2 *  Copyright 2004 The WebRTC Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "webrtc/base/stringencode.h"
12
13#include <stdio.h>
14#include <stdlib.h>
15
16#include "webrtc/base/basictypes.h"
17#include "webrtc/base/checks.h"
18#include "webrtc/base/stringutils.h"
19
20namespace rtc {
21
22/////////////////////////////////////////////////////////////////////////////
23// String Encoding Utilities
24/////////////////////////////////////////////////////////////////////////////
25
26size_t escape(char * buffer, size_t buflen,
27              const char * source, size_t srclen,
28              const char * illegal, char escape) {
29  DCHECK(buffer);  // TODO: estimate output size
30  if (buflen <= 0)
31    return 0;
32
33  size_t srcpos = 0, bufpos = 0;
34  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
35    char ch = source[srcpos++];
36    if ((ch == escape) || ::strchr(illegal, ch)) {
37      if (bufpos + 2 >= buflen)
38        break;
39      buffer[bufpos++] = escape;
40    }
41    buffer[bufpos++] = ch;
42  }
43
44  buffer[bufpos] = '\0';
45  return bufpos;
46}
47
48size_t unescape(char * buffer, size_t buflen,
49                const char * source, size_t srclen,
50                char escape) {
51  DCHECK(buffer);  // TODO: estimate output size
52  if (buflen <= 0)
53    return 0;
54
55  size_t srcpos = 0, bufpos = 0;
56  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
57    char ch = source[srcpos++];
58    if ((ch == escape) && (srcpos < srclen)) {
59      ch = source[srcpos++];
60    }
61    buffer[bufpos++] = ch;
62  }
63  buffer[bufpos] = '\0';
64  return bufpos;
65}
66
67size_t encode(char * buffer, size_t buflen,
68              const char * source, size_t srclen,
69              const char * illegal, char escape) {
70  DCHECK(buffer);  // TODO: estimate output size
71  if (buflen <= 0)
72    return 0;
73
74  size_t srcpos = 0, bufpos = 0;
75  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
76    char ch = source[srcpos++];
77    if ((ch != escape) && !::strchr(illegal, ch)) {
78      buffer[bufpos++] = ch;
79    } else if (bufpos + 3 >= buflen) {
80      break;
81    } else {
82      buffer[bufpos+0] = escape;
83      buffer[bufpos+1] = hex_encode((static_cast<unsigned char>(ch) >> 4) & 0xF);
84      buffer[bufpos+2] = hex_encode((static_cast<unsigned char>(ch)     ) & 0xF);
85      bufpos += 3;
86    }
87  }
88  buffer[bufpos] = '\0';
89  return bufpos;
90}
91
92size_t decode(char * buffer, size_t buflen,
93              const char * source, size_t srclen,
94              char escape) {
95  if (buflen <= 0)
96    return 0;
97
98  unsigned char h1, h2;
99  size_t srcpos = 0, bufpos = 0;
100  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
101    char ch = source[srcpos++];
102    if ((ch == escape)
103        && (srcpos + 1 < srclen)
104        && hex_decode(source[srcpos], &h1)
105        && hex_decode(source[srcpos+1], &h2)) {
106      buffer[bufpos++] = (h1 << 4) | h2;
107      srcpos += 2;
108    } else {
109      buffer[bufpos++] = ch;
110    }
111  }
112  buffer[bufpos] = '\0';
113  return bufpos;
114}
115
116const char* unsafe_filename_characters() {
117  // It might be better to have a single specification which is the union of
118  // all operating systems, unless one system is overly restrictive.
119#if defined(WEBRTC_WIN)
120  return "\\/:*?\"<>|";
121#else  // !WEBRTC_WIN
122  // TODO
123  DCHECK(false);
124  return "";
125#endif  // !WEBRTC_WIN
126}
127
// Bit flags stored per character in ASCII_CLASS. A set bit means the
// character must be escaped by the corresponding encoder.
const unsigned char URL_UNSAFE  = 0x1; // 0-32 "#$%&+,/:;<=>?@[\]^`{|} 127
const unsigned char XML_UNSAFE  = 0x2; // "&'<>
// NOTE: HTML_UNSAFE shares its bit with XML_UNSAFE; both cover the same set.
const unsigned char HTML_UNSAFE = 0x2; // "&'<>

// Character layout of the last three table rows (codes 32-127). The first
// table row covers the control characters 0-31.
//  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
//@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
//` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~

// Classification of every 7-bit ASCII code as a combination of the *_UNSAFE
// flags above (0 = safe everywhere, 1 = URL only, 2 = XML/HTML only,
// 3 = both). Callers must range-check the index (< 128) before the lookup.
const unsigned char ASCII_CLASS[128] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,0,3,1,1,1,3,2,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,3,1,3,1,
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,
  1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,
};
142
143size_t url_encode(char * buffer, size_t buflen,
144                  const char * source, size_t srclen) {
145  if (NULL == buffer)
146    return srclen * 3 + 1;
147  if (buflen <= 0)
148    return 0;
149
150  size_t srcpos = 0, bufpos = 0;
151  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
152    unsigned char ch = source[srcpos++];
153    if ((ch < 128) && (ASCII_CLASS[ch] & URL_UNSAFE)) {
154      if (bufpos + 3 >= buflen) {
155        break;
156      }
157      buffer[bufpos+0] = '%';
158      buffer[bufpos+1] = hex_encode((ch >> 4) & 0xF);
159      buffer[bufpos+2] = hex_encode((ch     ) & 0xF);
160      bufpos += 3;
161    } else {
162      buffer[bufpos++] = ch;
163    }
164  }
165  buffer[bufpos] = '\0';
166  return bufpos;
167}
168
169size_t url_decode(char * buffer, size_t buflen,
170                  const char * source, size_t srclen) {
171  if (NULL == buffer)
172    return srclen + 1;
173  if (buflen <= 0)
174    return 0;
175
176  unsigned char h1, h2;
177  size_t srcpos = 0, bufpos = 0;
178  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
179    unsigned char ch = source[srcpos++];
180    if (ch == '+') {
181      buffer[bufpos++] = ' ';
182    } else if ((ch == '%')
183               && (srcpos + 1 < srclen)
184               && hex_decode(source[srcpos], &h1)
185               && hex_decode(source[srcpos+1], &h2))
186    {
187      buffer[bufpos++] = (h1 << 4) | h2;
188      srcpos += 2;
189    } else {
190      buffer[bufpos++] = ch;
191    }
192  }
193  buffer[bufpos] = '\0';
194  return bufpos;
195}
196
// Decodes the single UTF-8 sequence that starts at |source| and stores the
// resulting code point in |*value|.
//
// |srclen| is the number of readable bytes at |source|; no byte at or beyond
// that limit is accessed. Sequences of one to four bytes are recognized.
//
// Returns the number of bytes consumed (1-4), or 0 if |srclen| is 0, the
// sequence is truncated, a trailer byte is not of the form 10xxxxxx, or the
// lead byte is not a valid 1-4 byte lead. |*value| is left untouched when 0 is
// returned. Overlong encodings and surrogate code points are not rejected.
size_t utf8_decode(const char* source, size_t srclen, unsigned long* value) {
  if (srclen == 0) {
    // Nothing to decode; in particular s[0] must not be read.
    return 0;
  }
  const unsigned char* s = reinterpret_cast<const unsigned char*>(source);
  if ((s[0] & 0x80) == 0x00) {                    // Check s[0] == 0xxxxxxx
    *value = s[0];
    return 1;
  }
  if ((srclen < 2) || ((s[1] & 0xC0) != 0x80)) {  // Check s[1] != 10xxxxxx
    return 0;
  }
  // Accumulate the trailer byte payloads in |trailers|, and combine them with
  // the relevant bits from s[0] once the sequence length has been determined.
  unsigned long trailers = (s[1] & 0x3F);
  if ((s[0] & 0xE0) == 0xC0) {                    // Check s[0] == 110xxxxx
    *value = ((s[0] & 0x1F) << 6) | trailers;
    return 2;
  }
  if ((srclen < 3) || ((s[2] & 0xC0) != 0x80)) {  // Check s[2] != 10xxxxxx
    return 0;
  }
  trailers = (trailers << 6) | (s[2] & 0x3F);
  if ((s[0] & 0xF0) == 0xE0) {                    // Check s[0] == 1110xxxx
    *value = ((s[0] & 0x0F) << 12) | trailers;
    return 3;
  }
  if ((srclen < 4) || ((s[3] & 0xC0) != 0x80)) {  // Check s[3] != 10xxxxxx
    return 0;
  }
  trailers = (trailers << 6) | (s[3] & 0x3F);
  if ((s[0] & 0xF8) == 0xF0) {                    // Check s[0] == 11110xxx
    *value = ((s[0] & 0x07) << 18) | trailers;
    return 4;
  }
  // Lead byte is a stray continuation byte or announces more than 4 bytes.
  return 0;
}
231
// Writes the UTF-8 encoding of the code point |value| to |buffer|, which has
// room for |buflen| bytes. No terminator is written.
//
// Returns the number of bytes written (1-4), or 0 if |value| is larger than
// 0x1FFFFF or the encoding does not fit in |buflen| bytes. Nothing is written
// when 0 is returned.
size_t utf8_encode(char* buffer, size_t buflen, unsigned long value) {
  // Determine how many bytes the code point needs.
  size_t len;
  if (value <= 0x7F) {
    len = 1;
  } else if (value <= 0x7FF) {
    len = 2;
  } else if (value <= 0xFFFF) {
    len = 3;
  } else if (value <= 0x1FFFFF) {
    len = 4;
  } else {
    return 0;
  }
  if (buflen < len) {
    return 0;
  }

  if (len == 1) {
    buffer[0] = static_cast<unsigned char>(value);
    return 1;
  }

  // Lead-byte marker bits, indexed by sequence length.
  static const unsigned char kLeadMarker[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

  // Fill the 10xxxxxx trailer bytes from last to first, six bits at a time;
  // whatever remains afterwards belongs in the lead byte.
  unsigned long rest = value;
  for (size_t i = len - 1; i > 0; --i) {
    buffer[i] = 0x80 | static_cast<unsigned char>(rest & 0x3F);
    rest >>= 6;
  }
  buffer[0] = kLeadMarker[len] | static_cast<unsigned char>(rest);
  return len;
}
257
258size_t html_encode(char * buffer, size_t buflen,
259                   const char * source, size_t srclen) {
260  DCHECK(buffer);  // TODO: estimate output size
261  if (buflen <= 0)
262    return 0;
263
264  size_t srcpos = 0, bufpos = 0;
265  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
266    unsigned char ch = source[srcpos];
267    if (ch < 128) {
268      srcpos += 1;
269      if (ASCII_CLASS[ch] & HTML_UNSAFE) {
270        const char * escseq = 0;
271        size_t esclen = 0;
272        switch (ch) {
273          case '<':  escseq = "&lt;";   esclen = 4; break;
274          case '>':  escseq = "&gt;";   esclen = 4; break;
275          case '\'': escseq = "&#39;";  esclen = 5; break;
276          case '\"': escseq = "&quot;"; esclen = 6; break;
277          case '&':  escseq = "&amp;";  esclen = 5; break;
278          default: DCHECK(false);
279        }
280        if (bufpos + esclen >= buflen) {
281          break;
282        }
283        memcpy(buffer + bufpos, escseq, esclen);
284        bufpos += esclen;
285      } else {
286        buffer[bufpos++] = ch;
287      }
288    } else {
289      // Largest value is 0x1FFFFF => &#2097151;  (10 characters)
290      const size_t kEscseqSize = 11;
291      char escseq[kEscseqSize];
292      unsigned long val;
293      if (size_t vallen = utf8_decode(&source[srcpos], srclen - srcpos, &val)) {
294        srcpos += vallen;
295      } else {
296        // Not a valid utf8 sequence, just use the raw character.
297        val = static_cast<unsigned char>(source[srcpos++]);
298      }
299      size_t esclen = sprintfn(escseq, kEscseqSize, "&#%lu;", val);
300      if (bufpos + esclen >= buflen) {
301        break;
302      }
303      memcpy(buffer + bufpos, escseq, esclen);
304      bufpos += esclen;
305    }
306  }
307  buffer[bufpos] = '\0';
308  return bufpos;
309}
310
// Decodes HTML entities in |source| into |buffer|. This is a plain forward to
// xml_decode(): the arguments are passed through unchanged and its return
// value is returned as-is.
size_t html_decode(char * buffer, size_t buflen,
                   const char * source, size_t srclen) {
  DCHECK(buffer);  // TODO: estimate output size
  return xml_decode(buffer, buflen, source, srclen);
}
316
317size_t xml_encode(char * buffer, size_t buflen,
318                  const char * source, size_t srclen) {
319  DCHECK(buffer);  // TODO: estimate output size
320  if (buflen <= 0)
321    return 0;
322
323  size_t srcpos = 0, bufpos = 0;
324  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
325    unsigned char ch = source[srcpos++];
326    if ((ch < 128) && (ASCII_CLASS[ch] & XML_UNSAFE)) {
327      const char * escseq = 0;
328      size_t esclen = 0;
329      switch (ch) {
330        case '<':  escseq = "&lt;";   esclen = 4; break;
331        case '>':  escseq = "&gt;";   esclen = 4; break;
332        case '\'': escseq = "&apos;"; esclen = 6; break;
333        case '\"': escseq = "&quot;"; esclen = 6; break;
334        case '&':  escseq = "&amp;";  esclen = 5; break;
335        default: DCHECK(false);
336      }
337      if (bufpos + esclen >= buflen) {
338        break;
339      }
340      memcpy(buffer + bufpos, escseq, esclen);
341      bufpos += esclen;
342    } else {
343      buffer[bufpos++] = ch;
344    }
345  }
346  buffer[bufpos] = '\0';
347  return bufpos;
348}
349
350size_t xml_decode(char * buffer, size_t buflen,
351                  const char * source, size_t srclen) {
352  DCHECK(buffer);  // TODO: estimate output size
353  if (buflen <= 0)
354    return 0;
355
356  size_t srcpos = 0, bufpos = 0;
357  while ((srcpos < srclen) && (bufpos + 1 < buflen)) {
358    unsigned char ch = source[srcpos++];
359    if (ch != '&') {
360      buffer[bufpos++] = ch;
361    } else if ((srcpos + 2 < srclen)
362               && (memcmp(source + srcpos, "lt;", 3) == 0)) {
363      buffer[bufpos++] = '<';
364      srcpos += 3;
365    } else if ((srcpos + 2 < srclen)
366               && (memcmp(source + srcpos, "gt;", 3) == 0)) {
367      buffer[bufpos++] = '>';
368      srcpos += 3;
369    } else if ((srcpos + 4 < srclen)
370               && (memcmp(source + srcpos, "apos;", 5) == 0)) {
371      buffer[bufpos++] = '\'';
372      srcpos += 5;
373    } else if ((srcpos + 4 < srclen)
374               && (memcmp(source + srcpos, "quot;", 5) == 0)) {
375      buffer[bufpos++] = '\"';
376      srcpos += 5;
377    } else if ((srcpos + 3 < srclen)
378               && (memcmp(source + srcpos, "amp;", 4) == 0)) {
379      buffer[bufpos++] = '&';
380      srcpos += 4;
381    } else if ((srcpos < srclen) && (source[srcpos] == '#')) {
382      int int_base = 10;
383      if ((srcpos + 1 < srclen) && (source[srcpos+1] == 'x')) {
384        int_base = 16;
385        srcpos += 1;
386      }
387      char * ptr;
388      // TODO: Fix hack (ptr may go past end of data)
389      unsigned long val = strtoul(source + srcpos + 1, &ptr, int_base);
390      if ((static_cast<size_t>(ptr - source) < srclen) && (*ptr == ';')) {
391        srcpos = ptr - source + 1;
392      } else {
393        // Not a valid escape sequence.
394        break;
395      }
396      if (size_t esclen = utf8_encode(buffer + bufpos, buflen - bufpos, val)) {
397        bufpos += esclen;
398      } else {
399        // Not enough room to encode the character, or illegal character
400        break;
401      }
402    } else {
403      // Unrecognized escape sequence.
404      break;
405    }
406  }
407  buffer[bufpos] = '\0';
408  return bufpos;
409}
410
411static const char HEX[] = "0123456789abcdef";
412
413char hex_encode(unsigned char val) {
414  DCHECK_LT(val, 16);
415  return (val < 16) ? HEX[val] : '!';
416}
417
// Converts the single hex digit |ch| ('0'-'9', 'A'-'F' or 'a'-'f') to its
// numeric value (0-15) and stores it in |*val|.
//
// Returns true on success. Returns false, leaving |*val| untouched, if |ch|
// is not a hex digit. Letters beyond 'F'/'f' are rejected: accepting them
// would produce values above 15 that corrupt the byte assembled by callers
// via (h1 << 4) | h2.
bool hex_decode(char ch, unsigned char* val) {
  if ((ch >= '0') && (ch <= '9')) {
    *val = ch - '0';
  } else if ((ch >= 'A') && (ch <= 'F')) {
    *val = (ch - 'A') + 10;
  } else if ((ch >= 'a') && (ch <= 'f')) {
    *val = (ch - 'a') + 10;
  } else {
    return false;
  }
  return true;
}
430
// Hex-encodes |srclen| bytes from |csource| into |buffer| without any
// delimiter between bytes. Forwards to hex_encode_with_delimiter() with a
// delimiter of 0 and returns its result.
size_t hex_encode(char* buffer, size_t buflen,
                  const char* csource, size_t srclen) {
  return hex_encode_with_delimiter(buffer, buflen, csource, srclen, 0);
}
435
436size_t hex_encode_with_delimiter(char* buffer, size_t buflen,
437                                 const char* csource, size_t srclen,
438                                 char delimiter) {
439  DCHECK(buffer);  // TODO: estimate output size
440  if (buflen == 0)
441    return 0;
442
443  // Init and check bounds.
444  const unsigned char* bsource =
445      reinterpret_cast<const unsigned char*>(csource);
446  size_t srcpos = 0, bufpos = 0;
447  size_t needed = delimiter ? (srclen * 3) : (srclen * 2 + 1);
448  if (buflen < needed)
449    return 0;
450
451  while (srcpos < srclen) {
452    unsigned char ch = bsource[srcpos++];
453    buffer[bufpos  ] = hex_encode((ch >> 4) & 0xF);
454    buffer[bufpos+1] = hex_encode((ch     ) & 0xF);
455    bufpos += 2;
456
457    // Don't write a delimiter after the last byte.
458    if (delimiter && (srcpos < srclen)) {
459      buffer[bufpos] = delimiter;
460      ++bufpos;
461    }
462  }
463
464  // Null terminate.
465  buffer[bufpos] = '\0';
466  return bufpos;
467}
468
// Returns the hex encoding of |srclen| bytes from |source| as a std::string,
// without any delimiter between bytes. Forwards to the std::string overload
// of hex_encode_with_delimiter() with a delimiter of 0.
std::string hex_encode(const char* source, size_t srclen) {
  return hex_encode_with_delimiter(source, srclen, 0);
}
472
473std::string hex_encode_with_delimiter(const char* source, size_t srclen,
474                                      char delimiter) {
475  const size_t kBufferSize = srclen * 3;
476  char* buffer = STACK_ARRAY(char, kBufferSize);
477  size_t length = hex_encode_with_delimiter(buffer, kBufferSize,
478                                            source, srclen, delimiter);
479  DCHECK(srclen == 0 || length > 0);
480  return std::string(buffer, length);
481}
482
// Decodes undelimited hex text from |source| into |cbuffer|. Forwards to
// hex_decode_with_delimiter() with a delimiter of 0 and returns its result.
size_t hex_decode(char * cbuffer, size_t buflen,
                  const char * source, size_t srclen) {
  return hex_decode_with_delimiter(cbuffer, buflen, source, srclen, 0);
}
487
488size_t hex_decode_with_delimiter(char* cbuffer, size_t buflen,
489                                 const char* source, size_t srclen,
490                                 char delimiter) {
491  DCHECK(cbuffer);  // TODO: estimate output size
492  if (buflen == 0)
493    return 0;
494
495  // Init and bounds check.
496  unsigned char* bbuffer = reinterpret_cast<unsigned char*>(cbuffer);
497  size_t srcpos = 0, bufpos = 0;
498  size_t needed = (delimiter) ? (srclen + 1) / 3 : srclen / 2;
499  if (buflen < needed)
500    return 0;
501
502  while (srcpos < srclen) {
503    if ((srclen - srcpos) < 2) {
504      // This means we have an odd number of bytes.
505      return 0;
506    }
507
508    unsigned char h1, h2;
509    if (!hex_decode(source[srcpos], &h1) ||
510        !hex_decode(source[srcpos + 1], &h2))
511      return 0;
512
513    bbuffer[bufpos++] = (h1 << 4) | h2;
514    srcpos += 2;
515
516    // Remove the delimiter if needed.
517    if (delimiter && (srclen - srcpos) > 1) {
518      if (source[srcpos] != delimiter)
519        return 0;
520      ++srcpos;
521    }
522  }
523
524  return bufpos;
525}
526
// Decodes undelimited hex text held in the string |source| into |buffer|.
// Forwards to the std::string overload of hex_decode_with_delimiter() with a
// delimiter of 0 and returns its result.
size_t hex_decode(char* buffer, size_t buflen, const std::string& source) {
  return hex_decode_with_delimiter(buffer, buflen, source, 0);
}
// Decodes hex text held in the string |source| into |buffer|, expecting
// |delimiter| between hex pairs when it is non-zero. Forwards the string's
// characters and length to the pointer/length overload and returns its result.
size_t hex_decode_with_delimiter(char* buffer, size_t buflen,
                                 const std::string& source, char delimiter) {
  return hex_decode_with_delimiter(buffer, buflen,
                                   source.c_str(), source.length(), delimiter);
}
535
536size_t transform(std::string& value, size_t maxlen, const std::string& source,
537                 Transform t) {
538  char* buffer = STACK_ARRAY(char, maxlen + 1);
539  size_t length = t(buffer, maxlen + 1, source.data(), source.length());
540  value.assign(buffer, length);
541  return length;
542}
543
544std::string s_transform(const std::string& source, Transform t) {
545  // Ask transformation function to approximate the destination size (returns upper bound)
546  size_t maxlen = t(NULL, 0, source.data(), source.length());
547  char * buffer = STACK_ARRAY(char, maxlen);
548  size_t len = t(buffer, maxlen, source.data(), source.length());
549  std::string result(buffer, len);
550  return result;
551}
552
553size_t tokenize(const std::string& source, char delimiter,
554                std::vector<std::string>* fields) {
555  DCHECK(fields);
556  fields->clear();
557  size_t last = 0;
558  for (size_t i = 0; i < source.length(); ++i) {
559    if (source[i] == delimiter) {
560      if (i != last) {
561        fields->push_back(source.substr(last, i - last));
562      }
563      last = i + 1;
564    }
565  }
566  if (last != source.length()) {
567    fields->push_back(source.substr(last, source.length() - last));
568  }
569  return fields->size();
570}
571
572size_t tokenize_append(const std::string& source, char delimiter,
573                       std::vector<std::string>* fields) {
574  if (!fields) return 0;
575
576  std::vector<std::string> new_fields;
577  tokenize(source, delimiter, &new_fields);
578  fields->insert(fields->end(), new_fields.begin(), new_fields.end());
579  return fields->size();
580}
581
582size_t tokenize(const std::string& source, char delimiter, char start_mark,
583                char end_mark, std::vector<std::string>* fields) {
584  if (!fields) return 0;
585  fields->clear();
586
587  std::string remain_source = source;
588  while (!remain_source.empty()) {
589    size_t start_pos = remain_source.find(start_mark);
590    if (std::string::npos == start_pos) break;
591    std::string pre_mark;
592    if (start_pos > 0) {
593      pre_mark = remain_source.substr(0, start_pos - 1);
594    }
595
596    ++start_pos;
597    size_t end_pos = remain_source.find(end_mark, start_pos);
598    if (std::string::npos == end_pos) break;
599
600    // We have found the matching marks. First tokenize the pre-mask. Then add
601    // the marked part as a single field. Finally, loop back for the post-mark.
602    tokenize_append(pre_mark, delimiter, fields);
603    fields->push_back(remain_source.substr(start_pos, end_pos - start_pos));
604    remain_source = remain_source.substr(end_pos + 1);
605  }
606
607  return tokenize_append(remain_source, delimiter, fields);
608}
609
610size_t split(const std::string& source, char delimiter,
611             std::vector<std::string>* fields) {
612  DCHECK(fields);
613  fields->clear();
614  size_t last = 0;
615  for (size_t i = 0; i < source.length(); ++i) {
616    if (source[i] == delimiter) {
617      fields->push_back(source.substr(last, i - last));
618      last = i + 1;
619    }
620  }
621  fields->push_back(source.substr(last, source.length() - last));
622  return fields->size();
623}
624
// Returns '_' if |c| compares less than 32 or is one of < > : " / \ | * ?
// and returns |c| unchanged otherwise.
// NOTE(review): the comparison is done on plain char, so where char is signed
// bytes >= 0x80 are also replaced - confirm whether that is intended.
char make_char_safe_for_filename(char c) {
  const bool is_low = (c < 32);
  const bool is_reserved = (c == '<') || (c == '>') || (c == ':') ||
                           (c == '"') || (c == '/') || (c == '\\') ||
                           (c == '|') || (c == '*') || (c == '?');
  return (is_low || is_reserved) ? '_' : c;
}
645
646/*
647void sprintf(std::string& value, size_t maxlen, const char * format, ...) {
648  char * buffer = STACK_ARRAY(char, maxlen + 1);
649  va_list args;
650  va_start(args, format);
651  value.assign(buffer, vsprintfn(buffer, maxlen + 1, format, args));
652  va_end(args);
653}
654*/
655
656/////////////////////////////////////////////////////////////////////////////
657
658}  // namespace rtc
659