1//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the NumericLiteralParser, CharLiteralParser, and
11// StringLiteralParser interfaces.
12//
13//===----------------------------------------------------------------------===//
14
15#include "clang/Lex/LiteralSupport.h"
16#include "clang/Lex/Preprocessor.h"
17#include "clang/Lex/LexDiagnostic.h"
18#include "clang/Basic/TargetInfo.h"
19#include "clang/Basic/ConvertUTF.h"
20#include "llvm/ADT/StringExtras.h"
21#include "llvm/Support/ErrorHandling.h"
22using namespace clang;
23
/// HexDigitValue - Translate a single hexadecimal digit character into its
/// numeric value (0-15).  Both lower- and upper-case letters are accepted;
/// any character that is not a hex digit yields -1.
static int HexDigitValue(char C) {
  switch (C) {
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return C - '0';
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    return 10 + (C - 'a');
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    return 10 + (C - 'A');
  default:
    return -1;
  }
}
32
33static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
34  switch (kind) {
35  default: llvm_unreachable("Unknown token type!");
36  case tok::char_constant:
37  case tok::string_literal:
38  case tok::utf8_string_literal:
39    return Target.getCharWidth();
40  case tok::wide_char_constant:
41  case tok::wide_string_literal:
42    return Target.getWCharWidth();
43  case tok::utf16_char_constant:
44  case tok::utf16_string_literal:
45    return Target.getChar16Width();
46  case tok::utf32_char_constant:
47  case tok::utf32_string_literal:
48    return Target.getChar32Width();
49  }
50}
51
52/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
53/// either a character or a string literal.
54static unsigned ProcessCharEscape(const char *&ThisTokBuf,
55                                  const char *ThisTokEnd, bool &HadError,
56                                  FullSourceLoc Loc, unsigned CharWidth,
57                                  DiagnosticsEngine *Diags) {
58  // Skip the '\' char.
59  ++ThisTokBuf;
60
61  // We know that this character can't be off the end of the buffer, because
62  // that would have been \", which would not have been the end of string.
63  unsigned ResultChar = *ThisTokBuf++;
64  switch (ResultChar) {
65  // These map to themselves.
66  case '\\': case '\'': case '"': case '?': break;
67
68    // These have fixed mappings.
69  case 'a':
70    // TODO: K&R: the meaning of '\\a' is different in traditional C
71    ResultChar = 7;
72    break;
73  case 'b':
74    ResultChar = 8;
75    break;
76  case 'e':
77    if (Diags)
78      Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
79    ResultChar = 27;
80    break;
81  case 'E':
82    if (Diags)
83      Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
84    ResultChar = 27;
85    break;
86  case 'f':
87    ResultChar = 12;
88    break;
89  case 'n':
90    ResultChar = 10;
91    break;
92  case 'r':
93    ResultChar = 13;
94    break;
95  case 't':
96    ResultChar = 9;
97    break;
98  case 'v':
99    ResultChar = 11;
100    break;
101  case 'x': { // Hex escape.
102    ResultChar = 0;
103    if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
104      if (Diags)
105        Diags->Report(Loc, diag::err_hex_escape_no_digits);
106      HadError = 1;
107      break;
108    }
109
110    // Hex escapes are a maximal series of hex digits.
111    bool Overflow = false;
112    for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
113      int CharVal = HexDigitValue(ThisTokBuf[0]);
114      if (CharVal == -1) break;
115      // About to shift out a digit?
116      Overflow |= (ResultChar & 0xF0000000) ? true : false;
117      ResultChar <<= 4;
118      ResultChar |= CharVal;
119    }
120
121    // See if any bits will be truncated when evaluated as a character.
122    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
123      Overflow = true;
124      ResultChar &= ~0U >> (32-CharWidth);
125    }
126
127    // Check for overflow.
128    if (Overflow && Diags)   // Too many digits to fit in
129      Diags->Report(Loc, diag::warn_hex_escape_too_large);
130    break;
131  }
132  case '0': case '1': case '2': case '3':
133  case '4': case '5': case '6': case '7': {
134    // Octal escapes.
135    --ThisTokBuf;
136    ResultChar = 0;
137
138    // Octal escapes are a series of octal digits with maximum length 3.
139    // "\0123" is a two digit sequence equal to "\012" "3".
140    unsigned NumDigits = 0;
141    do {
142      ResultChar <<= 3;
143      ResultChar |= *ThisTokBuf++ - '0';
144      ++NumDigits;
145    } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
146             ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
147
148    // Check for overflow.  Reject '\777', but not L'\777'.
149    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
150      if (Diags)
151        Diags->Report(Loc, diag::warn_octal_escape_too_large);
152      ResultChar &= ~0U >> (32-CharWidth);
153    }
154    break;
155  }
156
157    // Otherwise, these are not valid escapes.
158  case '(': case '{': case '[': case '%':
159    // GCC accepts these as extensions.  We warn about them as such though.
160    if (Diags)
161      Diags->Report(Loc, diag::ext_nonstandard_escape)
162        << std::string()+(char)ResultChar;
163    break;
164  default:
165    if (Diags == 0)
166      break;
167
168    if (isgraph(ResultChar))
169      Diags->Report(Loc, diag::ext_unknown_escape)
170        << std::string()+(char)ResultChar;
171    else
172      Diags->Report(Loc, diag::ext_unknown_escape)
173        << "x"+llvm::utohexstr(ResultChar);
174    break;
175  }
176
177  return ResultChar;
178}
179
180/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
181/// return the UTF32.
182static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
183                             const char *ThisTokEnd,
184                             uint32_t &UcnVal, unsigned short &UcnLen,
185                             FullSourceLoc Loc, DiagnosticsEngine *Diags,
186                             const LangOptions &Features,
187                             bool in_char_string_literal = false) {
188  if (!Features.CPlusPlus && !Features.C99 && Diags)
189    Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
190
191  const char *UcnBegin = ThisTokBuf;
192
193  // Skip the '\u' char's.
194  ThisTokBuf += 2;
195
196  if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
197    if (Diags)
198      Diags->Report(Loc, diag::err_ucn_escape_no_digits);
199    return false;
200  }
201  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
202  unsigned short UcnLenSave = UcnLen;
203  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
204    int CharVal = HexDigitValue(ThisTokBuf[0]);
205    if (CharVal == -1) break;
206    UcnVal <<= 4;
207    UcnVal |= CharVal;
208  }
209  // If we didn't consume the proper number of digits, there is a problem.
210  if (UcnLenSave) {
211    if (Diags) {
212      SourceLocation L =
213        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
214                                       Loc.getManager(), Features);
215      Diags->Report(L, diag::err_ucn_escape_incomplete);
216    }
217    return false;
218  }
219
220  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
221  if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
222      UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
223    if (Diags)
224      Diags->Report(Loc, diag::err_ucn_escape_invalid);
225    return false;
226  }
227
228  // C++11 allows UCNs that refer to control characters and basic source
229  // characters inside character and string literals
230  if (UcnVal < 0xa0 &&
231      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
232    bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
233    if (Diags) {
234      SourceLocation UcnBeginLoc =
235        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
236                                       Loc.getManager(), Features);
237      char BasicSCSChar = UcnVal;
238      if (UcnVal >= 0x20 && UcnVal < 0x7f)
239        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
240                      diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
241          << StringRef(&BasicSCSChar, 1);
242      else
243        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
244                      diag::warn_cxx98_compat_literal_ucn_control_character);
245    }
246    if (IsError)
247      return false;
248  }
249
250  return true;
251}
252
253/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
254/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
255/// StringLiteralParser. When we decide to implement UCN's for identifiers,
256/// we will likely rework our support for UCN's.
257static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
258                            const char *ThisTokEnd,
259                            char *&ResultBuf, bool &HadError,
260                            FullSourceLoc Loc, unsigned CharByteWidth,
261                            DiagnosticsEngine *Diags,
262                            const LangOptions &Features) {
263  typedef uint32_t UTF32;
264  UTF32 UcnVal = 0;
265  unsigned short UcnLen = 0;
266  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
267                        Loc, Diags, Features, true)) {
268    HadError = 1;
269    return;
270  }
271
272  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
273         "only character widths of 1, 2, or 4 bytes supported");
274
275  (void)UcnLen;
276  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
277
278  if (CharByteWidth == 4) {
279    // FIXME: Make the type of the result buffer correct instead of
280    // using reinterpret_cast.
281    UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
282    *ResultPtr = UcnVal;
283    ResultBuf += 4;
284    return;
285  }
286
287  if (CharByteWidth == 2) {
288    // FIXME: Make the type of the result buffer correct instead of
289    // using reinterpret_cast.
290    UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
291
292    if (UcnVal < (UTF32)0xFFFF) {
293      *ResultPtr = UcnVal;
294      ResultBuf += 2;
295      return;
296    }
297
298    // Convert to UTF16.
299    UcnVal -= 0x10000;
300    *ResultPtr     = 0xD800 + (UcnVal >> 10);
301    *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
302    ResultBuf += 4;
303    return;
304  }
305
306  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
307
308  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
309  // The conversion below was inspired by:
310  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
311  // First, we determine how many bytes the result will require.
312  typedef uint8_t UTF8;
313
314  unsigned short bytesToWrite = 0;
315  if (UcnVal < (UTF32)0x80)
316    bytesToWrite = 1;
317  else if (UcnVal < (UTF32)0x800)
318    bytesToWrite = 2;
319  else if (UcnVal < (UTF32)0x10000)
320    bytesToWrite = 3;
321  else
322    bytesToWrite = 4;
323
324  const unsigned byteMask = 0xBF;
325  const unsigned byteMark = 0x80;
326
327  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
328  // into the first byte, depending on how many bytes follow.
329  static const UTF8 firstByteMark[5] = {
330    0x00, 0x00, 0xC0, 0xE0, 0xF0
331  };
332  // Finally, we write the bytes into ResultBuf.
333  ResultBuf += bytesToWrite;
334  switch (bytesToWrite) { // note: everything falls through.
335    case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
336    case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
337    case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
338    case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
339  }
340  // Update the buffer.
341  ResultBuf += bytesToWrite;
342}
343
344
345///       integer-constant: [C99 6.4.4.1]
346///         decimal-constant integer-suffix
347///         octal-constant integer-suffix
348///         hexadecimal-constant integer-suffix
349///       user-defined-integer-literal: [C++11 lex.ext]
350///         decimal-literal ud-suffix
351///         octal-literal ud-suffix
352///         hexadecimal-literal ud-suffix
353///       decimal-constant:
354///         nonzero-digit
355///         decimal-constant digit
356///       octal-constant:
357///         0
358///         octal-constant octal-digit
359///       hexadecimal-constant:
360///         hexadecimal-prefix hexadecimal-digit
361///         hexadecimal-constant hexadecimal-digit
362///       hexadecimal-prefix: one of
363///         0x 0X
364///       integer-suffix:
365///         unsigned-suffix [long-suffix]
366///         unsigned-suffix [long-long-suffix]
367///         long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-suffix]
369///       nonzero-digit:
370///         1 2 3 4 5 6 7 8 9
371///       octal-digit:
372///         0 1 2 3 4 5 6 7
373///       hexadecimal-digit:
374///         0 1 2 3 4 5 6 7 8 9
375///         a b c d e f
376///         A B C D E F
377///       unsigned-suffix: one of
378///         u U
379///       long-suffix: one of
380///         l L
381///       long-long-suffix: one of
382///         ll LL
383///
384///       floating-constant: [C99 6.4.4.2]
385///         TODO: add rules...
386///
387NumericLiteralParser::
388NumericLiteralParser(const char *begin, const char *end,
389                     SourceLocation TokLoc, Preprocessor &pp)
390  : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
391
392  // This routine assumes that the range begin/end matches the regex for integer
393  // and FP constants (specifically, the 'pp-number' regex), and assumes that
394  // the byte at "*end" is both valid and not part of the regex.  Because of
395  // this, it doesn't have to check for 'overscan' in various places.
396  assert(!isalnum(*end) && *end != '.' && *end != '_' &&
397         "Lexer didn't maximally munch?");
398
399  s = DigitsBegin = begin;
400  saw_exponent = false;
401  saw_period = false;
402  saw_ud_suffix = false;
403  isLong = false;
404  isUnsigned = false;
405  isLongLong = false;
406  isFloat = false;
407  isImaginary = false;
408  isMicrosoftInteger = false;
409  hadError = false;
410
411  if (*s == '0') { // parse radix
412    ParseNumberStartingWithZero(TokLoc);
413    if (hadError)
414      return;
415  } else { // the first digit is non-zero
416    radix = 10;
417    s = SkipDigits(s);
418    if (s == ThisTokEnd) {
419      // Done.
420    } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
421      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
422              diag::err_invalid_decimal_digit) << StringRef(s, 1);
423      hadError = true;
424      return;
425    } else if (*s == '.') {
426      s++;
427      saw_period = true;
428      s = SkipDigits(s);
429    }
430    if ((*s == 'e' || *s == 'E')) { // exponent
431      const char *Exponent = s;
432      s++;
433      saw_exponent = true;
434      if (*s == '+' || *s == '-')  s++; // sign
435      const char *first_non_digit = SkipDigits(s);
436      if (first_non_digit != s) {
437        s = first_non_digit;
438      } else {
439        PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
440                diag::err_exponent_has_no_digits);
441        hadError = true;
442        return;
443      }
444    }
445  }
446
447  SuffixBegin = s;
448
449  // Parse the suffix.  At this point we can classify whether we have an FP or
450  // integer constant.
451  bool isFPConstant = isFloatingLiteral();
452
453  // Loop over all of the characters of the suffix.  If we see something bad,
454  // we break out of the loop.
455  for (; s != ThisTokEnd; ++s) {
456    switch (*s) {
457    case 'f':      // FP Suffix for "float"
458    case 'F':
459      if (!isFPConstant) break;  // Error for integer constant.
460      if (isFloat || isLong) break; // FF, LF invalid.
461      isFloat = true;
462      continue;  // Success.
463    case 'u':
464    case 'U':
465      if (isFPConstant) break;  // Error for floating constant.
466      if (isUnsigned) break;    // Cannot be repeated.
467      isUnsigned = true;
468      continue;  // Success.
469    case 'l':
470    case 'L':
471      if (isLong || isLongLong) break;  // Cannot be repeated.
472      if (isFloat) break;               // LF invalid.
473
474      // Check for long long.  The L's need to be adjacent and the same case.
475      if (s+1 != ThisTokEnd && s[1] == s[0]) {
476        if (isFPConstant) break;        // long long invalid for floats.
477        isLongLong = true;
478        ++s;  // Eat both of them.
479      } else {
480        isLong = true;
481      }
482      continue;  // Success.
483    case 'i':
484    case 'I':
485      if (PP.getLangOpts().MicrosoftExt) {
486        if (isFPConstant || isLong || isLongLong) break;
487
488        // Allow i8, i16, i32, i64, and i128.
489        if (s + 1 != ThisTokEnd) {
490          switch (s[1]) {
491            case '8':
492              s += 2; // i8 suffix
493              isMicrosoftInteger = true;
494              break;
495            case '1':
496              if (s + 2 == ThisTokEnd) break;
497              if (s[2] == '6') {
498                s += 3; // i16 suffix
499                isMicrosoftInteger = true;
500              }
501              else if (s[2] == '2') {
502                if (s + 3 == ThisTokEnd) break;
503                if (s[3] == '8') {
504                  s += 4; // i128 suffix
505                  isMicrosoftInteger = true;
506                }
507              }
508              break;
509            case '3':
510              if (s + 2 == ThisTokEnd) break;
511              if (s[2] == '2') {
512                s += 3; // i32 suffix
513                isLong = true;
514                isMicrosoftInteger = true;
515              }
516              break;
517            case '6':
518              if (s + 2 == ThisTokEnd) break;
519              if (s[2] == '4') {
520                s += 3; // i64 suffix
521                isLongLong = true;
522                isMicrosoftInteger = true;
523              }
524              break;
525            default:
526              break;
527          }
528          break;
529        }
530      }
531      // fall through.
532    case 'j':
533    case 'J':
534      if (isImaginary) break;   // Cannot be repeated.
535      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
536              diag::ext_imaginary_constant);
537      isImaginary = true;
538      continue;  // Success.
539    }
540    // If we reached here, there was an error or a ud-suffix.
541    break;
542  }
543
544  if (s != ThisTokEnd) {
545    if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
546      // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
547      // with an '_' are ill-formed.
548      saw_ud_suffix = true;
549      return;
550    }
551
552    // Report an error if there are any.
553    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
554            isFPConstant ? diag::err_invalid_suffix_float_constant :
555                           diag::err_invalid_suffix_integer_constant)
556      << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
557    hadError = true;
558    return;
559  }
560}
561
562/// ParseNumberStartingWithZero - This method is called when the first character
563/// of the number is found to be a zero.  This means it is either an octal
564/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
565/// a floating point number (01239.123e4).  Eat the prefix, determining the
566/// radix etc.
567void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
568  assert(s[0] == '0' && "Invalid method call");
569  s++;
570
571  // Handle a hex number like 0x1234.
572  if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
573    s++;
574    radix = 16;
575    DigitsBegin = s;
576    s = SkipHexDigits(s);
577    bool noSignificand = (s == DigitsBegin);
578    if (s == ThisTokEnd) {
579      // Done.
580    } else if (*s == '.') {
581      s++;
582      saw_period = true;
583      const char *floatDigitsBegin = s;
584      s = SkipHexDigits(s);
585      noSignificand &= (floatDigitsBegin == s);
586    }
587
588    if (noSignificand) {
589      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
590        diag::err_hexconstant_requires_digits);
591      hadError = true;
592      return;
593    }
594
595    // A binary exponent can appear with or with a '.'. If dotted, the
596    // binary exponent is required.
597    if (*s == 'p' || *s == 'P') {
598      const char *Exponent = s;
599      s++;
600      saw_exponent = true;
601      if (*s == '+' || *s == '-')  s++; // sign
602      const char *first_non_digit = SkipDigits(s);
603      if (first_non_digit == s) {
604        PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
605                diag::err_exponent_has_no_digits);
606        hadError = true;
607        return;
608      }
609      s = first_non_digit;
610
611      if (!PP.getLangOpts().HexFloats)
612        PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
613    } else if (saw_period) {
614      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
615              diag::err_hexconstant_requires_exponent);
616      hadError = true;
617    }
618    return;
619  }
620
621  // Handle simple binary numbers 0b01010
622  if (*s == 'b' || *s == 'B') {
623    // 0b101010 is a GCC extension.
624    PP.Diag(TokLoc, diag::ext_binary_literal);
625    ++s;
626    radix = 2;
627    DigitsBegin = s;
628    s = SkipBinaryDigits(s);
629    if (s == ThisTokEnd) {
630      // Done.
631    } else if (isxdigit(*s)) {
632      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
633              diag::err_invalid_binary_digit) << StringRef(s, 1);
634      hadError = true;
635    }
636    // Other suffixes will be diagnosed by the caller.
637    return;
638  }
639
640  // For now, the radix is set to 8. If we discover that we have a
641  // floating point constant, the radix will change to 10. Octal floating
642  // point constants are not permitted (only decimal and hexadecimal).
643  radix = 8;
644  DigitsBegin = s;
645  s = SkipOctalDigits(s);
646  if (s == ThisTokEnd)
647    return; // Done, simple octal number like 01234
648
649  // If we have some other non-octal digit that *is* a decimal digit, see if
650  // this is part of a floating point number like 094.123 or 09e1.
651  if (isdigit(*s)) {
652    const char *EndDecimal = SkipDigits(s);
653    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
654      s = EndDecimal;
655      radix = 10;
656    }
657  }
658
659  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
660  // the code is using an incorrect base.
661  if (isxdigit(*s) && *s != 'e' && *s != 'E') {
662    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
663            diag::err_invalid_octal_digit) << StringRef(s, 1);
664    hadError = true;
665    return;
666  }
667
668  if (*s == '.') {
669    s++;
670    radix = 10;
671    saw_period = true;
672    s = SkipDigits(s); // Skip suffix.
673  }
674  if (*s == 'e' || *s == 'E') { // exponent
675    const char *Exponent = s;
676    s++;
677    radix = 10;
678    saw_exponent = true;
679    if (*s == '+' || *s == '-')  s++; // sign
680    const char *first_non_digit = SkipDigits(s);
681    if (first_non_digit != s) {
682      s = first_non_digit;
683    } else {
684      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
685              diag::err_exponent_has_no_digits);
686      hadError = true;
687      return;
688    }
689  }
690}
691
692
693/// GetIntegerValue - Convert this numeric literal value to an APInt that
694/// matches Val's input width.  If there is an overflow, set Val to the low bits
695/// of the result and return true.  Otherwise, return false.
696bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
697  // Fast path: Compute a conservative bound on the maximum number of
698  // bits per digit in this radix. If we can't possibly overflow a
699  // uint64 based on that bound then do the simple conversion to
700  // integer. This avoids the expensive overflow checking below, and
701  // handles the common cases that matter (small decimal integers and
702  // hex/octal values which don't overflow).
703  unsigned MaxBitsPerDigit = 1;
704  while ((1U << MaxBitsPerDigit) < radix)
705    MaxBitsPerDigit += 1;
706  if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
707    uint64_t N = 0;
708    for (s = DigitsBegin; s != SuffixBegin; ++s)
709      N = N*radix + HexDigitValue(*s);
710
711    // This will truncate the value to Val's input width. Simply check
712    // for overflow by comparing.
713    Val = N;
714    return Val.getZExtValue() != N;
715  }
716
717  Val = 0;
718  s = DigitsBegin;
719
720  llvm::APInt RadixVal(Val.getBitWidth(), radix);
721  llvm::APInt CharVal(Val.getBitWidth(), 0);
722  llvm::APInt OldVal = Val;
723
724  bool OverflowOccurred = false;
725  while (s < SuffixBegin) {
726    unsigned C = HexDigitValue(*s++);
727
728    // If this letter is out of bound for this radix, reject it.
729    assert(C < radix && "NumericLiteralParser ctor should have rejected this");
730
731    CharVal = C;
732
733    // Add the digit to the value in the appropriate radix.  If adding in digits
734    // made the value smaller, then this overflowed.
735    OldVal = Val;
736
737    // Multiply by radix, did overflow occur on the multiply?
738    Val *= RadixVal;
739    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
740
741    // Add value, did overflow occur on the value?
742    //   (a + b) ult b  <=> overflow
743    Val += CharVal;
744    OverflowOccurred |= Val.ult(CharVal);
745  }
746  return OverflowOccurred;
747}
748
749llvm::APFloat::opStatus
750NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
751  using llvm::APFloat;
752
753  unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
754  return Result.convertFromString(StringRef(ThisTokBegin, n),
755                                  APFloat::rmNearestTiesToEven);
756}
757
758
759///       user-defined-character-literal: [C++11 lex.ext]
760///         character-literal ud-suffix
761///       ud-suffix:
762///         identifier
763///       character-literal: [C++11 lex.ccon]
764///         ' c-char-sequence '
765///         u' c-char-sequence '
766///         U' c-char-sequence '
767///         L' c-char-sequence '
768///       c-char-sequence:
769///         c-char
770///         c-char-sequence c-char
771///       c-char:
772///         any member of the source character set except the single-quote ',
773///           backslash \, or new-line character
774///         escape-sequence
775///         universal-character-name
776///       escape-sequence:
777///         simple-escape-sequence
778///         octal-escape-sequence
779///         hexadecimal-escape-sequence
780///       simple-escape-sequence:
781///         one of \' \" \? \\ \a \b \f \n \r \t \v
782///       octal-escape-sequence:
783///         \ octal-digit
784///         \ octal-digit octal-digit
785///         \ octal-digit octal-digit octal-digit
786///       hexadecimal-escape-sequence:
787///         \x hexadecimal-digit
788///         hexadecimal-escape-sequence hexadecimal-digit
789///       universal-character-name: [C++11 lex.charset]
790///         \u hex-quad
791///         \U hex-quad hex-quad
792///       hex-quad:
793///         hex-digit hex-digit hex-digit hex-digit
794///
795CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
796                                     SourceLocation Loc, Preprocessor &PP,
797                                     tok::TokenKind kind) {
798  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
799  HadError = false;
800
801  Kind = kind;
802
803  const char *TokBegin = begin;
804
805  // Skip over wide character determinant.
806  if (Kind != tok::char_constant) {
807    ++begin;
808  }
809
810  // Skip over the entry quote.
811  assert(begin[0] == '\'' && "Invalid token lexed");
812  ++begin;
813
814  // Remove an optional ud-suffix.
815  if (end[-1] != '\'') {
816    const char *UDSuffixEnd = end;
817    do {
818      --end;
819    } while (end[-1] != '\'');
820    UDSuffixBuf.assign(end, UDSuffixEnd);
821    UDSuffixOffset = end - TokBegin;
822  }
823
824  // Trim the ending quote.
825  assert(end != begin && "Invalid token lexed");
826  --end;
827
828  // FIXME: The "Value" is an uint64_t so we can handle char literals of
829  // up to 64-bits.
830  // FIXME: This extensively assumes that 'char' is 8-bits.
831  assert(PP.getTargetInfo().getCharWidth() == 8 &&
832         "Assumes char is 8 bits");
833  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
834         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
835         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
836  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
837         "Assumes sizeof(wchar) on target is <= 64");
838
839  SmallVector<uint32_t,4> codepoint_buffer;
840  codepoint_buffer.resize(end-begin);
841  uint32_t *buffer_begin = &codepoint_buffer.front();
842  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
843
844  // Unicode escapes representing characters that cannot be correctly
845  // represented in a single code unit are disallowed in character literals
846  // by this implementation.
847  uint32_t largest_character_for_kind;
848  if (tok::wide_char_constant == Kind) {
849    largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
850  } else if (tok::utf16_char_constant == Kind) {
851    largest_character_for_kind = 0xFFFF;
852  } else if (tok::utf32_char_constant == Kind) {
853    largest_character_for_kind = 0x10FFFF;
854  } else {
855    largest_character_for_kind = 0x7Fu;
856  }
857
858  while (begin!=end) {
859    // Is this a span of non-escape characters?
860    if (begin[0] != '\\') {
861      char const *start = begin;
862      do {
863        ++begin;
864      } while (begin != end && *begin != '\\');
865
866      char const *tmp_in_start = start;
867      uint32_t *tmp_out_start = buffer_begin;
868      ConversionResult res =
869      ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
870                         reinterpret_cast<UTF8 const *>(begin),
871                         &buffer_begin,buffer_end,strictConversion);
872      if (res!=conversionOK) {
873        // If we see bad encoding for unprefixed character literals, warn and
874        // simply copy the byte values, for compatibility with gcc and
875        // older versions of clang.
876        bool NoErrorOnBadEncoding = isAscii();
877        unsigned Msg = diag::err_bad_character_encoding;
878        if (NoErrorOnBadEncoding)
879          Msg = diag::warn_bad_character_encoding;
880        PP.Diag(Loc, Msg);
881        if (NoErrorOnBadEncoding) {
882          start = tmp_in_start;
883          buffer_begin = tmp_out_start;
884          for ( ; start != begin; ++start, ++buffer_begin)
885            *buffer_begin = static_cast<uint8_t>(*start);
886        } else {
887          HadError = true;
888        }
889      } else {
890        for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
891          if (*tmp_out_start > largest_character_for_kind) {
892            HadError = true;
893            PP.Diag(Loc, diag::err_character_too_large);
894          }
895        }
896      }
897
898      continue;
899    }
    // Is this a Universal Character Name escape?
901    if (begin[1] == 'u' || begin[1] == 'U') {
902      unsigned short UcnLen = 0;
903      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
904                            FullSourceLoc(Loc, PP.getSourceManager()),
905                            &PP.getDiagnostics(), PP.getLangOpts(),
906                            true))
907      {
908        HadError = true;
909      } else if (*buffer_begin > largest_character_for_kind) {
910        HadError = true;
911        PP.Diag(Loc,diag::err_character_too_large);
912      }
913
914      ++buffer_begin;
915      continue;
916    }
917    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
918    uint64_t result =
919    ProcessCharEscape(begin, end, HadError,
920                      FullSourceLoc(Loc,PP.getSourceManager()),
921                      CharWidth, &PP.getDiagnostics());
922    *buffer_begin++ = result;
923  }
924
925  unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
926
927  if (NumCharsSoFar > 1) {
928    if (isWide())
929      PP.Diag(Loc, diag::warn_extraneous_char_constant);
930    else if (isAscii() && NumCharsSoFar == 4)
931      PP.Diag(Loc, diag::ext_four_char_character_literal);
932    else if (isAscii())
933      PP.Diag(Loc, diag::ext_multichar_character_literal);
934    else
935      PP.Diag(Loc, diag::err_multichar_utf_character_literal);
936    IsMultiChar = true;
937  } else
938    IsMultiChar = false;
939
940  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
941
942  // Narrow character literals act as though their value is concatenated
943  // in this implementation, but warn on overflow.
944  bool multi_char_too_long = false;
945  if (isAscii() && isMultiChar()) {
946    LitVal = 0;
947    for (size_t i=0;i<NumCharsSoFar;++i) {
948      // check for enough leading zeros to shift into
949      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
950      LitVal <<= 8;
951      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
952    }
953  } else if (NumCharsSoFar > 0) {
954    // otherwise just take the last character
955    LitVal = buffer_begin[-1];
956  }
957
958  if (!HadError && multi_char_too_long) {
959    PP.Diag(Loc,diag::warn_char_constant_too_large);
960  }
961
962  // Transfer the value from APInt to uint64_t
963  Value = LitVal.getZExtValue();
964
965  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
966  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
969  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
970      PP.getLangOpts().CharIsSigned)
971    Value = (signed char)Value;
972}
973
974
975///       string-literal: [C++0x lex.string]
976///         encoding-prefix " [s-char-sequence] "
977///         encoding-prefix R raw-string
978///       encoding-prefix:
979///         u8
980///         u
981///         U
982///         L
983///       s-char-sequence:
984///         s-char
985///         s-char-sequence s-char
986///       s-char:
987///         any member of the source character set except the double-quote ",
988///           backslash \, or new-line character
989///         escape-sequence
990///         universal-character-name
991///       raw-string:
992///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
993///       r-char-sequence:
994///         r-char
995///         r-char-sequence r-char
996///       r-char:
997///         any member of the source character set, except a right parenthesis )
998///           followed by the initial d-char-sequence (which may be empty)
999///           followed by a double quote ".
1000///       d-char-sequence:
1001///         d-char
1002///         d-char-sequence d-char
1003///       d-char:
1004///         any member of the basic source character set except:
1005///           space, the left parenthesis (, the right parenthesis ),
1006///           the backslash \, and the control characters representing horizontal
1007///           tab, vertical tab, form feed, and newline.
1008///       escape-sequence: [C++0x lex.ccon]
1009///         simple-escape-sequence
1010///         octal-escape-sequence
1011///         hexadecimal-escape-sequence
1012///       simple-escape-sequence:
1013///         one of \' \" \? \\ \a \b \f \n \r \t \v
1014///       octal-escape-sequence:
1015///         \ octal-digit
1016///         \ octal-digit octal-digit
1017///         \ octal-digit octal-digit octal-digit
1018///       hexadecimal-escape-sequence:
1019///         \x hexadecimal-digit
1020///         hexadecimal-escape-sequence hexadecimal-digit
1021///       universal-character-name:
1022///         \u hex-quad
1023///         \U hex-quad hex-quad
1024///       hex-quad:
1025///         hex-digit hex-digit hex-digit hex-digit
1026///
1027StringLiteralParser::
1028StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
1029                    Preprocessor &PP, bool Complain)
1030  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1031    Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
1032    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1033    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1034  init(StringToks, NumStringToks);
1035}
1036
1037void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
1038  // The literal token may have come from an invalid source location (e.g. due
1039  // to a PCH error), in which case the token length will be 0.
1040  if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
1041    hadError = true;
1042    return;
1043  }
1044
1045  // Scan all of the string portions, remember the max individual token length,
1046  // computing a bound on the concatenated string length, and see whether any
1047  // piece is a wide-string.  If any of the string portions is a wide-string
1048  // literal, the result is a wide-string literal [C99 6.4.5p4].
1049  assert(NumStringToks && "expected at least one token");
1050  MaxTokenLength = StringToks[0].getLength();
1051  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1052  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1053  Kind = StringToks[0].getKind();
1054
1055  hadError = false;
1056
1057  // Implement Translation Phase #6: concatenation of string literals
1058  /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1059  for (unsigned i = 1; i != NumStringToks; ++i) {
1060    if (StringToks[i].getLength() < 2) {
1061      hadError = true;
1062      return;
1063    }
1064
1065    // The string could be shorter than this if it needs cleaning, but this is a
1066    // reasonable bound, which is all we need.
1067    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1068    SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1069
1070    // Remember maximum string piece length.
1071    if (StringToks[i].getLength() > MaxTokenLength)
1072      MaxTokenLength = StringToks[i].getLength();
1073
1074    // Remember if we see any wide or utf-8/16/32 strings.
1075    // Also check for illegal concatenations.
1076    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1077      if (isAscii()) {
1078        Kind = StringToks[i].getKind();
1079      } else {
1080        if (Diags)
1081          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
1082                        diag::err_unsupported_string_concat);
1083        hadError = true;
1084      }
1085    }
1086  }
1087
1088  // Include space for the null terminator.
1089  ++SizeBound;
1090
1091  // TODO: K&R warning: "traditional C rejects string constant concatenation"
1092
1093  // Get the width in bytes of char/wchar_t/char16_t/char32_t
1094  CharByteWidth = getCharWidth(Kind, Target);
1095  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1096  CharByteWidth /= 8;
1097
1098  // The output buffer size needs to be large enough to hold wide characters.
1099  // This is a worst-case assumption which basically corresponds to L"" "long".
1100  SizeBound *= CharByteWidth;
1101
1102  // Size the temporary buffer to hold the result string data.
1103  ResultBuf.resize(SizeBound);
1104
1105  // Likewise, but for each string piece.
1106  SmallString<512> TokenBuf;
1107  TokenBuf.resize(MaxTokenLength);
1108
1109  // Loop over all the strings, getting their spelling, and expanding them to
1110  // wide strings as appropriate.
1111  ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1112
1113  Pascal = false;
1114
1115  SourceLocation UDSuffixTokLoc;
1116
1117  for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
1118    const char *ThisTokBuf = &TokenBuf[0];
1119    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1120    // that ThisTokBuf points to a buffer that is big enough for the whole token
1121    // and 'spelled' tokens can only shrink.
1122    bool StringInvalid = false;
1123    unsigned ThisTokLen =
1124      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1125                         &StringInvalid);
1126    if (StringInvalid) {
1127      hadError = true;
1128      continue;
1129    }
1130
1131    const char *ThisTokBegin = ThisTokBuf;
1132    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1133
1134    // Remove an optional ud-suffix.
1135    if (ThisTokEnd[-1] != '"') {
1136      const char *UDSuffixEnd = ThisTokEnd;
1137      do {
1138        --ThisTokEnd;
1139      } while (ThisTokEnd[-1] != '"');
1140
1141      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1142
1143      if (UDSuffixBuf.empty()) {
1144        UDSuffixBuf.assign(UDSuffix);
1145        UDSuffixToken = i;
1146        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1147        UDSuffixTokLoc = StringToks[i].getLocation();
1148      } else if (!UDSuffixBuf.equals(UDSuffix)) {
1149        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1150        // result of a concatenation involving at least one user-defined-string-
1151        // literal, all the participating user-defined-string-literals shall
1152        // have the same ud-suffix.
1153        if (Diags) {
1154          SourceLocation TokLoc = StringToks[i].getLocation();
1155          Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1156            << UDSuffixBuf << UDSuffix
1157            << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1158            << SourceRange(TokLoc, TokLoc);
1159        }
1160        hadError = true;
1161      }
1162    }
1163
1164    // Strip the end quote.
1165    --ThisTokEnd;
1166
1167    // TODO: Input character set mapping support.
1168
1169    // Skip marker for wide or unicode strings.
1170    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1171      ++ThisTokBuf;
1172      // Skip 8 of u8 marker for utf8 strings.
1173      if (ThisTokBuf[0] == '8')
1174        ++ThisTokBuf;
1175    }
1176
1177    // Check for raw string
1178    if (ThisTokBuf[0] == 'R') {
1179      ThisTokBuf += 2; // skip R"
1180
1181      const char *Prefix = ThisTokBuf;
1182      while (ThisTokBuf[0] != '(')
1183        ++ThisTokBuf;
1184      ++ThisTokBuf; // skip '('
1185
1186      // Remove same number of characters from the end
1187      ThisTokEnd -= ThisTokBuf - Prefix;
1188      assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1189
1190      // Copy the string over
1191      if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
1192        if (DiagnoseBadString(StringToks[i]))
1193          hadError = true;
1194    } else {
1195      assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
1196      ++ThisTokBuf; // skip "
1197
1198      // Check if this is a pascal string
1199      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1200          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1201
1202        // If the \p sequence is found in the first token, we have a pascal string
1203        // Otherwise, if we already have a pascal string, ignore the first \p
1204        if (i == 0) {
1205          ++ThisTokBuf;
1206          Pascal = true;
1207        } else if (Pascal)
1208          ThisTokBuf += 2;
1209      }
1210
1211      while (ThisTokBuf != ThisTokEnd) {
1212        // Is this a span of non-escape characters?
1213        if (ThisTokBuf[0] != '\\') {
1214          const char *InStart = ThisTokBuf;
1215          do {
1216            ++ThisTokBuf;
1217          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1218
1219          // Copy the character span over.
1220          if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
1221            if (DiagnoseBadString(StringToks[i]))
1222              hadError = true;
1223          continue;
1224        }
1225        // Is this a Universal Character Name escape?
1226        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1227          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1228                          ResultPtr, hadError,
1229                          FullSourceLoc(StringToks[i].getLocation(), SM),
1230                          CharByteWidth, Diags, Features);
1231          continue;
1232        }
1233        // Otherwise, this is a non-UCN escape character.  Process it.
1234        unsigned ResultChar =
1235          ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
1236                            FullSourceLoc(StringToks[i].getLocation(), SM),
1237                            CharByteWidth*8, Diags);
1238
1239        if (CharByteWidth == 4) {
1240          // FIXME: Make the type of the result buffer correct instead of
1241          // using reinterpret_cast.
1242          UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
1243          *ResultWidePtr = ResultChar;
1244          ResultPtr += 4;
1245        } else if (CharByteWidth == 2) {
1246          // FIXME: Make the type of the result buffer correct instead of
1247          // using reinterpret_cast.
1248          UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
1249          *ResultWidePtr = ResultChar & 0xFFFF;
1250          ResultPtr += 2;
1251        } else {
1252          assert(CharByteWidth == 1 && "Unexpected char width");
1253          *ResultPtr++ = ResultChar & 0xFF;
1254        }
1255      }
1256    }
1257  }
1258
1259  if (Pascal) {
1260    if (CharByteWidth == 4) {
1261      // FIXME: Make the type of the result buffer correct instead of
1262      // using reinterpret_cast.
1263      UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1264      ResultWidePtr[0] = GetNumStringChars() - 1;
1265    } else if (CharByteWidth == 2) {
1266      // FIXME: Make the type of the result buffer correct instead of
1267      // using reinterpret_cast.
1268      UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1269      ResultWidePtr[0] = GetNumStringChars() - 1;
1270    } else {
1271      assert(CharByteWidth == 1 && "Unexpected char width");
1272      ResultBuf[0] = GetNumStringChars() - 1;
1273    }
1274
1275    // Verify that pascal strings aren't too large.
1276    if (GetStringLength() > 256) {
1277      if (Diags)
1278        Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1279                      diag::err_pascal_string_too_long)
1280          << SourceRange(StringToks[0].getLocation(),
1281                         StringToks[NumStringToks-1].getLocation());
1282      hadError = true;
1283      return;
1284    }
1285  } else if (Diags) {
1286    // Complain if this string literal has too many characters.
1287    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1288
1289    if (GetNumStringChars() > MaxChars)
1290      Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1291                    diag::ext_string_too_long)
1292        << GetNumStringChars() << MaxChars
1293        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1294        << SourceRange(StringToks[0].getLocation(),
1295                       StringToks[NumStringToks-1].getLocation());
1296  }
1297}
1298
1299
1300/// copyStringFragment - This function copies from Start to End into ResultPtr.
1301/// Performs widening for multi-byte characters.
1302bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
1303  assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4);
1304  ConversionResult result = conversionOK;
1305  // Copy the character span over.
1306  if (CharByteWidth == 1) {
1307    if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
1308                           reinterpret_cast<const UTF8*>(Fragment.end())))
1309      result = sourceIllegal;
1310    memcpy(ResultPtr, Fragment.data(), Fragment.size());
1311    ResultPtr += Fragment.size();
1312  } else if (CharByteWidth == 2) {
1313    UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
1314    // FIXME: Make the type of the result buffer correct instead of
1315    // using reinterpret_cast.
1316    UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
1317    ConversionFlags flags = strictConversion;
1318    result = ConvertUTF8toUTF16(
1319	    &sourceStart,sourceStart + Fragment.size(),
1320        &targetStart,targetStart + 2*Fragment.size(),flags);
1321    if (result==conversionOK)
1322      ResultPtr = reinterpret_cast<char*>(targetStart);
1323  } else if (CharByteWidth == 4) {
1324    UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
1325    // FIXME: Make the type of the result buffer correct instead of
1326    // using reinterpret_cast.
1327    UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
1328    ConversionFlags flags = strictConversion;
1329    result = ConvertUTF8toUTF32(
1330        &sourceStart,sourceStart + Fragment.size(),
1331        &targetStart,targetStart + 4*Fragment.size(),flags);
1332    if (result==conversionOK)
1333      ResultPtr = reinterpret_cast<char*>(targetStart);
1334  }
1335  assert((result != targetExhausted)
1336         && "ConvertUTF8toUTFXX exhausted target buffer");
1337  return result != conversionOK;
1338}
1339
1340bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
1341  // If we see bad encoding for unprefixed string literals, warn and
1342  // simply copy the byte values, for compatibility with gcc and older
1343  // versions of clang.
1344  bool NoErrorOnBadEncoding = isAscii();
1345  unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
1346                                        diag::err_bad_string_encoding;
1347  if (Diags)
1348    Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
1349  return !NoErrorOnBadEncoding;
1350}
1351
1352/// getOffsetOfStringByte - This function returns the offset of the
1353/// specified byte of the string data represented by Token.  This handles
1354/// advancing over escape sequences in the string.
1355unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1356                                                    unsigned ByteNo) const {
1357  // Get the spelling of the token.
1358  SmallString<32> SpellingBuffer;
1359  SpellingBuffer.resize(Tok.getLength());
1360
1361  bool StringInvalid = false;
1362  const char *SpellingPtr = &SpellingBuffer[0];
1363  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1364                                       &StringInvalid);
1365  if (StringInvalid)
1366    return 0;
1367
1368  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1369         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1370
1371
1372  const char *SpellingStart = SpellingPtr;
1373  const char *SpellingEnd = SpellingPtr+TokLen;
1374
1375  // Skip over the leading quote.
1376  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1377  ++SpellingPtr;
1378
1379  // Skip over bytes until we find the offset we're looking for.
1380  while (ByteNo) {
1381    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1382
1383    // Step over non-escapes simply.
1384    if (*SpellingPtr != '\\') {
1385      ++SpellingPtr;
1386      --ByteNo;
1387      continue;
1388    }
1389
1390    // Otherwise, this is an escape character.  Advance over it.
1391    bool HadError = false;
1392    ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
1393                      FullSourceLoc(Tok.getLocation(), SM),
1394                      CharByteWidth*8, Diags);
1395    assert(!HadError && "This method isn't valid on erroneous strings");
1396    --ByteNo;
1397  }
1398
1399  return SpellingPtr-SpellingStart;
1400}
1401