1//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file implements the NumericLiteralParser, CharLiteralParser, and
11// StringLiteralParser interfaces.
12//
13//===----------------------------------------------------------------------===//
14
15#include "clang/Lex/LiteralSupport.h"
16#include "clang/Basic/CharInfo.h"
17#include "clang/Basic/TargetInfo.h"
18#include "clang/Lex/LexDiagnostic.h"
19#include "clang/Lex/Preprocessor.h"
20#include "llvm/ADT/StringExtras.h"
21#include "llvm/Support/ConvertUTF.h"
22#include "llvm/Support/ErrorHandling.h"
23
24using namespace clang;
25
26static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
27  switch (kind) {
28  default: llvm_unreachable("Unknown token type!");
29  case tok::char_constant:
30  case tok::string_literal:
31  case tok::utf8_char_constant:
32  case tok::utf8_string_literal:
33    return Target.getCharWidth();
34  case tok::wide_char_constant:
35  case tok::wide_string_literal:
36    return Target.getWCharWidth();
37  case tok::utf16_char_constant:
38  case tok::utf16_string_literal:
39    return Target.getChar16Width();
40  case tok::utf32_char_constant:
41  case tok::utf32_string_literal:
42    return Target.getChar32Width();
43  }
44}
45
46static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
47                                           FullSourceLoc TokLoc,
48                                           const char *TokBegin,
49                                           const char *TokRangeBegin,
50                                           const char *TokRangeEnd) {
51  SourceLocation Begin =
52    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
53                                   TokLoc.getManager(), Features);
54  SourceLocation End =
55    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
56                                   TokLoc.getManager(), Features);
57  return CharSourceRange::getCharRange(Begin, End);
58}
59
60/// \brief Produce a diagnostic highlighting some portion of a literal.
61///
62/// Emits the diagnostic \p DiagID, highlighting the range of characters from
63/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
64/// a substring of a spelling buffer for the token beginning at \p TokBegin.
65static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
66                              const LangOptions &Features, FullSourceLoc TokLoc,
67                              const char *TokBegin, const char *TokRangeBegin,
68                              const char *TokRangeEnd, unsigned DiagID) {
69  SourceLocation Begin =
70    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
71                                   TokLoc.getManager(), Features);
72  return Diags->Report(Begin, DiagID) <<
73    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
74}
75
76/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
77/// either a character or a string literal.
78static unsigned ProcessCharEscape(const char *ThisTokBegin,
79                                  const char *&ThisTokBuf,
80                                  const char *ThisTokEnd, bool &HadError,
81                                  FullSourceLoc Loc, unsigned CharWidth,
82                                  DiagnosticsEngine *Diags,
83                                  const LangOptions &Features) {
84  const char *EscapeBegin = ThisTokBuf;
85
86  // Skip the '\' char.
87  ++ThisTokBuf;
88
89  // We know that this character can't be off the end of the buffer, because
90  // that would have been \", which would not have been the end of string.
91  unsigned ResultChar = *ThisTokBuf++;
92  switch (ResultChar) {
93  // These map to themselves.
94  case '\\': case '\'': case '"': case '?': break;
95
96    // These have fixed mappings.
97  case 'a':
98    // TODO: K&R: the meaning of '\\a' is different in traditional C
99    ResultChar = 7;
100    break;
101  case 'b':
102    ResultChar = 8;
103    break;
104  case 'e':
105    if (Diags)
106      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
107           diag::ext_nonstandard_escape) << "e";
108    ResultChar = 27;
109    break;
110  case 'E':
111    if (Diags)
112      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
113           diag::ext_nonstandard_escape) << "E";
114    ResultChar = 27;
115    break;
116  case 'f':
117    ResultChar = 12;
118    break;
119  case 'n':
120    ResultChar = 10;
121    break;
122  case 'r':
123    ResultChar = 13;
124    break;
125  case 't':
126    ResultChar = 9;
127    break;
128  case 'v':
129    ResultChar = 11;
130    break;
131  case 'x': { // Hex escape.
132    ResultChar = 0;
133    if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
134      if (Diags)
135        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
136             diag::err_hex_escape_no_digits) << "x";
137      HadError = 1;
138      break;
139    }
140
141    // Hex escapes are a maximal series of hex digits.
142    bool Overflow = false;
143    for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
144      int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
145      if (CharVal == -1) break;
146      // About to shift out a digit?
147      if (ResultChar & 0xF0000000)
148        Overflow = true;
149      ResultChar <<= 4;
150      ResultChar |= CharVal;
151    }
152
153    // See if any bits will be truncated when evaluated as a character.
154    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
155      Overflow = true;
156      ResultChar &= ~0U >> (32-CharWidth);
157    }
158
159    // Check for overflow.
160    if (Overflow && Diags)   // Too many digits to fit in
161      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
162           diag::err_escape_too_large) << 0;
163    break;
164  }
165  case '0': case '1': case '2': case '3':
166  case '4': case '5': case '6': case '7': {
167    // Octal escapes.
168    --ThisTokBuf;
169    ResultChar = 0;
170
171    // Octal escapes are a series of octal digits with maximum length 3.
172    // "\0123" is a two digit sequence equal to "\012" "3".
173    unsigned NumDigits = 0;
174    do {
175      ResultChar <<= 3;
176      ResultChar |= *ThisTokBuf++ - '0';
177      ++NumDigits;
178    } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
179             ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
180
181    // Check for overflow.  Reject '\777', but not L'\777'.
182    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
183      if (Diags)
184        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
185             diag::err_escape_too_large) << 1;
186      ResultChar &= ~0U >> (32-CharWidth);
187    }
188    break;
189  }
190
191    // Otherwise, these are not valid escapes.
192  case '(': case '{': case '[': case '%':
193    // GCC accepts these as extensions.  We warn about them as such though.
194    if (Diags)
195      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
196           diag::ext_nonstandard_escape)
197        << std::string(1, ResultChar);
198    break;
199  default:
200    if (!Diags)
201      break;
202
203    if (isPrintable(ResultChar))
204      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
205           diag::ext_unknown_escape)
206        << std::string(1, ResultChar);
207    else
208      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
209           diag::ext_unknown_escape)
210        << "x" + llvm::utohexstr(ResultChar);
211    break;
212  }
213
214  return ResultChar;
215}
216
217static void appendCodePoint(unsigned Codepoint,
218                            llvm::SmallVectorImpl<char> &Str) {
219  char ResultBuf[4];
220  char *ResultPtr = ResultBuf;
221  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
222  (void)Res;
223  assert(Res && "Unexpected conversion failure");
224  Str.append(ResultBuf, ResultPtr);
225}
226
227void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
228  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
229    if (*I != '\\') {
230      Buf.push_back(*I);
231      continue;
232    }
233
234    ++I;
235    assert(*I == 'u' || *I == 'U');
236
237    unsigned NumHexDigits;
238    if (*I == 'u')
239      NumHexDigits = 4;
240    else
241      NumHexDigits = 8;
242
243    assert(I + NumHexDigits <= E);
244
245    uint32_t CodePoint = 0;
246    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
247      unsigned Value = llvm::hexDigitValue(*I);
248      assert(Value != -1U);
249
250      CodePoint <<= 4;
251      CodePoint += Value;
252    }
253
254    appendCodePoint(CodePoint, Buf);
255    --I;
256  }
257}
258
259/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
260/// return the UTF32.
261static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
262                             const char *ThisTokEnd,
263                             uint32_t &UcnVal, unsigned short &UcnLen,
264                             FullSourceLoc Loc, DiagnosticsEngine *Diags,
265                             const LangOptions &Features,
266                             bool in_char_string_literal = false) {
267  const char *UcnBegin = ThisTokBuf;
268
269  // Skip the '\u' char's.
270  ThisTokBuf += 2;
271
272  if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
273    if (Diags)
274      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
275           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
276    return false;
277  }
278  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
279  unsigned short UcnLenSave = UcnLen;
280  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
281    int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
282    if (CharVal == -1) break;
283    UcnVal <<= 4;
284    UcnVal |= CharVal;
285  }
286  // If we didn't consume the proper number of digits, there is a problem.
287  if (UcnLenSave) {
288    if (Diags)
289      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
290           diag::err_ucn_escape_incomplete);
291    return false;
292  }
293
294  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
295  if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
296      UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
297    if (Diags)
298      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
299           diag::err_ucn_escape_invalid);
300    return false;
301  }
302
303  // C++11 allows UCNs that refer to control characters and basic source
304  // characters inside character and string literals
305  if (UcnVal < 0xa0 &&
306      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
307    bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
308    if (Diags) {
309      char BasicSCSChar = UcnVal;
310      if (UcnVal >= 0x20 && UcnVal < 0x7f)
311        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
312             IsError ? diag::err_ucn_escape_basic_scs :
313                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
314            << StringRef(&BasicSCSChar, 1);
315      else
316        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
317             IsError ? diag::err_ucn_control_character :
318                       diag::warn_cxx98_compat_literal_ucn_control_character);
319    }
320    if (IsError)
321      return false;
322  }
323
324  if (!Features.CPlusPlus && !Features.C99 && Diags)
325    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
326         diag::warn_ucn_not_valid_in_c89_literal);
327
328  return true;
329}
330
331/// MeasureUCNEscape - Determine the number of bytes within the resulting string
332/// which this UCN will occupy.
333static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
334                            const char *ThisTokEnd, unsigned CharByteWidth,
335                            const LangOptions &Features, bool &HadError) {
336  // UTF-32: 4 bytes per escape.
337  if (CharByteWidth == 4)
338    return 4;
339
340  uint32_t UcnVal = 0;
341  unsigned short UcnLen = 0;
342  FullSourceLoc Loc;
343
344  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
345                        UcnLen, Loc, nullptr, Features, true)) {
346    HadError = true;
347    return 0;
348  }
349
350  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
351  if (CharByteWidth == 2)
352    return UcnVal <= 0xFFFF ? 2 : 4;
353
354  // UTF-8.
355  if (UcnVal < 0x80)
356    return 1;
357  if (UcnVal < 0x800)
358    return 2;
359  if (UcnVal < 0x10000)
360    return 3;
361  return 4;
362}
363
364/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
365/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
366/// StringLiteralParser. When we decide to implement UCN's for identifiers,
367/// we will likely rework our support for UCN's.
368static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
369                            const char *ThisTokEnd,
370                            char *&ResultBuf, bool &HadError,
371                            FullSourceLoc Loc, unsigned CharByteWidth,
372                            DiagnosticsEngine *Diags,
373                            const LangOptions &Features) {
374  typedef uint32_t UTF32;
375  UTF32 UcnVal = 0;
376  unsigned short UcnLen = 0;
377  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
378                        Loc, Diags, Features, true)) {
379    HadError = true;
380    return;
381  }
382
383  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
384         "only character widths of 1, 2, or 4 bytes supported");
385
386  (void)UcnLen;
387  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
388
389  if (CharByteWidth == 4) {
390    // FIXME: Make the type of the result buffer correct instead of
391    // using reinterpret_cast.
392    UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
393    *ResultPtr = UcnVal;
394    ResultBuf += 4;
395    return;
396  }
397
398  if (CharByteWidth == 2) {
399    // FIXME: Make the type of the result buffer correct instead of
400    // using reinterpret_cast.
401    UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
402
403    if (UcnVal <= (UTF32)0xFFFF) {
404      *ResultPtr = UcnVal;
405      ResultBuf += 2;
406      return;
407    }
408
409    // Convert to UTF16.
410    UcnVal -= 0x10000;
411    *ResultPtr     = 0xD800 + (UcnVal >> 10);
412    *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
413    ResultBuf += 4;
414    return;
415  }
416
417  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
418
419  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
420  // The conversion below was inspired by:
421  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
422  // First, we determine how many bytes the result will require.
423  typedef uint8_t UTF8;
424
425  unsigned short bytesToWrite = 0;
426  if (UcnVal < (UTF32)0x80)
427    bytesToWrite = 1;
428  else if (UcnVal < (UTF32)0x800)
429    bytesToWrite = 2;
430  else if (UcnVal < (UTF32)0x10000)
431    bytesToWrite = 3;
432  else
433    bytesToWrite = 4;
434
435  const unsigned byteMask = 0xBF;
436  const unsigned byteMark = 0x80;
437
438  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
439  // into the first byte, depending on how many bytes follow.
440  static const UTF8 firstByteMark[5] = {
441    0x00, 0x00, 0xC0, 0xE0, 0xF0
442  };
443  // Finally, we write the bytes into ResultBuf.
444  ResultBuf += bytesToWrite;
445  switch (bytesToWrite) { // note: everything falls through.
446  case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
447  case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
448  case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
449  case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
450  }
451  // Update the buffer.
452  ResultBuf += bytesToWrite;
453}
454
455
456///       integer-constant: [C99 6.4.4.1]
457///         decimal-constant integer-suffix
458///         octal-constant integer-suffix
459///         hexadecimal-constant integer-suffix
460///         binary-literal integer-suffix [GNU, C++1y]
461///       user-defined-integer-literal: [C++11 lex.ext]
462///         decimal-literal ud-suffix
463///         octal-literal ud-suffix
464///         hexadecimal-literal ud-suffix
465///         binary-literal ud-suffix [GNU, C++1y]
466///       decimal-constant:
467///         nonzero-digit
468///         decimal-constant digit
469///       octal-constant:
470///         0
471///         octal-constant octal-digit
472///       hexadecimal-constant:
473///         hexadecimal-prefix hexadecimal-digit
474///         hexadecimal-constant hexadecimal-digit
475///       hexadecimal-prefix: one of
476///         0x 0X
477///       binary-literal:
478///         0b binary-digit
479///         0B binary-digit
480///         binary-literal binary-digit
481///       integer-suffix:
482///         unsigned-suffix [long-suffix]
483///         unsigned-suffix [long-long-suffix]
484///         long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-suffix]
486///       nonzero-digit:
487///         1 2 3 4 5 6 7 8 9
488///       octal-digit:
489///         0 1 2 3 4 5 6 7
490///       hexadecimal-digit:
491///         0 1 2 3 4 5 6 7 8 9
492///         a b c d e f
493///         A B C D E F
494///       binary-digit:
495///         0
496///         1
497///       unsigned-suffix: one of
498///         u U
499///       long-suffix: one of
500///         l L
501///       long-long-suffix: one of
502///         ll LL
503///
504///       floating-constant: [C99 6.4.4.2]
505///         TODO: add rules...
506///
507NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
508                                           SourceLocation TokLoc,
509                                           Preprocessor &PP)
510  : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
511
512  // This routine assumes that the range begin/end matches the regex for integer
513  // and FP constants (specifically, the 'pp-number' regex), and assumes that
514  // the byte at "*end" is both valid and not part of the regex.  Because of
515  // this, it doesn't have to check for 'overscan' in various places.
516  assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");
517
518  s = DigitsBegin = ThisTokBegin;
519  saw_exponent = false;
520  saw_period = false;
521  saw_ud_suffix = false;
522  isLong = false;
523  isUnsigned = false;
524  isLongLong = false;
525  isFloat = false;
526  isImaginary = false;
527  MicrosoftInteger = 0;
528  hadError = false;
529
530  if (*s == '0') { // parse radix
531    ParseNumberStartingWithZero(TokLoc);
532    if (hadError)
533      return;
534  } else { // the first digit is non-zero
535    radix = 10;
536    s = SkipDigits(s);
537    if (s == ThisTokEnd) {
538      // Done.
539    } else if (isHexDigit(*s) && !(*s == 'e' || *s == 'E')) {
540      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
541              diag::err_invalid_digit) << StringRef(s, 1) << 0;
542      hadError = true;
543      return;
544    } else if (*s == '.') {
545      checkSeparator(TokLoc, s, CSK_AfterDigits);
546      s++;
547      saw_period = true;
548      checkSeparator(TokLoc, s, CSK_BeforeDigits);
549      s = SkipDigits(s);
550    }
551    if ((*s == 'e' || *s == 'E')) { // exponent
552      checkSeparator(TokLoc, s, CSK_AfterDigits);
553      const char *Exponent = s;
554      s++;
555      saw_exponent = true;
556      if (*s == '+' || *s == '-')  s++; // sign
557      checkSeparator(TokLoc, s, CSK_BeforeDigits);
558      const char *first_non_digit = SkipDigits(s);
559      if (first_non_digit != s) {
560        s = first_non_digit;
561      } else {
562        PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent - ThisTokBegin),
563                diag::err_exponent_has_no_digits);
564        hadError = true;
565        return;
566      }
567    }
568  }
569
570  SuffixBegin = s;
571  checkSeparator(TokLoc, s, CSK_AfterDigits);
572
573  // Parse the suffix.  At this point we can classify whether we have an FP or
574  // integer constant.
575  bool isFPConstant = isFloatingLiteral();
576  const char *ImaginarySuffixLoc = nullptr;
577
578  // Loop over all of the characters of the suffix.  If we see something bad,
579  // we break out of the loop.
580  for (; s != ThisTokEnd; ++s) {
581    switch (*s) {
582    case 'f':      // FP Suffix for "float"
583    case 'F':
584      if (!isFPConstant) break;  // Error for integer constant.
585      if (isFloat || isLong) break; // FF, LF invalid.
586      isFloat = true;
587      continue;  // Success.
588    case 'u':
589    case 'U':
590      if (isFPConstant) break;  // Error for floating constant.
591      if (isUnsigned) break;    // Cannot be repeated.
592      isUnsigned = true;
593      continue;  // Success.
594    case 'l':
595    case 'L':
596      if (isLong || isLongLong) break;  // Cannot be repeated.
597      if (isFloat) break;               // LF invalid.
598
599      // Check for long long.  The L's need to be adjacent and the same case.
600      if (s[1] == s[0]) {
601        assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
602        if (isFPConstant) break;        // long long invalid for floats.
603        isLongLong = true;
604        ++s;  // Eat both of them.
605      } else {
606        isLong = true;
607      }
608      continue;  // Success.
609    case 'i':
610    case 'I':
611      if (PP.getLangOpts().MicrosoftExt) {
612        if (isLong || isLongLong || MicrosoftInteger)
613          break;
614
615        if (!isFPConstant) {
616          // Allow i8, i16, i32, and i64.
617          switch (s[1]) {
618          case '8':
619            s += 2; // i8 suffix
620            MicrosoftInteger = 8;
621            break;
622          case '1':
623            if (s[2] == '6') {
624              s += 3; // i16 suffix
625              MicrosoftInteger = 16;
626            }
627            break;
628          case '3':
629            if (s[2] == '2') {
630              s += 3; // i32 suffix
631              MicrosoftInteger = 32;
632            }
633            break;
634          case '6':
635            if (s[2] == '4') {
636              s += 3; // i64 suffix
637              MicrosoftInteger = 64;
638            }
639            break;
640          default:
641            break;
642          }
643        }
644        if (MicrosoftInteger) {
645          assert(s <= ThisTokEnd && "didn't maximally munch?");
646          break;
647        }
648      }
649      // "i", "if", and "il" are user-defined suffixes in C++1y.
650      if (*s == 'i' && PP.getLangOpts().CPlusPlus14)
651        break;
652      // fall through.
653    case 'j':
654    case 'J':
655      if (isImaginary) break;   // Cannot be repeated.
656      isImaginary = true;
657      ImaginarySuffixLoc = s;
658      continue;  // Success.
659    }
660    // If we reached here, there was an error or a ud-suffix.
661    break;
662  }
663
664  if (s != ThisTokEnd) {
665    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
666    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
667    if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
668      // Any suffix pieces we might have parsed are actually part of the
669      // ud-suffix.
670      isLong = false;
671      isUnsigned = false;
672      isLongLong = false;
673      isFloat = false;
674      isImaginary = false;
675      MicrosoftInteger = 0;
676
677      saw_ud_suffix = true;
678      return;
679    }
680
681    // Report an error if there are any.
682    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
683            diag::err_invalid_suffix_constant)
684      << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin) << isFPConstant;
685    hadError = true;
686    return;
687  }
688
689  if (isImaginary) {
690    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc,
691                                       ImaginarySuffixLoc - ThisTokBegin),
692            diag::ext_imaginary_constant);
693  }
694}
695
696/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
697/// suffixes as ud-suffixes, because the diagnostic experience is better if we
698/// treat it as an invalid suffix.
699bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
700                                           StringRef Suffix) {
701  if (!LangOpts.CPlusPlus11 || Suffix.empty())
702    return false;
703
704  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
705  if (Suffix[0] == '_')
706    return true;
707
708  // In C++11, there are no library suffixes.
709  if (!LangOpts.CPlusPlus14)
710    return false;
711
712  // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
713  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
714  return llvm::StringSwitch<bool>(Suffix)
715      .Cases("h", "min", "s", true)
716      .Cases("ms", "us", "ns", true)
717      .Cases("il", "i", "if", true)
718      .Default(false);
719}
720
721void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
722                                          const char *Pos,
723                                          CheckSeparatorKind IsAfterDigits) {
724  if (IsAfterDigits == CSK_AfterDigits) {
725    if (Pos == ThisTokBegin)
726      return;
727    --Pos;
728  } else if (Pos == ThisTokEnd)
729    return;
730
731  if (isDigitSeparator(*Pos))
732    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
733            diag::err_digit_separator_not_between_digits)
734      << IsAfterDigits;
735}
736
737/// ParseNumberStartingWithZero - This method is called when the first character
738/// of the number is found to be a zero.  This means it is either an octal
739/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
740/// a floating point number (01239.123e4).  Eat the prefix, determining the
741/// radix etc.
742void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
743  assert(s[0] == '0' && "Invalid method call");
744  s++;
745
746  int c1 = s[0];
747
748  // Handle a hex number like 0x1234.
749  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
750    s++;
751    assert(s < ThisTokEnd && "didn't maximally munch?");
752    radix = 16;
753    DigitsBegin = s;
754    s = SkipHexDigits(s);
755    bool noSignificand = (s == DigitsBegin);
756    if (s == ThisTokEnd) {
757      // Done.
758    } else if (*s == '.') {
759      s++;
760      saw_period = true;
761      const char *floatDigitsBegin = s;
762      checkSeparator(TokLoc, s, CSK_BeforeDigits);
763      s = SkipHexDigits(s);
764      noSignificand &= (floatDigitsBegin == s);
765    }
766
767    if (noSignificand) {
768      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
769        diag::err_hexconstant_requires) << 1;
770      hadError = true;
771      return;
772    }
773
774    // A binary exponent can appear with or with a '.'. If dotted, the
775    // binary exponent is required.
776    if (*s == 'p' || *s == 'P') {
777      checkSeparator(TokLoc, s, CSK_AfterDigits);
778      const char *Exponent = s;
779      s++;
780      saw_exponent = true;
781      if (*s == '+' || *s == '-')  s++; // sign
782      const char *first_non_digit = SkipDigits(s);
783      if (first_non_digit == s) {
784        PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
785                diag::err_exponent_has_no_digits);
786        hadError = true;
787        return;
788      }
789      checkSeparator(TokLoc, s, CSK_BeforeDigits);
790      s = first_non_digit;
791
792      if (!PP.getLangOpts().HexFloats)
793        PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
794    } else if (saw_period) {
795      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
796              diag::err_hexconstant_requires) << 0;
797      hadError = true;
798    }
799    return;
800  }
801
802  // Handle simple binary numbers 0b01010
803  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
804    // 0b101010 is a C++1y / GCC extension.
805    PP.Diag(TokLoc,
806            PP.getLangOpts().CPlusPlus14
807              ? diag::warn_cxx11_compat_binary_literal
808              : PP.getLangOpts().CPlusPlus
809                ? diag::ext_binary_literal_cxx14
810                : diag::ext_binary_literal);
811    ++s;
812    assert(s < ThisTokEnd && "didn't maximally munch?");
813    radix = 2;
814    DigitsBegin = s;
815    s = SkipBinaryDigits(s);
816    if (s == ThisTokEnd) {
817      // Done.
818    } else if (isHexDigit(*s)) {
819      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
820              diag::err_invalid_digit) << StringRef(s, 1) << 2;
821      hadError = true;
822    }
823    // Other suffixes will be diagnosed by the caller.
824    return;
825  }
826
827  // For now, the radix is set to 8. If we discover that we have a
828  // floating point constant, the radix will change to 10. Octal floating
829  // point constants are not permitted (only decimal and hexadecimal).
830  radix = 8;
831  DigitsBegin = s;
832  s = SkipOctalDigits(s);
833  if (s == ThisTokEnd)
834    return; // Done, simple octal number like 01234
835
836  // If we have some other non-octal digit that *is* a decimal digit, see if
837  // this is part of a floating point number like 094.123 or 09e1.
838  if (isDigit(*s)) {
839    const char *EndDecimal = SkipDigits(s);
840    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
841      s = EndDecimal;
842      radix = 10;
843    }
844  }
845
846  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
847  // the code is using an incorrect base.
848  if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
849    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
850            diag::err_invalid_digit) << StringRef(s, 1) << 1;
851    hadError = true;
852    return;
853  }
854
855  if (*s == '.') {
856    s++;
857    radix = 10;
858    saw_period = true;
859    checkSeparator(TokLoc, s, CSK_BeforeDigits);
860    s = SkipDigits(s); // Skip suffix.
861  }
862  if (*s == 'e' || *s == 'E') { // exponent
863    checkSeparator(TokLoc, s, CSK_AfterDigits);
864    const char *Exponent = s;
865    s++;
866    radix = 10;
867    saw_exponent = true;
868    if (*s == '+' || *s == '-')  s++; // sign
869    const char *first_non_digit = SkipDigits(s);
870    if (first_non_digit != s) {
871      checkSeparator(TokLoc, s, CSK_BeforeDigits);
872      s = first_non_digit;
873    } else {
874      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
875              diag::err_exponent_has_no_digits);
876      hadError = true;
877      return;
878    }
879  }
880}
881
882static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
883  switch (Radix) {
884  case 2:
885    return NumDigits <= 64;
886  case 8:
887    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
888  case 10:
889    return NumDigits <= 19; // floor(log10(2^64))
890  case 16:
891    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
892  default:
893    llvm_unreachable("impossible Radix");
894  }
895}
896
897/// GetIntegerValue - Convert this numeric literal value to an APInt that
898/// matches Val's input width.  If there is an overflow, set Val to the low bits
899/// of the result and return true.  Otherwise, return false.
900bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
901  // Fast path: Compute a conservative bound on the maximum number of
902  // bits per digit in this radix. If we can't possibly overflow a
903  // uint64 based on that bound then do the simple conversion to
904  // integer. This avoids the expensive overflow checking below, and
905  // handles the common cases that matter (small decimal integers and
906  // hex/octal values which don't overflow).
907  const unsigned NumDigits = SuffixBegin - DigitsBegin;
908  if (alwaysFitsInto64Bits(radix, NumDigits)) {
909    uint64_t N = 0;
910    for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
911      if (!isDigitSeparator(*Ptr))
912        N = N * radix + llvm::hexDigitValue(*Ptr);
913
914    // This will truncate the value to Val's input width. Simply check
915    // for overflow by comparing.
916    Val = N;
917    return Val.getZExtValue() != N;
918  }
919
920  Val = 0;
921  const char *Ptr = DigitsBegin;
922
923  llvm::APInt RadixVal(Val.getBitWidth(), radix);
924  llvm::APInt CharVal(Val.getBitWidth(), 0);
925  llvm::APInt OldVal = Val;
926
927  bool OverflowOccurred = false;
928  while (Ptr < SuffixBegin) {
929    if (isDigitSeparator(*Ptr)) {
930      ++Ptr;
931      continue;
932    }
933
934    unsigned C = llvm::hexDigitValue(*Ptr++);
935
936    // If this letter is out of bound for this radix, reject it.
937    assert(C < radix && "NumericLiteralParser ctor should have rejected this");
938
939    CharVal = C;
940
941    // Add the digit to the value in the appropriate radix.  If adding in digits
942    // made the value smaller, then this overflowed.
943    OldVal = Val;
944
945    // Multiply by radix, did overflow occur on the multiply?
946    Val *= RadixVal;
947    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
948
949    // Add value, did overflow occur on the value?
950    //   (a + b) ult b  <=> overflow
951    Val += CharVal;
952    OverflowOccurred |= Val.ult(CharVal);
953  }
954  return OverflowOccurred;
955}
956
957llvm::APFloat::opStatus
958NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
959  using llvm::APFloat;
960
961  unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
962
963  llvm::SmallString<16> Buffer;
964  StringRef Str(ThisTokBegin, n);
965  if (Str.find('\'') != StringRef::npos) {
966    Buffer.reserve(n);
967    std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
968                        &isDigitSeparator);
969    Str = Buffer;
970  }
971
972  return Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
973}
974
975
/// \verbatim
///       user-defined-character-literal: [C++11 lex.ext]
///         character-literal ud-suffix
///       ud-suffix:
///         identifier
///       character-literal: [C++11 lex.ccon]
///         ' c-char-sequence '
///         u' c-char-sequence '
///         U' c-char-sequence '
///         L' c-char-sequence '
///       c-char-sequence:
///         c-char
///         c-char-sequence c-char
///       c-char:
///         any member of the source character set except the single-quote ',
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       escape-sequence:
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name: [C++11 lex.charset]
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
/// Parse the spelling [begin, end) of a character-literal token of kind
/// \p kind.  On return, Value holds the literal's numeric value, IsMultiChar
/// records whether more than one character was present, HadError records
/// whether an error was diagnosed, and UDSuffixBuf/UDSuffixOffset describe
/// any ud-suffix.  Diagnostics are reported through \p PP at \p Loc.
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) {
  // At this point we know that the character matches the regex
  // "(L|u|U|u8)?'.*'", optionally followed by a ud-suffix.
  HadError = false;

  Kind = kind;

  // Keep the start of the token: escape helpers and the ud-suffix offset are
  // expressed relative to it.
  const char *TokBegin = begin;

  // Skip over wide character determinant.  Every kind other than a plain char
  // constant has a prefix character; utf8 constants have a second one (the
  // '8' of "u8").
  if (Kind != tok::char_constant)
    ++begin;
  if (Kind == tok::utf8_char_constant)
    ++begin;

  // Skip over the entry quote.
  assert(begin[0] == '\'' && "Invalid token lexed");
  ++begin;

  // Remove an optional ud-suffix.  Walk backwards until 'end' sits just past
  // the closing quote; everything from there to the old end is the suffix.
  if (end[-1] != '\'') {
    const char *UDSuffixEnd = end;
    do {
      --end;
    } while (end[-1] != '\'');
    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

  // Trim the ending quote.
  assert(end != begin && "Invalid token lexed");
  --end;

  // FIXME: The "Value" is an uint64_t so we can handle char literals of
  // up to 64-bits.
  // FIXME: This extensively assumes that 'char' is 8-bits.
  assert(PP.getTargetInfo().getCharWidth() == 8 &&
         "Assumes char is 8 bits");
  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
         "Assumes sizeof(wchar) on target is <= 64");

  // Decoded characters are collected here, one 32-bit slot per character.
  // The buffer is sized to the number of spelling bytes between the quotes;
  // presumably that bounds the character count because every character or
  // escape consumes at least one byte of spelling.
  SmallVector<uint32_t, 4> codepoint_buffer;
  codepoint_buffer.resize(end - begin);
  uint32_t *buffer_begin = &codepoint_buffer.front();
  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();

  // Unicode escapes representing characters that cannot be correctly
  // represented in a single code unit are disallowed in character literals
  // by this implementation.
  uint32_t largest_character_for_kind;
  if (tok::wide_char_constant == Kind) {
    // All-ones mask of the target's wchar_t width.
    // NOTE(review): the shift is only well-defined for widths in [1, 32],
    // while the assert above admits up to 64 - confirm no target exceeds 32.
    largest_character_for_kind =
        0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
  } else if (tok::utf8_char_constant == Kind) {
    largest_character_for_kind = 0x7F;
  } else if (tok::utf16_char_constant == Kind) {
    largest_character_for_kind = 0xFFFF;
  } else if (tok::utf32_char_constant == Kind) {
    largest_character_for_kind = 0x10FFFF;
  } else {
    // Plain (unprefixed) character constants are limited to 7-bit values.
    largest_character_for_kind = 0x7Fu;
  }

  // Decode the body.  From here on 'buffer_begin' is the output cursor: it
  // advances past each character written into codepoint_buffer.
  while (begin != end) {
    // Is this a span of non-escape characters?
    if (begin[0] != '\\') {
      // Extend the span up to the next backslash or the end of the body.
      char const *start = begin;
      do {
        ++begin;
      } while (begin != end && *begin != '\\');

      // Remember where this span started on both the input and the output
      // side so a failed conversion can be redone as a raw byte copy.
      char const *tmp_in_start = start;
      uint32_t *tmp_out_start = buffer_begin;
      ConversionResult res =
          ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
                             reinterpret_cast<UTF8 const *>(begin),
                             &buffer_begin, buffer_end, strictConversion);
      if (res != conversionOK) {
        // If we see bad encoding for unprefixed character literals, warn and
        // simply copy the byte values, for compatibility with gcc and
        // older versions of clang.
        bool NoErrorOnBadEncoding = isAscii();
        unsigned Msg = diag::err_bad_character_encoding;
        if (NoErrorOnBadEncoding)
          Msg = diag::warn_bad_character_encoding;
        PP.Diag(Loc, Msg);
        if (NoErrorOnBadEncoding) {
          // Rewind both cursors and emit one output character per input byte.
          start = tmp_in_start;
          buffer_begin = tmp_out_start;
          for (; start != begin; ++start, ++buffer_begin)
            *buffer_begin = static_cast<uint8_t>(*start);
        } else {
          HadError = true;
        }
      } else {
        // Conversion succeeded: check every character it produced against
        // the limit for this literal kind.
        for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
          if (*tmp_out_start > largest_character_for_kind) {
            HadError = true;
            PP.Diag(Loc, diag::err_character_too_large);
          }
        }
      }

      continue;
    }
    // Is this a Universal Character Name escape?
    if (begin[1] == 'u' || begin[1] == 'U') {
      unsigned short UcnLen = 0;
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
        HadError = true;
      } else if (*buffer_begin > largest_character_for_kind) {
        HadError = true;
        PP.Diag(Loc, diag::err_character_too_large);
      }

      // The output slot is consumed whether or not the escape was valid.
      ++buffer_begin;
      continue;
    }
    // Otherwise this is a non-UCN escape; it yields a single character whose
    // permitted width depends on the literal kind.
    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
    uint64_t result =
      ProcessCharEscape(TokBegin, begin, end, HadError,
                        FullSourceLoc(Loc,PP.getSourceManager()),
                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
    *buffer_begin++ = result;
  }

  // Number of characters decoded into codepoint_buffer.
  unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();

  // More than one character: pick the diagnostic by literal kind.
  if (NumCharsSoFar > 1) {
    if (isWide())
      PP.Diag(Loc, diag::warn_extraneous_char_constant);
    else if (isAscii() && NumCharsSoFar == 4)
      PP.Diag(Loc, diag::ext_four_char_character_literal);
    else if (isAscii())
      PP.Diag(Loc, diag::ext_multichar_character_literal);
    else
      PP.Diag(Loc, diag::err_multichar_utf_character_literal);
    IsMultiChar = true;
  } else {
    IsMultiChar = false;
  }

  // The value is accumulated at the width of the target's 'int'.
  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);

  // Narrow character literals act as though their value is concatenated
  // in this implementation, but warn on overflow.
  bool multi_char_too_long = false;
  if (isAscii() && isMultiChar()) {
    LitVal = 0;
    for (size_t i = 0; i < NumCharsSoFar; ++i) {
      // check for enough leading zeros to shift into
      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
      LitVal <<= 8;
      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
    }
  } else if (NumCharsSoFar > 0) {
    // otherwise just take the last character
    LitVal = buffer_begin[-1];
  }

  // Only warn about overflow when no error has been reported already.
  if (!HadError && multi_char_too_long) {
    PP.Diag(Loc, diag::warn_char_constant_too_large);
  }

  // Transfer the value from APInt to uint64_t
  Value = LitVal.getZExtValue();

  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
      PP.getLangOpts().CharIsSigned)
    Value = (signed char)Value;
}
1196
1197/// \verbatim
1198///       string-literal: [C++0x lex.string]
1199///         encoding-prefix " [s-char-sequence] "
1200///         encoding-prefix R raw-string
1201///       encoding-prefix:
1202///         u8
1203///         u
1204///         U
1205///         L
1206///       s-char-sequence:
1207///         s-char
1208///         s-char-sequence s-char
1209///       s-char:
1210///         any member of the source character set except the double-quote ",
1211///           backslash \, or new-line character
1212///         escape-sequence
1213///         universal-character-name
1214///       raw-string:
1215///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1216///       r-char-sequence:
1217///         r-char
1218///         r-char-sequence r-char
1219///       r-char:
1220///         any member of the source character set, except a right parenthesis )
1221///           followed by the initial d-char-sequence (which may be empty)
1222///           followed by a double quote ".
1223///       d-char-sequence:
1224///         d-char
1225///         d-char-sequence d-char
1226///       d-char:
1227///         any member of the basic source character set except:
1228///           space, the left parenthesis (, the right parenthesis ),
1229///           the backslash \, and the control characters representing horizontal
1230///           tab, vertical tab, form feed, and newline.
1231///       escape-sequence: [C++0x lex.ccon]
1232///         simple-escape-sequence
1233///         octal-escape-sequence
1234///         hexadecimal-escape-sequence
1235///       simple-escape-sequence:
1236///         one of \' \" \? \\ \a \b \f \n \r \t \v
1237///       octal-escape-sequence:
1238///         \ octal-digit
1239///         \ octal-digit octal-digit
1240///         \ octal-digit octal-digit octal-digit
1241///       hexadecimal-escape-sequence:
1242///         \x hexadecimal-digit
1243///         hexadecimal-escape-sequence hexadecimal-digit
1244///       universal-character-name:
1245///         \u hex-quad
1246///         \U hex-quad hex-quad
1247///       hex-quad:
1248///         hex-digit hex-digit hex-digit hex-digit
1249/// \endverbatim
1250///
1251StringLiteralParser::
1252StringLiteralParser(ArrayRef<Token> StringToks,
1253                    Preprocessor &PP, bool Complain)
1254  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1255    Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
1256    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1257    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1258  init(StringToks);
1259}
1260
/// Parse and concatenate the string-literal tokens in \p StringToks into
/// ResultBuf.  Determines the resulting literal Kind and CharByteWidth,
/// records any ud-suffix, detects Pascal strings, and sets hadError (and
/// reports through Diags, when non-null) on failure.
void StringLiteralParser::init(ArrayRef<Token> StringToks){
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  assert(!StringToks.empty() && "expected at least one token");
  MaxTokenLength = StringToks[0].getLength();
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
  Kind = StringToks[0].getKind();

  hadError = false;

  // Implement Translation Phase #6: concatenation of string literals
  // (C99 5.1.1.2p1).  The common case is only one string fragment.
  for (unsigned i = 1; i != StringToks.size(); ++i) {
    if (StringToks[i].getLength() < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
    SizeBound += StringToks[i].getLength()-2;  // -2 for "".

    // Remember maximum string piece length.
    if (StringToks[i].getLength() > MaxTokenLength)
      MaxTokenLength = StringToks[i].getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.  A piece whose kind differs from
    // the current Kind (and is not a plain string literal) either upgrades a
    // still-plain result to that kind, or is an unsupported mix.
    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
      if (isAscii()) {
        Kind = StringToks[i].getKind();
      } else {
        if (Diags)
          Diags->Report(StringToks[i].getLocation(),
                        diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  // (getCharWidth returns bits, hence the division by 8 below).
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  // Location of the first token that carried a ud-suffix; used when reporting
  // a later token whose suffix differs.
  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // ThisTokBegin stays at the start of the spelling (diagnostics and
    // offsets are relative to it); ThisTokBuf is the read cursor and
    // ThisTokEnd is trimmed as trailing pieces are stripped.
    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      do {
        --ThisTokEnd;
      } while (ThisTokEnd[-1] != '"');

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        // First suffix seen: record it along with where it came from.
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        // A suffix was already recorded: expand this one the same way so the
        // two can be compared.
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        if (UDSuffixBuf != UDSuffix) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
              << UDSuffixBuf << UDSuffix
              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
              << SourceRange(TokLoc, TokLoc);
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      ThisTokBuf += 2; // skip R"

      // Scan the delimiter up to and including the opening parenthesis.
      const char *Prefix = ThisTokBuf;
      while (ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
      while (!RemainingTokenSpan.empty()) {
        // Split the string literal on \r\n boundaries.  When no \r\n remains
        // the "before" part is the whole rest of the span.
        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
          hadError = true;

        // Point into the \n inside the \r\n sequence and operate on the
        // remaining portion of the literal.
        RemainingTokenSpan = AfterCRLF.substr(1);
      }
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // Check if this is a pascal string
      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          // Skip only the backslash: the 'p' is then copied like an ordinary
          // character and occupies the first output unit, which is
          // overwritten with the length once all tokens are processed.
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                            FullSourceLoc(StringToks[i].getLocation(), SM),
                            CharByteWidth*8, Diags, Features);

        // Store the escape's value as one code unit of the literal's width.
        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  if (Pascal) {
    // Overwrite the first code unit (the placeholder left by "\p") with the
    // number of characters that follow it.
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.  The limit is
    // 65536 for C++, 4095 for C99 and 509 otherwise; the third diagnostic
    // argument (2, 1 or 0) identifies which of the three applied.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}
1544
1545static const char *resyncUTF8(const char *Err, const char *End) {
1546  if (Err == End)
1547    return End;
1548  End = Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End-Err);
1549  while (++Err != End && (*Err & 0xC0) == 0x80)
1550    ;
1551  return Err;
1552}
1553
1554/// \brief This function copies from Fragment, which is a sequence of bytes
1555/// within Tok's contents (which begin at TokBegin) into ResultPtr.
1556/// Performs widening for multi-byte characters.
1557bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1558                                             const char *TokBegin,
1559                                             StringRef Fragment) {
1560  const UTF8 *ErrorPtrTmp;
1561  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1562    return false;
1563
1564  // If we see bad encoding for unprefixed string literals, warn and
1565  // simply copy the byte values, for compatibility with gcc and older
1566  // versions of clang.
1567  bool NoErrorOnBadEncoding = isAscii();
1568  if (NoErrorOnBadEncoding) {
1569    memcpy(ResultPtr, Fragment.data(), Fragment.size());
1570    ResultPtr += Fragment.size();
1571  }
1572
1573  if (Diags) {
1574    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1575
1576    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1577    const DiagnosticBuilder &Builder =
1578      Diag(Diags, Features, SourceLoc, TokBegin,
1579           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1580           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1581                                : diag::err_bad_string_encoding);
1582
1583    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1584    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1585
1586    // Decode into a dummy buffer.
1587    SmallString<512> Dummy;
1588    Dummy.reserve(Fragment.size() * CharByteWidth);
1589    char *Ptr = Dummy.data();
1590
1591    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
1592      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1593      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1594      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1595                                     ErrorPtr, NextStart);
1596      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1597    }
1598  }
1599  return !NoErrorOnBadEncoding;
1600}
1601
1602void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1603  hadError = true;
1604  if (Diags)
1605    Diags->Report(Loc, diag::err_lexing_string);
1606}
1607
1608/// getOffsetOfStringByte - This function returns the offset of the
1609/// specified byte of the string data represented by Token.  This handles
1610/// advancing over escape sequences in the string.
1611unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1612                                                    unsigned ByteNo) const {
1613  // Get the spelling of the token.
1614  SmallString<32> SpellingBuffer;
1615  SpellingBuffer.resize(Tok.getLength());
1616
1617  bool StringInvalid = false;
1618  const char *SpellingPtr = &SpellingBuffer[0];
1619  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1620                                       &StringInvalid);
1621  if (StringInvalid)
1622    return 0;
1623
1624  const char *SpellingStart = SpellingPtr;
1625  const char *SpellingEnd = SpellingPtr+TokLen;
1626
1627  // Handle UTF-8 strings just like narrow strings.
1628  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1629    SpellingPtr += 2;
1630
1631  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1632         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1633
1634  // For raw string literals, this is easy.
1635  if (SpellingPtr[0] == 'R') {
1636    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1637    // Skip 'R"'.
1638    SpellingPtr += 2;
1639    while (*SpellingPtr != '(') {
1640      ++SpellingPtr;
1641      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1642    }
1643    // Skip '('.
1644    ++SpellingPtr;
1645    return SpellingPtr - SpellingStart + ByteNo;
1646  }
1647
1648  // Skip over the leading quote
1649  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1650  ++SpellingPtr;
1651
1652  // Skip over bytes until we find the offset we're looking for.
1653  while (ByteNo) {
1654    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1655
1656    // Step over non-escapes simply.
1657    if (*SpellingPtr != '\\') {
1658      ++SpellingPtr;
1659      --ByteNo;
1660      continue;
1661    }
1662
1663    // Otherwise, this is an escape character.  Advance over it.
1664    bool HadError = false;
1665    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1666      const char *EscapePtr = SpellingPtr;
1667      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1668                                      1, Features, HadError);
1669      if (Len > ByteNo) {
1670        // ByteNo is somewhere within the escape sequence.
1671        SpellingPtr = EscapePtr;
1672        break;
1673      }
1674      ByteNo -= Len;
1675    } else {
1676      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1677                        FullSourceLoc(Tok.getLocation(), SM),
1678                        CharByteWidth*8, Diags, Features);
1679      --ByteNo;
1680    }
1681    assert(!HadError && "This method isn't valid on erroneous strings");
1682  }
1683
1684  return SpellingPtr-SpellingStart;
1685}
1686