// LiteralSupport.cpp revision 4984212fced6d9c6538ebb7319839105e3880b45
1//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the NumericLiteralParser, CharLiteralParser, and 11// StringLiteralParser interfaces. 12// 13//===----------------------------------------------------------------------===// 14 15#include "clang/Lex/LiteralSupport.h" 16#include "clang/Lex/Preprocessor.h" 17#include "clang/Basic/Diagnostic.h" 18#include "clang/Basic/TargetInfo.h" 19#include "llvm/ADT/StringExtras.h" 20using namespace clang; 21 22/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's 23/// not valid. 24static int HexDigitValue(char C) { 25 if (C >= '0' && C <= '9') return C-'0'; 26 if (C >= 'a' && C <= 'f') return C-'a'+10; 27 if (C >= 'A' && C <= 'F') return C-'A'+10; 28 return -1; 29} 30 31/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in 32/// either a character or a string literal. 33static unsigned ProcessCharEscape(const char *&ThisTokBuf, 34 const char *ThisTokEnd, bool &HadError, 35 SourceLocation Loc, bool IsWide, 36 Preprocessor &PP) { 37 // Skip the '\' char. 38 ++ThisTokBuf; 39 40 // We know that this character can't be off the end of the buffer, because 41 // that would have been \", which would not have been the end of string. 42 unsigned ResultChar = *ThisTokBuf++; 43 switch (ResultChar) { 44 // These map to themselves. 45 case '\\': case '\'': case '"': case '?': break; 46 47 // These have fixed mappings. 
48 case 'a': 49 // TODO: K&R: the meaning of '\\a' is different in traditional C 50 ResultChar = 7; 51 break; 52 case 'b': 53 ResultChar = 8; 54 break; 55 case 'e': 56 PP.Diag(Loc, diag::ext_nonstandard_escape) << "e"; 57 ResultChar = 27; 58 break; 59 case 'f': 60 ResultChar = 12; 61 break; 62 case 'n': 63 ResultChar = 10; 64 break; 65 case 'r': 66 ResultChar = 13; 67 break; 68 case 't': 69 ResultChar = 9; 70 break; 71 case 'v': 72 ResultChar = 11; 73 break; 74 75 //case 'u': case 'U': // FIXME: UCNs. 76 case 'x': { // Hex escape. 77 ResultChar = 0; 78 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { 79 PP.Diag(Loc, diag::err_hex_escape_no_digits); 80 HadError = 1; 81 break; 82 } 83 84 // Hex escapes are a maximal series of hex digits. 85 bool Overflow = false; 86 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { 87 int CharVal = HexDigitValue(ThisTokBuf[0]); 88 if (CharVal == -1) break; 89 // About to shift out a digit? 90 Overflow |= (ResultChar & 0xF0000000) ? true : false; 91 ResultChar <<= 4; 92 ResultChar |= CharVal; 93 } 94 95 // See if any bits will be truncated when evaluated as a character. 96 unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide); 97 98 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 99 Overflow = true; 100 ResultChar &= ~0U >> (32-CharWidth); 101 } 102 103 // Check for overflow. 104 if (Overflow) // Too many digits to fit in 105 PP.Diag(Loc, diag::warn_hex_escape_too_large); 106 break; 107 } 108 case '0': case '1': case '2': case '3': 109 case '4': case '5': case '6': case '7': { 110 // Octal escapes. 111 --ThisTokBuf; 112 ResultChar = 0; 113 114 // Octal escapes are a series of octal digits with maximum length 3. 115 // "\0123" is a two digit sequence equal to "\012" "3". 
116 unsigned NumDigits = 0; 117 do { 118 ResultChar <<= 3; 119 ResultChar |= *ThisTokBuf++ - '0'; 120 ++NumDigits; 121 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && 122 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); 123 124 // Check for overflow. Reject '\777', but not L'\777'. 125 unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide); 126 127 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 128 PP.Diag(Loc, diag::warn_octal_escape_too_large); 129 ResultChar &= ~0U >> (32-CharWidth); 130 } 131 break; 132 } 133 134 // Otherwise, these are not valid escapes. 135 case '(': case '{': case '[': case '%': 136 // GCC accepts these as extensions. We warn about them as such though. 137 if (!PP.getLangOptions().NoExtensions) { 138 PP.Diag(Loc, diag::ext_nonstandard_escape) 139 << std::string()+(char)ResultChar; 140 break; 141 } 142 // FALL THROUGH. 143 default: 144 if (isgraph(ThisTokBuf[0])) 145 PP.Diag(Loc, diag::ext_unknown_escape) << std::string()+(char)ResultChar; 146 else 147 PP.Diag(Loc, diag::ext_unknown_escape) << "x"+llvm::utohexstr(ResultChar); 148 break; 149 } 150 151 return ResultChar; 152} 153 154 155 156 157/// integer-constant: [C99 6.4.4.1] 158/// decimal-constant integer-suffix 159/// octal-constant integer-suffix 160/// hexadecimal-constant integer-suffix 161/// decimal-constant: 162/// nonzero-digit 163/// decimal-constant digit 164/// octal-constant: 165/// 0 166/// octal-constant octal-digit 167/// hexadecimal-constant: 168/// hexadecimal-prefix hexadecimal-digit 169/// hexadecimal-constant hexadecimal-digit 170/// hexadecimal-prefix: one of 171/// 0x 0X 172/// integer-suffix: 173/// unsigned-suffix [long-suffix] 174/// unsigned-suffix [long-long-suffix] 175/// long-suffix [unsigned-suffix] 176/// long-long-suffix [unsigned-sufix] 177/// nonzero-digit: 178/// 1 2 3 4 5 6 7 8 9 179/// octal-digit: 180/// 0 1 2 3 4 5 6 7 181/// hexadecimal-digit: 182/// 0 1 2 3 4 5 6 7 8 9 183/// a b c d e f 184/// A B C D E F 185/// 
unsigned-suffix: one of 186/// u U 187/// long-suffix: one of 188/// l L 189/// long-long-suffix: one of 190/// ll LL 191/// 192/// floating-constant: [C99 6.4.4.2] 193/// TODO: add rules... 194/// 195NumericLiteralParser:: 196NumericLiteralParser(const char *begin, const char *end, 197 SourceLocation TokLoc, Preprocessor &pp) 198 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) { 199 200 // This routine assumes that the range begin/end matches the regex for integer 201 // and FP constants (specifically, the 'pp-number' regex), and assumes that 202 // the byte at "*end" is both valid and not part of the regex. Because of 203 // this, it doesn't have to check for 'overscan' in various places. 204 assert(!isalnum(*end) && *end != '.' && *end != '_' && 205 "Lexer didn't maximally munch?"); 206 207 s = DigitsBegin = begin; 208 saw_exponent = false; 209 saw_period = false; 210 isLong = false; 211 isUnsigned = false; 212 isLongLong = false; 213 isFloat = false; 214 isImaginary = false; 215 hadError = false; 216 217 if (*s == '0') { // parse radix 218 ParseNumberStartingWithZero(TokLoc); 219 if (hadError) 220 return; 221 } else { // the first digit is non-zero 222 radix = 10; 223 s = SkipDigits(s); 224 if (s == ThisTokEnd) { 225 // Done. 
226 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) { 227 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 228 diag::err_invalid_decimal_digit) << std::string(s, s+1); 229 hadError = true; 230 return; 231 } else if (*s == '.') { 232 s++; 233 saw_period = true; 234 s = SkipDigits(s); 235 } 236 if ((*s == 'e' || *s == 'E')) { // exponent 237 const char *Exponent = s; 238 s++; 239 saw_exponent = true; 240 if (*s == '+' || *s == '-') s++; // sign 241 const char *first_non_digit = SkipDigits(s); 242 if (first_non_digit != s) { 243 s = first_non_digit; 244 } else { 245 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin), 246 diag::err_exponent_has_no_digits); 247 hadError = true; 248 return; 249 } 250 } 251 } 252 253 SuffixBegin = s; 254 255 // Parse the suffix. At this point we can classify whether we have an FP or 256 // integer constant. 257 bool isFPConstant = isFloatingLiteral(); 258 259 // Loop over all of the characters of the suffix. If we see something bad, 260 // we break out of the loop. 261 for (; s != ThisTokEnd; ++s) { 262 switch (*s) { 263 case 'f': // FP Suffix for "float" 264 case 'F': 265 if (!isFPConstant) break; // Error for integer constant. 266 if (isFloat || isLong) break; // FF, LF invalid. 267 isFloat = true; 268 continue; // Success. 269 case 'u': 270 case 'U': 271 if (isFPConstant) break; // Error for floating constant. 272 if (isUnsigned) break; // Cannot be repeated. 273 isUnsigned = true; 274 continue; // Success. 275 case 'l': 276 case 'L': 277 if (isLong || isLongLong) break; // Cannot be repeated. 278 if (isFloat) break; // LF invalid. 279 280 // Check for long long. The L's need to be adjacent and the same case. 281 if (s+1 != ThisTokEnd && s[1] == s[0]) { 282 if (isFPConstant) break; // long long invalid for floats. 283 isLongLong = true; 284 ++s; // Eat both of them. 285 } else { 286 isLong = true; 287 } 288 continue; // Success. 
289 case 'i': 290 if (PP.getLangOptions().Microsoft) { 291 // Allow i8, i16, i32, i64, and i128. 292 if (++s == ThisTokEnd) break; 293 switch (*s) { 294 case '8': 295 s++; // i8 suffix 296 break; 297 case '1': 298 if (++s == ThisTokEnd) break; 299 if (*s == '6') s++; // i16 suffix 300 else if (*s == '2') { 301 if (++s == ThisTokEnd) break; 302 if (*s == '8') s++; // i128 suffix 303 } 304 break; 305 case '3': 306 if (++s == ThisTokEnd) break; 307 if (*s == '2') s++; // i32 suffix 308 break; 309 case '6': 310 if (++s == ThisTokEnd) break; 311 if (*s == '4') s++; // i64 suffix 312 break; 313 default: 314 break; 315 } 316 break; 317 } 318 // fall through. 319 case 'I': 320 case 'j': 321 case 'J': 322 if (isImaginary) break; // Cannot be repeated. 323 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 324 diag::ext_imaginary_constant); 325 isImaginary = true; 326 continue; // Success. 327 } 328 // If we reached here, there was an error. 329 break; 330 } 331 332 // Report an error if there are any. 333 if (s != ThisTokEnd) { 334 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 335 isFPConstant ? diag::err_invalid_suffix_float_constant : 336 diag::err_invalid_suffix_integer_constant) 337 << std::string(SuffixBegin, ThisTokEnd); 338 hadError = true; 339 return; 340 } 341} 342 343/// ParseNumberStartingWithZero - This method is called when the first character 344/// of the number is found to be a zero. This means it is either an octal 345/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or 346/// a floating point number (01239.123e4). Eat the prefix, determining the 347/// radix etc. 348void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { 349 assert(s[0] == '0' && "Invalid method call"); 350 s++; 351 352 // Handle a hex number like 0x1234. 
353 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) { 354 s++; 355 radix = 16; 356 DigitsBegin = s; 357 s = SkipHexDigits(s); 358 if (s == ThisTokEnd) { 359 // Done. 360 } else if (*s == '.') { 361 s++; 362 saw_period = true; 363 s = SkipHexDigits(s); 364 } 365 // A binary exponent can appear with or with a '.'. If dotted, the 366 // binary exponent is required. 367 if (*s == 'p' || *s == 'P') { 368 const char *Exponent = s; 369 s++; 370 saw_exponent = true; 371 if (*s == '+' || *s == '-') s++; // sign 372 const char *first_non_digit = SkipDigits(s); 373 if (first_non_digit == s) { 374 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 375 diag::err_exponent_has_no_digits); 376 hadError = true; 377 return; 378 } 379 s = first_non_digit; 380 381 if (!PP.getLangOptions().HexFloats) 382 PP.Diag(TokLoc, diag::ext_hexconstant_invalid); 383 } else if (saw_period) { 384 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 385 diag::err_hexconstant_requires_exponent); 386 hadError = true; 387 } 388 return; 389 } 390 391 // Handle simple binary numbers 0b01010 392 if (*s == 'b' || *s == 'B') { 393 // 0b101010 is a GCC extension. 394 PP.Diag(TokLoc, diag::ext_binary_literal); 395 ++s; 396 radix = 2; 397 DigitsBegin = s; 398 s = SkipBinaryDigits(s); 399 if (s == ThisTokEnd) { 400 // Done. 401 } else if (isxdigit(*s)) { 402 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 403 diag::err_invalid_binary_digit) << std::string(s, s+1); 404 hadError = true; 405 } 406 // Other suffixes will be diagnosed by the caller. 407 return; 408 } 409 410 // For now, the radix is set to 8. If we discover that we have a 411 // floating point constant, the radix will change to 10. Octal floating 412 // point constants are not permitted (only decimal and hexadecimal). 
413 radix = 8; 414 DigitsBegin = s; 415 s = SkipOctalDigits(s); 416 if (s == ThisTokEnd) 417 return; // Done, simple octal number like 01234 418 419 // If we have some other non-octal digit that *is* a decimal digit, see if 420 // this is part of a floating point number like 094.123 or 09e1. 421 if (isdigit(*s)) { 422 const char *EndDecimal = SkipDigits(s); 423 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') { 424 s = EndDecimal; 425 radix = 10; 426 } 427 } 428 429 // If we have a hex digit other than 'e' (which denotes a FP exponent) then 430 // the code is using an incorrect base. 431 if (isxdigit(*s) && *s != 'e' && *s != 'E') { 432 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 433 diag::err_invalid_octal_digit) << std::string(s, s+1); 434 hadError = true; 435 return; 436 } 437 438 if (*s == '.') { 439 s++; 440 radix = 10; 441 saw_period = true; 442 s = SkipDigits(s); // Skip suffix. 443 } 444 if (*s == 'e' || *s == 'E') { // exponent 445 const char *Exponent = s; 446 s++; 447 radix = 10; 448 saw_exponent = true; 449 if (*s == '+' || *s == '-') s++; // sign 450 const char *first_non_digit = SkipDigits(s); 451 if (first_non_digit != s) { 452 s = first_non_digit; 453 } else { 454 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 455 diag::err_exponent_has_no_digits); 456 hadError = true; 457 return; 458 } 459 } 460} 461 462 463/// GetIntegerValue - Convert this numeric literal value to an APInt that 464/// matches Val's input width. If there is an overflow, set Val to the low bits 465/// of the result and return true. Otherwise, return false. 466bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { 467 // Fast path: Compute a conservative bound on the maximum number of 468 // bits per digit in this radix. If we can't possibly overflow a 469 // uint64 based on that bound then do the simple conversion to 470 // integer. 
This avoids the expensive overflow checking below, and 471 // handles the common cases that matter (small decimal integers and 472 // hex/octal values which don't overflow). 473 unsigned MaxBitsPerDigit = 1; 474 while ((1U << MaxBitsPerDigit) < radix) 475 MaxBitsPerDigit += 1; 476 if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) { 477 uint64_t N = 0; 478 for (s = DigitsBegin; s != SuffixBegin; ++s) 479 N = N*radix + HexDigitValue(*s); 480 481 // This will truncate the value to Val's input width. Simply check 482 // for overflow by comparing. 483 Val = N; 484 return Val.getZExtValue() != N; 485 } 486 487 Val = 0; 488 s = DigitsBegin; 489 490 llvm::APInt RadixVal(Val.getBitWidth(), radix); 491 llvm::APInt CharVal(Val.getBitWidth(), 0); 492 llvm::APInt OldVal = Val; 493 494 bool OverflowOccurred = false; 495 while (s < SuffixBegin) { 496 unsigned C = HexDigitValue(*s++); 497 498 // If this letter is out of bound for this radix, reject it. 499 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 500 501 CharVal = C; 502 503 // Add the digit to the value in the appropriate radix. If adding in digits 504 // made the value smaller, then this overflowed. 505 OldVal = Val; 506 507 // Multiply by radix, did overflow occur on the multiply? 508 Val *= RadixVal; 509 OverflowOccurred |= Val.udiv(RadixVal) != OldVal; 510 511 // Add value, did overflow occur on the value? 
512 // (a + b) ult b <=> overflow 513 Val += CharVal; 514 OverflowOccurred |= Val.ult(CharVal); 515 } 516 return OverflowOccurred; 517} 518 519llvm::APFloat NumericLiteralParser:: 520GetFloatValue(const llvm::fltSemantics &Format, bool* isExact) { 521 using llvm::APFloat; 522 523 llvm::SmallVector<char,256> floatChars; 524 for (unsigned i = 0, n = ThisTokEnd-ThisTokBegin; i != n; ++i) 525 floatChars.push_back(ThisTokBegin[i]); 526 527 floatChars.push_back('\0'); 528 529 APFloat V (Format, APFloat::fcZero, false); 530 APFloat::opStatus status; 531 532 status = V.convertFromString(&floatChars[0],APFloat::rmNearestTiesToEven); 533 534 if (isExact) 535 *isExact = status == APFloat::opOK; 536 537 return V; 538} 539 540 541CharLiteralParser::CharLiteralParser(const char *begin, const char *end, 542 SourceLocation Loc, Preprocessor &PP) { 543 // At this point we know that the character matches the regex "L?'.*'". 544 HadError = false; 545 Value = 0; 546 547 // Determine if this is a wide character. 548 IsWide = begin[0] == 'L'; 549 if (IsWide) ++begin; 550 551 // Skip over the entry quote. 552 assert(begin[0] == '\'' && "Invalid token lexed"); 553 ++begin; 554 555 // FIXME: This assumes that 'int' is 32-bits in overflow calculation, and the 556 // size of "value". 557 assert(PP.getTargetInfo().getIntWidth() == 32 && 558 "Assumes sizeof(int) == 4 for now"); 559 // FIXME: This assumes that wchar_t is 32-bits for now. 560 assert(PP.getTargetInfo().getWCharWidth() == 32 && 561 "Assumes sizeof(wchar_t) == 4 for now"); 562 // FIXME: This extensively assumes that 'char' is 8-bits. 563 assert(PP.getTargetInfo().getCharWidth() == 8 && 564 "Assumes char is 8 bits"); 565 566 bool isFirstChar = true; 567 bool isMultiChar = false; 568 while (begin[0] != '\'') { 569 unsigned ResultChar; 570 if (begin[0] != '\\') // If this is a normal character, consume it. 571 ResultChar = *begin++; 572 else // Otherwise, this is an escape character. 
573 ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP); 574 575 // If this is a multi-character constant (e.g. 'abc'), handle it. These are 576 // implementation defined (C99 6.4.4.4p10). 577 if (!isFirstChar) { 578 // If this is the second character being processed, do special handling. 579 if (!isMultiChar) { 580 isMultiChar = true; 581 582 // Warn about discarding the top bits for multi-char wide-character 583 // constants (L'abcd'). 584 if (IsWide) 585 PP.Diag(Loc, diag::warn_extraneous_wide_char_constant); 586 } 587 588 if (IsWide) { 589 // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'. 590 Value = 0; 591 } else { 592 // Narrow character literals act as though their value is concatenated 593 // in this implementation. 594 if (((Value << 8) >> 8) != Value) 595 PP.Diag(Loc, diag::warn_char_constant_too_large); 596 Value <<= 8; 597 } 598 } 599 600 Value += ResultChar; 601 isFirstChar = false; 602 } 603 604 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") 605 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple 606 // character constants are not sign extended in the this implementation: 607 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. 608 if (!IsWide && !isMultiChar && (Value & 128) && 609 PP.getTargetInfo().isCharSigned()) 610 Value = (signed char)Value; 611} 612 613 614/// string-literal: [C99 6.4.5] 615/// " [s-char-sequence] " 616/// L" [s-char-sequence] " 617/// s-char-sequence: 618/// s-char 619/// s-char-sequence s-char 620/// s-char: 621/// any source character except the double quote ", 622/// backslash \, or newline character 623/// escape-character 624/// universal-character-name 625/// escape-character: [C99 6.4.4.4] 626/// \ escape-code 627/// universal-character-name 628/// escape-code: 629/// character-escape-code 630/// octal-escape-code 631/// hex-escape-code 632/// character-escape-code: one of 633/// n t b r f v a 634/// \ ' " ? 
635/// octal-escape-code: 636/// octal-digit 637/// octal-digit octal-digit 638/// octal-digit octal-digit octal-digit 639/// hex-escape-code: 640/// x hex-digit 641/// hex-escape-code hex-digit 642/// universal-character-name: 643/// \u hex-quad 644/// \U hex-quad hex-quad 645/// hex-quad: 646/// hex-digit hex-digit hex-digit hex-digit 647/// 648StringLiteralParser:: 649StringLiteralParser(const Token *StringToks, unsigned NumStringToks, 650 Preprocessor &pp, TargetInfo &t) 651 : PP(pp), Target(t) { 652 // Scan all of the string portions, remember the max individual token length, 653 // computing a bound on the concatenated string length, and see whether any 654 // piece is a wide-string. If any of the string portions is a wide-string 655 // literal, the result is a wide-string literal [C99 6.4.5p4]. 656 MaxTokenLength = StringToks[0].getLength(); 657 SizeBound = StringToks[0].getLength()-2; // -2 for "". 658 AnyWide = StringToks[0].is(tok::wide_string_literal); 659 660 hadError = false; 661 662 // Implement Translation Phase #6: concatenation of string literals 663 /// (C99 5.1.1.2p1). The common case is only one string fragment. 664 for (unsigned i = 1; i != NumStringToks; ++i) { 665 // The string could be shorter than this if it needs cleaning, but this is a 666 // reasonable bound, which is all we need. 667 SizeBound += StringToks[i].getLength()-2; // -2 for "". 668 669 // Remember maximum string piece length. 670 if (StringToks[i].getLength() > MaxTokenLength) 671 MaxTokenLength = StringToks[i].getLength(); 672 673 // Remember if we see any wide strings. 674 AnyWide |= StringToks[i].is(tok::wide_string_literal); 675 } 676 677 678 // Include space for the null terminator. 679 ++SizeBound; 680 681 // TODO: K&R warning: "traditional C rejects string constant concatenation" 682 683 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not 684 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true. 
685 wchar_tByteWidth = ~0U; 686 if (AnyWide) { 687 wchar_tByteWidth = Target.getWCharWidth(); 688 assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!"); 689 wchar_tByteWidth /= 8; 690 } 691 692 // The output buffer size needs to be large enough to hold wide characters. 693 // This is a worst-case assumption which basically corresponds to L"" "long". 694 if (AnyWide) 695 SizeBound *= wchar_tByteWidth; 696 697 // Size the temporary buffer to hold the result string data. 698 ResultBuf.resize(SizeBound); 699 700 // Likewise, but for each string piece. 701 llvm::SmallString<512> TokenBuf; 702 TokenBuf.resize(MaxTokenLength); 703 704 // Loop over all the strings, getting their spelling, and expanding them to 705 // wide strings as appropriate. 706 ResultPtr = &ResultBuf[0]; // Next byte to fill in. 707 708 Pascal = false; 709 710 for (unsigned i = 0, e = NumStringToks; i != e; ++i) { 711 const char *ThisTokBuf = &TokenBuf[0]; 712 // Get the spelling of the token, which eliminates trigraphs, etc. We know 713 // that ThisTokBuf points to a buffer that is big enough for the whole token 714 // and 'spelled' tokens can only shrink. 715 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf); 716 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. 717 718 // TODO: Input character set mapping support. 719 720 // Skip L marker for wide strings. 
721 bool ThisIsWide = false; 722 if (ThisTokBuf[0] == 'L') { 723 ++ThisTokBuf; 724 ThisIsWide = true; 725 } 726 727 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); 728 ++ThisTokBuf; 729 730 // Check if this is a pascal string 731 if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd && 732 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { 733 734 // If the \p sequence is found in the first token, we have a pascal string 735 // Otherwise, if we already have a pascal string, ignore the first \p 736 if (i == 0) { 737 ++ThisTokBuf; 738 Pascal = true; 739 } else if (Pascal) 740 ThisTokBuf += 2; 741 } 742 743 while (ThisTokBuf != ThisTokEnd) { 744 // Is this a span of non-escape characters? 745 if (ThisTokBuf[0] != '\\') { 746 const char *InStart = ThisTokBuf; 747 do { 748 ++ThisTokBuf; 749 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); 750 751 // Copy the character span over. 752 unsigned Len = ThisTokBuf-InStart; 753 if (!AnyWide) { 754 memcpy(ResultPtr, InStart, Len); 755 ResultPtr += Len; 756 } else { 757 // Note: our internal rep of wide char tokens is always little-endian. 758 for (; Len; --Len, ++InStart) { 759 *ResultPtr++ = InStart[0]; 760 // Add zeros at the end. 761 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 762 *ResultPtr++ = 0; 763 } 764 } 765 continue; 766 } 767 768 // Otherwise, this is an escape character. Process it. 769 unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, 770 StringToks[i].getLocation(), 771 ThisIsWide, PP); 772 773 // Note: our internal rep of wide char tokens is always little-endian. 774 *ResultPtr++ = ResultChar & 0xFF; 775 776 if (AnyWide) { 777 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 778 *ResultPtr++ = ResultChar >> i*8; 779 } 780 } 781 } 782 783 // Add zero terminator. 
784 *ResultPtr = 0; 785 if (AnyWide) { 786 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 787 *ResultPtr++ = 0; 788 } 789 790 if (Pascal) 791 ResultBuf[0] = ResultPtr-&ResultBuf[0]-1; 792} 793