LiteralSupport.cpp revision ac92d829111bc19d1cc97cd85c3c04bc39b969d1
1//===--- LiteralSupport.cpp - Code to parse and process literals ----------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the NumericLiteralParser, CharLiteralParser, and 11// StringLiteralParser interfaces. 12// 13//===----------------------------------------------------------------------===// 14 15#include "clang/Lex/LiteralSupport.h" 16#include "clang/Lex/Preprocessor.h" 17#include "clang/Basic/Diagnostic.h" 18#include "clang/Basic/TargetInfo.h" 19#include "llvm/ADT/StringExtras.h" 20using namespace clang; 21 22/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's 23/// not valid. 24static int HexDigitValue(char C) { 25 if (C >= '0' && C <= '9') return C-'0'; 26 if (C >= 'a' && C <= 'f') return C-'a'+10; 27 if (C >= 'A' && C <= 'F') return C-'A'+10; 28 return -1; 29} 30 31/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in 32/// either a character or a string literal. 33static unsigned ProcessCharEscape(const char *&ThisTokBuf, 34 const char *ThisTokEnd, bool &HadError, 35 SourceLocation Loc, bool IsWide, 36 Preprocessor &PP) { 37 // Skip the '\' char. 38 ++ThisTokBuf; 39 40 // We know that this character can't be off the end of the buffer, because 41 // that would have been \", which would not have been the end of string. 42 unsigned ResultChar = *ThisTokBuf++; 43 switch (ResultChar) { 44 // These map to themselves. 45 case '\\': case '\'': case '"': case '?': break; 46 47 // These have fixed mappings. 48 case 'a': 49 // TODO: K&R: the meaning of '\\a' is different in traditional C 50 ResultChar = 7; 51 break; 52 case 'b': 53 ResultChar = 8; 54 break; 55 case 'e': 56 PP.Diag(Loc, diag::ext_nonstandard_escape) << "e"; 57 ResultChar = 27; 58 break; 59 case 'f': 60 ResultChar = 12; 61 break; 62 case 'n': 63 ResultChar = 10; 64 break; 65 case 'r': 66 ResultChar = 13; 67 break; 68 case 't': 69 ResultChar = 9; 70 break; 71 case 'v': 72 ResultChar = 11; 73 break; 74 75 //case 'u': case 'U': // FIXME: UCNs. 76 case 'x': { // Hex escape. 77 ResultChar = 0; 78 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { 79 PP.Diag(Loc, diag::err_hex_escape_no_digits); 80 HadError = 1; 81 break; 82 } 83 84 // Hex escapes are a maximal series of hex digits. 85 bool Overflow = false; 86 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { 87 int CharVal = HexDigitValue(ThisTokBuf[0]); 88 if (CharVal == -1) break; 89 // About to shift out a digit? 90 Overflow |= (ResultChar & 0xF0000000) ? true : false; 91 ResultChar <<= 4; 92 ResultChar |= CharVal; 93 } 94 95 // See if any bits will be truncated when evaluated as a character. 96 unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide); 97 98 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 99 Overflow = true; 100 ResultChar &= ~0U >> (32-CharWidth); 101 } 102 103 // Check for overflow. 104 if (Overflow) // Too many digits to fit in 105 PP.Diag(Loc, diag::warn_hex_escape_too_large); 106 break; 107 } 108 case '0': case '1': case '2': case '3': 109 case '4': case '5': case '6': case '7': { 110 // Octal escapes. 111 --ThisTokBuf; 112 ResultChar = 0; 113 114 // Octal escapes are a series of octal digits with maximum length 3. 115 // "\0123" is a two digit sequence equal to "\012" "3". 116 unsigned NumDigits = 0; 117 do { 118 ResultChar <<= 3; 119 ResultChar |= *ThisTokBuf++ - '0'; 120 ++NumDigits; 121 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && 122 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); 123 124 // Check for overflow. Reject '\777', but not L'\777'. 125 unsigned CharWidth = PP.getTargetInfo().getCharWidth(IsWide); 126 127 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 128 PP.Diag(Loc, diag::warn_octal_escape_too_large); 129 ResultChar &= ~0U >> (32-CharWidth); 130 } 131 break; 132 } 133 134 // Otherwise, these are not valid escapes. 135 case '(': case '{': case '[': case '%': 136 // GCC accepts these as extensions. We warn about them as such though. 137 if (!PP.getLangOptions().NoExtensions) { 138 PP.Diag(Loc, diag::ext_nonstandard_escape) 139 << std::string()+(char)ResultChar; 140 break; 141 } 142 // FALL THROUGH. 143 default: 144 if (isgraph(ThisTokBuf[0])) 145 PP.Diag(Loc, diag::ext_unknown_escape) << std::string()+(char)ResultChar; 146 else 147 PP.Diag(Loc, diag::ext_unknown_escape) << "x"+llvm::utohexstr(ResultChar); 148 break; 149 } 150 151 return ResultChar; 152} 153 154 155 156 157/// integer-constant: [C99 6.4.4.1] 158/// decimal-constant integer-suffix 159/// octal-constant integer-suffix 160/// hexadecimal-constant integer-suffix 161/// decimal-constant: 162/// nonzero-digit 163/// decimal-constant digit 164/// octal-constant: 165/// 0 166/// octal-constant octal-digit 167/// hexadecimal-constant: 168/// hexadecimal-prefix hexadecimal-digit 169/// hexadecimal-constant hexadecimal-digit 170/// hexadecimal-prefix: one of 171/// 0x 0X 172/// integer-suffix: 173/// unsigned-suffix [long-suffix] 174/// unsigned-suffix [long-long-suffix] 175/// long-suffix [unsigned-suffix] 176/// long-long-suffix [unsigned-sufix] 177/// nonzero-digit: 178/// 1 2 3 4 5 6 7 8 9 179/// octal-digit: 180/// 0 1 2 3 4 5 6 7 181/// hexadecimal-digit: 182/// 0 1 2 3 4 5 6 7 8 9 183/// a b c d e f 184/// A B C D E F 185/// unsigned-suffix: one of 186/// u U 187/// long-suffix: one of 188/// l L 189/// long-long-suffix: one of 190/// ll LL 191/// 192/// floating-constant: [C99 6.4.4.2] 193/// TODO: add rules... 194/// 195NumericLiteralParser:: 196NumericLiteralParser(const char *begin, const char *end, 197 SourceLocation TokLoc, Preprocessor &pp) 198 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) { 199 200 // This routine assumes that the range begin/end matches the regex for integer 201 // and FP constants (specifically, the 'pp-number' regex), and assumes that 202 // the byte at "*end" is both valid and not part of the regex. Because of 203 // this, it doesn't have to check for 'overscan' in various places. 204 assert(!isalnum(*end) && *end != '.' && *end != '_' && 205 "Lexer didn't maximally munch?"); 206 207 s = DigitsBegin = begin; 208 saw_exponent = false; 209 saw_period = false; 210 isLong = false; 211 isUnsigned = false; 212 isLongLong = false; 213 isFloat = false; 214 isImaginary = false; 215 hadError = false; 216 217 if (*s == '0') { // parse radix 218 ParseNumberStartingWithZero(TokLoc); 219 if (hadError) 220 return; 221 } else { // the first digit is non-zero 222 radix = 10; 223 s = SkipDigits(s); 224 if (s == ThisTokEnd) { 225 // Done. 226 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) { 227 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 228 diag::err_invalid_decimal_digit) << std::string(s, s+1); 229 hadError = true; 230 return; 231 } else if (*s == '.') { 232 s++; 233 saw_period = true; 234 s = SkipDigits(s); 235 } 236 if ((*s == 'e' || *s == 'E')) { // exponent 237 const char *Exponent = s; 238 s++; 239 saw_exponent = true; 240 if (*s == '+' || *s == '-') s++; // sign 241 const char *first_non_digit = SkipDigits(s); 242 if (first_non_digit != s) { 243 s = first_non_digit; 244 } else { 245 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin), 246 diag::err_exponent_has_no_digits); 247 hadError = true; 248 return; 249 } 250 } 251 } 252 253 SuffixBegin = s; 254 255 // Parse the suffix. At this point we can classify whether we have an FP or 256 // integer constant. 257 bool isFPConstant = isFloatingLiteral(); 258 259 // Loop over all of the characters of the suffix. If we see something bad, 260 // we break out of the loop. 261 for (; s != ThisTokEnd; ++s) { 262 switch (*s) { 263 case 'f': // FP Suffix for "float" 264 case 'F': 265 if (!isFPConstant) break; // Error for integer constant. 266 if (isFloat || isLong) break; // FF, LF invalid. 267 isFloat = true; 268 continue; // Success. 269 case 'u': 270 case 'U': 271 if (isFPConstant) break; // Error for floating constant. 272 if (isUnsigned) break; // Cannot be repeated. 273 isUnsigned = true; 274 continue; // Success. 275 case 'l': 276 case 'L': 277 if (isLong || isLongLong) break; // Cannot be repeated. 278 if (isFloat) break; // LF invalid. 279 280 // Check for long long. The L's need to be adjacent and the same case. 281 if (s+1 != ThisTokEnd && s[1] == s[0]) { 282 if (isFPConstant) break; // long long invalid for floats. 283 isLongLong = true; 284 ++s; // Eat both of them. 285 } else { 286 isLong = true; 287 } 288 continue; // Success. 289 case 'i': 290 if (PP.getLangOptions().Microsoft) { 291 // Allow i8, i16, i32, i64, and i128. 292 if (++s == ThisTokEnd) break; 293 switch (*s) { 294 case '8': 295 s++; // i8 suffix 296 break; 297 case '1': 298 if (++s == ThisTokEnd) break; 299 if (*s == '6') s++; // i16 suffix 300 else if (*s == '2') { 301 if (++s == ThisTokEnd) break; 302 if (*s == '8') s++; // i128 suffix 303 } 304 break; 305 case '3': 306 if (++s == ThisTokEnd) break; 307 if (*s == '2') s++; // i32 suffix 308 break; 309 case '6': 310 if (++s == ThisTokEnd) break; 311 if (*s == '4') s++; // i64 suffix 312 break; 313 default: 314 break; 315 } 316 break; 317 } 318 // fall through. 319 case 'I': 320 case 'j': 321 case 'J': 322 if (isImaginary) break; // Cannot be repeated. 323 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 324 diag::ext_imaginary_constant); 325 isImaginary = true; 326 continue; // Success. 327 } 328 // If we reached here, there was an error. 329 break; 330 } 331 332 // Report an error if there are any. 333 if (s != ThisTokEnd) { 334 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 335 isFPConstant ? diag::err_invalid_suffix_float_constant : 336 diag::err_invalid_suffix_integer_constant) 337 << std::string(SuffixBegin, ThisTokEnd); 338 hadError = true; 339 return; 340 } 341} 342 343/// ParseNumberStartingWithZero - This method is called when the first character 344/// of the number is found to be a zero. This means it is either an octal 345/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or 346/// a floating point number (01239.123e4). Eat the prefix, determining the 347/// radix etc. 348void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { 349 assert(s[0] == '0' && "Invalid method call"); 350 s++; 351 352 // Handle a hex number like 0x1234. 353 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) { 354 s++; 355 radix = 16; 356 DigitsBegin = s; 357 s = SkipHexDigits(s); 358 if (s == ThisTokEnd) { 359 // Done. 360 } else if (*s == '.') { 361 s++; 362 saw_period = true; 363 s = SkipHexDigits(s); 364 } 365 // A binary exponent can appear with or with a '.'. If dotted, the 366 // binary exponent is required. 367 if (*s == 'p' || *s == 'P') { 368 const char *Exponent = s; 369 s++; 370 saw_exponent = true; 371 if (*s == '+' || *s == '-') s++; // sign 372 const char *first_non_digit = SkipDigits(s); 373 if (first_non_digit == s) { 374 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 375 diag::err_exponent_has_no_digits); 376 hadError = true; 377 return; 378 } 379 s = first_non_digit; 380 381 if (!PP.getLangOptions().HexFloats) { 382 PP.Diag(TokLoc, diag::ext_hexconstant_invalid); 383 hadError = true; 384 } 385 } else if (saw_period) { 386 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 387 diag::err_hexconstant_requires_exponent); 388 hadError = true; 389 } 390 return; 391 } 392 393 // Handle simple binary numbers 0b01010 394 if (*s == 'b' || *s == 'B') { 395 // 0b101010 is a GCC extension. 396 PP.Diag(TokLoc, diag::ext_binary_literal); 397 ++s; 398 radix = 2; 399 DigitsBegin = s; 400 s = SkipBinaryDigits(s); 401 if (s == ThisTokEnd) { 402 // Done. 403 } else if (isxdigit(*s)) { 404 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 405 diag::err_invalid_binary_digit) << std::string(s, s+1); 406 hadError = true; 407 } 408 // Other suffixes will be diagnosed by the caller. 409 return; 410 } 411 412 // For now, the radix is set to 8. If we discover that we have a 413 // floating point constant, the radix will change to 10. Octal floating 414 // point constants are not permitted (only decimal and hexadecimal). 415 radix = 8; 416 DigitsBegin = s; 417 s = SkipOctalDigits(s); 418 if (s == ThisTokEnd) 419 return; // Done, simple octal number like 01234 420 421 // If we have some other non-octal digit that *is* a decimal digit, see if 422 // this is part of a floating point number like 094.123 or 09e1. 423 if (isdigit(*s)) { 424 const char *EndDecimal = SkipDigits(s); 425 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') { 426 s = EndDecimal; 427 radix = 10; 428 } 429 } 430 431 // If we have a hex digit other than 'e' (which denotes a FP exponent) then 432 // the code is using an incorrect base. 433 if (isxdigit(*s) && *s != 'e' && *s != 'E') { 434 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 435 diag::err_invalid_octal_digit) << std::string(s, s+1); 436 hadError = true; 437 return; 438 } 439 440 if (*s == '.') { 441 s++; 442 radix = 10; 443 saw_period = true; 444 s = SkipDigits(s); // Skip suffix. 445 } 446 if (*s == 'e' || *s == 'E') { // exponent 447 const char *Exponent = s; 448 s++; 449 radix = 10; 450 saw_exponent = true; 451 if (*s == '+' || *s == '-') s++; // sign 452 const char *first_non_digit = SkipDigits(s); 453 if (first_non_digit != s) { 454 s = first_non_digit; 455 } else { 456 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 457 diag::err_exponent_has_no_digits); 458 hadError = true; 459 return; 460 } 461 } 462} 463 464 465/// GetIntegerValue - Convert this numeric literal value to an APInt that 466/// matches Val's input width. If there is an overflow, set Val to the low bits 467/// of the result and return true. Otherwise, return false. 468bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { 469 // Fast path: Compute a conservative bound on the maximum number of 470 // bits per digit in this radix. If we can't possibly overflow a 471 // uint64 based on that bound then do the simple conversion to 472 // integer. This avoids the expensive overflow checking below, and 473 // handles the common cases that matter (small decimal integers and 474 // hex/octal values which don't overflow). 475 unsigned MaxBitsPerDigit = 1; 476 while ((1U << MaxBitsPerDigit) < radix) 477 MaxBitsPerDigit += 1; 478 if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) { 479 uint64_t N = 0; 480 for (s = DigitsBegin; s != SuffixBegin; ++s) 481 N = N*radix + HexDigitValue(*s); 482 483 // This will truncate the value to Val's input width. Simply check 484 // for overflow by comparing. 485 Val = N; 486 return Val.getZExtValue() != N; 487 } 488 489 Val = 0; 490 s = DigitsBegin; 491 492 llvm::APInt RadixVal(Val.getBitWidth(), radix); 493 llvm::APInt CharVal(Val.getBitWidth(), 0); 494 llvm::APInt OldVal = Val; 495 496 bool OverflowOccurred = false; 497 while (s < SuffixBegin) { 498 unsigned C = HexDigitValue(*s++); 499 500 // If this letter is out of bound for this radix, reject it. 501 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 502 503 CharVal = C; 504 505 // Add the digit to the value in the appropriate radix. If adding in digits 506 // made the value smaller, then this overflowed. 507 OldVal = Val; 508 509 // Multiply by radix, did overflow occur on the multiply? 510 Val *= RadixVal; 511 OverflowOccurred |= Val.udiv(RadixVal) != OldVal; 512 513 // Add value, did overflow occur on the value? 514 // (a + b) ult b <=> overflow 515 Val += CharVal; 516 OverflowOccurred |= Val.ult(CharVal); 517 } 518 return OverflowOccurred; 519} 520 521llvm::APFloat NumericLiteralParser:: 522GetFloatValue(const llvm::fltSemantics &Format, bool* isExact) { 523 using llvm::APFloat; 524 525 llvm::SmallVector<char,256> floatChars; 526 for (unsigned i = 0, n = ThisTokEnd-ThisTokBegin; i != n; ++i) 527 floatChars.push_back(ThisTokBegin[i]); 528 529 floatChars.push_back('\0'); 530 531 APFloat V (Format, APFloat::fcZero, false); 532 APFloat::opStatus status; 533 534 status = V.convertFromString(&floatChars[0],APFloat::rmNearestTiesToEven); 535 536 if (isExact) 537 *isExact = status == APFloat::opOK; 538 539 return V; 540} 541 542 543CharLiteralParser::CharLiteralParser(const char *begin, const char *end, 544 SourceLocation Loc, Preprocessor &PP) { 545 // At this point we know that the character matches the regex "L?'.*'". 546 HadError = false; 547 Value = 0; 548 549 // Determine if this is a wide character. 550 IsWide = begin[0] == 'L'; 551 if (IsWide) ++begin; 552 553 // Skip over the entry quote. 554 assert(begin[0] == '\'' && "Invalid token lexed"); 555 ++begin; 556 557 // FIXME: This assumes that 'int' is 32-bits in overflow calculation, and the 558 // size of "value". 559 assert(PP.getTargetInfo().getIntWidth() == 32 && 560 "Assumes sizeof(int) == 4 for now"); 561 // FIXME: This assumes that wchar_t is 32-bits for now. 562 assert(PP.getTargetInfo().getWCharWidth() == 32 && 563 "Assumes sizeof(wchar_t) == 4 for now"); 564 // FIXME: This extensively assumes that 'char' is 8-bits. 565 assert(PP.getTargetInfo().getCharWidth() == 8 && 566 "Assumes char is 8 bits"); 567 568 bool isFirstChar = true; 569 bool isMultiChar = false; 570 while (begin[0] != '\'') { 571 unsigned ResultChar; 572 if (begin[0] != '\\') // If this is a normal character, consume it. 573 ResultChar = *begin++; 574 else // Otherwise, this is an escape character. 575 ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP); 576 577 // If this is a multi-character constant (e.g. 'abc'), handle it. These are 578 // implementation defined (C99 6.4.4.4p10). 579 if (!isFirstChar) { 580 // If this is the second character being processed, do special handling. 581 if (!isMultiChar) { 582 isMultiChar = true; 583 584 // Warn about discarding the top bits for multi-char wide-character 585 // constants (L'abcd'). 586 if (IsWide) 587 PP.Diag(Loc, diag::warn_extraneous_wide_char_constant); 588 } 589 590 if (IsWide) { 591 // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'. 592 Value = 0; 593 } else { 594 // Narrow character literals act as though their value is concatenated 595 // in this implementation. 596 if (((Value << 8) >> 8) != Value) 597 PP.Diag(Loc, diag::warn_char_constant_too_large); 598 Value <<= 8; 599 } 600 } 601 602 Value += ResultChar; 603 isFirstChar = false; 604 } 605 606 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") 607 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple 608 // character constants are not sign extended in the this implementation: 609 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. 610 if (!IsWide && !isMultiChar && (Value & 128) && 611 PP.getTargetInfo().isCharSigned()) 612 Value = (signed char)Value; 613} 614 615 616/// string-literal: [C99 6.4.5] 617/// " [s-char-sequence] " 618/// L" [s-char-sequence] " 619/// s-char-sequence: 620/// s-char 621/// s-char-sequence s-char 622/// s-char: 623/// any source character except the double quote ", 624/// backslash \, or newline character 625/// escape-character 626/// universal-character-name 627/// escape-character: [C99 6.4.4.4] 628/// \ escape-code 629/// universal-character-name 630/// escape-code: 631/// character-escape-code 632/// octal-escape-code 633/// hex-escape-code 634/// character-escape-code: one of 635/// n t b r f v a 636/// \ ' " ? 637/// octal-escape-code: 638/// octal-digit 639/// octal-digit octal-digit 640/// octal-digit octal-digit octal-digit 641/// hex-escape-code: 642/// x hex-digit 643/// hex-escape-code hex-digit 644/// universal-character-name: 645/// \u hex-quad 646/// \U hex-quad hex-quad 647/// hex-quad: 648/// hex-digit hex-digit hex-digit hex-digit 649/// 650StringLiteralParser:: 651StringLiteralParser(const Token *StringToks, unsigned NumStringToks, 652 Preprocessor &pp, TargetInfo &t) 653 : PP(pp), Target(t) { 654 // Scan all of the string portions, remember the max individual token length, 655 // computing a bound on the concatenated string length, and see whether any 656 // piece is a wide-string. If any of the string portions is a wide-string 657 // literal, the result is a wide-string literal [C99 6.4.5p4]. 658 MaxTokenLength = StringToks[0].getLength(); 659 SizeBound = StringToks[0].getLength()-2; // -2 for "". 660 AnyWide = StringToks[0].is(tok::wide_string_literal); 661 662 hadError = false; 663 664 // Implement Translation Phase #6: concatenation of string literals 665 /// (C99 5.1.1.2p1). The common case is only one string fragment. 666 for (unsigned i = 1; i != NumStringToks; ++i) { 667 // The string could be shorter than this if it needs cleaning, but this is a 668 // reasonable bound, which is all we need. 669 SizeBound += StringToks[i].getLength()-2; // -2 for "". 670 671 // Remember maximum string piece length. 672 if (StringToks[i].getLength() > MaxTokenLength) 673 MaxTokenLength = StringToks[i].getLength(); 674 675 // Remember if we see any wide strings. 676 AnyWide |= StringToks[i].is(tok::wide_string_literal); 677 } 678 679 680 // Include space for the null terminator. 681 ++SizeBound; 682 683 // TODO: K&R warning: "traditional C rejects string constant concatenation" 684 685 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not 686 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true. 687 wchar_tByteWidth = ~0U; 688 if (AnyWide) { 689 wchar_tByteWidth = Target.getWCharWidth(); 690 assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!"); 691 wchar_tByteWidth /= 8; 692 } 693 694 // The output buffer size needs to be large enough to hold wide characters. 695 // This is a worst-case assumption which basically corresponds to L"" "long". 696 if (AnyWide) 697 SizeBound *= wchar_tByteWidth; 698 699 // Size the temporary buffer to hold the result string data. 700 ResultBuf.resize(SizeBound); 701 702 // Likewise, but for each string piece. 703 llvm::SmallString<512> TokenBuf; 704 TokenBuf.resize(MaxTokenLength); 705 706 // Loop over all the strings, getting their spelling, and expanding them to 707 // wide strings as appropriate. 708 ResultPtr = &ResultBuf[0]; // Next byte to fill in. 709 710 Pascal = false; 711 712 for (unsigned i = 0, e = NumStringToks; i != e; ++i) { 713 const char *ThisTokBuf = &TokenBuf[0]; 714 // Get the spelling of the token, which eliminates trigraphs, etc. We know 715 // that ThisTokBuf points to a buffer that is big enough for the whole token 716 // and 'spelled' tokens can only shrink. 717 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf); 718 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. 719 720 // TODO: Input character set mapping support. 721 722 // Skip L marker for wide strings. 723 bool ThisIsWide = false; 724 if (ThisTokBuf[0] == 'L') { 725 ++ThisTokBuf; 726 ThisIsWide = true; 727 } 728 729 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); 730 ++ThisTokBuf; 731 732 // Check if this is a pascal string 733 if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd && 734 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { 735 736 // If the \p sequence is found in the first token, we have a pascal string 737 // Otherwise, if we already have a pascal string, ignore the first \p 738 if (i == 0) { 739 ++ThisTokBuf; 740 Pascal = true; 741 } else if (Pascal) 742 ThisTokBuf += 2; 743 } 744 745 while (ThisTokBuf != ThisTokEnd) { 746 // Is this a span of non-escape characters? 747 if (ThisTokBuf[0] != '\\') { 748 const char *InStart = ThisTokBuf; 749 do { 750 ++ThisTokBuf; 751 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); 752 753 // Copy the character span over. 754 unsigned Len = ThisTokBuf-InStart; 755 if (!AnyWide) { 756 memcpy(ResultPtr, InStart, Len); 757 ResultPtr += Len; 758 } else { 759 // Note: our internal rep of wide char tokens is always little-endian. 760 for (; Len; --Len, ++InStart) { 761 *ResultPtr++ = InStart[0]; 762 // Add zeros at the end. 763 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 764 *ResultPtr++ = 0; 765 } 766 } 767 continue; 768 } 769 770 // Otherwise, this is an escape character. Process it. 771 unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, 772 StringToks[i].getLocation(), 773 ThisIsWide, PP); 774 775 // Note: our internal rep of wide char tokens is always little-endian. 776 *ResultPtr++ = ResultChar & 0xFF; 777 778 if (AnyWide) { 779 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 780 *ResultPtr++ = ResultChar >> i*8; 781 } 782 } 783 } 784 785 // Add zero terminator. 786 *ResultPtr = 0; 787 if (AnyWide) { 788 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) 789 *ResultPtr++ = 0; 790 } 791 792 if (Pascal) 793 ResultBuf[0] = ResultPtr-&ResultBuf[0]-1; 794} 795