1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include "llvm/ADT/Hashing.h" 13#include "llvm/ADT/edit_distance.h" 14#include <bitset> 15 16using namespace llvm; 17 18// MSVC emits references to this into the translation units which reference it. 19#ifndef _MSC_VER 20const size_t StringRef::npos; 21#endif 22 23static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27} 28 29static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33} 34 35static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37} 38 39// strncasecmp() is not available on non-POSIX systems, so define an 40// alternative function here. 41static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49} 50 51/// compare_lower - Compare strings, ignoring case. 52int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58} 59 60/// Check if this string starts with the given \p Prefix, ignoring case. 61bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64} 65 66/// Check if this string ends with the given \p Suffix, ignoring case. 67bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70} 71 72/// compare_numeric - Compare strings, handle embedded numbers. 73int StringRef::compare_numeric(StringRef RHS) const { 74 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 75 // Check for sequences of digits. 76 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 77 // The longer sequence of numbers is considered larger. 78 // This doesn't really handle prefixed zeros well. 79 size_t J; 80 for (J = I + 1; J != E + 1; ++J) { 81 bool ld = J < Length && ascii_isdigit(Data[J]); 82 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 83 if (ld != rd) 84 return rd ? -1 : 1; 85 if (!rd) 86 break; 87 } 88 // The two number sequences have the same length (J-I), just memcmp them. 89 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 90 return Res < 0 ? -1 : 1; 91 // Identical number sequences, continue search after the numbers. 92 I = J - 1; 93 continue; 94 } 95 if (Data[I] != RHS.Data[I]) 96 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 97 } 98 if (Length == RHS.Length) 99 return 0; 100 return Length < RHS.Length ? -1 : 1; 101} 102 103// Compute the edit distance between the two given strings. 104unsigned StringRef::edit_distance(llvm::StringRef Other, 105 bool AllowReplacements, 106 unsigned MaxEditDistance) const { 107 return llvm::ComputeEditDistance( 108 makeArrayRef(data(), size()), 109 makeArrayRef(Other.data(), Other.size()), 110 AllowReplacements, MaxEditDistance); 111} 112 113//===----------------------------------------------------------------------===// 114// String Operations 115//===----------------------------------------------------------------------===// 116 117std::string StringRef::lower() const { 118 std::string Result(size(), char()); 119 for (size_type i = 0, e = size(); i != e; ++i) { 120 Result[i] = ascii_tolower(Data[i]); 121 } 122 return Result; 123} 124 125std::string StringRef::upper() const { 126 std::string Result(size(), char()); 127 for (size_type i = 0, e = size(); i != e; ++i) { 128 Result[i] = ascii_toupper(Data[i]); 129 } 130 return Result; 131} 132 133//===----------------------------------------------------------------------===// 134// String Searching 135//===----------------------------------------------------------------------===// 136 137 138/// find - Search for the first string \arg Str in the string. 139/// 140/// \return - The index of the first occurrence of \arg Str, or npos if not 141/// found. 142size_t StringRef::find(StringRef Str, size_t From) const { 143 if (From > Length) 144 return npos; 145 146 const char *Needle = Str.data(); 147 size_t N = Str.size(); 148 if (N == 0) 149 return From; 150 151 size_t Size = Length - From; 152 if (Size < N) 153 return npos; 154 155 const char *Start = Data + From; 156 const char *Stop = Start + (Size - N + 1); 157 158 // For short haystacks or unsupported needles fall back to the naive algorithm 159 if (Size < 16 || N > 255) { 160 do { 161 if (std::memcmp(Start, Needle, N) == 0) 162 return Start - Data; 163 ++Start; 164 } while (Start < Stop); 165 return npos; 166 } 167 168 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 169 uint8_t BadCharSkip[256]; 170 std::memset(BadCharSkip, N, 256); 171 for (unsigned i = 0; i != N-1; ++i) 172 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 173 174 do { 175 if (std::memcmp(Start, Needle, N) == 0) 176 return Start - Data; 177 178 // Otherwise skip the appropriate number of bytes. 179 Start += BadCharSkip[(uint8_t)Start[N-1]]; 180 } while (Start < Stop); 181 182 return npos; 183} 184 185/// rfind - Search for the last string \arg Str in the string. 186/// 187/// \return - The index of the last occurrence of \arg Str, or npos if not 188/// found. 189size_t StringRef::rfind(StringRef Str) const { 190 size_t N = Str.size(); 191 if (N > Length) 192 return npos; 193 for (size_t i = Length - N + 1, e = 0; i != e;) { 194 --i; 195 if (substr(i, N).equals(Str)) 196 return i; 197 } 198 return npos; 199} 200 201/// find_first_of - Find the first character in the string that is in \arg 202/// Chars, or npos if not found. 203/// 204/// Note: O(size() + Chars.size()) 205StringRef::size_type StringRef::find_first_of(StringRef Chars, 206 size_t From) const { 207 std::bitset<1 << CHAR_BIT> CharBits; 208 for (size_type i = 0; i != Chars.size(); ++i) 209 CharBits.set((unsigned char)Chars[i]); 210 211 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 212 if (CharBits.test((unsigned char)Data[i])) 213 return i; 214 return npos; 215} 216 217/// find_first_not_of - Find the first character in the string that is not 218/// \arg C or npos if not found. 219StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 220 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 221 if (Data[i] != C) 222 return i; 223 return npos; 224} 225 226/// find_first_not_of - Find the first character in the string that is not 227/// in the string \arg Chars, or npos if not found. 228/// 229/// Note: O(size() + Chars.size()) 230StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 231 size_t From) const { 232 std::bitset<1 << CHAR_BIT> CharBits; 233 for (size_type i = 0; i != Chars.size(); ++i) 234 CharBits.set((unsigned char)Chars[i]); 235 236 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 237 if (!CharBits.test((unsigned char)Data[i])) 238 return i; 239 return npos; 240} 241 242/// find_last_of - Find the last character in the string that is in \arg C, 243/// or npos if not found. 244/// 245/// Note: O(size() + Chars.size()) 246StringRef::size_type StringRef::find_last_of(StringRef Chars, 247 size_t From) const { 248 std::bitset<1 << CHAR_BIT> CharBits; 249 for (size_type i = 0; i != Chars.size(); ++i) 250 CharBits.set((unsigned char)Chars[i]); 251 252 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 253 if (CharBits.test((unsigned char)Data[i])) 254 return i; 255 return npos; 256} 257 258/// find_last_not_of - Find the last character in the string that is not 259/// \arg C, or npos if not found. 260StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 261 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 262 if (Data[i] != C) 263 return i; 264 return npos; 265} 266 267/// find_last_not_of - Find the last character in the string that is not in 268/// \arg Chars, or npos if not found. 269/// 270/// Note: O(size() + Chars.size()) 271StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 272 size_t From) const { 273 std::bitset<1 << CHAR_BIT> CharBits; 274 for (size_type i = 0, e = Chars.size(); i != e; ++i) 275 CharBits.set((unsigned char)Chars[i]); 276 277 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 278 if (!CharBits.test((unsigned char)Data[i])) 279 return i; 280 return npos; 281} 282 283void StringRef::split(SmallVectorImpl<StringRef> &A, 284 StringRef Separator, int MaxSplit, 285 bool KeepEmpty) const { 286 StringRef S = *this; 287 288 // Count down from MaxSplit. When MaxSplit is -1, this will just split 289 // "forever". This doesn't support splitting more than 2^31 times 290 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 291 // but that seems unlikely to be useful. 292 while (MaxSplit-- != 0) { 293 size_t Idx = S.find(Separator); 294 if (Idx == npos) 295 break; 296 297 // Push this split. 298 if (KeepEmpty || Idx > 0) 299 A.push_back(S.slice(0, Idx)); 300 301 // Jump forward. 302 S = S.slice(Idx + Separator.size(), npos); 303 } 304 305 // Push the tail. 306 if (KeepEmpty || !S.empty()) 307 A.push_back(S); 308} 309 310void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 311 int MaxSplit, bool KeepEmpty) const { 312 StringRef S = *this; 313 314 // Count down from MaxSplit. When MaxSplit is -1, this will just split 315 // "forever". This doesn't support splitting more than 2^31 times 316 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 317 // but that seems unlikely to be useful. 318 while (MaxSplit-- != 0) { 319 size_t Idx = S.find(Separator); 320 if (Idx == npos) 321 break; 322 323 // Push this split. 324 if (KeepEmpty || Idx > 0) 325 A.push_back(S.slice(0, Idx)); 326 327 // Jump forward. 328 S = S.slice(Idx + 1, npos); 329 } 330 331 // Push the tail. 332 if (KeepEmpty || !S.empty()) 333 A.push_back(S); 334} 335 336//===----------------------------------------------------------------------===// 337// Helpful Algorithms 338//===----------------------------------------------------------------------===// 339 340/// count - Return the number of non-overlapped occurrences of \arg Str in 341/// the string. 342size_t StringRef::count(StringRef Str) const { 343 size_t Count = 0; 344 size_t N = Str.size(); 345 if (N > Length) 346 return 0; 347 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 348 if (substr(i, N).equals(Str)) 349 ++Count; 350 return Count; 351} 352 353static unsigned GetAutoSenseRadix(StringRef &Str) { 354 if (Str.startswith("0x") || Str.startswith("0X")) { 355 Str = Str.substr(2); 356 return 16; 357 } 358 359 if (Str.startswith("0b") || Str.startswith("0B")) { 360 Str = Str.substr(2); 361 return 2; 362 } 363 364 if (Str.startswith("0o")) { 365 Str = Str.substr(2); 366 return 8; 367 } 368 369 if (Str.startswith("0")) 370 return 8; 371 372 return 10; 373} 374 375 376/// GetAsUnsignedInteger - Workhorse method that converts a integer character 377/// sequence of radix up to 36 to an unsigned long long value. 378bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 379 unsigned long long &Result) { 380 // Autosense radix if not specified. 381 if (Radix == 0) 382 Radix = GetAutoSenseRadix(Str); 383 384 // Empty strings (after the radix autosense) are invalid. 385 if (Str.empty()) return true; 386 387 // Parse all the bytes of the string given this radix. Watch for overflow. 388 Result = 0; 389 while (!Str.empty()) { 390 unsigned CharVal; 391 if (Str[0] >= '0' && Str[0] <= '9') 392 CharVal = Str[0]-'0'; 393 else if (Str[0] >= 'a' && Str[0] <= 'z') 394 CharVal = Str[0]-'a'+10; 395 else if (Str[0] >= 'A' && Str[0] <= 'Z') 396 CharVal = Str[0]-'A'+10; 397 else 398 return true; 399 400 // If the parsed value is larger than the integer radix, the string is 401 // invalid. 402 if (CharVal >= Radix) 403 return true; 404 405 // Add in this character. 406 unsigned long long PrevResult = Result; 407 Result = Result*Radix+CharVal; 408 409 // Check for overflow by shifting back and seeing if bits were lost. 410 if (Result/Radix < PrevResult) 411 return true; 412 413 Str = Str.substr(1); 414 } 415 416 return false; 417} 418 419bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 420 long long &Result) { 421 unsigned long long ULLVal; 422 423 // Handle positive strings first. 424 if (Str.empty() || Str.front() != '-') { 425 if (getAsUnsignedInteger(Str, Radix, ULLVal) || 426 // Check for value so large it overflows a signed value. 427 (long long)ULLVal < 0) 428 return true; 429 Result = ULLVal; 430 return false; 431 } 432 433 // Get the positive part of the value. 434 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 435 // Reject values so large they'd overflow as negative signed, but allow 436 // "-0". This negates the unsigned so that the negative isn't undefined 437 // on signed overflow. 438 (long long)-ULLVal > 0) 439 return true; 440 441 Result = -ULLVal; 442 return false; 443} 444 445bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 446 StringRef Str = *this; 447 448 // Autosense radix if not specified. 449 if (Radix == 0) 450 Radix = GetAutoSenseRadix(Str); 451 452 assert(Radix > 1 && Radix <= 36); 453 454 // Empty strings (after the radix autosense) are invalid. 455 if (Str.empty()) return true; 456 457 // Skip leading zeroes. This can be a significant improvement if 458 // it means we don't need > 64 bits. 459 while (!Str.empty() && Str.front() == '0') 460 Str = Str.substr(1); 461 462 // If it was nothing but zeroes.... 463 if (Str.empty()) { 464 Result = APInt(64, 0); 465 return false; 466 } 467 468 // (Over-)estimate the required number of bits. 469 unsigned Log2Radix = 0; 470 while ((1U << Log2Radix) < Radix) Log2Radix++; 471 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 472 473 unsigned BitWidth = Log2Radix * Str.size(); 474 if (BitWidth < Result.getBitWidth()) 475 BitWidth = Result.getBitWidth(); // don't shrink the result 476 else if (BitWidth > Result.getBitWidth()) 477 Result = Result.zext(BitWidth); 478 479 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 480 if (!IsPowerOf2Radix) { 481 // These must have the same bit-width as Result. 482 RadixAP = APInt(BitWidth, Radix); 483 CharAP = APInt(BitWidth, 0); 484 } 485 486 // Parse all the bytes of the string given this radix. 487 Result = 0; 488 while (!Str.empty()) { 489 unsigned CharVal; 490 if (Str[0] >= '0' && Str[0] <= '9') 491 CharVal = Str[0]-'0'; 492 else if (Str[0] >= 'a' && Str[0] <= 'z') 493 CharVal = Str[0]-'a'+10; 494 else if (Str[0] >= 'A' && Str[0] <= 'Z') 495 CharVal = Str[0]-'A'+10; 496 else 497 return true; 498 499 // If the parsed value is larger than the integer radix, the string is 500 // invalid. 501 if (CharVal >= Radix) 502 return true; 503 504 // Add in this character. 505 if (IsPowerOf2Radix) { 506 Result <<= Log2Radix; 507 Result |= CharVal; 508 } else { 509 Result *= RadixAP; 510 CharAP = CharVal; 511 Result += CharAP; 512 } 513 514 Str = Str.substr(1); 515 } 516 517 return false; 518} 519 520 521// Implementation of StringRef hashing. 522hash_code llvm::hash_value(StringRef S) { 523 return hash_combine_range(S.begin(), S.end()); 524} 525