1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include "llvm/ADT/OwningPtr.h" 13#include "llvm/ADT/Hashing.h" 14#include "llvm/ADT/edit_distance.h" 15 16#include <bitset> 17 18using namespace llvm; 19 20// MSVC emits references to this into the translation units which reference it. 21#ifndef _MSC_VER 22const size_t StringRef::npos; 23#endif 24 25static char ascii_tolower(char x) { 26 if (x >= 'A' && x <= 'Z') 27 return x - 'A' + 'a'; 28 return x; 29} 30 31static char ascii_toupper(char x) { 32 if (x >= 'a' && x <= 'z') 33 return x - 'a' + 'A'; 34 return x; 35} 36 37static bool ascii_isdigit(char x) { 38 return x >= '0' && x <= '9'; 39} 40 41/// compare_lower - Compare strings, ignoring case. 42int StringRef::compare_lower(StringRef RHS) const { 43 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 44 unsigned char LHC = ascii_tolower(Data[I]); 45 unsigned char RHC = ascii_tolower(RHS.Data[I]); 46 if (LHC != RHC) 47 return LHC < RHC ? -1 : 1; 48 } 49 50 if (Length == RHS.Length) 51 return 0; 52 return Length < RHS.Length ? -1 : 1; 53} 54 55/// compare_numeric - Compare strings, handle embedded numbers. 56int StringRef::compare_numeric(StringRef RHS) const { 57 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 58 // Check for sequences of digits. 59 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 60 // The longer sequence of numbers is considered larger. 61 // This doesn't really handle prefixed zeros well. 62 size_t J; 63 for (J = I + 1; J != E + 1; ++J) { 64 bool ld = J < Length && ascii_isdigit(Data[J]); 65 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 66 if (ld != rd) 67 return rd ? -1 : 1; 68 if (!rd) 69 break; 70 } 71 // The two number sequences have the same length (J-I), just memcmp them. 72 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 73 return Res < 0 ? -1 : 1; 74 // Identical number sequences, continue search after the numbers. 75 I = J - 1; 76 continue; 77 } 78 if (Data[I] != RHS.Data[I]) 79 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 80 } 81 if (Length == RHS.Length) 82 return 0; 83 return Length < RHS.Length ? -1 : 1; 84} 85 86// Compute the edit distance between the two given strings. 87unsigned StringRef::edit_distance(llvm::StringRef Other, 88 bool AllowReplacements, 89 unsigned MaxEditDistance) { 90 return llvm::ComputeEditDistance( 91 llvm::ArrayRef<char>(data(), size()), 92 llvm::ArrayRef<char>(Other.data(), Other.size()), 93 AllowReplacements, MaxEditDistance); 94} 95 96//===----------------------------------------------------------------------===// 97// String Operations 98//===----------------------------------------------------------------------===// 99 100std::string StringRef::lower() const { 101 std::string Result(size(), char()); 102 for (size_type i = 0, e = size(); i != e; ++i) { 103 Result[i] = ascii_tolower(Data[i]); 104 } 105 return Result; 106} 107 108std::string StringRef::upper() const { 109 std::string Result(size(), char()); 110 for (size_type i = 0, e = size(); i != e; ++i) { 111 Result[i] = ascii_toupper(Data[i]); 112 } 113 return Result; 114} 115 116//===----------------------------------------------------------------------===// 117// String Searching 118//===----------------------------------------------------------------------===// 119 120 121/// find - Search for the first string \arg Str in the string. 122/// 123/// \return - The index of the first occurrence of \arg Str, or npos if not 124/// found. 125size_t StringRef::find(StringRef Str, size_t From) const { 126 size_t N = Str.size(); 127 if (N > Length) 128 return npos; 129 130 // For short haystacks or unsupported needles fall back to the naive algorithm 131 if (Length < 16 || N > 255 || N == 0) { 132 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 133 if (substr(i, N).equals(Str)) 134 return i; 135 return npos; 136 } 137 138 if (From >= Length) 139 return npos; 140 141 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 142 uint8_t BadCharSkip[256]; 143 std::memset(BadCharSkip, N, 256); 144 for (unsigned i = 0; i != N-1; ++i) 145 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 146 147 unsigned Len = Length-From, Pos = From; 148 while (Len >= N) { 149 if (substr(Pos, N).equals(Str)) // See if this is the correct substring. 150 return Pos; 151 152 // Otherwise skip the appropriate number of bytes. 153 uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; 154 Len -= Skip; 155 Pos += Skip; 156 } 157 158 return npos; 159} 160 161/// rfind - Search for the last string \arg Str in the string. 162/// 163/// \return - The index of the last occurrence of \arg Str, or npos if not 164/// found. 165size_t StringRef::rfind(StringRef Str) const { 166 size_t N = Str.size(); 167 if (N > Length) 168 return npos; 169 for (size_t i = Length - N + 1, e = 0; i != e;) { 170 --i; 171 if (substr(i, N).equals(Str)) 172 return i; 173 } 174 return npos; 175} 176 177/// find_first_of - Find the first character in the string that is in \arg 178/// Chars, or npos if not found. 179/// 180/// Note: O(size() + Chars.size()) 181StringRef::size_type StringRef::find_first_of(StringRef Chars, 182 size_t From) const { 183 std::bitset<1 << CHAR_BIT> CharBits; 184 for (size_type i = 0; i != Chars.size(); ++i) 185 CharBits.set((unsigned char)Chars[i]); 186 187 for (size_type i = min(From, Length), e = Length; i != e; ++i) 188 if (CharBits.test((unsigned char)Data[i])) 189 return i; 190 return npos; 191} 192 193/// find_first_not_of - Find the first character in the string that is not 194/// \arg C or npos if not found. 195StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 196 for (size_type i = min(From, Length), e = Length; i != e; ++i) 197 if (Data[i] != C) 198 return i; 199 return npos; 200} 201 202/// find_first_not_of - Find the first character in the string that is not 203/// in the string \arg Chars, or npos if not found. 204/// 205/// Note: O(size() + Chars.size()) 206StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 207 size_t From) const { 208 std::bitset<1 << CHAR_BIT> CharBits; 209 for (size_type i = 0; i != Chars.size(); ++i) 210 CharBits.set((unsigned char)Chars[i]); 211 212 for (size_type i = min(From, Length), e = Length; i != e; ++i) 213 if (!CharBits.test((unsigned char)Data[i])) 214 return i; 215 return npos; 216} 217 218/// find_last_of - Find the last character in the string that is in \arg C, 219/// or npos if not found. 220/// 221/// Note: O(size() + Chars.size()) 222StringRef::size_type StringRef::find_last_of(StringRef Chars, 223 size_t From) const { 224 std::bitset<1 << CHAR_BIT> CharBits; 225 for (size_type i = 0; i != Chars.size(); ++i) 226 CharBits.set((unsigned char)Chars[i]); 227 228 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 229 if (CharBits.test((unsigned char)Data[i])) 230 return i; 231 return npos; 232} 233 234/// find_last_not_of - Find the last character in the string that is not 235/// \arg C, or npos if not found. 236StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 237 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 238 if (Data[i] != C) 239 return i; 240 return npos; 241} 242 243/// find_last_not_of - Find the last character in the string that is not in 244/// \arg Chars, or npos if not found. 245/// 246/// Note: O(size() + Chars.size()) 247StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 248 size_t From) const { 249 std::bitset<1 << CHAR_BIT> CharBits; 250 for (size_type i = 0, e = Chars.size(); i != e; ++i) 251 CharBits.set((unsigned char)Chars[i]); 252 253 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 254 if (!CharBits.test((unsigned char)Data[i])) 255 return i; 256 return npos; 257} 258 259void StringRef::split(SmallVectorImpl<StringRef> &A, 260 StringRef Separators, int MaxSplit, 261 bool KeepEmpty) const { 262 StringRef rest = *this; 263 264 // rest.data() is used to distinguish cases like "a," that splits into 265 // "a" + "" and "a" that splits into "a" + 0. 266 for (int splits = 0; 267 rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit); 268 ++splits) { 269 std::pair<StringRef, StringRef> p = rest.split(Separators); 270 271 if (KeepEmpty || p.first.size() != 0) 272 A.push_back(p.first); 273 rest = p.second; 274 } 275 // If we have a tail left, add it. 276 if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty)) 277 A.push_back(rest); 278} 279 280//===----------------------------------------------------------------------===// 281// Helpful Algorithms 282//===----------------------------------------------------------------------===// 283 284/// count - Return the number of non-overlapped occurrences of \arg Str in 285/// the string. 286size_t StringRef::count(StringRef Str) const { 287 size_t Count = 0; 288 size_t N = Str.size(); 289 if (N > Length) 290 return 0; 291 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 292 if (substr(i, N).equals(Str)) 293 ++Count; 294 return Count; 295} 296 297static unsigned GetAutoSenseRadix(StringRef &Str) { 298 if (Str.startswith("0x")) { 299 Str = Str.substr(2); 300 return 16; 301 } 302 303 if (Str.startswith("0b")) { 304 Str = Str.substr(2); 305 return 2; 306 } 307 308 if (Str.startswith("0o")) { 309 Str = Str.substr(2); 310 return 8; 311 } 312 313 if (Str.startswith("0")) 314 return 8; 315 316 return 10; 317} 318 319 320/// GetAsUnsignedInteger - Workhorse method that converts a integer character 321/// sequence of radix up to 36 to an unsigned long long value. 322bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 323 unsigned long long &Result) { 324 // Autosense radix if not specified. 325 if (Radix == 0) 326 Radix = GetAutoSenseRadix(Str); 327 328 // Empty strings (after the radix autosense) are invalid. 329 if (Str.empty()) return true; 330 331 // Parse all the bytes of the string given this radix. Watch for overflow. 332 Result = 0; 333 while (!Str.empty()) { 334 unsigned CharVal; 335 if (Str[0] >= '0' && Str[0] <= '9') 336 CharVal = Str[0]-'0'; 337 else if (Str[0] >= 'a' && Str[0] <= 'z') 338 CharVal = Str[0]-'a'+10; 339 else if (Str[0] >= 'A' && Str[0] <= 'Z') 340 CharVal = Str[0]-'A'+10; 341 else 342 return true; 343 344 // If the parsed value is larger than the integer radix, the string is 345 // invalid. 346 if (CharVal >= Radix) 347 return true; 348 349 // Add in this character. 350 unsigned long long PrevResult = Result; 351 Result = Result*Radix+CharVal; 352 353 // Check for overflow. 354 if (Result < PrevResult) 355 return true; 356 357 Str = Str.substr(1); 358 } 359 360 return false; 361} 362 363bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 364 long long &Result) { 365 unsigned long long ULLVal; 366 367 // Handle positive strings first. 368 if (Str.empty() || Str.front() != '-') { 369 if (getAsUnsignedInteger(Str, Radix, ULLVal) || 370 // Check for value so large it overflows a signed value. 371 (long long)ULLVal < 0) 372 return true; 373 Result = ULLVal; 374 return false; 375 } 376 377 // Get the positive part of the value. 378 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 379 // Reject values so large they'd overflow as negative signed, but allow 380 // "-0". This negates the unsigned so that the negative isn't undefined 381 // on signed overflow. 382 (long long)-ULLVal > 0) 383 return true; 384 385 Result = -ULLVal; 386 return false; 387} 388 389bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 390 StringRef Str = *this; 391 392 // Autosense radix if not specified. 393 if (Radix == 0) 394 Radix = GetAutoSenseRadix(Str); 395 396 assert(Radix > 1 && Radix <= 36); 397 398 // Empty strings (after the radix autosense) are invalid. 399 if (Str.empty()) return true; 400 401 // Skip leading zeroes. This can be a significant improvement if 402 // it means we don't need > 64 bits. 403 while (!Str.empty() && Str.front() == '0') 404 Str = Str.substr(1); 405 406 // If it was nothing but zeroes.... 407 if (Str.empty()) { 408 Result = APInt(64, 0); 409 return false; 410 } 411 412 // (Over-)estimate the required number of bits. 413 unsigned Log2Radix = 0; 414 while ((1U << Log2Radix) < Radix) Log2Radix++; 415 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 416 417 unsigned BitWidth = Log2Radix * Str.size(); 418 if (BitWidth < Result.getBitWidth()) 419 BitWidth = Result.getBitWidth(); // don't shrink the result 420 else if (BitWidth > Result.getBitWidth()) 421 Result = Result.zext(BitWidth); 422 423 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 424 if (!IsPowerOf2Radix) { 425 // These must have the same bit-width as Result. 426 RadixAP = APInt(BitWidth, Radix); 427 CharAP = APInt(BitWidth, 0); 428 } 429 430 // Parse all the bytes of the string given this radix. 431 Result = 0; 432 while (!Str.empty()) { 433 unsigned CharVal; 434 if (Str[0] >= '0' && Str[0] <= '9') 435 CharVal = Str[0]-'0'; 436 else if (Str[0] >= 'a' && Str[0] <= 'z') 437 CharVal = Str[0]-'a'+10; 438 else if (Str[0] >= 'A' && Str[0] <= 'Z') 439 CharVal = Str[0]-'A'+10; 440 else 441 return true; 442 443 // If the parsed value is larger than the integer radix, the string is 444 // invalid. 445 if (CharVal >= Radix) 446 return true; 447 448 // Add in this character. 449 if (IsPowerOf2Radix) { 450 Result <<= Log2Radix; 451 Result |= CharVal; 452 } else { 453 Result *= RadixAP; 454 CharAP = CharVal; 455 Result += CharAP; 456 } 457 458 Str = Str.substr(1); 459 } 460 461 return false; 462} 463 464 465// Implementation of StringRef hashing. 466hash_code llvm::hash_value(StringRef S) { 467 return hash_combine_range(S.begin(), S.end()); 468} 469