StringRef.cpp revision a7b966fc8d63b9b9432e1b33b33d4be6179e1fff
1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include "llvm/ADT/OwningPtr.h" 13#include <bitset> 14 15using namespace llvm; 16 17// MSVC emits references to this into the translation units which reference it. 18#ifndef _MSC_VER 19const size_t StringRef::npos; 20#endif 21 22static char ascii_tolower(char x) { 23 if (x >= 'A' && x <= 'Z') 24 return x - 'A' + 'a'; 25 return x; 26} 27 28static char ascii_toupper(char x) { 29 if (x >= 'a' && x <= 'z') 30 return x - 'a' + 'A'; 31 return x; 32} 33 34static bool ascii_isdigit(char x) { 35 return x >= '0' && x <= '9'; 36} 37 38/// compare_lower - Compare strings, ignoring case. 39int StringRef::compare_lower(StringRef RHS) const { 40 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 41 unsigned char LHC = ascii_tolower(Data[I]); 42 unsigned char RHC = ascii_tolower(RHS.Data[I]); 43 if (LHC != RHC) 44 return LHC < RHC ? -1 : 1; 45 } 46 47 if (Length == RHS.Length) 48 return 0; 49 return Length < RHS.Length ? -1 : 1; 50} 51 52/// compare_numeric - Compare strings, handle embedded numbers. 53int StringRef::compare_numeric(StringRef RHS) const { 54 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 55 // Check for sequences of digits. 56 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 57 // The longer sequence of numbers is considered larger. 58 // This doesn't really handle prefixed zeros well. 59 size_t J; 60 for (J = I + 1; J != E + 1; ++J) { 61 bool ld = J < Length && ascii_isdigit(Data[J]); 62 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 63 if (ld != rd) 64 return rd ? -1 : 1; 65 if (!rd) 66 break; 67 } 68 // The two number sequences have the same length (J-I), just memcmp them. 69 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 70 return Res < 0 ? -1 : 1; 71 // Identical number sequences, continue search after the numbers. 72 I = J - 1; 73 continue; 74 } 75 if (Data[I] != RHS.Data[I]) 76 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 77 } 78 if (Length == RHS.Length) 79 return 0; 80 return Length < RHS.Length ? -1 : 1; 81} 82 83// Compute the edit distance between the two given strings. 84unsigned StringRef::edit_distance(llvm::StringRef Other, 85 bool AllowReplacements, 86 unsigned MaxEditDistance) { 87 // The algorithm implemented below is the "classic" 88 // dynamic-programming algorithm for computing the Levenshtein 89 // distance, which is described here: 90 // 91 // http://en.wikipedia.org/wiki/Levenshtein_distance 92 // 93 // Although the algorithm is typically described using an m x n 94 // array, only two rows are used at a time, so this implemenation 95 // just keeps two separate vectors for those two rows. 96 size_type m = size(); 97 size_type n = Other.size(); 98 99 const unsigned SmallBufferSize = 64; 100 unsigned SmallBuffer[SmallBufferSize]; 101 llvm::OwningArrayPtr<unsigned> Allocated; 102 unsigned *previous = SmallBuffer; 103 if (2*(n + 1) > SmallBufferSize) { 104 previous = new unsigned [2*(n+1)]; 105 Allocated.reset(previous); 106 } 107 unsigned *current = previous + (n + 1); 108 109 for (unsigned i = 0; i <= n; ++i) 110 previous[i] = i; 111 112 for (size_type y = 1; y <= m; ++y) { 113 current[0] = y; 114 unsigned BestThisRow = current[0]; 115 116 for (size_type x = 1; x <= n; ++x) { 117 if (AllowReplacements) { 118 current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u), 119 min(current[x-1], previous[x])+1); 120 } 121 else { 122 if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1]; 123 else current[x] = min(current[x-1], previous[x]) + 1; 124 } 125 BestThisRow = min(BestThisRow, current[x]); 126 } 127 128 if (MaxEditDistance && BestThisRow > MaxEditDistance) 129 return MaxEditDistance + 1; 130 131 unsigned *tmp = current; 132 current = previous; 133 previous = tmp; 134 } 135 136 unsigned Result = previous[n]; 137 return Result; 138} 139 140//===----------------------------------------------------------------------===// 141// String Operations 142//===----------------------------------------------------------------------===// 143 144std::string StringRef::lower() const { 145 std::string Result(size(), char()); 146 for (size_type i = 0, e = size(); i != e; ++i) { 147 Result[i] = ascii_tolower(Data[i]); 148 } 149 return Result; 150} 151 152std::string StringRef::upper() const { 153 std::string Result(size(), char()); 154 for (size_type i = 0, e = size(); i != e; ++i) { 155 Result[i] = ascii_toupper(Data[i]); 156 } 157 return Result; 158} 159 160//===----------------------------------------------------------------------===// 161// String Searching 162//===----------------------------------------------------------------------===// 163 164 165/// find - Search for the first string \arg Str in the string. 166/// 167/// \return - The index of the first occurrence of \arg Str, or npos if not 168/// found. 169size_t StringRef::find(StringRef Str, size_t From) const { 170 size_t N = Str.size(); 171 if (N > Length) 172 return npos; 173 174 // For short haystacks or unsupported needles fall back to the naive algorithm 175 if (Length < 16 || N > 255 || N == 0) { 176 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 177 if (substr(i, N).equals(Str)) 178 return i; 179 return npos; 180 } 181 182 if (From >= Length) 183 return npos; 184 185 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 186 uint8_t BadCharSkip[256]; 187 std::memset(BadCharSkip, N, 256); 188 for (unsigned i = 0; i != N-1; ++i) 189 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 190 191 unsigned Len = Length-From, Pos = From; 192 while (Len >= N) { 193 if (substr(Pos, N).equals(Str)) // See if this is the correct substring. 194 return Pos; 195 196 // Otherwise skip the appropriate number of bytes. 197 uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]]; 198 Len -= Skip; 199 Pos += Skip; 200 } 201 202 return npos; 203} 204 205/// rfind - Search for the last string \arg Str in the string. 206/// 207/// \return - The index of the last occurrence of \arg Str, or npos if not 208/// found. 209size_t StringRef::rfind(StringRef Str) const { 210 size_t N = Str.size(); 211 if (N > Length) 212 return npos; 213 for (size_t i = Length - N + 1, e = 0; i != e;) { 214 --i; 215 if (substr(i, N).equals(Str)) 216 return i; 217 } 218 return npos; 219} 220 221/// find_first_of - Find the first character in the string that is in \arg 222/// Chars, or npos if not found. 223/// 224/// Note: O(size() + Chars.size()) 225StringRef::size_type StringRef::find_first_of(StringRef Chars, 226 size_t From) const { 227 std::bitset<1 << CHAR_BIT> CharBits; 228 for (size_type i = 0; i != Chars.size(); ++i) 229 CharBits.set((unsigned char)Chars[i]); 230 231 for (size_type i = min(From, Length), e = Length; i != e; ++i) 232 if (CharBits.test((unsigned char)Data[i])) 233 return i; 234 return npos; 235} 236 237/// find_first_not_of - Find the first character in the string that is not 238/// \arg C or npos if not found. 239StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 240 for (size_type i = min(From, Length), e = Length; i != e; ++i) 241 if (Data[i] != C) 242 return i; 243 return npos; 244} 245 246/// find_first_not_of - Find the first character in the string that is not 247/// in the string \arg Chars, or npos if not found. 248/// 249/// Note: O(size() + Chars.size()) 250StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 251 size_t From) const { 252 std::bitset<1 << CHAR_BIT> CharBits; 253 for (size_type i = 0; i != Chars.size(); ++i) 254 CharBits.set((unsigned char)Chars[i]); 255 256 for (size_type i = min(From, Length), e = Length; i != e; ++i) 257 if (!CharBits.test((unsigned char)Data[i])) 258 return i; 259 return npos; 260} 261 262/// find_last_of - Find the last character in the string that is in \arg C, 263/// or npos if not found. 264/// 265/// Note: O(size() + Chars.size()) 266StringRef::size_type StringRef::find_last_of(StringRef Chars, 267 size_t From) const { 268 std::bitset<1 << CHAR_BIT> CharBits; 269 for (size_type i = 0; i != Chars.size(); ++i) 270 CharBits.set((unsigned char)Chars[i]); 271 272 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 273 if (CharBits.test((unsigned char)Data[i])) 274 return i; 275 return npos; 276} 277 278//===----------------------------------------------------------------------===// 279// Helpful Algorithms 280//===----------------------------------------------------------------------===// 281 282/// count - Return the number of non-overlapped occurrences of \arg Str in 283/// the string. 284size_t StringRef::count(StringRef Str) const { 285 size_t Count = 0; 286 size_t N = Str.size(); 287 if (N > Length) 288 return 0; 289 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 290 if (substr(i, N).equals(Str)) 291 ++Count; 292 return Count; 293} 294 295static unsigned GetAutoSenseRadix(StringRef &Str) { 296 if (Str.startswith("0x")) { 297 Str = Str.substr(2); 298 return 16; 299 } else if (Str.startswith("0b")) { 300 Str = Str.substr(2); 301 return 2; 302 } else if (Str.startswith("0")) { 303 return 8; 304 } else { 305 return 10; 306 } 307} 308 309 310/// GetAsUnsignedInteger - Workhorse method that converts a integer character 311/// sequence of radix up to 36 to an unsigned long long value. 312static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, 313 unsigned long long &Result) { 314 // Autosense radix if not specified. 315 if (Radix == 0) 316 Radix = GetAutoSenseRadix(Str); 317 318 // Empty strings (after the radix autosense) are invalid. 319 if (Str.empty()) return true; 320 321 // Parse all the bytes of the string given this radix. Watch for overflow. 322 Result = 0; 323 while (!Str.empty()) { 324 unsigned CharVal; 325 if (Str[0] >= '0' && Str[0] <= '9') 326 CharVal = Str[0]-'0'; 327 else if (Str[0] >= 'a' && Str[0] <= 'z') 328 CharVal = Str[0]-'a'+10; 329 else if (Str[0] >= 'A' && Str[0] <= 'Z') 330 CharVal = Str[0]-'A'+10; 331 else 332 return true; 333 334 // If the parsed value is larger than the integer radix, the string is 335 // invalid. 336 if (CharVal >= Radix) 337 return true; 338 339 // Add in this character. 340 unsigned long long PrevResult = Result; 341 Result = Result*Radix+CharVal; 342 343 // Check for overflow. 344 if (Result < PrevResult) 345 return true; 346 347 Str = Str.substr(1); 348 } 349 350 return false; 351} 352 353bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const { 354 return GetAsUnsignedInteger(*this, Radix, Result); 355} 356 357 358bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { 359 unsigned long long ULLVal; 360 361 // Handle positive strings first. 362 if (empty() || front() != '-') { 363 if (GetAsUnsignedInteger(*this, Radix, ULLVal) || 364 // Check for value so large it overflows a signed value. 365 (long long)ULLVal < 0) 366 return true; 367 Result = ULLVal; 368 return false; 369 } 370 371 // Get the positive part of the value. 372 if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) || 373 // Reject values so large they'd overflow as negative signed, but allow 374 // "-0". This negates the unsigned so that the negative isn't undefined 375 // on signed overflow. 376 (long long)-ULLVal > 0) 377 return true; 378 379 Result = -ULLVal; 380 return false; 381} 382 383bool StringRef::getAsInteger(unsigned Radix, int &Result) const { 384 long long Val; 385 if (getAsInteger(Radix, Val) || 386 (int)Val != Val) 387 return true; 388 Result = Val; 389 return false; 390} 391 392bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const { 393 unsigned long long Val; 394 if (getAsInteger(Radix, Val) || 395 (unsigned)Val != Val) 396 return true; 397 Result = Val; 398 return false; 399} 400 401bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 402 StringRef Str = *this; 403 404 // Autosense radix if not specified. 405 if (Radix == 0) 406 Radix = GetAutoSenseRadix(Str); 407 408 assert(Radix > 1 && Radix <= 36); 409 410 // Empty strings (after the radix autosense) are invalid. 411 if (Str.empty()) return true; 412 413 // Skip leading zeroes. This can be a significant improvement if 414 // it means we don't need > 64 bits. 415 while (!Str.empty() && Str.front() == '0') 416 Str = Str.substr(1); 417 418 // If it was nothing but zeroes.... 419 if (Str.empty()) { 420 Result = APInt(64, 0); 421 return false; 422 } 423 424 // (Over-)estimate the required number of bits. 425 unsigned Log2Radix = 0; 426 while ((1U << Log2Radix) < Radix) Log2Radix++; 427 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 428 429 unsigned BitWidth = Log2Radix * Str.size(); 430 if (BitWidth < Result.getBitWidth()) 431 BitWidth = Result.getBitWidth(); // don't shrink the result 432 else 433 Result = Result.zext(BitWidth); 434 435 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 436 if (!IsPowerOf2Radix) { 437 // These must have the same bit-width as Result. 438 RadixAP = APInt(BitWidth, Radix); 439 CharAP = APInt(BitWidth, 0); 440 } 441 442 // Parse all the bytes of the string given this radix. 443 Result = 0; 444 while (!Str.empty()) { 445 unsigned CharVal; 446 if (Str[0] >= '0' && Str[0] <= '9') 447 CharVal = Str[0]-'0'; 448 else if (Str[0] >= 'a' && Str[0] <= 'z') 449 CharVal = Str[0]-'a'+10; 450 else if (Str[0] >= 'A' && Str[0] <= 'Z') 451 CharVal = Str[0]-'A'+10; 452 else 453 return true; 454 455 // If the parsed value is larger than the integer radix, the string is 456 // invalid. 457 if (CharVal >= Radix) 458 return true; 459 460 // Add in this character. 461 if (IsPowerOf2Radix) { 462 Result <<= Log2Radix; 463 Result |= CharVal; 464 } else { 465 Result *= RadixAP; 466 CharAP = CharVal; 467 Result += CharAP; 468 } 469 470 Str = Str.substr(1); 471 } 472 473 return false; 474} 475