StringRef.cpp revision 63c133b67d61b0c457ff46c957aed2b8d90b599c
1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include "llvm/ADT/OwningPtr.h" 13#include <bitset> 14 15using namespace llvm; 16 17// MSVC emits references to this into the translation units which reference it. 18#ifndef _MSC_VER 19const size_t StringRef::npos; 20#endif 21 22static char ascii_tolower(char x) { 23 if (x >= 'A' && x <= 'Z') 24 return x - 'A' + 'a'; 25 return x; 26} 27 28static bool ascii_isdigit(char x) { 29 return x >= '0' && x <= '9'; 30} 31 32/// compare_lower - Compare strings, ignoring case. 33int StringRef::compare_lower(StringRef RHS) const { 34 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 35 unsigned char LHC = ascii_tolower(Data[I]); 36 unsigned char RHC = ascii_tolower(RHS.Data[I]); 37 if (LHC != RHC) 38 return LHC < RHC ? -1 : 1; 39 } 40 41 if (Length == RHS.Length) 42 return 0; 43 return Length < RHS.Length ? -1 : 1; 44} 45 46/// compare_numeric - Compare strings, handle embedded numbers. 47int StringRef::compare_numeric(StringRef RHS) const { 48 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 49 if (Data[I] == RHS.Data[I]) 50 continue; 51 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 52 // The longer sequence of numbers is larger. This doesn't really handle 53 // prefixed zeros well. 54 for (size_t J = I+1; J != E+1; ++J) { 55 bool ld = J < Length && ascii_isdigit(Data[J]); 56 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 57 if (ld != rd) 58 return rd ? -1 : 1; 59 if (!rd) 60 break; 61 } 62 } 63 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 64 } 65 if (Length == RHS.Length) 66 return 0; 67 return Length < RHS.Length ? -1 : 1; 68} 69 70// Compute the edit distance between the two given strings. 71unsigned StringRef::edit_distance(llvm::StringRef Other, 72 bool AllowReplacements, 73 unsigned MaxEditDistance) { 74 // The algorithm implemented below is the "classic" 75 // dynamic-programming algorithm for computing the Levenshtein 76 // distance, which is described here: 77 // 78 // http://en.wikipedia.org/wiki/Levenshtein_distance 79 // 80 // Although the algorithm is typically described using an m x n 81 // array, only two rows are used at a time, so this implemenation 82 // just keeps two separate vectors for those two rows. 83 size_type m = size(); 84 size_type n = Other.size(); 85 86 const unsigned SmallBufferSize = 64; 87 unsigned SmallBuffer[SmallBufferSize]; 88 llvm::OwningArrayPtr<unsigned> Allocated; 89 unsigned *previous = SmallBuffer; 90 if (2*(n + 1) > SmallBufferSize) { 91 previous = new unsigned [2*(n+1)]; 92 Allocated.reset(previous); 93 } 94 unsigned *current = previous + (n + 1); 95 96 for (unsigned i = 0; i <= n; ++i) 97 previous[i] = i; 98 99 for (size_type y = 1; y <= m; ++y) { 100 current[0] = y; 101 unsigned BestThisRow = current[0]; 102 103 for (size_type x = 1; x <= n; ++x) { 104 if (AllowReplacements) { 105 current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u), 106 min(current[x-1], previous[x])+1); 107 } 108 else { 109 if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1]; 110 else current[x] = min(current[x-1], previous[x]) + 1; 111 } 112 BestThisRow = min(BestThisRow, current[x]); 113 } 114 115 if (MaxEditDistance && BestThisRow > MaxEditDistance) 116 return MaxEditDistance + 1; 117 118 unsigned *tmp = current; 119 current = previous; 120 previous = tmp; 121 } 122 123 unsigned Result = previous[n]; 124 return Result; 125} 126 127//===----------------------------------------------------------------------===// 128// String Searching 129//===----------------------------------------------------------------------===// 130 131 132/// find - Search for the first string \arg Str in the string. 133/// 134/// \return - The index of the first occurence of \arg Str, or npos if not 135/// found. 136size_t StringRef::find(StringRef Str, size_t From) const { 137 size_t N = Str.size(); 138 if (N > Length) 139 return npos; 140 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 141 if (substr(i, N).equals(Str)) 142 return i; 143 return npos; 144} 145 146/// rfind - Search for the last string \arg Str in the string. 147/// 148/// \return - The index of the last occurence of \arg Str, or npos if not 149/// found. 150size_t StringRef::rfind(StringRef Str) const { 151 size_t N = Str.size(); 152 if (N > Length) 153 return npos; 154 for (size_t i = Length - N + 1, e = 0; i != e;) { 155 --i; 156 if (substr(i, N).equals(Str)) 157 return i; 158 } 159 return npos; 160} 161 162/// find_first_of - Find the first character in the string that is in \arg 163/// Chars, or npos if not found. 164/// 165/// Note: O(size() + Chars.size()) 166StringRef::size_type StringRef::find_first_of(StringRef Chars, 167 size_t From) const { 168 std::bitset<1 << CHAR_BIT> CharBits; 169 for (size_type i = 0; i != Chars.size(); ++i) 170 CharBits.set((unsigned char)Chars[i]); 171 172 for (size_type i = min(From, Length), e = Length; i != e; ++i) 173 if (CharBits.test((unsigned char)Data[i])) 174 return i; 175 return npos; 176} 177 178/// find_first_not_of - Find the first character in the string that is not 179/// \arg C or npos if not found. 180StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 181 for (size_type i = min(From, Length), e = Length; i != e; ++i) 182 if (Data[i] != C) 183 return i; 184 return npos; 185} 186 187/// find_first_not_of - Find the first character in the string that is not 188/// in the string \arg Chars, or npos if not found. 189/// 190/// Note: O(size() + Chars.size()) 191StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 192 size_t From) const { 193 std::bitset<1 << CHAR_BIT> CharBits; 194 for (size_type i = 0; i != Chars.size(); ++i) 195 CharBits.set((unsigned char)Chars[i]); 196 197 for (size_type i = min(From, Length), e = Length; i != e; ++i) 198 if (!CharBits.test((unsigned char)Data[i])) 199 return i; 200 return npos; 201} 202 203/// find_last_of - Find the last character in the string that is in \arg C, 204/// or npos if not found. 205/// 206/// Note: O(size() + Chars.size()) 207StringRef::size_type StringRef::find_last_of(StringRef Chars, 208 size_t From) const { 209 std::bitset<1 << CHAR_BIT> CharBits; 210 for (size_type i = 0; i != Chars.size(); ++i) 211 CharBits.set((unsigned char)Chars[i]); 212 213 for (size_type i = min(From, Length) - 1, e = -1; i != e; --i) 214 if (CharBits.test((unsigned char)Data[i])) 215 return i; 216 return npos; 217} 218 219//===----------------------------------------------------------------------===// 220// Helpful Algorithms 221//===----------------------------------------------------------------------===// 222 223/// count - Return the number of non-overlapped occurrences of \arg Str in 224/// the string. 225size_t StringRef::count(StringRef Str) const { 226 size_t Count = 0; 227 size_t N = Str.size(); 228 if (N > Length) 229 return 0; 230 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 231 if (substr(i, N).equals(Str)) 232 ++Count; 233 return Count; 234} 235 236static unsigned GetAutoSenseRadix(StringRef &Str) { 237 if (Str.startswith("0x")) { 238 Str = Str.substr(2); 239 return 16; 240 } else if (Str.startswith("0b")) { 241 Str = Str.substr(2); 242 return 2; 243 } else if (Str.startswith("0")) { 244 return 8; 245 } else { 246 return 10; 247 } 248} 249 250 251/// GetAsUnsignedInteger - Workhorse method that converts a integer character 252/// sequence of radix up to 36 to an unsigned long long value. 253static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, 254 unsigned long long &Result) { 255 // Autosense radix if not specified. 256 if (Radix == 0) 257 Radix = GetAutoSenseRadix(Str); 258 259 // Empty strings (after the radix autosense) are invalid. 260 if (Str.empty()) return true; 261 262 // Parse all the bytes of the string given this radix. Watch for overflow. 263 Result = 0; 264 while (!Str.empty()) { 265 unsigned CharVal; 266 if (Str[0] >= '0' && Str[0] <= '9') 267 CharVal = Str[0]-'0'; 268 else if (Str[0] >= 'a' && Str[0] <= 'z') 269 CharVal = Str[0]-'a'+10; 270 else if (Str[0] >= 'A' && Str[0] <= 'Z') 271 CharVal = Str[0]-'A'+10; 272 else 273 return true; 274 275 // If the parsed value is larger than the integer radix, the string is 276 // invalid. 277 if (CharVal >= Radix) 278 return true; 279 280 // Add in this character. 281 unsigned long long PrevResult = Result; 282 Result = Result*Radix+CharVal; 283 284 // Check for overflow. 285 if (Result < PrevResult) 286 return true; 287 288 Str = Str.substr(1); 289 } 290 291 return false; 292} 293 294bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const { 295 return GetAsUnsignedInteger(*this, Radix, Result); 296} 297 298 299bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { 300 unsigned long long ULLVal; 301 302 // Handle positive strings first. 303 if (empty() || front() != '-') { 304 if (GetAsUnsignedInteger(*this, Radix, ULLVal) || 305 // Check for value so large it overflows a signed value. 306 (long long)ULLVal < 0) 307 return true; 308 Result = ULLVal; 309 return false; 310 } 311 312 // Get the positive part of the value. 313 if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) || 314 // Reject values so large they'd overflow as negative signed, but allow 315 // "-0". This negates the unsigned so that the negative isn't undefined 316 // on signed overflow. 317 (long long)-ULLVal > 0) 318 return true; 319 320 Result = -ULLVal; 321 return false; 322} 323 324bool StringRef::getAsInteger(unsigned Radix, int &Result) const { 325 long long Val; 326 if (getAsInteger(Radix, Val) || 327 (int)Val != Val) 328 return true; 329 Result = Val; 330 return false; 331} 332 333bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const { 334 unsigned long long Val; 335 if (getAsInteger(Radix, Val) || 336 (unsigned)Val != Val) 337 return true; 338 Result = Val; 339 return false; 340} 341 342bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 343 StringRef Str = *this; 344 345 // Autosense radix if not specified. 346 if (Radix == 0) 347 Radix = GetAutoSenseRadix(Str); 348 349 assert(Radix > 1 && Radix <= 36); 350 351 // Empty strings (after the radix autosense) are invalid. 352 if (Str.empty()) return true; 353 354 // Skip leading zeroes. This can be a significant improvement if 355 // it means we don't need > 64 bits. 356 while (!Str.empty() && Str.front() == '0') 357 Str = Str.substr(1); 358 359 // If it was nothing but zeroes.... 360 if (Str.empty()) { 361 Result = APInt(64, 0); 362 return false; 363 } 364 365 // (Over-)estimate the required number of bits. 366 unsigned Log2Radix = 0; 367 while ((1U << Log2Radix) < Radix) Log2Radix++; 368 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 369 370 unsigned BitWidth = Log2Radix * Str.size(); 371 if (BitWidth < Result.getBitWidth()) 372 BitWidth = Result.getBitWidth(); // don't shrink the result 373 else 374 Result.zext(BitWidth); 375 376 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 377 if (!IsPowerOf2Radix) { 378 // These must have the same bit-width as Result. 379 RadixAP = APInt(BitWidth, Radix); 380 CharAP = APInt(BitWidth, 0); 381 } 382 383 // Parse all the bytes of the string given this radix. 384 Result = 0; 385 while (!Str.empty()) { 386 unsigned CharVal; 387 if (Str[0] >= '0' && Str[0] <= '9') 388 CharVal = Str[0]-'0'; 389 else if (Str[0] >= 'a' && Str[0] <= 'z') 390 CharVal = Str[0]-'a'+10; 391 else if (Str[0] >= 'A' && Str[0] <= 'Z') 392 CharVal = Str[0]-'A'+10; 393 else 394 return true; 395 396 // If the parsed value is larger than the integer radix, the string is 397 // invalid. 398 if (CharVal >= Radix) 399 return true; 400 401 // Add in this character. 402 if (IsPowerOf2Radix) { 403 Result <<= Log2Radix; 404 Result |= CharVal; 405 } else { 406 Result *= RadixAP; 407 CharAP = CharVal; 408 Result += CharAP; 409 } 410 411 Str = Str.substr(1); 412 } 413 414 return false; 415} 416