StringRef.cpp revision 0043e35b8261af607d6cf0695b79b1d6584e67ac
1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include <bitset> 13 14using namespace llvm; 15 16// MSVC emits references to this into the translation units which reference it. 17#ifndef _MSC_VER 18const size_t StringRef::npos; 19#endif 20 21static char ascii_tolower(char x) { 22 if (x >= 'A' && x <= 'Z') 23 return x - 'A' + 'a'; 24 return x; 25} 26 27static bool ascii_isdigit(char x) { 28 return x >= '0' && x <= '9'; 29} 30 31/// compare_lower - Compare strings, ignoring case. 32int StringRef::compare_lower(StringRef RHS) const { 33 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 34 unsigned char LHC = ascii_tolower(Data[I]); 35 unsigned char RHC = ascii_tolower(RHS.Data[I]); 36 if (LHC != RHC) 37 return LHC < RHC ? -1 : 1; 38 } 39 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43} 44 45/// compare_numeric - Compare strings, handle embedded numbers. 46int StringRef::compare_numeric(StringRef RHS) const { 47 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 48 if (Data[I] == RHS.Data[I]) 49 continue; 50 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 51 // The longer sequence of numbers is larger. This doesn't really handle 52 // prefixed zeros well. 53 for (size_t J = I+1; J != E+1; ++J) { 54 bool ld = J < Length && ascii_isdigit(Data[J]); 55 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 56 if (ld != rd) 57 return rd ? -1 : 1; 58 if (!rd) 59 break; 60 } 61 } 62 return Data[I] < RHS.Data[I] ? -1 : 1; 63 } 64 if (Length == RHS.Length) 65 return 0; 66 return Length < RHS.Length ? -1 : 1; 67} 68 69// Compute the edit distance between the two given strings. 70unsigned StringRef::edit_distance(llvm::StringRef Other, 71 bool AllowReplacements) { 72 // The algorithm implemented below is the "classic" 73 // dynamic-programming algorithm for computing the Levenshtein 74 // distance, which is described here: 75 // 76 // http://en.wikipedia.org/wiki/Levenshtein_distance 77 // 78 // Although the algorithm is typically described using an m x n 79 // array, only two rows are used at a time, so this implemenation 80 // just keeps two separate vectors for those two rows. 81 size_type m = size(); 82 size_type n = Other.size(); 83 84 const unsigned SmallBufferSize = 64; 85 unsigned SmallBuffer[SmallBufferSize]; 86 unsigned *Allocated = 0; 87 unsigned *previous = SmallBuffer; 88 if (2*(n + 1) > SmallBufferSize) 89 Allocated = previous = new unsigned [2*(n+1)]; 90 unsigned *current = previous + (n + 1); 91 92 for (unsigned i = 0; i <= n; ++i) 93 previous[i] = i; 94 95 for (size_type y = 1; y <= m; ++y) { 96 current[0] = y; 97 for (size_type x = 1; x <= n; ++x) { 98 if (AllowReplacements) { 99 current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u), 100 min(current[x-1], previous[x])+1); 101 } 102 else { 103 if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1]; 104 else current[x] = min(current[x-1], previous[x]) + 1; 105 } 106 } 107 108 unsigned *tmp = current; 109 current = previous; 110 previous = tmp; 111 } 112 113 unsigned Result = previous[n]; 114 delete [] Allocated; 115 116 return Result; 117} 118 119//===----------------------------------------------------------------------===// 120// String Searching 121//===----------------------------------------------------------------------===// 122 123 124/// find - Search for the first string \arg Str in the string. 125/// 126/// \return - The index of the first occurence of \arg Str, or npos if not 127/// found. 128size_t StringRef::find(StringRef Str, size_t From) const { 129 size_t N = Str.size(); 130 if (N > Length) 131 return npos; 132 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 133 if (substr(i, N).equals(Str)) 134 return i; 135 return npos; 136} 137 138/// rfind - Search for the last string \arg Str in the string. 139/// 140/// \return - The index of the last occurence of \arg Str, or npos if not 141/// found. 142size_t StringRef::rfind(StringRef Str) const { 143 size_t N = Str.size(); 144 if (N > Length) 145 return npos; 146 for (size_t i = Length - N + 1, e = 0; i != e;) { 147 --i; 148 if (substr(i, N).equals(Str)) 149 return i; 150 } 151 return npos; 152} 153 154/// find_first_of - Find the first character in the string that is in \arg 155/// Chars, or npos if not found. 156/// 157/// Note: O(size() + Chars.size()) 158StringRef::size_type StringRef::find_first_of(StringRef Chars, 159 size_t From) const { 160 std::bitset<1 << CHAR_BIT> CharBits; 161 for (size_type i = 0; i != Chars.size(); ++i) 162 CharBits.set((unsigned char)Chars[i]); 163 164 for (size_type i = min(From, Length), e = Length; i != e; ++i) 165 if (CharBits.test((unsigned char)Data[i])) 166 return i; 167 return npos; 168} 169 170/// find_first_not_of - Find the first character in the string that is not 171/// \arg C or npos if not found. 172StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 173 for (size_type i = min(From, Length), e = Length; i != e; ++i) 174 if (Data[i] != C) 175 return i; 176 return npos; 177} 178 179/// find_first_not_of - Find the first character in the string that is not 180/// in the string \arg Chars, or npos if not found. 181/// 182/// Note: O(size() + Chars.size()) 183StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 184 size_t From) const { 185 std::bitset<1 << CHAR_BIT> CharBits; 186 for (size_type i = 0; i != Chars.size(); ++i) 187 CharBits.set((unsigned char)Chars[i]); 188 189 for (size_type i = min(From, Length), e = Length; i != e; ++i) 190 if (!CharBits.test((unsigned char)Data[i])) 191 return i; 192 return npos; 193} 194 195 196//===----------------------------------------------------------------------===// 197// Helpful Algorithms 198//===----------------------------------------------------------------------===// 199 200/// count - Return the number of non-overlapped occurrences of \arg Str in 201/// the string. 202size_t StringRef::count(StringRef Str) const { 203 size_t Count = 0; 204 size_t N = Str.size(); 205 if (N > Length) 206 return 0; 207 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 208 if (substr(i, N).equals(Str)) 209 ++Count; 210 return Count; 211} 212 213static unsigned GetAutoSenseRadix(StringRef &Str) { 214 if (Str.startswith("0x")) { 215 Str = Str.substr(2); 216 return 16; 217 } else if (Str.startswith("0b")) { 218 Str = Str.substr(2); 219 return 2; 220 } else if (Str.startswith("0")) { 221 return 8; 222 } else { 223 return 10; 224 } 225} 226 227 228/// GetAsUnsignedInteger - Workhorse method that converts a integer character 229/// sequence of radix up to 36 to an unsigned long long value. 230static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, 231 unsigned long long &Result) { 232 // Autosense radix if not specified. 233 if (Radix == 0) 234 Radix = GetAutoSenseRadix(Str); 235 236 // Empty strings (after the radix autosense) are invalid. 237 if (Str.empty()) return true; 238 239 // Parse all the bytes of the string given this radix. Watch for overflow. 240 Result = 0; 241 while (!Str.empty()) { 242 unsigned CharVal; 243 if (Str[0] >= '0' && Str[0] <= '9') 244 CharVal = Str[0]-'0'; 245 else if (Str[0] >= 'a' && Str[0] <= 'z') 246 CharVal = Str[0]-'a'+10; 247 else if (Str[0] >= 'A' && Str[0] <= 'Z') 248 CharVal = Str[0]-'A'+10; 249 else 250 return true; 251 252 // If the parsed value is larger than the integer radix, the string is 253 // invalid. 254 if (CharVal >= Radix) 255 return true; 256 257 // Add in this character. 258 unsigned long long PrevResult = Result; 259 Result = Result*Radix+CharVal; 260 261 // Check for overflow. 262 if (Result < PrevResult) 263 return true; 264 265 Str = Str.substr(1); 266 } 267 268 return false; 269} 270 271bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const { 272 return GetAsUnsignedInteger(*this, Radix, Result); 273} 274 275 276bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { 277 unsigned long long ULLVal; 278 279 // Handle positive strings first. 280 if (empty() || front() != '-') { 281 if (GetAsUnsignedInteger(*this, Radix, ULLVal) || 282 // Check for value so large it overflows a signed value. 283 (long long)ULLVal < 0) 284 return true; 285 Result = ULLVal; 286 return false; 287 } 288 289 // Get the positive part of the value. 290 if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) || 291 // Reject values so large they'd overflow as negative signed, but allow 292 // "-0". This negates the unsigned so that the negative isn't undefined 293 // on signed overflow. 294 (long long)-ULLVal > 0) 295 return true; 296 297 Result = -ULLVal; 298 return false; 299} 300 301bool StringRef::getAsInteger(unsigned Radix, int &Result) const { 302 long long Val; 303 if (getAsInteger(Radix, Val) || 304 (int)Val != Val) 305 return true; 306 Result = Val; 307 return false; 308} 309 310bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const { 311 unsigned long long Val; 312 if (getAsInteger(Radix, Val) || 313 (unsigned)Val != Val) 314 return true; 315 Result = Val; 316 return false; 317} 318 319bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 320 StringRef Str = *this; 321 322 // Autosense radix if not specified. 323 if (Radix == 0) 324 Radix = GetAutoSenseRadix(Str); 325 326 assert(Radix > 1 && Radix <= 36); 327 328 // Empty strings (after the radix autosense) are invalid. 329 if (Str.empty()) return true; 330 331 // Skip leading zeroes. This can be a significant improvement if 332 // it means we don't need > 64 bits. 333 while (!Str.empty() && Str.front() == '0') 334 Str = Str.substr(1); 335 336 // If it was nothing but zeroes.... 337 if (Str.empty()) { 338 Result = APInt(64, 0); 339 return false; 340 } 341 342 // (Over-)estimate the required number of bits. 343 unsigned Log2Radix = 0; 344 while ((1U << Log2Radix) < Radix) Log2Radix++; 345 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 346 347 unsigned BitWidth = Log2Radix * Str.size(); 348 if (BitWidth < Result.getBitWidth()) 349 BitWidth = Result.getBitWidth(); // don't shrink the result 350 else 351 Result.zext(BitWidth); 352 353 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 354 if (!IsPowerOf2Radix) { 355 // These must have the same bit-width as Result. 356 RadixAP = APInt(BitWidth, Radix); 357 CharAP = APInt(BitWidth, 0); 358 } 359 360 // Parse all the bytes of the string given this radix. 361 Result = 0; 362 while (!Str.empty()) { 363 unsigned CharVal; 364 if (Str[0] >= '0' && Str[0] <= '9') 365 CharVal = Str[0]-'0'; 366 else if (Str[0] >= 'a' && Str[0] <= 'z') 367 CharVal = Str[0]-'a'+10; 368 else if (Str[0] >= 'A' && Str[0] <= 'Z') 369 CharVal = Str[0]-'A'+10; 370 else 371 return true; 372 373 // If the parsed value is larger than the integer radix, the string is 374 // invalid. 375 if (CharVal >= Radix) 376 return true; 377 378 // Add in this character. 379 if (IsPowerOf2Radix) { 380 Result <<= Log2Radix; 381 Result |= CharVal; 382 } else { 383 Result *= RadixAP; 384 CharAP = CharVal; 385 Result += CharAP; 386 } 387 388 Str = Str.substr(1); 389 } 390 391 return false; 392} 393