StringRef.cpp revision 5ee568ac2704d7302017d42ad162d4b53d076cbc
1//===-- StringRef.cpp - Lightweight String References ---------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "llvm/ADT/StringRef.h" 11#include "llvm/ADT/APInt.h" 12#include <bitset> 13 14using namespace llvm; 15 16// MSVC emits references to this into the translation units which reference it. 17#ifndef _MSC_VER 18const size_t StringRef::npos; 19#endif 20 21static char ascii_tolower(char x) { 22 if (x >= 'A' && x <= 'Z') 23 return x - 'A' + 'a'; 24 return x; 25} 26 27static bool ascii_isdigit(char x) { 28 return x >= '0' && x <= '9'; 29} 30 31/// compare_lower - Compare strings, ignoring case. 32int StringRef::compare_lower(StringRef RHS) const { 33 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 34 unsigned char LHC = ascii_tolower(Data[I]); 35 unsigned char RHC = ascii_tolower(RHS.Data[I]); 36 if (LHC != RHC) 37 return LHC < RHC ? -1 : 1; 38 } 39 40 if (Length == RHS.Length) 41 return 0; 42 return Length < RHS.Length ? -1 : 1; 43} 44 45/// compare_numeric - Compare strings, handle embedded numbers. 46int StringRef::compare_numeric(StringRef RHS) const { 47 for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) { 48 if (Data[I] == RHS.Data[I]) 49 continue; 50 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 51 // The longer sequence of numbers is larger. This doesn't really handle 52 // prefixed zeros well. 53 for (size_t J = I+1; J != E+1; ++J) { 54 bool ld = J < Length && ascii_isdigit(Data[J]); 55 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 56 if (ld != rd) 57 return rd ? -1 : 1; 58 if (!rd) 59 break; 60 } 61 } 62 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 63 } 64 if (Length == RHS.Length) 65 return 0; 66 return Length < RHS.Length ? -1 : 1; 67} 68 69// Compute the edit distance between the two given strings. 70unsigned StringRef::edit_distance(llvm::StringRef Other, 71 bool AllowReplacements, 72 unsigned MaxEditDistance) { 73 // The algorithm implemented below is the "classic" 74 // dynamic-programming algorithm for computing the Levenshtein 75 // distance, which is described here: 76 // 77 // http://en.wikipedia.org/wiki/Levenshtein_distance 78 // 79 // Although the algorithm is typically described using an m x n 80 // array, only two rows are used at a time, so this implemenation 81 // just keeps two separate vectors for those two rows. 82 size_type m = size(); 83 size_type n = Other.size(); 84 85 const unsigned SmallBufferSize = 64; 86 unsigned SmallBuffer[SmallBufferSize]; 87 unsigned *Allocated = 0; 88 unsigned *previous = SmallBuffer; 89 if (2*(n + 1) > SmallBufferSize) 90 Allocated = previous = new unsigned [2*(n+1)]; 91 unsigned *current = previous + (n + 1); 92 93 for (unsigned i = 0; i <= n; ++i) 94 previous[i] = i; 95 96 for (size_type y = 1; y <= m; ++y) { 97 current[0] = y; 98 unsigned BestThisRow = current[0]; 99 100 for (size_type x = 1; x <= n; ++x) { 101 if (AllowReplacements) { 102 current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u), 103 min(current[x-1], previous[x])+1); 104 } 105 else { 106 if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1]; 107 else current[x] = min(current[x-1], previous[x]) + 1; 108 } 109 BestThisRow = min(BestThisRow, current[x]); 110 } 111 112 if (MaxEditDistance && BestThisRow > MaxEditDistance) 113 return MaxEditDistance + 1; 114 115 unsigned *tmp = current; 116 current = previous; 117 previous = tmp; 118 } 119 120 unsigned Result = previous[n]; 121 delete [] Allocated; 122 123 return Result; 124} 125 126//===----------------------------------------------------------------------===// 127// String Searching 128//===----------------------------------------------------------------------===// 129 130 131/// find - Search for the first string \arg Str in the string. 132/// 133/// \return - The index of the first occurence of \arg Str, or npos if not 134/// found. 135size_t StringRef::find(StringRef Str, size_t From) const { 136 size_t N = Str.size(); 137 if (N > Length) 138 return npos; 139 for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i) 140 if (substr(i, N).equals(Str)) 141 return i; 142 return npos; 143} 144 145/// rfind - Search for the last string \arg Str in the string. 146/// 147/// \return - The index of the last occurence of \arg Str, or npos if not 148/// found. 149size_t StringRef::rfind(StringRef Str) const { 150 size_t N = Str.size(); 151 if (N > Length) 152 return npos; 153 for (size_t i = Length - N + 1, e = 0; i != e;) { 154 --i; 155 if (substr(i, N).equals(Str)) 156 return i; 157 } 158 return npos; 159} 160 161/// find_first_of - Find the first character in the string that is in \arg 162/// Chars, or npos if not found. 163/// 164/// Note: O(size() + Chars.size()) 165StringRef::size_type StringRef::find_first_of(StringRef Chars, 166 size_t From) const { 167 std::bitset<1 << CHAR_BIT> CharBits; 168 for (size_type i = 0; i != Chars.size(); ++i) 169 CharBits.set((unsigned char)Chars[i]); 170 171 for (size_type i = min(From, Length), e = Length; i != e; ++i) 172 if (CharBits.test((unsigned char)Data[i])) 173 return i; 174 return npos; 175} 176 177/// find_first_not_of - Find the first character in the string that is not 178/// \arg C or npos if not found. 179StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 180 for (size_type i = min(From, Length), e = Length; i != e; ++i) 181 if (Data[i] != C) 182 return i; 183 return npos; 184} 185 186/// find_first_not_of - Find the first character in the string that is not 187/// in the string \arg Chars, or npos if not found. 188/// 189/// Note: O(size() + Chars.size()) 190StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 191 size_t From) const { 192 std::bitset<1 << CHAR_BIT> CharBits; 193 for (size_type i = 0; i != Chars.size(); ++i) 194 CharBits.set((unsigned char)Chars[i]); 195 196 for (size_type i = min(From, Length), e = Length; i != e; ++i) 197 if (!CharBits.test((unsigned char)Data[i])) 198 return i; 199 return npos; 200} 201 202 203//===----------------------------------------------------------------------===// 204// Helpful Algorithms 205//===----------------------------------------------------------------------===// 206 207/// count - Return the number of non-overlapped occurrences of \arg Str in 208/// the string. 209size_t StringRef::count(StringRef Str) const { 210 size_t Count = 0; 211 size_t N = Str.size(); 212 if (N > Length) 213 return 0; 214 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 215 if (substr(i, N).equals(Str)) 216 ++Count; 217 return Count; 218} 219 220static unsigned GetAutoSenseRadix(StringRef &Str) { 221 if (Str.startswith("0x")) { 222 Str = Str.substr(2); 223 return 16; 224 } else if (Str.startswith("0b")) { 225 Str = Str.substr(2); 226 return 2; 227 } else if (Str.startswith("0")) { 228 return 8; 229 } else { 230 return 10; 231 } 232} 233 234 235/// GetAsUnsignedInteger - Workhorse method that converts a integer character 236/// sequence of radix up to 36 to an unsigned long long value. 237static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix, 238 unsigned long long &Result) { 239 // Autosense radix if not specified. 240 if (Radix == 0) 241 Radix = GetAutoSenseRadix(Str); 242 243 // Empty strings (after the radix autosense) are invalid. 244 if (Str.empty()) return true; 245 246 // Parse all the bytes of the string given this radix. Watch for overflow. 247 Result = 0; 248 while (!Str.empty()) { 249 unsigned CharVal; 250 if (Str[0] >= '0' && Str[0] <= '9') 251 CharVal = Str[0]-'0'; 252 else if (Str[0] >= 'a' && Str[0] <= 'z') 253 CharVal = Str[0]-'a'+10; 254 else if (Str[0] >= 'A' && Str[0] <= 'Z') 255 CharVal = Str[0]-'A'+10; 256 else 257 return true; 258 259 // If the parsed value is larger than the integer radix, the string is 260 // invalid. 261 if (CharVal >= Radix) 262 return true; 263 264 // Add in this character. 265 unsigned long long PrevResult = Result; 266 Result = Result*Radix+CharVal; 267 268 // Check for overflow. 269 if (Result < PrevResult) 270 return true; 271 272 Str = Str.substr(1); 273 } 274 275 return false; 276} 277 278bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const { 279 return GetAsUnsignedInteger(*this, Radix, Result); 280} 281 282 283bool StringRef::getAsInteger(unsigned Radix, long long &Result) const { 284 unsigned long long ULLVal; 285 286 // Handle positive strings first. 287 if (empty() || front() != '-') { 288 if (GetAsUnsignedInteger(*this, Radix, ULLVal) || 289 // Check for value so large it overflows a signed value. 290 (long long)ULLVal < 0) 291 return true; 292 Result = ULLVal; 293 return false; 294 } 295 296 // Get the positive part of the value. 297 if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) || 298 // Reject values so large they'd overflow as negative signed, but allow 299 // "-0". This negates the unsigned so that the negative isn't undefined 300 // on signed overflow. 301 (long long)-ULLVal > 0) 302 return true; 303 304 Result = -ULLVal; 305 return false; 306} 307 308bool StringRef::getAsInteger(unsigned Radix, int &Result) const { 309 long long Val; 310 if (getAsInteger(Radix, Val) || 311 (int)Val != Val) 312 return true; 313 Result = Val; 314 return false; 315} 316 317bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const { 318 unsigned long long Val; 319 if (getAsInteger(Radix, Val) || 320 (unsigned)Val != Val) 321 return true; 322 Result = Val; 323 return false; 324} 325 326bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 327 StringRef Str = *this; 328 329 // Autosense radix if not specified. 330 if (Radix == 0) 331 Radix = GetAutoSenseRadix(Str); 332 333 assert(Radix > 1 && Radix <= 36); 334 335 // Empty strings (after the radix autosense) are invalid. 336 if (Str.empty()) return true; 337 338 // Skip leading zeroes. This can be a significant improvement if 339 // it means we don't need > 64 bits. 340 while (!Str.empty() && Str.front() == '0') 341 Str = Str.substr(1); 342 343 // If it was nothing but zeroes.... 344 if (Str.empty()) { 345 Result = APInt(64, 0); 346 return false; 347 } 348 349 // (Over-)estimate the required number of bits. 350 unsigned Log2Radix = 0; 351 while ((1U << Log2Radix) < Radix) Log2Radix++; 352 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 353 354 unsigned BitWidth = Log2Radix * Str.size(); 355 if (BitWidth < Result.getBitWidth()) 356 BitWidth = Result.getBitWidth(); // don't shrink the result 357 else 358 Result.zext(BitWidth); 359 360 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 361 if (!IsPowerOf2Radix) { 362 // These must have the same bit-width as Result. 363 RadixAP = APInt(BitWidth, Radix); 364 CharAP = APInt(BitWidth, 0); 365 } 366 367 // Parse all the bytes of the string given this radix. 368 Result = 0; 369 while (!Str.empty()) { 370 unsigned CharVal; 371 if (Str[0] >= '0' && Str[0] <= '9') 372 CharVal = Str[0]-'0'; 373 else if (Str[0] >= 'a' && Str[0] <= 'z') 374 CharVal = Str[0]-'a'+10; 375 else if (Str[0] >= 'A' && Str[0] <= 'Z') 376 CharVal = Str[0]-'A'+10; 377 else 378 return true; 379 380 // If the parsed value is larger than the integer radix, the string is 381 // invalid. 382 if (CharVal >= Radix) 383 return true; 384 385 // Add in this character. 386 if (IsPowerOf2Radix) { 387 Result <<= Log2Radix; 388 Result |= CharVal; 389 } else { 390 Result *= RadixAP; 391 CharAP = CharVal; 392 Result += CharAP; 393 } 394 395 Str = Str.substr(1); 396 } 397 398 return false; 399} 400