StringRef.cpp revision a7b966fc8d63b9b9432e1b33b33d4be6179e1fff
1//===-- StringRef.cpp - Lightweight String References ---------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/ADT/StringRef.h"
11#include "llvm/ADT/APInt.h"
12#include "llvm/ADT/OwningPtr.h"
13#include <bitset>
14
15using namespace llvm;
16
17// MSVC emits references to this into the translation units which reference it.
18#ifndef _MSC_VER
19const size_t StringRef::npos;
20#endif
21
22static char ascii_tolower(char x) {
23  if (x >= 'A' && x <= 'Z')
24    return x - 'A' + 'a';
25  return x;
26}
27
28static char ascii_toupper(char x) {
29  if (x >= 'a' && x <= 'z')
30    return x - 'a' + 'A';
31  return x;
32}
33
34static bool ascii_isdigit(char x) {
35  return x >= '0' && x <= '9';
36}
37
38/// compare_lower - Compare strings, ignoring case.
39int StringRef::compare_lower(StringRef RHS) const {
40  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
41    unsigned char LHC = ascii_tolower(Data[I]);
42    unsigned char RHC = ascii_tolower(RHS.Data[I]);
43    if (LHC != RHC)
44      return LHC < RHC ? -1 : 1;
45  }
46
47  if (Length == RHS.Length)
48    return 0;
49  return Length < RHS.Length ? -1 : 1;
50}
51
52/// compare_numeric - Compare strings, handle embedded numbers.
53int StringRef::compare_numeric(StringRef RHS) const {
54  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
55    // Check for sequences of digits.
56    if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
57      // The longer sequence of numbers is considered larger.
58      // This doesn't really handle prefixed zeros well.
59      size_t J;
60      for (J = I + 1; J != E + 1; ++J) {
61        bool ld = J < Length && ascii_isdigit(Data[J]);
62        bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
63        if (ld != rd)
64          return rd ? -1 : 1;
65        if (!rd)
66          break;
67      }
68      // The two number sequences have the same length (J-I), just memcmp them.
69      if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
70        return Res < 0 ? -1 : 1;
71      // Identical number sequences, continue search after the numbers.
72      I = J - 1;
73      continue;
74    }
75    if (Data[I] != RHS.Data[I])
76      return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
77  }
78  if (Length == RHS.Length)
79    return 0;
80  return Length < RHS.Length ? -1 : 1;
81}
82
83// Compute the edit distance between the two given strings.
84unsigned StringRef::edit_distance(llvm::StringRef Other,
85                                  bool AllowReplacements,
86                                  unsigned MaxEditDistance) {
87  // The algorithm implemented below is the "classic"
88  // dynamic-programming algorithm for computing the Levenshtein
89  // distance, which is described here:
90  //
91  //   http://en.wikipedia.org/wiki/Levenshtein_distance
92  //
93  // Although the algorithm is typically described using an m x n
94  // array, only two rows are used at a time, so this implemenation
95  // just keeps two separate vectors for those two rows.
96  size_type m = size();
97  size_type n = Other.size();
98
99  const unsigned SmallBufferSize = 64;
100  unsigned SmallBuffer[SmallBufferSize];
101  llvm::OwningArrayPtr<unsigned> Allocated;
102  unsigned *previous = SmallBuffer;
103  if (2*(n + 1) > SmallBufferSize) {
104    previous = new unsigned [2*(n+1)];
105    Allocated.reset(previous);
106  }
107  unsigned *current = previous + (n + 1);
108
109  for (unsigned i = 0; i <= n; ++i)
110    previous[i] = i;
111
112  for (size_type y = 1; y <= m; ++y) {
113    current[0] = y;
114    unsigned BestThisRow = current[0];
115
116    for (size_type x = 1; x <= n; ++x) {
117      if (AllowReplacements) {
118        current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u),
119                         min(current[x-1], previous[x])+1);
120      }
121      else {
122        if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1];
123        else current[x] = min(current[x-1], previous[x]) + 1;
124      }
125      BestThisRow = min(BestThisRow, current[x]);
126    }
127
128    if (MaxEditDistance && BestThisRow > MaxEditDistance)
129      return MaxEditDistance + 1;
130
131    unsigned *tmp = current;
132    current = previous;
133    previous = tmp;
134  }
135
136  unsigned Result = previous[n];
137  return Result;
138}
139
140//===----------------------------------------------------------------------===//
141// String Operations
142//===----------------------------------------------------------------------===//
143
144std::string StringRef::lower() const {
145  std::string Result(size(), char());
146  for (size_type i = 0, e = size(); i != e; ++i) {
147    Result[i] = ascii_tolower(Data[i]);
148  }
149  return Result;
150}
151
152std::string StringRef::upper() const {
153  std::string Result(size(), char());
154  for (size_type i = 0, e = size(); i != e; ++i) {
155    Result[i] = ascii_toupper(Data[i]);
156  }
157  return Result;
158}
159
160//===----------------------------------------------------------------------===//
161// String Searching
162//===----------------------------------------------------------------------===//
163
164
165/// find - Search for the first string \arg Str in the string.
166///
167/// \return - The index of the first occurrence of \arg Str, or npos if not
168/// found.
169size_t StringRef::find(StringRef Str, size_t From) const {
170  size_t N = Str.size();
171  if (N > Length)
172    return npos;
173
174  // For short haystacks or unsupported needles fall back to the naive algorithm
175  if (Length < 16 || N > 255 || N == 0) {
176    for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
177      if (substr(i, N).equals(Str))
178        return i;
179    return npos;
180  }
181
182  if (From >= Length)
183    return npos;
184
185  // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
186  uint8_t BadCharSkip[256];
187  std::memset(BadCharSkip, N, 256);
188  for (unsigned i = 0; i != N-1; ++i)
189    BadCharSkip[(uint8_t)Str[i]] = N-1-i;
190
191  unsigned Len = Length-From, Pos = From;
192  while (Len >= N) {
193    if (substr(Pos, N).equals(Str)) // See if this is the correct substring.
194      return Pos;
195
196    // Otherwise skip the appropriate number of bytes.
197    uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]];
198    Len -= Skip;
199    Pos += Skip;
200  }
201
202  return npos;
203}
204
205/// rfind - Search for the last string \arg Str in the string.
206///
207/// \return - The index of the last occurrence of \arg Str, or npos if not
208/// found.
209size_t StringRef::rfind(StringRef Str) const {
210  size_t N = Str.size();
211  if (N > Length)
212    return npos;
213  for (size_t i = Length - N + 1, e = 0; i != e;) {
214    --i;
215    if (substr(i, N).equals(Str))
216      return i;
217  }
218  return npos;
219}
220
221/// find_first_of - Find the first character in the string that is in \arg
222/// Chars, or npos if not found.
223///
224/// Note: O(size() + Chars.size())
225StringRef::size_type StringRef::find_first_of(StringRef Chars,
226                                              size_t From) const {
227  std::bitset<1 << CHAR_BIT> CharBits;
228  for (size_type i = 0; i != Chars.size(); ++i)
229    CharBits.set((unsigned char)Chars[i]);
230
231  for (size_type i = min(From, Length), e = Length; i != e; ++i)
232    if (CharBits.test((unsigned char)Data[i]))
233      return i;
234  return npos;
235}
236
237/// find_first_not_of - Find the first character in the string that is not
238/// \arg C or npos if not found.
239StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
240  for (size_type i = min(From, Length), e = Length; i != e; ++i)
241    if (Data[i] != C)
242      return i;
243  return npos;
244}
245
246/// find_first_not_of - Find the first character in the string that is not
247/// in the string \arg Chars, or npos if not found.
248///
249/// Note: O(size() + Chars.size())
250StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
251                                                  size_t From) const {
252  std::bitset<1 << CHAR_BIT> CharBits;
253  for (size_type i = 0; i != Chars.size(); ++i)
254    CharBits.set((unsigned char)Chars[i]);
255
256  for (size_type i = min(From, Length), e = Length; i != e; ++i)
257    if (!CharBits.test((unsigned char)Data[i]))
258      return i;
259  return npos;
260}
261
262/// find_last_of - Find the last character in the string that is in \arg C,
263/// or npos if not found.
264///
265/// Note: O(size() + Chars.size())
266StringRef::size_type StringRef::find_last_of(StringRef Chars,
267                                             size_t From) const {
268  std::bitset<1 << CHAR_BIT> CharBits;
269  for (size_type i = 0; i != Chars.size(); ++i)
270    CharBits.set((unsigned char)Chars[i]);
271
272  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
273    if (CharBits.test((unsigned char)Data[i]))
274      return i;
275  return npos;
276}
277
278//===----------------------------------------------------------------------===//
279// Helpful Algorithms
280//===----------------------------------------------------------------------===//
281
282/// count - Return the number of non-overlapped occurrences of \arg Str in
283/// the string.
284size_t StringRef::count(StringRef Str) const {
285  size_t Count = 0;
286  size_t N = Str.size();
287  if (N > Length)
288    return 0;
289  for (size_t i = 0, e = Length - N + 1; i != e; ++i)
290    if (substr(i, N).equals(Str))
291      ++Count;
292  return Count;
293}
294
295static unsigned GetAutoSenseRadix(StringRef &Str) {
296  if (Str.startswith("0x")) {
297    Str = Str.substr(2);
298    return 16;
299  } else if (Str.startswith("0b")) {
300    Str = Str.substr(2);
301    return 2;
302  } else if (Str.startswith("0")) {
303    return 8;
304  } else {
305    return 10;
306  }
307}
308
309
310/// GetAsUnsignedInteger - Workhorse method that converts a integer character
311/// sequence of radix up to 36 to an unsigned long long value.
312static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
313                                 unsigned long long &Result) {
314  // Autosense radix if not specified.
315  if (Radix == 0)
316    Radix = GetAutoSenseRadix(Str);
317
318  // Empty strings (after the radix autosense) are invalid.
319  if (Str.empty()) return true;
320
321  // Parse all the bytes of the string given this radix.  Watch for overflow.
322  Result = 0;
323  while (!Str.empty()) {
324    unsigned CharVal;
325    if (Str[0] >= '0' && Str[0] <= '9')
326      CharVal = Str[0]-'0';
327    else if (Str[0] >= 'a' && Str[0] <= 'z')
328      CharVal = Str[0]-'a'+10;
329    else if (Str[0] >= 'A' && Str[0] <= 'Z')
330      CharVal = Str[0]-'A'+10;
331    else
332      return true;
333
334    // If the parsed value is larger than the integer radix, the string is
335    // invalid.
336    if (CharVal >= Radix)
337      return true;
338
339    // Add in this character.
340    unsigned long long PrevResult = Result;
341    Result = Result*Radix+CharVal;
342
343    // Check for overflow.
344    if (Result < PrevResult)
345      return true;
346
347    Str = Str.substr(1);
348  }
349
350  return false;
351}
352
353bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
354  return GetAsUnsignedInteger(*this, Radix, Result);
355}
356
357
358bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
359  unsigned long long ULLVal;
360
361  // Handle positive strings first.
362  if (empty() || front() != '-') {
363    if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
364        // Check for value so large it overflows a signed value.
365        (long long)ULLVal < 0)
366      return true;
367    Result = ULLVal;
368    return false;
369  }
370
371  // Get the positive part of the value.
372  if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
373      // Reject values so large they'd overflow as negative signed, but allow
374      // "-0".  This negates the unsigned so that the negative isn't undefined
375      // on signed overflow.
376      (long long)-ULLVal > 0)
377    return true;
378
379  Result = -ULLVal;
380  return false;
381}
382
383bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
384  long long Val;
385  if (getAsInteger(Radix, Val) ||
386      (int)Val != Val)
387    return true;
388  Result = Val;
389  return false;
390}
391
392bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
393  unsigned long long Val;
394  if (getAsInteger(Radix, Val) ||
395      (unsigned)Val != Val)
396    return true;
397  Result = Val;
398  return false;
399}
400
401bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
402  StringRef Str = *this;
403
404  // Autosense radix if not specified.
405  if (Radix == 0)
406    Radix = GetAutoSenseRadix(Str);
407
408  assert(Radix > 1 && Radix <= 36);
409
410  // Empty strings (after the radix autosense) are invalid.
411  if (Str.empty()) return true;
412
413  // Skip leading zeroes.  This can be a significant improvement if
414  // it means we don't need > 64 bits.
415  while (!Str.empty() && Str.front() == '0')
416    Str = Str.substr(1);
417
418  // If it was nothing but zeroes....
419  if (Str.empty()) {
420    Result = APInt(64, 0);
421    return false;
422  }
423
424  // (Over-)estimate the required number of bits.
425  unsigned Log2Radix = 0;
426  while ((1U << Log2Radix) < Radix) Log2Radix++;
427  bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
428
429  unsigned BitWidth = Log2Radix * Str.size();
430  if (BitWidth < Result.getBitWidth())
431    BitWidth = Result.getBitWidth(); // don't shrink the result
432  else
433    Result = Result.zext(BitWidth);
434
435  APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
436  if (!IsPowerOf2Radix) {
437    // These must have the same bit-width as Result.
438    RadixAP = APInt(BitWidth, Radix);
439    CharAP = APInt(BitWidth, 0);
440  }
441
442  // Parse all the bytes of the string given this radix.
443  Result = 0;
444  while (!Str.empty()) {
445    unsigned CharVal;
446    if (Str[0] >= '0' && Str[0] <= '9')
447      CharVal = Str[0]-'0';
448    else if (Str[0] >= 'a' && Str[0] <= 'z')
449      CharVal = Str[0]-'a'+10;
450    else if (Str[0] >= 'A' && Str[0] <= 'Z')
451      CharVal = Str[0]-'A'+10;
452    else
453      return true;
454
455    // If the parsed value is larger than the integer radix, the string is
456    // invalid.
457    if (CharVal >= Radix)
458      return true;
459
460    // Add in this character.
461    if (IsPowerOf2Radix) {
462      Result <<= Log2Radix;
463      Result |= CharVal;
464    } else {
465      Result *= RadixAP;
466      CharAP = CharVal;
467      Result += CharAP;
468    }
469
470    Str = Str.substr(1);
471  }
472
473  return false;
474}
475