StringRef.cpp revision 5ee568ac2704d7302017d42ad162d4b53d076cbc
1//===-- StringRef.cpp - Lightweight String References ---------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/ADT/StringRef.h"
11#include "llvm/ADT/APInt.h"
12#include <bitset>
13
14using namespace llvm;
15
16// MSVC emits references to this into the translation units which reference it.
17#ifndef _MSC_VER
18const size_t StringRef::npos;
19#endif
20
21static char ascii_tolower(char x) {
22  if (x >= 'A' && x <= 'Z')
23    return x - 'A' + 'a';
24  return x;
25}
26
27static bool ascii_isdigit(char x) {
28  return x >= '0' && x <= '9';
29}
30
31/// compare_lower - Compare strings, ignoring case.
32int StringRef::compare_lower(StringRef RHS) const {
33  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
34    unsigned char LHC = ascii_tolower(Data[I]);
35    unsigned char RHC = ascii_tolower(RHS.Data[I]);
36    if (LHC != RHC)
37      return LHC < RHC ? -1 : 1;
38  }
39
40  if (Length == RHS.Length)
41    return 0;
42  return Length < RHS.Length ? -1 : 1;
43}
44
45/// compare_numeric - Compare strings, handle embedded numbers.
46int StringRef::compare_numeric(StringRef RHS) const {
47  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
48    if (Data[I] == RHS.Data[I])
49      continue;
50    if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
51      // The longer sequence of numbers is larger. This doesn't really handle
52      // prefixed zeros well.
53      for (size_t J = I+1; J != E+1; ++J) {
54        bool ld = J < Length && ascii_isdigit(Data[J]);
55        bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
56        if (ld != rd)
57          return rd ? -1 : 1;
58        if (!rd)
59          break;
60      }
61    }
62    return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
63  }
64  if (Length == RHS.Length)
65    return 0;
66  return Length < RHS.Length ? -1 : 1;
67}
68
69// Compute the edit distance between the two given strings.
70unsigned StringRef::edit_distance(llvm::StringRef Other,
71                                  bool AllowReplacements,
72                                  unsigned MaxEditDistance) {
73  // The algorithm implemented below is the "classic"
74  // dynamic-programming algorithm for computing the Levenshtein
75  // distance, which is described here:
76  //
77  //   http://en.wikipedia.org/wiki/Levenshtein_distance
78  //
79  // Although the algorithm is typically described using an m x n
80  // array, only two rows are used at a time, so this implemenation
81  // just keeps two separate vectors for those two rows.
82  size_type m = size();
83  size_type n = Other.size();
84
85  const unsigned SmallBufferSize = 64;
86  unsigned SmallBuffer[SmallBufferSize];
87  unsigned *Allocated = 0;
88  unsigned *previous = SmallBuffer;
89  if (2*(n + 1) > SmallBufferSize)
90    Allocated = previous = new unsigned [2*(n+1)];
91  unsigned *current = previous + (n + 1);
92
93  for (unsigned i = 0; i <= n; ++i)
94    previous[i] = i;
95
96  for (size_type y = 1; y <= m; ++y) {
97    current[0] = y;
98    unsigned BestThisRow = current[0];
99
100    for (size_type x = 1; x <= n; ++x) {
101      if (AllowReplacements) {
102        current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u),
103                         min(current[x-1], previous[x])+1);
104      }
105      else {
106        if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1];
107        else current[x] = min(current[x-1], previous[x]) + 1;
108      }
109      BestThisRow = min(BestThisRow, current[x]);
110    }
111
112    if (MaxEditDistance && BestThisRow > MaxEditDistance)
113      return MaxEditDistance + 1;
114
115    unsigned *tmp = current;
116    current = previous;
117    previous = tmp;
118  }
119
120  unsigned Result = previous[n];
121  delete [] Allocated;
122
123  return Result;
124}
125
126//===----------------------------------------------------------------------===//
127// String Searching
128//===----------------------------------------------------------------------===//
129
130
131/// find - Search for the first string \arg Str in the string.
132///
133/// \return - The index of the first occurence of \arg Str, or npos if not
134/// found.
135size_t StringRef::find(StringRef Str, size_t From) const {
136  size_t N = Str.size();
137  if (N > Length)
138    return npos;
139  for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
140    if (substr(i, N).equals(Str))
141      return i;
142  return npos;
143}
144
145/// rfind - Search for the last string \arg Str in the string.
146///
147/// \return - The index of the last occurence of \arg Str, or npos if not
148/// found.
149size_t StringRef::rfind(StringRef Str) const {
150  size_t N = Str.size();
151  if (N > Length)
152    return npos;
153  for (size_t i = Length - N + 1, e = 0; i != e;) {
154    --i;
155    if (substr(i, N).equals(Str))
156      return i;
157  }
158  return npos;
159}
160
161/// find_first_of - Find the first character in the string that is in \arg
162/// Chars, or npos if not found.
163///
164/// Note: O(size() + Chars.size())
165StringRef::size_type StringRef::find_first_of(StringRef Chars,
166                                              size_t From) const {
167  std::bitset<1 << CHAR_BIT> CharBits;
168  for (size_type i = 0; i != Chars.size(); ++i)
169    CharBits.set((unsigned char)Chars[i]);
170
171  for (size_type i = min(From, Length), e = Length; i != e; ++i)
172    if (CharBits.test((unsigned char)Data[i]))
173      return i;
174  return npos;
175}
176
177/// find_first_not_of - Find the first character in the string that is not
178/// \arg C or npos if not found.
179StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
180  for (size_type i = min(From, Length), e = Length; i != e; ++i)
181    if (Data[i] != C)
182      return i;
183  return npos;
184}
185
186/// find_first_not_of - Find the first character in the string that is not
187/// in the string \arg Chars, or npos if not found.
188///
189/// Note: O(size() + Chars.size())
190StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
191                                                  size_t From) const {
192  std::bitset<1 << CHAR_BIT> CharBits;
193  for (size_type i = 0; i != Chars.size(); ++i)
194    CharBits.set((unsigned char)Chars[i]);
195
196  for (size_type i = min(From, Length), e = Length; i != e; ++i)
197    if (!CharBits.test((unsigned char)Data[i]))
198      return i;
199  return npos;
200}
201
202
203//===----------------------------------------------------------------------===//
204// Helpful Algorithms
205//===----------------------------------------------------------------------===//
206
207/// count - Return the number of non-overlapped occurrences of \arg Str in
208/// the string.
209size_t StringRef::count(StringRef Str) const {
210  size_t Count = 0;
211  size_t N = Str.size();
212  if (N > Length)
213    return 0;
214  for (size_t i = 0, e = Length - N + 1; i != e; ++i)
215    if (substr(i, N).equals(Str))
216      ++Count;
217  return Count;
218}
219
220static unsigned GetAutoSenseRadix(StringRef &Str) {
221  if (Str.startswith("0x")) {
222    Str = Str.substr(2);
223    return 16;
224  } else if (Str.startswith("0b")) {
225    Str = Str.substr(2);
226    return 2;
227  } else if (Str.startswith("0")) {
228    return 8;
229  } else {
230    return 10;
231  }
232}
233
234
235/// GetAsUnsignedInteger - Workhorse method that converts a integer character
236/// sequence of radix up to 36 to an unsigned long long value.
237static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
238                                 unsigned long long &Result) {
239  // Autosense radix if not specified.
240  if (Radix == 0)
241    Radix = GetAutoSenseRadix(Str);
242
243  // Empty strings (after the radix autosense) are invalid.
244  if (Str.empty()) return true;
245
246  // Parse all the bytes of the string given this radix.  Watch for overflow.
247  Result = 0;
248  while (!Str.empty()) {
249    unsigned CharVal;
250    if (Str[0] >= '0' && Str[0] <= '9')
251      CharVal = Str[0]-'0';
252    else if (Str[0] >= 'a' && Str[0] <= 'z')
253      CharVal = Str[0]-'a'+10;
254    else if (Str[0] >= 'A' && Str[0] <= 'Z')
255      CharVal = Str[0]-'A'+10;
256    else
257      return true;
258
259    // If the parsed value is larger than the integer radix, the string is
260    // invalid.
261    if (CharVal >= Radix)
262      return true;
263
264    // Add in this character.
265    unsigned long long PrevResult = Result;
266    Result = Result*Radix+CharVal;
267
268    // Check for overflow.
269    if (Result < PrevResult)
270      return true;
271
272    Str = Str.substr(1);
273  }
274
275  return false;
276}
277
278bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
279  return GetAsUnsignedInteger(*this, Radix, Result);
280}
281
282
283bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
284  unsigned long long ULLVal;
285
286  // Handle positive strings first.
287  if (empty() || front() != '-') {
288    if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
289        // Check for value so large it overflows a signed value.
290        (long long)ULLVal < 0)
291      return true;
292    Result = ULLVal;
293    return false;
294  }
295
296  // Get the positive part of the value.
297  if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
298      // Reject values so large they'd overflow as negative signed, but allow
299      // "-0".  This negates the unsigned so that the negative isn't undefined
300      // on signed overflow.
301      (long long)-ULLVal > 0)
302    return true;
303
304  Result = -ULLVal;
305  return false;
306}
307
308bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
309  long long Val;
310  if (getAsInteger(Radix, Val) ||
311      (int)Val != Val)
312    return true;
313  Result = Val;
314  return false;
315}
316
317bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
318  unsigned long long Val;
319  if (getAsInteger(Radix, Val) ||
320      (unsigned)Val != Val)
321    return true;
322  Result = Val;
323  return false;
324}
325
326bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
327  StringRef Str = *this;
328
329  // Autosense radix if not specified.
330  if (Radix == 0)
331    Radix = GetAutoSenseRadix(Str);
332
333  assert(Radix > 1 && Radix <= 36);
334
335  // Empty strings (after the radix autosense) are invalid.
336  if (Str.empty()) return true;
337
338  // Skip leading zeroes.  This can be a significant improvement if
339  // it means we don't need > 64 bits.
340  while (!Str.empty() && Str.front() == '0')
341    Str = Str.substr(1);
342
343  // If it was nothing but zeroes....
344  if (Str.empty()) {
345    Result = APInt(64, 0);
346    return false;
347  }
348
349  // (Over-)estimate the required number of bits.
350  unsigned Log2Radix = 0;
351  while ((1U << Log2Radix) < Radix) Log2Radix++;
352  bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
353
354  unsigned BitWidth = Log2Radix * Str.size();
355  if (BitWidth < Result.getBitWidth())
356    BitWidth = Result.getBitWidth(); // don't shrink the result
357  else
358    Result.zext(BitWidth);
359
360  APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
361  if (!IsPowerOf2Radix) {
362    // These must have the same bit-width as Result.
363    RadixAP = APInt(BitWidth, Radix);
364    CharAP = APInt(BitWidth, 0);
365  }
366
367  // Parse all the bytes of the string given this radix.
368  Result = 0;
369  while (!Str.empty()) {
370    unsigned CharVal;
371    if (Str[0] >= '0' && Str[0] <= '9')
372      CharVal = Str[0]-'0';
373    else if (Str[0] >= 'a' && Str[0] <= 'z')
374      CharVal = Str[0]-'a'+10;
375    else if (Str[0] >= 'A' && Str[0] <= 'Z')
376      CharVal = Str[0]-'A'+10;
377    else
378      return true;
379
380    // If the parsed value is larger than the integer radix, the string is
381    // invalid.
382    if (CharVal >= Radix)
383      return true;
384
385    // Add in this character.
386    if (IsPowerOf2Radix) {
387      Result <<= Log2Radix;
388      Result |= CharVal;
389    } else {
390      Result *= RadixAP;
391      CharAP = CharVal;
392      Result += CharAP;
393    }
394
395    Str = Str.substr(1);
396  }
397
398  return false;
399}
400