StringRef.cpp revision 01d53ec176a6be4585df1f43af11151988ca4b35
1//===-- StringRef.cpp - Lightweight String References ---------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/ADT/StringRef.h"
11#include "llvm/ADT/APInt.h"
12#include "llvm/ADT/OwningPtr.h"
13#include "llvm/ADT/edit_distance.h"
14#include <bitset>
15
16using namespace llvm;
17
// Out-of-line definition of the static member constant StringRef::npos.
// It is compiled out when _MSC_VER is defined: MSVC emits references to this
// constant into the translation units which reference it.
#ifndef _MSC_VER
const size_t StringRef::npos;
#endif
22
/// Map an ASCII uppercase letter ('A'..'Z') to its lowercase counterpart.
/// Every other byte is returned unchanged; no locale is consulted.
static char ascii_tolower(char x) {
  const bool IsUpper = 'A' <= x && x <= 'Z';
  return IsUpper ? x - 'A' + 'a' : x;
}
28
/// Map an ASCII lowercase letter ('a'..'z') to its uppercase counterpart.
/// Every other byte is returned unchanged; no locale is consulted.
static char ascii_toupper(char x) {
  const bool IsLower = 'a' <= x && x <= 'z';
  return IsLower ? x - 'a' + 'A' : x;
}
34
/// Return true exactly when \p x is one of the ASCII decimal digits '0'..'9'.
static bool ascii_isdigit(char x) {
  const bool OutsideDigits = x < '0' || x > '9';
  return !OutsideDigits;
}
38
39/// compare_lower - Compare strings, ignoring case.
40int StringRef::compare_lower(StringRef RHS) const {
41  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
42    unsigned char LHC = ascii_tolower(Data[I]);
43    unsigned char RHC = ascii_tolower(RHS.Data[I]);
44    if (LHC != RHC)
45      return LHC < RHC ? -1 : 1;
46  }
47
48  if (Length == RHS.Length)
49    return 0;
50  return Length < RHS.Length ? -1 : 1;
51}
52
53/// compare_numeric - Compare strings, handle embedded numbers.
54int StringRef::compare_numeric(StringRef RHS) const {
55  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
56    // Check for sequences of digits.
57    if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
58      // The longer sequence of numbers is considered larger.
59      // This doesn't really handle prefixed zeros well.
60      size_t J;
61      for (J = I + 1; J != E + 1; ++J) {
62        bool ld = J < Length && ascii_isdigit(Data[J]);
63        bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
64        if (ld != rd)
65          return rd ? -1 : 1;
66        if (!rd)
67          break;
68      }
69      // The two number sequences have the same length (J-I), just memcmp them.
70      if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
71        return Res < 0 ? -1 : 1;
72      // Identical number sequences, continue search after the numbers.
73      I = J - 1;
74      continue;
75    }
76    if (Data[I] != RHS.Data[I])
77      return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
78  }
79  if (Length == RHS.Length)
80    return 0;
81  return Length < RHS.Length ? -1 : 1;
82}
83
84// Compute the edit distance between the two given strings.
85unsigned StringRef::edit_distance(llvm::StringRef Other,
86                                  bool AllowReplacements,
87                                  unsigned MaxEditDistance) {
88  return llvm::ComputeEditDistance(
89      llvm::ArrayRef<char>(data(), size()),
90      llvm::ArrayRef<char>(Other.data(), Other.size()),
91      AllowReplacements, MaxEditDistance);
92}
93
94//===----------------------------------------------------------------------===//
95// String Operations
96//===----------------------------------------------------------------------===//
97
98std::string StringRef::lower() const {
99  std::string Result(size(), char());
100  for (size_type i = 0, e = size(); i != e; ++i) {
101    Result[i] = ascii_tolower(Data[i]);
102  }
103  return Result;
104}
105
106std::string StringRef::upper() const {
107  std::string Result(size(), char());
108  for (size_type i = 0, e = size(); i != e; ++i) {
109    Result[i] = ascii_toupper(Data[i]);
110  }
111  return Result;
112}
113
114//===----------------------------------------------------------------------===//
115// String Searching
116//===----------------------------------------------------------------------===//
117
118
119/// find - Search for the first string \arg Str in the string.
120///
121/// \return - The index of the first occurrence of \arg Str, or npos if not
122/// found.
123size_t StringRef::find(StringRef Str, size_t From) const {
124  size_t N = Str.size();
125  if (N > Length)
126    return npos;
127
128  // For short haystacks or unsupported needles fall back to the naive algorithm
129  if (Length < 16 || N > 255 || N == 0) {
130    for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
131      if (substr(i, N).equals(Str))
132        return i;
133    return npos;
134  }
135
136  if (From >= Length)
137    return npos;
138
139  // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
140  uint8_t BadCharSkip[256];
141  std::memset(BadCharSkip, N, 256);
142  for (unsigned i = 0; i != N-1; ++i)
143    BadCharSkip[(uint8_t)Str[i]] = N-1-i;
144
145  unsigned Len = Length-From, Pos = From;
146  while (Len >= N) {
147    if (substr(Pos, N).equals(Str)) // See if this is the correct substring.
148      return Pos;
149
150    // Otherwise skip the appropriate number of bytes.
151    uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]];
152    Len -= Skip;
153    Pos += Skip;
154  }
155
156  return npos;
157}
158
159/// rfind - Search for the last string \arg Str in the string.
160///
161/// \return - The index of the last occurrence of \arg Str, or npos if not
162/// found.
163size_t StringRef::rfind(StringRef Str) const {
164  size_t N = Str.size();
165  if (N > Length)
166    return npos;
167  for (size_t i = Length - N + 1, e = 0; i != e;) {
168    --i;
169    if (substr(i, N).equals(Str))
170      return i;
171  }
172  return npos;
173}
174
175/// find_first_of - Find the first character in the string that is in \arg
176/// Chars, or npos if not found.
177///
178/// Note: O(size() + Chars.size())
179StringRef::size_type StringRef::find_first_of(StringRef Chars,
180                                              size_t From) const {
181  std::bitset<1 << CHAR_BIT> CharBits;
182  for (size_type i = 0; i != Chars.size(); ++i)
183    CharBits.set((unsigned char)Chars[i]);
184
185  for (size_type i = min(From, Length), e = Length; i != e; ++i)
186    if (CharBits.test((unsigned char)Data[i]))
187      return i;
188  return npos;
189}
190
191/// find_first_not_of - Find the first character in the string that is not
192/// \arg C or npos if not found.
193StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
194  for (size_type i = min(From, Length), e = Length; i != e; ++i)
195    if (Data[i] != C)
196      return i;
197  return npos;
198}
199
200/// find_first_not_of - Find the first character in the string that is not
201/// in the string \arg Chars, or npos if not found.
202///
203/// Note: O(size() + Chars.size())
204StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
205                                                  size_t From) const {
206  std::bitset<1 << CHAR_BIT> CharBits;
207  for (size_type i = 0; i != Chars.size(); ++i)
208    CharBits.set((unsigned char)Chars[i]);
209
210  for (size_type i = min(From, Length), e = Length; i != e; ++i)
211    if (!CharBits.test((unsigned char)Data[i]))
212      return i;
213  return npos;
214}
215
216/// find_last_of - Find the last character in the string that is in \arg C,
217/// or npos if not found.
218///
219/// Note: O(size() + Chars.size())
220StringRef::size_type StringRef::find_last_of(StringRef Chars,
221                                             size_t From) const {
222  std::bitset<1 << CHAR_BIT> CharBits;
223  for (size_type i = 0; i != Chars.size(); ++i)
224    CharBits.set((unsigned char)Chars[i]);
225
226  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
227    if (CharBits.test((unsigned char)Data[i]))
228      return i;
229  return npos;
230}
231
232//===----------------------------------------------------------------------===//
233// Helpful Algorithms
234//===----------------------------------------------------------------------===//
235
236/// count - Return the number of non-overlapped occurrences of \arg Str in
237/// the string.
238size_t StringRef::count(StringRef Str) const {
239  size_t Count = 0;
240  size_t N = Str.size();
241  if (N > Length)
242    return 0;
243  for (size_t i = 0, e = Length - N + 1; i != e; ++i)
244    if (substr(i, N).equals(Str))
245      ++Count;
246  return Count;
247}
248
249static unsigned GetAutoSenseRadix(StringRef &Str) {
250  if (Str.startswith("0x")) {
251    Str = Str.substr(2);
252    return 16;
253  } else if (Str.startswith("0b")) {
254    Str = Str.substr(2);
255    return 2;
256  } else if (Str.startswith("0")) {
257    return 8;
258  } else {
259    return 10;
260  }
261}
262
263
264/// GetAsUnsignedInteger - Workhorse method that converts a integer character
265/// sequence of radix up to 36 to an unsigned long long value.
266static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
267                                 unsigned long long &Result) {
268  // Autosense radix if not specified.
269  if (Radix == 0)
270    Radix = GetAutoSenseRadix(Str);
271
272  // Empty strings (after the radix autosense) are invalid.
273  if (Str.empty()) return true;
274
275  // Parse all the bytes of the string given this radix.  Watch for overflow.
276  Result = 0;
277  while (!Str.empty()) {
278    unsigned CharVal;
279    if (Str[0] >= '0' && Str[0] <= '9')
280      CharVal = Str[0]-'0';
281    else if (Str[0] >= 'a' && Str[0] <= 'z')
282      CharVal = Str[0]-'a'+10;
283    else if (Str[0] >= 'A' && Str[0] <= 'Z')
284      CharVal = Str[0]-'A'+10;
285    else
286      return true;
287
288    // If the parsed value is larger than the integer radix, the string is
289    // invalid.
290    if (CharVal >= Radix)
291      return true;
292
293    // Add in this character.
294    unsigned long long PrevResult = Result;
295    Result = Result*Radix+CharVal;
296
297    // Check for overflow.
298    if (Result < PrevResult)
299      return true;
300
301    Str = Str.substr(1);
302  }
303
304  return false;
305}
306
307bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
308  return GetAsUnsignedInteger(*this, Radix, Result);
309}
310
311
312bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
313  unsigned long long ULLVal;
314
315  // Handle positive strings first.
316  if (empty() || front() != '-') {
317    if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
318        // Check for value so large it overflows a signed value.
319        (long long)ULLVal < 0)
320      return true;
321    Result = ULLVal;
322    return false;
323  }
324
325  // Get the positive part of the value.
326  if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
327      // Reject values so large they'd overflow as negative signed, but allow
328      // "-0".  This negates the unsigned so that the negative isn't undefined
329      // on signed overflow.
330      (long long)-ULLVal > 0)
331    return true;
332
333  Result = -ULLVal;
334  return false;
335}
336
337bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
338  long long Val;
339  if (getAsInteger(Radix, Val) ||
340      (int)Val != Val)
341    return true;
342  Result = Val;
343  return false;
344}
345
346bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
347  unsigned long long Val;
348  if (getAsInteger(Radix, Val) ||
349      (unsigned)Val != Val)
350    return true;
351  Result = Val;
352  return false;
353}
354
355bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
356  StringRef Str = *this;
357
358  // Autosense radix if not specified.
359  if (Radix == 0)
360    Radix = GetAutoSenseRadix(Str);
361
362  assert(Radix > 1 && Radix <= 36);
363
364  // Empty strings (after the radix autosense) are invalid.
365  if (Str.empty()) return true;
366
367  // Skip leading zeroes.  This can be a significant improvement if
368  // it means we don't need > 64 bits.
369  while (!Str.empty() && Str.front() == '0')
370    Str = Str.substr(1);
371
372  // If it was nothing but zeroes....
373  if (Str.empty()) {
374    Result = APInt(64, 0);
375    return false;
376  }
377
378  // (Over-)estimate the required number of bits.
379  unsigned Log2Radix = 0;
380  while ((1U << Log2Radix) < Radix) Log2Radix++;
381  bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
382
383  unsigned BitWidth = Log2Radix * Str.size();
384  if (BitWidth < Result.getBitWidth())
385    BitWidth = Result.getBitWidth(); // don't shrink the result
386  else
387    Result = Result.zext(BitWidth);
388
389  APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
390  if (!IsPowerOf2Radix) {
391    // These must have the same bit-width as Result.
392    RadixAP = APInt(BitWidth, Radix);
393    CharAP = APInt(BitWidth, 0);
394  }
395
396  // Parse all the bytes of the string given this radix.
397  Result = 0;
398  while (!Str.empty()) {
399    unsigned CharVal;
400    if (Str[0] >= '0' && Str[0] <= '9')
401      CharVal = Str[0]-'0';
402    else if (Str[0] >= 'a' && Str[0] <= 'z')
403      CharVal = Str[0]-'a'+10;
404    else if (Str[0] >= 'A' && Str[0] <= 'Z')
405      CharVal = Str[0]-'A'+10;
406    else
407      return true;
408
409    // If the parsed value is larger than the integer radix, the string is
410    // invalid.
411    if (CharVal >= Radix)
412      return true;
413
414    // Add in this character.
415    if (IsPowerOf2Radix) {
416      Result <<= Log2Radix;
417      Result |= CharVal;
418    } else {
419      Result *= RadixAP;
420      CharAP = CharVal;
421      Result += CharAP;
422    }
423
424    Str = Str.substr(1);
425  }
426
427  return false;
428}
429