StringRef.cpp revision 36b56886974eae4f9c5ebc96befd3e7bfe5de338
1//===-- StringRef.cpp - Lightweight String References ---------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/ADT/StringRef.h"
11#include "llvm/ADT/APInt.h"
12#include "llvm/ADT/Hashing.h"
13#include "llvm/ADT/edit_distance.h"
14#include <bitset>
15
16using namespace llvm;
17
18// MSVC emits references to this into the translation units which reference it.
19#ifndef _MSC_VER
20const size_t StringRef::npos;
21#endif
22
23static char ascii_tolower(char x) {
24  if (x >= 'A' && x <= 'Z')
25    return x - 'A' + 'a';
26  return x;
27}
28
29static char ascii_toupper(char x) {
30  if (x >= 'a' && x <= 'z')
31    return x - 'a' + 'A';
32  return x;
33}
34
35static bool ascii_isdigit(char x) {
36  return x >= '0' && x <= '9';
37}
38
39// strncasecmp() is not available on non-POSIX systems, so define an
40// alternative function here.
41static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
42  for (size_t I = 0; I < Length; ++I) {
43    unsigned char LHC = ascii_tolower(LHS[I]);
44    unsigned char RHC = ascii_tolower(RHS[I]);
45    if (LHC != RHC)
46      return LHC < RHC ? -1 : 1;
47  }
48  return 0;
49}
50
51/// compare_lower - Compare strings, ignoring case.
52int StringRef::compare_lower(StringRef RHS) const {
53  if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length)))
54    return Res;
55  if (Length == RHS.Length)
56    return 0;
57  return Length < RHS.Length ? -1 : 1;
58}
59
60/// Check if this string starts with the given \p Prefix, ignoring case.
61bool StringRef::startswith_lower(StringRef Prefix) const {
62  return Length >= Prefix.Length &&
63      ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
64}
65
66/// Check if this string ends with the given \p Suffix, ignoring case.
67bool StringRef::endswith_lower(StringRef Suffix) const {
68  return Length >= Suffix.Length &&
69      ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
70}
71
72/// compare_numeric - Compare strings, handle embedded numbers.
73int StringRef::compare_numeric(StringRef RHS) const {
74  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
75    // Check for sequences of digits.
76    if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
77      // The longer sequence of numbers is considered larger.
78      // This doesn't really handle prefixed zeros well.
79      size_t J;
80      for (J = I + 1; J != E + 1; ++J) {
81        bool ld = J < Length && ascii_isdigit(Data[J]);
82        bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
83        if (ld != rd)
84          return rd ? -1 : 1;
85        if (!rd)
86          break;
87      }
88      // The two number sequences have the same length (J-I), just memcmp them.
89      if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
90        return Res < 0 ? -1 : 1;
91      // Identical number sequences, continue search after the numbers.
92      I = J - 1;
93      continue;
94    }
95    if (Data[I] != RHS.Data[I])
96      return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
97  }
98  if (Length == RHS.Length)
99    return 0;
100  return Length < RHS.Length ? -1 : 1;
101}
102
103// Compute the edit distance between the two given strings.
104unsigned StringRef::edit_distance(llvm::StringRef Other,
105                                  bool AllowReplacements,
106                                  unsigned MaxEditDistance) const {
107  return llvm::ComputeEditDistance(
108      llvm::ArrayRef<char>(data(), size()),
109      llvm::ArrayRef<char>(Other.data(), Other.size()),
110      AllowReplacements, MaxEditDistance);
111}
112
113//===----------------------------------------------------------------------===//
114// String Operations
115//===----------------------------------------------------------------------===//
116
117std::string StringRef::lower() const {
118  std::string Result(size(), char());
119  for (size_type i = 0, e = size(); i != e; ++i) {
120    Result[i] = ascii_tolower(Data[i]);
121  }
122  return Result;
123}
124
125std::string StringRef::upper() const {
126  std::string Result(size(), char());
127  for (size_type i = 0, e = size(); i != e; ++i) {
128    Result[i] = ascii_toupper(Data[i]);
129  }
130  return Result;
131}
132
133//===----------------------------------------------------------------------===//
134// String Searching
135//===----------------------------------------------------------------------===//
136
137
138/// find - Search for the first string \arg Str in the string.
139///
140/// \return - The index of the first occurrence of \arg Str, or npos if not
141/// found.
142size_t StringRef::find(StringRef Str, size_t From) const {
143  size_t N = Str.size();
144  if (N > Length)
145    return npos;
146
147  // For short haystacks or unsupported needles fall back to the naive algorithm
148  if (Length < 16 || N > 255 || N == 0) {
149    for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
150      if (substr(i, N).equals(Str))
151        return i;
152    return npos;
153  }
154
155  if (From >= Length)
156    return npos;
157
158  // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
159  uint8_t BadCharSkip[256];
160  std::memset(BadCharSkip, N, 256);
161  for (unsigned i = 0; i != N-1; ++i)
162    BadCharSkip[(uint8_t)Str[i]] = N-1-i;
163
164  unsigned Len = Length-From, Pos = From;
165  while (Len >= N) {
166    if (substr(Pos, N).equals(Str)) // See if this is the correct substring.
167      return Pos;
168
169    // Otherwise skip the appropriate number of bytes.
170    uint8_t Skip = BadCharSkip[(uint8_t)(*this)[Pos+N-1]];
171    Len -= Skip;
172    Pos += Skip;
173  }
174
175  return npos;
176}
177
178/// rfind - Search for the last string \arg Str in the string.
179///
180/// \return - The index of the last occurrence of \arg Str, or npos if not
181/// found.
182size_t StringRef::rfind(StringRef Str) const {
183  size_t N = Str.size();
184  if (N > Length)
185    return npos;
186  for (size_t i = Length - N + 1, e = 0; i != e;) {
187    --i;
188    if (substr(i, N).equals(Str))
189      return i;
190  }
191  return npos;
192}
193
194/// find_first_of - Find the first character in the string that is in \arg
195/// Chars, or npos if not found.
196///
197/// Note: O(size() + Chars.size())
198StringRef::size_type StringRef::find_first_of(StringRef Chars,
199                                              size_t From) const {
200  std::bitset<1 << CHAR_BIT> CharBits;
201  for (size_type i = 0; i != Chars.size(); ++i)
202    CharBits.set((unsigned char)Chars[i]);
203
204  for (size_type i = min(From, Length), e = Length; i != e; ++i)
205    if (CharBits.test((unsigned char)Data[i]))
206      return i;
207  return npos;
208}
209
210/// find_first_not_of - Find the first character in the string that is not
211/// \arg C or npos if not found.
212StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
213  for (size_type i = min(From, Length), e = Length; i != e; ++i)
214    if (Data[i] != C)
215      return i;
216  return npos;
217}
218
219/// find_first_not_of - Find the first character in the string that is not
220/// in the string \arg Chars, or npos if not found.
221///
222/// Note: O(size() + Chars.size())
223StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
224                                                  size_t From) const {
225  std::bitset<1 << CHAR_BIT> CharBits;
226  for (size_type i = 0; i != Chars.size(); ++i)
227    CharBits.set((unsigned char)Chars[i]);
228
229  for (size_type i = min(From, Length), e = Length; i != e; ++i)
230    if (!CharBits.test((unsigned char)Data[i]))
231      return i;
232  return npos;
233}
234
235/// find_last_of - Find the last character in the string that is in \arg C,
236/// or npos if not found.
237///
238/// Note: O(size() + Chars.size())
239StringRef::size_type StringRef::find_last_of(StringRef Chars,
240                                             size_t From) const {
241  std::bitset<1 << CHAR_BIT> CharBits;
242  for (size_type i = 0; i != Chars.size(); ++i)
243    CharBits.set((unsigned char)Chars[i]);
244
245  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
246    if (CharBits.test((unsigned char)Data[i]))
247      return i;
248  return npos;
249}
250
251/// find_last_not_of - Find the last character in the string that is not
252/// \arg C, or npos if not found.
253StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
254  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
255    if (Data[i] != C)
256      return i;
257  return npos;
258}
259
260/// find_last_not_of - Find the last character in the string that is not in
261/// \arg Chars, or npos if not found.
262///
263/// Note: O(size() + Chars.size())
264StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
265                                                 size_t From) const {
266  std::bitset<1 << CHAR_BIT> CharBits;
267  for (size_type i = 0, e = Chars.size(); i != e; ++i)
268    CharBits.set((unsigned char)Chars[i]);
269
270  for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
271    if (!CharBits.test((unsigned char)Data[i]))
272      return i;
273  return npos;
274}
275
276void StringRef::split(SmallVectorImpl<StringRef> &A,
277                      StringRef Separators, int MaxSplit,
278                      bool KeepEmpty) const {
279  StringRef rest = *this;
280
281  // rest.data() is used to distinguish cases like "a," that splits into
282  // "a" + "" and "a" that splits into "a" + 0.
283  for (int splits = 0;
284       rest.data() != NULL && (MaxSplit < 0 || splits < MaxSplit);
285       ++splits) {
286    std::pair<StringRef, StringRef> p = rest.split(Separators);
287
288    if (KeepEmpty || p.first.size() != 0)
289      A.push_back(p.first);
290    rest = p.second;
291  }
292  // If we have a tail left, add it.
293  if (rest.data() != NULL && (rest.size() != 0 || KeepEmpty))
294    A.push_back(rest);
295}
296
297//===----------------------------------------------------------------------===//
298// Helpful Algorithms
299//===----------------------------------------------------------------------===//
300
301/// count - Return the number of non-overlapped occurrences of \arg Str in
302/// the string.
303size_t StringRef::count(StringRef Str) const {
304  size_t Count = 0;
305  size_t N = Str.size();
306  if (N > Length)
307    return 0;
308  for (size_t i = 0, e = Length - N + 1; i != e; ++i)
309    if (substr(i, N).equals(Str))
310      ++Count;
311  return Count;
312}
313
314static unsigned GetAutoSenseRadix(StringRef &Str) {
315  if (Str.startswith("0x")) {
316    Str = Str.substr(2);
317    return 16;
318  }
319
320  if (Str.startswith("0b")) {
321    Str = Str.substr(2);
322    return 2;
323  }
324
325  if (Str.startswith("0o")) {
326    Str = Str.substr(2);
327    return 8;
328  }
329
330  if (Str.startswith("0"))
331    return 8;
332
333  return 10;
334}
335
336
337/// GetAsUnsignedInteger - Workhorse method that converts a integer character
338/// sequence of radix up to 36 to an unsigned long long value.
339bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
340                                unsigned long long &Result) {
341  // Autosense radix if not specified.
342  if (Radix == 0)
343    Radix = GetAutoSenseRadix(Str);
344
345  // Empty strings (after the radix autosense) are invalid.
346  if (Str.empty()) return true;
347
348  // Parse all the bytes of the string given this radix.  Watch for overflow.
349  Result = 0;
350  while (!Str.empty()) {
351    unsigned CharVal;
352    if (Str[0] >= '0' && Str[0] <= '9')
353      CharVal = Str[0]-'0';
354    else if (Str[0] >= 'a' && Str[0] <= 'z')
355      CharVal = Str[0]-'a'+10;
356    else if (Str[0] >= 'A' && Str[0] <= 'Z')
357      CharVal = Str[0]-'A'+10;
358    else
359      return true;
360
361    // If the parsed value is larger than the integer radix, the string is
362    // invalid.
363    if (CharVal >= Radix)
364      return true;
365
366    // Add in this character.
367    unsigned long long PrevResult = Result;
368    Result = Result*Radix+CharVal;
369
370    // Check for overflow by shifting back and seeing if bits were lost.
371    if (Result/Radix < PrevResult)
372      return true;
373
374    Str = Str.substr(1);
375  }
376
377  return false;
378}
379
380bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
381                              long long &Result) {
382  unsigned long long ULLVal;
383
384  // Handle positive strings first.
385  if (Str.empty() || Str.front() != '-') {
386    if (getAsUnsignedInteger(Str, Radix, ULLVal) ||
387        // Check for value so large it overflows a signed value.
388        (long long)ULLVal < 0)
389      return true;
390    Result = ULLVal;
391    return false;
392  }
393
394  // Get the positive part of the value.
395  if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) ||
396      // Reject values so large they'd overflow as negative signed, but allow
397      // "-0".  This negates the unsigned so that the negative isn't undefined
398      // on signed overflow.
399      (long long)-ULLVal > 0)
400    return true;
401
402  Result = -ULLVal;
403  return false;
404}
405
406bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
407  StringRef Str = *this;
408
409  // Autosense radix if not specified.
410  if (Radix == 0)
411    Radix = GetAutoSenseRadix(Str);
412
413  assert(Radix > 1 && Radix <= 36);
414
415  // Empty strings (after the radix autosense) are invalid.
416  if (Str.empty()) return true;
417
418  // Skip leading zeroes.  This can be a significant improvement if
419  // it means we don't need > 64 bits.
420  while (!Str.empty() && Str.front() == '0')
421    Str = Str.substr(1);
422
423  // If it was nothing but zeroes....
424  if (Str.empty()) {
425    Result = APInt(64, 0);
426    return false;
427  }
428
429  // (Over-)estimate the required number of bits.
430  unsigned Log2Radix = 0;
431  while ((1U << Log2Radix) < Radix) Log2Radix++;
432  bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
433
434  unsigned BitWidth = Log2Radix * Str.size();
435  if (BitWidth < Result.getBitWidth())
436    BitWidth = Result.getBitWidth(); // don't shrink the result
437  else if (BitWidth > Result.getBitWidth())
438    Result = Result.zext(BitWidth);
439
440  APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
441  if (!IsPowerOf2Radix) {
442    // These must have the same bit-width as Result.
443    RadixAP = APInt(BitWidth, Radix);
444    CharAP = APInt(BitWidth, 0);
445  }
446
447  // Parse all the bytes of the string given this radix.
448  Result = 0;
449  while (!Str.empty()) {
450    unsigned CharVal;
451    if (Str[0] >= '0' && Str[0] <= '9')
452      CharVal = Str[0]-'0';
453    else if (Str[0] >= 'a' && Str[0] <= 'z')
454      CharVal = Str[0]-'a'+10;
455    else if (Str[0] >= 'A' && Str[0] <= 'Z')
456      CharVal = Str[0]-'A'+10;
457    else
458      return true;
459
460    // If the parsed value is larger than the integer radix, the string is
461    // invalid.
462    if (CharVal >= Radix)
463      return true;
464
465    // Add in this character.
466    if (IsPowerOf2Radix) {
467      Result <<= Log2Radix;
468      Result |= CharVal;
469    } else {
470      Result *= RadixAP;
471      CharAP = CharVal;
472      Result += CharAP;
473    }
474
475    Str = Str.substr(1);
476  }
477
478  return false;
479}
480
481
482// Implementation of StringRef hashing.
483hash_code llvm::hash_value(StringRef S) {
484  return hash_combine_range(S.begin(), S.end());
485}
486