1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_STRING_SEARCH_H_
6#define V8_STRING_SEARCH_H_
7
8namespace v8 {
9namespace internal {
10
11
12//---------------------------------------------------------------------
13// String Search object.
14//---------------------------------------------------------------------
15
16// Class holding constants and methods that apply to all string search variants,
17// independently of subject and pattern char size.
18class StringSearchBase {
19 protected:
20  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
21  // limit, we can fix the size of tables. For a needle longer than this limit,
22  // search will not be optimal, since we only build tables for a suffix
23  // of the string, but it is a safe approximation.
24  static const int kBMMaxShift = Isolate::kBMMaxShift;
25
26  // Reduce alphabet to this size.
27  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
28  // proportional to the input alphabet. We reduce the alphabet size by
29  // equating input characters modulo a smaller alphabet size. This gives
30  // a potentially less efficient searching, but is a safe approximation.
31  // For needles using only characters in the same Unicode 256-code point page,
32  // there is no search speed degradation.
33  static const int kLatin1AlphabetSize = 256;
34  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
35
36  // Bad-char shift table stored in the state. It's length is the alphabet size.
37  // For patterns below this length, the skip length of Boyer-Moore is too short
38  // to compensate for the algorithmic overhead compared to simple brute force.
39  static const int kBMMinPatternLength = 7;
40
41  static inline bool IsOneByteString(Vector<const uint8_t> string) {
42    return true;
43  }
44
45  static inline bool IsOneByteString(Vector<const uc16> string) {
46    return String::IsOneByte(string.start(), string.length());
47  }
48
49  friend class Isolate;
50};
51
52
53template <typename PatternChar, typename SubjectChar>
54class StringSearch : private StringSearchBase {
55 public:
56  StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
57      : isolate_(isolate),
58        pattern_(pattern),
59        start_(Max(0, pattern.length() - kBMMaxShift)) {
60    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
61      if (!IsOneByteString(pattern_)) {
62        strategy_ = &FailSearch;
63        return;
64      }
65    }
66    int pattern_length = pattern_.length();
67    if (pattern_length < kBMMinPatternLength) {
68      if (pattern_length == 1) {
69        strategy_ = &SingleCharSearch;
70        return;
71      }
72      strategy_ = &LinearSearch;
73      return;
74    }
75    strategy_ = &InitialSearch;
76  }
77
78  int Search(Vector<const SubjectChar> subject, int index) {
79    return strategy_(this, subject, index);
80  }
81
82  static inline int AlphabetSize() {
83    if (sizeof(PatternChar) == 1) {
84      // Latin1 needle.
85      return kLatin1AlphabetSize;
86    } else {
87      DCHECK(sizeof(PatternChar) == 2);
88      // UC16 needle.
89      return kUC16AlphabetSize;
90    }
91  }
92
93 private:
94  typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
95      StringSearch<PatternChar, SubjectChar>*,
96      Vector<const SubjectChar>,
97      int);
98
99  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100                        Vector<const SubjectChar>,
101                        int) {
102    return -1;
103  }
104
105  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
106                              Vector<const SubjectChar> subject,
107                              int start_index);
108
109  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
110                          Vector<const SubjectChar> subject,
111                          int start_index);
112
113  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
114                           Vector<const SubjectChar> subject,
115                           int start_index);
116
117  static int BoyerMooreHorspoolSearch(
118      StringSearch<PatternChar, SubjectChar>* search,
119      Vector<const SubjectChar> subject,
120      int start_index);
121
122  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
123                              Vector<const SubjectChar> subject,
124                              int start_index);
125
126  void PopulateBoyerMooreHorspoolTable();
127
128  void PopulateBoyerMooreTable();
129
130  static inline bool exceedsOneByte(uint8_t c) {
131    return false;
132  }
133
134  static inline bool exceedsOneByte(uint16_t c) {
135    return c > String::kMaxOneByteCharCodeU;
136  }
137
138  static inline int CharOccurrence(int* bad_char_occurrence,
139                                   SubjectChar char_code) {
140    if (sizeof(SubjectChar) == 1) {
141      return bad_char_occurrence[static_cast<int>(char_code)];
142    }
143    if (sizeof(PatternChar) == 1) {
144      if (exceedsOneByte(char_code)) {
145        return -1;
146      }
147      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
148    }
149    // Both pattern and subject are UC16. Reduce character to equivalence class.
150    int equiv_class = char_code % kUC16AlphabetSize;
151    return bad_char_occurrence[equiv_class];
152  }
153
154  // The following tables are shared by all searches.
155  // TODO(lrn): Introduce a way for a pattern to keep its tables
156  // between searches (e.g., for an Atom RegExp).
157
158  // Store for the BoyerMoore(Horspool) bad char shift table.
159  // Return a table covering the last kBMMaxShift+1 positions of
160  // pattern.
161  int* bad_char_table() {
162    return isolate_->bad_char_shift_table();
163  }
164
165  // Store for the BoyerMoore good suffix shift table.
166  int* good_suffix_shift_table() {
167    // Return biased pointer that maps the range  [start_..pattern_.length()
168    // to the kGoodSuffixShiftTable array.
169    return isolate_->good_suffix_shift_table() - start_;
170  }
171
172  // Table used temporarily while building the BoyerMoore good suffix
173  // shift table.
174  int* suffix_table() {
175    // Return biased pointer that maps the range  [start_..pattern_.length()
176    // to the kSuffixTable array.
177    return isolate_->suffix_table() - start_;
178  }
179
180  Isolate* isolate_;
181  // The pattern to search for.
182  Vector<const PatternChar> pattern_;
183  // Pointer to implementation of the search.
184  SearchFunction strategy_;
185  // Cache value of Max(0, pattern_length() - kBMMaxShift)
186  int start_;
187};
188
189
190//---------------------------------------------------------------------
191// Single Character Pattern Search Strategy
192//---------------------------------------------------------------------
193
194template <typename PatternChar, typename SubjectChar>
195int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
196    StringSearch<PatternChar, SubjectChar>* search,
197    Vector<const SubjectChar> subject,
198    int index) {
199  DCHECK_EQ(1, search->pattern_.length());
200  PatternChar pattern_first_char = search->pattern_[0];
201  int i = index;
202  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
203    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
204        memchr(subject.start() + i,
205               pattern_first_char,
206               subject.length() - i));
207    if (pos == NULL) return -1;
208    return static_cast<int>(pos - subject.start());
209  } else {
210    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
211      if (exceedsOneByte(pattern_first_char)) {
212        return -1;
213      }
214    }
215    SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216    int n = subject.length();
217    while (i < n) {
218      if (subject[i++] == search_char) return i - 1;
219    }
220    return -1;
221  }
222}
223
224//---------------------------------------------------------------------
225// Linear Search Strategy
226//---------------------------------------------------------------------
227
228
229template <typename PatternChar, typename SubjectChar>
230inline bool CharCompare(const PatternChar* pattern,
231                        const SubjectChar* subject,
232                        int length) {
233  DCHECK(length > 0);
234  int pos = 0;
235  do {
236    if (pattern[pos] != subject[pos]) {
237      return false;
238    }
239    pos++;
240  } while (pos < length);
241  return true;
242}
243
244
245// Simple linear search for short patterns. Never bails out.
246template <typename PatternChar, typename SubjectChar>
247int StringSearch<PatternChar, SubjectChar>::LinearSearch(
248    StringSearch<PatternChar, SubjectChar>* search,
249    Vector<const SubjectChar> subject,
250    int index) {
251  Vector<const PatternChar> pattern = search->pattern_;
252  DCHECK(pattern.length() > 1);
253  int pattern_length = pattern.length();
254  PatternChar pattern_first_char = pattern[0];
255  int i = index;
256  int n = subject.length() - pattern_length;
257  while (i <= n) {
258    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
259      const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
260          memchr(subject.start() + i,
261                 pattern_first_char,
262                 n - i + 1));
263      if (pos == NULL) return -1;
264      i = static_cast<int>(pos - subject.start()) + 1;
265    } else {
266      if (subject[i++] != pattern_first_char) continue;
267    }
268    // Loop extracted to separate function to allow using return to do
269    // a deeper break.
270    if (CharCompare(pattern.start() + 1,
271                    subject.start() + i,
272                    pattern_length - 1)) {
273      return i - 1;
274    }
275  }
276  return -1;
277}
278
279//---------------------------------------------------------------------
280// Boyer-Moore string search
281//---------------------------------------------------------------------
282
283template <typename PatternChar, typename SubjectChar>
284int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
285    StringSearch<PatternChar, SubjectChar>* search,
286    Vector<const SubjectChar> subject,
287    int start_index) {
288  Vector<const PatternChar> pattern = search->pattern_;
289  int subject_length = subject.length();
290  int pattern_length = pattern.length();
291  // Only preprocess at most kBMMaxShift last characters of pattern.
292  int start = search->start_;
293
294  int* bad_char_occurence = search->bad_char_table();
295  int* good_suffix_shift = search->good_suffix_shift_table();
296
297  PatternChar last_char = pattern[pattern_length - 1];
298  int index = start_index;
299  // Continue search from i.
300  while (index <= subject_length - pattern_length) {
301    int j = pattern_length - 1;
302    int c;
303    while (last_char != (c = subject[index + j])) {
304      int shift =
305          j - CharOccurrence(bad_char_occurence, c);
306      index += shift;
307      if (index > subject_length - pattern_length) {
308        return -1;
309      }
310    }
311    while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
312    if (j < 0) {
313      return index;
314    } else if (j < start) {
315      // we have matched more than our tables allow us to be smart about.
316      // Fall back on BMH shift.
317      index += pattern_length - 1
318          - CharOccurrence(bad_char_occurence,
319                           static_cast<SubjectChar>(last_char));
320    } else {
321      int gs_shift = good_suffix_shift[j + 1];
322      int bc_occ =
323          CharOccurrence(bad_char_occurence, c);
324      int shift = j - bc_occ;
325      if (gs_shift > shift) {
326        shift = gs_shift;
327      }
328      index += shift;
329    }
330  }
331
332  return -1;
333}
334
335
336template <typename PatternChar, typename SubjectChar>
337void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
338  int pattern_length = pattern_.length();
339  const PatternChar* pattern = pattern_.start();
340  // Only look at the last kBMMaxShift characters of pattern (from start_
341  // to pattern_length).
342  int start = start_;
343  int length = pattern_length - start;
344
345  // Biased tables so that we can use pattern indices as table indices,
346  // even if we only cover the part of the pattern from offset start.
347  int* shift_table = good_suffix_shift_table();
348  int* suffix_table = this->suffix_table();
349
350  // Initialize table.
351  for (int i = start; i < pattern_length; i++) {
352    shift_table[i] = length;
353  }
354  shift_table[pattern_length] = 1;
355  suffix_table[pattern_length] = pattern_length + 1;
356
357  if (pattern_length <= start) {
358    return;
359  }
360
361  // Find suffixes.
362  PatternChar last_char = pattern[pattern_length - 1];
363  int suffix = pattern_length + 1;
364  {
365    int i = pattern_length;
366    while (i > start) {
367      PatternChar c = pattern[i - 1];
368      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
369        if (shift_table[suffix] == length) {
370          shift_table[suffix] = suffix - i;
371        }
372        suffix = suffix_table[suffix];
373      }
374      suffix_table[--i] = --suffix;
375      if (suffix == pattern_length) {
376        // No suffix to extend, so we check against last_char only.
377        while ((i > start) && (pattern[i - 1] != last_char)) {
378          if (shift_table[pattern_length] == length) {
379            shift_table[pattern_length] = pattern_length - i;
380          }
381          suffix_table[--i] = pattern_length;
382        }
383        if (i > start) {
384          suffix_table[--i] = --suffix;
385        }
386      }
387    }
388  }
389  // Build shift table using suffixes.
390  if (suffix < pattern_length) {
391    for (int i = start; i <= pattern_length; i++) {
392      if (shift_table[i] == length) {
393        shift_table[i] = suffix - start;
394      }
395      if (i == suffix) {
396        suffix = suffix_table[suffix];
397      }
398    }
399  }
400}
401
402//---------------------------------------------------------------------
403// Boyer-Moore-Horspool string search.
404//---------------------------------------------------------------------
405
406template <typename PatternChar, typename SubjectChar>
407int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
408    StringSearch<PatternChar, SubjectChar>* search,
409    Vector<const SubjectChar> subject,
410    int start_index) {
411  Vector<const PatternChar> pattern = search->pattern_;
412  int subject_length = subject.length();
413  int pattern_length = pattern.length();
414  int* char_occurrences = search->bad_char_table();
415  int badness = -pattern_length;
416
417  // How bad we are doing without a good-suffix table.
418  PatternChar last_char = pattern[pattern_length - 1];
419  int last_char_shift = pattern_length - 1 -
420      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
421  // Perform search
422  int index = start_index;  // No matches found prior to this index.
423  while (index <= subject_length - pattern_length) {
424    int j = pattern_length - 1;
425    int subject_char;
426    while (last_char != (subject_char = subject[index + j])) {
427      int bc_occ = CharOccurrence(char_occurrences, subject_char);
428      int shift = j - bc_occ;
429      index += shift;
430      badness += 1 - shift;  // at most zero, so badness cannot increase.
431      if (index > subject_length - pattern_length) {
432        return -1;
433      }
434    }
435    j--;
436    while (j >= 0 && pattern[j] == (subject[index + j])) j--;
437    if (j < 0) {
438      return index;
439    } else {
440      index += last_char_shift;
441      // Badness increases by the number of characters we have
442      // checked, and decreases by the number of characters we
443      // can skip by shifting. It's a measure of how we are doing
444      // compared to reading each character exactly once.
445      badness += (pattern_length - j) - last_char_shift;
446      if (badness > 0) {
447        search->PopulateBoyerMooreTable();
448        search->strategy_ = &BoyerMooreSearch;
449        return BoyerMooreSearch(search, subject, index);
450      }
451    }
452  }
453  return -1;
454}
455
456
457template <typename PatternChar, typename SubjectChar>
458void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
459  int pattern_length = pattern_.length();
460
461  int* bad_char_occurrence = bad_char_table();
462
463  // Only preprocess at most kBMMaxShift last characters of pattern.
464  int start = start_;
465  // Run forwards to populate bad_char_table, so that *last* instance
466  // of character equivalence class is the one registered.
467  // Notice: Doesn't include the last character.
468  int table_size = AlphabetSize();
469  if (start == 0) {  // All patterns less than kBMMaxShift in length.
470    memset(bad_char_occurrence,
471           -1,
472           table_size * sizeof(*bad_char_occurrence));
473  } else {
474    for (int i = 0; i < table_size; i++) {
475      bad_char_occurrence[i] = start - 1;
476    }
477  }
478  for (int i = start; i < pattern_length - 1; i++) {
479    PatternChar c = pattern_[i];
480    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481    bad_char_occurrence[bucket] = i;
482  }
483}
484
485//---------------------------------------------------------------------
486// Linear string search with bailout to BMH.
487//---------------------------------------------------------------------
488
489// Simple linear search for short patterns, which bails out if the string
490// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
491template <typename PatternChar, typename SubjectChar>
492int StringSearch<PatternChar, SubjectChar>::InitialSearch(
493    StringSearch<PatternChar, SubjectChar>* search,
494    Vector<const SubjectChar> subject,
495    int index) {
496  Vector<const PatternChar> pattern = search->pattern_;
497  int pattern_length = pattern.length();
498  // Badness is a count of how much work we have done.  When we have
499  // done enough work we decide it's probably worth switching to a better
500  // algorithm.
501  int badness = -10 - (pattern_length << 2);
502
503  // We know our pattern is at least 2 characters, we cache the first so
504  // the common case of the first character not matching is faster.
505  PatternChar pattern_first_char = pattern[0];
506  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
507    badness++;
508    if (badness <= 0) {
509      if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
510        const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
511            memchr(subject.start() + i,
512                   pattern_first_char,
513                   n - i + 1));
514        if (pos == NULL) {
515          return -1;
516        }
517        i = static_cast<int>(pos - subject.start());
518      } else {
519        if (subject[i] != pattern_first_char) continue;
520      }
521      int j = 1;
522      do {
523        if (pattern[j] != subject[i + j]) {
524          break;
525        }
526        j++;
527      } while (j < pattern_length);
528      if (j == pattern_length) {
529        return i;
530      }
531      badness += j;
532    } else {
533      search->PopulateBoyerMooreHorspoolTable();
534      search->strategy_ = &BoyerMooreHorspoolSearch;
535      return BoyerMooreHorspoolSearch(search, subject, i);
536    }
537  }
538  return -1;
539}
540
541
542// Perform a a single stand-alone search.
543// If searching multiple times for the same pattern, a search
544// object should be constructed once and the Search function then called
545// for each search.
546template <typename SubjectChar, typename PatternChar>
547int SearchString(Isolate* isolate,
548                 Vector<const SubjectChar> subject,
549                 Vector<const PatternChar> pattern,
550                 int start_index) {
551  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
552  return search.Search(subject, start_index);
553}
554
555}}  // namespace v8::internal
556
557#endif  // V8_STRING_SEARCH_H_
558