1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_STRING_SEARCH_H_
29#define V8_STRING_SEARCH_H_
30
31namespace v8 {
32namespace internal {
33
34
35//---------------------------------------------------------------------
36// String Search object.
37//---------------------------------------------------------------------
38
39// Class holding constants and methods that apply to all string search variants,
40// independently of subject and pattern char size.
41class StringSearchBase {
42 protected:
43  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44  // limit, we can fix the size of tables. For a needle longer than this limit,
45  // search will not be optimal, since we only build tables for a suffix
46  // of the string, but it is a safe approximation.
47  static const int kBMMaxShift = Isolate::kBMMaxShift;
48
49  // Reduce alphabet to this size.
50  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51  // proportional to the input alphabet. We reduce the alphabet size by
52  // equating input characters modulo a smaller alphabet size. This gives
53  // a potentially less efficient searching, but is a safe approximation.
54  // For needles using only characters in the same Unicode 256-code point page,
55  // there is no search speed degradation.
56  static const int kAsciiAlphabetSize = 128;
57  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
58
59  // Bad-char shift table stored in the state. It's length is the alphabet size.
60  // For patterns below this length, the skip length of Boyer-Moore is too short
61  // to compensate for the algorithmic overhead compared to simple brute force.
62  static const int kBMMinPatternLength = 7;
63
64  static inline bool IsAsciiString(Vector<const char>) {
65    return true;
66  }
67
68  static inline bool IsAsciiString(Vector<const uc16> string) {
69    return String::IsAscii(string.start(), string.length());
70  }
71
72  friend class Isolate;
73};
74
75
76template <typename PatternChar, typename SubjectChar>
77class StringSearch : private StringSearchBase {
78 public:
79  StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
80      : isolate_(isolate),
81        pattern_(pattern),
82        start_(Max(0, pattern.length() - kBMMaxShift)) {
83    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
84      if (!IsAsciiString(pattern_)) {
85        strategy_ = &FailSearch;
86        return;
87      }
88    }
89    int pattern_length = pattern_.length();
90    if (pattern_length < kBMMinPatternLength) {
91      if (pattern_length == 1) {
92        strategy_ = &SingleCharSearch;
93        return;
94      }
95      strategy_ = &LinearSearch;
96      return;
97    }
98    strategy_ = &InitialSearch;
99  }
100
101  int Search(Vector<const SubjectChar> subject, int index) {
102    return strategy_(this, subject, index);
103  }
104
105  static inline int AlphabetSize() {
106    if (sizeof(PatternChar) == 1) {
107      // ASCII needle.
108      return kAsciiAlphabetSize;
109    } else {
110      ASSERT(sizeof(PatternChar) == 2);
111      // UC16 needle.
112      return kUC16AlphabetSize;
113    }
114  }
115
116 private:
117  typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
118      StringSearch<PatternChar, SubjectChar>*,
119      Vector<const SubjectChar>,
120      int);
121
122  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
123                        Vector<const SubjectChar>,
124                        int) {
125    return -1;
126  }
127
128  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
129                              Vector<const SubjectChar> subject,
130                              int start_index);
131
132  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
133                          Vector<const SubjectChar> subject,
134                          int start_index);
135
136  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
137                           Vector<const SubjectChar> subject,
138                           int start_index);
139
140  static int BoyerMooreHorspoolSearch(
141      StringSearch<PatternChar, SubjectChar>* search,
142      Vector<const SubjectChar> subject,
143      int start_index);
144
145  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
146                              Vector<const SubjectChar> subject,
147                              int start_index);
148
149  void PopulateBoyerMooreHorspoolTable();
150
151  void PopulateBoyerMooreTable();
152
153  static inline int CharOccurrence(int* bad_char_occurrence,
154                                   SubjectChar char_code) {
155    if (sizeof(SubjectChar) == 1) {
156      return bad_char_occurrence[static_cast<int>(char_code)];
157    }
158    if (sizeof(PatternChar) == 1) {
159      if (static_cast<unsigned int>(char_code) > String::kMaxAsciiCharCodeU) {
160        return -1;
161      }
162      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
163    }
164    // Both pattern and subject are UC16. Reduce character to equivalence class.
165    int equiv_class = char_code % kUC16AlphabetSize;
166    return bad_char_occurrence[equiv_class];
167  }
168
169  // The following tables are shared by all searches.
170  // TODO(lrn): Introduce a way for a pattern to keep its tables
171  // between searches (e.g., for an Atom RegExp).
172
173  // Store for the BoyerMoore(Horspool) bad char shift table.
174  // Return a table covering the last kBMMaxShift+1 positions of
175  // pattern.
176  int* bad_char_table() {
177    return isolate_->bad_char_shift_table();
178  }
179
180  // Store for the BoyerMoore good suffix shift table.
181  int* good_suffix_shift_table() {
182    // Return biased pointer that maps the range  [start_..pattern_.length()
183    // to the kGoodSuffixShiftTable array.
184    return isolate_->good_suffix_shift_table() - start_;
185  }
186
187  // Table used temporarily while building the BoyerMoore good suffix
188  // shift table.
189  int* suffix_table() {
190    // Return biased pointer that maps the range  [start_..pattern_.length()
191    // to the kSuffixTable array.
192    return isolate_->suffix_table() - start_;
193  }
194
195  Isolate* isolate_;
196  // The pattern to search for.
197  Vector<const PatternChar> pattern_;
198  // Pointer to implementation of the search.
199  SearchFunction strategy_;
200  // Cache value of Max(0, pattern_length() - kBMMaxShift)
201  int start_;
202};
203
204
205//---------------------------------------------------------------------
206// Single Character Pattern Search Strategy
207//---------------------------------------------------------------------
208
209template <typename PatternChar, typename SubjectChar>
210int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
211    StringSearch<PatternChar, SubjectChar>* search,
212    Vector<const SubjectChar> subject,
213    int index) {
214  ASSERT_EQ(1, search->pattern_.length());
215  PatternChar pattern_first_char = search->pattern_[0];
216  int i = index;
217  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
218    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
219        memchr(subject.start() + i,
220               pattern_first_char,
221               subject.length() - i));
222    if (pos == NULL) return -1;
223    return static_cast<int>(pos - subject.start());
224  } else {
225    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
226      if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
227        return -1;
228      }
229    }
230    SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
231    int n = subject.length();
232    while (i < n) {
233      if (subject[i++] == search_char) return i - 1;
234    }
235    return -1;
236  }
237}
238
239//---------------------------------------------------------------------
240// Linear Search Strategy
241//---------------------------------------------------------------------
242
243
244template <typename PatternChar, typename SubjectChar>
245inline bool CharCompare(const PatternChar* pattern,
246                        const SubjectChar* subject,
247                        int length) {
248  ASSERT(length > 0);
249  int pos = 0;
250  do {
251    if (pattern[pos] != subject[pos]) {
252      return false;
253    }
254    pos++;
255  } while (pos < length);
256  return true;
257}
258
259
260// Simple linear search for short patterns. Never bails out.
261template <typename PatternChar, typename SubjectChar>
262int StringSearch<PatternChar, SubjectChar>::LinearSearch(
263    StringSearch<PatternChar, SubjectChar>* search,
264    Vector<const SubjectChar> subject,
265    int index) {
266  Vector<const PatternChar> pattern = search->pattern_;
267  ASSERT(pattern.length() > 1);
268  int pattern_length = pattern.length();
269  PatternChar pattern_first_char = pattern[0];
270  int i = index;
271  int n = subject.length() - pattern_length;
272  while (i <= n) {
273    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
274      const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
275          memchr(subject.start() + i,
276                 pattern_first_char,
277                 n - i + 1));
278      if (pos == NULL) return -1;
279      i = static_cast<int>(pos - subject.start()) + 1;
280    } else {
281      if (subject[i++] != pattern_first_char) continue;
282    }
283    // Loop extracted to separate function to allow using return to do
284    // a deeper break.
285    if (CharCompare(pattern.start() + 1,
286                    subject.start() + i,
287                    pattern_length - 1)) {
288      return i - 1;
289    }
290  }
291  return -1;
292}
293
294//---------------------------------------------------------------------
295// Boyer-Moore string search
296//---------------------------------------------------------------------
297
298template <typename PatternChar, typename SubjectChar>
299int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
300    StringSearch<PatternChar, SubjectChar>* search,
301    Vector<const SubjectChar> subject,
302    int start_index) {
303  Vector<const PatternChar> pattern = search->pattern_;
304  int subject_length = subject.length();
305  int pattern_length = pattern.length();
306  // Only preprocess at most kBMMaxShift last characters of pattern.
307  int start = search->start_;
308
309  int* bad_char_occurence = search->bad_char_table();
310  int* good_suffix_shift = search->good_suffix_shift_table();
311
312  PatternChar last_char = pattern[pattern_length - 1];
313  int index = start_index;
314  // Continue search from i.
315  while (index <= subject_length - pattern_length) {
316    int j = pattern_length - 1;
317    int c;
318    while (last_char != (c = subject[index + j])) {
319      int shift =
320          j - CharOccurrence(bad_char_occurence, c);
321      index += shift;
322      if (index > subject_length - pattern_length) {
323        return -1;
324      }
325    }
326    while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
327    if (j < 0) {
328      return index;
329    } else if (j < start) {
330      // we have matched more than our tables allow us to be smart about.
331      // Fall back on BMH shift.
332      index += pattern_length - 1
333          - CharOccurrence(bad_char_occurence,
334                           static_cast<SubjectChar>(last_char));
335    } else {
336      int gs_shift = good_suffix_shift[j + 1];
337      int bc_occ =
338          CharOccurrence(bad_char_occurence, c);
339      int shift = j - bc_occ;
340      if (gs_shift > shift) {
341        shift = gs_shift;
342      }
343      index += shift;
344    }
345  }
346
347  return -1;
348}
349
350
351template <typename PatternChar, typename SubjectChar>
352void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
353  int pattern_length = pattern_.length();
354  const PatternChar* pattern = pattern_.start();
355  // Only look at the last kBMMaxShift characters of pattern (from start_
356  // to pattern_length).
357  int start = start_;
358  int length = pattern_length - start;
359
360  // Biased tables so that we can use pattern indices as table indices,
361  // even if we only cover the part of the pattern from offset start.
362  int* shift_table = good_suffix_shift_table();
363  int* suffix_table = this->suffix_table();
364
365  // Initialize table.
366  for (int i = start; i < pattern_length; i++) {
367    shift_table[i] = length;
368  }
369  shift_table[pattern_length] = 1;
370  suffix_table[pattern_length] = pattern_length + 1;
371
372  if (pattern_length <= start) {
373    return;
374  }
375
376  // Find suffixes.
377  PatternChar last_char = pattern[pattern_length - 1];
378  int suffix = pattern_length + 1;
379  {
380    int i = pattern_length;
381    while (i > start) {
382      PatternChar c = pattern[i - 1];
383      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
384        if (shift_table[suffix] == length) {
385          shift_table[suffix] = suffix - i;
386        }
387        suffix = suffix_table[suffix];
388      }
389      suffix_table[--i] = --suffix;
390      if (suffix == pattern_length) {
391        // No suffix to extend, so we check against last_char only.
392        while ((i > start) && (pattern[i - 1] != last_char)) {
393          if (shift_table[pattern_length] == length) {
394            shift_table[pattern_length] = pattern_length - i;
395          }
396          suffix_table[--i] = pattern_length;
397        }
398        if (i > start) {
399          suffix_table[--i] = --suffix;
400        }
401      }
402    }
403  }
404  // Build shift table using suffixes.
405  if (suffix < pattern_length) {
406    for (int i = start; i <= pattern_length; i++) {
407      if (shift_table[i] == length) {
408        shift_table[i] = suffix - start;
409      }
410      if (i == suffix) {
411        suffix = suffix_table[suffix];
412      }
413    }
414  }
415}
416
417//---------------------------------------------------------------------
418// Boyer-Moore-Horspool string search.
419//---------------------------------------------------------------------
420
421template <typename PatternChar, typename SubjectChar>
422int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
423    StringSearch<PatternChar, SubjectChar>* search,
424    Vector<const SubjectChar> subject,
425    int start_index) {
426  Vector<const PatternChar> pattern = search->pattern_;
427  int subject_length = subject.length();
428  int pattern_length = pattern.length();
429  int* char_occurrences = search->bad_char_table();
430  int badness = -pattern_length;
431
432  // How bad we are doing without a good-suffix table.
433  PatternChar last_char = pattern[pattern_length - 1];
434  int last_char_shift = pattern_length - 1 -
435      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
436  // Perform search
437  int index = start_index;  // No matches found prior to this index.
438  while (index <= subject_length - pattern_length) {
439    int j = pattern_length - 1;
440    int subject_char;
441    while (last_char != (subject_char = subject[index + j])) {
442      int bc_occ = CharOccurrence(char_occurrences, subject_char);
443      int shift = j - bc_occ;
444      index += shift;
445      badness += 1 - shift;  // at most zero, so badness cannot increase.
446      if (index > subject_length - pattern_length) {
447        return -1;
448      }
449    }
450    j--;
451    while (j >= 0 && pattern[j] == (subject[index + j])) j--;
452    if (j < 0) {
453      return index;
454    } else {
455      index += last_char_shift;
456      // Badness increases by the number of characters we have
457      // checked, and decreases by the number of characters we
458      // can skip by shifting. It's a measure of how we are doing
459      // compared to reading each character exactly once.
460      badness += (pattern_length - j) - last_char_shift;
461      if (badness > 0) {
462        search->PopulateBoyerMooreTable();
463        search->strategy_ = &BoyerMooreSearch;
464        return BoyerMooreSearch(search, subject, index);
465      }
466    }
467  }
468  return -1;
469}
470
471
472template <typename PatternChar, typename SubjectChar>
473void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
474  int pattern_length = pattern_.length();
475
476  int* bad_char_occurrence = bad_char_table();
477
478  // Only preprocess at most kBMMaxShift last characters of pattern.
479  int start = start_;
480  // Run forwards to populate bad_char_table, so that *last* instance
481  // of character equivalence class is the one registered.
482  // Notice: Doesn't include the last character.
483  int table_size = AlphabetSize();
484  if (start == 0) {  // All patterns less than kBMMaxShift in length.
485    memset(bad_char_occurrence,
486           -1,
487           table_size * sizeof(*bad_char_occurrence));
488  } else {
489    for (int i = 0; i < table_size; i++) {
490      bad_char_occurrence[i] = start - 1;
491    }
492  }
493  for (int i = start; i < pattern_length - 1; i++) {
494    PatternChar c = pattern_[i];
495    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
496    bad_char_occurrence[bucket] = i;
497  }
498}
499
500//---------------------------------------------------------------------
501// Linear string search with bailout to BMH.
502//---------------------------------------------------------------------
503
504// Simple linear search for short patterns, which bails out if the string
505// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
506template <typename PatternChar, typename SubjectChar>
507int StringSearch<PatternChar, SubjectChar>::InitialSearch(
508    StringSearch<PatternChar, SubjectChar>* search,
509    Vector<const SubjectChar> subject,
510    int index) {
511  Vector<const PatternChar> pattern = search->pattern_;
512  int pattern_length = pattern.length();
513  // Badness is a count of how much work we have done.  When we have
514  // done enough work we decide it's probably worth switching to a better
515  // algorithm.
516  int badness = -10 - (pattern_length << 2);
517
518  // We know our pattern is at least 2 characters, we cache the first so
519  // the common case of the first character not matching is faster.
520  PatternChar pattern_first_char = pattern[0];
521  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
522    badness++;
523    if (badness <= 0) {
524      if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
525        const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
526            memchr(subject.start() + i,
527                   pattern_first_char,
528                   n - i + 1));
529        if (pos == NULL) {
530          return -1;
531        }
532        i = static_cast<int>(pos - subject.start());
533      } else {
534        if (subject[i] != pattern_first_char) continue;
535      }
536      int j = 1;
537      do {
538        if (pattern[j] != subject[i + j]) {
539          break;
540        }
541        j++;
542      } while (j < pattern_length);
543      if (j == pattern_length) {
544        return i;
545      }
546      badness += j;
547    } else {
548      search->PopulateBoyerMooreHorspoolTable();
549      search->strategy_ = &BoyerMooreHorspoolSearch;
550      return BoyerMooreHorspoolSearch(search, subject, i);
551    }
552  }
553  return -1;
554}
555
556
557// Perform a a single stand-alone search.
558// If searching multiple times for the same pattern, a search
559// object should be constructed once and the Search function then called
560// for each search.
561template <typename SubjectChar, typename PatternChar>
562int SearchString(Isolate* isolate,
563                 Vector<const SubjectChar> subject,
564                 Vector<const PatternChar> pattern,
565                 int start_index) {
566  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
567  return search.Search(subject, start_index);
568}
569
570}}  // namespace v8::internal
571
572#endif  // V8_STRING_SEARCH_H_
573