1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_STRING_SEARCH_H_
6#define V8_STRING_SEARCH_H_
7
8#include "src/isolate.h"
9#include "src/vector.h"
10
11namespace v8 {
12namespace internal {
13
14
15//---------------------------------------------------------------------
16// String Search object.
17//---------------------------------------------------------------------
18
19// Class holding constants and methods that apply to all string search variants,
20// independently of subject and pattern char size.
21class StringSearchBase {
22 protected:
23  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24  // limit, we can fix the size of tables. For a needle longer than this limit,
25  // search will not be optimal, since we only build tables for a suffix
26  // of the string, but it is a safe approximation.
27  static const int kBMMaxShift = Isolate::kBMMaxShift;
28
29  // Reduce alphabet to this size.
30  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31  // proportional to the input alphabet. We reduce the alphabet size by
32  // equating input characters modulo a smaller alphabet size. This gives
33  // a potentially less efficient searching, but is a safe approximation.
34  // For needles using only characters in the same Unicode 256-code point page,
35  // there is no search speed degradation.
36  static const int kLatin1AlphabetSize = 256;
37  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38
39  // Bad-char shift table stored in the state. It's length is the alphabet size.
40  // For patterns below this length, the skip length of Boyer-Moore is too short
41  // to compensate for the algorithmic overhead compared to simple brute force.
42  static const int kBMMinPatternLength = 7;
43
44  static inline bool IsOneByteString(Vector<const uint8_t> string) {
45    return true;
46  }
47
48  static inline bool IsOneByteString(Vector<const uc16> string) {
49    return String::IsOneByte(string.start(), string.length());
50  }
51
52  friend class Isolate;
53};
54
55
56template <typename PatternChar, typename SubjectChar>
57class StringSearch : private StringSearchBase {
58 public:
59  StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
60      : isolate_(isolate),
61        pattern_(pattern),
62        start_(Max(0, pattern.length() - kBMMaxShift)) {
63    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
64      if (!IsOneByteString(pattern_)) {
65        strategy_ = &FailSearch;
66        return;
67      }
68    }
69    int pattern_length = pattern_.length();
70    if (pattern_length < kBMMinPatternLength) {
71      if (pattern_length == 1) {
72        strategy_ = &SingleCharSearch;
73        return;
74      }
75      strategy_ = &LinearSearch;
76      return;
77    }
78    strategy_ = &InitialSearch;
79  }
80
81  int Search(Vector<const SubjectChar> subject, int index) {
82    return strategy_(this, subject, index);
83  }
84
85  static inline int AlphabetSize() {
86    if (sizeof(PatternChar) == 1) {
87      // Latin1 needle.
88      return kLatin1AlphabetSize;
89    } else {
90      DCHECK(sizeof(PatternChar) == 2);
91      // UC16 needle.
92      return kUC16AlphabetSize;
93    }
94  }
95
96 private:
97  typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
98      StringSearch<PatternChar, SubjectChar>*,
99      Vector<const SubjectChar>,
100      int);
101
102  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
103                        Vector<const SubjectChar>,
104                        int) {
105    return -1;
106  }
107
108  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
109                              Vector<const SubjectChar> subject,
110                              int start_index);
111
112  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
113                          Vector<const SubjectChar> subject,
114                          int start_index);
115
116  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
117                           Vector<const SubjectChar> subject,
118                           int start_index);
119
120  static int BoyerMooreHorspoolSearch(
121      StringSearch<PatternChar, SubjectChar>* search,
122      Vector<const SubjectChar> subject,
123      int start_index);
124
125  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
126                              Vector<const SubjectChar> subject,
127                              int start_index);
128
129  void PopulateBoyerMooreHorspoolTable();
130
131  void PopulateBoyerMooreTable();
132
133  static inline bool exceedsOneByte(uint8_t c) {
134    return false;
135  }
136
137  static inline bool exceedsOneByte(uint16_t c) {
138    return c > String::kMaxOneByteCharCodeU;
139  }
140
141  static inline int CharOccurrence(int* bad_char_occurrence,
142                                   SubjectChar char_code) {
143    if (sizeof(SubjectChar) == 1) {
144      return bad_char_occurrence[static_cast<int>(char_code)];
145    }
146    if (sizeof(PatternChar) == 1) {
147      if (exceedsOneByte(char_code)) {
148        return -1;
149      }
150      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
151    }
152    // Both pattern and subject are UC16. Reduce character to equivalence class.
153    int equiv_class = char_code % kUC16AlphabetSize;
154    return bad_char_occurrence[equiv_class];
155  }
156
157  // The following tables are shared by all searches.
158  // TODO(lrn): Introduce a way for a pattern to keep its tables
159  // between searches (e.g., for an Atom RegExp).
160
161  // Store for the BoyerMoore(Horspool) bad char shift table.
162  // Return a table covering the last kBMMaxShift+1 positions of
163  // pattern.
164  int* bad_char_table() {
165    return isolate_->bad_char_shift_table();
166  }
167
168  // Store for the BoyerMoore good suffix shift table.
169  int* good_suffix_shift_table() {
170    // Return biased pointer that maps the range  [start_..pattern_.length()
171    // to the kGoodSuffixShiftTable array.
172    return isolate_->good_suffix_shift_table() - start_;
173  }
174
175  // Table used temporarily while building the BoyerMoore good suffix
176  // shift table.
177  int* suffix_table() {
178    // Return biased pointer that maps the range  [start_..pattern_.length()
179    // to the kSuffixTable array.
180    return isolate_->suffix_table() - start_;
181  }
182
183  Isolate* isolate_;
184  // The pattern to search for.
185  Vector<const PatternChar> pattern_;
186  // Pointer to implementation of the search.
187  SearchFunction strategy_;
188  // Cache value of Max(0, pattern_length() - kBMMaxShift)
189  int start_;
190};
191
192
193template <typename T, typename U>
194inline T AlignDown(T value, U alignment) {
195  return reinterpret_cast<T>(
196      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
197}
198
199
200inline uint8_t GetHighestValueByte(uc16 character) {
201  return Max(static_cast<uint8_t>(character & 0xFF),
202             static_cast<uint8_t>(character >> 8));
203}
204
205
206inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
207
208
209template <typename PatternChar, typename SubjectChar>
210inline int FindFirstCharacter(Vector<const PatternChar> pattern,
211                              Vector<const SubjectChar> subject, int index) {
212  const PatternChar pattern_first_char = pattern[0];
213  const int max_n = (subject.length() - pattern.length() + 1);
214
215  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
216  const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
217  int pos = index;
218  do {
219    DCHECK_GE(max_n - pos, 0);
220    const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
221        memchr(subject.start() + pos, search_byte,
222               (max_n - pos) * sizeof(SubjectChar)));
223    if (char_pos == NULL) return -1;
224    char_pos = AlignDown(char_pos, sizeof(SubjectChar));
225    pos = static_cast<int>(char_pos - subject.start());
226    if (subject[pos] == search_char) return pos;
227  } while (++pos < max_n);
228
229  return -1;
230}
231
232
233//---------------------------------------------------------------------
234// Single Character Pattern Search Strategy
235//---------------------------------------------------------------------
236
237template <typename PatternChar, typename SubjectChar>
238int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
239    StringSearch<PatternChar, SubjectChar>* search,
240    Vector<const SubjectChar> subject,
241    int index) {
242  DCHECK_EQ(1, search->pattern_.length());
243  PatternChar pattern_first_char = search->pattern_[0];
244  if (sizeof(PatternChar) > sizeof(SubjectChar)) {
245    if (exceedsOneByte(pattern_first_char)) {
246      return -1;
247    }
248  }
249  return FindFirstCharacter(search->pattern_, subject, index);
250}
251
252//---------------------------------------------------------------------
253// Linear Search Strategy
254//---------------------------------------------------------------------
255
256
257template <typename PatternChar, typename SubjectChar>
258inline bool CharCompare(const PatternChar* pattern,
259                        const SubjectChar* subject,
260                        int length) {
261  DCHECK(length > 0);
262  int pos = 0;
263  do {
264    if (pattern[pos] != subject[pos]) {
265      return false;
266    }
267    pos++;
268  } while (pos < length);
269  return true;
270}
271
272
273// Simple linear search for short patterns. Never bails out.
274template <typename PatternChar, typename SubjectChar>
275int StringSearch<PatternChar, SubjectChar>::LinearSearch(
276    StringSearch<PatternChar, SubjectChar>* search,
277    Vector<const SubjectChar> subject,
278    int index) {
279  Vector<const PatternChar> pattern = search->pattern_;
280  DCHECK(pattern.length() > 1);
281  int pattern_length = pattern.length();
282  int i = index;
283  int n = subject.length() - pattern_length;
284  while (i <= n) {
285    i = FindFirstCharacter(pattern, subject, i);
286    if (i == -1) return -1;
287    DCHECK_LE(i, n);
288    i++;
289    // Loop extracted to separate function to allow using return to do
290    // a deeper break.
291    if (CharCompare(pattern.start() + 1,
292                    subject.start() + i,
293                    pattern_length - 1)) {
294      return i - 1;
295    }
296  }
297  return -1;
298}
299
300//---------------------------------------------------------------------
301// Boyer-Moore string search
302//---------------------------------------------------------------------
303
304template <typename PatternChar, typename SubjectChar>
305int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
306    StringSearch<PatternChar, SubjectChar>* search,
307    Vector<const SubjectChar> subject,
308    int start_index) {
309  Vector<const PatternChar> pattern = search->pattern_;
310  int subject_length = subject.length();
311  int pattern_length = pattern.length();
312  // Only preprocess at most kBMMaxShift last characters of pattern.
313  int start = search->start_;
314
315  int* bad_char_occurence = search->bad_char_table();
316  int* good_suffix_shift = search->good_suffix_shift_table();
317
318  PatternChar last_char = pattern[pattern_length - 1];
319  int index = start_index;
320  // Continue search from i.
321  while (index <= subject_length - pattern_length) {
322    int j = pattern_length - 1;
323    int c;
324    while (last_char != (c = subject[index + j])) {
325      int shift =
326          j - CharOccurrence(bad_char_occurence, c);
327      index += shift;
328      if (index > subject_length - pattern_length) {
329        return -1;
330      }
331    }
332    while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
333    if (j < 0) {
334      return index;
335    } else if (j < start) {
336      // we have matched more than our tables allow us to be smart about.
337      // Fall back on BMH shift.
338      index += pattern_length - 1
339          - CharOccurrence(bad_char_occurence,
340                           static_cast<SubjectChar>(last_char));
341    } else {
342      int gs_shift = good_suffix_shift[j + 1];
343      int bc_occ =
344          CharOccurrence(bad_char_occurence, c);
345      int shift = j - bc_occ;
346      if (gs_shift > shift) {
347        shift = gs_shift;
348      }
349      index += shift;
350    }
351  }
352
353  return -1;
354}
355
356
357template <typename PatternChar, typename SubjectChar>
358void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
359  int pattern_length = pattern_.length();
360  const PatternChar* pattern = pattern_.start();
361  // Only look at the last kBMMaxShift characters of pattern (from start_
362  // to pattern_length).
363  int start = start_;
364  int length = pattern_length - start;
365
366  // Biased tables so that we can use pattern indices as table indices,
367  // even if we only cover the part of the pattern from offset start.
368  int* shift_table = good_suffix_shift_table();
369  int* suffix_table = this->suffix_table();
370
371  // Initialize table.
372  for (int i = start; i < pattern_length; i++) {
373    shift_table[i] = length;
374  }
375  shift_table[pattern_length] = 1;
376  suffix_table[pattern_length] = pattern_length + 1;
377
378  if (pattern_length <= start) {
379    return;
380  }
381
382  // Find suffixes.
383  PatternChar last_char = pattern[pattern_length - 1];
384  int suffix = pattern_length + 1;
385  {
386    int i = pattern_length;
387    while (i > start) {
388      PatternChar c = pattern[i - 1];
389      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
390        if (shift_table[suffix] == length) {
391          shift_table[suffix] = suffix - i;
392        }
393        suffix = suffix_table[suffix];
394      }
395      suffix_table[--i] = --suffix;
396      if (suffix == pattern_length) {
397        // No suffix to extend, so we check against last_char only.
398        while ((i > start) && (pattern[i - 1] != last_char)) {
399          if (shift_table[pattern_length] == length) {
400            shift_table[pattern_length] = pattern_length - i;
401          }
402          suffix_table[--i] = pattern_length;
403        }
404        if (i > start) {
405          suffix_table[--i] = --suffix;
406        }
407      }
408    }
409  }
410  // Build shift table using suffixes.
411  if (suffix < pattern_length) {
412    for (int i = start; i <= pattern_length; i++) {
413      if (shift_table[i] == length) {
414        shift_table[i] = suffix - start;
415      }
416      if (i == suffix) {
417        suffix = suffix_table[suffix];
418      }
419    }
420  }
421}
422
423//---------------------------------------------------------------------
424// Boyer-Moore-Horspool string search.
425//---------------------------------------------------------------------
426
427template <typename PatternChar, typename SubjectChar>
428int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
429    StringSearch<PatternChar, SubjectChar>* search,
430    Vector<const SubjectChar> subject,
431    int start_index) {
432  Vector<const PatternChar> pattern = search->pattern_;
433  int subject_length = subject.length();
434  int pattern_length = pattern.length();
435  int* char_occurrences = search->bad_char_table();
436  int badness = -pattern_length;
437
438  // How bad we are doing without a good-suffix table.
439  PatternChar last_char = pattern[pattern_length - 1];
440  int last_char_shift = pattern_length - 1 -
441      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
442  // Perform search
443  int index = start_index;  // No matches found prior to this index.
444  while (index <= subject_length - pattern_length) {
445    int j = pattern_length - 1;
446    int subject_char;
447    while (last_char != (subject_char = subject[index + j])) {
448      int bc_occ = CharOccurrence(char_occurrences, subject_char);
449      int shift = j - bc_occ;
450      index += shift;
451      badness += 1 - shift;  // at most zero, so badness cannot increase.
452      if (index > subject_length - pattern_length) {
453        return -1;
454      }
455    }
456    j--;
457    while (j >= 0 && pattern[j] == (subject[index + j])) j--;
458    if (j < 0) {
459      return index;
460    } else {
461      index += last_char_shift;
462      // Badness increases by the number of characters we have
463      // checked, and decreases by the number of characters we
464      // can skip by shifting. It's a measure of how we are doing
465      // compared to reading each character exactly once.
466      badness += (pattern_length - j) - last_char_shift;
467      if (badness > 0) {
468        search->PopulateBoyerMooreTable();
469        search->strategy_ = &BoyerMooreSearch;
470        return BoyerMooreSearch(search, subject, index);
471      }
472    }
473  }
474  return -1;
475}
476
477
478template <typename PatternChar, typename SubjectChar>
479void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
480  int pattern_length = pattern_.length();
481
482  int* bad_char_occurrence = bad_char_table();
483
484  // Only preprocess at most kBMMaxShift last characters of pattern.
485  int start = start_;
486  // Run forwards to populate bad_char_table, so that *last* instance
487  // of character equivalence class is the one registered.
488  // Notice: Doesn't include the last character.
489  int table_size = AlphabetSize();
490  if (start == 0) {  // All patterns less than kBMMaxShift in length.
491    memset(bad_char_occurrence,
492           -1,
493           table_size * sizeof(*bad_char_occurrence));
494  } else {
495    for (int i = 0; i < table_size; i++) {
496      bad_char_occurrence[i] = start - 1;
497    }
498  }
499  for (int i = start; i < pattern_length - 1; i++) {
500    PatternChar c = pattern_[i];
501    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
502    bad_char_occurrence[bucket] = i;
503  }
504}
505
506//---------------------------------------------------------------------
507// Linear string search with bailout to BMH.
508//---------------------------------------------------------------------
509
510// Simple linear search for short patterns, which bails out if the string
511// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
512template <typename PatternChar, typename SubjectChar>
513int StringSearch<PatternChar, SubjectChar>::InitialSearch(
514    StringSearch<PatternChar, SubjectChar>* search,
515    Vector<const SubjectChar> subject,
516    int index) {
517  Vector<const PatternChar> pattern = search->pattern_;
518  int pattern_length = pattern.length();
519  // Badness is a count of how much work we have done.  When we have
520  // done enough work we decide it's probably worth switching to a better
521  // algorithm.
522  int badness = -10 - (pattern_length << 2);
523
524  // We know our pattern is at least 2 characters, we cache the first so
525  // the common case of the first character not matching is faster.
526  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
527    badness++;
528    if (badness <= 0) {
529      i = FindFirstCharacter(pattern, subject, i);
530      if (i == -1) return -1;
531      DCHECK_LE(i, n);
532      int j = 1;
533      do {
534        if (pattern[j] != subject[i + j]) {
535          break;
536        }
537        j++;
538      } while (j < pattern_length);
539      if (j == pattern_length) {
540        return i;
541      }
542      badness += j;
543    } else {
544      search->PopulateBoyerMooreHorspoolTable();
545      search->strategy_ = &BoyerMooreHorspoolSearch;
546      return BoyerMooreHorspoolSearch(search, subject, i);
547    }
548  }
549  return -1;
550}
551
552
553// Perform a a single stand-alone search.
554// If searching multiple times for the same pattern, a search
555// object should be constructed once and the Search function then called
556// for each search.
557template <typename SubjectChar, typename PatternChar>
558int SearchString(Isolate* isolate,
559                 Vector<const SubjectChar> subject,
560                 Vector<const PatternChar> pattern,
561                 int start_index) {
562  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
563  return search.Search(subject, start_index);
564}
565
566}  // namespace internal
567}  // namespace v8
568
569#endif  // V8_STRING_SEARCH_H_
570