11b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org// Copyright 2011 the V8 project authors. All rights reserved.
23484964a86451e86dcf04be9bd8c0d76ee04f081rossberg@chromium.org// Use of this source code is governed by a BSD-style license that can be
33484964a86451e86dcf04be9bd8c0d76ee04f081rossberg@chromium.org// found in the LICENSE file.
42ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
52ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org#ifndef V8_STRING_SEARCH_H_
62ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org#define V8_STRING_SEARCH_H_
72ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
82ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgnamespace v8 {
92ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgnamespace internal {
102ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
112ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
12f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
13f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// String Search object.
14f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
15f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
16f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Class holding constants and methods that apply to all string search variants,
17f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// independently of subject and pattern char size.
18f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgclass StringSearchBase {
19f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org protected:
20f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
21f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // limit, we can fix the size of tables. For a needle longer than this limit,
22f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // search will not be optimal, since we only build tables for a suffix
23f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // of the string, but it is a safe approximation.
24ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  static const int kBMMaxShift = Isolate::kBMMaxShift;
25f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
26f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Reduce alphabet to this size.
27f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
28f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // proportional to the input alphabet. We reduce the alphabet size by
29f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // equating input characters modulo a smaller alphabet size. This gives
30f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // a potentially less efficient searching, but is a safe approximation.
31f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // For needles using only characters in the same Unicode 256-code point page,
32f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // there is no search speed degradation.
332c81ceb7f1e1ccf5f304be0646f4c1375941a7f2machenbach@chromium.org  static const int kLatin1AlphabetSize = 256;
34ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
35f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
36f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Bad-char shift table stored in the state. It's length is the alphabet size.
37f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // For patterns below this length, the skip length of Boyer-Moore is too short
38f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // to compensate for the algorithmic overhead compared to simple brute force.
39f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static const int kBMMinPatternLength = 7;
40f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
4159297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org  static inline bool IsOneByteString(Vector<const uint8_t> string) {
42f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    return true;
43f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
44f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
4559297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org  static inline bool IsOneByteString(Vector<const uc16> string) {
4659297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org    return String::IsOneByte(string.start(), string.length());
47f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
48f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
49ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  friend class Isolate;
50f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org};
51f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
52f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
53f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
54f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgclass StringSearch : private StringSearchBase {
552ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org public:
56ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
57ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org      : isolate_(isolate),
58ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org        pattern_(pattern),
59f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        start_(Max(0, pattern.length() - kBMMaxShift)) {
60f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
6159297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org      if (!IsOneByteString(pattern_)) {
62f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        strategy_ = &FailSearch;
63f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return;
64f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      }
65f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
66f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int pattern_length = pattern_.length();
67f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (pattern_length < kBMMinPatternLength) {
68f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (pattern_length == 1) {
69f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        strategy_ = &SingleCharSearch;
70f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return;
71f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      }
72f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      strategy_ = &LinearSearch;
73f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return;
742ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
75f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    strategy_ = &InitialSearch;
762ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
77f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
78f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int Search(Vector<const SubjectChar> subject, int index) {
79f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    return strategy_(this, subject, index);
802ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
81f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
82f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static inline int AlphabetSize() {
83f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (sizeof(PatternChar) == 1) {
842c81ceb7f1e1ccf5f304be0646f4c1375941a7f2machenbach@chromium.org      // Latin1 needle.
852c81ceb7f1e1ccf5f304be0646f4c1375941a7f2machenbach@chromium.org      return kLatin1AlphabetSize;
86f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    } else {
87e3c177a423baa3c30225c4e422b6f6c76d38b951machenbach@chromium.org      DCHECK(sizeof(PatternChar) == 2);
88f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      // UC16 needle.
89f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return kUC16AlphabetSize;
90f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
912ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
92f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
932ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org private:
94f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
95f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      StringSearch<PatternChar, SubjectChar>*,
96f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      Vector<const SubjectChar>,
97f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int);
98f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
99f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
100f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                        Vector<const SubjectChar>,
101f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                        int) {
102f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    return -1;
103f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
1042ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
105f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
106f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                              Vector<const SubjectChar> subject,
107f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                              int start_index);
108f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
109f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
110f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                          Vector<const SubjectChar> subject,
111f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                          int start_index);
112f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
113f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
114f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                           Vector<const SubjectChar> subject,
115f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                           int start_index);
116f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
117f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static int BoyerMooreHorspoolSearch(
118f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      StringSearch<PatternChar, SubjectChar>* search,
119f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      Vector<const SubjectChar> subject,
120f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int start_index);
121f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
122f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
123f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                              Vector<const SubjectChar> subject,
124f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                              int start_index);
125f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
126f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  void PopulateBoyerMooreHorspoolTable();
127f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
128f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  void PopulateBoyerMooreTable();
129f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
13059297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org  static inline bool exceedsOneByte(uint8_t c) {
13159297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org    return false;
13259297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org  }
13359297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org
13459297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org  static inline bool exceedsOneByte(uint16_t c) {
13559297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org    return c > String::kMaxOneByteCharCodeU;
13659297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org  }
13759297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org
138f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  static inline int CharOccurrence(int* bad_char_occurrence,
139f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                                   SubjectChar char_code) {
140f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (sizeof(SubjectChar) == 1) {
141f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return bad_char_occurrence[static_cast<int>(char_code)];
142f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
143f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (sizeof(PatternChar) == 1) {
14459297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org      if (exceedsOneByte(char_code)) {
145f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return -1;
146f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      }
147ec7067e20f87c4d63531d5cb80cb942eb960a610kmillikin@chromium.org      return bad_char_occurrence[static_cast<unsigned int>(char_code)];
148f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
149ec7067e20f87c4d63531d5cb80cb942eb960a610kmillikin@chromium.org    // Both pattern and subject are UC16. Reduce character to equivalence class.
150f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int equiv_class = char_code % kUC16AlphabetSize;
151f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    return bad_char_occurrence[equiv_class];
152f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
153f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
154ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // The following tables are shared by all searches.
155ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // TODO(lrn): Introduce a way for a pattern to keep its tables
156ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // between searches (e.g., for an Atom RegExp).
157ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org
158ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // Store for the BoyerMoore(Horspool) bad char shift table.
159f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Return a table covering the last kBMMaxShift+1 positions of
160f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // pattern.
161f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* bad_char_table() {
162ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org    return isolate_->bad_char_shift_table();
163f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
164f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
165ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // Store for the BoyerMoore good suffix shift table.
166f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* good_suffix_shift_table() {
167f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    // Return biased pointer that maps the range  [start_..pattern_.length()
168f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    // to the kGoodSuffixShiftTable array.
169ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org    return isolate_->good_suffix_shift_table() - start_;
170f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
171f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
172ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // Table used temporarily while building the BoyerMoore good suffix
173ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  // shift table.
174f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* suffix_table() {
175f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    // Return biased pointer that maps the range  [start_..pattern_.length()
176f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    // to the kSuffixTable array.
177ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org    return isolate_->suffix_table() - start_;
178f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
179f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
180ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  Isolate* isolate_;
181f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // The pattern to search for.
182f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  Vector<const PatternChar> pattern_;
183f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Pointer to implementation of the search.
184f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  SearchFunction strategy_;
185f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Cache value of Max(0, pattern_length() - kBMMaxShift)
186f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int start_;
1872ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org};
1882ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
1892ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
190f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
191f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Single Character Pattern Search Strategy
192f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
1932ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
194f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
195f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
196f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    StringSearch<PatternChar, SubjectChar>* search,
197f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    Vector<const SubjectChar> subject,
198f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int index) {
199e3c177a423baa3c30225c4e422b6f6c76d38b951machenbach@chromium.org  DCHECK_EQ(1, search->pattern_.length());
200f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  PatternChar pattern_first_char = search->pattern_[0];
201f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int i = index;
202f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
203f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
204f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        memchr(subject.start() + i,
205f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org               pattern_first_char,
206f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org               subject.length() - i));
207f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (pos == NULL) return -1;
208f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    return static_cast<int>(pos - subject.start());
2092ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  } else {
210f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (sizeof(PatternChar) > sizeof(SubjectChar)) {
21159297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org      if (exceedsOneByte(pattern_first_char)) {
212f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return -1;
213f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      }
2142ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
215f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int n = subject.length();
217f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    while (i < n) {
218f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (subject[i++] == search_char) return i - 1;
219f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
220f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    return -1;
2212ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
222f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org}
223f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
224f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
225f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Linear Search Strategy
226f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
227f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
228f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
229f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
2301b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.orginline bool CharCompare(const PatternChar* pattern,
2311b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org                        const SubjectChar* subject,
2321b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org                        int length) {
233e3c177a423baa3c30225c4e422b6f6c76d38b951machenbach@chromium.org  DCHECK(length > 0);
234f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int pos = 0;
235f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  do {
236f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (pattern[pos] != subject[pos]) {
237f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return false;
238f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
239f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    pos++;
240f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  } while (pos < length);
241f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  return true;
242f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org}
243f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
244f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
245f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Simple linear search for short patterns. Never bails out.
246f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
247f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::LinearSearch(
248f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    StringSearch<PatternChar, SubjectChar>* search,
249f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    Vector<const SubjectChar> subject,
250f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int index) {
251f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  Vector<const PatternChar> pattern = search->pattern_;
252e3c177a423baa3c30225c4e422b6f6c76d38b951machenbach@chromium.org  DCHECK(pattern.length() > 1);
253f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int pattern_length = pattern.length();
254f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  PatternChar pattern_first_char = pattern[0];
255f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int i = index;
256f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int n = subject.length() - pattern_length;
257f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  while (i <= n) {
258f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
259f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
260f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          memchr(subject.start() + i,
261f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                 pattern_first_char,
262f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                 n - i + 1));
263f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (pos == NULL) return -1;
264f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      i = static_cast<int>(pos - subject.start()) + 1;
265f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    } else {
266f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (subject[i++] != pattern_first_char) continue;
267f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
268f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    // Loop extracted to separate function to allow using return to do
269f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    // a deeper break.
270f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (CharCompare(pattern.start() + 1,
271f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                    subject.start() + i,
272f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                    pattern_length - 1)) {
273f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return i - 1;
274f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
275f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
276f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  return -1;
277f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org}
278f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
279f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
280f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Boyer-Moore string search
281f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
282f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
283f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
284f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
285f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    StringSearch<PatternChar, SubjectChar>* search,
286f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    Vector<const SubjectChar> subject,
287f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int start_index) {
288f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  Vector<const PatternChar> pattern = search->pattern_;
289f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int subject_length = subject.length();
290f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int pattern_length = pattern.length();
291f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Only preprocess at most kBMMaxShift last characters of pattern.
292f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int start = search->start_;
293f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
294f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* bad_char_occurence = search->bad_char_table();
295f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* good_suffix_shift = search->good_suffix_shift_table();
296f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
297f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  PatternChar last_char = pattern[pattern_length - 1];
298f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int index = start_index;
299f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Continue search from i.
300f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  while (index <= subject_length - pattern_length) {
301f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int j = pattern_length - 1;
302f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int c;
303f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    while (last_char != (c = subject[index + j])) {
304f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int shift =
305f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          j - CharOccurrence(bad_char_occurence, c);
306f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      index += shift;
307f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (index > subject_length - pattern_length) {
308f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return -1;
309f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      }
310f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
311f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
312f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (j < 0) {
313f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return index;
314f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    } else if (j < start) {
315f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      // we have matched more than our tables allow us to be smart about.
316f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      // Fall back on BMH shift.
317f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      index += pattern_length - 1
318f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          - CharOccurrence(bad_char_occurence,
319f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                           static_cast<SubjectChar>(last_char));
320f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    } else {
321f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int gs_shift = good_suffix_shift[j + 1];
322f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int bc_occ =
323f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          CharOccurrence(bad_char_occurence, c);
324f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int shift = j - bc_occ;
325f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (gs_shift > shift) {
326f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        shift = gs_shift;
327f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      }
328f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      index += shift;
329f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    }
3302ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
331f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
332f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  return -1;
3332ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}
3342ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
3352ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
336f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
337f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgvoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
338f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int pattern_length = pattern_.length();
339f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  const PatternChar* pattern = pattern_.start();
340f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Only look at the last kBMMaxShift characters of pattern (from start_
341f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // to pattern_length).
342f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int start = start_;
343f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int length = pattern_length - start;
344f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
345f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Biased tables so that we can use pattern indices as table indices,
346f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // even if we only cover the part of the pattern from offset start.
347f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* shift_table = good_suffix_shift_table();
348f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* suffix_table = this->suffix_table();
349f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
350f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Initialize table.
351f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  for (int i = start; i < pattern_length; i++) {
352f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    shift_table[i] = length;
353f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
354f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  shift_table[pattern_length] = 1;
355f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  suffix_table[pattern_length] = pattern_length + 1;
3562ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
357bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org  if (pattern_length <= start) {
358bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org    return;
359bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org  }
360bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org
361f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Find suffixes.
362f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  PatternChar last_char = pattern[pattern_length - 1];
363f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int suffix = pattern_length + 1;
3642ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  {
365f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int i = pattern_length;
3662ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    while (i > start) {
3672ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      PatternChar c = pattern[i - 1];
368f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      while (suffix <= pattern_length && c != pattern[suffix - 1]) {
369f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        if (shift_table[suffix] == length) {
370f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          shift_table[suffix] = suffix - i;
3712ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        }
372f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        suffix = suffix_table[suffix];
3732ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
374f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      suffix_table[--i] = --suffix;
375f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (suffix == pattern_length) {
3762ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        // No suffix to extend, so we check against last_char only.
3772ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        while ((i > start) && (pattern[i - 1] != last_char)) {
378f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          if (shift_table[pattern_length] == length) {
379f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org            shift_table[pattern_length] = pattern_length - i;
3802ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org          }
381f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          suffix_table[--i] = pattern_length;
3822ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        }
3832ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        if (i > start) {
384f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          suffix_table[--i] = --suffix;
3852ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        }
3862ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
3872ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
3882ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
389f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Build shift table using suffixes.
390f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  if (suffix < pattern_length) {
391f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    for (int i = start; i <= pattern_length; i++) {
392f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (shift_table[i] == length) {
393f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        shift_table[i] = suffix - start;
3942ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
3952ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      if (i == suffix) {
396f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        suffix = suffix_table[suffix];
3972ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
3982ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
3992ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
4002ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}
4012ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
402f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
403f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Boyer-Moore-Horspool string search.
404f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
4052ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
406f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
407f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
408f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    StringSearch<PatternChar, SubjectChar>* search,
409f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    Vector<const SubjectChar> subject,
410f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int start_index) {
411f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  Vector<const PatternChar> pattern = search->pattern_;
412f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int subject_length = subject.length();
413f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int pattern_length = pattern.length();
414f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* char_occurrences = search->bad_char_table();
415f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int badness = -pattern_length;
4162ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
4172ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // How bad we are doing without a good-suffix table.
418f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  PatternChar last_char = pattern[pattern_length - 1];
419f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int last_char_shift = pattern_length - 1 -
420f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
4212ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // Perform search
422f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int index = start_index;  // No matches found prior to this index.
423f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  while (index <= subject_length - pattern_length) {
424f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int j = pattern_length - 1;
425f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int subject_char;
426f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    while (last_char != (subject_char = subject[index + j])) {
427f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int bc_occ = CharOccurrence(char_occurrences, subject_char);
4282ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      int shift = j - bc_occ;
429f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      index += shift;
4302ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      badness += 1 - shift;  // at most zero, so badness cannot increase.
431f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (index > subject_length - pattern_length) {
4322ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org        return -1;
4332ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
4342ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
4352ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    j--;
436f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    while (j >= 0 && pattern[j] == (subject[index + j])) j--;
4372ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    if (j < 0) {
438f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return index;
4392ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    } else {
440f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      index += last_char_shift;
4412ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      // Badness increases by the number of characters we have
4422ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      // checked, and decreases by the number of characters we
4432ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      // can skip by shifting. It's a measure of how we are doing
4442ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      // compared to reading each character exactly once.
445f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      badness += (pattern_length - j) - last_char_shift;
4462ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      if (badness > 0) {
447f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        search->PopulateBoyerMooreTable();
448f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        search->strategy_ = &BoyerMooreSearch;
449f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return BoyerMooreSearch(search, subject, index);
4502ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
4512ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
4522ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
4532ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  return -1;
4542ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}
4552ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
4562ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
457f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
458f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgvoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
459f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int pattern_length = pattern_.length();
4602ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
461f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int* bad_char_occurrence = bad_char_table();
462f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org
463f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Only preprocess at most kBMMaxShift last characters of pattern.
464f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int start = start_;
465f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Run forwards to populate bad_char_table, so that *last* instance
466f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // of character equivalence class is the one registered.
467f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  // Notice: Doesn't include the last character.
468f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  int table_size = AlphabetSize();
469f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  if (start == 0) {  // All patterns less than kBMMaxShift in length.
470f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    memset(bad_char_occurrence,
471f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org           -1,
472f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org           table_size * sizeof(*bad_char_occurrence));
473f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  } else {
474f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    for (int i = 0; i < table_size; i++) {
475f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      bad_char_occurrence[i] = start - 1;
4762ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
4772ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
478f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  for (int i = start; i < pattern_length - 1; i++) {
479f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    PatternChar c = pattern_[i];
480f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    bad_char_occurrence[bucket] = i;
482f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  }
4832ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}
4842ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
485f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
486f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Linear string search with bailout to BMH.
487f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//---------------------------------------------------------------------
4882ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
489f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Simple linear search for short patterns, which bails out if the string
490f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
4912ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgtemplate <typename PatternChar, typename SubjectChar>
492f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::InitialSearch(
493f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    StringSearch<PatternChar, SubjectChar>* search,
494f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    Vector<const SubjectChar> subject,
495f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    int index) {
496f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  Vector<const PatternChar> pattern = search->pattern_;
4972ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  int pattern_length = pattern.length();
4982ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // Badness is a count of how much work we have done.  When we have
4992ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // done enough work we decide it's probably worth switching to a better
5002ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // algorithm.
5012ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  int badness = -10 - (pattern_length << 2);
5022ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
5032ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // We know our pattern is at least 2 characters, we cache the first so
5042ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  // the common case of the first character not matching is faster.
5052ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  PatternChar pattern_first_char = pattern[0];
506f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
5072ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    badness++;
508f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org    if (badness <= 0) {
509f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
510f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
511f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org            memchr(subject.start() + i,
512f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                   pattern_first_char,
513f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org                   n - i + 1));
514f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        if (pos == NULL) {
515f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          return -1;
516f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        }
517f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        i = static_cast<int>(pos - subject.start());
518f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      } else {
519f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        if (subject[i] != pattern_first_char) continue;
5202ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
521f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      int j = 1;
522f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      do {
523f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        if (pattern[j] != subject[i + j]) {
524f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org          break;
525f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        }
526f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        j++;
527f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      } while (j < pattern_length);
528f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      if (j == pattern_length) {
529f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org        return i;
5302ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org      }
531f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      badness += j;
5322ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    } else {
533f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      search->PopulateBoyerMooreHorspoolTable();
534f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      search->strategy_ = &BoyerMooreHorspoolSearch;
535f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org      return BoyerMooreHorspoolSearch(search, subject, i);
5362ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org    }
5372ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  }
5382ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org  return -1;
5392ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}
5402ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
5412ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
542f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Perform a a single stand-alone search.
543f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// If searching multiple times for the same pattern, a search
544f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// object should be constructed once and the Search function then called
545f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// for each search.
5462ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgtemplate <typename SubjectChar, typename PatternChar>
5471b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.orgint SearchString(Isolate* isolate,
5481b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org                 Vector<const SubjectChar> subject,
5491b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org                 Vector<const PatternChar> pattern,
5501b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org                 int start_index) {
551ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org  StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
552f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org  return search.Search(subject, start_index);
5532ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}
5542ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
5552ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}}  // namespace v8::internal
5562ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org
5572ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org#endif  // V8_STRING_SEARCH_H_
558