11b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org// Copyright 2011 the V8 project authors. All rights reserved. 22ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// Redistribution and use in source and binary forms, with or without 32ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// modification, are permitted provided that the following conditions are 42ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// met: 52ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// 62ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// * Redistributions of source code must retain the above copyright 72ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// notice, this list of conditions and the following disclaimer. 82ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// * Redistributions in binary form must reproduce the above 92ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// copyright notice, this list of conditions and the following 102ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// disclaimer in the documentation and/or other materials provided 112ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// with the distribution. 122ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// * Neither the name of Google Inc. nor the names of its 132ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// contributors may be used to endorse or promote products derived 142ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// from this software without specific prior written permission. 152ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// 162ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 172ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 182ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 192ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 202ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 212ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 222ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 232ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 242ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 252ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 262ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 272ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 282ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org#ifndef V8_STRING_SEARCH_H_ 292ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org#define V8_STRING_SEARCH_H_ 302ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 312ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgnamespace v8 { 322ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgnamespace internal { 332ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 342ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 35f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 36f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// String Search object. 37f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 38f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 39f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Class holding constants and methods that apply to all string search variants, 40f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// independently of subject and pattern char size. 41f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgclass StringSearchBase { 42f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org protected: 43f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 44f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // limit, we can fix the size of tables. For a needle longer than this limit, 45f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // search will not be optimal, since we only build tables for a suffix 46f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // of the string, but it is a safe approximation. 47ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org static const int kBMMaxShift = Isolate::kBMMaxShift; 48f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 49f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Reduce alphabet to this size. 50f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 51f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // proportional to the input alphabet. We reduce the alphabet size by 52f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // equating input characters modulo a smaller alphabet size. This gives 53f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // a potentially less efficient searching, but is a safe approximation. 54f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // For needles using only characters in the same Unicode 256-code point page, 55f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // there is no search speed degradation. 5659297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org static const int kAsciiAlphabetSize = 256; 57ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 58f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 59f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Bad-char shift table stored in the state. It's length is the alphabet size. 60f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // For patterns below this length, the skip length of Boyer-Moore is too short 61f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // to compensate for the algorithmic overhead compared to simple brute force. 62f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static const int kBMMinPatternLength = 7; 63f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 6459297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org static inline bool IsOneByteString(Vector<const uint8_t> string) { 65f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return true; 66f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 67f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 6859297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org static inline bool IsOneByteString(Vector<const uc16> string) { 6959297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org return String::IsOneByte(string.start(), string.length()); 70f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 71f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 72ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org friend class Isolate; 73f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org}; 74f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 75f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 76f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 77f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgclass StringSearch : private StringSearchBase { 782ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org public: 79ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) 80ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org : isolate_(isolate), 81ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org pattern_(pattern), 82f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org start_(Max(0, pattern.length() - kBMMaxShift)) { 83f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(PatternChar) > sizeof(SubjectChar)) { 8459297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org if (!IsOneByteString(pattern_)) { 85f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org strategy_ = &FailSearch; 86f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return; 87f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 88f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 89f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pattern_length = pattern_.length(); 90f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pattern_length < kBMMinPatternLength) { 91f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pattern_length == 1) { 92f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org strategy_ = &SingleCharSearch; 93f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return; 94f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 95f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org strategy_ = &LinearSearch; 96f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return; 972ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 98f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org strategy_ = &InitialSearch; 992ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 100f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 101f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int Search(Vector<const SubjectChar> subject, int index) { 102f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return strategy_(this, subject, index); 1032ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 104f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 105f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static inline int AlphabetSize() { 106f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(PatternChar) == 1) { 107f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // ASCII needle. 108f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return kAsciiAlphabetSize; 109f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } else { 110f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org ASSERT(sizeof(PatternChar) == 2); 111f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // UC16 needle. 112f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return kUC16AlphabetSize; 113f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 1142ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 115f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 1162ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org private: 117f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org typedef int (*SearchFunction)( // NOLINT - it's not a cast! 118f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>*, 119f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar>, 120f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int); 121f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 122f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 123f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar>, 124f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int) { 125f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 126f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 1272ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 128f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 129f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 130f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index); 131f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 132f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 133f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 134f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index); 135f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 136f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 137f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 138f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index); 139f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 140f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static int BoyerMooreHorspoolSearch( 141f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>* search, 142f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 143f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index); 144f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 145f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 146f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 147f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index); 148f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 149f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org void PopulateBoyerMooreHorspoolTable(); 150f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 151f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org void PopulateBoyerMooreTable(); 152f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 15359297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org static inline bool exceedsOneByte(uint8_t c) { 15459297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org return false; 15559297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org } 15659297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org 15759297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org static inline bool exceedsOneByte(uint16_t c) { 15859297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org return c > String::kMaxOneByteCharCodeU; 15959297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org } 16059297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org 161f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static inline int CharOccurrence(int* bad_char_occurrence, 162f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org SubjectChar char_code) { 163f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(SubjectChar) == 1) { 164f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return bad_char_occurrence[static_cast<int>(char_code)]; 165f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 166f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(PatternChar) == 1) { 16759297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org if (exceedsOneByte(char_code)) { 168f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 169f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 170ec7067e20f87c4d63531d5cb80cb942eb960a610kmillikin@chromium.org return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 171f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 172ec7067e20f87c4d63531d5cb80cb942eb960a610kmillikin@chromium.org // Both pattern and subject are UC16. Reduce character to equivalence class. 173f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int equiv_class = char_code % kUC16AlphabetSize; 174f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return bad_char_occurrence[equiv_class]; 175f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 176f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 177ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // The following tables are shared by all searches. 178ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // TODO(lrn): Introduce a way for a pattern to keep its tables 179ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // between searches (e.g., for an Atom RegExp). 180ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org 181ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // Store for the BoyerMoore(Horspool) bad char shift table. 182f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Return a table covering the last kBMMaxShift+1 positions of 183f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // pattern. 184f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* bad_char_table() { 185ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org return isolate_->bad_char_shift_table(); 186f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 187f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 188ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // Store for the BoyerMoore good suffix shift table. 189f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* good_suffix_shift_table() { 190f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Return biased pointer that maps the range [start_..pattern_.length() 191f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // to the kGoodSuffixShiftTable array. 192ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org return isolate_->good_suffix_shift_table() - start_; 193f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 194f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 195ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // Table used temporarily while building the BoyerMoore good suffix 196ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org // shift table. 197f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* suffix_table() { 198f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Return biased pointer that maps the range [start_..pattern_.length() 199f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // to the kSuffixTable array. 200ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org return isolate_->suffix_table() - start_; 201f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 202f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 203ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org Isolate* isolate_; 204f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // The pattern to search for. 205f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const PatternChar> pattern_; 206f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Pointer to implementation of the search. 207f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org SearchFunction strategy_; 208f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Cache value of Max(0, pattern_length() - kBMMaxShift) 209f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_; 2102ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}; 2112ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 2122ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 213f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 214f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Single Character Pattern Search Strategy 215f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 2162ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 217f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 218f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 219f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>* search, 220f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 221f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int index) { 222f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org ASSERT_EQ(1, search->pattern_.length()); 223f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org PatternChar pattern_first_char = search->pattern_[0]; 224f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int i = index; 225f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 226f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 227f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org memchr(subject.start() + i, 228f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org pattern_first_char, 229f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org subject.length() - i)); 230f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pos == NULL) return -1; 231f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return static_cast<int>(pos - subject.start()); 2322ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } else { 233f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(PatternChar) > sizeof(SubjectChar)) { 23459297c735ad2a41156ae9c723a39ff259ad061e0jkummerow@chromium.org if (exceedsOneByte(pattern_first_char)) { 235f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 236f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 2372ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 238f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 239f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int n = subject.length(); 240f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (i < n) { 241f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (subject[i++] == search_char) return i - 1; 242f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 243f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 2442ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 245f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org} 246f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 247f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 248f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Linear Search Strategy 249f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 250f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 251f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 252f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 2531b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.orginline bool CharCompare(const PatternChar* pattern, 2541b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org const SubjectChar* subject, 2551b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org int length) { 256f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org ASSERT(length > 0); 257f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pos = 0; 258f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org do { 259f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pattern[pos] != subject[pos]) { 260f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return false; 261f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 262f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org pos++; 263f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } while (pos < length); 264f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return true; 265f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org} 266f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 267f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 268f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Simple linear search for short patterns. Never bails out. 269f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 270f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::LinearSearch( 271f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>* search, 272f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 273f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int index) { 274f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const PatternChar> pattern = search->pattern_; 275f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org ASSERT(pattern.length() > 1); 276f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pattern_length = pattern.length(); 277f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org PatternChar pattern_first_char = pattern[0]; 278f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int i = index; 279f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int n = subject.length() - pattern_length; 280f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (i <= n) { 281f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 282f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 283f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org memchr(subject.start() + i, 284f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org pattern_first_char, 285f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org n - i + 1)); 286f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pos == NULL) return -1; 287f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org i = static_cast<int>(pos - subject.start()) + 1; 288f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } else { 289f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (subject[i++] != pattern_first_char) continue; 290f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 291f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Loop extracted to separate function to allow using return to do 292f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // a deeper break. 293f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (CharCompare(pattern.start() + 1, 294f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org subject.start() + i, 295f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org pattern_length - 1)) { 296f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return i - 1; 297f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 298f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 299f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 300f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org} 301f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 302f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 303f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Boyer-Moore string search 304f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 305f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 306f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 307f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 308f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>* search, 309f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 310f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index) { 311f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const PatternChar> pattern = search->pattern_; 312f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int subject_length = subject.length(); 313f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pattern_length = pattern.length(); 314f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Only preprocess at most kBMMaxShift last characters of pattern. 315f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start = search->start_; 316f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 317f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* bad_char_occurence = search->bad_char_table(); 318f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* good_suffix_shift = search->good_suffix_shift_table(); 319f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 320f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org PatternChar last_char = pattern[pattern_length - 1]; 321f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int index = start_index; 322f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Continue search from i. 323f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (index <= subject_length - pattern_length) { 324f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int j = pattern_length - 1; 325f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int c; 326f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (last_char != (c = subject[index + j])) { 327f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int shift = 328f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org j - CharOccurrence(bad_char_occurence, c); 329f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org index += shift; 330f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (index > subject_length - pattern_length) { 331f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 332f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 333f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 334f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 335f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (j < 0) { 336f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return index; 337f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } else if (j < start) { 338f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // we have matched more than our tables allow us to be smart about. 339f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Fall back on BMH shift. 340f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org index += pattern_length - 1 341f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org - CharOccurrence(bad_char_occurence, 342f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org static_cast<SubjectChar>(last_char)); 343f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } else { 344f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int gs_shift = good_suffix_shift[j + 1]; 345f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int bc_occ = 346f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org CharOccurrence(bad_char_occurence, c); 347f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int shift = j - bc_occ; 348f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (gs_shift > shift) { 349f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org shift = gs_shift; 350f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 351f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org index += shift; 352f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 3532ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 354f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 355f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 3562ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org} 3572ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 3582ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 359f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 360f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgvoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 361f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pattern_length = pattern_.length(); 362f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org const PatternChar* pattern = pattern_.start(); 363f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Only look at the last kBMMaxShift characters of pattern (from start_ 364f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // to pattern_length). 365f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start = start_; 366f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int length = pattern_length - start; 367f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 368f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Biased tables so that we can use pattern indices as table indices, 369f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // even if we only cover the part of the pattern from offset start. 370f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* shift_table = good_suffix_shift_table(); 371f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* suffix_table = this->suffix_table(); 372f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 373f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Initialize table. 374f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org for (int i = start; i < pattern_length; i++) { 375f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org shift_table[i] = length; 376f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 377f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org shift_table[pattern_length] = 1; 378f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org suffix_table[pattern_length] = pattern_length + 1; 3792ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 380bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org if (pattern_length <= start) { 381bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org return; 382bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org } 383bf0c820d028452571c8c744ddd212c32c6d6a996danno@chromium.org 384f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Find suffixes. 385f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org PatternChar last_char = pattern[pattern_length - 1]; 386f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int suffix = pattern_length + 1; 3872ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org { 388f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int i = pattern_length; 3892ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org while (i > start) { 3902ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org PatternChar c = pattern[i - 1]; 391f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (suffix <= pattern_length && c != pattern[suffix - 1]) { 392f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (shift_table[suffix] == length) { 393f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org shift_table[suffix] = suffix - i; 3942ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 395f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org suffix = suffix_table[suffix]; 3962ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 397f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org suffix_table[--i] = --suffix; 398f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (suffix == pattern_length) { 3992ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // No suffix to extend, so we check against last_char only. 4002ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org while ((i > start) && (pattern[i - 1] != last_char)) { 401f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (shift_table[pattern_length] == length) { 402f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org shift_table[pattern_length] = pattern_length - i; 4032ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 404f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org suffix_table[--i] = pattern_length; 4052ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4062ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org if (i > start) { 407f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org suffix_table[--i] = --suffix; 4082ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4092ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4102ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4112ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 412f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Build shift table using suffixes. 413f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (suffix < pattern_length) { 414f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org for (int i = start; i <= pattern_length; i++) { 415f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (shift_table[i] == length) { 416f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org shift_table[i] = suffix - start; 4172ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4182ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org if (i == suffix) { 419f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org suffix = suffix_table[suffix]; 4202ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4212ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4222ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4232ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org} 4242ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 425f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 426f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Boyer-Moore-Horspool string search. 427f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 4282ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 429f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 430f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 431f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>* search, 432f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 433f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start_index) { 434f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const PatternChar> pattern = search->pattern_; 435f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int subject_length = subject.length(); 436f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pattern_length = pattern.length(); 437f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* char_occurrences = search->bad_char_table(); 438f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int badness = -pattern_length; 4392ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 4402ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // How bad we are doing without a good-suffix table. 441f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org PatternChar last_char = pattern[pattern_length - 1]; 442f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int last_char_shift = pattern_length - 1 - 443f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 4442ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // Perform search 445f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int index = start_index; // No matches found prior to this index. 446f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (index <= subject_length - pattern_length) { 447f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int j = pattern_length - 1; 448f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int subject_char; 449f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (last_char != (subject_char = subject[index + j])) { 450f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int bc_occ = CharOccurrence(char_occurrences, subject_char); 4512ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org int shift = j - bc_occ; 452f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org index += shift; 4532ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org badness += 1 - shift; // at most zero, so badness cannot increase. 454f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (index > subject_length - pattern_length) { 4552ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org return -1; 4562ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4572ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4582ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org j--; 459f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org while (j >= 0 && pattern[j] == (subject[index + j])) j--; 4602ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org if (j < 0) { 461f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return index; 4622ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } else { 463f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org index += last_char_shift; 4642ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // Badness increases by the number of characters we have 4652ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // checked, and decreases by the number of characters we 4662ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // can skip by shifting. It's a measure of how we are doing 4672ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // compared to reading each character exactly once. 468f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org badness += (pattern_length - j) - last_char_shift; 4692ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org if (badness > 0) { 470f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org search->PopulateBoyerMooreTable(); 471f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org search->strategy_ = &BoyerMooreSearch; 472f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return BoyerMooreSearch(search, subject, index); 4732ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4742ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4752ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 4762ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org return -1; 4772ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org} 4782ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 4792ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 480f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 481f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgvoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 482f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int pattern_length = pattern_.length(); 4832ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 484f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int* bad_char_occurrence = bad_char_table(); 485f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org 486f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Only preprocess at most kBMMaxShift last characters of pattern. 487f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int start = start_; 488f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Run forwards to populate bad_char_table, so that *last* instance 489f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // of character equivalence class is the one registered. 490f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org // Notice: Doesn't include the last character. 491f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int table_size = AlphabetSize(); 492f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (start == 0) { // All patterns less than kBMMaxShift in length. 493f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org memset(bad_char_occurrence, 494f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org -1, 495f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org table_size * sizeof(*bad_char_occurrence)); 496f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } else { 497f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org for (int i = 0; i < table_size; i++) { 498f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org bad_char_occurrence[i] = start - 1; 4992ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 5002ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 501f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org for (int i = start; i < pattern_length - 1; i++) { 502f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org PatternChar c = pattern_[i]; 503f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 504f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org bad_char_occurrence[bucket] = i; 505f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 5062ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org} 5072ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 508f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 509f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Linear string search with bailout to BMH. 510f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org//--------------------------------------------------------------------- 5112ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 512f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Simple linear search for short patterns, which bails out if the string 513f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 5142ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgtemplate <typename PatternChar, typename SubjectChar> 515f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.orgint StringSearch<PatternChar, SubjectChar>::InitialSearch( 516f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org StringSearch<PatternChar, SubjectChar>* search, 517f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const SubjectChar> subject, 518f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int index) { 519f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org Vector<const PatternChar> pattern = search->pattern_; 5202ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org int pattern_length = pattern.length(); 5212ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // Badness is a count of how much work we have done. When we have 5222ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // done enough work we decide it's probably worth switching to a better 5232ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // algorithm. 5242ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org int badness = -10 - (pattern_length << 2); 5252ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 5262ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // We know our pattern is at least 2 characters, we cache the first so 5272ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org // the common case of the first character not matching is faster. 5282ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org PatternChar pattern_first_char = pattern[0]; 529f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 5302ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org badness++; 531f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (badness <= 0) { 532f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) { 533f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org const SubjectChar* pos = reinterpret_cast<const SubjectChar*>( 534f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org memchr(subject.start() + i, 535f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org pattern_first_char, 536f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org n - i + 1)); 537f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pos == NULL) { 538f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return -1; 539f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 540f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org i = static_cast<int>(pos - subject.start()); 541f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } else { 542f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (subject[i] != pattern_first_char) continue; 5432ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 544f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org int j = 1; 545f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org do { 546f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (pattern[j] != subject[i + j]) { 547f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org break; 548f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } 549f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org j++; 550f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org } while (j < pattern_length); 551f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org if (j == pattern_length) { 552f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return i; 5532ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 554f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org badness += j; 5552ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } else { 556f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org search->PopulateBoyerMooreHorspoolTable(); 557f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org search->strategy_ = &BoyerMooreHorspoolSearch; 558f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return BoyerMooreHorspoolSearch(search, subject, i); 5592ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 5602ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org } 5612ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org return -1; 5622ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org} 5632ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 5642ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 565f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// Perform a a single stand-alone search. 566f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// If searching multiple times for the same pattern, a search 567f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// object should be constructed once and the Search function then called 568f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org// for each search. 5692ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.orgtemplate <typename SubjectChar, typename PatternChar> 5701b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.orgint SearchString(Isolate* isolate, 5711b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org Vector<const SubjectChar> subject, 5721b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org Vector<const PatternChar> pattern, 5731b3afd1cab9087ca3c4e585d3da77d374d65c082mstarzinger@chromium.org int start_index) { 574ea88ce93dcb41a9200ec8747ae7642a5db1f4ce7sgjesse@chromium.org StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 575f05f2913e034b9332e55c02c9395e701725c02c1kmillikin@chromium.org return search.Search(subject, start_index); 5762ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org} 5772ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 5782ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org}} // namespace v8::internal 5792ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org 5802ec107fe650fe56eed094ca017940f26af46644bsgjesse@chromium.org#endif // V8_STRING_SEARCH_H_ 581