13ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch// Copyright 2011 the V8 project authors. All rights reserved. 2b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch// found in the LICENSE file. 459151504615d929945dc59db37bf1166937748c6Steve Block 559151504615d929945dc59db37bf1166937748c6Steve Block#ifndef V8_STRING_SEARCH_H_ 659151504615d929945dc59db37bf1166937748c6Steve Block#define V8_STRING_SEARCH_H_ 759151504615d929945dc59db37bf1166937748c6Steve Block 8014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#include "src/isolate.h" 9014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#include "src/vector.h" 10014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 1159151504615d929945dc59db37bf1166937748c6Steve Blocknamespace v8 { 1259151504615d929945dc59db37bf1166937748c6Steve Blocknamespace internal { 1359151504615d929945dc59db37bf1166937748c6Steve Block 1459151504615d929945dc59db37bf1166937748c6Steve Block 15f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 16f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// String Search object. 17f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 18f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 19f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Class holding constants and methods that apply to all string search variants, 20f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// independently of subject and pattern char size. 21f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochclass StringSearchBase { 22f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch protected: 23f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Cap on the maximal shift in the Boyer-Moore implementation. By setting a 24f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // limit, we can fix the size of tables. For a needle longer than this limit, 25f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // search will not be optimal, since we only build tables for a suffix 26f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // of the string, but it is a safe approximation. 2744f0eee88ff00398ff7f715fab053374d808c90dSteve Block static const int kBMMaxShift = Isolate::kBMMaxShift; 28f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 29f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Reduce alphabet to this size. 30f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size 31f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // proportional to the input alphabet. We reduce the alphabet size by 32f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // equating input characters modulo a smaller alphabet size. This gives 33f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // a potentially less efficient searching, but is a safe approximation. 34f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // For needles using only characters in the same Unicode 256-code point page, 35f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // there is no search speed degradation. 36b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch static const int kLatin1AlphabetSize = 256; 3744f0eee88ff00398ff7f715fab053374d808c90dSteve Block static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize; 38f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 39f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Bad-char shift table stored in the state. It's length is the alphabet size. 40f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // For patterns below this length, the skip length of Boyer-Moore is too short 41f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // to compensate for the algorithmic overhead compared to simple brute force. 42f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static const int kBMMinPatternLength = 7; 43f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 44b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch static inline bool IsOneByteString(Vector<const uint8_t> string) { 45f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return true; 46f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 47f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 48b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch static inline bool IsOneByteString(Vector<const uc16> string) { 49b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return String::IsOneByte(string.start(), string.length()); 50f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 51f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 5244f0eee88ff00398ff7f715fab053374d808c90dSteve Block friend class Isolate; 53f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch}; 54f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 55f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 56f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 57f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochclass StringSearch : private StringSearchBase { 5859151504615d929945dc59db37bf1166937748c6Steve Block public: 5944f0eee88ff00398ff7f715fab053374d808c90dSteve Block StringSearch(Isolate* isolate, Vector<const PatternChar> pattern) 6044f0eee88ff00398ff7f715fab053374d808c90dSteve Block : isolate_(isolate), 6144f0eee88ff00398ff7f715fab053374d808c90dSteve Block pattern_(pattern), 62f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch start_(Max(0, pattern.length() - kBMMaxShift)) { 63f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (sizeof(PatternChar) > sizeof(SubjectChar)) { 64b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (!IsOneByteString(pattern_)) { 65f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch strategy_ = &FailSearch; 66f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return; 67f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 68f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 69f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pattern_length = pattern_.length(); 70f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (pattern_length < kBMMinPatternLength) { 71f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (pattern_length == 1) { 72f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch strategy_ = &SingleCharSearch; 73f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return; 74f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 75f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch strategy_ = &LinearSearch; 76f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return; 7759151504615d929945dc59db37bf1166937748c6Steve Block } 78f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch strategy_ = &InitialSearch; 7959151504615d929945dc59db37bf1166937748c6Steve Block } 80f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 81f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int Search(Vector<const SubjectChar> subject, int index) { 82f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return strategy_(this, subject, index); 8359151504615d929945dc59db37bf1166937748c6Steve Block } 84f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 85f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static inline int AlphabetSize() { 86f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (sizeof(PatternChar) == 1) { 87b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch // Latin1 needle. 88b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return kLatin1AlphabetSize; 89f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } else { 90b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK(sizeof(PatternChar) == 2); 91f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // UC16 needle. 92f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return kUC16AlphabetSize; 93f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 9459151504615d929945dc59db37bf1166937748c6Steve Block } 95f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 9659151504615d929945dc59db37bf1166937748c6Steve Block private: 97f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch typedef int (*SearchFunction)( // NOLINT - it's not a cast! 98f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>*, 99f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar>, 100f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int); 101f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 102f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static int FailSearch(StringSearch<PatternChar, SubjectChar>*, 103f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar>, 104f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int) { 105f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return -1; 106f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 10759151504615d929945dc59db37bf1166937748c6Steve Block 108f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search, 109f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 110f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index); 111f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 112f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search, 113f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 114f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index); 115f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 116f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search, 117f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 118f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index); 119f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 120f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static int BoyerMooreHorspoolSearch( 121f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>* search, 122f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 123f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index); 124f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 125f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search, 126f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 127f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index); 128f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 129f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch void PopulateBoyerMooreHorspoolTable(); 130f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 131f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch void PopulateBoyerMooreTable(); 132f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 133b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch static inline bool exceedsOneByte(uint8_t c) { 134b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return false; 135b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 136b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 137b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch static inline bool exceedsOneByte(uint16_t c) { 138b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch return c > String::kMaxOneByteCharCodeU; 139b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch } 140b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch 141f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static inline int CharOccurrence(int* bad_char_occurrence, 142f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch SubjectChar char_code) { 143f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (sizeof(SubjectChar) == 1) { 144f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return bad_char_occurrence[static_cast<int>(char_code)]; 145f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 146f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (sizeof(PatternChar) == 1) { 147b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch if (exceedsOneByte(char_code)) { 148f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return -1; 149f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 150f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return bad_char_occurrence[static_cast<unsigned int>(char_code)]; 151f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 152f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Both pattern and subject are UC16. Reduce character to equivalence class. 153f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int equiv_class = char_code % kUC16AlphabetSize; 154f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return bad_char_occurrence[equiv_class]; 155f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 156f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 15744f0eee88ff00398ff7f715fab053374d808c90dSteve Block // The following tables are shared by all searches. 15844f0eee88ff00398ff7f715fab053374d808c90dSteve Block // TODO(lrn): Introduce a way for a pattern to keep its tables 15944f0eee88ff00398ff7f715fab053374d808c90dSteve Block // between searches (e.g., for an Atom RegExp). 16044f0eee88ff00398ff7f715fab053374d808c90dSteve Block 16144f0eee88ff00398ff7f715fab053374d808c90dSteve Block // Store for the BoyerMoore(Horspool) bad char shift table. 162f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Return a table covering the last kBMMaxShift+1 positions of 163f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // pattern. 164f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* bad_char_table() { 16544f0eee88ff00398ff7f715fab053374d808c90dSteve Block return isolate_->bad_char_shift_table(); 166f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 167f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 16844f0eee88ff00398ff7f715fab053374d808c90dSteve Block // Store for the BoyerMoore good suffix shift table. 169f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* good_suffix_shift_table() { 170f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Return biased pointer that maps the range [start_..pattern_.length() 171f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // to the kGoodSuffixShiftTable array. 17244f0eee88ff00398ff7f715fab053374d808c90dSteve Block return isolate_->good_suffix_shift_table() - start_; 173f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 174f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 17544f0eee88ff00398ff7f715fab053374d808c90dSteve Block // Table used temporarily while building the BoyerMoore good suffix 17644f0eee88ff00398ff7f715fab053374d808c90dSteve Block // shift table. 177f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* suffix_table() { 178f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Return biased pointer that maps the range [start_..pattern_.length() 179f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // to the kSuffixTable array. 18044f0eee88ff00398ff7f715fab053374d808c90dSteve Block return isolate_->suffix_table() - start_; 181f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 182f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 18344f0eee88ff00398ff7f715fab053374d808c90dSteve Block Isolate* isolate_; 184f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // The pattern to search for. 185f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const PatternChar> pattern_; 186f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Pointer to implementation of the search. 187f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch SearchFunction strategy_; 188f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Cache value of Max(0, pattern_length() - kBMMaxShift) 189f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_; 19059151504615d929945dc59db37bf1166937748c6Steve Block}; 19159151504615d929945dc59db37bf1166937748c6Steve Block 19259151504615d929945dc59db37bf1166937748c6Steve Block 193014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochtemplate <typename T, typename U> 194014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochinline T AlignDown(T value, U alignment) { 195014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch return reinterpret_cast<T>( 196014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); 197014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch} 198014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 199014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 200014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochinline uint8_t GetHighestValueByte(uc16 character) { 201014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch return Max(static_cast<uint8_t>(character & 0xFF), 202014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch static_cast<uint8_t>(character >> 8)); 203014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch} 204014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 205014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 206014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochinline uint8_t GetHighestValueByte(uint8_t character) { return character; } 207014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 208014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 209014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochtemplate <typename PatternChar, typename SubjectChar> 210014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochinline int FindFirstCharacter(Vector<const PatternChar> pattern, 211014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch Vector<const SubjectChar> subject, int index) { 212014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch const PatternChar pattern_first_char = pattern[0]; 213014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch const int max_n = (subject.length() - pattern.length() + 1); 214014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 215014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch const uint8_t search_byte = GetHighestValueByte(pattern_first_char); 216014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char); 217014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch int pos = index; 218014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch do { 219014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch DCHECK_GE(max_n - pos, 0); 220014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>( 221014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch memchr(subject.start() + pos, search_byte, 222014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch (max_n - pos) * sizeof(SubjectChar))); 223014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch if (char_pos == NULL) return -1; 224014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch char_pos = AlignDown(char_pos, sizeof(SubjectChar)); 225014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch pos = static_cast<int>(char_pos - subject.start()); 226014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch if (subject[pos] == search_char) return pos; 227014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch } while (++pos < max_n); 228014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 229014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch return -1; 230014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch} 231014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 232014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch 233f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 234f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Single Character Pattern Search Strategy 235f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 23659151504615d929945dc59db37bf1166937748c6Steve Block 237f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 238f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochint StringSearch<PatternChar, SubjectChar>::SingleCharSearch( 239f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>* search, 240f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 241f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int index) { 242b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK_EQ(1, search->pattern_.length()); 243f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch PatternChar pattern_first_char = search->pattern_[0]; 244014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch if (sizeof(PatternChar) > sizeof(SubjectChar)) { 245014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch if (exceedsOneByte(pattern_first_char)) { 246014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch return -1; 24759151504615d929945dc59db37bf1166937748c6Steve Block } 24859151504615d929945dc59db37bf1166937748c6Steve Block } 249014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch return FindFirstCharacter(search->pattern_, subject, index); 250f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch} 251f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 252f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 253f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Linear Search Strategy 254f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 255f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 256f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 257f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 2583ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdochinline bool CharCompare(const PatternChar* pattern, 2593ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch const SubjectChar* subject, 2603ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch int length) { 261b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK(length > 0); 262f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pos = 0; 263f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch do { 264f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (pattern[pos] != subject[pos]) { 265f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return false; 266f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 267f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch pos++; 268f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } while (pos < length); 269f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return true; 270f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch} 271f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 272f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 273f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Simple linear search for short patterns. Never bails out. 274f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 275f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochint StringSearch<PatternChar, SubjectChar>::LinearSearch( 276f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>* search, 277f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 278f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int index) { 279f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const PatternChar> pattern = search->pattern_; 280b8a8cc1952d61a2f3a2568848933943a543b5d3eBen Murdoch DCHECK(pattern.length() > 1); 281f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pattern_length = pattern.length(); 282f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int i = index; 283f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int n = subject.length() - pattern_length; 284f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (i <= n) { 285014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch i = FindFirstCharacter(pattern, subject, i); 286014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch if (i == -1) return -1; 287014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch DCHECK_LE(i, n); 288014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch i++; 289f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Loop extracted to separate function to allow using return to do 290f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // a deeper break. 291f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (CharCompare(pattern.start() + 1, 292f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch subject.start() + i, 293f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch pattern_length - 1)) { 294f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return i - 1; 295f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 296f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 297f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return -1; 298f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch} 299f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 300f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 301f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Boyer-Moore string search 302f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 303f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 304f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 305f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochint StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch( 306f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>* search, 307f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 308f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index) { 309f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const PatternChar> pattern = search->pattern_; 310f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int subject_length = subject.length(); 311f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pattern_length = pattern.length(); 312f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Only preprocess at most kBMMaxShift last characters of pattern. 313f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start = search->start_; 314f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 315f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* bad_char_occurence = search->bad_char_table(); 316f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* good_suffix_shift = search->good_suffix_shift_table(); 317f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 318f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch PatternChar last_char = pattern[pattern_length - 1]; 319f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int index = start_index; 320f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Continue search from i. 321f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (index <= subject_length - pattern_length) { 322f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int j = pattern_length - 1; 323f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int c; 324f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (last_char != (c = subject[index + j])) { 325f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int shift = 326f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch j - CharOccurrence(bad_char_occurence, c); 327f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch index += shift; 328f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (index > subject_length - pattern_length) { 329f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return -1; 330f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 331f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 332f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (j >= 0 && pattern[j] == (c = subject[index + j])) j--; 333f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (j < 0) { 334f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return index; 335f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } else if (j < start) { 336f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // we have matched more than our tables allow us to be smart about. 337f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Fall back on BMH shift. 338f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch index += pattern_length - 1 339f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch - CharOccurrence(bad_char_occurence, 340f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch static_cast<SubjectChar>(last_char)); 341f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } else { 342f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int gs_shift = good_suffix_shift[j + 1]; 343f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int bc_occ = 344f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch CharOccurrence(bad_char_occurence, c); 345f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int shift = j - bc_occ; 346f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (gs_shift > shift) { 347f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch shift = gs_shift; 348f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 349f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch index += shift; 350f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 35159151504615d929945dc59db37bf1166937748c6Steve Block } 352f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 353f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return -1; 35459151504615d929945dc59db37bf1166937748c6Steve Block} 35559151504615d929945dc59db37bf1166937748c6Steve Block 35659151504615d929945dc59db37bf1166937748c6Steve Block 357f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 358f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochvoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() { 359f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pattern_length = pattern_.length(); 360f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch const PatternChar* pattern = pattern_.start(); 361f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Only look at the last kBMMaxShift characters of pattern (from start_ 362f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // to pattern_length). 363f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start = start_; 364f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int length = pattern_length - start; 365f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 366f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Biased tables so that we can use pattern indices as table indices, 367f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // even if we only cover the part of the pattern from offset start. 368f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* shift_table = good_suffix_shift_table(); 369f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* suffix_table = this->suffix_table(); 370f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 371f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Initialize table. 372f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch for (int i = start; i < pattern_length; i++) { 373f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch shift_table[i] = length; 374f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 375f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch shift_table[pattern_length] = 1; 376f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch suffix_table[pattern_length] = pattern_length + 1; 37759151504615d929945dc59db37bf1166937748c6Steve Block 3783ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch if (pattern_length <= start) { 3793ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch return; 3803ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch } 3813ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch 382f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Find suffixes. 383f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch PatternChar last_char = pattern[pattern_length - 1]; 384f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int suffix = pattern_length + 1; 38559151504615d929945dc59db37bf1166937748c6Steve Block { 386f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int i = pattern_length; 38759151504615d929945dc59db37bf1166937748c6Steve Block while (i > start) { 38859151504615d929945dc59db37bf1166937748c6Steve Block PatternChar c = pattern[i - 1]; 389f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (suffix <= pattern_length && c != pattern[suffix - 1]) { 390f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (shift_table[suffix] == length) { 391f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch shift_table[suffix] = suffix - i; 39259151504615d929945dc59db37bf1166937748c6Steve Block } 393f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch suffix = suffix_table[suffix]; 39459151504615d929945dc59db37bf1166937748c6Steve Block } 395f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch suffix_table[--i] = --suffix; 396f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (suffix == pattern_length) { 39759151504615d929945dc59db37bf1166937748c6Steve Block // No suffix to extend, so we check against last_char only. 39859151504615d929945dc59db37bf1166937748c6Steve Block while ((i > start) && (pattern[i - 1] != last_char)) { 399f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (shift_table[pattern_length] == length) { 400f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch shift_table[pattern_length] = pattern_length - i; 40159151504615d929945dc59db37bf1166937748c6Steve Block } 402f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch suffix_table[--i] = pattern_length; 40359151504615d929945dc59db37bf1166937748c6Steve Block } 40459151504615d929945dc59db37bf1166937748c6Steve Block if (i > start) { 405f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch suffix_table[--i] = --suffix; 40659151504615d929945dc59db37bf1166937748c6Steve Block } 40759151504615d929945dc59db37bf1166937748c6Steve Block } 40859151504615d929945dc59db37bf1166937748c6Steve Block } 40959151504615d929945dc59db37bf1166937748c6Steve Block } 410f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Build shift table using suffixes. 411f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (suffix < pattern_length) { 412f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch for (int i = start; i <= pattern_length; i++) { 413f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (shift_table[i] == length) { 414f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch shift_table[i] = suffix - start; 41559151504615d929945dc59db37bf1166937748c6Steve Block } 41659151504615d929945dc59db37bf1166937748c6Steve Block if (i == suffix) { 417f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch suffix = suffix_table[suffix]; 41859151504615d929945dc59db37bf1166937748c6Steve Block } 41959151504615d929945dc59db37bf1166937748c6Steve Block } 42059151504615d929945dc59db37bf1166937748c6Steve Block } 42159151504615d929945dc59db37bf1166937748c6Steve Block} 42259151504615d929945dc59db37bf1166937748c6Steve Block 423f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 424f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Boyer-Moore-Horspool string search. 425f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 42659151504615d929945dc59db37bf1166937748c6Steve Block 427f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 428f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochint StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch( 429f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>* search, 430f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 431f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start_index) { 432f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const PatternChar> pattern = search->pattern_; 433f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int subject_length = subject.length(); 434f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pattern_length = pattern.length(); 435f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* char_occurrences = search->bad_char_table(); 436f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int badness = -pattern_length; 43759151504615d929945dc59db37bf1166937748c6Steve Block 43859151504615d929945dc59db37bf1166937748c6Steve Block // How bad we are doing without a good-suffix table. 439f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch PatternChar last_char = pattern[pattern_length - 1]; 440f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int last_char_shift = pattern_length - 1 - 441f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char)); 44259151504615d929945dc59db37bf1166937748c6Steve Block // Perform search 443f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int index = start_index; // No matches found prior to this index. 444f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (index <= subject_length - pattern_length) { 445f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int j = pattern_length - 1; 446f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int subject_char; 447f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (last_char != (subject_char = subject[index + j])) { 448f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int bc_occ = CharOccurrence(char_occurrences, subject_char); 44959151504615d929945dc59db37bf1166937748c6Steve Block int shift = j - bc_occ; 450f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch index += shift; 45159151504615d929945dc59db37bf1166937748c6Steve Block badness += 1 - shift; // at most zero, so badness cannot increase. 452f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (index > subject_length - pattern_length) { 45359151504615d929945dc59db37bf1166937748c6Steve Block return -1; 45459151504615d929945dc59db37bf1166937748c6Steve Block } 45559151504615d929945dc59db37bf1166937748c6Steve Block } 45659151504615d929945dc59db37bf1166937748c6Steve Block j--; 457f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch while (j >= 0 && pattern[j] == (subject[index + j])) j--; 45859151504615d929945dc59db37bf1166937748c6Steve Block if (j < 0) { 459f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return index; 46059151504615d929945dc59db37bf1166937748c6Steve Block } else { 461f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch index += last_char_shift; 46259151504615d929945dc59db37bf1166937748c6Steve Block // Badness increases by the number of characters we have 46359151504615d929945dc59db37bf1166937748c6Steve Block // checked, and decreases by the number of characters we 46459151504615d929945dc59db37bf1166937748c6Steve Block // can skip by shifting. It's a measure of how we are doing 46559151504615d929945dc59db37bf1166937748c6Steve Block // compared to reading each character exactly once. 466f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch badness += (pattern_length - j) - last_char_shift; 46759151504615d929945dc59db37bf1166937748c6Steve Block if (badness > 0) { 468f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch search->PopulateBoyerMooreTable(); 469f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch search->strategy_ = &BoyerMooreSearch; 470f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return BoyerMooreSearch(search, subject, index); 47159151504615d929945dc59db37bf1166937748c6Steve Block } 47259151504615d929945dc59db37bf1166937748c6Steve Block } 47359151504615d929945dc59db37bf1166937748c6Steve Block } 47459151504615d929945dc59db37bf1166937748c6Steve Block return -1; 47559151504615d929945dc59db37bf1166937748c6Steve Block} 47659151504615d929945dc59db37bf1166937748c6Steve Block 47759151504615d929945dc59db37bf1166937748c6Steve Block 478f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochtemplate <typename PatternChar, typename SubjectChar> 479f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochvoid StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() { 480f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int pattern_length = pattern_.length(); 48159151504615d929945dc59db37bf1166937748c6Steve Block 482f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int* bad_char_occurrence = bad_char_table(); 483f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch 484f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Only preprocess at most kBMMaxShift last characters of pattern. 485f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int start = start_; 486f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Run forwards to populate bad_char_table, so that *last* instance 487f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // of character equivalence class is the one registered. 488f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch // Notice: Doesn't include the last character. 489f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int table_size = AlphabetSize(); 490f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (start == 0) { // All patterns less than kBMMaxShift in length. 491f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch memset(bad_char_occurrence, 492f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch -1, 493f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch table_size * sizeof(*bad_char_occurrence)); 494f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } else { 495f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch for (int i = 0; i < table_size; i++) { 496f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch bad_char_occurrence[i] = start - 1; 49759151504615d929945dc59db37bf1166937748c6Steve Block } 49859151504615d929945dc59db37bf1166937748c6Steve Block } 499f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch for (int i = start; i < pattern_length - 1; i++) { 500f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch PatternChar c = pattern_[i]; 501f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize(); 502f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch bad_char_occurrence[bucket] = i; 503f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 50459151504615d929945dc59db37bf1166937748c6Steve Block} 50559151504615d929945dc59db37bf1166937748c6Steve Block 506f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 507f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Linear string search with bailout to BMH. 508f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch//--------------------------------------------------------------------- 50959151504615d929945dc59db37bf1166937748c6Steve Block 510f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Simple linear search for short patterns, which bails out if the string 511f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. 51259151504615d929945dc59db37bf1166937748c6Steve Blocktemplate <typename PatternChar, typename SubjectChar> 513f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdochint StringSearch<PatternChar, SubjectChar>::InitialSearch( 514f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch StringSearch<PatternChar, SubjectChar>* search, 515f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const SubjectChar> subject, 516f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int index) { 517f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch Vector<const PatternChar> pattern = search->pattern_; 51859151504615d929945dc59db37bf1166937748c6Steve Block int pattern_length = pattern.length(); 51959151504615d929945dc59db37bf1166937748c6Steve Block // Badness is a count of how much work we have done. When we have 52059151504615d929945dc59db37bf1166937748c6Steve Block // done enough work we decide it's probably worth switching to a better 52159151504615d929945dc59db37bf1166937748c6Steve Block // algorithm. 52259151504615d929945dc59db37bf1166937748c6Steve Block int badness = -10 - (pattern_length << 2); 52359151504615d929945dc59db37bf1166937748c6Steve Block 52459151504615d929945dc59db37bf1166937748c6Steve Block // We know our pattern is at least 2 characters, we cache the first so 52559151504615d929945dc59db37bf1166937748c6Steve Block // the common case of the first character not matching is faster. 526f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch for (int i = index, n = subject.length() - pattern_length; i <= n; i++) { 52759151504615d929945dc59db37bf1166937748c6Steve Block badness++; 528f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (badness <= 0) { 529014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch i = FindFirstCharacter(pattern, subject, i); 530014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch if (i == -1) return -1; 531014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch DCHECK_LE(i, n); 532f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch int j = 1; 533f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch do { 534f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (pattern[j] != subject[i + j]) { 535f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch break; 536f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } 537f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch j++; 538f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch } while (j < pattern_length); 539f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch if (j == pattern_length) { 540f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return i; 54159151504615d929945dc59db37bf1166937748c6Steve Block } 542f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch badness += j; 54359151504615d929945dc59db37bf1166937748c6Steve Block } else { 544f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch search->PopulateBoyerMooreHorspoolTable(); 545f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch search->strategy_ = &BoyerMooreHorspoolSearch; 546f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return BoyerMooreHorspoolSearch(search, subject, i); 54759151504615d929945dc59db37bf1166937748c6Steve Block } 54859151504615d929945dc59db37bf1166937748c6Steve Block } 54959151504615d929945dc59db37bf1166937748c6Steve Block return -1; 55059151504615d929945dc59db37bf1166937748c6Steve Block} 55159151504615d929945dc59db37bf1166937748c6Steve Block 55259151504615d929945dc59db37bf1166937748c6Steve Block 553f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// Perform a a single stand-alone search. 554f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// If searching multiple times for the same pattern, a search 555f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// object should be constructed once and the Search function then called 556f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch// for each search. 55759151504615d929945dc59db37bf1166937748c6Steve Blocktemplate <typename SubjectChar, typename PatternChar> 5583ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdochint SearchString(Isolate* isolate, 5593ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch Vector<const SubjectChar> subject, 5603ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch Vector<const PatternChar> pattern, 5613ef787dbeca8a5fb1086949cda830dccee07bfbdBen Murdoch int start_index) { 56244f0eee88ff00398ff7f715fab053374d808c90dSteve Block StringSearch<PatternChar, SubjectChar> search(isolate, pattern); 563f87a203d89e1bbb6708282e0b64dbd13d59b723dBen Murdoch return search.Search(subject, start_index); 56459151504615d929945dc59db37bf1166937748c6Steve Block} 56559151504615d929945dc59db37bf1166937748c6Steve Block 566014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch} // namespace internal 567014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch} // namespace v8 56859151504615d929945dc59db37bf1166937748c6Steve Block 56959151504615d929945dc59db37bf1166937748c6Steve Block#endif // V8_STRING_SEARCH_H_ 570