1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved. 2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file. 4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/history/snippet.h" 6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <algorithm> 8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 9c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h" 10ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_ptr.h" 113345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/string_split.h" 12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h" 13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h" 14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/brkiter.h" 15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/utext.h" 16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/utf8.h" 17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace { 19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool PairFirstLessThan(const Snippet::MatchPosition& a, 21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const Snippet::MatchPosition& b) { 22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return a.first < b.first; 23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Combines all pairs after offset in match_positions that are contained 26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// or touch the pair at offset. 27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CoalescePositionsFrom(size_t offset, 28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) { 29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(offset < match_positions->size()); 30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPosition& pair((*match_positions)[offset]); 31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++offset; 32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while (offset < match_positions->size() && 33c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch pair.second >= (*match_positions)[offset].first) { 34c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch pair.second = std::max(pair.second, (*match_positions)[offset].second); 35c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->erase(match_positions->begin() + offset); 36c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 37c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 38c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 39c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Makes sure there is a pair in match_positions that contains the specified 40c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// range. This keeps the pairs ordered in match_positions by first, and makes 41c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// sure none of the pairs in match_positions touch each other. 42c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid AddMatch(size_t start, 43c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t end, 44c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) { 45c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(start < end); 46c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(match_positions); 47c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPosition pair(start, end); 48c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (match_positions->empty()) { 49c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->push_back(pair); 50c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 51c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 52c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // There's at least one match. Find the position of the new match, 53c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // potentially extending pairs around it. 54c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions::iterator i = 55c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::lower_bound(match_positions->begin(), match_positions->end(), 56c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch pair, &PairFirstLessThan); 57c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (i != match_positions->end() && i->first == start) { 58c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Match not at the end and there is already a pair with the same 59c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // start. 60c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (end > i->second) { 61c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // New pair extends beyond existing pair. Extend existing pair and 62c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // coalesce matches after it. 63c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->second = end; 64c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CoalescePositionsFrom(i - match_positions->begin(), match_positions); 65c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } // else case, new pair completely contained in existing pair, nothing 66c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // to do. 67c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else if (i == match_positions->begin()) { 68c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Match at the beginning and the first pair doesn't have the same 69c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // start. Insert new pair and coalesce matches after it. 70c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->insert(i, pair); 71c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CoalescePositionsFrom(0, match_positions); 72c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 73c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Not at the beginning (but may be at the end). 74c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch --i; 75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (start <= i->second && end > i->second) { 76c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Previous element contains match. Extend it and coalesce. 77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->second = end; 78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CoalescePositionsFrom(i - match_positions->begin(), match_positions); 79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else if (end > i->second) { 80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Region doesn't touch previous element. See if region touches current 81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // element. 82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++i; 83c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (i == match_positions->end() || end < i->first) { 84c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions->insert(i, pair); 85c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 86c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->first = start; 87c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->second = end; 88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CoalescePositionsFrom(i - match_positions->begin(), match_positions); 89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 90c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 94c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Converts an index in a utf8 string into the index in the corresponding utf16 95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// string and returns the utf16 index. This is intended to be called in a loop 96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// iterating through a utf8 string. 97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// 98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// utf8_string: the utf8 string. 99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// utf8_length: length of the utf8 string. 100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// offset: the utf8 offset to convert. 101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// utf8_pos: current offset in the utf8 string. This is modified and on return 102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// matches offset. 103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// wide_pos: current index in the wide string. This is the same as the return 104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// value. 105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochsize_t AdvanceAndReturnUTF16Pos(const char* utf8_string, 106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int32_t utf8_length, 107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int32_t offset, 108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int32_t* utf8_pos, 109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t* utf16_pos) { 110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(offset >= *utf8_pos && offset <= utf8_length); 111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch UChar32 wide_char; 113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while (*utf8_pos < offset) { 114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char); 115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2; 116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return *utf16_pos; 118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Given a character break iterator over a UTF-8 string, set the iterator 121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// position to |*utf8_pos| and move by |count| characters. |count| can 122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// be either positive or negative. 123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) { 124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Ignore the return value. A side effect of the current position 125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // being set at or following |*utf8_pos| is exploited here. 126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // It's simpler than calling following(n) and then previous(). 127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // isBoundary() is not very fast, but should be good enough for the 128c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // snippet generation. If not, revisit the way we scan in ComputeSnippet. 129c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bi->isBoundary(*utf8_pos); 130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bi->next(count); 131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *utf8_pos = static_cast<size_t>(bi->current()); 132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 133c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 134c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// The amount of context to include for a given hit. Note that it's counted 135c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// in terms of graphemes rather than bytes. 136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochconst int kSnippetContext = 50; 137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 138c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if next match falls within a snippet window 139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// from the previous match. The window size is counted in terms 140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// of graphemes rather than bytes in UTF-8. 141c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi, 142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t previous_match_end, 143c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t next_match_start) { 144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // If it's within a window in terms of bytes, it's certain 145c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // that it's within a window in terms of graphemes as well. 146c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (next_match_start < previous_match_end + kSnippetContext) 147c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bi->isBoundary(previous_match_end); 149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // An alternative to this is to call |bi->next()| at most 150c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // kSnippetContext times, compare |bi->current()| with |next_match_start| 151c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // after each call and return early if possible. There are other 152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // heuristics to speed things up if necessary, but it's not likely that 153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // we need to bother. 154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bi->next(kSnippetContext); 155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int64 current = bi->current(); 156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return (next_match_start < static_cast<uint64>(current) || 157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch current == icu::BreakIterator::DONE); 158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} // namespace 161c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// static 163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid Snippet::ExtractMatchPositions(const std::string& offsets_str, 164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::string& column_num, 165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch MatchPositions* match_positions) { 166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(match_positions); 167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (offsets_str.empty()) 168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::vector<std::string> offsets; 170731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick base::SplitString(offsets_str, ' ', &offsets); 171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // SQLite offsets are sets of four integers: 172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // column, query term, match offset, match length 173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Matches within a string are marked by (start, end) pairs. 174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < offsets.size() - 3; i += 4) { 175c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (offsets[i] != column_num) 176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch continue; 177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t start = atoi(offsets[i + 2].c_str()); 178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t end = start + atoi(offsets[i + 3].c_str()); 179c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Switch to DCHECK after debugging http://crbug.com/15261. 180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(end >= start); 181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch AddMatch(start, end, match_positions); 182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// static 186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid Snippet::ConvertMatchPositionsToWide( 187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::string& utf8_string, 188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch Snippet::MatchPositions* match_positions) { 189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(match_positions); 190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int32_t utf8_pos = 0; 191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t utf16_pos = 0; 192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const char* utf8_cstring = utf8_string.c_str(); 193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const int32_t utf8_length = static_cast<int32_t>(utf8_string.size()); 194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (Snippet::MatchPositions::iterator i = match_positions->begin(); 195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i != match_positions->end(); ++i) { 196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, 197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->first, &utf8_pos, &utf16_pos); 198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, 199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch i->second, &utf8_pos, &utf16_pos); 200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 203731df977c0511bca2206b5f333555b1205ff1f43Iain MerrickSnippet::Snippet() { 204731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick} 205731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick 206731df977c0511bca2206b5f333555b1205ff1f43Iain MerrickSnippet::~Snippet() { 207731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick} 208731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick 209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid Snippet::ComputeSnippet(const MatchPositions& match_positions, 210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::string& document) { 211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // The length of snippets we try to produce. 212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We can generate longer snippets but stop once we cross kSnippetMaxLength. 213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t kSnippetMaxLength = 200; 214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const string16 kEllipsis = ASCIIToUTF16(" ... "); 215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch UText* document_utext = NULL; 217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch UErrorCode status = U_ZERO_ERROR; 218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch document_utext = utext_openUTF8(document_utext, document.data(), 219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch document.size(), &status); 220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Locale does not matter because there's no per-locale customization 221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // for character iterator. 222c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance( 223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch icu::Locale::getDefault(), status)); 224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bi->setText(document_utext, status); 225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(U_SUCCESS(status)); 226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We build the snippet by iterating through the matches and then grabbing 228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // context around each match. If matches are near enough each other (within 229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // kSnippetContext), we skip the "..." between them. 230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch string16 snippet; 231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t start = 0; 232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = 0; i < match_positions.size(); ++i) { 233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Some shorter names for the current match. 234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t match_start = match_positions[i].first; 235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t match_end = match_positions[i].second; 236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Switch to DCHECK after debugging http://crbug.com/15261. 238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(match_end > match_start); 239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(match_end <= document.size()); 240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Add the context, if any, to show before the match. 242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t context_start = match_start; 243c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start); 244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch start = std::max(start, context_start); 245c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (start < match_start) { 246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (start > 0) 247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch snippet += kEllipsis; 248c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Switch to DCHECK after debugging http://crbug.com/15261. 249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(start < document.size()); 250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch snippet += UTF8ToUTF16(document.substr(start, match_start - start)); 251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Add the match. 254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t first = snippet.size(); 255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch snippet += UTF8ToUTF16(document.substr(match_start, 256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_end - match_start)); 257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch matches_.push_back(std::make_pair(first, snippet.size())); 258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Compute the context, if any, to show after the match. 260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t end; 261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Check if the next match falls within our snippet window. 262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (i + 1 < match_positions.size() && 263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch IsNextMatchWithinSnippetWindow(bi.get(), match_end, 264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch match_positions[i + 1].first)) { 265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Yes, it's within the window. Make the end context extend just up 266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // to the next match. 267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch end = match_positions[i + 1].first; 268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Switch to DCHECK after debugging http://crbug.com/15261. 269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(end >= match_end); 270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(end <= document.size()); 271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch snippet += UTF8ToUTF16(document.substr(match_end, end - match_end)); 272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } else { 273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // No, there's either no next match or the next match is too far away. 274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch end = match_end; 275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch MoveByNGraphemes(bi.get(), kSnippetContext, &end); 276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Switch to DCHECK after debugging http://crbug.com/15261. 277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(end >= match_end); 278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch CHECK(end <= document.size()); 279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch snippet += UTF8ToUTF16(document.substr(match_end, end - match_end)); 280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (end < document.size()) 281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch snippet += kEllipsis; 282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch start = end; 284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Stop here if we have enough snippet computed. 286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (snippet.size() >= kSnippetMaxLength) 287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch break; 288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch utext_close(document_utext); 291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch swap(text_, snippet); 292c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 293731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick 294731df977c0511bca2206b5f333555b1205ff1f43Iain Merrickvoid Snippet::Swap(Snippet* other) { 295731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick text_.swap(other->text_); 296731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick matches_.swap(other->matches_); 297731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick} 298