1ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be
3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file.
4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/history/snippet.h"
6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <algorithm>
8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
9c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h"
10ddb351dbec246cf1fab5ec20d2d5520909041de1Kristian Monsen#include "base/memory/scoped_ptr.h"
113345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/string_split.h"
12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h"
13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h"
14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/brkiter.h"
15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/utext.h"
16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "unicode/utf8.h"
17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace {
19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool PairFirstLessThan(const Snippet::MatchPosition& a,
21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                       const Snippet::MatchPosition& b) {
22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return a.first < b.first;
23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Combines all pairs after offset in match_positions that are contained
26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// or touch the pair at offset.
27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid CoalescePositionsFrom(size_t offset,
28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                           Snippet::MatchPositions* match_positions) {
29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(offset < match_positions->size());
30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Snippet::MatchPosition& pair((*match_positions)[offset]);
31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  ++offset;
32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while (offset < match_positions->size() &&
33c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         pair.second >= (*match_positions)[offset].first) {
34c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    pair.second = std::max(pair.second, (*match_positions)[offset].second);
35c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    match_positions->erase(match_positions->begin() + offset);
36c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
37c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
38c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
39c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Makes sure there is a pair in match_positions that contains the specified
40c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// range. This keeps the pairs ordered in match_positions by first, and makes
41c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// sure none of the pairs in match_positions touch each other.
42c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid AddMatch(size_t start,
43c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch              size_t end,
44c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch              Snippet::MatchPositions* match_positions) {
45c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(start < end);
46c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(match_positions);
47c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Snippet::MatchPosition pair(start, end);
48c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (match_positions->empty()) {
49c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    match_positions->push_back(pair);
50c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
51c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
52c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // There's at least one match. Find the position of the new match,
53c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // potentially extending pairs around it.
54c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  Snippet::MatchPositions::iterator i =
55c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      std::lower_bound(match_positions->begin(), match_positions->end(),
56c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                       pair, &PairFirstLessThan);
57c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (i != match_positions->end() && i->first == start) {
58c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Match not at the end and there is already a pair with the same
59c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // start.
60c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (end > i->second) {
61c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // New pair extends beyond existing pair. Extend existing pair and
62c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // coalesce matches after it.
63c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      i->second = end;
64c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CoalescePositionsFrom(i - match_positions->begin(), match_positions);
65c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }  // else case, new pair completely contained in existing pair, nothing
66c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch       // to do.
67c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  } else if (i == match_positions->begin()) {
68c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Match at the beginning and the first pair doesn't have the same
69c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // start. Insert new pair and coalesce matches after it.
70c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    match_positions->insert(i, pair);
71c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CoalescePositionsFrom(0, match_positions);
72c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  } else {
73c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Not at the beginning (but may be at the end).
74c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    --i;
75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (start <= i->second && end > i->second) {
76c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // Previous element contains match. Extend it and coalesce.
77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      i->second = end;
78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CoalescePositionsFrom(i - match_positions->begin(), match_positions);
79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    } else if (end > i->second) {
80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // Region doesn't touch previous element. See if region touches current
81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // element.
82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      ++i;
83c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (i == match_positions->end() || end < i->first) {
84c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        match_positions->insert(i, pair);
85c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      } else {
86c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        i->first = start;
87c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        i->second = end;
88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        CoalescePositionsFrom(i - match_positions->begin(), match_positions);
89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      }
90c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
94c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Converts an index in a utf8 string into the index in the corresponding utf16
95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// string and returns the utf16 index. This is intended to be called in a loop
96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// iterating through a utf8 string.
97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//
98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// utf8_string: the utf8 string.
99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// utf8_length: length of the utf8 string.
100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// offset: the utf8 offset to convert.
101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// utf8_pos: current offset in the utf8 string. This is modified and on return
102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//           matches offset.
103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// wide_pos: current index in the wide string. This is the same as the return
104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch//           value.
105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochsize_t AdvanceAndReturnUTF16Pos(const char* utf8_string,
106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                int32_t utf8_length,
107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                int32_t offset,
108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                int32_t* utf8_pos,
109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                size_t* utf16_pos) {
110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(offset >= *utf8_pos && offset <= utf8_length);
111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  UChar32 wide_char;
113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while (*utf8_pos < offset) {
114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char);
115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2;
116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return *utf16_pos;
118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Given a character break iterator over a UTF-8 string, set the iterator
121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// position to |*utf8_pos| and move by |count| characters. |count| can
122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// be either positive or negative.
123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) {
124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Ignore the return value. A side effect of the current position
125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // being set at or following |*utf8_pos| is exploited here.
126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // It's simpler than calling following(n) and then previous().
127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // isBoundary() is not very fast, but should be good enough for the
128c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // snippet generation. If not, revisit the way we scan in ComputeSnippet.
129c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bi->isBoundary(*utf8_pos);
130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bi->next(count);
131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  *utf8_pos = static_cast<size_t>(bi->current());
132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
133c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
134c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// The amount of context to include for a given hit. Note that it's counted
135c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// in terms of graphemes rather than bytes.
136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochconst int kSnippetContext = 50;
137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
138c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Returns true if next match falls within a snippet window
139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// from the previous match. The window size is counted in terms
140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// of graphemes rather than bytes in UTF-8.
141c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochbool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi,
142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    size_t previous_match_end,
143c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    size_t next_match_start) {
144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // If it's within a window in terms of bytes, it's certain
145c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // that it's within a window in terms of graphemes as well.
146c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (next_match_start < previous_match_end + kSnippetContext)
147c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return true;
148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bi->isBoundary(previous_match_end);
149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // An alternative to this is to call |bi->next()| at most
150c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // kSnippetContext times, compare |bi->current()| with |next_match_start|
151c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // after each call and return early if possible. There are other
152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // heuristics to speed things up if necessary, but it's not likely that
153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // we need to bother.
154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bi->next(kSnippetContext);
155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int64 current = bi->current();
156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return (next_match_start < static_cast<uint64>(current) ||
157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          current == icu::BreakIterator::DONE);
158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}  // namespace
161c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// static
163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid Snippet::ExtractMatchPositions(const std::string& offsets_str,
164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    const std::string& column_num,
165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    MatchPositions* match_positions) {
166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(match_positions);
167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (offsets_str.empty())
168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::vector<std::string> offsets;
170731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick  base::SplitString(offsets_str, ' ', &offsets);
171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // SQLite offsets are sets of four integers:
172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  //   column, query term, match offset, match length
173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Matches within a string are marked by (start, end) pairs.
174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < offsets.size() - 3; i += 4) {
175c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (offsets[i] != column_num)
176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      continue;
177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const size_t start = atoi(offsets[i + 2].c_str());
178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const size_t end = start + atoi(offsets[i + 3].c_str());
179c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Switch to DCHECK after debugging http://crbug.com/15261.
180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CHECK(end >= start);
181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    AddMatch(start, end, match_positions);
182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// static
186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid Snippet::ConvertMatchPositionsToWide(
187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const std::string& utf8_string,
188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    Snippet::MatchPositions* match_positions) {
189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(match_positions);
190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int32_t utf8_pos = 0;
191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  size_t utf16_pos = 0;
192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const char* utf8_cstring = utf8_string.c_str();
193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const int32_t utf8_length = static_cast<int32_t>(utf8_string.size());
194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (Snippet::MatchPositions::iterator i = match_positions->begin();
195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch       i != match_positions->end(); ++i) {
196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                        i->first, &utf8_pos, &utf16_pos);
198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length,
199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                         i->second, &utf8_pos, &utf16_pos);
200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
203731df977c0511bca2206b5f333555b1205ff1f43Iain MerrickSnippet::Snippet() {
204731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick}
205731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick
206731df977c0511bca2206b5f333555b1205ff1f43Iain MerrickSnippet::~Snippet() {
207731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick}
208731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick
209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochvoid Snippet::ComputeSnippet(const MatchPositions& match_positions,
210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                             const std::string& document) {
211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // The length of snippets we try to produce.
212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We can generate longer snippets but stop once we cross kSnippetMaxLength.
213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const size_t kSnippetMaxLength = 200;
214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const string16 kEllipsis = ASCIIToUTF16(" ... ");
215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  UText* document_utext = NULL;
217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  UErrorCode status = U_ZERO_ERROR;
218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  document_utext = utext_openUTF8(document_utext, document.data(),
219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                  document.size(), &status);
220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Locale does not matter because there's no per-locale customization
221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // for character iterator.
222c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance(
223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      icu::Locale::getDefault(), status));
224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bi->setText(document_utext, status);
225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(U_SUCCESS(status));
226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We build the snippet by iterating through the matches and then grabbing
228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // context around each match.  If matches are near enough each other (within
229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // kSnippetContext), we skip the "..." between them.
230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  string16 snippet;
231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  size_t start = 0;
232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = 0; i < match_positions.size(); ++i) {
233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Some shorter names for the current match.
234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const size_t match_start = match_positions[i].first;
235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const size_t match_end = match_positions[i].second;
236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Switch to DCHECK after debugging http://crbug.com/15261.
238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CHECK(match_end > match_start);
239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    CHECK(match_end <= document.size());
240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Add the context, if any, to show before the match.
242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    size_t context_start = match_start;
243c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start);
244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    start = std::max(start, context_start);
245c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (start < match_start) {
246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (start > 0)
247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        snippet += kEllipsis;
248c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // Switch to DCHECK after debugging http://crbug.com/15261.
249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CHECK(start < document.size());
250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      snippet += UTF8ToUTF16(document.substr(start, match_start - start));
251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Add the match.
254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const size_t first = snippet.size();
255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    snippet += UTF8ToUTF16(document.substr(match_start,
256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                          match_end - match_start));
257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    matches_.push_back(std::make_pair(first, snippet.size()));
258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Compute the context, if any, to show after the match.
260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    size_t end;
261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Check if the next match falls within our snippet window.
262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (i + 1 < match_positions.size() &&
263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        IsNextMatchWithinSnippetWindow(bi.get(), match_end,
264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch            match_positions[i + 1].first)) {
265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // Yes, it's within the window.  Make the end context extend just up
266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // to the next match.
267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      end = match_positions[i + 1].first;
268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // Switch to DCHECK after debugging http://crbug.com/15261.
269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CHECK(end >= match_end);
270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CHECK(end <= document.size());
271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    } else {
273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // No, there's either no next match or the next match is too far away.
274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      end = match_end;
275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      MoveByNGraphemes(bi.get(), kSnippetContext, &end);
276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      // Switch to DCHECK after debugging http://crbug.com/15261.
277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CHECK(end >= match_end);
278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      CHECK(end <= document.size());
279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      snippet += UTF8ToUTF16(document.substr(match_end, end - match_end));
280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      if (end < document.size())
281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        snippet += kEllipsis;
282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    start = end;
284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Stop here if we have enough snippet computed.
286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (snippet.size() >= kSnippetMaxLength)
287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      break;
288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  utext_close(document_utext);
291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  swap(text_, snippet);
292c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
293731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick
294731df977c0511bca2206b5f333555b1205ff1f43Iain Merrickvoid Snippet::Swap(Snippet* other) {
295731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick  text_.swap(other->text_);
296731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick  matches_.swap(other->matches_);
297731df977c0511bca2206b5f333555b1205ff1f43Iain Merrick}
298