10529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch// Copyright 2014 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 50529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch#include "components/query_parser/snippet.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "base/strings/string_split.h" 12868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h" 13868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h" 14ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/brkiter.h" 15ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/utext.h" 16ca12bfac764ba476d6cd062bf1dde12cc64c3f40Ben Murdoch#include "third_party/icu/source/common/unicode/utf8.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 180529e5d033099cbfc42635f6f6183833b09dff6eBen Murdochnamespace query_parser { 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool PairFirstLessThan(const Snippet::MatchPosition& a, 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Snippet::MatchPosition& b) { 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a.first < b.first; 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Combines all pairs after offset in match_positions that are contained 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// or touch the pair at offset. 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void CoalescePositionsFrom(size_t offset, 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions* match_positions) { 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(offset < match_positions->size()); 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPosition& pair((*match_positions)[offset]); 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++offset; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (offset < match_positions->size() && 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pair.second >= (*match_positions)[offset].first) { 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pair.second = std::max(pair.second, (*match_positions)[offset].second); 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions->erase(match_positions->begin() + offset); 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Makes sure there is a pair in match_positions that contains the specified 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// range. This keeps the pairs ordered in match_positions by first, and makes 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sure none of the pairs in match_positions touch each other. 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void AddMatch(size_t start, 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t end, 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions* match_positions) { 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(start < end); 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(match_positions); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPosition pair(start, end); 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (match_positions->empty()) { 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions->push_back(pair); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // There's at least one match. Find the position of the new match, 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // potentially extending pairs around it. 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions::iterator i = 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::lower_bound(match_positions->begin(), match_positions->end(), 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pair, &PairFirstLessThan); 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i != match_positions->end() && i->first == start) { 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Match not at the end and there is already a pair with the same 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // start. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (end > i->second) { 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // New pair extends beyond existing pair. Extend existing pair and 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // coalesce matches after it. 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i->second = end; 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CoalescePositionsFrom(i - match_positions->begin(), match_positions); 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } // else case, new pair completely contained in existing pair, nothing 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to do. 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (i == match_positions->begin()) { 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Match at the beginning and the first pair doesn't have the same 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // start. Insert new pair and coalesce matches after it. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions->insert(i, pair); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CoalescePositionsFrom(0, match_positions); 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Not at the beginning (but may be at the end). 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) --i; 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (start <= i->second && end > i->second) { 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Previous element contains match. Extend it and coalesce. 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i->second = end; 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CoalescePositionsFrom(i - match_positions->begin(), match_positions); 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (end > i->second) { 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Region doesn't touch previous element. See if region touches current 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // element. 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++i; 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i == match_positions->end() || end < i->first) { 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions->insert(i, pair); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i->first = start; 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i->second = end; 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CoalescePositionsFrom(i - match_positions->begin(), match_positions); 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Converts an index in a utf8 string into the index in the corresponding utf16 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// string and returns the utf16 index. This is intended to be called in a loop 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// iterating through a utf8 string. 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// utf8_string: the utf8 string. 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// utf8_length: length of the utf8 string. 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// offset: the utf8 offset to convert. 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// utf8_pos: current offset in the utf8 string. This is modified and on return 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matches offset. 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// wide_pos: current index in the wide string. This is the same as the return 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// value. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)size_t AdvanceAndReturnUTF16Pos(const char* utf8_string, 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32_t utf8_length, 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32_t offset, 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32_t* utf8_pos, 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t* utf16_pos) { 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(offset >= *utf8_pos && offset <= utf8_length); 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UChar32 wide_char; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (*utf8_pos < offset) { 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char); 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *utf16_pos += (wide_char <= 0xFFFF) ? 1 : 2; 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return *utf16_pos; 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a character break iterator over a UTF-8 string, set the iterator 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// position to |*utf8_pos| and move by |count| characters. |count| can 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// be either positive or negative. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void MoveByNGraphemes(icu::BreakIterator* bi, int count, size_t* utf8_pos) { 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Ignore the return value. A side effect of the current position 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // being set at or following |*utf8_pos| is exploited here. 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // It's simpler than calling following(n) and then previous(). 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // isBoundary() is not very fast, but should be good enough for the 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // snippet generation. If not, revisit the way we scan in ComputeSnippet. 1300529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch bi->isBoundary(static_cast<int32_t>(*utf8_pos)); 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bi->next(count); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *utf8_pos = static_cast<size_t>(bi->current()); 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The amount of context to include for a given hit. Note that it's counted 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in terms of graphemes rather than bytes. 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kSnippetContext = 50; 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true if next match falls within a snippet window 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// from the previous match. The window size is counted in terms 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of graphemes rather than bytes in UTF-8. 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool IsNextMatchWithinSnippetWindow(icu::BreakIterator* bi, 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t previous_match_end, 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t next_match_start) { 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If it's within a window in terms of bytes, it's certain 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that it's within a window in terms of graphemes as well. 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next_match_start < previous_match_end + kSnippetContext) 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1490529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch bi->isBoundary(static_cast<int32_t>(previous_match_end)); 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // An alternative to this is to call |bi->next()| at most 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kSnippetContext times, compare |bi->current()| with |next_match_start| 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // after each call and return early if possible. There are other 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // heuristics to speed things up if necessary, but it's not likely that 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we need to bother. 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bi->next(kSnippetContext); 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 current = bi->current(); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (next_match_start < static_cast<uint64>(current) || 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) current == icu::BreakIterator::DONE); 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Snippet::ExtractMatchPositions(const std::string& offsets_str, 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const std::string& column_num, 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MatchPositions* match_positions) { 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(match_positions); 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (offsets_str.empty()) 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<std::string> offsets; 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::SplitString(offsets_str, ' ', &offsets); 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // SQLite offsets are sets of four integers: 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // column, query term, match offset, match length 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Matches within a string are marked by (start, end) pairs. 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < offsets.size() - 3; i += 4) { 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (offsets[i] != column_num) 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t start = atoi(offsets[i + 2].c_str()); 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t end = start + atoi(offsets[i + 3].c_str()); 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Switch to DCHECK after debugging http://crbug.com/15261. 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(end >= start); 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddMatch(start, end, match_positions); 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Snippet::ConvertMatchPositionsToWide( 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const std::string& utf8_string, 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Snippet::MatchPositions* match_positions) { 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(match_positions); 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int32_t utf8_pos = 0; 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t utf16_pos = 0; 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* utf8_cstring = utf8_string.c_str(); 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int32_t utf8_length = static_cast<int32_t>(utf8_string.size()); 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Snippet::MatchPositions::iterator i = match_positions->begin(); 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i != match_positions->end(); ++i) { 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i->first = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, 1980529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch static_cast<int32_t>(i->first), 1990529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &utf8_pos, &utf16_pos); 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i->second = AdvanceAndReturnUTF16Pos(utf8_cstring, utf8_length, 2010529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch static_cast<int32_t>(i->second), 2020529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch &utf8_pos, &utf16_pos); 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Snippet::Snippet() { 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Snippet::~Snippet() { 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Snippet::ComputeSnippet(const MatchPositions& match_positions, 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const std::string& document) { 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The length of snippets we try to produce. 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We can generate longer snippets but stop once we cross kSnippetMaxLength. 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t kSnippetMaxLength = 200; 2175d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) const base::string16 kEllipsis = base::ASCIIToUTF16(" ... "); 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UText* document_utext = NULL; 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) UErrorCode status = U_ZERO_ERROR; 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) document_utext = utext_openUTF8(document_utext, document.data(), 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) document.size(), &status); 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Locale does not matter because there's no per-locale customization 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // for character iterator. 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<icu::BreakIterator> bi(icu::BreakIterator::createCharacterInstance( 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) icu::Locale::getDefault(), status)); 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bi->setText(document_utext, status); 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(U_SUCCESS(status)); 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We build the snippet by iterating through the matches and then grabbing 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // context around each match. If matches are near enough each other (within 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kSnippetContext), we skip the "..." between them. 233a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::string16 snippet; 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t start = 0; 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < match_positions.size(); ++i) { 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Some shorter names for the current match. 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t match_start = match_positions[i].first; 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t match_end = match_positions[i].second; 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Switch to DCHECK after debugging http://crbug.com/15261. 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(match_end > match_start); 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(match_end <= document.size()); 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add the context, if any, to show before the match. 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t context_start = match_start; 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start); 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = std::max(start, context_start); 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (start < match_start) { 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (start > 0) 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) snippet += kEllipsis; 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Switch to DCHECK after debugging http://crbug.com/15261. 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(start < document.size()); 2535d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) snippet += base::UTF8ToUTF16(document.substr(start, match_start - start)); 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add the match. 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t first = snippet.size(); 2585d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) snippet += base::UTF8ToUTF16(document.substr(match_start, 2595d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) match_end - match_start)); 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matches_.push_back(std::make_pair(first, snippet.size())); 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compute the context, if any, to show after the match. 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t end; 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Check if the next match falls within our snippet window. 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i + 1 < match_positions.size() && 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) IsNextMatchWithinSnippetWindow(bi.get(), match_end, 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match_positions[i + 1].first)) { 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Yes, it's within the window. Make the end context extend just up 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to the next match. 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) end = match_positions[i + 1].first; 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Switch to DCHECK after debugging http://crbug.com/15261. 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(end >= match_end); 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(end <= document.size()); 2745d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end)); 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // No, there's either no next match or the next match is too far away. 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) end = match_end; 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MoveByNGraphemes(bi.get(), kSnippetContext, &end); 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Switch to DCHECK after debugging http://crbug.com/15261. 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(end >= match_end); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CHECK(end <= document.size()); 2825d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) snippet += base::UTF8ToUTF16(document.substr(match_end, end - match_end)); 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (end < document.size()) 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) snippet += kEllipsis; 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = end; 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Stop here if we have enough snippet computed. 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (snippet.size() >= kSnippetMaxLength) 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) utext_close(document_utext); 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) swap(text_, snippet); 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Snippet::Swap(Snippet* other) { 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) text_.swap(other->text_); 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matches_.swap(other->matches_); 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3010529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch 3020529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch} // namespace query_parser 303