15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/in_memory_url_index_types.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm>
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <functional>
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <iterator>
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <numeric>
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/break_iterator.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/case_conversion.h"
15868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "net/base/escape.h"
172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "net/base/net_util.h"
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace history {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Matches within URL and Title Strings ----------------------------------------
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
23a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)TermMatches MatchTermInString(const base::string16& term,
24a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)                              const base::string16& cleaned_string,
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                              int term_num) {
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const size_t kMaxCompareLength = 2048;
27a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  const base::string16& short_string =
28a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      (cleaned_string.length() > kMaxCompareLength) ?
292a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      cleaned_string.substr(0, kMaxCompareLength) : cleaned_string;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  TermMatches matches;
31a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  for (size_t location = short_string.find(term);
32a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)       location != base::string16::npos;
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       location = short_string.find(term, location + 1))
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    matches.push_back(TermMatch(term_num, location, term.length()));
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return matches;
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Comparison function for sorting TermMatches by their offsets.
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) {
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return m1.offset < m2.offset;
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TermMatches SortAndDeoverlapMatches(const TermMatches& matches) {
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (matches.empty())
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return matches;
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  TermMatches sorted_matches = matches;
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess);
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  TermMatches clean_matches;
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  TermMatch last_match;
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (TermMatches::const_iterator iter = sorted_matches.begin();
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       iter != sorted_matches.end(); ++iter) {
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (iter->offset >= last_match.offset + last_match.length) {
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      last_match = *iter;
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      clean_matches.push_back(last_match);
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return clean_matches;
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches) {
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<size_t> offsets;
625c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  for (TermMatches::const_iterator i = matches.begin(); i != matches.end();
635c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu       ++i) {
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    offsets.push_back(i->offset);
655c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    offsets.push_back(i->offset + i->length);
665c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  }
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return offsets;
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                        const std::vector<size_t>& offsets) {
725c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  DCHECK_EQ(2 * matches.size(), offsets.size());
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  TermMatches new_matches;
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<size_t>::const_iterator offset_iter = offsets.begin();
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (TermMatches::const_iterator term_iter = matches.begin();
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       term_iter != matches.end(); ++term_iter, ++offset_iter) {
775c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    const size_t starting_offset = *offset_iter;
785c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    ++offset_iter;
795c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    const size_t ending_offset = *offset_iter;
805c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    if ((starting_offset != base::string16::npos) &&
815c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu        (ending_offset != base::string16::npos) &&
825c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu        (starting_offset != ending_offset)) {
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      TermMatch new_match(*term_iter);
845c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu      new_match.offset = starting_offset;
855c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu      new_match.length = ending_offset - starting_offset;
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      new_matches.push_back(new_match);
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return new_matches;
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Utility Functions -----------------------------------------------------------
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
94a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                    WordStarts* word_starts) {
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  String16Vector words =
972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      String16VectorFromString16(cleaned_uni_string, false, word_starts);
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  String16Set word_set;
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (String16Vector::const_iterator iter = words.begin(); iter != words.end();
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       ++iter)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word_set.insert(base::i18n::ToLower(*iter).substr(0, kMaxSignificantChars));
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return word_set;
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
105a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)String16Vector String16VectorFromString16(
106a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const base::string16& cleaned_uni_string,
107a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    bool break_on_space,
108a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    WordStarts* word_starts) {
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (word_starts)
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    word_starts->clear();
1112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  base::i18n::BreakIterator iter(cleaned_uni_string,
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break_on_space ? base::i18n::BreakIterator::BREAK_SPACE :
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          base::i18n::BreakIterator::BREAK_WORD);
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  String16Vector words;
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!iter.Init())
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return words;
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (iter.Advance()) {
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (break_on_space || iter.IsWord()) {
119a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      base::string16 word(iter.GetString());
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      size_t initial_whitespace = 0;
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (break_on_space) {
122a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)        base::string16 trimmed_word;
123a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        base::TrimWhitespace(word, base::TRIM_LEADING, &trimmed_word);
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        initial_whitespace = word.length() - trimmed_word.length();
125a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)        base::TrimWhitespace(trimmed_word, base::TRIM_TRAILING, &word);
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (word.empty())
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      words.push_back(word);
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (!word_starts)
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      size_t word_start = iter.prev() + initial_whitespace;
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (word_start < kMaxSignificantChars)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        word_starts->push_back(word_start);
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return words;
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
140a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)Char16Set Char16SetFromString16(const base::string16& term) {
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Char16Set characters;
1425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  for (base::string16::const_iterator iter = term.begin(); iter != term.end();
1435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)       ++iter)
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    characters.insert(*iter);
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return characters;
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1482a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// HistoryInfoMapValue ---------------------------------------------------------
1492a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)HistoryInfoMapValue::HistoryInfoMapValue() {}
1512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)HistoryInfoMapValue::~HistoryInfoMapValue() {}
1522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// RowWordStarts ---------------------------------------------------------------
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)RowWordStarts::RowWordStarts() {}
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)RowWordStarts::~RowWordStarts() {}
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void RowWordStarts::Clear() {
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  url_word_starts_.clear();
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  title_word_starts_.clear();
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace history
164