15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/in_memory_url_index_types.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <functional> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <iterator> 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <numeric> 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/break_iterator.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/i18n/case_conversion.h" 15868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h" 162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "net/base/escape.h" 172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "net/base/net_util.h" 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace history { 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Matches within URL and Title Strings ---------------------------------------- 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 23a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)TermMatches MatchTermInString(const base::string16& term, 24a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const base::string16& cleaned_string, 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int term_num) { 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t kMaxCompareLength = 2048; 27a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const base::string16& short_string = 28a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) (cleaned_string.length() > kMaxCompareLength) ? 292a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) cleaned_string.substr(0, kMaxCompareLength) : cleaned_string; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatches matches; 31a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) for (size_t location = short_string.find(term); 32a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) location != base::string16::npos; 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) location = short_string.find(term, location + 1)) 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matches.push_back(TermMatch(term_num, location, term.length())); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return matches; 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Comparison function for sorting TermMatches by their offsets. 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool MatchOffsetLess(const TermMatch& m1, const TermMatch& m2) { 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return m1.offset < m2.offset; 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TermMatches SortAndDeoverlapMatches(const TermMatches& matches) { 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (matches.empty()) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return matches; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatches sorted_matches = matches; 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::sort(sorted_matches.begin(), sorted_matches.end(), MatchOffsetLess); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatches clean_matches; 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatch last_match; 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (TermMatches::const_iterator iter = sorted_matches.begin(); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) iter != sorted_matches.end(); ++iter) { 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (iter->offset >= last_match.offset + last_match.length) { 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) last_match = *iter; 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) clean_matches.push_back(last_match); 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return clean_matches; 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches) { 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<size_t> offsets; 625c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu for (TermMatches::const_iterator i = matches.begin(); i != matches.end(); 635c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu ++i) { 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) offsets.push_back(i->offset); 655c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu offsets.push_back(i->offset + i->length); 665c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu } 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return offsets; 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches, 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const std::vector<size_t>& offsets) { 725c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu DCHECK_EQ(2 * matches.size(), offsets.size()); 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatches new_matches; 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<size_t>::const_iterator offset_iter = offsets.begin(); 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (TermMatches::const_iterator term_iter = matches.begin(); 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) term_iter != matches.end(); ++term_iter, ++offset_iter) { 775c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu const size_t starting_offset = *offset_iter; 785c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu ++offset_iter; 795c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu const size_t ending_offset = *offset_iter; 805c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu if ((starting_offset != base::string16::npos) && 815c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu (ending_offset != base::string16::npos) && 825c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu (starting_offset != ending_offset)) { 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatch new_match(*term_iter); 845c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu new_match.offset = starting_offset; 855c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu new_match.length = ending_offset - starting_offset; 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) new_matches.push_back(new_match); 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new_matches; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Utility Functions ----------------------------------------------------------- 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 94a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)String16Set String16SetFromString16(const base::string16& cleaned_uni_string, 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WordStarts* word_starts) { 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) String16Vector words = 972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) String16VectorFromString16(cleaned_uni_string, false, word_starts); 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) String16Set word_set; 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (String16Vector::const_iterator iter = words.begin(); iter != words.end(); 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++iter) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_set.insert(base::i18n::ToLower(*iter).substr(0, kMaxSignificantChars)); 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return word_set; 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 105a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)String16Vector String16VectorFromString16( 106a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) const base::string16& cleaned_uni_string, 107a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) bool break_on_space, 108a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) WordStarts* word_starts) { 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_starts) 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_starts->clear(); 1112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) base::i18n::BreakIterator iter(cleaned_uni_string, 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break_on_space ? base::i18n::BreakIterator::BREAK_SPACE : 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::i18n::BreakIterator::BREAK_WORD); 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) String16Vector words; 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!iter.Init()) 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return words; 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (iter.Advance()) { 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (break_on_space || iter.IsWord()) { 119a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::string16 word(iter.GetString()); 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t initial_whitespace = 0; 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (break_on_space) { 122a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles) base::string16 trimmed_word; 123a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) base::TrimWhitespace(word, base::TRIM_LEADING, &trimmed_word); 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) initial_whitespace = word.length() - trimmed_word.length(); 125a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles) base::TrimWhitespace(trimmed_word, base::TRIM_TRAILING, &word); 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word.empty()) 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) words.push_back(word); 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!word_starts) 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t word_start = iter.prev() + initial_whitespace; 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_start < kMaxSignificantChars) 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_starts->push_back(word_start); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return words; 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 140a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)Char16Set Char16SetFromString16(const base::string16& term) { 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Char16Set characters; 1425d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) for (base::string16::const_iterator iter = term.begin(); iter != term.end(); 1435d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) ++iter) 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) characters.insert(*iter); 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return characters; 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1482a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// HistoryInfoMapValue --------------------------------------------------------- 1492a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)HistoryInfoMapValue::HistoryInfoMapValue() {} 1512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)HistoryInfoMapValue::~HistoryInfoMapValue() {} 1522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// RowWordStarts --------------------------------------------------------------- 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)RowWordStarts::RowWordStarts() {} 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)RowWordStarts::~RowWordStarts() {} 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void RowWordStarts::Clear() { 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url_word_starts_.clear(); 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) title_word_starts_.clear(); 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace history 164