in_memory_url_index.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/history/in_memory_url_index.h" 6 7#include <algorithm> 8#include <limits> 9 10#include "app/l10n_util.h" 11#include "base/i18n/word_iterator.h" 12#include "base/string_util.h" 13#include "base/time.h" 14#include "base/utf_string_conversions.h" 15#include "chrome/browser/autocomplete/history_provider_util.h" 16#include "chrome/browser/history/url_database.h" 17#include "net/base/escape.h" 18#include "net/base/net_util.h" 19 20using base::Time; 21using base::TimeDelta; 22 23namespace history { 24 25ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} 26 27ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, 28 size_t input_location, 29 bool match_in_scheme, 30 bool innermost_match, 31 int score) 32 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), 33 raw_score(score) { 34} 35 36InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} 37 38InMemoryURLIndex::~InMemoryURLIndex() {} 39 40static const int32_t kURLToLowerBufferSize = 2048; 41 42// Indexing 43 44bool InMemoryURLIndex::Init(history::URLDatabase* history_db, 45 const std::string& languages) { 46 // TODO(mrossetti): Register for profile/language change notifications. 47 languages_ = languages; 48 // Reset our indexes. 49 char_word_map_.clear(); 50 word_id_history_map_.clear(); 51 if (!history_db) 52 return false; 53 URLDatabase::URLEnumerator history_enum; 54 if (history_db->InitURLEnumeratorForEverything(&history_enum)) { 55 URLRow row; 56 Time recent_threshold = InMemoryURLIndex::RecentThreshold(); 57 while (history_enum.GetNextURL(&row)) { 58 // Do some filtering so that we only get history items which could 59 // possibly pass the HistoryURLProvider::CullPoorMatches filter later. 60 if ((row.typed_count() > kLowQualityMatchTypedLimit) || 61 (row.visit_count() > kLowQualityMatchVisitLimit) || 62 (row.last_visit() >= recent_threshold)) { 63 if (!IndexRow(row)) 64 return false; 65 } 66 } 67 } 68 return true; 69} 70 71bool InMemoryURLIndex::IndexRow(URLRow row) { 72 const GURL& gurl(row.url()); 73 string16 url(net::FormatUrl(gurl, languages_, 74 net::kFormatUrlOmitUsernamePassword, 75 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 76 NULL, NULL, NULL)); 77 78 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). 79 HistoryID history_id = static_cast<HistoryID>(row.id()); 80 81 // Add the row for quick lookup in the history info store. 82 url = l10n_util::ToLower(url); 83 URLRow new_row(GURL(url), row.id()); 84 new_row.set_visit_count(row.visit_count()); 85 new_row.set_typed_count(row.typed_count()); 86 new_row.set_last_visit(row.last_visit()); 87 new_row.set_title(row.title()); 88 history_info_map_.insert(std::make_pair(history_id, new_row)); 89 90 // Split into individual, unique words. 91 String16Set words = WordSetFromString16(url); 92 93 // For each word, add a new entry into the word index referring to the 94 // associated history item. 95 for (String16Set::iterator iter = words.begin(); 96 iter != words.end(); ++iter) { 97 String16Set::value_type uni_word = *iter; 98 AddWordToIndex(uni_word, history_id); 99 } 100 ++history_item_count_; 101 return true; 102} 103 104// Searching 105 106ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( 107 const String16Vector& terms) { 108 ScoredHistoryMatches scored_items; 109 if (!terms.empty()) { 110 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- 111 // and-sweep approach. 112 ResetTermCharWordSetCache(); 113 114 // Lowercase the terms. 115 // TODO(mrossetti): Another opportunity for a transform algorithm. 116 String16Vector lower_terms; 117 for (String16Vector::const_iterator term_iter = terms.begin(); 118 term_iter != terms.end(); ++term_iter) { 119 String16Vector::value_type lower_string(*term_iter); 120 std::transform(lower_string.begin(), 121 lower_string.end(), 122 lower_string.begin(), 123 tolower); 124 lower_terms.push_back(lower_string); 125 } 126 127 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); 128 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 129 130 // Pass over all of the candidates filtering out any without a proper 131 // substring match, inserting those which pass in order by score. 132 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 133 AddHistoryMatch(*this, 134 lower_terms)).ScoredMatches(); 135 } 136 137 // Remove any stale TermCharWordSet's. 138 term_char_word_set_cache_.erase( 139 std::remove_if(term_char_word_set_cache_.begin(), 140 term_char_word_set_cache_.end(), 141 std::mem_fun_ref(&TermCharWordSet::IsNotUsed)), 142 term_char_word_set_cache_.end()); 143 return scored_items; 144} 145 146void InMemoryURLIndex::ResetTermCharWordSetCache() { 147 // TODO(mrossetti): Consider keeping more of the cache around for possible 148 // repeat searches, except a 'shortcuts' approach might be better for that. 149 // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. 150 for (TermCharWordSetVector::iterator iter = 151 term_char_word_set_cache_.begin(); 152 iter != term_char_word_set_cache_.end(); ++iter) 153 iter->used_ = false; 154} 155 156InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( 157 const string16& uni_string) { 158 // Break the terms down into individual terms (words), get the candidate 159 // set for each term, and intersect each to get a final candidate list. 160 // Note that a single 'term' from the user's perspective might be 161 // a string like "http://www.somewebsite.com" which, from our perspective, 162 // is four words: 'http', 'www', 'somewebsite', and 'com'. 163 HistoryIDSet history_id_set; 164 String16Set words = WordSetFromString16(uni_string); 165 bool first_word = true; 166 for (String16Set::iterator iter = words.begin(); 167 iter != words.end(); ++iter) { 168 String16Set::value_type uni_word = *iter; 169 HistoryIDSet term_history_id_set = 170 InMemoryURLIndex::HistoryIDsForTerm(uni_word); 171 if (first_word) { 172 history_id_set.swap(term_history_id_set); 173 first_word = false; 174 } else { 175 HistoryIDSet old_history_id_set(history_id_set); 176 history_id_set.clear(); 177 std::set_intersection(old_history_id_set.begin(), 178 old_history_id_set.end(), 179 term_history_id_set.begin(), 180 term_history_id_set.end(), 181 std::inserter(history_id_set, 182 history_id_set.begin())); 183 } 184 } 185 return history_id_set; 186} 187 188InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 189 const string16& uni_word) { 190 InMemoryURLIndex::HistoryIDSet history_id_set; 191 192 // For each character in the word, get the char/word_id map entry and 193 // intersect with the set in an incremental manner. 194 Char16Set uni_chars = CharactersFromString16(uni_word); 195 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); 196 197 // If any words resulted then we can compose a set of history IDs by unioning 198 // the sets from each word. 199 if (!word_id_set.empty()) { 200 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 201 word_id_iter != word_id_set.end(); ++word_id_iter) { 202 WordID word_id = *word_id_iter; 203 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 204 if (word_iter != word_id_history_map_.end()) { 205 HistoryIDSet& word_history_id_set(word_iter->second); 206 history_id_set.insert(word_history_id_set.begin(), 207 word_history_id_set.end()); 208 } 209 } 210 } 211 212 return history_id_set; 213} 214 215// Utility Functions 216 217// static 218InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( 219 const string16& uni_string) { 220 String16Set words; 221 WordIterator iter(&uni_string, WordIterator::BREAK_WORD); 222 if (iter.Init()) { 223 while (iter.Advance()) { 224 if (iter.IsWord()) 225 words.insert(iter.GetWord()); 226 } 227 } 228 return words; 229} 230 231// static 232InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( 233 const string16& uni_word) { 234 Char16Set characters; 235 for (string16::const_iterator iter = uni_word.begin(); 236 iter != uni_word.end(); ++iter) 237 characters.insert(*iter); 238 return characters; 239} 240 241void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, 242 HistoryID history_id) { 243 WordMap::iterator word_pos = word_map_.find(uni_word); 244 if (word_pos != word_map_.end()) 245 UpdateWordHistory(word_pos->second, history_id); 246 else 247 AddWordHistory(uni_word, history_id); 248} 249 250void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { 251 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); 252 DCHECK(history_pos != word_id_history_map_.end()); 253 HistoryIDSet& history_id_set(history_pos->second); 254 history_id_set.insert(history_id); 255} 256 257// Add a new word to the word list and the word map, and then create a 258// new entry in the word/history map. 259void InMemoryURLIndex::AddWordHistory(const string16& uni_word, 260 HistoryID history_id) { 261 word_list_.push_back(uni_word); 262 WordID word_id = word_list_.size() - 1; 263 word_map_.insert(std::make_pair(uni_word, word_id)); 264 HistoryIDSet history_id_set; 265 history_id_set.insert(history_id); 266 word_id_history_map_.insert(std::make_pair(word_id, history_id_set)); 267 // For each character in the newly added word (i.e. a word that is not 268 // already in the word index), add the word to the character index. 269 Char16Set characters = CharactersFromString16(uni_word); 270 for (Char16Set::iterator uni_char_iter = characters.begin(); 271 uni_char_iter != characters.end(); ++uni_char_iter) { 272 Char16Set::value_type uni_string = *uni_char_iter; 273 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); 274 if (char_iter != char_word_map_.end()) { 275 // Update existing entry in the char/word index. 276 WordIDSet& word_id_set(char_iter->second); 277 word_id_set.insert(word_id); 278 } else { 279 // Create a new entry in the char/word index. 280 WordIDSet word_id_set; 281 word_id_set.insert(word_id); 282 char_word_map_.insert(std::make_pair(uni_string, word_id_set)); 283 } 284 } 285} 286 287InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( 288 InMemoryURLIndex::Char16Set const& uni_chars) { 289 TermCharWordSet* best_term_char_word_set = NULL; 290 bool set_found = false; 291 size_t outstanding_count = 0; 292 Char16Set outstanding_chars; 293 for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); 294 iter != term_char_word_set_cache_.end(); ++iter) { 295 TermCharWordSetVector::value_type& term_char_word_set(*iter); 296 Char16Set& char_set(term_char_word_set.char_set_); 297 Char16Set exclusions; 298 std::set_difference(char_set.begin(), char_set.end(), 299 uni_chars.begin(), uni_chars.end(), 300 std::inserter(exclusions, exclusions.begin())); 301 if (exclusions.empty()) { 302 // Do a reverse difference so that we know which characters remain 303 // to be indexed. Then decide if this is a better match than any 304 // previous cached set. 305 std::set_difference(uni_chars.begin(), uni_chars.end(), 306 char_set.begin(), char_set.end(), 307 std::inserter(exclusions, exclusions.begin())); 308 if (!set_found || exclusions.size() < outstanding_count) { 309 outstanding_chars = exclusions; 310 best_term_char_word_set = &*iter; 311 outstanding_count = exclusions.size(); 312 set_found = true; 313 } 314 } 315 } 316 317 WordIDSet word_id_set; 318 if (set_found && outstanding_count == 0) { 319 // If there were no oustanding characters then we can use the cached one. 320 best_term_char_word_set->used_ = true; 321 word_id_set = best_term_char_word_set->word_id_set_; 322 } else { 323 // Some or all of the characters remain to be indexed. 324 bool first_character = true; 325 if (set_found) { 326 first_character = false; 327 word_id_set = best_term_char_word_set->word_id_set_; 328 } else { 329 outstanding_chars = uni_chars; 330 } 331 332 for (Char16Set::iterator uni_char_iter = outstanding_chars.begin(); 333 uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { 334 Char16Set::value_type uni_char = *uni_char_iter; 335 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 336 if (char_iter == char_word_map_.end()) { 337 // The character was not found so bail. 338 word_id_set.clear(); 339 break; 340 } 341 WordIDSet& char_word_id_set(char_iter->second); 342 if (first_character) { 343 word_id_set = char_word_id_set; 344 first_character = false; 345 } else { 346 WordIDSet old_word_id_set(word_id_set); 347 word_id_set.clear(); 348 std::set_intersection(old_word_id_set.begin(), 349 old_word_id_set.end(), 350 char_word_id_set.begin(), 351 char_word_id_set.end(), 352 std::inserter(word_id_set, 353 word_id_set.begin())); 354 } 355 } 356 // Update the cache. 357 if (!set_found || outstanding_count) { 358 term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars, 359 word_id_set, true)); 360 } 361 } 362 return word_id_set; 363} 364 365// static 366int InMemoryURLIndex::RawScoreForURL(const URLRow& row, 367 const String16Vector& terms, 368 size_t* first_term_location) { 369 GURL gurl = row.url(); 370 if (!gurl.is_valid()) 371 return 0; 372 373 string16 url = UTF8ToUTF16(gurl.spec()); 374 375 // Collect all term start positions so we can see if they appear in order. 376 std::vector<size_t> term_locations; 377 int out_of_order = 0; // Count the terms which are out of order. 378 size_t start_location_total = 0; 379 size_t term_length_total = 0; 380 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 381 ++iter) { 382 string16 term = *iter; 383 size_t term_location = url.find(term); 384 if (term_location == string16::npos) 385 return 0; // A term was not found. This should not happen. 386 if (iter != terms.begin()) { 387 // See if this term is out-of-order. 388 for (std::vector<size_t>::const_iterator order_iter = 389 term_locations.begin(); order_iter != term_locations.end(); 390 ++order_iter) { 391 if (term_location <= *order_iter) 392 ++out_of_order; 393 } 394 } else { 395 *first_term_location = term_location; 396 } 397 term_locations.push_back(term_location); 398 start_location_total += term_location; 399 term_length_total += term.size(); 400 } 401 402 // Calculate a raw score. 403 // TODO(mrossetti): This is good enough for now but must be fine-tuned. 404 const float kOrderMaxValue = 10.0; 405 float order_value = 10.0; 406 if (terms.size() > 1) { 407 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; 408 order_value = 409 (static_cast<float>(max_possible_out_of_order - out_of_order) / 410 max_possible_out_of_order) * kOrderMaxValue; 411 } 412 413 const float kStartMaxValue = 10.0; 414 const size_t kMaxSignificantStart = 20; 415 float start_value = 416 (static_cast<float>(kMaxSignificantStart - 417 std::min(kMaxSignificantStart, start_location_total / terms.size()))) / 418 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; 419 420 const float kCompleteMaxValue = 10.0; 421 float complete_value = 422 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * 423 kStartMaxValue; 424 425 const float kLastVisitMaxValue = 10.0; 426 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); 427 int64 delta_time = (kMaxSignificantDay - 428 std::min((base::Time::Now() - row.last_visit()), 429 kMaxSignificantDay)).ToInternalValue(); 430 float last_visit_value = 431 (static_cast<float>(delta_time) / 432 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * 433 kLastVisitMaxValue; 434 435 const float kVisitCountMaxValue = 10.0; 436 const int kMaxSignificantVisits = 10; 437 float visit_count_value = 438 (static_cast<float>(std::min(row.visit_count(), 439 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * 440 kVisitCountMaxValue; 441 float raw_score = order_value + start_value + complete_value + 442 last_visit_value + visit_count_value; 443 444 // Normalize the score. 445 const float kMaxNormalizedRawScore = 1000.0; 446 raw_score = 447 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + 448 kLastVisitMaxValue + kVisitCountMaxValue)) * 449 kMaxNormalizedRawScore; 450 return static_cast<int>(raw_score); 451} 452 453// static 454Time InMemoryURLIndex::RecentThreshold() { 455 return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); 456} 457 458void InMemoryURLIndex::AddHistoryMatch::operator()( 459 const InMemoryURLIndex::HistoryID history_id) { 460 HistoryInfoMap::const_iterator hist_pos = 461 index_.history_info_map_.find(history_id); 462 if (hist_pos != index_.history_info_map_.end()) { 463 const URLRow& hist_item = hist_pos->second; 464 // TODO(mrossetti): Accommodate multiple term highlighting. 465 size_t input_location = 0; 466 int score = InMemoryURLIndex::RawScoreForURL( 467 hist_item, lower_terms_, &input_location); 468 if (score != 0) { 469 // We only retain the top 10 highest scoring results so 470 // see if this one fits into the top 10 and, if so, where. 471 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); 472 while (scored_iter != scored_matches_.end() && 473 (*scored_iter).raw_score > score) 474 ++scored_iter; 475 if ((scored_matches_.size() < 10) || 476 (scored_iter != scored_matches_.end())) { 477 // Create and insert the new item. 478 // TODO(mrossetti): Properly set |match_in_scheme| and 479 // |innermost_match|. 480 bool match_in_scheme = false; 481 bool innermost_match = true; 482 ScoredHistoryMatch match(hist_item, input_location, 483 match_in_scheme, innermost_match, score); 484 if (!scored_matches_.empty()) 485 scored_matches_.insert(scored_iter, match); 486 else 487 scored_matches_.push_back(match); 488 // Trim any entries beyond 10. 489 if (scored_matches_.size() > 10) { 490 scored_matches_.erase(scored_matches_.begin() + 10, 491 scored_matches_.end()); 492 } 493 } 494 } 495 } 496} 497 498} // namespace history 499