in_memory_url_index.cc revision 513209b27ff55e2841eac0e4120199c23acce758
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/history/in_memory_url_index.h" 6 7#include <algorithm> 8#include <limits> 9 10#include "app/l10n_util.h" 11#include "base/i18n/word_iterator.h" 12#include "base/string_util.h" 13#include "base/time.h" 14#include "base/utf_string_conversions.h" 15#include "chrome/browser/autocomplete/history_provider_util.h" 16#include "chrome/browser/history/url_database.h" 17#include "net/base/escape.h" 18#include "net/base/net_util.h" 19 20using base::Time; 21using base::TimeDelta; 22 23namespace history { 24 25ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} 26 27ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, 28 size_t input_location, 29 bool match_in_scheme, 30 bool innermost_match, 31 int score) 32 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), 33 raw_score(score) { 34} 35 36struct InMemoryURLIndex::TermCharWordSet { 37 TermCharWordSet(const Char16Set& char_set, 38 const WordIDSet& word_id_set, 39 bool used) 40 : char_set_(char_set), 41 word_id_set_(word_id_set), 42 used_(used) {} 43 44 bool IsNotUsed() const { return !used_; } 45 46 Char16Set char_set_; 47 WordIDSet word_id_set_; 48 bool used_; // true if this set has been used for the current term search. 49}; 50 51InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} 52 53InMemoryURLIndex::~InMemoryURLIndex() {} 54 55static const int32_t kURLToLowerBufferSize = 2048; 56 57// Indexing 58 59bool InMemoryURLIndex::Init(history::URLDatabase* history_db, 60 const std::string& languages) { 61 // TODO(mrossetti): Register for profile/language change notifications. 62 languages_ = languages; 63 // Reset our indexes. 64 char_word_map_.clear(); 65 word_id_history_map_.clear(); 66 if (!history_db) 67 return false; 68 URLDatabase::URLEnumerator history_enum; 69 if (history_db->InitURLEnumeratorForEverything(&history_enum)) { 70 URLRow row; 71 Time recent_threshold = InMemoryURLIndex::RecentThreshold(); 72 while (history_enum.GetNextURL(&row)) { 73 // Do some filtering so that we only get history items which could 74 // possibly pass the HistoryURLProvider::CullPoorMatches filter later. 75 if ((row.typed_count() > kLowQualityMatchTypedLimit) || 76 (row.visit_count() > kLowQualityMatchVisitLimit) || 77 (row.last_visit() >= recent_threshold)) { 78 if (!IndexRow(row)) 79 return false; 80 } 81 } 82 } 83 return true; 84} 85 86bool InMemoryURLIndex::IndexRow(const URLRow& row) { 87 const GURL& gurl(row.url()); 88 string16 url(net::FormatUrl(gurl, languages_, 89 net::kFormatUrlOmitUsernamePassword, 90 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 91 NULL, NULL, NULL)); 92 93 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). 94 HistoryID history_id = static_cast<HistoryID>(row.id()); 95 96 // Add the row for quick lookup in the history info store. 97 url = l10n_util::ToLower(url); 98 URLRow new_row(GURL(url), row.id()); 99 new_row.set_visit_count(row.visit_count()); 100 new_row.set_typed_count(row.typed_count()); 101 new_row.set_last_visit(row.last_visit()); 102 new_row.set_title(row.title()); 103 history_info_map_.insert(std::make_pair(history_id, new_row)); 104 105 // Split into individual, unique words. 106 String16Set words = WordSetFromString16(url); 107 108 // For each word, add a new entry into the word index referring to the 109 // associated history item. 110 for (String16Set::iterator iter = words.begin(); 111 iter != words.end(); ++iter) { 112 String16Set::value_type uni_word = *iter; 113 AddWordToIndex(uni_word, history_id); 114 } 115 ++history_item_count_; 116 return true; 117} 118 119// Searching 120 121ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( 122 const String16Vector& terms) { 123 ScoredHistoryMatches scored_items; 124 if (!terms.empty()) { 125 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- 126 // and-sweep approach. 127 ResetTermCharWordSetCache(); 128 129 // Lowercase the terms. 130 // TODO(mrossetti): Another opportunity for a transform algorithm. 131 String16Vector lower_terms; 132 for (String16Vector::const_iterator term_iter = terms.begin(); 133 term_iter != terms.end(); ++term_iter) { 134 String16Vector::value_type lower_string(*term_iter); 135 std::transform(lower_string.begin(), 136 lower_string.end(), 137 lower_string.begin(), 138 tolower); 139 lower_terms.push_back(lower_string); 140 } 141 142 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); 143 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 144 145 // Pass over all of the candidates filtering out any without a proper 146 // substring match, inserting those which pass in order by score. 147 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 148 AddHistoryMatch(*this, 149 lower_terms)).ScoredMatches(); 150 } 151 152 // Remove any stale TermCharWordSet's. 153 term_char_word_set_cache_.erase( 154 std::remove_if(term_char_word_set_cache_.begin(), 155 term_char_word_set_cache_.end(), 156 std::mem_fun_ref(&TermCharWordSet::IsNotUsed)), 157 term_char_word_set_cache_.end()); 158 return scored_items; 159} 160 161void InMemoryURLIndex::ResetTermCharWordSetCache() { 162 // TODO(mrossetti): Consider keeping more of the cache around for possible 163 // repeat searches, except a 'shortcuts' approach might be better for that. 164 // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. 165 for (TermCharWordSetVector::iterator iter = 166 term_char_word_set_cache_.begin(); 167 iter != term_char_word_set_cache_.end(); ++iter) 168 iter->used_ = false; 169} 170 171InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( 172 const string16& uni_string) { 173 // Break the terms down into individual terms (words), get the candidate 174 // set for each term, and intersect each to get a final candidate list. 175 // Note that a single 'term' from the user's perspective might be 176 // a string like "http://www.somewebsite.com" which, from our perspective, 177 // is four words: 'http', 'www', 'somewebsite', and 'com'. 178 HistoryIDSet history_id_set; 179 String16Set words = WordSetFromString16(uni_string); 180 bool first_word = true; 181 for (String16Set::iterator iter = words.begin(); 182 iter != words.end(); ++iter) { 183 String16Set::value_type uni_word = *iter; 184 HistoryIDSet term_history_id_set = 185 InMemoryURLIndex::HistoryIDsForTerm(uni_word); 186 if (first_word) { 187 history_id_set.swap(term_history_id_set); 188 first_word = false; 189 } else { 190 HistoryIDSet old_history_id_set(history_id_set); 191 history_id_set.clear(); 192 std::set_intersection(old_history_id_set.begin(), 193 old_history_id_set.end(), 194 term_history_id_set.begin(), 195 term_history_id_set.end(), 196 std::inserter(history_id_set, 197 history_id_set.begin())); 198 } 199 } 200 return history_id_set; 201} 202 203InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 204 const string16& uni_word) { 205 InMemoryURLIndex::HistoryIDSet history_id_set; 206 207 // For each character in the word, get the char/word_id map entry and 208 // intersect with the set in an incremental manner. 209 Char16Set uni_chars = CharactersFromString16(uni_word); 210 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); 211 212 // If any words resulted then we can compose a set of history IDs by unioning 213 // the sets from each word. 214 if (!word_id_set.empty()) { 215 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 216 word_id_iter != word_id_set.end(); ++word_id_iter) { 217 WordID word_id = *word_id_iter; 218 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 219 if (word_iter != word_id_history_map_.end()) { 220 HistoryIDSet& word_history_id_set(word_iter->second); 221 history_id_set.insert(word_history_id_set.begin(), 222 word_history_id_set.end()); 223 } 224 } 225 } 226 227 return history_id_set; 228} 229 230// Utility Functions 231 232// static 233InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( 234 const string16& uni_string) { 235 String16Set words; 236 WordIterator iter(&uni_string, WordIterator::BREAK_WORD); 237 if (iter.Init()) { 238 while (iter.Advance()) { 239 if (iter.IsWord()) 240 words.insert(iter.GetWord()); 241 } 242 } 243 return words; 244} 245 246// static 247InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( 248 const string16& uni_word) { 249 Char16Set characters; 250 for (string16::const_iterator iter = uni_word.begin(); 251 iter != uni_word.end(); ++iter) 252 characters.insert(*iter); 253 return characters; 254} 255 256void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, 257 HistoryID history_id) { 258 WordMap::iterator word_pos = word_map_.find(uni_word); 259 if (word_pos != word_map_.end()) 260 UpdateWordHistory(word_pos->second, history_id); 261 else 262 AddWordHistory(uni_word, history_id); 263} 264 265void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { 266 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); 267 DCHECK(history_pos != word_id_history_map_.end()); 268 HistoryIDSet& history_id_set(history_pos->second); 269 history_id_set.insert(history_id); 270} 271 272// Add a new word to the word list and the word map, and then create a 273// new entry in the word/history map. 274void InMemoryURLIndex::AddWordHistory(const string16& uni_word, 275 HistoryID history_id) { 276 word_list_.push_back(uni_word); 277 WordID word_id = word_list_.size() - 1; 278 word_map_.insert(std::make_pair(uni_word, word_id)); 279 HistoryIDSet history_id_set; 280 history_id_set.insert(history_id); 281 word_id_history_map_.insert(std::make_pair(word_id, history_id_set)); 282 // For each character in the newly added word (i.e. a word that is not 283 // already in the word index), add the word to the character index. 284 Char16Set characters = CharactersFromString16(uni_word); 285 for (Char16Set::iterator uni_char_iter = characters.begin(); 286 uni_char_iter != characters.end(); ++uni_char_iter) { 287 Char16Set::value_type uni_string = *uni_char_iter; 288 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); 289 if (char_iter != char_word_map_.end()) { 290 // Update existing entry in the char/word index. 291 WordIDSet& word_id_set(char_iter->second); 292 word_id_set.insert(word_id); 293 } else { 294 // Create a new entry in the char/word index. 295 WordIDSet word_id_set; 296 word_id_set.insert(word_id); 297 char_word_map_.insert(std::make_pair(uni_string, word_id_set)); 298 } 299 } 300} 301 302InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( 303 InMemoryURLIndex::Char16Set const& uni_chars) { 304 TermCharWordSet* best_term_char_word_set = NULL; 305 bool set_found = false; 306 size_t outstanding_count = 0; 307 Char16Set outstanding_chars; 308 for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); 309 iter != term_char_word_set_cache_.end(); ++iter) { 310 TermCharWordSetVector::value_type& term_char_word_set(*iter); 311 Char16Set& char_set(term_char_word_set.char_set_); 312 Char16Set exclusions; 313 std::set_difference(char_set.begin(), char_set.end(), 314 uni_chars.begin(), uni_chars.end(), 315 std::inserter(exclusions, exclusions.begin())); 316 if (exclusions.empty()) { 317 // Do a reverse difference so that we know which characters remain 318 // to be indexed. Then decide if this is a better match than any 319 // previous cached set. 320 std::set_difference(uni_chars.begin(), uni_chars.end(), 321 char_set.begin(), char_set.end(), 322 std::inserter(exclusions, exclusions.begin())); 323 if (!set_found || exclusions.size() < outstanding_count) { 324 outstanding_chars = exclusions; 325 best_term_char_word_set = &*iter; 326 outstanding_count = exclusions.size(); 327 set_found = true; 328 } 329 } 330 } 331 332 WordIDSet word_id_set; 333 if (set_found && outstanding_count == 0) { 334 // If there were no oustanding characters then we can use the cached one. 335 best_term_char_word_set->used_ = true; 336 word_id_set = best_term_char_word_set->word_id_set_; 337 } else { 338 // Some or all of the characters remain to be indexed. 339 bool first_character = true; 340 if (set_found) { 341 first_character = false; 342 word_id_set = best_term_char_word_set->word_id_set_; 343 } else { 344 outstanding_chars = uni_chars; 345 } 346 347 for (Char16Set::iterator uni_char_iter = outstanding_chars.begin(); 348 uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { 349 Char16Set::value_type uni_char = *uni_char_iter; 350 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 351 if (char_iter == char_word_map_.end()) { 352 // The character was not found so bail. 353 word_id_set.clear(); 354 break; 355 } 356 WordIDSet& char_word_id_set(char_iter->second); 357 if (first_character) { 358 word_id_set = char_word_id_set; 359 first_character = false; 360 } else { 361 WordIDSet old_word_id_set(word_id_set); 362 word_id_set.clear(); 363 std::set_intersection(old_word_id_set.begin(), 364 old_word_id_set.end(), 365 char_word_id_set.begin(), 366 char_word_id_set.end(), 367 std::inserter(word_id_set, 368 word_id_set.begin())); 369 } 370 } 371 // Update the cache. 372 if (!set_found || outstanding_count) { 373 term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars, 374 word_id_set, true)); 375 } 376 } 377 return word_id_set; 378} 379 380// static 381int InMemoryURLIndex::RawScoreForURL(const URLRow& row, 382 const String16Vector& terms, 383 size_t* first_term_location) { 384 GURL gurl = row.url(); 385 if (!gurl.is_valid()) 386 return 0; 387 388 string16 url = UTF8ToUTF16(gurl.spec()); 389 390 // Collect all term start positions so we can see if they appear in order. 391 std::vector<size_t> term_locations; 392 int out_of_order = 0; // Count the terms which are out of order. 393 size_t start_location_total = 0; 394 size_t term_length_total = 0; 395 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 396 ++iter) { 397 string16 term = *iter; 398 size_t term_location = url.find(term); 399 if (term_location == string16::npos) 400 return 0; // A term was not found. This should not happen. 401 if (iter != terms.begin()) { 402 // See if this term is out-of-order. 403 for (std::vector<size_t>::const_iterator order_iter = 404 term_locations.begin(); order_iter != term_locations.end(); 405 ++order_iter) { 406 if (term_location <= *order_iter) 407 ++out_of_order; 408 } 409 } else { 410 *first_term_location = term_location; 411 } 412 term_locations.push_back(term_location); 413 start_location_total += term_location; 414 term_length_total += term.size(); 415 } 416 417 // Calculate a raw score. 418 // TODO(mrossetti): This is good enough for now but must be fine-tuned. 419 const float kOrderMaxValue = 10.0; 420 float order_value = 10.0; 421 if (terms.size() > 1) { 422 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; 423 order_value = 424 (static_cast<float>(max_possible_out_of_order - out_of_order) / 425 max_possible_out_of_order) * kOrderMaxValue; 426 } 427 428 const float kStartMaxValue = 10.0; 429 const size_t kMaxSignificantStart = 20; 430 float start_value = 431 (static_cast<float>(kMaxSignificantStart - 432 std::min(kMaxSignificantStart, start_location_total / terms.size()))) / 433 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; 434 435 const float kCompleteMaxValue = 10.0; 436 float complete_value = 437 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * 438 kStartMaxValue; 439 440 const float kLastVisitMaxValue = 10.0; 441 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); 442 int64 delta_time = (kMaxSignificantDay - 443 std::min((base::Time::Now() - row.last_visit()), 444 kMaxSignificantDay)).ToInternalValue(); 445 float last_visit_value = 446 (static_cast<float>(delta_time) / 447 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * 448 kLastVisitMaxValue; 449 450 const float kVisitCountMaxValue = 10.0; 451 const int kMaxSignificantVisits = 10; 452 float visit_count_value = 453 (static_cast<float>(std::min(row.visit_count(), 454 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * 455 kVisitCountMaxValue; 456 457 const float kTypedCountMaxValue = 20.0; 458 const int kMaxSignificantTyped = 10; 459 float typed_count_value = 460 (static_cast<float>(std::min(row.typed_count(), 461 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * 462 kTypedCountMaxValue; 463 464 float raw_score = order_value + start_value + complete_value + 465 last_visit_value + visit_count_value + typed_count_value; 466 467 // Normalize the score. 468 const float kMaxNormalizedRawScore = 1000.0; 469 raw_score = 470 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + 471 kLastVisitMaxValue + kVisitCountMaxValue + 472 kTypedCountMaxValue)) * 473 kMaxNormalizedRawScore; 474 return static_cast<int>(raw_score); 475} 476 477// static 478Time InMemoryURLIndex::RecentThreshold() { 479 return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); 480} 481 482InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 483 const InMemoryURLIndex& index, 484 const String16Vector& lower_terms) 485 : index_(index), 486 lower_terms_(lower_terms) { 487} 488 489InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 490 491void InMemoryURLIndex::AddHistoryMatch::operator()( 492 const InMemoryURLIndex::HistoryID history_id) { 493 HistoryInfoMap::const_iterator hist_pos = 494 index_.history_info_map_.find(history_id); 495 if (hist_pos != index_.history_info_map_.end()) { 496 const URLRow& hist_item = hist_pos->second; 497 // TODO(mrossetti): Accommodate multiple term highlighting. 498 size_t input_location = 0; 499 int score = InMemoryURLIndex::RawScoreForURL( 500 hist_item, lower_terms_, &input_location); 501 if (score != 0) { 502 // We only retain the top 10 highest scoring results so 503 // see if this one fits into the top 10 and, if so, where. 504 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); 505 while (scored_iter != scored_matches_.end() && 506 (*scored_iter).raw_score > score) 507 ++scored_iter; 508 if ((scored_matches_.size() < 10) || 509 (scored_iter != scored_matches_.end())) { 510 // Create and insert the new item. 511 // TODO(mrossetti): Properly set |match_in_scheme| and 512 // |innermost_match|. 513 bool match_in_scheme = false; 514 bool innermost_match = true; 515 ScoredHistoryMatch match(hist_item, input_location, 516 match_in_scheme, innermost_match, score); 517 if (!scored_matches_.empty()) 518 scored_matches_.insert(scored_iter, match); 519 else 520 scored_matches_.push_back(match); 521 // Trim any entries beyond 10. 522 if (scored_matches_.size() > 10) { 523 scored_matches_.erase(scored_matches_.begin() + 10, 524 scored_matches_.end()); 525 } 526 } 527 } 528 } 529} 530 531} // namespace history 532