in_memory_url_index.cc revision 21d179b334e59e9a3bfcaed4c4430bef1bc5759d
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/history/in_memory_url_index.h" 6 7#include <algorithm> 8#include <limits> 9 10#include "app/l10n_util.h" 11#include "base/i18n/break_iterator.h" 12#include "base/string_util.h" 13#include "base/time.h" 14#include "base/utf_string_conversions.h" 15#include "chrome/browser/autocomplete/history_provider_util.h" 16#include "chrome/browser/history/url_database.h" 17#include "net/base/escape.h" 18#include "net/base/net_util.h" 19#include "unicode/utypes.h" // for int32_t 20 21using base::Time; 22using base::TimeDelta; 23 24namespace history { 25 26ScoredHistoryMatch::ScoredHistoryMatch() : raw_score(0) {} 27 28ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& url_info, 29 size_t input_location, 30 bool match_in_scheme, 31 bool innermost_match, 32 int score) 33 : HistoryMatch(url_info, input_location, match_in_scheme, innermost_match), 34 raw_score(score) { 35} 36 37struct InMemoryURLIndex::TermCharWordSet { 38 TermCharWordSet(const Char16Set& char_set, 39 const WordIDSet& word_id_set, 40 bool used) 41 : char_set_(char_set), 42 word_id_set_(word_id_set), 43 used_(used) {} 44 45 bool IsNotUsed() const { return !used_; } 46 47 Char16Set char_set_; 48 WordIDSet word_id_set_; 49 bool used_; // true if this set has been used for the current term search. 50}; 51 52InMemoryURLIndex::InMemoryURLIndex() : history_item_count_(0) {} 53 54InMemoryURLIndex::~InMemoryURLIndex() {} 55 56static const int32_t kURLToLowerBufferSize = 2048; 57 58// Indexing 59 60bool InMemoryURLIndex::Init(history::URLDatabase* history_db, 61 const std::string& languages) { 62 // TODO(mrossetti): Register for profile/language change notifications. 63 languages_ = languages; 64 // Reset our indexes. 65 char_word_map_.clear(); 66 word_id_history_map_.clear(); 67 if (!history_db) 68 return false; 69 URLDatabase::URLEnumerator history_enum; 70 if (history_db->InitURLEnumeratorForEverything(&history_enum)) { 71 URLRow row; 72 Time recent_threshold = InMemoryURLIndex::RecentThreshold(); 73 while (history_enum.GetNextURL(&row)) { 74 // Do some filtering so that we only get history items which could 75 // possibly pass the HistoryURLProvider::CullPoorMatches filter later. 76 if ((row.typed_count() > kLowQualityMatchTypedLimit) || 77 (row.visit_count() > kLowQualityMatchVisitLimit) || 78 (row.last_visit() >= recent_threshold)) { 79 if (!IndexRow(row)) 80 return false; 81 } 82 } 83 } 84 return true; 85} 86 87bool InMemoryURLIndex::IndexRow(const URLRow& row) { 88 const GURL& gurl(row.url()); 89 string16 url(net::FormatUrl(gurl, languages_, 90 net::kFormatUrlOmitUsernamePassword, 91 UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, 92 NULL, NULL, NULL)); 93 94 // TODO(mrossetti): Detect row_id > std::numeric_limits<HistoryID>::max(). 95 HistoryID history_id = static_cast<HistoryID>(row.id()); 96 97 // Add the row for quick lookup in the history info store. 98 url = l10n_util::ToLower(url); 99 URLRow new_row(GURL(url), row.id()); 100 new_row.set_visit_count(row.visit_count()); 101 new_row.set_typed_count(row.typed_count()); 102 new_row.set_last_visit(row.last_visit()); 103 new_row.set_title(row.title()); 104 history_info_map_.insert(std::make_pair(history_id, new_row)); 105 106 // Split into individual, unique words. 107 String16Set words = WordSetFromString16(url); 108 109 // For each word, add a new entry into the word index referring to the 110 // associated history item. 111 for (String16Set::iterator iter = words.begin(); 112 iter != words.end(); ++iter) { 113 String16Set::value_type uni_word = *iter; 114 AddWordToIndex(uni_word, history_id); 115 } 116 ++history_item_count_; 117 return true; 118} 119 120// Searching 121 122ScoredHistoryMatches InMemoryURLIndex::HistoryItemsForTerms( 123 const String16Vector& terms) { 124 ScoredHistoryMatches scored_items; 125 if (!terms.empty()) { 126 // Reset used_ flags for term_char_word_set_cache_. We use a basic mark- 127 // and-sweep approach. 128 ResetTermCharWordSetCache(); 129 130 // Lowercase the terms. 131 // TODO(mrossetti): Another opportunity for a transform algorithm. 132 String16Vector lower_terms; 133 for (String16Vector::const_iterator term_iter = terms.begin(); 134 term_iter != terms.end(); ++term_iter) { 135 String16Vector::value_type lower_string(*term_iter); 136 std::transform(lower_string.begin(), 137 lower_string.end(), 138 lower_string.begin(), 139 tolower); 140 lower_terms.push_back(lower_string); 141 } 142 143 String16Vector::value_type all_terms(JoinString(lower_terms, ' ')); 144 HistoryIDSet history_id_set = HistoryIDSetFromWords(all_terms); 145 146 // Pass over all of the candidates filtering out any without a proper 147 // substring match, inserting those which pass in order by score. 148 scored_items = std::for_each(history_id_set.begin(), history_id_set.end(), 149 AddHistoryMatch(*this, 150 lower_terms)).ScoredMatches(); 151 } 152 153 // Remove any stale TermCharWordSet's. 154 term_char_word_set_cache_.erase( 155 std::remove_if(term_char_word_set_cache_.begin(), 156 term_char_word_set_cache_.end(), 157 std::mem_fun_ref(&TermCharWordSet::IsNotUsed)), 158 term_char_word_set_cache_.end()); 159 return scored_items; 160} 161 162void InMemoryURLIndex::ResetTermCharWordSetCache() { 163 // TODO(mrossetti): Consider keeping more of the cache around for possible 164 // repeat searches, except a 'shortcuts' approach might be better for that. 165 // TODO(mrossetti): Change TermCharWordSet to a class and use for_each. 166 for (TermCharWordSetVector::iterator iter = 167 term_char_word_set_cache_.begin(); 168 iter != term_char_word_set_cache_.end(); ++iter) 169 iter->used_ = false; 170} 171 172InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDSetFromWords( 173 const string16& uni_string) { 174 // Break the terms down into individual terms (words), get the candidate 175 // set for each term, and intersect each to get a final candidate list. 176 // Note that a single 'term' from the user's perspective might be 177 // a string like "http://www.somewebsite.com" which, from our perspective, 178 // is four words: 'http', 'www', 'somewebsite', and 'com'. 179 HistoryIDSet history_id_set; 180 String16Set words = WordSetFromString16(uni_string); 181 bool first_word = true; 182 for (String16Set::iterator iter = words.begin(); 183 iter != words.end(); ++iter) { 184 String16Set::value_type uni_word = *iter; 185 HistoryIDSet term_history_id_set = 186 InMemoryURLIndex::HistoryIDsForTerm(uni_word); 187 if (first_word) { 188 history_id_set.swap(term_history_id_set); 189 first_word = false; 190 } else { 191 HistoryIDSet old_history_id_set(history_id_set); 192 history_id_set.clear(); 193 std::set_intersection(old_history_id_set.begin(), 194 old_history_id_set.end(), 195 term_history_id_set.begin(), 196 term_history_id_set.end(), 197 std::inserter(history_id_set, 198 history_id_set.begin())); 199 } 200 } 201 return history_id_set; 202} 203 204InMemoryURLIndex::HistoryIDSet InMemoryURLIndex::HistoryIDsForTerm( 205 const string16& uni_word) { 206 InMemoryURLIndex::HistoryIDSet history_id_set; 207 208 // For each character in the word, get the char/word_id map entry and 209 // intersect with the set in an incremental manner. 210 Char16Set uni_chars = CharactersFromString16(uni_word); 211 WordIDSet word_id_set(WordIDSetForTermChars(uni_chars)); 212 213 // If any words resulted then we can compose a set of history IDs by unioning 214 // the sets from each word. 215 if (!word_id_set.empty()) { 216 for (WordIDSet::iterator word_id_iter = word_id_set.begin(); 217 word_id_iter != word_id_set.end(); ++word_id_iter) { 218 WordID word_id = *word_id_iter; 219 WordIDHistoryMap::iterator word_iter = word_id_history_map_.find(word_id); 220 if (word_iter != word_id_history_map_.end()) { 221 HistoryIDSet& word_history_id_set(word_iter->second); 222 history_id_set.insert(word_history_id_set.begin(), 223 word_history_id_set.end()); 224 } 225 } 226 } 227 228 return history_id_set; 229} 230 231// Utility Functions 232 233// static 234InMemoryURLIndex::String16Set InMemoryURLIndex::WordSetFromString16( 235 const string16& uni_string) { 236 String16Set words; 237 base::BreakIterator iter(&uni_string, base::BreakIterator::BREAK_WORD); 238 if (iter.Init()) { 239 while (iter.Advance()) { 240 if (iter.IsWord()) 241 words.insert(iter.GetString()); 242 } 243 } 244 return words; 245} 246 247// static 248InMemoryURLIndex::Char16Set InMemoryURLIndex::CharactersFromString16( 249 const string16& uni_word) { 250 Char16Set characters; 251 for (string16::const_iterator iter = uni_word.begin(); 252 iter != uni_word.end(); ++iter) 253 characters.insert(*iter); 254 return characters; 255} 256 257void InMemoryURLIndex::AddWordToIndex(const string16& uni_word, 258 HistoryID history_id) { 259 WordMap::iterator word_pos = word_map_.find(uni_word); 260 if (word_pos != word_map_.end()) 261 UpdateWordHistory(word_pos->second, history_id); 262 else 263 AddWordHistory(uni_word, history_id); 264} 265 266void InMemoryURLIndex::UpdateWordHistory(WordID word_id, HistoryID history_id) { 267 WordIDHistoryMap::iterator history_pos = word_id_history_map_.find(word_id); 268 DCHECK(history_pos != word_id_history_map_.end()); 269 HistoryIDSet& history_id_set(history_pos->second); 270 history_id_set.insert(history_id); 271} 272 273// Add a new word to the word list and the word map, and then create a 274// new entry in the word/history map. 275void InMemoryURLIndex::AddWordHistory(const string16& uni_word, 276 HistoryID history_id) { 277 word_list_.push_back(uni_word); 278 WordID word_id = word_list_.size() - 1; 279 word_map_.insert(std::make_pair(uni_word, word_id)); 280 HistoryIDSet history_id_set; 281 history_id_set.insert(history_id); 282 word_id_history_map_.insert(std::make_pair(word_id, history_id_set)); 283 // For each character in the newly added word (i.e. a word that is not 284 // already in the word index), add the word to the character index. 285 Char16Set characters = CharactersFromString16(uni_word); 286 for (Char16Set::iterator uni_char_iter = characters.begin(); 287 uni_char_iter != characters.end(); ++uni_char_iter) { 288 Char16Set::value_type uni_string = *uni_char_iter; 289 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_string); 290 if (char_iter != char_word_map_.end()) { 291 // Update existing entry in the char/word index. 292 WordIDSet& word_id_set(char_iter->second); 293 word_id_set.insert(word_id); 294 } else { 295 // Create a new entry in the char/word index. 296 WordIDSet word_id_set; 297 word_id_set.insert(word_id); 298 char_word_map_.insert(std::make_pair(uni_string, word_id_set)); 299 } 300 } 301} 302 303InMemoryURLIndex::WordIDSet InMemoryURLIndex::WordIDSetForTermChars( 304 InMemoryURLIndex::Char16Set const& uni_chars) { 305 TermCharWordSet* best_term_char_word_set = NULL; 306 bool set_found = false; 307 size_t outstanding_count = 0; 308 Char16Set outstanding_chars; 309 for (TermCharWordSetVector::iterator iter = term_char_word_set_cache_.begin(); 310 iter != term_char_word_set_cache_.end(); ++iter) { 311 TermCharWordSetVector::value_type& term_char_word_set(*iter); 312 Char16Set& char_set(term_char_word_set.char_set_); 313 Char16Set exclusions; 314 std::set_difference(char_set.begin(), char_set.end(), 315 uni_chars.begin(), uni_chars.end(), 316 std::inserter(exclusions, exclusions.begin())); 317 if (exclusions.empty()) { 318 // Do a reverse difference so that we know which characters remain 319 // to be indexed. Then decide if this is a better match than any 320 // previous cached set. 321 std::set_difference(uni_chars.begin(), uni_chars.end(), 322 char_set.begin(), char_set.end(), 323 std::inserter(exclusions, exclusions.begin())); 324 if (!set_found || exclusions.size() < outstanding_count) { 325 outstanding_chars = exclusions; 326 best_term_char_word_set = &*iter; 327 outstanding_count = exclusions.size(); 328 set_found = true; 329 } 330 } 331 } 332 333 WordIDSet word_id_set; 334 if (set_found && outstanding_count == 0) { 335 // If there were no oustanding characters then we can use the cached one. 336 best_term_char_word_set->used_ = true; 337 word_id_set = best_term_char_word_set->word_id_set_; 338 } else { 339 // Some or all of the characters remain to be indexed. 340 bool first_character = true; 341 if (set_found) { 342 first_character = false; 343 word_id_set = best_term_char_word_set->word_id_set_; 344 } else { 345 outstanding_chars = uni_chars; 346 } 347 348 for (Char16Set::iterator uni_char_iter = outstanding_chars.begin(); 349 uni_char_iter != outstanding_chars.end(); ++uni_char_iter) { 350 Char16Set::value_type uni_char = *uni_char_iter; 351 CharWordIDMap::iterator char_iter = char_word_map_.find(uni_char); 352 if (char_iter == char_word_map_.end()) { 353 // The character was not found so bail. 354 word_id_set.clear(); 355 break; 356 } 357 WordIDSet& char_word_id_set(char_iter->second); 358 if (first_character) { 359 word_id_set = char_word_id_set; 360 first_character = false; 361 } else { 362 WordIDSet old_word_id_set(word_id_set); 363 word_id_set.clear(); 364 std::set_intersection(old_word_id_set.begin(), 365 old_word_id_set.end(), 366 char_word_id_set.begin(), 367 char_word_id_set.end(), 368 std::inserter(word_id_set, 369 word_id_set.begin())); 370 } 371 } 372 // Update the cache. 373 if (!set_found || outstanding_count) { 374 term_char_word_set_cache_.push_back(TermCharWordSet(uni_chars, 375 word_id_set, true)); 376 } 377 } 378 return word_id_set; 379} 380 381// static 382int InMemoryURLIndex::RawScoreForURL(const URLRow& row, 383 const String16Vector& terms, 384 size_t* first_term_location) { 385 GURL gurl = row.url(); 386 if (!gurl.is_valid()) 387 return 0; 388 389 string16 url = UTF8ToUTF16(gurl.spec()); 390 391 // Collect all term start positions so we can see if they appear in order. 392 std::vector<size_t> term_locations; 393 int out_of_order = 0; // Count the terms which are out of order. 394 size_t start_location_total = 0; 395 size_t term_length_total = 0; 396 for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 397 ++iter) { 398 string16 term = *iter; 399 size_t term_location = url.find(term); 400 if (term_location == string16::npos) 401 return 0; // A term was not found. This should not happen. 402 if (iter != terms.begin()) { 403 // See if this term is out-of-order. 404 for (std::vector<size_t>::const_iterator order_iter = 405 term_locations.begin(); order_iter != term_locations.end(); 406 ++order_iter) { 407 if (term_location <= *order_iter) 408 ++out_of_order; 409 } 410 } else { 411 *first_term_location = term_location; 412 } 413 term_locations.push_back(term_location); 414 start_location_total += term_location; 415 term_length_total += term.size(); 416 } 417 418 // Calculate a raw score. 419 // TODO(mrossetti): This is good enough for now but must be fine-tuned. 420 const float kOrderMaxValue = 10.0; 421 float order_value = 10.0; 422 if (terms.size() > 1) { 423 int max_possible_out_of_order = (terms.size() * (terms.size() - 1)) / 2; 424 order_value = 425 (static_cast<float>(max_possible_out_of_order - out_of_order) / 426 max_possible_out_of_order) * kOrderMaxValue; 427 } 428 429 const float kStartMaxValue = 10.0; 430 const size_t kMaxSignificantStart = 20; 431 float start_value = 432 (static_cast<float>(kMaxSignificantStart - 433 std::min(kMaxSignificantStart, start_location_total / terms.size()))) / 434 static_cast<float>(kMaxSignificantStart) * kStartMaxValue; 435 436 const float kCompleteMaxValue = 10.0; 437 float complete_value = 438 (static_cast<float>(term_length_total) / static_cast<float>(url.size())) * 439 kStartMaxValue; 440 441 const float kLastVisitMaxValue = 10.0; 442 const base::TimeDelta kMaxSignificantDay = base::TimeDelta::FromDays(30); 443 int64 delta_time = (kMaxSignificantDay - 444 std::min((base::Time::Now() - row.last_visit()), 445 kMaxSignificantDay)).ToInternalValue(); 446 float last_visit_value = 447 (static_cast<float>(delta_time) / 448 static_cast<float>(kMaxSignificantDay.ToInternalValue())) * 449 kLastVisitMaxValue; 450 451 const float kVisitCountMaxValue = 10.0; 452 const int kMaxSignificantVisits = 10; 453 float visit_count_value = 454 (static_cast<float>(std::min(row.visit_count(), 455 kMaxSignificantVisits))) / static_cast<float>(kMaxSignificantVisits) * 456 kVisitCountMaxValue; 457 458 const float kTypedCountMaxValue = 20.0; 459 const int kMaxSignificantTyped = 10; 460 float typed_count_value = 461 (static_cast<float>(std::min(row.typed_count(), 462 kMaxSignificantTyped))) / static_cast<float>(kMaxSignificantTyped) * 463 kTypedCountMaxValue; 464 465 float raw_score = order_value + start_value + complete_value + 466 last_visit_value + visit_count_value + typed_count_value; 467 468 // Normalize the score. 469 const float kMaxNormalizedRawScore = 1000.0; 470 raw_score = 471 (raw_score / (kOrderMaxValue + kStartMaxValue + kCompleteMaxValue + 472 kLastVisitMaxValue + kVisitCountMaxValue + 473 kTypedCountMaxValue)) * 474 kMaxNormalizedRawScore; 475 return static_cast<int>(raw_score); 476} 477 478// static 479Time InMemoryURLIndex::RecentThreshold() { 480 return Time::Now() - TimeDelta::FromDays(kLowQualityMatchAgeLimitInDays); 481} 482 483InMemoryURLIndex::AddHistoryMatch::AddHistoryMatch( 484 const InMemoryURLIndex& index, 485 const String16Vector& lower_terms) 486 : index_(index), 487 lower_terms_(lower_terms) { 488} 489 490InMemoryURLIndex::AddHistoryMatch::~AddHistoryMatch() {} 491 492void InMemoryURLIndex::AddHistoryMatch::operator()( 493 const InMemoryURLIndex::HistoryID history_id) { 494 HistoryInfoMap::const_iterator hist_pos = 495 index_.history_info_map_.find(history_id); 496 if (hist_pos != index_.history_info_map_.end()) { 497 const URLRow& hist_item = hist_pos->second; 498 // TODO(mrossetti): Accommodate multiple term highlighting. 499 size_t input_location = 0; 500 int score = InMemoryURLIndex::RawScoreForURL( 501 hist_item, lower_terms_, &input_location); 502 if (score != 0) { 503 // We only retain the top 10 highest scoring results so 504 // see if this one fits into the top 10 and, if so, where. 505 ScoredHistoryMatches::iterator scored_iter = scored_matches_.begin(); 506 while (scored_iter != scored_matches_.end() && 507 (*scored_iter).raw_score > score) 508 ++scored_iter; 509 if ((scored_matches_.size() < 10) || 510 (scored_iter != scored_matches_.end())) { 511 // Create and insert the new item. 512 // TODO(mrossetti): Properly set |match_in_scheme| and 513 // |innermost_match|. 514 bool match_in_scheme = false; 515 bool innermost_match = true; 516 ScoredHistoryMatch match(hist_item, input_location, 517 match_in_scheme, innermost_match, score); 518 if (!scored_matches_.empty()) 519 scored_matches_.insert(scored_iter, match); 520 else 521 scored_matches_.push_back(match); 522 // Trim any entries beyond 10. 523 if (scored_matches_.size() > 10) { 524 scored_matches_.erase(scored_matches_.begin() + 10, 525 scored_matches_.end()); 526 } 527 } 528 } 529 } 530} 531 532} // namespace history 533