15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/history/scored_history_match.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <algorithm> 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <functional> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <iterator> 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <numeric> 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <math.h> 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/metrics/histogram.h" 16868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h" 17868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h" 182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "chrome/browser/autocomplete/history_url_provider.h" 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/autocomplete/url_prefix.h" 202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "chrome/browser/bookmarks/bookmark_service.h" 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/common/chrome_switches.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/public/browser/browser_thread.h" 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace history { 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ScoredHistoryMatch ---------------------------------------------------------- 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScoredHistoryMatch::initialized_ = false; 29c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)const size_t ScoredHistoryMatch::kMaxVisitsToScore = 10u; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScoredHistoryMatch::also_do_hup_like_scoring = false; 312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)int ScoredHistoryMatch::max_assigned_score_for_non_inlineable_matches = -1; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScoredHistoryMatch::ScoredHistoryMatch() 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : raw_score(0), 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) can_inline(false) { 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!initialized_) { 372a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) InitializeAlsoDoHUPLikeScoringFieldAndMaxScoreField(); 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) initialized_ = true; 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScoredHistoryMatch::ScoredHistoryMatch(const URLRow& row, 43c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const VisitInfoVector& visits, 442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) const std::string& languages, 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const string16& lower_string, 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const String16Vector& terms, 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const RowWordStarts& word_starts, 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const base::Time now, 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BookmarkService* bookmark_service) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : HistoryMatch(row, 0, false, false), 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) raw_score(0), 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) can_inline(false) { 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!initialized_) { 542a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) InitializeAlsoDoHUPLikeScoringFieldAndMaxScoreField(); 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) initialized_ = true; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) GURL gurl = row.url(); 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!gurl.is_valid()) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Figure out where each search term appears in the URL and/or page title 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so that we can score as well as provide autocomplete highlighting. 642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) string16 url = CleanUpUrlForMatching(gurl, languages); 652a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) string16 title = CleanUpTitleForMatching(row.title()); 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int term_num = 0; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (String16Vector::const_iterator iter = terms.begin(); iter != terms.end(); 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++iter, ++term_num) { 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string16 term = *iter; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatches url_term_matches = MatchTermInString(term, url, term_num); 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TermMatches title_term_matches = MatchTermInString(term, title, term_num); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (url_term_matches.empty() && title_term_matches.empty()) 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; // A term was not found in either URL or title - reject. 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url_matches.insert(url_matches.end(), url_term_matches.begin(), 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url_term_matches.end()); 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) title_matches.insert(title_matches.end(), title_term_matches.begin(), 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) title_term_matches.end()); 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Sort matches by offset and eliminate any which overlap. 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(mpearson): Investigate whether this has any meaningful 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // effect on scoring. (It's necessary at some point: removing 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // overlaps and sorting is needed to decide what to highlight in the 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // suggestion string. But this sort and de-overlap doesn't have to 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // be done before scoring.) 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url_matches = SortAndDeoverlapMatches(url_matches); 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) title_matches = SortAndDeoverlapMatches(title_matches); 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 892a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // We can inline autocomplete a match if: 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 1) there is only one search term 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 2) AND the match begins immediately after one of the prefixes in 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // URLPrefix such as http://www and https:// (note that one of these 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is the empty prefix, for cases where the user has typed the scheme) 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 3) AND the search string does not end in whitespace (making it look to 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the IMUI as though there is a single search term when actually there 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is a second, empty term). 972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // |best_inlineable_prefix| stores the inlineable prefix computed in 982a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // clause (2) or NULL if no such prefix exists. (The URL is not inlineable.) 992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Note that using the best prefix here means that when multiple 1002a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // prefixes match, we'll choose to inline following the longest one. 1012a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // For a URL like "http://www.washingtonmutual.com", this means 1022a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // typing "w" will inline "ashington..." instead of "ww.washington...". 1032a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) const URLPrefix* best_inlineable_prefix = 1042a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) (!url_matches.empty() && (terms.size() == 1)) ? 1052a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) URLPrefix::BestURLPrefix(UTF8ToUTF16(gurl.spec()), terms[0]) : 1062a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) NULL; 1072a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) can_inline = (best_inlineable_prefix != NULL) && 1082a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) !IsWhitespace(*(lower_string.rbegin())); 1092a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) match_in_scheme = can_inline && best_inlineable_prefix->prefix.empty(); 1102a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (can_inline) { 1112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Initialize innermost_match. 1122a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // The idea here is that matches that occur in the scheme or 1132a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // "www." are worse than matches which don't. For the URLs 1142a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // "http://www.google.com" and "http://wellsfargo.com", we want 1152a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // the omnibox input "w" to cause the latter URL to rank higher 1162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // than the former. Note that this is not the same as checking 1172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // whether one match's inlinable prefix has more components than 1182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // the other match's, since in this example, both matches would 1192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // have an inlinable prefix of "http://", which is one component. 1202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // 1212a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Instead, we look for the overall best (i.e., most components) 1222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // prefix of the current URL, and then check whether the inlinable 1232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // prefix has that many components. If it does, this is an 1242a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // "innermost" match, and should be boosted. In the example 1252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // above, the best prefixes for the two URLs have two and one 1262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // components respectively, while the inlinable prefixes each 1272a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // have one component; this means the first match is not innermost 1282a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // and the second match is innermost, resulting in us boosting the 1292a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // second match. 1302a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // 1312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Now, the code that implements this. 1322a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // The deepest prefix for this URL regardless of where the match is. 1332a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) const URLPrefix* best_prefix = 1342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) URLPrefix::BestURLPrefix(UTF8ToUTF16(gurl.spec()), string16()); 1352a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) DCHECK(best_prefix != NULL); 1362a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) const int num_components_in_best_prefix = best_prefix->num_components; 1372a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // If the URL is inlineable, we must have a match. Note the prefix that 1382a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // makes it inlineable may be empty. 1392a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) DCHECK(best_inlineable_prefix != NULL); 1402a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) const int num_components_in_best_inlineable_prefix = 1412a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) best_inlineable_prefix->num_components; 1422a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) innermost_match = (num_components_in_best_inlineable_prefix == 1432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) num_components_in_best_prefix); 1442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1467d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) const float topicality_score = GetTopicalityScore( 1477d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) terms.size(), url, url_matches, title_matches, word_starts); 1487d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) const float frecency_score = GetFrecency(now, visits); 1497d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) raw_score = GetFinalRelevancyScore(topicality_score, frecency_score); 1507d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) raw_score = 1517d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) (raw_score <= kint32max) ? static_cast<int>(raw_score) : kint32max; 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (also_do_hup_like_scoring && can_inline) { 1542a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // HistoryURL-provider-like scoring gives any match that is 1552a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // capable of being inlined a certain minimum score. Some of these 1562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // are given a higher score that lets them be shown in inline. 1572a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // This test here derives from the test in 1582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // HistoryURLProvider::PromoteMatchForInlineAutocomplete(). 1592a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) const bool promote_to_inline = (row.typed_count() > 1) || 1602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) (IsHostOnly() && (row.typed_count() == 1)); 1612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) int hup_like_score = promote_to_inline ? 1622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) HistoryURLProvider::kScoreForBestInlineableResult : 1632a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) HistoryURLProvider::kBaseScoreForNonInlineableResult; 1642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1652a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Also, if the user types the hostname of a host with a typed 1662a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // visit, then everything from that host get given inlineable scores 1672a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // (because the URL-that-you-typed will go first and everything 1682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // else will be assigned one minus the previous score, as coded 1692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // at the end of HistoryURLProvider::DoAutocomplete(). 1702a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (UTF8ToUTF16(gurl.host()) == terms[0]) 1712a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) hup_like_score = HistoryURLProvider::kScoreForBestInlineableResult; 1722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // HistoryURLProvider has the function PromoteOrCreateShorterSuggestion() 1742a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // that's meant to promote prefixes of the best match (if they've 1752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // been visited enough related to the best match) or 1762a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // create/promote host-only suggestions (even if they've never 1772a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // been typed). The code is complicated and we don't try to 1782a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // duplicate the logic here. Instead, we handle a simple case: in 1792a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // low-typed-count ranges, give host-only matches (i.e., 1802a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // http://www.foo.com/ vs. http://www.foo.com/bar.html) a boost so 1812a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // that the host-only match outscores all the other matches that 1822a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // would normally have the same base score. This behavior is not 1832a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // identical to what happens in HistoryURLProvider even in these 1842a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // low typed count ranges--sometimes it will create/promote when 1852a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // this test does not (indeed, we cannot create matches like HUP 1862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // can) and vice versa--but the underlying philosophy is similar. 1872a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (!promote_to_inline && IsHostOnly()) 1882a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) hup_like_score++; 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // All the other logic to goes into hup-like-scoring happens in 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the tie-breaker case of MatchScoreGreater(). 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1932a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Incorporate hup_like_score into raw_score. 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) raw_score = std::max(raw_score, hup_like_score); 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1962a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // If this match is not inlineable and there's a cap on the maximum 1982a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // score that can be given to non-inlineable matches, apply the cap. 1992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (!can_inline && (max_assigned_score_for_non_inlineable_matches != -1)) { 2002a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) raw_score = std::min(max_assigned_score_for_non_inlineable_matches, 2012a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) raw_score); 2022a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)ScoredHistoryMatch::~ScoredHistoryMatch() {} 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Comparison function for sorting ScoredMatches by their scores with 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// intelligent tie-breaking. 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool ScoredHistoryMatch::MatchScoreGreater(const ScoredHistoryMatch& m1, 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const ScoredHistoryMatch& m2) { 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m1.raw_score != m2.raw_score) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return m1.raw_score > m2.raw_score; 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This tie-breaking logic is inspired by / largely copied from the 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ordering logic in history_url_provider.cc CompareHistoryMatch(). 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // A URL that has been typed at all is better than one that has never been 2182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // typed. (Note "!"s on each side.) 2192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (!m1.url_info.typed_count() != !m2.url_info.typed_count()) 2202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) return m1.url_info.typed_count() > m2.url_info.typed_count(); 2212a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Innermost matches (matches after any scheme or "www.") are better than 2232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // non-innermost matches. 2242a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (m1.innermost_match != m2.innermost_match) 2252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) return m1.innermost_match; 2262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // URLs that have been typed more often are better. 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m1.url_info.typed_count() != m2.url_info.typed_count()) 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return m1.url_info.typed_count() > m2.url_info.typed_count(); 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For URLs that have each been typed once, a host (alone) is better 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // than a page inside. 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m1.url_info.typed_count() == 1) { 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m1.IsHostOnly() != m2.IsHostOnly()) 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return m1.IsHostOnly(); 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // URLs that have been visited more often are better. 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m1.url_info.visit_count() != m2.url_info.visit_count()) 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return m1.url_info.visit_count() > m2.url_info.visit_count(); 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // URLs that have been visited more recently are better. 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return m1.url_info.last_visit() > m2.url_info.last_visit(); 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)float ScoredHistoryMatch::GetTopicalityScore( 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int num_terms, 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const string16& url, 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const TermMatches& url_matches, 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const TermMatches& title_matches, 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const RowWordStarts& word_starts) { 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Because the below thread is not thread safe, we check that we're 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // only calling it from one thread: the UI thread. Specifically, 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we check "if we've heard of the UI thread then we'd better 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // be on it." The first part is necessary so unit tests pass. (Many 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // unit tests don't set up the threading naming system; hence 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // CurrentlyOn(UI thread) will fail.) 2593d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch DCHECK(!content::BrowserThread::IsThreadInitialized( 2603d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch content::BrowserThread::UI) || 2613d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch content::BrowserThread::CurrentlyOn(content::BrowserThread::UI)); 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (raw_term_score_to_topicality_score == NULL) { 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) raw_term_score_to_topicality_score = new float[kMaxRawTermScore]; 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FillInTermScoreToTopicalityScoreArray(); 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A vector that accumulates per-term scores. The strongest match--a 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // match in the hostname at a word boundary--is worth 10 points. 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Everything else is less. In general, a match that's not at a word 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // boundary is worth about 1/4th or 1/5th of a match at the word boundary 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the same part of the URL/title. 271c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) DCHECK_GT(num_terms, 0); 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<int> term_scores(num_terms, 0); 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<size_t>::const_iterator next_word_starts = 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_starts.url_word_starts_.begin(); 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<size_t>::const_iterator end_word_starts = 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) word_starts.url_word_starts_.end(); 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t question_mark_pos = url.find('?'); 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t colon_pos = url.find(':'); 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The + 3 skips the // that probably appears in the protocol 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // after the colon. If the protocol doesn't have two slashes after 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the colon, that's okay--all this ends up doing is starting our 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // search for the next / a few characters into the hostname. The 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // only times this can cause problems is if we have a protocol without 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a // after the colon and the hostname is only one or two characters. 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This isn't worth worrying about. 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const size_t end_of_hostname_pos = (colon_pos != std::string::npos) ? 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url.find('/', colon_pos + 3) : url.find('/'); 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t last_part_of_hostname_pos = 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (end_of_hostname_pos != std::string::npos) ? 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url.rfind('.', end_of_hostname_pos) : 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) url.rfind('.'); 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Loop through all URL matches and score them appropriately. 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (TermMatches::const_iterator iter = url_matches.begin(); 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) iter != url_matches.end(); ++iter) { 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance next_word_starts until it's >= the position of the term 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we're considering. 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while ((next_word_starts != end_word_starts) && 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (*next_word_starts < iter->offset)) { 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++next_word_starts; 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const bool at_word_boundary = (next_word_starts != end_word_starts) && 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (*next_word_starts == iter->offset); 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((question_mark_pos != std::string::npos) && 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (iter->offset > question_mark_pos)) { 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // match in CGI ?... fragment 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) term_scores[iter->term_num] += at_word_boundary ? 5 : 0; 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if ((end_of_hostname_pos != std::string::npos) && 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (iter->offset > end_of_hostname_pos)) { 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // match in path 310c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) term_scores[iter->term_num] += at_word_boundary ? 8 : 0; 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if ((colon_pos == std::string::npos) || 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (iter->offset > colon_pos)) { 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // match in hostname 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((last_part_of_hostname_pos == std::string::npos) || 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (iter->offset < last_part_of_hostname_pos)) { 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Either there are no dots in the hostname or this match isn't 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the last dotted component. 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) term_scores[iter->term_num] += at_word_boundary ? 10 : 2; 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } // else: match in the last part of a dotted hostname (usually 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this is the top-level domain .com, .net, etc.). Do not 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // count this match for scoring. 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } // else: match in protocol. Do not count this match for scoring. 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Now do the analogous loop over all matches in the title. 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next_word_starts = word_starts.title_word_starts_.begin(); 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) end_word_starts = word_starts.title_word_starts_.end(); 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int word_num = 0; 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (TermMatches::const_iterator iter = title_matches.begin(); 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) iter != title_matches.end(); ++iter) { 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance next_word_starts until it's >= the position of the term 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we're considering. 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while ((next_word_starts != end_word_starts) && 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (*next_word_starts < iter->offset)) { 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++next_word_starts; 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ++word_num; 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (word_num >= 10) break; // only count the first ten words 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const bool at_word_boundary = (next_word_starts != end_word_starts) && 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (*next_word_starts == iter->offset); 3402a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) term_scores[iter->term_num] += at_word_boundary ? 8 : 0; 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(mpearson): Restore logic for penalizing out-of-order matches. 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (Perhaps discount them by 0.8?) 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(mpearson): Consider: if the earliest match occurs late in the string, 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // should we discount it? 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(mpearson): Consider: do we want to score based on how much of the 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // input string the input covers? (I'm leaning toward no.) 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compute the topicality_score as the sum of transformed term_scores. 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) float topicality_score = 0; 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (size_t i = 0; i < term_scores.size(); ++i) { 352c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Drop this URL if it seems like a term didn't appear or, more precisely, 353c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // didn't appear in a part of the URL or title that we trust enough 354c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // to give it credit for. For instance, terms that appear in the middle 355c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // of a CGI parameter get no credit. Almost all the matches dropped 356c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // due to this test would look stupid if shown to the user. 357c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) if (term_scores[i] == 0) 358c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return 0; 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) topicality_score += raw_term_score_to_topicality_score[ 360c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) (term_scores[i] >= kMaxRawTermScore) ? (kMaxRawTermScore - 1) : 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) term_scores[i]]; 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(mpearson): If there are multiple terms, consider taking the 364c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // geometric mean of per-term scores rather than the arithmetic mean. 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 366c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return topicality_score / num_terms; 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)float* ScoredHistoryMatch::raw_term_score_to_topicality_score = NULL; 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ScoredHistoryMatch::FillInTermScoreToTopicalityScoreArray() { 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int term_score = 0; term_score < kMaxRawTermScore; ++term_score) { 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) float topicality_score; 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (term_score < 10) { 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the term scores less than 10 points (no full-credit hit, or 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // no combination of hits that score that well), then the topicality 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // score is linear in the term score. 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) topicality_score = 0.1 * term_score; 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For term scores of at least ten points, pass them through a log 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // function so a score of 10 points gets a 1.0 (to meet up exactly 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // with the linear component) and increases logarithmically until 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // maxing out at 30 points, with computes to a score around 2.1. 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) topicality_score = (1.0 + 2.25 * log10(0.1 * term_score)); 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) raw_term_score_to_topicality_score[term_score] = topicality_score; 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)float* ScoredHistoryMatch::days_ago_to_recency_score = NULL; 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)float ScoredHistoryMatch::GetRecencyScore(int last_visit_days_ago) { 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Because the below thread is not thread safe, we check that we're 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // only calling it from one thread: the UI thread. Specifically, 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we check "if we've heard of the UI thread then we'd better 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // be on it." The first part is necessary so unit tests pass. (Many 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // unit tests don't set up the threading naming system; hence 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // CurrentlyOn(UI thread) will fail.) 4033d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch DCHECK(!content::BrowserThread::IsThreadInitialized( 4043d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch content::BrowserThread::UI) || 4053d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch content::BrowserThread::CurrentlyOn(content::BrowserThread::UI)); 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (days_ago_to_recency_score == NULL) { 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) days_ago_to_recency_score = new float[kDaysToPrecomputeRecencyScoresFor]; 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FillInDaysAgoToRecencyScoreArray(); 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Lookup the score in days_ago_to_recency_score, treating 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // everything older than what we've precomputed as the oldest thing 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we've precomputed. The std::max is to protect against corruption 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the database (in case last_visit_days_ago is negative). 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return days_ago_to_recency_score[ 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::max( 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::min(last_visit_days_ago, kDaysToPrecomputeRecencyScoresFor - 1), 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 0)]; 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void ScoredHistoryMatch::FillInDaysAgoToRecencyScoreArray() { 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int days_ago = 0; days_ago < kDaysToPrecomputeRecencyScoresFor; 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) days_ago++) { 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int unnormalized_recency_score; 424c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) if (days_ago <= 4) { 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unnormalized_recency_score = 100; 426c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } else if (days_ago <= 14) { 427c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Linearly extrapolate between 4 and 14 days so 14 days has a score 428c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // of 70. 429c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) unnormalized_recency_score = 70 + (14 - days_ago) * (100 - 70) / (14 - 4); 430c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } else if (days_ago <= 31) { 431c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Linearly extrapolate between 14 and 31 days so 31 days has a score 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of 50. 433c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) unnormalized_recency_score = 50 + (31 - days_ago) * (70 - 50) / (31 - 14); 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (days_ago <= 90) { 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Linearly extrapolate between 30 and 90 days so 90 days has a score 436c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // of 30. 437c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) unnormalized_recency_score = 30 + (90 - days_ago) * (50 - 30) / (90 - 30); 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Linearly extrapolate between 90 and 365 days so 365 days has a score 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of 10. 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unnormalized_recency_score = 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10 + (365 - days_ago) * (20 - 10) / (365 - 90); 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) days_ago_to_recency_score[days_ago] = unnormalized_recency_score / 100.0; 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (days_ago > 0) { 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(days_ago_to_recency_score[days_ago], 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) days_ago_to_recency_score[days_ago - 1]); 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static 453c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)float ScoredHistoryMatch::GetFrecency(const base::Time& now, 454c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const VisitInfoVector& visits) { 455c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Compute the weighted average |value_of_transition| over the last at 456c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // most kMaxVisitsToScore visits, where each visit is weighted using 457c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // GetRecencyScore() based on how many days ago it happened. Use 458c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // kMaxVisitsToScore as the denominator for the average regardless of 459c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // how many visits there were in order to penalize a match that has 460c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // fewer visits than kMaxVisitsToScore. 461c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const int total_sampled_visits = std::min(visits.size(), kMaxVisitsToScore); 4627d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) if (total_sampled_visits == 0) 4637d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) return 0.0f; 464c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) float summed_visit_points = 0; 465c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) for (int i = 0; i < total_sampled_visits; ++i) { 466c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const int value_of_transition = 467c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) (visits[i].second == content::PAGE_TRANSITION_TYPED) ? 20 : 1; 468c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const float bucket_weight = 469c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) GetRecencyScore((now - visits[i].first).InDays()); 470c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) summed_visit_points += (value_of_transition * bucket_weight); 471c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } 472c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return visits.size() * summed_visit_points / total_sampled_visits; 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// static 476c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)float ScoredHistoryMatch::GetFinalRelevancyScore(float topicality_score, 477c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) float frecency_score) { 478c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) if (topicality_score == 0) 479c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return 0; 4802a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // Here's how to interpret intermediate_score: Suppose the omnibox 481c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // has one input term. Suppose we have a URL for which the omnibox 482c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // input term has a single URL hostname hit at a word boundary. (This 483c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // implies topicality_score = 1.0.). Then the intermediate_score for 484c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // this URL will depend entirely on the frecency_score with 485c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // this interpretation: 486c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - a single typed visit more than three months ago, no other visits -> 0.2 487c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - a visit every three days, no typed visits -> 0.706 488c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - a visit every day, no typed visits -> 0.916 489c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - a single typed visit yesterday, no other visits -> 2.0 490c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - a typed visit once a week -> 11.77 491c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - a typed visit every three days -> 14.12 492c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // - at least ten typed visits today -> 20.0 (maximum score) 493c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const float intermediate_score = topicality_score * frecency_score; 494c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // The below code maps intermediate_score to the range [0, 1399]. 495c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // The score maxes out at 1400 (i.e., cannot beat a good inline result). 496c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) if (intermediate_score <= 1) { 497c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Linearly extrapolate between 0 and 1.5 so 0 has a score of 400 498c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // and 1.5 has a score of 600. 499c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const float slope = (600 - 400) / (1.5f - 0.0f); 500c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return 400 + slope * intermediate_score; 501c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) } 502c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) if (intermediate_score <= 12.0) { 503c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Linearly extrapolate up to 12 so 12 has a score of 1300. 504c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const float slope = (1300 - 600) / (12.0f - 1.5f); 505c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return 600 + slope * (intermediate_score - 1.5); 5062a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 507c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // Linearly extrapolate so a score of 20 (or more) has a score of 1399. 508c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // (Scores above 20 are possible for URLs that have multiple term hits 509c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // in the URL and/or title and that are visited practically all 510c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // the time using typed visits. We don't attempt to distinguish 511c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) // between these very good results.) 512c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) const float slope = (1399 - 1300) / (20.0f - 12.0f); 513c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) return std::min(1399.0, 1300 + slope * (intermediate_score - 12.0)); 5142a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 5152a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 5162a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void ScoredHistoryMatch::InitializeAlsoDoHUPLikeScoringFieldAndMaxScoreField() { 5177d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles) also_do_hup_like_scoring = false; 5182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // When doing HUP-like scoring, don't allow a non-inlineable match 5192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // to beat the score of good inlineable matches. This is a problem 5202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // because if a non-inlineable match ends up with the highest score 5212a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // from HistoryQuick provider, all HistoryQuick matches get demoted 5222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // to non-inlineable scores (scores less than 1200). Without 5232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // HUP-like-scoring, these results would actually come from the HUP 5242a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // and not be demoted, thus outscoring the demoted HQP results. 5252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // When the HQP provides these, we need to clamp the non-inlineable 5262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) // results to preserve this behavior. 5272a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) if (also_do_hup_like_scoring) { 5282a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) max_assigned_score_for_non_inlineable_matches = 5292a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) HistoryURLProvider::kScoreForBestInlineableResult - 1; 5302a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace history 534