15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2011 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This class loads a client-side model and lets you compute a phishing score 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for a set of previously extracted features. The phishing score corresponds 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to the probability that the features are indicative of a phishing site. 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For more details on how the score is actually computed for a given model 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and a given set of features read the comments in client_model.proto file. 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See features.h for a list of features that are currently used. 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_RENDERER_SAFE_BROWSING_SCORER_H_ 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_RENDERER_SAFE_BROWSING_SCORER_H_ 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 207d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h" 21c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "base/strings/string_piece.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/common/safe_browsing/client_model.pb.h" 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing { 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureMap; 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scorer methods are virtual to simplify mocking of this class. 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Scorer { 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~Scorer(); 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Factory method which creates a new Scorer object by parsing the given 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // model. If parsing fails this method returns NULL. 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static Scorer* Create(const base::StringPiece& model_str); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This method computes the probability that the given features are indicative 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of phishing. It returns a score value that falls in the range [0.0,1.0] 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (range is inclusive on both ends). 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual double ComputeScore(const FeatureMap& features) const; 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the version number of the loaded client model. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int model_version() const; 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // -- Accessors used by the page feature extractor --------------------------- 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a set of hashed page terms that appear in the model in binary 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // format. 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const base::hash_set<std::string>& page_terms() const; 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a set of hashed page words that appear in the model in binary 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // format. 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const base::hash_set<uint32>& page_words() const; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Return the maximum number of words per term for the loaded model. 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t max_words_per_term() const; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the murmurhash3 seed for the loaded model. 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 murmurhash3_seed() const; 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 60cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) // Return the maximum number of unique shingle hashes per page. 61cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) size_t max_shingles_per_page() const; 62cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) 63cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) // Return the number of words in a shingle. 64cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) size_t shingle_size() const; 65cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) protected: 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Most clients should use the factory method. This constructor is public 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to allow for mock implementations. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Scorer(); 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) friend class PhishingScorerTest; 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Computes the score for a given rule and feature map. The score is computed 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // by multiplying the rule weight with the product of feature weights for the 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // given rule. The feature weights are stored in the feature map. If a 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // particular feature does not exist in the feature map we set its weight to 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // zero. 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) double ComputeRuleScore(const ClientSideModel::Rule& rule, 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const FeatureMap& features) const; 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClientSideModel model_; 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::hash_set<std::string> page_terms_; 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::hash_set<uint32> page_words_; 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(Scorer); 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 88cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)} // namespace safe_browsing 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_ 91