// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This class loads a client-side model and lets you compute a phishing score
// for a set of previously extracted features.  The phishing score corresponds
// to the probability that the features are indicative of a phishing site.
//
// For more details on how the score is actually computed for a given model
// and a given set of features, read the comments in the client_model.proto
// file.
//
// See features.h for a list of features that are currently used.
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
207d4cd473f85ac64c3747c96c277f9e506a0d2246Torne (Richard Coles)#include "base/containers/hash_tables.h"
21c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include "base/strings/string_piece.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/common/safe_browsing/client_model.pb.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FeatureMap;
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Scorer methods are virtual to simplify mocking of this class.
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Scorer {
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~Scorer();
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Factory method which creates a new Scorer object by parsing the given
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // model.  If parsing fails this method returns NULL.
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Scorer* Create(const base::StringPiece& model_str);
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This method computes the probability that the given features are indicative
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of phishing.  It returns a score value that falls in the range [0.0,1.0]
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (range is inclusive on both ends).
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual double ComputeScore(const FeatureMap& features) const;
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the version number of the loaded client model.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int model_version() const;
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // -- Accessors used by the page feature extractor ---------------------------
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a set of hashed page terms that appear in the model in binary
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // format.
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const base::hash_set<std::string>& page_terms() const;
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a set of hashed page words that appear in the model in binary
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // format.
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const base::hash_set<uint32>& page_words() const;
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Return the maximum number of words per term for the loaded model.
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  size_t max_words_per_term() const;
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the murmurhash3 seed for the loaded model.
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 murmurhash3_seed() const;
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
60cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  // Return the maximum number of unique shingle hashes per page.
61cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  size_t max_shingles_per_page() const;
62cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
63cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  // Return the number of words in a shingle.
64cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  size_t shingle_size() const;
65cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) protected:
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Most clients should use the factory method.  This constructor is public
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to allow for mock implementations.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Scorer();
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class PhishingScorerTest;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Computes the score for a given rule and feature map.  The score is computed
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // by multiplying the rule weight with the product of feature weights for the
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // given rule.  The feature weights are stored in the feature map.  If a
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // particular feature does not exist in the feature map we set its weight to
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // zero.
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  double ComputeRuleScore(const ClientSideModel::Rule& rule,
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          const FeatureMap& features) const;
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ClientSideModel model_;
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::hash_set<std::string> page_terms_;
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  base::hash_set<uint32> page_words_;
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(Scorer);
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
88cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)}  // namespace safe_browsing
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
91