15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A read-only set implementation for |SBPrefix| items.  Prefixes are
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// sorted and stored as 16-bit deltas from the previous prefix.  An
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// index structure provides quick random access, and also handles
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// cases where 16 bits cannot encode a delta.
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, the sequence {20, 25, 41, 65432, 150000, 160000} would
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// be stored as:
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  A pair {20, 0} in |index_|.
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  5, 16, 65391 in |deltas_|.
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  A pair {150000, 3} in |index_|.
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  10000 in |deltas_|.
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// |index_.size()| will be 2, |deltas_.size()| will be 4.
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This structure is intended for storage of sparse uniform sets of
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// prefixes of a certain size.  As of this writing, my safe-browsing
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// database contains:
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   653132 add prefixes
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   6446 are duplicates (from different chunks)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   24301 w/in 2^8 of the prior prefix
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   622337 w/in 2^16 of the prior prefix
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//   47 further than 2^16 from the prior prefix
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For this input, the memory usage is approximately 2 bytes per
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// prefix, a bit over 1.2M.  The bloom filter used 25 bits per prefix,
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a bit over 1.9M on this data.
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Experimenting with random selections of the above data, storage
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// size drops almost linearly as prefix count drops, until the index
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// overhead starts to become a problem a bit under 200k prefixes.  The
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// memory footprint gets worse than storing the raw prefix data around
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 75k prefixes.  Fortunately, the actual memory footprint also falls.
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If the prefix count increases the memory footprint should increase
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// approximately linearly.  The worst-case would be 2^16 items all
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 2^16 apart, which would need 512k (versus 256k to store the raw
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// data).
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The on-disk format looks like:
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//         4 byte magic number
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//         4 byte version number
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//         4 byte |index_.size()|
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//         4 byte |deltas_.size()|
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     n * 8 byte |&index_[0]..&index_[n]|
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     m * 2 byte |&deltas_[0]..&deltas_[m]|
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//        16 byte digest
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef CHROME_BROWSER_SAFE_BROWSING_PREFIX_SET_H_
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define CHROME_BROWSER_SAFE_BROWSING_PREFIX_SET_H_
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "chrome/browser/safe_browsing/safe_browsing_util.h"
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)namespace base {
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FilePath;
582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace safe_browsing {
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PrefixSet {
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit PrefixSet(const std::vector<SBPrefix>& sorted_prefixes);
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~PrefixSet();
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |true| if |prefix| was in |prefixes| passed to the constructor.
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool Exists(SBPrefix prefix) const;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Persist the set on disk.
712a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  static PrefixSet* LoadFile(const base::FilePath& filter_name);
722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  bool WriteFile(const base::FilePath& filter_name) const;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Regenerate the vector of prefixes passed to the constructor into
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |prefixes|.  Prefixes will be added in sorted order.
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void GetPrefixes(std::vector<SBPrefix>* prefixes) const;
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Maximum number of consecutive deltas to encode before generating
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a new index entry.  This helps keep the worst-case performance
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // for |Exists()| under control.
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const size_t kMaxRun = 100;
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper for |LoadFile()|.  Steals the contents of |index| and
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |deltas| using |swap()|.
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PrefixSet(std::vector<std::pair<SBPrefix,size_t> > *index,
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            std::vector<uint16> *deltas);
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Top-level index of prefix to offset in |deltas_|.  Each pair
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // indicates a base prefix and where the deltas from that prefix
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // begin in |deltas_|.  The deltas for a pair end at the next pair's
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // index into |deltas_|.
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<std::pair<SBPrefix,size_t> > index_;
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Deltas which are added to the prefix in |index_| to generate
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // prefixes.  Deltas are only valid between consecutive items from
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |index_|, or the end of |deltas_| for the last |index_| pair.
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<uint16> deltas_;
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(PrefixSet);
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace safe_browsing
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // CHROME_BROWSER_SAFE_BROWSING_PREFIX_SET_H_
106