1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/safe_browsing/safe_browsing_store.h"
6
7#include <algorithm>
8
9#include "base/metrics/histogram.h"
10
11namespace {
12
13// Find items matching between |subs| and |adds|, and remove them,
14// recording the item from |adds| in |adds_removed|.  To minimize
15// copies, the inputs are processing in parallel, so |subs| and |adds|
16// should be compatibly ordered (either by SBAddPrefixLess or
17// SBAddPrefixHashLess).
18//
19// |predAS| provides add < sub, |predSA| provides sub < add, for the
20// tightest compare appropriate (see calls in SBProcessSubs).
21template <class S, class A, typename PredAS, typename PredSA>
22void KnockoutSubs(std::vector<S>* subs,
23                  std::vector<A>* adds,
24                  PredAS predAS, PredSA predSA,
25                  std::vector<A>* adds_removed) {
26  // Keep a pair of output iterators for writing kept items.  Due to
27  // deletions, these may lag the main iterators.  Using erase() on
28  // individual items would result in O(N^2) copies.  Using std::list
29  // would work around that, at double or triple the memory cost.
30  typename std::vector<A>::iterator add_out = adds->begin();
31  typename std::vector<S>::iterator sub_out = subs->begin();
32
33  // Current location in vectors.
34  // TODO(shess): I want these to be const_iterator, but then
35  // std::copy() gets confused.  Could snag a const_iterator add_end,
36  // or write an inline std::copy(), but it seems like I'm doing
37  // something wrong.
38  typename std::vector<A>::iterator add_iter = adds->begin();
39  typename std::vector<S>::iterator sub_iter = subs->begin();
40
41  while (add_iter != adds->end() && sub_iter != subs->end()) {
42    // If |*sub_iter| < |*add_iter|, retain the sub.
43    if (predSA(*sub_iter, *add_iter)) {
44      *sub_out = *sub_iter;
45      ++sub_out;
46      ++sub_iter;
47
48      // If |*add_iter| < |*sub_iter|, retain the add.
49    } else if (predAS(*add_iter, *sub_iter)) {
50      *add_out = *add_iter;
51      ++add_out;
52      ++add_iter;
53
54      // Record equal items and drop them.
55    } else {
56      adds_removed->push_back(*add_iter);
57      ++add_iter;
58      ++sub_iter;
59    }
60  }
61
62  // Erase any leftover gap.
63  adds->erase(add_out, add_iter);
64  subs->erase(sub_out, sub_iter);
65}
66
67// Remove items in |removes| from |full_hashes|.  |full_hashes| and
68// |removes| should be ordered by SBAddPrefix component.
69template <class T>
70void RemoveMatchingPrefixes(const std::vector<SBAddPrefix>& removes,
71                            std::vector<T>* full_hashes) {
72  // This is basically an inline of std::set_difference().
73  // Unfortunately, that algorithm requires that the two iterator
74  // pairs use the same value types.
75
76  // Where to store kept items.
77  typename std::vector<T>::iterator out = full_hashes->begin();
78
79  typename std::vector<T>::iterator hash_iter = full_hashes->begin();
80  std::vector<SBAddPrefix>::const_iterator remove_iter = removes.begin();
81
82  while (hash_iter != full_hashes->end() && remove_iter != removes.end()) {
83    // Keep items less than |*remove_iter|.
84    if (SBAddPrefixLess(*hash_iter, *remove_iter)) {
85      *out = *hash_iter;
86      ++out;
87      ++hash_iter;
88
89      // No hit for |*remove_iter|, bump it forward.
90    } else if (SBAddPrefixLess(*remove_iter, *hash_iter)) {
91      ++remove_iter;
92
93      // Drop equal items, there may be multiple hits.
94    } else {
95      do {
96        ++hash_iter;
97      } while (hash_iter != full_hashes->end() &&
98               !SBAddPrefixLess(*remove_iter, *hash_iter));
99      ++remove_iter;
100    }
101  }
102
103  // Erase any leftover gap.
104  full_hashes->erase(out, hash_iter);
105}
106
107// Remove deleted items (|chunk_id| in |del_set|) from the vector.
108template <class T>
109void RemoveDeleted(std::vector<T>* vec, const base::hash_set<int32>& del_set) {
110  DCHECK(vec);
111
112  // Scan through the items read, dropping the items in |del_set|.
113  typename std::vector<T>::iterator add_iter = vec->begin();
114  for (typename std::vector<T>::iterator iter = add_iter;
115       iter != vec->end(); ++iter) {
116    if (del_set.count(iter->chunk_id) == 0) {
117      *add_iter = *iter;
118      ++add_iter;
119    }
120  }
121  vec->erase(add_iter, vec->end());
122}
123
124enum MissTypes {
125  MISS_TYPE_ALL,
126  MISS_TYPE_FALSE,
127
128  // Always at the end.
129  MISS_TYPE_MAX
130};
131
132}  // namespace
133
134void SBCheckPrefixMisses(const std::vector<SBAddPrefix>& add_prefixes,
135                         const std::set<SBPrefix>& prefix_misses) {
136  if (prefix_misses.empty())
137    return;
138
139  // Record a hit for all prefixes which missed when sent to the
140  // server.
141  for (size_t i = 0; i < prefix_misses.size(); ++i) {
142    UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
143                              MISS_TYPE_ALL, MISS_TYPE_MAX);
144  }
145
146  // Collect the misses which are not present in |add_prefixes|.
147  // Since |add_prefixes| can contain multiple copies of the same
148  // prefix, it is not sufficient to count the number of elements
149  // present in both collections.
150  std::set<SBPrefix> false_misses(prefix_misses.begin(), prefix_misses.end());
151  for (size_t i = 0; i < add_prefixes.size(); ++i) {
152    // |erase()| on an absent element should cost like |find()|.
153    false_misses.erase(add_prefixes[i].prefix);
154  }
155
156  // Record a hit for prefixes which we shouldn't have sent in the
157  // first place.
158  for (size_t i = 0; i < false_misses.size(); ++i) {
159    UMA_HISTOGRAM_ENUMERATION("SB2.BloomFilterFalsePositives",
160                              MISS_TYPE_FALSE, MISS_TYPE_MAX);
161  }
162
163  // Divide |MISS_TYPE_FALSE| by |MISS_TYPE_ALL| to get the
164  // bloom-filter false-positive rate.
165}
166
167void SBProcessSubs(std::vector<SBAddPrefix>* add_prefixes,
168                   std::vector<SBSubPrefix>* sub_prefixes,
169                   std::vector<SBAddFullHash>* add_full_hashes,
170                   std::vector<SBSubFullHash>* sub_full_hashes,
171                   const base::hash_set<int32>& add_chunks_deleted,
172                   const base::hash_set<int32>& sub_chunks_deleted) {
173  // It is possible to structure templates and template
174  // specializations such that the following calls work without having
175  // to qualify things.  It becomes very arbitrary, though, and less
176  // clear how things are working.
177
178  // Sort the inputs by the SBAddPrefix bits.
179  std::sort(add_prefixes->begin(), add_prefixes->end(),
180            SBAddPrefixLess<SBAddPrefix,SBAddPrefix>);
181  std::sort(sub_prefixes->begin(), sub_prefixes->end(),
182            SBAddPrefixLess<SBSubPrefix,SBSubPrefix>);
183  std::sort(add_full_hashes->begin(), add_full_hashes->end(),
184            SBAddPrefixHashLess<SBAddFullHash,SBAddFullHash>);
185  std::sort(sub_full_hashes->begin(), sub_full_hashes->end(),
186            SBAddPrefixHashLess<SBSubFullHash,SBSubFullHash>);
187
188  // Factor out the prefix subs.
189  std::vector<SBAddPrefix> removed_adds;
190  KnockoutSubs(sub_prefixes, add_prefixes,
191               SBAddPrefixLess<SBAddPrefix,SBSubPrefix>,
192               SBAddPrefixLess<SBSubPrefix,SBAddPrefix>,
193               &removed_adds);
194
195  // Remove the full-hashes corrosponding to the adds which
196  // KnockoutSubs() removed.  Processing these w/in KnockoutSubs()
197  // would make the code more complicated, and they are very small
198  // relative to the prefix lists so the gain would be modest.
199  RemoveMatchingPrefixes(removed_adds, add_full_hashes);
200  RemoveMatchingPrefixes(removed_adds, sub_full_hashes);
201
202  // http://crbug.com/52385
203  // TODO(shess): AFAICT this pass is not done on the trunk.  I
204  // believe that's a bug, but it may not matter because full-hash
205  // subs almost never happen (I think you'd need multiple collisions
206  // where one of the sites stopped being flagged?).  Enable this once
207  // everything is in.  [if(0) instead of #ifdef 0 to make sure it
208  // compiles.]
209  if (0) {
210    // Factor out the full-hash subs.
211    std::vector<SBAddFullHash> removed_full_adds;
212    KnockoutSubs(sub_full_hashes, add_full_hashes,
213                 SBAddPrefixHashLess<SBAddFullHash,SBSubFullHash>,
214                 SBAddPrefixHashLess<SBSubFullHash,SBAddFullHash>,
215                 &removed_full_adds);
216  }
217
218  // Remove items from the deleted chunks.  This is done after other
219  // processing to allow subs to knock out adds (and be removed) even
220  // if the add's chunk is deleted.
221  RemoveDeleted(add_prefixes, add_chunks_deleted);
222  RemoveDeleted(sub_prefixes, sub_chunks_deleted);
223  RemoveDeleted(add_full_hashes, add_chunks_deleted);
224  RemoveDeleted(sub_full_hashes, sub_chunks_deleted);
225}
226