1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Provides global database of differential decompression dictionaries for the
6// SDCH filter (processes sdch encoded content).
7
8// Exactly one instance of SdchManager is built, and all references are made
9// into that collection.
10//
11// The SdchManager maintains a collection of memory resident dictionaries. It
12// can find a dictionary (based on a server specification of a hash), store a
13// dictionary, and make judgements about what URLs can use, set, etc. a
14// dictionary.
15
16// These dictionaries are acquired over the net, and include a header
17// (containing metadata) as well as a VCDIFF dictionary (for use by a VCDIFF
18// module) to decompress data.
19
20#ifndef NET_BASE_SDCH_MANAGER_H_
21#define NET_BASE_SDCH_MANAGER_H_
22
23#include <map>
24#include <set>
25#include <string>
26
27#include "base/gtest_prod_util.h"
28#include "base/memory/ref_counted.h"
29#include "base/memory/scoped_ptr.h"
30#include "base/threading/non_thread_safe.h"
31#include "base/time/time.h"
32#include "net/base/net_export.h"
33#include "url/gurl.h"
34
35namespace net {
36
37//------------------------------------------------------------------------------
38// Create a public interface to help us load SDCH dictionaries.
39// The SdchManager class allows registration to support this interface.
40// A browser may register a fetcher that is used by the dictionary managers to
41// get data from a specified URL. This allows us to use very high level browser
42// functionality in this base (when the functionality can be provided).
43class NET_EXPORT SdchFetcher {
44 public:
  // Receiver for fetched dictionaries. SdchManager (declared later in this
  // file) derives from this interface.
45  class NET_EXPORT Delegate {
46   public:
47    virtual ~Delegate() {}
48
49    // Called whenever the SdchFetcher has successfully retrieved a
50    // dictionary. |dictionary_text| contains the body of the dictionary
51    // retrieved from |dictionary_url|.
52    virtual void AddSdchDictionary(const std::string& dictionary_text,
53                                   const GURL& dictionary_url) = 0;
54  };
55
56  SdchFetcher() {}
57  virtual ~SdchFetcher() {}
58
59  // Called when there is a need to get a dictionary from a server. The
60  // implementation is responsible for obtaining the dictionary text from
61  // |dictionary_url| and then calling AddSdchDictionary() on the Delegate.
62  virtual void Schedule(const GURL& dictionary_url) = 0;
63
64  // Called to cancel all pending dictionary fetches.
65  // This is used for the implementation of SdchManager::ClearData() below.
66  virtual void Cancel() = 0;
67
68 private:
69  DISALLOW_COPY_AND_ASSIGN(SdchFetcher);
70};
71
72//------------------------------------------------------------------------------
73
74class NET_EXPORT SdchManager
75    : public SdchFetcher::Delegate,
76      public NON_EXPORTED_BASE(base::NonThreadSafe) {
  // Holds the memory-resident SDCH dictionaries, the fetcher used to download
  // them, the per-domain blacklist, and the set of hosts allowed in the latency
  // experiment. Receives fetched dictionaries through SdchFetcher::Delegate.
  // NOTE(review): derives from base::NonThreadSafe, so presumably every call
  // must be made on a single thread -- confirm against that class.
77 public:
78  // A list of errors that appeared and were either resolved, or used to turn
79  // off sdch encoding.
  // Every code other than the MIN/MAX sentinels has an explicit value, and
  // retired values are kept as "defunct" comments.
  // NOTE(review): MAX_PROBLEM_CODE bounds a histogram, so existing values
  // presumably must never be renumbered or reused -- confirm.
80  enum ProblemCodes {
81    MIN_PROBLEM_CODE,  // Implicitly 0; initial value of BlacklistInfo::reason.
82
83    // Content-encoding correction problems.
84    ADDED_CONTENT_ENCODING = 1,
85    FIXED_CONTENT_ENCODING = 2,
86    FIXED_CONTENT_ENCODINGS = 3,
87
88    // Content decoding errors.
89    DECODE_HEADER_ERROR = 4,
90    DECODE_BODY_ERROR = 5,
91
92    // More content-encoding correction problems.
93    OPTIONAL_GUNZIP_ENCODING_ADDED = 6,
94
95    // Content encoding correction when we're not even tagged as HTML!?!
96    BINARY_ADDED_CONTENT_ENCODING = 7,
97    BINARY_FIXED_CONTENT_ENCODING = 8,
98    BINARY_FIXED_CONTENT_ENCODINGS = 9,
99
100    // Dictionary selection for use problems.
101    DICTIONARY_FOUND_HAS_WRONG_DOMAIN = 10,
102    DICTIONARY_FOUND_HAS_WRONG_PORT_LIST = 11,
103    DICTIONARY_FOUND_HAS_WRONG_PATH = 12,
104    DICTIONARY_FOUND_HAS_WRONG_SCHEME = 13,
105    DICTIONARY_HASH_NOT_FOUND = 14,
106    DICTIONARY_HASH_MALFORMED = 15,
107
108    // Dictionary saving problems.
109    DICTIONARY_HAS_NO_HEADER = 20,
110    DICTIONARY_HEADER_LINE_MISSING_COLON = 21,
111    DICTIONARY_MISSING_DOMAIN_SPECIFIER = 22,
112    DICTIONARY_SPECIFIES_TOP_LEVEL_DOMAIN = 23,
113    DICTIONARY_DOMAIN_NOT_MATCHING_SOURCE_URL = 24,
114    DICTIONARY_PORT_NOT_MATCHING_SOURCE_URL = 25,
115    DICTIONARY_HAS_NO_TEXT = 26,
116    DICTIONARY_REFERER_URL_HAS_DOT_IN_PREFIX = 27,
117
118    // Dictionary loading problems.
119    DICTIONARY_LOAD_ATTEMPT_FROM_DIFFERENT_HOST = 30,
120    DICTIONARY_SELECTED_FOR_SSL = 31,
121    DICTIONARY_ALREADY_LOADED = 32,
122    DICTIONARY_SELECTED_FROM_NON_HTTP = 33,
123    DICTIONARY_IS_TOO_LARGE= 34,
124    DICTIONARY_COUNT_EXCEEDED = 35,
125    DICTIONARY_ALREADY_SCHEDULED_TO_DOWNLOAD = 36,
126    DICTIONARY_ALREADY_TRIED_TO_DOWNLOAD = 37,
127    DICTIONARY_FETCH_READ_FAILED = 38,
128
129    // Failsafe hack.
130    ATTEMPT_TO_DECODE_NON_HTTP_DATA = 40,
131
132
133    // Content-Encoding problems detected, with no action taken.
134    MULTIENCODING_FOR_NON_SDCH_REQUEST = 50,
135    SDCH_CONTENT_ENCODE_FOR_NON_SDCH_REQUEST = 51,
136
137    // Dictionary manager issues.
138    DOMAIN_BLACKLIST_INCLUDES_TARGET = 61,
139
140    // Problematic decode recovery methods.
141    META_REFRESH_RECOVERY = 70,            // Dictionary not found.
142    // defunct = 71,  // Almost the same as META_REFRESH_UNSUPPORTED.
143    // defunct = 72,  // Almost the same as CACHED_META_REFRESH_UNSUPPORTED.
144    // defunct = 73,  // PASSING_THROUGH_NON_SDCH plus
145                      // RESPONSE_TENTATIVE_SDCH in ../filter/sdch_filter.cc.
146    META_REFRESH_UNSUPPORTED = 74,         // Unrecoverable error.
147    CACHED_META_REFRESH_UNSUPPORTED = 75,  // As above, but pulled from cache.
148    PASSING_THROUGH_NON_SDCH = 76,  // Tagged sdch but missing dictionary-hash.
149    INCOMPLETE_SDCH_CONTENT = 77,   // Last window was not completely decoded.
150    PASS_THROUGH_404_CODE = 78,     // URL not found message passing through.
151
152    // This next report is very common, and not really an error scenario, but
153    // it exercises the error recovery logic.
154    PASS_THROUGH_OLD_CACHED = 79,   // Back button got pre-SDCH cached content.
155
156    // Common decoded recovery methods.
157    META_REFRESH_CACHED_RECOVERY = 80,  // Probably startup tab loading.
158    // defunct = 81, // Now tracked by ResponseCorruptionDetectionCause histo.
159
160    // Non SDCH problems, only accounted for to make stat counting complete
161    // (i.e., be able to be sure all dictionary advertisements are accounted
162    // for).
163
164    UNFLUSHED_CONTENT = 90,    // Possible error in filter chaining.
165    // defunct = 91,           // MISSING_TIME_STATS (Should never happen.)
166    CACHE_DECODED = 92,        // No timing stats recorded.
167    // defunct = 93,           // OVER_10_MINUTES (No timing stats recorded.)
168    UNINITIALIZED = 94,        // Filter never even got initialized.
169    PRIOR_TO_DICTIONARY = 95,  // We hadn't even parsed a dictionary selector.
170    DECODE_ERROR = 96,         // Something went wrong during decode.
171
172    // Problem during the latency test.
173    LATENCY_TEST_DISALLOWED = 100,  // SDCH now failing, but it worked before!
174
175    MAX_PROBLEM_CODE  // Used to bound histogram (implicitly 101).
176  };
177
178  // Use the following static limits to block DOS attacks until we implement
179  // a cached dictionary eviction strategy.
  // The values are defined outside this header.
180  static const size_t kMaxDictionarySize;
181  static const size_t kMaxDictionaryCount;
182
183  // There is one instance of |Dictionary| for each memory-cached SDCH
184  // dictionary. Instances are reference counted (base::RefCounted), and only
  // text() is public; everything else is reachable solely by the friends
  // declared below.
185  class NET_EXPORT_PRIVATE Dictionary : public base::RefCounted<Dictionary> {
186   public:
187    // Sdch filters can get our text to use in decoding compressed data.
188    const std::string& text() const { return text_; }
189
190   private:
191    friend class base::RefCounted<Dictionary>;
192    friend class SdchManager;  // Only manager can construct an instance.
193    FRIEND_TEST_ALL_PREFIXES(SdchManagerTest, PathMatch);
194
195    // Construct a vc-diff usable dictionary from the dictionary_text starting
196    // at the given offset. The supplied client_hash should be used to
197    // advertise the dictionary's availability relative to the supplied URL.
198    Dictionary(const std::string& dictionary_text,
199               size_t offset,
200               const std::string& client_hash,
201               const GURL& url,
202               const std::string& domain,
203               const std::string& path,
204               const base::Time& expiration,
205               const std::set<int>& ports);
    // Private; base::RefCounted<Dictionary> is befriended above, presumably so
    // that only the ref-counting machinery can delete instances.
206    virtual ~Dictionary();
207
    // Private accessors for url_ and client_hash_.
208    const GURL& url() const { return url_; }
209    const std::string& client_hash() const { return client_hash_; }
210
211    // Security method to check if we can advertise this dictionary for use
212    // if the |target_url| returns SDCH compressed data.
213    bool CanAdvertise(const GURL& target_url);
214
215    // Security method to check if we can establish a new dictionary with the
216    // given data, that arrived in response to a get of dictionary_url.
217    static bool CanSet(const std::string& domain, const std::string& path,
218                       const std::set<int>& ports, const GURL& dictionary_url);
219
220    // Security method to check if we can use a dictionary to decompress a
221    // target that arrived with a reference to this dictionary.
222    bool CanUse(const GURL& referring_url);
223
224    // Compare paths to see if they "match" for dictionary use.
225    static bool PathMatch(const std::string& path,
226                          const std::string& restriction);
227
228    // Compare domains to see if they "match" for dictionary use.
229    static bool DomainMatch(const GURL& url, const std::string& restriction);
230
231
232    // The actual text of the dictionary.
233    std::string text_;
234
235    // Part of the hash of text_ that the client uses to advertise the fact that
236    // it has a specific dictionary pre-cached.
237    std::string client_hash_;
238
239    // The GURL that arrived with the text_ in a URL request to specify where
240    // this dictionary may be used.
241    const GURL url_;
242
243    // Metadata "headers" that preceded the dictionary text.
244    // Each dictionary payload consists of several headers, followed by the text
245    // of the dictionary. The following are the known headers.
246    const std::string domain_;
247    const std::string path_;
248    const base::Time expiration_;  // Implied by max-age.
249    const std::set<int> ports_;
250
251    DISALLOW_COPY_AND_ASSIGN(Dictionary);
252  };
253
254  SdchManager();
255  virtual ~SdchManager();
256
257  // Clear data (for browser data removal).
258  void ClearData();
259
260  // Record stats on various errors.
261  static void SdchErrorRecovery(ProblemCodes problem);
262
263  // Register a fetcher that this class can use to obtain dictionaries.
  // Ownership of |fetcher| passes to the callee (scoped_ptr passed by value).
264  void set_sdch_fetcher(scoped_ptr<SdchFetcher> fetcher);
265
266  // Enables or disables SDCH compression.
267  static void EnableSdchSupport(bool enabled);
268
  // Returns the static flag g_sdch_enabled_.
269  static bool sdch_enabled() { return g_sdch_enabled_; }
270
271  // Enables or disables SDCH compression over secure connection.
272  static void EnableSecureSchemeSupport(bool enabled);
273
  // Returns the static flag g_secure_scheme_supported_.
274  static bool secure_scheme_supported() { return g_secure_scheme_supported_; }
275
276  // Briefly prevent further advertising of SDCH on this domain (if SDCH is
277  // enabled). After enough calls to IsInSupportedDomain() the blacklisting
278  // will be removed. Additional blacklists take exponentially more calls
279  // to IsInSupportedDomain() before the blacklisting is undone.
280  // Used when filter errors are found from a given domain, but it is plausible
281  // that the cause is temporary (such as application startup, where cached
282  // entries are used, but a dictionary is not yet loaded).
283  void BlacklistDomain(const GURL& url, ProblemCodes blacklist_reason);
284
285  // Used when SEVERE filter errors are found from a given domain, to prevent
286  // further use of SDCH on that domain.
287  void BlacklistDomainForever(const GURL& url, ProblemCodes blacklist_reason);
288
289  // Unit test only, this function resets enabling of sdch, and clears the
290  // blacklist.
291  void ClearBlacklistings();
292
293  // Unit test only, this function resets the blacklisting count for a domain.
294  void ClearDomainBlacklisting(const std::string& domain);
295
296  // Unit test only: indicate how many more times a domain will be blacklisted.
297  int BlackListDomainCount(const std::string& domain);
298
299  // Unit test only: Indicate what current blacklist increment is for a domain.
300  int BlacklistDomainExponential(const std::string& domain);
301
302  // Check to see if SDCH is enabled (globally), and the given URL is in a
303  // supported domain (i.e., not blacklisted, and either the specific supported
304  // domain, or all domains were assumed supported). If it is blacklisted,
305  // reduce by 1 the number of times it will be reported as blacklisted.
306  bool IsInSupportedDomain(const GURL& url);
307
308  // Schedule the URL fetching to load a dictionary. This will always return
309  // before the dictionary is actually loaded and added.
310  // After the implied task completes, the dictionary will have been
311  // cached in memory.
312  void FetchDictionary(const GURL& request_url, const GURL& dictionary_url);
313
314  // Security test function used before initiating a FetchDictionary.
315  // Return true if fetch is legal.
316  bool CanFetchDictionary(const GURL& referring_url,
317                          const GURL& dictionary_url) const;
318
319  // Find the vcdiff dictionary (the body of the sdch dictionary that appears
320  // after the meta-data headers like Domain:...) with the given |server_hash|
321  // to use to decompress data that arrived as SDCH encoded content. Check to
322  // be sure the returned |dictionary| can be used for decoding content supplied
323  // in response to a request for |referring_url|.
324  // Return null in |dictionary| if there is no matching legal dictionary.
325  void GetVcdiffDictionary(const std::string& server_hash,
326                           const GURL& referring_url,
327                           scoped_refptr<Dictionary>* dictionary);
328
329  // Get list of available (pre-cached) dictionaries that we have already loaded
330  // into memory. The list is a comma separated list of (client) hashes per
331  // the SDCH spec.
332  void GetAvailDictionaryList(const GURL& target_url, std::string* list);
333
334  // Construct the pair of hashes for client and server to identify an SDCH
335  // dictionary. This is only made public to facilitate unit testing, but is
336  // otherwise private.
337  static void GenerateHash(const std::string& dictionary_text,
338                           std::string* client_hash, std::string* server_hash);
339
340  // For Latency testing only, we need to know if we've succeeded in doing a
341  // round trip before starting our comparative tests. If ever we encounter
342  // problems with SDCH, we opt-out of the test unless/until we perform a
343  // complete SDCH decoding.
344  bool AllowLatencyExperiment(const GURL& url) const;
345
  // Counterpart of AllowLatencyExperiment().
  // NOTE(review): presumably adds or removes the host of |url| in
  // allow_latency_experiment_ according to |enable| -- confirm in the .cc file.
346  void SetAllowLatencyExperiment(const GURL& url, bool enable);
347
  // Test-only accessor for fetches_count_for_testing_.
348  int GetFetchesCountForTesting() const {
349    return fetches_count_for_testing_;
350  }
351
352  // Implementation of SdchFetcher::Delegate.
353
354  // Add an SDCH dictionary to our list of available
355  // dictionaries. This addition will fail if addition is illegal
356  // (data in the dictionary is not acceptable from the
357  // dictionary_url; dictionary already added, etc.).
358  virtual void AddSdchDictionary(const std::string& dictionary_text,
359                                 const GURL& dictionary_url) OVERRIDE;
360
361 private:
  // Per-domain blacklist bookkeeping; a default-constructed entry has both
  // counters at zero and reason MIN_PROBLEM_CODE.
362  struct BlacklistInfo {
363    BlacklistInfo()
364        : count(0),
365          exponential_count(0),
366          reason(MIN_PROBLEM_CODE) {}
367
368    int count;                   // # of times to refuse SDCH advertisement.
369    int exponential_count;       // Current exponential backoff ratchet.
370    ProblemCodes reason;         // Why domain was blacklisted.
371
372  };
  // Blacklist entries keyed by a string.
  // NOTE(review): presumably the key is the domain/host name -- confirm in the
  // .cc file.
373  typedef std::map<std::string, BlacklistInfo> DomainBlacklistInfo;
  // Set of hostnames (see allow_latency_experiment_ below).
374  typedef std::set<std::string> ExperimentSet;
375
376  // A map of dictionaries info indexed by the hash that the server provides.
377  typedef std::map<std::string, scoped_refptr<Dictionary> > DictionaryMap;
378
379  // Support SDCH compression, by advertising in headers.
380  static bool g_sdch_enabled_;
381
382  // Support SDCH compression for HTTPS requests and responses. When supported,
383  // HTTPS applicable dictionaries MUST have been acquired securely via HTTPS.
384  static bool g_secure_scheme_supported_;
385
386  // A simple implementation of a RFC 3548 "URL safe" base64 encoder.
387  static void UrlSafeBase64Encode(const std::string& input,
388                                  std::string* output);
  // The memory-cached dictionaries; see DictionaryMap above for the key.
389  DictionaryMap dictionaries_;
390
391  // An instance that can fetch a dictionary given a URL.
392  scoped_ptr<SdchFetcher> fetcher_;
393
394  // List domains where decode failures have required disabling sdch.
395  DomainBlacklistInfo blacklisted_domains_;
396
397  // List of hostnames for which a latency experiment is allowed (because a
398  // round trip test has recently passed).
399  ExperimentSet allow_latency_experiment_;
400
  // Value reported by GetFetchesCountForTesting().
  // NOTE(review): presumably counts the dictionary fetches handed to the
  // fetcher -- confirm in the .cc file.
401  int fetches_count_for_testing_;
402
403  DISALLOW_COPY_AND_ASSIGN(SdchManager);
404};
405
406}  // namespace net
407
408#endif  // NET_BASE_SDCH_MANAGER_H_
409