sdch_manager.h revision c7f5f8508d98d5952d42ed7648c2a8f30a4da156
1// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Provides global database of differential decompression dictionaries for the
6// SDCH filter (processes sdch encoded content).
7
8// Exactly one instance of SdchManager is built, and all references are made
9// into that collection.
10//
11// The SdchManager maintains a collection of memory resident dictionaries.  It
12// can find a dictionary (based on a server specification of a hash), store a
13// dictionary, and make judgements about what URLs can use, set, etc. a
14// dictionary.
15
16// These dictionaries are acquired over the net, and include a header
17// (containing metadata) as well as a VCDIFF dictionary (for use by a VCDIFF
18// module) to decompress data.
19
20#ifndef NET_BASE_SDCH_MANAGER_H_
21#define NET_BASE_SDCH_MANAGER_H_
22
23#include <map>
24#include <set>
25#include <string>
26
27#include "base/ref_counted.h"
28#include "base/scoped_ptr.h"
29#include "base/time.h"
30#include "googleurl/src/gurl.h"
31#include "testing/gtest/include/gtest/gtest_prod.h"
32
33//------------------------------------------------------------------------------
34// Create a public interface to help us load SDCH dictionaries.
35// The SdchManager class allows registration to support this interface.
36// A browser may register a fetcher that is used by the dictionary managers to
37// get data from a specified URL.  This allows us to use very high level browser
38// functionality in this base (when the functionality can be provided).
// Abstract interface: a concrete fetcher implements Schedule().
39class SdchFetcher {
40 public:
41  SdchFetcher() {}
  // Virtual so that an implementation can be destroyed through an SdchFetcher
  // pointer; SdchManager keeps its registered fetcher in a
  // scoped_ptr<SdchFetcher>.
42  virtual ~SdchFetcher() {}
43
44  // The Schedule() method is called when there is a need to get a dictionary
45  // from a server.  The callee is responsible for getting that dictionary's
46  // text, and then passing it to AddSdchDictionary() on the SdchManager instance.
47  virtual void Schedule(const GURL& dictionary_url) = 0;
48 private:
  // Fetchers are neither copyable nor assignable.
49  DISALLOW_COPY_AND_ASSIGN(SdchFetcher);
50};
51//------------------------------------------------------------------------------
52
// Manages the memory-resident SDCH dictionaries together with the per-domain
// support, blacklist, and latency-experiment state declared below.
53class SdchManager {
54 public:
55  // A list of errors that appeared and were either resolved, or used to turn
56  // off sdch encoding.
  // NOTE(review): the numeric values are assigned explicitly and retired slots
  // are left as "defunct" comments, presumably so that recorded histogram
  // buckets stay stable -- confirm before renumbering or reusing a value.
57  enum ProblemCodes {
58    MIN_PROBLEM_CODE,  // No explicit value, so this is 0.
59
60    // Content-encoding correction problems.
61    ADDED_CONTENT_ENCODING = 1,
62    FIXED_CONTENT_ENCODING = 2,
63    FIXED_CONTENT_ENCODINGS = 3,
64
65    // Content decoding errors.
66    DECODE_HEADER_ERROR = 4,
67    DECODE_BODY_ERROR = 5,
68
69    // More content-encoding correction problems.
70    OPTIONAL_GUNZIP_ENCODING_ADDED = 6,
71
72    // Content encoding correction when we're not even tagged as HTML!?!
73    BINARY_ADDED_CONTENT_ENCODING = 7,
74    BINARY_FIXED_CONTENT_ENCODING = 8,
75    BINARY_FIXED_CONTENT_ENCODINGS = 9,
76
77    // Dictionary selection for use problems.
78    DICTIONARY_FOUND_HAS_WRONG_DOMAIN = 10,
79    DICTIONARY_FOUND_HAS_WRONG_PORT_LIST = 11,
80    DICTIONARY_FOUND_HAS_WRONG_PATH = 12,
81    DICTIONARY_FOUND_HAS_WRONG_SCHEME = 13,
82    DICTIONARY_HASH_NOT_FOUND = 14,
83    DICTIONARY_HASH_MALFORMED = 15,
84
85    // Dictionary saving problems.
86    DICTIONARY_HAS_NO_HEADER = 20,
87    DICTIONARY_HEADER_LINE_MISSING_COLON = 21,
88    DICTIONARY_MISSING_DOMAIN_SPECIFIER = 22,
89    DICTIONARY_SPECIFIES_TOP_LEVEL_DOMAIN = 23,
90    DICTIONARY_DOMAIN_NOT_MATCHING_SOURCE_URL = 24,
91    DICTIONARY_PORT_NOT_MATCHING_SOURCE_URL = 25,
92    DICTIONARY_HAS_NO_TEXT = 26,
93    DICTIONARY_REFERER_URL_HAS_DOT_IN_PREFIX = 27,
94
95    // Dictionary loading problems.
96    DICTIONARY_LOAD_ATTEMPT_FROM_DIFFERENT_HOST = 30,
97    DICTIONARY_SELECTED_FOR_SSL = 31,
98    DICTIONARY_ALREADY_LOADED = 32,
99    DICTIONARY_SELECTED_FROM_NON_HTTP = 33,
100    DICTIONARY_IS_TOO_LARGE= 34,
101    DICTIONARY_COUNT_EXCEEDED = 35,
102    DICTIONARY_ALREADY_SCHEDULED_TO_DOWNLOAD = 36,
103    DICTIONARY_ALREADY_TRIED_TO_DOWNLOAD = 37,
104
105    // Failsafe hack.
106    ATTEMPT_TO_DECODE_NON_HTTP_DATA = 40,
107
108
109    // Content-Encoding problems detected, with no action taken.
110    MULTIENCODING_FOR_NON_SDCH_REQUEST = 50,
111    SDCH_CONTENT_ENCODE_FOR_NON_SDCH_REQUEST = 51,
112
113    // Dictionary manager issues.
114    DOMAIN_BLACKLIST_INCLUDES_TARGET = 61,
115
116    // Problematic decode recovery methods.
117    META_REFRESH_RECOVERY = 70,            // Dictionary not found.
118    // defunct =  71, // Almost the same as META_REFRESH_UNSUPPORTED.
119    // defunct = 72,  // Almost the same as CACHED_META_REFRESH_UNSUPPORTED.
120    // defunct = 73,  // PASSING_THROUGH_NON_SDCH plus DISCARD_TENTATIVE_SDCH.
121    META_REFRESH_UNSUPPORTED = 74,         // Unrecoverable error.
122    CACHED_META_REFRESH_UNSUPPORTED = 75,  // As above, but pulled from cache.
123    PASSING_THROUGH_NON_SDCH = 76,  // Tagged sdch but missing dictionary-hash.
124    INCOMPLETE_SDCH_CONTENT = 77,   // Last window was not completely decoded.
125    PASS_THROUGH_404_CODE = 78,     // URL not found message passing through.
126
127    // This next report is very common, and not really an error scenario, but
128    // it exercises the error recovery logic.
129    PASS_THROUGH_OLD_CACHED = 79,   // Back button got pre-SDCH cached content.
130
131    // Common decoded recovery methods.
132    META_REFRESH_CACHED_RECOVERY = 80,  // Probably startup tab loading.
133    DISCARD_TENTATIVE_SDCH = 81,        // Server decided not to use sdch.
134
135    // Non SDCH problems, only accounted for to make stat counting complete
136    // (i.e., be able to be sure all dictionary advertisements are accounted
137    // for).
138
139    UNFLUSHED_CONTENT = 90,    // Possible error in filter chaining.
140    // defunct = 91,           // MISSING_TIME_STATS (Should never happen.)
141    CACHE_DECODED = 92,        // No timing stats recorded.
142    // defunct = 93,           // OVER_10_MINUTES (No timing stats recorded.)
143    UNINITIALIZED = 94,        // Filter never even got initialized.
144    PRIOR_TO_DICTIONARY = 95,  // We hadn't even parsed a dictionary selector.
145    DECODE_ERROR = 96,         // Something went wrong during decode.
146
147    // Problem during the latency test.
148    LATENCY_TEST_DISALLOWED = 100,  // SDCH now failing, but it worked before!
149
    // No explicit value, so this is one more than the preceding enumerator.
150    MAX_PROBLEM_CODE  // Used to bound histogram.
151  };
152
153  // Use the following static limits to block DOS attacks until we implement
154  // a cached dictionary eviction strategy.
  // Only declared here; the values are defined outside this header.
155  static const size_t kMaxDictionarySize;
156  static const size_t kMaxDictionaryCount;
157
158  // There is one instance of |Dictionary| for each memory-cached SDCH
159  // dictionary.
160  class Dictionary : public base::RefCounted<Dictionary> {
161   public:
162    // Sdch filters can get our text to use in decoding compressed data.
163    const std::string& text() const { return text_; }
164
165   private:
    // RefCounted is a friend because the destructor below is private.
166    friend class base::RefCounted<Dictionary>;
167    friend class SdchManager;  // Only manager can construct an instance.
168    FRIEND_TEST(SdchFilterTest, PathMatch);
169
170    // Construct a vc-diff usable dictionary from the dictionary_text starting
171    // at the given offset.  The supplied client_hash should be used to
172    // advertise the dictionary's availability relative to the supplied URL.
    // NOTE(review): |ports| is taken by value (copied), unlike the other
    // arguments, which are taken by const reference.
173    Dictionary(const std::string& dictionary_text, size_t offset,
174               const std::string& client_hash, const GURL& url,
175               const std::string& domain, const std::string& path,
176               const base::Time& expiration, const std::set<int> ports);
    // Private: not directly destructible by code other than the friends above.
177    ~Dictionary() {}
178
    // Accessors for the URL and client hash stored at construction.
179    const GURL& url() const { return url_; }
180    const std::string& client_hash() const { return client_hash_; }
181
182    // Security method to check if we can advertise this dictionary for use
183    // if the |target_url| returns SDCH compressed data.
184    bool CanAdvertise(const GURL& target_url);
185
186    // Security methods to check if we can establish a new dictionary with the
187    // given data, that arrived in response to get of dictionary_url.
    // NOTE(review): |ports| is taken by value here as well.
188    static bool CanSet(const std::string& domain, const std::string& path,
189                       const std::set<int> ports, const GURL& dictionary_url);
190
191    // Security method to check if we can use a dictionary to decompress a
192    // target that arrived with a reference to this dictionary.
193    bool CanUse(const GURL& referring_url);
194
195    // Compare paths to see if they "match" for dictionary use.
196    static bool PathMatch(const std::string& path,
197                          const std::string& restriction);
198
199    // Compare domains to see if they "match" for dictionary use.
200    static bool DomainMatch(const GURL& url, const std::string& restriction);
201
202
203    // The actual text of the dictionary.
204    std::string text_;
205
206    // Part of the hash of text_ that the client uses to advertise the fact that
207    // it has a specific dictionary pre-cached.
208    std::string client_hash_;
209
210    // The GURL that arrived with the text_ in a URL request to specify where
211    // this dictionary may be used.
212    const GURL url_;
213
214    // Metadata "headers" that preceded the dictionary text:
215    // Each dictionary payload consists of several headers, followed by the text
216    // of the dictionary.  The following are the known headers.
217    const std::string domain_;
218    const std::string path_;
219    const base::Time expiration_;  // Implied by max-age.
220    const std::set<int> ports_;
221
222    DISALLOW_COPY_AND_ASSIGN(Dictionary);
223  };
224
225  SdchManager();
226  ~SdchManager();
227
228  // Discontinue fetching of dictionaries, as we're now shutting down.
229  static void Shutdown();
230
231  // Provide access to the single instance of this class.
232  static SdchManager* Global();
233
234  // Record stats on various errors.
235  static void SdchErrorRecovery(ProblemCodes problem);
236
237  // Register a fetcher that this class can use to obtain dictionaries.
  // The manager takes ownership: |fetcher| is handed to the scoped_ptr
  // fetcher_ via reset().
238  void set_sdch_fetcher(SdchFetcher* fetcher) { fetcher_.reset(fetcher); }
239
240  // If called with an empty string, advertise and support sdch on all domains.
241  // If called with a specific string, advertise and support only the specified
242  // domain.  Function assumes the existence of a global SdchManager instance.
243  void EnableSdchSupport(const std::string& domain);
244
  // True only when a global instance exists and it has sdch_enabled_ set.
245  static bool sdch_enabled() { return global_ && global_->sdch_enabled_; }
246
247  // Briefly prevent further advertising of SDCH on this domain (if SDCH is
248  // enabled). After enough calls to IsInSupportedDomain() the blacklisting
249  // will be removed.  Additional blacklists take exponentially more calls
250  // to IsInSupportedDomain() before the blacklisting is undone.
251  // Used when filter errors are found from a given domain, but it is plausible
252  // that the cause is temporary (such as application startup, where cached
253  // entries are used, but a dictionary is not yet loaded).
254  static void BlacklistDomain(const GURL& url);
255
256  // Used when SEVERE filter errors are found from a given domain, to prevent
257  // further use of SDCH on that domain.
258  static void BlacklistDomainForever(const GURL& url);
259
260  // Unit test only, this function resets enabling of sdch, and clears the
261  // blacklist.
262  static void ClearBlacklistings();
263
264  // Unit test only, this function resets the blacklisting count for a domain.
265  static void ClearDomainBlacklisting(const std::string& domain);
266
267  // Unit test only: indicate how many more times a domain will be blacklisted.
268  static int BlackListDomainCount(const std::string& domain);
269
270  // Unit test only: Indicate what current blacklist increment is for a domain.
271  static int BlacklistDomainExponential(const std::string& domain);
272
273  // Check to see if SDCH is enabled (globally), and the given URL is in a
274  // supported domain (i.e., not blacklisted, and either the specific supported
275  // domain, or all domains were assumed supported).  If it is blacklisted,
276  // reduce by 1 the number of times it will be reported as blacklisted.
  // NOTE(review): the |const| on the by-value bool return type has no effect
  // for callers; it is left exactly as declared.
277  const bool IsInSupportedDomain(const GURL& url);
278
279  // Schedule the URL fetching to load a dictionary. This will always return
280  // before the dictionary is actually loaded and added.
281  // After the implied task completes, the dictionary will have been
282  // cached in memory.
283  void FetchDictionary(const GURL& request_url, const GURL& dictionary_url);
284
285  // Security test function used before initiating a FetchDictionary.
286  // Return true if fetch is legal.
287  bool CanFetchDictionary(const GURL& referring_url,
288                          const GURL& dictionary_url) const;
289
290  // Add an SDCH dictionary to our list of available dictionaries. This addition
291  // will fail (return false) if addition is illegal (data in the dictionary is
292  // not acceptable from the dictionary_url; dictionary already added, etc.).
293  bool AddSdchDictionary(const std::string& dictionary_text,
294                         const GURL& dictionary_url);
295
296  // Find the vcdiff dictionary (the body of the sdch dictionary that appears
297  // after the meta-data headers like Domain:...) with the given |server_hash|
298  // to use to decompress data that arrived as SDCH encoded content.  Check to
299  // be sure the returned |dictionary| can be used for decoding content supplied
300  // in response to a request for |referring_url|.
301  // Caller is responsible for AddRef()ing the dictionary, and Release()ing it
302  // when done.
303  // Return null in |dictionary| if there is no matching legal dictionary.
304  void GetVcdiffDictionary(const std::string& server_hash,
305                           const GURL& referring_url,
306                           Dictionary** dictionary);
307
308  // Get list of available (pre-cached) dictionaries that we have already loaded
309  // into memory.  The list is a comma separated list of (client) hashes per
310  // the SDCH spec.
311  void GetAvailDictionaryList(const GURL& target_url, std::string* list);
312
313  // Construct the pair of hashes for client and server to identify an SDCH
314  // dictionary.  This is only made public to facilitate unit testing, but is
315  // otherwise private.
316  static void GenerateHash(const std::string& dictionary_text,
317                           std::string* client_hash, std::string* server_hash);
318
319  // For Latency testing only, we need to know if we've succeeded in doing a
320  // round trip before starting our comparative tests.  If ever we encounter
321  // problems with SDCH, we opt-out of the test unless/until we perform a
322  // complete SDCH decoding.
323  bool AllowLatencyExperiment(const GURL& url) const;
324
  // NOTE(review): presumably adds or removes |url|'s host in
  // allow_latency_experiment_ according to |enable| -- confirm against the
  // implementation file.
325  void SetAllowLatencyExperiment(const GURL& url, bool enable);
326
327 private:
  // Per-domain integer counters, keyed by a string.
328  typedef std::map<std::string, int> DomainCounter;
  // Set of strings used for the latency-experiment list below.
329  typedef std::set<std::string> ExperimentSet;
330
331  // A map of dictionaries info indexed by the hash that the server provides.
  // NOTE(review): the values are raw pointers to ref-counted Dictionary
  // objects; how references are taken and released is not visible in this
  // header -- confirm in the implementation file.
332  typedef std::map<std::string, Dictionary*> DictionaryMap;
333
334  // The one global instance that holds all the data.
335  static SdchManager* global_;
336
337  // A simple implementation of a RFC 3548 "URL safe" base64 encoder.
338  static void UrlSafeBase64Encode(const std::string& input,
339                                  std::string* output);
  // The dictionaries held by this manager (see DictionaryMap above).
340  DictionaryMap dictionaries_;
341
342  // An instance that can fetch a dictionary given a URL.
343  scoped_ptr<SdchFetcher> fetcher_;
344
345  // Support SDCH compression, by advertising in headers.
346  bool sdch_enabled_;
347
348  // Empty string means all domains.  Non-empty means only the given
349  // domain is supported.
350  std::string supported_domain_;
351
352  // List domains where decode failures have required disabling sdch, along with
353  // count of how many additional uses should be blacklisted.
354  DomainCounter blacklisted_domains_;
355
356  // Support exponential backoff in number of domain accesses before
357  // blacklisting expires.
  // NOTE(review): unlike the other data members, this name has no trailing
  // underscore.
358  DomainCounter exponential_blacklist_count;
359
360  // List of hostnames for which a latency experiment is allowed (because a
361  // round trip test has recently passed).
362  ExperimentSet allow_latency_experiment_;
363
364  DISALLOW_COPY_AND_ASSIGN(SdchManager);
365};
366
367#endif  // NET_BASE_SDCH_MANAGER_H_
368