template_url_parser.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/search_engines/template_url_parser.h"
6
7#include <algorithm>
8#include <map>
9#include <vector>
10
11#include "base/logging.h"
12#include "base/scoped_ptr.h"
13#include "base/string_number_conversions.h"
14#include "base/string_util.h"
15#include "base/utf_string_conversions.h"
16#include "chrome/browser/search_engines/template_url.h"
17#include "chrome/common/url_constants.h"
18#include "googleurl/src/gurl.h"
19#include "libxml/parser.h"
20#include "libxml/xmlwriter.h"
21
22namespace {
23
// NOTE: libxml hands us UTF-8. Code points 0-127 are encoded in UTF-8 exactly
// as plain chars, so every name below is kept as a narrow string and compared
// directly; nothing has to be converted to a wide string first.
//
// These constants sit in the anonymous namespace, which already gives them
// internal linkage.

// Element names of the OSD document.
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kDescriptionElement[] = "Description";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kLanguageElement[] = "Language";
const char kInputEncodingElement[] = "InputEncoding";

// XML attribute names. Url and Image both use an attribute called "type".
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
const char kURLIndexOffsetAttribute[] = "indexOffset";
const char kURLPageOffsetAttribute[] = "pageOffset";
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
const char kParamMethodAttribute[] = "method";

// Mime type of a Url element that yields search results.
const char kHTMLType[] = "text/html";

// Mime type of a Url element that yields as-you-type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";

// Namespace identifier.
const char kOSDNS[] = "xmlns";

// The namespace for documents we understand.
const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/";
63
// Strips a namespace prefix from |name| in place: everything up to and
// including the first ':' is removed, e.g. "os:Url" becomes "Url". A name
// that contains no ':' is left unchanged.
static void PruneNamespace(std::string* name) {
  const std::string::size_type colon = name->find(':');
  if (colon == std::string::npos)
    return;
  name->erase(0, colon + 1);
}
70
71//
72// To minimize memory overhead while parsing, a SAX style parser is used.
73// ParsingContext is used to maintain the state we're in the document
74// while parsing.
75class ParsingContext {
76 public:
77  // Enum of the known element types.
78  enum ElementType {
79    UNKNOWN,
80    OPEN_SEARCH_DESCRIPTION,
81    URL,
82    PARAM,
83    SHORT_NAME,
84    DESCRIPTION,
85    IMAGE,
86    LANGUAGE,
87    INPUT_ENCODING,
88  };
89
90  enum Method {
91    GET,
92    POST
93  };
94
95  // Key/value of a Param node.
96  typedef std::pair<std::string, std::string> Param;
97
98  ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter,
99                 TemplateURL* url)
100      : url_(url),
101        parameter_filter_(parameter_filter),
102        method_(GET),
103        suggestion_method_(GET),
104        is_suggest_url_(false),
105        derive_image_from_url_(false) {
106    if (kElementNameToElementTypeMap == NULL)
107      InitMapping();
108  }
109
110  // Invoked when an element starts.
111  void PushElement(const std::string& element) {
112    ElementType type;
113    if (kElementNameToElementTypeMap->find(element) ==
114        kElementNameToElementTypeMap->end()) {
115      type = UNKNOWN;
116    } else {
117      type = (*kElementNameToElementTypeMap)[element];
118    }
119    elements_.push_back(type);
120  }
121
122  void PopElement() {
123    elements_.pop_back();
124  }
125
126  // Returns the current ElementType.
127  ElementType GetKnownType() {
128    if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
129      return elements_[1];
130
131    // We only expect PARAM nodes under the Url node
132    if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
133        elements_[1] == URL && elements_[2] == PARAM)
134      return PARAM;
135
136    return UNKNOWN;
137  }
138
139  TemplateURL* template_url() { return url_; }
140
141  void AddImageRef(const std::wstring& type, int width, int height) {
142    if (width > 0 && height > 0)
143      current_image_.reset(new TemplateURL::ImageRef(type, width, height));
144  }
145
146  void EndImage() {
147    current_image_.reset();
148  }
149
150  void SetImageURL(const std::wstring& url) {
151    if (current_image_.get()) {
152      current_image_->url = GURL(WideToUTF8(url));
153      url_->add_image_ref(*current_image_);
154      current_image_.reset();
155    }
156  }
157
158  void ResetString() {
159    string_.clear();
160  }
161
162  void AppendString(const std::wstring& string) {
163    string_ += string;
164  }
165
166  const std::wstring& GetString() {
167    return string_;
168  }
169
170  void ResetExtraParams() {
171    extra_params_.clear();
172  }
173
174  void AddExtraParams(const std::string& key, const std::string& value) {
175    if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value))
176      return;
177    extra_params_.push_back(Param(key, value));
178  }
179
180  const std::vector<Param>& extra_params() const { return extra_params_; }
181
182  void set_is_suggestion(bool value) { is_suggest_url_ = value; }
183  bool is_suggestion() const { return is_suggest_url_; }
184
185  TemplateURLParser::ParameterFilter* parameter_filter() const {
186    return parameter_filter_;
187  }
188
189  void set_derive_image_from_url(bool derive_image_from_url) {
190    derive_image_from_url_ = derive_image_from_url;
191  }
192
193  void set_method(Method method) { method_ = method; }
194  Method method() { return method_; }
195
196  void set_suggestion_method(Method method) { suggestion_method_ = method; }
197  Method suggestion_method() { return suggestion_method_; }
198
199  // Builds the image URL from the Template search URL if no image URL has been
200  // set.
201  void DeriveImageFromURL() {
202    if (derive_image_from_url_ &&
203        url_->GetFavIconURL().is_empty() && url_->url()) {
204      GURL url(url_->url()->url());  // More url's please...
205      url_->SetFavIconURL(TemplateURL::GenerateFaviconURL(url));
206    }
207  }
208
209 private:
210  static void InitMapping() {
211    kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
212    (*kElementNameToElementTypeMap)[kURLElement] = URL;
213    (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
214    (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
215    (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION;
216    (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
217    (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
218        OPEN_SEARCH_DESCRIPTION;
219    (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
220        OPEN_SEARCH_DESCRIPTION;
221    (*kElementNameToElementTypeMap)[kLanguageElement] =
222        LANGUAGE;
223    (*kElementNameToElementTypeMap)[kInputEncodingElement] =
224        INPUT_ENCODING;
225  }
226
227  // Key is UTF8 encoded.
228  static std::map<std::string, ElementType>* kElementNameToElementTypeMap;
229  // TemplateURL supplied to Read method. It's owned by the caller, so we
230  // don't need to free it.
231  TemplateURL* url_;
232  std::vector<ElementType> elements_;
233  scoped_ptr<TemplateURL::ImageRef> current_image_;
234
235  // Character content for the current element.
236  std::wstring string_;
237
238  TemplateURLParser::ParameterFilter* parameter_filter_;
239
240  // The list of parameters parsed in the Param nodes of a Url node.
241  std::vector<Param> extra_params_;
242
243  // The HTTP methods used.
244  Method method_;
245  Method suggestion_method_;
246
247  // If true, we are currently parsing a suggest URL, otherwise it is an HTML
248  // search.  Note that we don't need a stack as Url nodes cannot be nested.
249  bool is_suggest_url_;
250
251  // Whether we should derive the image from the URL (when images are data
252  // URLs).
253  bool derive_image_from_url_;
254
255  DISALLOW_COPY_AND_ASSIGN(ParsingContext);
256};
257
258// static
259std::map<std::string, ParsingContext::ElementType>*
260    ParsingContext::kElementNameToElementTypeMap = NULL;
261
262std::wstring XMLCharToWide(const xmlChar* value) {
263  return UTF8ToWide(std::string((const char*)value));
264}
265
266std::wstring XMLCharToWide(const xmlChar* value, int length) {
267  return UTF8ToWide(std::string((const char*)value, length));
268}
269
270std::string XMLCharToString(const xmlChar* value) {
271  return std::string((const char*)value);
272}
273
274// Returns true if input_encoding contains a valid input encoding string. This
275// doesn't verify that we have a valid encoding for the string, just that the
276// string contains characters that constitute a valid input encoding.
277bool IsValidEncodingString(const std::string& input_encoding) {
278  if (input_encoding.empty())
279    return false;
280
281  if (!IsAsciiAlpha(input_encoding[0]))
282    return false;
283
284  for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
285    char c = input_encoding[i];
286    if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
287        c != '-') {
288      return false;
289    }
290  }
291  return true;
292}
293
294void ParseURL(const xmlChar** atts, ParsingContext* context) {
295  if (!atts)
296    return;
297
298  TemplateURL* turl = context->template_url();
299  const xmlChar** attributes = atts;
300  std::string template_url;
301  bool is_post = false;
302  bool is_html_url = false;
303  bool is_suggest_url = false;
304  int index_offset = 1;
305  int page_offset = 1;
306
307  while (*attributes) {
308    std::string name(XMLCharToString(*attributes));
309    const xmlChar* value = attributes[1];
310    if (name == kURLTypeAttribute) {
311      std::string type = XMLCharToString(value);
312      is_html_url = (type == kHTMLType);
313      is_suggest_url = (type == kSuggestionType);
314    } else if (name == kURLTemplateAttribute) {
315      template_url = XMLCharToString(value);
316    } else if (name == kURLIndexOffsetAttribute) {
317      base::StringToInt(XMLCharToString(value), &index_offset);
318      index_offset = std::max(1, index_offset);
319    } else if (name == kURLPageOffsetAttribute) {
320      base::StringToInt(XMLCharToString(value), &page_offset);
321      page_offset = std::max(1, page_offset);
322    } else if (name == kParamMethodAttribute) {
323      is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
324    }
325    attributes += 2;
326  }
327  if (is_html_url) {
328    turl->SetURL(template_url, index_offset, page_offset);
329    context->set_is_suggestion(false);
330    if (is_post)
331      context->set_method(ParsingContext::POST);
332  } else if (is_suggest_url) {
333    turl->SetSuggestionsURL(template_url, index_offset, page_offset);
334    context->set_is_suggestion(true);
335    if (is_post)
336      context->set_suggestion_method(ParsingContext::POST);
337  }
338}
339
340void ParseImage(const xmlChar** atts, ParsingContext* context) {
341  if (!atts)
342    return;
343
344  const xmlChar** attributes = atts;
345  int width = 0;
346  int height = 0;
347  std::wstring type;
348  while (*attributes) {
349    std::string name(XMLCharToString(*attributes));
350    const xmlChar* value = attributes[1];
351    if (name == kImageTypeAttribute) {
352      type = XMLCharToWide(value);
353    } else if (name == kImageWidthAttribute) {
354      base::StringToInt(XMLCharToString(value), &width);
355    } else if (name == kImageHeightAttribute) {
356      base::StringToInt(XMLCharToString(value), &height);
357    }
358    attributes += 2;
359  }
360  if (width > 0 && height > 0 && !type.empty()) {
361    // Valid Image URL.
362    context->AddImageRef(type, width, height);
363  }
364}
365
366void ParseParam(const xmlChar** atts, ParsingContext* context) {
367  if (!atts)
368    return;
369
370  const xmlChar** attributes = atts;
371  std::wstring type;
372  std::string key, value;
373  while (*attributes) {
374    std::string name(XMLCharToString(*attributes));
375    const xmlChar* val = attributes[1];
376    if (name == kParamNameAttribute) {
377      key = XMLCharToString(val);
378    } else if (name == kParamValueAttribute) {
379      value = XMLCharToString(val);
380    }
381    attributes += 2;
382  }
383  if (!key.empty())
384    context->AddExtraParams(key, value);
385}
386
// Appends one parameter to the query string |query|. An '&' separator is
// written first unless |query| is still empty. The parameter is written as
// "key=value", or as just "value" when |key| is empty.
static void AppendParamToQuery(const std::string& key,
                               const std::string& value,
                               std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += '&';
  if (!key.empty()) {
    out += key;
    out += '=';
  }
  out += value;
}
398
399void ProcessURLParams(ParsingContext* context) {
400  TemplateURL* t_url = context->template_url();
401  const TemplateURLRef* t_url_ref =
402      context->is_suggestion() ? t_url->suggestions_url() :
403                                 t_url->url();
404  if (!t_url_ref)
405    return;
406
407  if (!context->parameter_filter() && context->extra_params().empty())
408    return;
409
410  GURL url(t_url_ref->url());
411  // If there is a parameter filter, parse the existing URL and remove any
412  // unwanted parameter.
413  TemplateURLParser::ParameterFilter* filter = context->parameter_filter();
414  std::string new_query;
415  bool modified = false;
416  if (filter) {
417    url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
418    url_parse::Component key, value;
419    const char* url_spec = url.spec().c_str();
420    while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
421      std::string key_str(url_spec, key.begin, key.len);
422      std::string value_str(url_spec, value.begin, value.len);
423      if (filter->KeepParameter(key_str, value_str)) {
424        AppendParamToQuery(key_str, value_str, &new_query);
425      } else {
426        modified = true;
427      }
428    }
429  }
430  if (!modified)
431    new_query = url.query();
432
433  // Add the extra parameters if any.
434  const std::vector<ParsingContext::Param>& params = context->extra_params();
435  if (!params.empty()) {
436    modified = true;
437    std::vector<ParsingContext::Param>::const_iterator iter;
438    for (iter = params.begin(); iter != params.end(); ++iter)
439      AppendParamToQuery(iter->first, iter->second, &new_query);
440  }
441
442  if (modified) {
443    GURL::Replacements repl;
444    repl.SetQueryStr(new_query);
445    url = url.ReplaceComponents(repl);
446    if (context->is_suggestion()) {
447      t_url->SetSuggestionsURL(url.spec(),
448                               t_url_ref->index_offset(),
449                               t_url_ref->page_offset());
450    } else {
451      t_url->SetURL(url.spec(),
452                    t_url_ref->index_offset(),
453                    t_url_ref->page_offset());
454    }
455  }
456}
457
458void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) {
459  ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
460  std::string node_name((const char*)name);
461  PruneNamespace(&node_name);
462  context->PushElement(node_name);
463  switch (context->GetKnownType()) {
464    case ParsingContext::URL:
465      context->ResetExtraParams();
466      ParseURL(atts, context);
467      break;
468    case ParsingContext::IMAGE:
469      ParseImage(atts, context);
470      break;
471    case ParsingContext::PARAM:
472      ParseParam(atts, context);
473      break;
474    default:
475      break;
476  }
477  context->ResetString();
478}
479
480void EndElementImpl(void *ctx, const xmlChar *name) {
481  ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
482  switch (context->GetKnownType()) {
483    case ParsingContext::SHORT_NAME:
484      context->template_url()->set_short_name(context->GetString());
485      break;
486    case ParsingContext::DESCRIPTION:
487      context->template_url()->set_description(context->GetString());
488      break;
489    case ParsingContext::IMAGE: {
490      GURL image_url(WideToUTF8(context->GetString()));
491      if (image_url.SchemeIs(chrome::kDataScheme)) {
492        // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
493        // decode the data URL in the renderer. For now, we'll just point to the
494        // fav icon from the URL.
495        context->set_derive_image_from_url(true);
496      } else {
497        context->SetImageURL(context->GetString());
498      }
499      context->EndImage();
500      break;
501    }
502    case ParsingContext::LANGUAGE:
503      context->template_url()->add_language(context->GetString());
504      break;
505    case ParsingContext::INPUT_ENCODING: {
506      std::string input_encoding = WideToASCII(context->GetString());
507      if (IsValidEncodingString(input_encoding))
508        context->template_url()->add_input_encoding(input_encoding);
509      break;
510    }
511    case ParsingContext::URL:
512      ProcessURLParams(context);
513      break;
514    default:
515      break;
516  }
517  context->ResetString();
518  context->PopElement();
519}
520
521void CharactersImpl(void *ctx, const xmlChar *ch, int len) {
522  ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx);
523  context->AppendString(XMLCharToWide(ch, len));
524}
525
526// Returns true if the ref is null, or the url wrapped by ref is
527// valid with a spec of http/https.
528bool IsHTTPRef(const TemplateURLRef* ref) {
529  if (ref == NULL)
530    return true;
531  GURL url(ref->url());
532  return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) ||
533                             url.SchemeIs(chrome::kHttpsScheme)));
534}
535
536// Returns true if the TemplateURL is legal. A legal TemplateURL is one
537// where all URLs have a spec of http/https.
538bool IsLegal(TemplateURL* url) {
539  if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url()))
540    return false;
541  // Make sure all the image refs are legal.
542  const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs();
543  for (size_t i = 0; i < image_refs.size(); i++) {
544    GURL image_url(image_refs[i].url);
545    if (!image_url.is_valid() ||
546        !(image_url.SchemeIs(chrome::kHttpScheme) ||
547          image_url.SchemeIs(chrome::kHttpsScheme))) {
548      return false;
549    }
550  }
551  return true;
552}
553
554}  // namespace
555
556// static
557bool TemplateURLParser::Parse(const unsigned char* data, size_t length,
558                              TemplateURLParser::ParameterFilter* param_filter,
559                              TemplateURL* url) {
560  DCHECK(url);
561  // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
562  // &#38; . Unfortunately xmlSubstituteEntitiesDefault effects global state.
563  // If this becomes problematic we'll need to provide our own entity
564  // type for &amp;, or strip out &#34; by hand after parsing.
565  int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
566  ParsingContext context(param_filter, url);
567  xmlSAXHandler sax_handler;
568  memset(&sax_handler, 0, sizeof(sax_handler));
569  sax_handler.startElement = &StartElementImpl;
570  sax_handler.endElement = &EndElementImpl;
571  sax_handler.characters = &CharactersImpl;
572  xmlSAXUserParseMemory(&sax_handler, &context,
573                        reinterpret_cast<const char*>(data),
574                        static_cast<int>(length));
575  xmlSubstituteEntitiesDefault(last_sub_entities_value);
576  // If the image was a data URL, use the favicon from the search URL instead.
577  // (see TODO inEndElementImpl()).
578  context.DeriveImageFromURL();
579
580  // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines
581  //                that use POST yet.
582  if (context.method() == ParsingContext::POST)
583    return false;
584  if (context.suggestion_method() == ParsingContext::POST)
585    url->SetSuggestionsURL("", 0, 0);
586
587  if (!url->short_name().empty() && !url->description().empty()) {
588    // So far so good, make sure the urls are http.
589    return IsLegal(url);
590  }
591  return false;
592}
593