1// Copyright 2014 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/search_engines/template_url_parser.h"
6
7#include <algorithm>
8#include <map>
9#include <vector>
10
11#include "base/logging.h"
12#include "base/memory/scoped_ptr.h"
13#include "base/strings/string_number_conversions.h"
14#include "base/strings/string_util.h"
15#include "base/strings/utf_string_conversions.h"
16#include "components/search_engines/template_url.h"
17#include "libxml/parser.h"
18#include "libxml/xmlwriter.h"
19#include "ui/gfx/favicon_size.h"
20#include "url/gurl.h"
21#include "url/url_constants.h"
22
23namespace {
24
25// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
26// to that of char, the following names are all in terms of char. This avoids
27// having to convert to wide, then do comparisons.
28
// Defines for element names of the OSD document. These are compared against
// element names after any namespace prefix has been removed.
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
// Alternative root element name; it is mapped to the same element type as
// kOpenSearchDescriptionElement.
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";
const char kAliasElement[] = "Alias";

// Various XML attributes used.
// Attributes of the Url element.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
// Attributes of the Image element.
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
// Attributes of the Param element.
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";
// Despite its name, this attribute is read from the Url element (it selects
// the HTTP method of that URL), not from Param.
const char kParamMethodAttribute[] = "method";

// Mime type for search results.
const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
54
55std::string XMLCharToString(const xmlChar* value) {
56  return std::string(reinterpret_cast<const char*>(value));
57}
58
59// Returns true if input_encoding contains a valid input encoding string. This
60// doesn't verify that we have a valid encoding for the string, just that the
61// string contains characters that constitute a valid input encoding.
62bool IsValidEncodingString(const std::string& input_encoding) {
63  if (input_encoding.empty())
64    return false;
65
66  if (!IsAsciiAlpha(input_encoding[0]))
67    return false;
68
69  for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
70    char c = input_encoding[i];
71    if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
72        c != '-') {
73      return false;
74    }
75  }
76  return true;
77}
78
// Appends one parameter to |*query|. A '&' separator is written first when
// |*query| already has content. The parameter itself is written as
// "key=value", or as just "value" when |key| is empty. No escaping is done.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += '&';
  if (!key.empty()) {
    out += key;
    out += '=';
  }
  out += value;
}
90
91// Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
92bool IsHTTPRef(const std::string& url) {
93  if (url.empty())
94    return true;
95  GURL gurl(url);
96  return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
97                             gurl.SchemeIs(url::kHttpsScheme));
98}
99
100}  // namespace
101
102
103// TemplateURLParsingContext --------------------------------------------------
104
// To minimize memory overhead while parsing, a SAX style parser is used.
// TemplateURLParsingContext keeps track of where we are in the document while
// parsing, and accumulates the data that GetTemplateURL() finally turns into a
// TemplateURL.
class TemplateURLParsingContext {
 public:
  // Enum of the known element types. Element names that are not recognized are
  // recorded as UNKNOWN.
  enum ElementType {
    UNKNOWN,
    OPEN_SEARCH_DESCRIPTION,
    URL,
    PARAM,
    SHORT_NAME,
    IMAGE,
    INPUT_ENCODING,
    ALIAS,
  };

  // HTTP method requested by a Url element.
  enum Method {
    GET,
    POST
  };

  // Key/value of a Param node.
  typedef std::pair<std::string, std::string> Param;

  // |parameter_filter| may be NULL, in which case no parameter is filtered
  // out. The pointer is stored, not copied, and is used while parsing.
  explicit TemplateURLParsingContext(
      TemplateURLParser::ParameterFilter* parameter_filter);

  // SAX callbacks. In all three, |ctx| must point to the
  // TemplateURLParsingContext that is collecting the parse results.
  static void StartElementImpl(void* ctx,
                               const xmlChar* name,
                               const xmlChar** atts);
  static void EndElementImpl(void* ctx, const xmlChar* name);
  static void CharactersImpl(void* ctx, const xmlChar* ch, int len);

  // Returns a heap-allocated TemplateURL representing the result of parsing.
  // This will be NULL if parsing failed or if the results were invalid for some
  // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
  // a resulting TemplateURLRef was invalid, etc.).
  TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data,
                              bool show_in_default_list);

 private:
  // Key is UTF8 encoded.
  typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;

  // Allocates and fills kElementNameToElementTypeMap.
  static void InitMapping();

  // Attribute handlers for the Url, Image and Param elements. |atts| is a
  // NULL-terminated array of alternating attribute names and values; a NULL
  // |atts| is ignored.
  void ParseURL(const xmlChar** atts);
  void ParseImage(const xmlChar** atts);
  void ParseParam(const xmlChar** atts);

  // Called when a Url element ends: applies the parameter filter and the
  // collected Param nodes to the query of the URL that was just parsed.
  void ProcessURLParams();

  // Returns the ElementType of the innermost open element if it sits at a
  // position in the document that we handle, UNKNOWN otherwise.
  ElementType GetKnownType();

  // Shared by all instances; created by the first instance constructed.
  static ElementNameToElementTypeMap* kElementNameToElementTypeMap;

  // Data that gets updated as we parse, and is converted to a TemplateURL by
  // GetTemplateURL().
  TemplateURLData data_;

  // Stack of the currently open elements, outermost first.
  std::vector<ElementType> elements_;

  // Set from the attributes of the current Image element; consumed and reset
  // when that element ends.
  bool image_is_valid_for_favicon_;

  // Character content for the current element.
  base::string16 string_;

  // Not owned by this class as far as this file shows; may be NULL.
  TemplateURLParser::ParameterFilter* parameter_filter_;

  // The list of parameters parsed in the Param nodes of a Url node.
  std::vector<Param> extra_params_;

  // The HTTP methods used.
  Method method_;
  Method suggestion_method_;

  // If true, we are currently parsing a suggest URL, otherwise it is an HTML
  // search.  Note that we don't need a stack as URL nodes cannot be nested.
  bool is_suggest_url_;

  // If true, the user has set a keyword and we should use it. Otherwise,
  // we generate a keyword based on the URL.
  bool has_custom_keyword_;

  // Whether we should derive the image from the URL (when images are data
  // URLs).
  bool derive_image_from_url_;

  DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
};
195
// static
// Starts out NULL and is filled in by InitMapping() when the first
// TemplateURLParsingContext is constructed. Nothing in this file deletes it.
TemplateURLParsingContext::ElementNameToElementTypeMap*
    TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
199
200TemplateURLParsingContext::TemplateURLParsingContext(
201    TemplateURLParser::ParameterFilter* parameter_filter)
202    : image_is_valid_for_favicon_(false),
203      parameter_filter_(parameter_filter),
204      method_(GET),
205      suggestion_method_(GET),
206      is_suggest_url_(false),
207      has_custom_keyword_(false),
208      derive_image_from_url_(false) {
209  if (kElementNameToElementTypeMap == NULL)
210    InitMapping();
211}
212
213// static
214void TemplateURLParsingContext::StartElementImpl(void* ctx,
215                                                 const xmlChar* name,
216                                                 const xmlChar** atts) {
217  // Remove the namespace from |name|, ex: os:Url -> Url.
218  std::string node_name(XMLCharToString(name));
219  size_t index = node_name.find_first_of(":");
220  if (index != std::string::npos)
221    node_name.erase(0, index + 1);
222
223  TemplateURLParsingContext* context =
224      reinterpret_cast<TemplateURLParsingContext*>(ctx);
225  context->elements_.push_back(
226    context->kElementNameToElementTypeMap->count(node_name) ?
227        (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
228  switch (context->GetKnownType()) {
229    case TemplateURLParsingContext::URL:
230      context->extra_params_.clear();
231      context->ParseURL(atts);
232      break;
233    case TemplateURLParsingContext::IMAGE:
234      context->ParseImage(atts);
235      break;
236    case TemplateURLParsingContext::PARAM:
237      context->ParseParam(atts);
238      break;
239    default:
240      break;
241  }
242  context->string_.clear();
243}
244
245// static
246void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
247  TemplateURLParsingContext* context =
248      reinterpret_cast<TemplateURLParsingContext*>(ctx);
249  switch (context->GetKnownType()) {
250    case TemplateURLParsingContext::URL:
251      context->ProcessURLParams();
252      break;
253    case TemplateURLParsingContext::SHORT_NAME:
254      context->data_.short_name = context->string_;
255      break;
256    case TemplateURLParsingContext::IMAGE: {
257      GURL image_url(base::UTF16ToUTF8(context->string_));
258      if (image_url.SchemeIs(url::kDataScheme)) {
259        // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
260        // decode the data URL in the renderer. For now, we'll just point to the
261        // favicon from the URL.
262        context->derive_image_from_url_ = true;
263      } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
264                 (image_url.SchemeIs(url::kHttpScheme) ||
265                  image_url.SchemeIs(url::kHttpsScheme))) {
266        context->data_.favicon_url = image_url;
267      }
268      context->image_is_valid_for_favicon_ = false;
269      break;
270    }
271    case TemplateURLParsingContext::INPUT_ENCODING: {
272      std::string input_encoding = base::UTF16ToASCII(context->string_);
273      if (IsValidEncodingString(input_encoding))
274        context->data_.input_encodings.push_back(input_encoding);
275      break;
276    }
277    case TemplateURLParsingContext::ALIAS: {
278      context->data_.SetKeyword(context->string_);
279      context->has_custom_keyword_ = true;
280      break;
281    }
282    default:
283      break;
284  }
285  context->string_.clear();
286  context->elements_.pop_back();
287}
288
289// static
290void TemplateURLParsingContext::CharactersImpl(void* ctx,
291                                               const xmlChar* ch,
292                                               int len) {
293  reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
294      base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
295}
296
297TemplateURL* TemplateURLParsingContext::GetTemplateURL(
298    const SearchTermsData& search_terms_data,
299    bool show_in_default_list) {
300  // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
301  if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
302      !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
303    return NULL;
304  if (suggestion_method_ == TemplateURLParsingContext::POST)
305    data_.suggestions_url.clear();
306
307  // If the image was a data URL, use the favicon from the search URL instead.
308  // (see the TODO in EndElementImpl()).
309  GURL search_url(data_.url());
310  if (derive_image_from_url_ && data_.favicon_url.is_empty())
311    data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
312
313  // Generate a keyword for this search engine if a custom one was not present
314  // in the imported data.
315  if (!has_custom_keyword_)
316    data_.SetKeyword(TemplateURL::GenerateKeyword(search_url));
317
318  data_.show_in_default_list = show_in_default_list;
319
320  // Bail if the search URL is empty or if either TemplateURLRef is invalid.
321  scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
322  if (template_url->url().empty() ||
323      !template_url->url_ref().IsValid(search_terms_data) ||
324      (!template_url->suggestions_url().empty() &&
325       !template_url->suggestions_url_ref().IsValid(search_terms_data))) {
326    return NULL;
327  }
328
329  return template_url.release();
330}
331
332// static
333void TemplateURLParsingContext::InitMapping() {
334  kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
335  (*kElementNameToElementTypeMap)[kURLElement] = URL;
336  (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
337  (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
338  (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
339  (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
340      OPEN_SEARCH_DESCRIPTION;
341  (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
342      OPEN_SEARCH_DESCRIPTION;
343  (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
344  (*kElementNameToElementTypeMap)[kAliasElement] = ALIAS;
345}
346
347void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
348  if (!atts)
349    return;
350
351  std::string template_url;
352  bool is_post = false;
353  bool is_html_url = false;
354  bool is_suggest_url = false;
355  for (; *atts; atts += 2) {
356    std::string name(XMLCharToString(*atts));
357    const xmlChar* value = atts[1];
358    if (name == kURLTypeAttribute) {
359      std::string type = XMLCharToString(value);
360      is_html_url = (type == kHTMLType);
361      is_suggest_url = (type == kSuggestionType);
362    } else if (name == kURLTemplateAttribute) {
363      template_url = XMLCharToString(value);
364    } else if (name == kParamMethodAttribute) {
365      is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
366    }
367  }
368
369  if (is_html_url && !template_url.empty()) {
370    data_.SetURL(template_url);
371    is_suggest_url_ = false;
372    if (is_post)
373      method_ = POST;
374  } else if (is_suggest_url) {
375    data_.suggestions_url = template_url;
376    is_suggest_url_ = true;
377    if (is_post)
378      suggestion_method_ = POST;
379  }
380}
381
382void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
383  if (!atts)
384    return;
385
386  int width = 0;
387  int height = 0;
388  std::string type;
389  for (; *atts; atts += 2) {
390    std::string name(XMLCharToString(*atts));
391    const xmlChar* value = atts[1];
392    if (name == kImageTypeAttribute) {
393      type = XMLCharToString(value);
394    } else if (name == kImageWidthAttribute) {
395      base::StringToInt(XMLCharToString(value), &width);
396    } else if (name == kImageHeightAttribute) {
397      base::StringToInt(XMLCharToString(value), &height);
398    }
399  }
400
401  image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
402      (height == gfx::kFaviconSize) &&
403      ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
404}
405
406void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
407  if (!atts)
408    return;
409
410  std::string key, value;
411  for (; *atts; atts += 2) {
412    std::string name(XMLCharToString(*atts));
413    const xmlChar* val = atts[1];
414    if (name == kParamNameAttribute) {
415      key = XMLCharToString(val);
416    } else if (name == kParamValueAttribute) {
417      value = XMLCharToString(val);
418    }
419  }
420
421  if (!key.empty() &&
422      (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
423    extra_params_.push_back(Param(key, value));
424}
425
426void TemplateURLParsingContext::ProcessURLParams() {
427  if (!parameter_filter_ && extra_params_.empty())
428    return;
429
430  GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
431  if (url.is_empty())
432    return;
433
434  // If there is a parameter filter, parse the existing URL and remove any
435  // unwanted parameter.
436  std::string new_query;
437  bool modified = false;
438  if (parameter_filter_) {
439    url::Component query = url.parsed_for_possibly_invalid_spec().query;
440    url::Component key, value;
441    const char* url_spec = url.spec().c_str();
442    while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
443      std::string key_str(url_spec, key.begin, key.len);
444      std::string value_str(url_spec, value.begin, value.len);
445      if (parameter_filter_->KeepParameter(key_str, value_str)) {
446        AppendParamToQuery(key_str, value_str, &new_query);
447      } else {
448        modified = true;
449      }
450    }
451  }
452  if (!modified)
453    new_query = url.query();
454
455  // Add the extra parameters if any.
456  if (!extra_params_.empty()) {
457    modified = true;
458    for (std::vector<Param>::const_iterator iter(extra_params_.begin());
459         iter != extra_params_.end(); ++iter)
460      AppendParamToQuery(iter->first, iter->second, &new_query);
461  }
462
463  if (modified) {
464    GURL::Replacements repl;
465    repl.SetQueryStr(new_query);
466    url = url.ReplaceComponents(repl);
467    if (is_suggest_url_)
468      data_.suggestions_url = url.spec();
469    else if (url.is_valid())
470      data_.SetURL(url.spec());
471  }
472}
473
474TemplateURLParsingContext::ElementType
475    TemplateURLParsingContext::GetKnownType() {
476  if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
477    return elements_[1];
478  // We only expect PARAM nodes under the URL node.
479  return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
480      elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
481}
482
483
484// TemplateURLParser ----------------------------------------------------------
485
486// static
487TemplateURL* TemplateURLParser::Parse(
488    const SearchTermsData& search_terms_data,
489    bool show_in_default_list,
490    const char* data,
491    size_t length,
492    TemplateURLParser::ParameterFilter* param_filter) {
493  // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
494  // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
495  // If this becomes problematic we'll need to provide our own entity
496  // type for &amp;, or strip out &#38; by hand after parsing.
497  int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
498  TemplateURLParsingContext context(param_filter);
499  xmlSAXHandler sax_handler;
500  memset(&sax_handler, 0, sizeof(sax_handler));
501  sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
502  sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
503  sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
504  int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
505                                    static_cast<int>(length));
506  xmlSubstituteEntitiesDefault(last_sub_entities_value);
507
508  return error ?
509      NULL : context.GetTemplateURL(search_terms_data, show_in_default_list);
510}
511