// template_url_parser.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "chrome/browser/search_engines/template_url_parser.h" 6 7#include <algorithm> 8#include <map> 9#include <vector> 10 11#include "base/logging.h" 12#include "base/scoped_ptr.h" 13#include "base/string_number_conversions.h" 14#include "base/string_util.h" 15#include "base/utf_string_conversions.h" 16#include "chrome/browser/search_engines/template_url.h" 17#include "chrome/common/url_constants.h" 18#include "googleurl/src/gurl.h" 19#include "libxml/parser.h" 20#include "libxml/xmlwriter.h" 21 22namespace { 23 24// 25// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds 26// to that of char, the following names are all in terms of char. This avoids 27// having to convert to wide, then do comparisons 28 29// Defines for element names of the OSD document: 30static const char kURLElement[] = "Url"; 31static const char kParamElement[] = "Param"; 32static const char kShortNameElement[] = "ShortName"; 33static const char kDescriptionElement[] = "Description"; 34static const char kImageElement[] = "Image"; 35static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; 36static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; 37static const char kLanguageElement[] = "Language"; 38static const char kInputEncodingElement[] = "InputEncoding"; 39 40// Various XML attributes used. 
41static const char kURLTypeAttribute[] = "type"; 42static const char kURLTemplateAttribute[] = "template"; 43static const char kImageTypeAttribute[] = "type"; 44static const char kImageWidthAttribute[] = "width"; 45static const char kImageHeightAttribute[] = "height"; 46static const char kURLIndexOffsetAttribute[] = "indexOffset"; 47static const char kURLPageOffsetAttribute[] = "pageOffset"; 48static const char kParamNameAttribute[] = "name"; 49static const char kParamValueAttribute[] = "value"; 50static const char kParamMethodAttribute[] = "method"; 51 52// Mime type for search results. 53static const char kHTMLType[] = "text/html"; 54 55// Mime type for as you type suggestions. 56static const char kSuggestionType[] = "application/x-suggestions+json"; 57 58// Namespace identifier. 59static const char kOSDNS[] = "xmlns"; 60 61// The namespace for documents we understand. 62static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/"; 63 64// Removes the namespace from the specified |name|, ex: os:Url -> Url. 65static void PruneNamespace(std::string* name) { 66 size_t index = name->find_first_of(":"); 67 if (index != std::string::npos) 68 name->erase(0, index + 1); 69} 70 71// 72// To minimize memory overhead while parsing, a SAX style parser is used. 73// ParsingContext is used to maintain the state we're in the document 74// while parsing. 75class ParsingContext { 76 public: 77 // Enum of the known element types. 78 enum ElementType { 79 UNKNOWN, 80 OPEN_SEARCH_DESCRIPTION, 81 URL, 82 PARAM, 83 SHORT_NAME, 84 DESCRIPTION, 85 IMAGE, 86 LANGUAGE, 87 INPUT_ENCODING, 88 }; 89 90 enum Method { 91 GET, 92 POST 93 }; 94 95 // Key/value of a Param node. 
96 typedef std::pair<std::string, std::string> Param; 97 98 ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter, 99 TemplateURL* url) 100 : url_(url), 101 parameter_filter_(parameter_filter), 102 method_(GET), 103 suggestion_method_(GET), 104 is_suggest_url_(false), 105 derive_image_from_url_(false) { 106 if (kElementNameToElementTypeMap == NULL) 107 InitMapping(); 108 } 109 110 // Invoked when an element starts. 111 void PushElement(const std::string& element) { 112 ElementType type; 113 if (kElementNameToElementTypeMap->find(element) == 114 kElementNameToElementTypeMap->end()) { 115 type = UNKNOWN; 116 } else { 117 type = (*kElementNameToElementTypeMap)[element]; 118 } 119 elements_.push_back(type); 120 } 121 122 void PopElement() { 123 elements_.pop_back(); 124 } 125 126 // Returns the current ElementType. 127 ElementType GetKnownType() { 128 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) 129 return elements_[1]; 130 131 // We only expect PARAM nodes under the Url node 132 if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && 133 elements_[1] == URL && elements_[2] == PARAM) 134 return PARAM; 135 136 return UNKNOWN; 137 } 138 139 TemplateURL* template_url() { return url_; } 140 141 void AddImageRef(const std::wstring& type, int width, int height) { 142 if (width > 0 && height > 0) 143 current_image_.reset(new TemplateURL::ImageRef(type, width, height)); 144 } 145 146 void EndImage() { 147 current_image_.reset(); 148 } 149 150 void SetImageURL(const std::wstring& url) { 151 if (current_image_.get()) { 152 current_image_->url = GURL(WideToUTF8(url)); 153 url_->add_image_ref(*current_image_); 154 current_image_.reset(); 155 } 156 } 157 158 void ResetString() { 159 string_.clear(); 160 } 161 162 void AppendString(const std::wstring& string) { 163 string_ += string; 164 } 165 166 const std::wstring& GetString() { 167 return string_; 168 } 169 170 void ResetExtraParams() { 171 extra_params_.clear(); 172 } 173 
174 void AddExtraParams(const std::string& key, const std::string& value) { 175 if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value)) 176 return; 177 extra_params_.push_back(Param(key, value)); 178 } 179 180 const std::vector<Param>& extra_params() const { return extra_params_; } 181 182 void set_is_suggestion(bool value) { is_suggest_url_ = value; } 183 bool is_suggestion() const { return is_suggest_url_; } 184 185 TemplateURLParser::ParameterFilter* parameter_filter() const { 186 return parameter_filter_; 187 } 188 189 void set_derive_image_from_url(bool derive_image_from_url) { 190 derive_image_from_url_ = derive_image_from_url; 191 } 192 193 void set_method(Method method) { method_ = method; } 194 Method method() { return method_; } 195 196 void set_suggestion_method(Method method) { suggestion_method_ = method; } 197 Method suggestion_method() { return suggestion_method_; } 198 199 // Builds the image URL from the Template search URL if no image URL has been 200 // set. 201 void DeriveImageFromURL() { 202 if (derive_image_from_url_ && 203 url_->GetFavIconURL().is_empty() && url_->url()) { 204 GURL url(url_->url()->url()); // More url's please... 
205 url_->SetFavIconURL(TemplateURL::GenerateFaviconURL(url)); 206 } 207 } 208 209 private: 210 static void InitMapping() { 211 kElementNameToElementTypeMap = new std::map<std::string, ElementType>; 212 (*kElementNameToElementTypeMap)[kURLElement] = URL; 213 (*kElementNameToElementTypeMap)[kParamElement] = PARAM; 214 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; 215 (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION; 216 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; 217 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = 218 OPEN_SEARCH_DESCRIPTION; 219 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = 220 OPEN_SEARCH_DESCRIPTION; 221 (*kElementNameToElementTypeMap)[kLanguageElement] = 222 LANGUAGE; 223 (*kElementNameToElementTypeMap)[kInputEncodingElement] = 224 INPUT_ENCODING; 225 } 226 227 // Key is UTF8 encoded. 228 static std::map<std::string, ElementType>* kElementNameToElementTypeMap; 229 // TemplateURL supplied to Read method. It's owned by the caller, so we 230 // don't need to free it. 231 TemplateURL* url_; 232 std::vector<ElementType> elements_; 233 scoped_ptr<TemplateURL::ImageRef> current_image_; 234 235 // Character content for the current element. 236 std::wstring string_; 237 238 TemplateURLParser::ParameterFilter* parameter_filter_; 239 240 // The list of parameters parsed in the Param nodes of a Url node. 241 std::vector<Param> extra_params_; 242 243 // The HTTP methods used. 244 Method method_; 245 Method suggestion_method_; 246 247 // If true, we are currently parsing a suggest URL, otherwise it is an HTML 248 // search. Note that we don't need a stack as Url nodes cannot be nested. 249 bool is_suggest_url_; 250 251 // Whether we should derive the image from the URL (when images are data 252 // URLs). 
253 bool derive_image_from_url_; 254 255 DISALLOW_COPY_AND_ASSIGN(ParsingContext); 256}; 257 258// static 259std::map<std::string, ParsingContext::ElementType>* 260 ParsingContext::kElementNameToElementTypeMap = NULL; 261 262std::wstring XMLCharToWide(const xmlChar* value) { 263 return UTF8ToWide(std::string((const char*)value)); 264} 265 266std::wstring XMLCharToWide(const xmlChar* value, int length) { 267 return UTF8ToWide(std::string((const char*)value, length)); 268} 269 270std::string XMLCharToString(const xmlChar* value) { 271 return std::string((const char*)value); 272} 273 274// Returns true if input_encoding contains a valid input encoding string. This 275// doesn't verify that we have a valid encoding for the string, just that the 276// string contains characters that constitute a valid input encoding. 277bool IsValidEncodingString(const std::string& input_encoding) { 278 if (input_encoding.empty()) 279 return false; 280 281 if (!IsAsciiAlpha(input_encoding[0])) 282 return false; 283 284 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { 285 char c = input_encoding[i]; 286 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' 
&& c != '_' && 287 c != '-') { 288 return false; 289 } 290 } 291 return true; 292} 293 294void ParseURL(const xmlChar** atts, ParsingContext* context) { 295 if (!atts) 296 return; 297 298 TemplateURL* turl = context->template_url(); 299 const xmlChar** attributes = atts; 300 std::string template_url; 301 bool is_post = false; 302 bool is_html_url = false; 303 bool is_suggest_url = false; 304 int index_offset = 1; 305 int page_offset = 1; 306 307 while (*attributes) { 308 std::string name(XMLCharToString(*attributes)); 309 const xmlChar* value = attributes[1]; 310 if (name == kURLTypeAttribute) { 311 std::string type = XMLCharToString(value); 312 is_html_url = (type == kHTMLType); 313 is_suggest_url = (type == kSuggestionType); 314 } else if (name == kURLTemplateAttribute) { 315 template_url = XMLCharToString(value); 316 } else if (name == kURLIndexOffsetAttribute) { 317 base::StringToInt(XMLCharToString(value), &index_offset); 318 index_offset = std::max(1, index_offset); 319 } else if (name == kURLPageOffsetAttribute) { 320 base::StringToInt(XMLCharToString(value), &page_offset); 321 page_offset = std::max(1, page_offset); 322 } else if (name == kParamMethodAttribute) { 323 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); 324 } 325 attributes += 2; 326 } 327 if (is_html_url) { 328 turl->SetURL(template_url, index_offset, page_offset); 329 context->set_is_suggestion(false); 330 if (is_post) 331 context->set_method(ParsingContext::POST); 332 } else if (is_suggest_url) { 333 turl->SetSuggestionsURL(template_url, index_offset, page_offset); 334 context->set_is_suggestion(true); 335 if (is_post) 336 context->set_suggestion_method(ParsingContext::POST); 337 } 338} 339 340void ParseImage(const xmlChar** atts, ParsingContext* context) { 341 if (!atts) 342 return; 343 344 const xmlChar** attributes = atts; 345 int width = 0; 346 int height = 0; 347 std::wstring type; 348 while (*attributes) { 349 std::string name(XMLCharToString(*attributes)); 350 const 
xmlChar* value = attributes[1]; 351 if (name == kImageTypeAttribute) { 352 type = XMLCharToWide(value); 353 } else if (name == kImageWidthAttribute) { 354 base::StringToInt(XMLCharToString(value), &width); 355 } else if (name == kImageHeightAttribute) { 356 base::StringToInt(XMLCharToString(value), &height); 357 } 358 attributes += 2; 359 } 360 if (width > 0 && height > 0 && !type.empty()) { 361 // Valid Image URL. 362 context->AddImageRef(type, width, height); 363 } 364} 365 366void ParseParam(const xmlChar** atts, ParsingContext* context) { 367 if (!atts) 368 return; 369 370 const xmlChar** attributes = atts; 371 std::wstring type; 372 std::string key, value; 373 while (*attributes) { 374 std::string name(XMLCharToString(*attributes)); 375 const xmlChar* val = attributes[1]; 376 if (name == kParamNameAttribute) { 377 key = XMLCharToString(val); 378 } else if (name == kParamValueAttribute) { 379 value = XMLCharToString(val); 380 } 381 attributes += 2; 382 } 383 if (!key.empty()) 384 context->AddExtraParams(key, value); 385} 386 387static void AppendParamToQuery(const std::string& key, 388 const std::string& value, 389 std::string* query) { 390 if (!query->empty()) 391 query->append("&"); 392 if (!key.empty()) { 393 query->append(key); 394 query->append("="); 395 } 396 query->append(value); 397} 398 399void ProcessURLParams(ParsingContext* context) { 400 TemplateURL* t_url = context->template_url(); 401 const TemplateURLRef* t_url_ref = 402 context->is_suggestion() ? t_url->suggestions_url() : 403 t_url->url(); 404 if (!t_url_ref) 405 return; 406 407 if (!context->parameter_filter() && context->extra_params().empty()) 408 return; 409 410 GURL url(t_url_ref->url()); 411 // If there is a parameter filter, parse the existing URL and remove any 412 // unwanted parameter. 
413 TemplateURLParser::ParameterFilter* filter = context->parameter_filter(); 414 std::string new_query; 415 bool modified = false; 416 if (filter) { 417 url_parse::Component query = url.parsed_for_possibly_invalid_spec().query; 418 url_parse::Component key, value; 419 const char* url_spec = url.spec().c_str(); 420 while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { 421 std::string key_str(url_spec, key.begin, key.len); 422 std::string value_str(url_spec, value.begin, value.len); 423 if (filter->KeepParameter(key_str, value_str)) { 424 AppendParamToQuery(key_str, value_str, &new_query); 425 } else { 426 modified = true; 427 } 428 } 429 } 430 if (!modified) 431 new_query = url.query(); 432 433 // Add the extra parameters if any. 434 const std::vector<ParsingContext::Param>& params = context->extra_params(); 435 if (!params.empty()) { 436 modified = true; 437 std::vector<ParsingContext::Param>::const_iterator iter; 438 for (iter = params.begin(); iter != params.end(); ++iter) 439 AppendParamToQuery(iter->first, iter->second, &new_query); 440 } 441 442 if (modified) { 443 GURL::Replacements repl; 444 repl.SetQueryStr(new_query); 445 url = url.ReplaceComponents(repl); 446 if (context->is_suggestion()) { 447 t_url->SetSuggestionsURL(url.spec(), 448 t_url_ref->index_offset(), 449 t_url_ref->page_offset()); 450 } else { 451 t_url->SetURL(url.spec(), 452 t_url_ref->index_offset(), 453 t_url_ref->page_offset()); 454 } 455 } 456} 457 458void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) { 459 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); 460 std::string node_name((const char*)name); 461 PruneNamespace(&node_name); 462 context->PushElement(node_name); 463 switch (context->GetKnownType()) { 464 case ParsingContext::URL: 465 context->ResetExtraParams(); 466 ParseURL(atts, context); 467 break; 468 case ParsingContext::IMAGE: 469 ParseImage(atts, context); 470 break; 471 case ParsingContext::PARAM: 472 
ParseParam(atts, context); 473 break; 474 default: 475 break; 476 } 477 context->ResetString(); 478} 479 480void EndElementImpl(void *ctx, const xmlChar *name) { 481 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); 482 switch (context->GetKnownType()) { 483 case ParsingContext::SHORT_NAME: 484 context->template_url()->set_short_name(context->GetString()); 485 break; 486 case ParsingContext::DESCRIPTION: 487 context->template_url()->set_description(context->GetString()); 488 break; 489 case ParsingContext::IMAGE: { 490 GURL image_url(WideToUTF8(context->GetString())); 491 if (image_url.SchemeIs(chrome::kDataScheme)) { 492 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to 493 // decode the data URL in the renderer. For now, we'll just point to the 494 // fav icon from the URL. 495 context->set_derive_image_from_url(true); 496 } else { 497 context->SetImageURL(context->GetString()); 498 } 499 context->EndImage(); 500 break; 501 } 502 case ParsingContext::LANGUAGE: 503 context->template_url()->add_language(context->GetString()); 504 break; 505 case ParsingContext::INPUT_ENCODING: { 506 std::string input_encoding = WideToASCII(context->GetString()); 507 if (IsValidEncodingString(input_encoding)) 508 context->template_url()->add_input_encoding(input_encoding); 509 break; 510 } 511 case ParsingContext::URL: 512 ProcessURLParams(context); 513 break; 514 default: 515 break; 516 } 517 context->ResetString(); 518 context->PopElement(); 519} 520 521void CharactersImpl(void *ctx, const xmlChar *ch, int len) { 522 ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); 523 context->AppendString(XMLCharToWide(ch, len)); 524} 525 526// Returns true if the ref is null, or the url wrapped by ref is 527// valid with a spec of http/https. 
528bool IsHTTPRef(const TemplateURLRef* ref) { 529 if (ref == NULL) 530 return true; 531 GURL url(ref->url()); 532 return (url.is_valid() && (url.SchemeIs(chrome::kHttpScheme) || 533 url.SchemeIs(chrome::kHttpsScheme))); 534} 535 536// Returns true if the TemplateURL is legal. A legal TemplateURL is one 537// where all URLs have a spec of http/https. 538bool IsLegal(TemplateURL* url) { 539 if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url())) 540 return false; 541 // Make sure all the image refs are legal. 542 const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs(); 543 for (size_t i = 0; i < image_refs.size(); i++) { 544 GURL image_url(image_refs[i].url); 545 if (!image_url.is_valid() || 546 !(image_url.SchemeIs(chrome::kHttpScheme) || 547 image_url.SchemeIs(chrome::kHttpsScheme))) { 548 return false; 549 } 550 } 551 return true; 552} 553 554} // namespace 555 556// static 557bool TemplateURLParser::Parse(const unsigned char* data, size_t length, 558 TemplateURLParser::ParameterFilter* param_filter, 559 TemplateURL* url) { 560 DCHECK(url); 561 // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to 562 // & . Unfortunately xmlSubstituteEntitiesDefault effects global state. 563 // If this becomes problematic we'll need to provide our own entity 564 // type for &, or strip out " by hand after parsing. 565 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); 566 ParsingContext context(param_filter, url); 567 xmlSAXHandler sax_handler; 568 memset(&sax_handler, 0, sizeof(sax_handler)); 569 sax_handler.startElement = &StartElementImpl; 570 sax_handler.endElement = &EndElementImpl; 571 sax_handler.characters = &CharactersImpl; 572 xmlSAXUserParseMemory(&sax_handler, &context, 573 reinterpret_cast<const char*>(data), 574 static_cast<int>(length)); 575 xmlSubstituteEntitiesDefault(last_sub_entities_value); 576 // If the image was a data URL, use the favicon from the search URL instead. 
577 // (see TODO inEndElementImpl()). 578 context.DeriveImageFromURL(); 579 580 // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines 581 // that use POST yet. 582 if (context.method() == ParsingContext::POST) 583 return false; 584 if (context.suggestion_method() == ParsingContext::POST) 585 url->SetSuggestionsURL("", 0, 0); 586 587 if (!url->short_name().empty() && !url->description().empty()) { 588 // So far so good, make sure the urls are http. 589 return IsLegal(url); 590 } 591 return false; 592} 593