// Copyright 2014 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/search_engines/template_url_parser.h" #include #include #include #include #include "base/logging.h" #include "base/macros.h" #include "base/memory/scoped_ptr.h" #include "base/strings/string_number_conversions.h" #include "base/strings/string_util.h" #include "base/strings/utf_string_conversions.h" #include "components/search_engines/search_terms_data.h" #include "components/search_engines/template_url.h" #include "libxml/parser.h" #include "libxml/xmlwriter.h" #include "ui/gfx/favicon_size.h" #include "url/gurl.h" #include "url/url_constants.h" namespace { // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds // to that of char, the following names are all in terms of char. This avoids // having to convert to wide, then do comparisons. // Defines for element names of the OSD document: const char kURLElement[] = "Url"; const char kParamElement[] = "Param"; const char kShortNameElement[] = "ShortName"; const char kImageElement[] = "Image"; const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; const char kInputEncodingElement[] = "InputEncoding"; const char kAliasElement[] = "Alias"; // Various XML attributes used. const char kURLTypeAttribute[] = "type"; const char kURLTemplateAttribute[] = "template"; const char kImageTypeAttribute[] = "type"; const char kImageWidthAttribute[] = "width"; const char kImageHeightAttribute[] = "height"; const char kParamNameAttribute[] = "name"; const char kParamValueAttribute[] = "value"; const char kParamMethodAttribute[] = "method"; // Mime type for search results. const char kHTMLType[] = "text/html"; // Mime type for as you type suggestions. const char kSuggestionType[] = "application/x-suggestions+json"; std::string XMLCharToString(const xmlChar* value) { return std::string(reinterpret_cast(value)); } // Returns true if input_encoding contains a valid input encoding string. This // doesn't verify that we have a valid encoding for the string, just that the // string contains characters that constitute a valid input encoding. bool IsValidEncodingString(const std::string& input_encoding) { if (input_encoding.empty()) return false; if (!base::IsAsciiAlpha(input_encoding[0])) return false; for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { char c = input_encoding[i]; if (!base::IsAsciiAlpha(c) && !base::IsAsciiDigit(c) && c != '.' && c != '_' && c != '-') { return false; } } return true; } void AppendParamToQuery(const std::string& key, const std::string& value, std::string* query) { if (!query->empty()) query->append("&"); if (!key.empty()) { query->append(key); query->append("="); } query->append(value); } // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S]. bool IsHTTPRef(const std::string& url) { if (url.empty()) return true; GURL gurl(url); return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) || gurl.SchemeIs(url::kHttpsScheme)); } } // namespace // TemplateURLParsingContext -------------------------------------------------- // To minimize memory overhead while parsing, a SAX style parser is used. // TemplateURLParsingContext is used to maintain the state we're in the document // while parsing. class TemplateURLParsingContext { public: // Enum of the known element types. enum ElementType { UNKNOWN, OPEN_SEARCH_DESCRIPTION, URL, PARAM, SHORT_NAME, IMAGE, INPUT_ENCODING, ALIAS, }; enum Method { GET, POST }; // Key/value of a Param node. typedef std::pair Param; explicit TemplateURLParsingContext( TemplateURLParser::ParameterFilter* parameter_filter); static void StartElementImpl(void* ctx, const xmlChar* name, const xmlChar** atts); static void EndElementImpl(void* ctx, const xmlChar* name); static void CharactersImpl(void* ctx, const xmlChar* ch, int len); // Returns a heap-allocated TemplateURL representing the result of parsing. // This will be NULL if parsing failed or if the results were invalid for some // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied, // a resulting TemplateURLRef was invalid, etc.). TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data, bool show_in_default_list); private: // Key is UTF8 encoded. typedef std::map ElementNameToElementTypeMap; static void InitMapping(); void ParseURL(const xmlChar** atts); void ParseImage(const xmlChar** atts); void ParseParam(const xmlChar** atts); void ProcessURLParams(); // Returns the current ElementType. ElementType GetKnownType(); static ElementNameToElementTypeMap* kElementNameToElementTypeMap; // Data that gets updated as we parse, and is converted to a TemplateURL by // GetTemplateURL(). TemplateURLData data_; std::vector elements_; bool image_is_valid_for_favicon_; // Character content for the current element. base::string16 string_; TemplateURLParser::ParameterFilter* parameter_filter_; // The list of parameters parsed in the Param nodes of a Url node. std::vector extra_params_; // The HTTP methods used. Method method_; Method suggestion_method_; // If true, we are currently parsing a suggest URL, otherwise it is an HTML // search. Note that we don't need a stack as URL nodes cannot be nested. bool is_suggest_url_; // If true, the user has set a keyword and we should use it. Otherwise, // we generate a keyword based on the URL. bool has_custom_keyword_; // Whether we should derive the image from the URL (when images are data // URLs). bool derive_image_from_url_; DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext); }; // static TemplateURLParsingContext::ElementNameToElementTypeMap* TemplateURLParsingContext::kElementNameToElementTypeMap = NULL; TemplateURLParsingContext::TemplateURLParsingContext( TemplateURLParser::ParameterFilter* parameter_filter) : image_is_valid_for_favicon_(false), parameter_filter_(parameter_filter), method_(GET), suggestion_method_(GET), is_suggest_url_(false), has_custom_keyword_(false), derive_image_from_url_(false) { if (kElementNameToElementTypeMap == NULL) InitMapping(); } // static void TemplateURLParsingContext::StartElementImpl(void* ctx, const xmlChar* name, const xmlChar** atts) { // Remove the namespace from |name|, ex: os:Url -> Url. std::string node_name(XMLCharToString(name)); size_t index = node_name.find_first_of(":"); if (index != std::string::npos) node_name.erase(0, index + 1); TemplateURLParsingContext* context = reinterpret_cast(ctx); context->elements_.push_back( context->kElementNameToElementTypeMap->count(node_name) ? (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN); switch (context->GetKnownType()) { case TemplateURLParsingContext::URL: context->extra_params_.clear(); context->ParseURL(atts); break; case TemplateURLParsingContext::IMAGE: context->ParseImage(atts); break; case TemplateURLParsingContext::PARAM: context->ParseParam(atts); break; default: break; } context->string_.clear(); } // static void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) { TemplateURLParsingContext* context = reinterpret_cast(ctx); switch (context->GetKnownType()) { case TemplateURLParsingContext::URL: context->ProcessURLParams(); break; case TemplateURLParsingContext::SHORT_NAME: context->data_.SetShortName(context->string_); break; case TemplateURLParsingContext::IMAGE: { GURL image_url(base::UTF16ToUTF8(context->string_)); if (image_url.SchemeIs(url::kDataScheme)) { // TODO (jcampan): bug 1169256: when dealing with data URL, we need to // decode the data URL in the renderer. For now, we'll just point to the // favicon from the URL. context->derive_image_from_url_ = true; } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() && (image_url.SchemeIs(url::kHttpScheme) || image_url.SchemeIs(url::kHttpsScheme))) { context->data_.favicon_url = image_url; } context->image_is_valid_for_favicon_ = false; break; } case TemplateURLParsingContext::INPUT_ENCODING: { std::string input_encoding = base::UTF16ToASCII(context->string_); if (IsValidEncodingString(input_encoding)) context->data_.input_encodings.push_back(input_encoding); break; } case TemplateURLParsingContext::ALIAS: { context->data_.SetKeyword(context->string_); context->has_custom_keyword_ = true; break; } default: break; } context->string_.clear(); context->elements_.pop_back(); } // static void TemplateURLParsingContext::CharactersImpl(void* ctx, const xmlChar* ch, int len) { reinterpret_cast(ctx)->string_ += base::UTF8ToUTF16( base::StringPiece(reinterpret_cast(ch), len)); } TemplateURL* TemplateURLParsingContext::GetTemplateURL( const SearchTermsData& search_terms_data, bool show_in_default_list) { // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107 if (method_ == TemplateURLParsingContext::POST || data_.short_name().empty() || !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url)) return NULL; if (suggestion_method_ == TemplateURLParsingContext::POST) data_.suggestions_url.clear(); // If the image was a data URL, use the favicon from the search URL instead. // (see the TODO in EndElementImpl()). GURL search_url(data_.url()); if (derive_image_from_url_ && data_.favicon_url.is_empty()) data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url); // Generate a keyword for this search engine if a custom one was not present // in the imported data. if (!has_custom_keyword_) data_.SetKeyword(TemplateURL::GenerateKeyword( search_url, search_terms_data.GetAcceptLanguages())); data_.show_in_default_list = show_in_default_list; // Bail if the search URL is empty or if either TemplateURLRef is invalid. scoped_ptr template_url(new TemplateURL(data_)); if (template_url->url().empty() || !template_url->url_ref().IsValid(search_terms_data) || (!template_url->suggestions_url().empty() && !template_url->suggestions_url_ref().IsValid(search_terms_data))) { return NULL; } return template_url.release(); } // static void TemplateURLParsingContext::InitMapping() { kElementNameToElementTypeMap = new std::map; (*kElementNameToElementTypeMap)[kURLElement] = URL; (*kElementNameToElementTypeMap)[kParamElement] = PARAM; (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION; (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = OPEN_SEARCH_DESCRIPTION; (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING; (*kElementNameToElementTypeMap)[kAliasElement] = ALIAS; } void TemplateURLParsingContext::ParseURL(const xmlChar** atts) { if (!atts) return; std::string template_url; bool is_post = false; bool is_html_url = false; bool is_suggest_url = false; for (; *atts; atts += 2) { std::string name(XMLCharToString(*atts)); const xmlChar* value = atts[1]; if (name == kURLTypeAttribute) { std::string type = XMLCharToString(value); is_html_url = (type == kHTMLType); is_suggest_url = (type == kSuggestionType); } else if (name == kURLTemplateAttribute) { template_url = XMLCharToString(value); } else if (name == kParamMethodAttribute) { is_post = base::LowerCaseEqualsASCII(XMLCharToString(value), "post"); } } if (is_html_url && !template_url.empty()) { data_.SetURL(template_url); is_suggest_url_ = false; if (is_post) method_ = POST; } else if (is_suggest_url) { data_.suggestions_url = template_url; is_suggest_url_ = true; if (is_post) suggestion_method_ = POST; } } void TemplateURLParsingContext::ParseImage(const xmlChar** atts) { if (!atts) return; int width = 0; int height = 0; std::string type; for (; *atts; atts += 2) { std::string name(XMLCharToString(*atts)); const xmlChar* value = atts[1]; if (name == kImageTypeAttribute) { type = XMLCharToString(value); } else if (name == kImageWidthAttribute) { base::StringToInt(XMLCharToString(value), &width); } else if (name == kImageHeightAttribute) { base::StringToInt(XMLCharToString(value), &height); } } image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) && (height == gfx::kFaviconSize) && ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon")); } void TemplateURLParsingContext::ParseParam(const xmlChar** atts) { if (!atts) return; std::string key, value; for (; *atts; atts += 2) { std::string name(XMLCharToString(*atts)); const xmlChar* val = atts[1]; if (name == kParamNameAttribute) { key = XMLCharToString(val); } else if (name == kParamValueAttribute) { value = XMLCharToString(val); } } if (!key.empty() && (!parameter_filter_ || parameter_filter_->KeepParameter(key, value))) extra_params_.push_back(Param(key, value)); } void TemplateURLParsingContext::ProcessURLParams() { if (!parameter_filter_ && extra_params_.empty()) return; GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url()); if (url.is_empty()) return; // If there is a parameter filter, parse the existing URL and remove any // unwanted parameter. std::string new_query; bool modified = false; if (parameter_filter_) { url::Component query = url.parsed_for_possibly_invalid_spec().query; url::Component key, value; const char* url_spec = url.spec().c_str(); while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { std::string key_str(url_spec, key.begin, key.len); std::string value_str(url_spec, value.begin, value.len); if (parameter_filter_->KeepParameter(key_str, value_str)) { AppendParamToQuery(key_str, value_str, &new_query); } else { modified = true; } } } if (!modified) new_query = url.query(); // Add the extra parameters if any. if (!extra_params_.empty()) { modified = true; for (std::vector::const_iterator iter(extra_params_.begin()); iter != extra_params_.end(); ++iter) AppendParamToQuery(iter->first, iter->second, &new_query); } if (modified) { GURL::Replacements repl; repl.SetQueryStr(new_query); url = url.ReplaceComponents(repl); if (is_suggest_url_) data_.suggestions_url = url.spec(); else if (url.is_valid()) data_.SetURL(url.spec()); } } TemplateURLParsingContext::ElementType TemplateURLParsingContext::GetKnownType() { if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) return elements_[1]; // We only expect PARAM nodes under the URL node. return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN; } // TemplateURLParser ---------------------------------------------------------- // static TemplateURL* TemplateURLParser::Parse( const SearchTermsData& search_terms_data, bool show_in_default_list, const char* data, size_t length, TemplateURLParser::ParameterFilter* param_filter) { // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to // & . Unfortunately xmlSubstituteEntitiesDefault affects global state. // If this becomes problematic we'll need to provide our own entity // type for &, or strip out & by hand after parsing. int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); TemplateURLParsingContext context(param_filter); xmlSAXHandler sax_handler; memset(&sax_handler, 0, sizeof(sax_handler)); sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl; sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl; sax_handler.characters = &TemplateURLParsingContext::CharactersImpl; int error = xmlSAXUserParseMemory(&sax_handler, &context, data, static_cast(length)); xmlSubstituteEntitiesDefault(last_sub_entities_value); return error ? NULL : context.GetTemplateURL(search_terms_data, show_in_default_list); }