diff options
Diffstat (limited to 'chrome/browser/search_engines/template_url_parser.cc')
-rw-r--r-- | chrome/browser/search_engines/template_url_parser.cc | 586 |
1 files changed, 586 insertions, 0 deletions
diff --git a/chrome/browser/search_engines/template_url_parser.cc b/chrome/browser/search_engines/template_url_parser.cc new file mode 100644 index 0000000..c3d6c7e --- /dev/null +++ b/chrome/browser/search_engines/template_url_parser.cc @@ -0,0 +1,586 @@ +// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/browser/search_engines/template_url_parser.h" + +#include <map> +#include <vector> + +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "base/string_util.h" +#include "chrome/browser/search_engines/template_url.h" +#include "googleurl/src/gurl.h" +#include "libxml/parser.h" +#include "libxml/xmlwriter.h" + +namespace { + +// +// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds +// to that of char, the following names are all in terms of char. This avoids +// having to convert to wide, then do comparisons + +// Defines for element names of the OSD document: +static const char kURLElement[] = "Url"; +static const char kParamElement[] = "Param"; +static const char kShortNameElement[] = "ShortName"; +static const char kDescriptionElement[] = "Description"; +static const char kImageElement[] = "Image"; +static const char kOpenSearchDescriptionElement[] = "OpenSearchDescription"; +static const char kFirefoxSearchDescriptionElement[] = "SearchPlugin"; +static const char kLanguageElement[] = "Language"; +static const char kInputEncodingElement[] = "InputEncoding"; + +// Various XML attributes used. +static const char kURLTypeAttribute[] = "type"; +static const char kURLTemplateAttribute[] = "template"; +static const char kImageTypeAttribute[] = "type"; +static const char kImageWidthAttribute[] = "width"; +static const char kImageHeightAttribute[] = "height"; +static const char kURLIndexOffsetAttribute[] = "indexOffset"; +static const char kURLPageOffsetAttribute[] = "pageOffset"; +static const char kParamNameAttribute[] = "name"; +static const char kParamValueAttribute[] = "value"; +static const char kParamMethodAttribute[] = "method"; + +// Mime type for search results. +static const char kHTMLType[] = "text/html"; + +// Mime type for as you type suggestions. +static const char kSuggestionType[] = "application/x-suggestions+json"; + +// Namespace identifier. +static const char kOSDNS[] = "xmlns"; + +// The namespace for documents we understand. +static const char kNameSpace[] = "http://a9.com/-/spec/opensearch/1.1/"; + +// Removes the namespace from the specified |name|, ex: os:Url -> Url. +static void PruneNamespace(std::string* name) { + size_t index = name->find_first_of(":"); + if (index != std::string::npos) + name->erase(0, index + 1); +} + +// +// To minimize memory overhead while parsing, a SAX style parser is used. +// ParsingContext is used to maintain the state we're in the document +// while parsing. +class ParsingContext { + public: + // Enum of the known element types. + enum ElementType { + UNKNOWN, + OPEN_SEARCH_DESCRIPTION, + URL, + PARAM, + SHORT_NAME, + DESCRIPTION, + IMAGE, + LANGUAGE, + INPUT_ENCODING, + }; + + enum Method { + GET, + POST + }; + + // Key/value of a Param node. + typedef std::pair<std::string, std::string> Param; + + ParsingContext(TemplateURLParser::ParameterFilter* parameter_filter, + TemplateURL* url) + : url_(url), + parameter_filter_(parameter_filter), + method_(GET), + suggestion_method_(GET), + is_suggest_url_(false), + derive_image_from_url_(false) { + if (kElementNameToElementTypeMap == NULL) + InitMapping(); + } + + // Invoked when an element starts. + void PushElement(const std::string& element) { + ElementType type; + if (kElementNameToElementTypeMap->find(element) == + kElementNameToElementTypeMap->end()) { + type = UNKNOWN; + } else { + type = (*kElementNameToElementTypeMap)[element]; + } + elements_.push_back(type); + } + + void PopElement() { + elements_.pop_back(); + } + + // Returns the current ElementType. + ElementType GetKnownType() { + if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION) + return elements_[1]; + + // We only expect PARAM nodes under the Url node + if (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION && + elements_[1] == URL && elements_[2] == PARAM) + return PARAM; + + return UNKNOWN; + } + + TemplateURL* template_url() { return url_; } + + void AddImageRef(const std::wstring& type, int width, int height) { + if (width > 0 && height > 0) + current_image_.reset(new TemplateURL::ImageRef(type, width, height)); + } + + void EndImage() { + current_image_.reset(); + } + + void SetImageURL(const std::wstring& url) { + if (current_image_.get()) { + current_image_->url = GURL(WideToUTF8(url)); + url_->add_image_ref(*current_image_); + current_image_.reset(); + } + } + + void ResetString() { + string_.clear(); + } + + void AppendString(const std::wstring& string) { + string_ += string; + } + + const std::wstring& GetString() { + return string_; + } + + void ResetExtraParams() { + extra_params_.clear(); + } + + void AddExtraParams(const std::string& key, const std::string& value) { + if (parameter_filter_ && !parameter_filter_->KeepParameter(key, value)) + return; + extra_params_.push_back(Param(key, value)); + } + + const std::vector<Param>& extra_params() const { return extra_params_; } + + void set_is_suggestion(bool value) { is_suggest_url_ = value; } + bool is_suggestion() const { return is_suggest_url_; } + + TemplateURLParser::ParameterFilter* parameter_filter() const { + return parameter_filter_; + } + + void set_derive_image_from_url(bool derive_image_from_url) { + derive_image_from_url_ = derive_image_from_url; + } + + void set_method(Method method) { method_ = method; } + Method method() { return method_; } + + void set_suggestion_method(Method method) { suggestion_method_ = method; } + Method suggestion_method() { return suggestion_method_; } + + // Builds the image URL from the Template search URL if no image URL has been + // set. + void DeriveImageFromURL() { + if (derive_image_from_url_ && + url_->GetFavIconURL().is_empty() && url_->url()) { + GURL url(WideToUTF8(url_->url()->url())); // More url's please... + url_->SetFavIconURL(TemplateURL::GenerateFaviconURL(url)); + } + } + + private: + static void InitMapping() { + kElementNameToElementTypeMap = new std::map<std::string,ElementType>; + (*kElementNameToElementTypeMap)[kURLElement] = URL; + (*kElementNameToElementTypeMap)[kParamElement] = PARAM; + (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME; + (*kElementNameToElementTypeMap)[kDescriptionElement] = DESCRIPTION; + (*kElementNameToElementTypeMap)[kImageElement] = IMAGE; + (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] = + OPEN_SEARCH_DESCRIPTION; + (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] = + OPEN_SEARCH_DESCRIPTION; + (*kElementNameToElementTypeMap)[kLanguageElement] = + LANGUAGE; + (*kElementNameToElementTypeMap)[kInputEncodingElement] = + INPUT_ENCODING; + } + + // Key is UTF8 encoded. + static std::map<std::string,ElementType>* kElementNameToElementTypeMap; + // TemplateURL supplied to Read method. It's owned by the caller, so we + // don't need to free it. + TemplateURL* url_; + std::vector<ElementType> elements_; + scoped_ptr<TemplateURL::ImageRef> current_image_; + + // Character content for the current element. + std::wstring string_; + + TemplateURLParser::ParameterFilter* parameter_filter_; + + // The list of parameters parsed in the Param nodes of a Url node. + std::vector<Param> extra_params_; + + // The HTTP methods used. + Method method_; + Method suggestion_method_; + + // If true, we are currently parsing a suggest URL, otherwise it is an HTML + // search. Note that we don't need a stack as Url nodes cannot be nested. + bool is_suggest_url_; + + // Whether we should derive the image from the URL (when images are data + // URLs). + bool derive_image_from_url_; + + DISALLOW_EVIL_CONSTRUCTORS(ParsingContext); +}; + +//static +std::map<std::string,ParsingContext::ElementType>* + ParsingContext::kElementNameToElementTypeMap = NULL; + +std::wstring XMLCharToWide(const xmlChar* value) { + return UTF8ToWide(std::string((const char*)value)); +} + +std::wstring XMLCharToWide(const xmlChar* value, int length) { + return UTF8ToWide(std::string((const char*)value, length)); +} + +std::string XMLCharToString(const xmlChar* value) { + return std::string((const char*)value); +} + +// Returns true if input_encoding contains a valid input encoding string. This +// doesn't verify that we have a valid encoding for the string, just that the +// string contains characters that constitute a valid input encoding. +bool IsValidEncodingString(const std::string& input_encoding) { + if (input_encoding.empty()) + return false; + + if (!IsAsciiAlpha(input_encoding[0])) + return false; + + for (size_t i = 1, max = input_encoding.size(); i < max; ++i) { + char c = input_encoding[i]; + if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' && + c != '-') { + return false; + } + } + return true; +} + +void ParseURL(const xmlChar** atts, ParsingContext* context) { + if (!atts) + return; + + TemplateURL* turl = context->template_url(); + const xmlChar** attributes = atts; + std::wstring template_url; + bool is_post = false; + bool is_html_url = false; + bool is_suggest_url = false; + int index_offset = 1; + int page_offset = 1; + + while (*attributes) { + std::string name(XMLCharToString(*attributes)); + const xmlChar* value = attributes[1]; + if (name == kURLTypeAttribute) { + std::string type = XMLCharToString(value); + is_html_url = (type == kHTMLType); + is_suggest_url = (type == kSuggestionType); + } else if (name == kURLTemplateAttribute) { + template_url = XMLCharToWide(value); + } else if (name == kURLIndexOffsetAttribute) { + index_offset = std::max(1, StringToInt(XMLCharToWide(value))); + } else if (name == kURLPageOffsetAttribute) { + page_offset = std::max(1, StringToInt(XMLCharToWide(value))); + } else if (name == kParamMethodAttribute) { + is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post"); + } + attributes += 2; + } + if (is_html_url) { + turl->SetURL(template_url, index_offset, page_offset); + context->set_is_suggestion(false); + if (is_post) + context->set_method(ParsingContext::POST); + } else if (is_suggest_url) { + turl->SetSuggestionsURL(template_url, index_offset, page_offset); + context->set_is_suggestion(true); + if (is_post) + context->set_suggestion_method(ParsingContext::POST); + } +} + +void ParseImage(const xmlChar** atts, ParsingContext* context) { + if (!atts) + return; + + const xmlChar** attributes = atts; + int width = 0; + int height = 0; + std::wstring type; + while (*attributes) { + std::string name(XMLCharToString(*attributes)); + const xmlChar* value = attributes[1]; + if (name == kImageTypeAttribute) { + type = XMLCharToWide(value); + } else if (name == kImageWidthAttribute) { + width = StringToInt(XMLCharToWide(value)); + } else if (name == kImageHeightAttribute) { + height = StringToInt(XMLCharToWide(value)); + } + attributes += 2; + } + if (width > 0 && height > 0 && !type.empty()) { + // Valid Image URL. + context->AddImageRef(type, width, height); + } +} + +void ParseParam(const xmlChar** atts, ParsingContext* context) { + if (!atts) + return; + + const xmlChar** attributes = atts; + std::wstring type; + std::string key, value; + while (*attributes) { + std::string name(XMLCharToString(*attributes)); + const xmlChar* val = attributes[1]; + if (name == kParamNameAttribute) { + key = XMLCharToString(val); + } else if (name == kParamValueAttribute) { + value = XMLCharToString(val); + } + attributes += 2; + } + if (!key.empty()) + context->AddExtraParams(key, value); +} + +static void AppendParamToQuery(const std::string& key, + const std::string& value, + std::string* query) { + if (!query->empty()) + query->append("&"); + if (!key.empty()) { + query->append(key); + query->append("="); + } + query->append(value); +} + +void ProcessURLParams(ParsingContext* context) { + TemplateURL* t_url = context->template_url(); + const TemplateURLRef* t_url_ref = + context->is_suggestion() ? t_url->suggestions_url() : + t_url->url(); + if (!t_url_ref) + return; + + if (!context->parameter_filter() && context->extra_params().empty()) + return; + + GURL url(WideToUTF8(t_url_ref->url())); + // If there is a parameter filter, parse the existing URL and remove any + // unwanted parameter. + TemplateURLParser::ParameterFilter* filter = context->parameter_filter(); + std::string new_query; + bool modified = false; + if (filter) { + url_parse::Component query = url.parsed_for_possibly_invalid_spec().query; + url_parse::Component key, value; + const char* url_spec = url.spec().c_str(); + while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) { + std::string key_str(url_spec, key.begin, key.len); + std::string value_str(url_spec, value.begin, value.len); + if (filter->KeepParameter(key_str, value_str)) { + AppendParamToQuery(key_str, value_str, &new_query); + } else { + modified = true; + } + } + } + if (!modified) + new_query = url.query(); + + // Add the extra parameters if any. + const std::vector<ParsingContext::Param>& params = context->extra_params(); + if (!params.empty()) { + modified = true; + std::vector<ParsingContext::Param>::const_iterator iter; + for (iter = params.begin(); iter != params.end(); ++iter) + AppendParamToQuery(iter->first, iter->second, &new_query); + } + + if (modified) { + GURL::Replacements repl; + repl.SetQueryStr(new_query); + url = url.ReplaceComponents(repl); + if (context->is_suggestion()) { + t_url->SetSuggestionsURL(UTF8ToWide(url.spec()), + t_url_ref->index_offset(), + t_url_ref->page_offset()); + } else { + t_url->SetURL(UTF8ToWide(url.spec()), + t_url_ref->index_offset(), + t_url_ref->page_offset()); + } + } +} + +void StartElementImpl(void *ctx, const xmlChar *name, const xmlChar **atts) { + ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); + std::string node_name((const char*)name); + PruneNamespace(&node_name); + context->PushElement(node_name); + switch (context->GetKnownType()) { + case ParsingContext::URL: + context->ResetExtraParams(); + ParseURL(atts, context); + break; + case ParsingContext::IMAGE: + ParseImage(atts, context); + break; + case ParsingContext::PARAM: + ParseParam(atts, context); + break; + default: + break; + } + context->ResetString(); +} + +void EndElementImpl(void *ctx, const xmlChar *name) { + ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); + switch (context->GetKnownType()) { + case ParsingContext::SHORT_NAME: + context->template_url()->set_short_name(context->GetString()); + break; + case ParsingContext::DESCRIPTION: + context->template_url()->set_description(context->GetString()); + break; + case ParsingContext::IMAGE: { + GURL image_url(WideToUTF8(context->GetString())); + if (image_url.SchemeIs("data")) { + // TODO (jcampan): bug 1169256: when dealing with data URL, we need to + // decode the data URL in the renderer. For now, we'll just point to the + // fav icon from the URL. + context->set_derive_image_from_url(true); + } else { + context->SetImageURL(context->GetString()); + } + context->EndImage(); + break; + } + case ParsingContext::LANGUAGE: + context->template_url()->add_language(context->GetString()); + break; + case ParsingContext::INPUT_ENCODING: { + std::string input_encoding = WideToASCII(context->GetString()); + if (IsValidEncodingString(input_encoding)) + context->template_url()->add_input_encoding(input_encoding); + break; + } + case ParsingContext::URL: + ProcessURLParams(context); + break; + default: + break; + } + context->ResetString(); + context->PopElement(); +} + +void CharactersImpl(void *ctx, const xmlChar *ch, int len) { + ParsingContext* context = reinterpret_cast<ParsingContext*>(ctx); + context->AppendString(XMLCharToWide(ch, len)); +} + +// Returns true if the ref is null, or the url wrapped by ref is +// valid with a spec of http/https. +bool IsHTTPRef(const TemplateURLRef* ref) { + if (ref == NULL) + return true; + GURL url(WideToUTF8(ref->url())); + return (url.is_valid() && (url.SchemeIs("http") || url.SchemeIs("https"))); +} + +// Returns true if the TemplateURL is legal. A legal TemplateURL is one +// where all URLs have a spec of http/https. +bool IsLegal(TemplateURL* url) { + if (!IsHTTPRef(url->url()) || !IsHTTPRef(url->suggestions_url())) + return false; + // Make sure all the image refs are legal. + const std::vector<TemplateURL::ImageRef>& image_refs = url->image_refs(); + for (size_t i = 0; i < image_refs.size(); i++) { + GURL image_url(image_refs[i].url); + if (!image_url.is_valid() || + !(image_url.SchemeIs("http") || image_url.SchemeIs("https"))) { + return false; + } + } + return true; +} + +} // namespace + +// static +bool TemplateURLParser::Parse(const unsigned char* data, size_t length, + TemplateURLParser::ParameterFilter* param_filter, + TemplateURL* url) { + DCHECK(url); + // xmlSubstituteEntitiesDefault(1) makes it so that & isn't mapped to + // & . Unfortunately xmlSubstituteEntitiesDefault effects global state. + // If this becomes problematic we'll need to provide our own entity + // type for &, or strip out " by hand after parsing. + int last_sub_entities_value = xmlSubstituteEntitiesDefault(1); + ParsingContext context(param_filter, url); + xmlSAXHandler sax_handler; + memset(&sax_handler, 0, sizeof(sax_handler)); + sax_handler.startElement = &StartElementImpl; + sax_handler.endElement = &EndElementImpl; + sax_handler.characters = &CharactersImpl; + xmlSAXUserParseMemory(&sax_handler, &context, + reinterpret_cast<const char*>(data), + static_cast<int>(length)); + xmlSubstituteEntitiesDefault(last_sub_entities_value); + // If the image was a data URL, use the favicon from the search URL instead. + // (see TODO inEndElementImpl()). + context.DeriveImageFromURL(); + + // TODO(jcampan): http://b/issue?id=1196285 we do not support search engines + // that use POST yet. + if (context.method() == ParsingContext::POST) + return false; + if (context.suggestion_method() == ParsingContext::POST) + url->SetSuggestionsURL(L"", 0, 0); + + if (!url->short_name().empty() && !url->description().empty()) { + // So far so good, make sure the urls are http. + return IsLegal(url); + } + return false; +} + + |