// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// How we handle the base tag better.
// Current status:
// Currently, the normal way we handle the base tag is:
// a) For those links which have corresponding local saved files, such as
// savable CSS, JavaScript files, they will be written to relative URLs which
// point to local saved file. Why those links can not be resolved as absolute
// file URLs, because if they are resolved as absolute URLs, after moving the
// file location from one directory to another directory, the file URLs will
// be dead links.
// b) For those links which have not corresponding local saved files, such as
// links in A, AREA tags, they will be resolved as absolute URLs.
// c) We comment out all base tags when serializing DOM for the page.
// FireFox also uses above way to handle base tag.
//
// Problem:
// This way can not handle the following situation:
// the base tag is written by JavaScript.
// For example, the page "www.yahoo.com" uses
// "document.write('<base href="http://www.yahoo.com/">')" to insert a base
// tag to DOM, so all URLs which point to
// local saved resource files will be resolved as
// "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource
// files can not be loaded correctly. Also the page will be rendered ugly since
// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame
// files can not be fetched.
// Now FireFox, IE and WebKit based Browser all have this problem.
//
// Solution:
// My solution is that we comment old base tag and write new base tag:
// <BASE href="."> after the previous commented base tag. In WebKit, it
// always uses the latest "href" attribute of base tag to set document's base
// URL. Based on this behavior, when we encounter a base tag, we comment it and
// write a new base tag after the previous commented base tag.
// The new added base tag can help engine to locate correct base URL for
// correctly loading local saved resource files. Also I think we need to inherit
// the base target value from document object when appending new base tag.
// If there are multiple base tags in original document, we will comment all old
// base tags and append new base tag after each old base tag because we do not
// know those old base tags are original content or added by JavaScript. If
// they are added by JavaScript, it means when loading saved page, the script(s)
// will still insert base tag(s) to DOM, so the new added base tag(s) can
// override the incorrect base URL and make sure we always load correct local
// saved resource files.
#include "config.h"
#include "base/compiler_specific.h"
MSVC_PUSH_WARNING_LEVEL(0);
#include "DocumentType.h"
#include "FrameLoader.h"
#include "Document.h"
#include "Element.h"
#include "HTMLCollection.h"
#include "HTMLElement.h"
#include "HTMLFormElement.h"
#include "HTMLMetaElement.h"
#include "HTMLNames.h"
#include "KURL.h"
#include "markup.h"
#include "PlatformString.h"
#include "TextEncoding.h"
MSVC_POP_WARNING();
#undef LOG
#include "webkit/glue/dom_serializer.h"
#include "base/string_util.h"
#include "webkit/glue/dom_operations.h"
#include "webkit/glue/dom_serializer_delegate.h"
#include "webkit/glue/entity_map.h"
#include "webkit/glue/glue_util.h"
#include "webkit/glue/webframe_impl.h"
namespace {
// Default "mark of the web" declaration
static const char* const kDefaultMarkOfTheWeb =
"\n\n";
// Default meat content for writing correct charset declaration.
static const wchar_t* const kDefaultMetaContent =
L"";
// Notation of start comment.
static const wchar_t* const kStartCommentNotation = L"";
// Default XML declaration.
static const wchar_t* const kXMLDeclaration =
L"\n";
// Default base tag declaration
static const wchar_t* const kBaseTagDeclaration =
L"";
static const wchar_t* const kBaseTargetDeclaration =
L" target=\"%ls\"";
// Maximum length of data buffer which is used to temporary save generated
// html content data.
static const int kHtmlContentBufferLength = 65536;
// Check whether specified unicode has corresponding html/xml entity name.
// If yes, replace the character with the returned entity notation, if not
// then still use original character.
void ConvertCorrespondingSymbolToEntity(WebCore::String* result,
const WebCore::String& value,
bool in_html_doc) {
unsigned len = value.length();
const UChar* start_pos = value.characters();
const UChar* cur_pos = start_pos;
while (len--) {
const char* entity_name =
webkit_glue::EntityMap::GetEntityNameByCode(*cur_pos, in_html_doc);
if (entity_name) {
// Append content before entity code.
if (cur_pos > start_pos)
result->append(start_pos, cur_pos - start_pos);
result->append("&");
result->append(entity_name);
result->append(";");
start_pos = ++cur_pos;
} else {
cur_pos++;
}
}
// Append the remaining content.
if (cur_pos > start_pos)
result->append(start_pos, cur_pos - start_pos);
}
} // namespace
namespace webkit_glue {
// SerializeDomParam Constructor.
DomSerializer::SerializeDomParam::SerializeDomParam(
const GURL& current_frame_gurl,
const WebCore::TextEncoding& text_encoding,
WebCore::Document* doc,
const std::wstring& directory_name)
: current_frame_gurl(current_frame_gurl),
text_encoding(text_encoding),
doc(doc),
directory_name(directory_name),
has_doctype(false),
has_checked_meta(false),
skip_meta_element(NULL),
is_in_script_or_style_tag(false),
has_doc_declaration(false) {
// Cache the value since we check it lots of times.
is_html_document = doc->isHTMLDocument();
}
// Static.
std::string DomSerializer::GenerateMarkOfTheWebDeclaration(
const GURL& url) {
return StringPrintf(kDefaultMarkOfTheWeb,
url.spec().size(), url.spec().c_str());
}
// Static.
std::wstring DomSerializer::GenerateBaseTagDeclaration(
const std::wstring& base_target) {
std::wstring target_declaration = base_target.empty() ? L"" :
StringPrintf(kBaseTargetDeclaration, base_target.c_str());
return StringPrintf(kBaseTagDeclaration, target_declaration.c_str());
}
WebCore::String DomSerializer::PreActionBeforeSerializeOpenTag(
const WebCore::Element* element, SerializeDomParam* param,
bool* need_skip) {
WebCore::String result;
*need_skip = false;
if (param->is_html_document) {
// Skip the open tag of original META tag which declare charset since we
// have overrided the META which have correct charset declaration after
// serializing open tag of HEAD element.
if (element->hasTagName(WebCore::HTMLNames::metaTag)) {
const WebCore::HTMLMetaElement* meta =
static_cast(element);
// Check whether the META tag has declared charset or not.
WebCore::String equiv = meta->httpEquiv();
if (equalIgnoringCase(equiv, "content-type")) {
WebCore::String content = meta->content();
if (content.length() && content.contains("charset", false)) {
// Find META tag declared charset, we need to skip it when
// serializing DOM.
param->skip_meta_element = element;
*need_skip = true;
}
}
} else if (element->hasTagName(WebCore::HTMLNames::htmlTag)) {
// Check something before processing the open tag of HEAD element.
// First we add doc type declaration if original doc has it.
if (!param->has_doctype) {
param->has_doctype = true;
result += createMarkup(param->doc->doctype());
}
// Add MOTW declaration before html tag.
// See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
result += StdStringToString(GenerateMarkOfTheWebDeclaration(
param->current_frame_gurl));
} else if (element->hasTagName(WebCore::HTMLNames::baseTag)) {
// Comment the BASE tag when serializing dom.
result += StdWStringToString(kStartCommentNotation);
}
} else {
// Write XML declaration.
if (!param->has_doc_declaration) {
param->has_doc_declaration = true;
// Get encoding info.
WebCore::String xml_encoding = param->doc->xmlEncoding();
if (xml_encoding.isEmpty())
xml_encoding = param->doc->frame()->loader()->encoding();
if (xml_encoding.isEmpty())
xml_encoding = WebCore::UTF8Encoding().name();
std::wstring str_xml_declaration =
StringPrintf(kXMLDeclaration,
StringToStdWString(param->doc->xmlVersion()).c_str(),
StringToStdWString(xml_encoding).c_str(),
param->doc->xmlStandalone() ? L" standalone=\"yes\"" :
L"");
result += StdWStringToString(str_xml_declaration);
}
// Add doc type declaration if original doc has it.
if (!param->has_doctype) {
param->has_doctype = true;
result += createMarkup(param->doc->doctype());
}
}
return result;
}
WebCore::String DomSerializer::PostActionAfterSerializeOpenTag(
const WebCore::Element* element, SerializeDomParam* param) {
WebCore::String result;
if (!param->is_html_document)
return result;
// Check after processing the open tag of HEAD element
if (!param->has_checked_meta &&
element->hasTagName(WebCore::HTMLNames::headTag)) {
param->has_checked_meta = true;
// Check meta element. WebKit only pre-parse the first 512 bytes
// of the document. If the whole is larger and meta is the
// end of head part, then this kind of pages aren't decoded correctly
// because of this issue. So when we serialize the DOM, we need to
// make sure the meta will in first child of head tag.
// See http://bugs.webkit.org/show_bug.cgi?id=16621.
// First we generate new content for writing correct META element.
std::wstring str_meta =
StringPrintf(kDefaultMetaContent,
ASCIIToWide(param->text_encoding.name()).c_str());
result += StdWStringToString(str_meta);
// Will search each META which has charset declaration, and skip them all
// in PreActionBeforeSerializeOpenTag.
} else if (element->hasTagName(WebCore::HTMLNames::scriptTag) ||
element->hasTagName(WebCore::HTMLNames::styleTag)) {
param->is_in_script_or_style_tag = true;
}
return result;
}
WebCore::String DomSerializer::PreActionBeforeSerializeEndTag(
const WebCore::Element* element, SerializeDomParam* param,
bool* need_skip) {
WebCore::String result;
*need_skip = false;
if (!param->is_html_document)
return result;
// Skip the end tag of original META tag which declare charset.
// Need not to check whether it's META tag since we guarantee
// skip_meta_element is definitely META tag if it's not NULL.
if (param->skip_meta_element == element) {
*need_skip = true;
} else if (element->hasTagName(WebCore::HTMLNames::scriptTag) ||
element->hasTagName(WebCore::HTMLNames::styleTag)) {
DCHECK(param->is_in_script_or_style_tag);
param->is_in_script_or_style_tag = false;
}
return result;
}
// After we finish serializing end tag of a element, we give the target
// element a chance to do some post work to add some additional data.
WebCore::String DomSerializer::PostActionAfterSerializeEndTag(
const WebCore::Element* element, SerializeDomParam* param) {
WebCore::String result;
if (!param->is_html_document)
return result;
// Comment the BASE tag when serializing DOM.
if (element->hasTagName(WebCore::HTMLNames::baseTag)) {
result += StdWStringToString(kEndCommentNotation);
// Append a new base tag declaration.
result += StdWStringToString(GenerateBaseTagDeclaration(
webkit_glue::StringToStdWString(param->doc->baseTarget())));
}
return result;
}
void DomSerializer::SaveHtmlContentToBuffer(const WebCore::String& result,
SerializeDomParam* param) {
if (!result.length())
return;
// Convert the unicode content to target encoding
WebCore::CString encoding_result = param->text_encoding.encode(
result.characters(), result.length(), WebCore::EntitiesForUnencodables);
// if the data buffer will be full, then send it out first.
if (encoding_result.length() + data_buffer_.size() >
data_buffer_.capacity()) {
// Send data to delegate, tell it now we are serializing current frame.
delegate_->DidSerializeDataForFrame(param->current_frame_gurl,
data_buffer_, DomSerializerDelegate::CURRENT_FRAME_IS_NOT_FINISHED);
data_buffer_.clear();
}
// Append result to data buffer.
data_buffer_.append(CStringToStdString(encoding_result));
}
void DomSerializer::OpenTagToString(const WebCore::Element* element,
SerializeDomParam* param) {
bool need_skip;
// Do pre action for open tag.
WebCore::String result = PreActionBeforeSerializeOpenTag(element,
param,
&need_skip);
if (need_skip)
return;
// Add open tag
result += "<" + element->nodeName();
// Go through all attributes and serialize them.
const WebCore::NamedAttrMap *attrMap = element->attributes(true);
if (attrMap) {
unsigned numAttrs = attrMap->length();
for (unsigned i = 0; i < numAttrs; i++) {
result += " ";
// Add attribute pair
const WebCore::Attribute *attribute = attrMap->attributeItem(i);
result += attribute->name().toString();
result += "=\"";
if (!attribute->value().isEmpty()) {
// Check whether we need to replace some resource links
// with local resource paths.
const WebCore::QualifiedName& attr_name = attribute->name();
// Check whether need to change the attribute which has link
bool need_replace_link =
ElementHasLegalLinkAttribute(element, attr_name);
if (need_replace_link) {
// First, get the absolute link
const WebCore::String& attr_value = attribute->value();
// For links start with "javascript:", we do not change it.
if (attr_value.startsWith("javascript:", false)) {
result += attr_value;
} else {
WebCore::String str_value = param->doc->completeURL(attr_value);
std::string value(StringToStdString(str_value));
// Check whether we local files for those link.
LinkLocalPathMap::const_iterator it = local_links_.find(value);
if (it != local_links_.end()) {
// Replace the link when we have local files.
result += StdWStringToString(param->directory_name);
result += StdWStringToString(it->second);
} else {
// If not found local path, replace it with absolute link.
result += str_value;
}
}
} else {
ConvertCorrespondingSymbolToEntity(&result, attribute->value(),
param->is_html_document);
}
}
result += "\"";
}
}
// Complete the open tag for element when it has child/children.
if (element->hasChildNodes())
result += ">";
// Do post action for open tag.
result += PostActionAfterSerializeOpenTag(element, param);
// Save the result to data buffer.
SaveHtmlContentToBuffer(result, param);
}
// Serialize end tag of an specified element.
void DomSerializer::EndTagToString(const WebCore::Element* element,
SerializeDomParam* param) {
bool need_skip;
// Do pre action for end tag.
WebCore::String result = PreActionBeforeSerializeEndTag(element,
param,
&need_skip);
if (need_skip)
return;
// Write end tag when element has child/children.
if (element->hasChildNodes()) {
result += "";
result += element->nodeName();
result += ">";
} else {
// Check whether we have to write end tag for empty element.
if (param->is_html_document) {
result += ">";
const WebCore::HTMLElement* html_element =
static_cast(element);
if (html_element->endTagRequirement() == WebCore::TagStatusRequired) {
// We need to write end tag when it is required.
result += "";
result += html_element->nodeName();
result += ">";
}
} else {
// For xml base document.
result += " />";
}
}
// Do post action for end tag.
result += PostActionAfterSerializeEndTag(element, param);
// Save the result to data buffer.
SaveHtmlContentToBuffer(result, param);
}
void DomSerializer::BuildContentForNode(const WebCore::Node* node,
SerializeDomParam* param) {
switch (node->nodeType()) {
case WebCore::Node::ELEMENT_NODE: {
// Process open tag of element.
OpenTagToString(static_cast(node), param);
// Walk through the children nodes and process it.
for (const WebCore::Node *child = node->firstChild(); child != NULL;
child = child->nextSibling())
BuildContentForNode(child, param);
// Process end tag of element.
EndTagToString(static_cast(node), param);
break;
}
case WebCore::Node::TEXT_NODE: {
SaveHtmlContentToBuffer(createMarkup(node), param);
break;
}
case WebCore::Node::ATTRIBUTE_NODE:
case WebCore::Node::DOCUMENT_NODE:
case WebCore::Node::DOCUMENT_FRAGMENT_NODE: {
// Should not exist.
DCHECK(false);
break;
}
// Document type node can be in DOM?
case WebCore::Node::DOCUMENT_TYPE_NODE:
param->has_doctype = true;
default: {
// For other type node, call default action.
SaveHtmlContentToBuffer(createMarkup(node), param);
break;
}
}
}
DomSerializer::DomSerializer(WebFrame* webframe,
bool recursive_serialization,
DomSerializerDelegate* delegate,
const std::vector& links,
const std::vector& local_paths,
const std::wstring& local_directory_name)
: delegate_(delegate),
recursive_serialization_(recursive_serialization),
frames_collected_(false),
local_directory_name_(local_directory_name) {
// Must specify available webframe.
DCHECK(webframe);
specified_webframeimpl_ = static_cast(webframe);
// Make sure we have not-NULL delegate.
DCHECK(delegate);
// Build local resources map.
DCHECK(links.size() == local_paths.size());
std::vector::const_iterator link_it = links.begin();
std::vector::const_iterator path_it = local_paths.begin();
for (; link_it != links.end(); ++link_it, ++path_it) {
bool never_present = local_links_.insert(
LinkLocalPathMap::value_type(link_it->spec(), *path_it)).
second;
DCHECK(never_present);
}
// Init data buffer.
data_buffer_.reserve(kHtmlContentBufferLength);
DCHECK(data_buffer_.empty());
}
void DomSerializer::CollectTargetFrames() {
DCHECK(!frames_collected_);
frames_collected_ = true;
// First, process main frame.
frames_.push_back(specified_webframeimpl_);
// Return now if user only needs to serialize specified frame, not including
// all sub-frames.
if (!recursive_serialization_)
return;
// Collect all frames inside the specified frame.
for (int i = 0; i < static_cast(frames_.size()); ++i) {
WebFrameImpl* current_frame = frames_[i];
// Get current using document.
WebCore::Document* current_doc = current_frame->frame()->document();
// Go through sub-frames.
RefPtr all = current_doc->all();
for (WebCore::Node* node = all->firstItem(); node != NULL;
node = all->nextItem()) {
if (!node->isHTMLElement())
continue;
WebCore::Element* element = static_cast(node);
// Check frame tag and iframe tag.
bool is_frame_element;
WebFrameImpl* web_frame = GetWebFrameImplFromElement(
element, &is_frame_element);
if (is_frame_element && web_frame)
frames_.push_back(web_frame);
}
}
}
bool DomSerializer::SerializeDom() {
// Collect target frames.
if (!frames_collected_)
CollectTargetFrames();
bool did_serialization = false;
// Get GURL for main frame.
GURL main_page_gurl(KURLToGURL(
specified_webframeimpl_->frame()->loader()->url()));
// Go through all frames for serializing DOM for whole page, include
// sub-frames.
for (int i = 0; i < static_cast(frames_.size()); ++i) {
// Get current serializing frame.
WebFrameImpl* current_frame = frames_[i];
// Get current using document.
WebCore::Document* current_doc = current_frame->frame()->document();
// Get current frame's URL.
const WebCore::KURL& current_frame_kurl =
current_frame->frame()->loader()->url();
GURL current_frame_gurl(KURLToGURL(current_frame_kurl));
// Check whether we have done this document.
if (local_links_.find(current_frame_gurl.spec()) != local_links_.end()) {
// A new document, we will serialize it.
did_serialization = true;
// Get target encoding for current document.
WebCore::String encoding = current_frame->frame()->loader()->encoding();
// Create the text encoding object with target encoding.
WebCore::TextEncoding text_encoding(encoding);
// Construct serialize parameter for late processing document.
SerializeDomParam param(
current_frame_gurl,
encoding.length() ? text_encoding : WebCore::UTF8Encoding(),
current_doc,
current_frame_gurl == main_page_gurl ?
local_directory_name_ : L"./");
// Process current document.
WebCore::Element* root_element = current_doc->documentElement();
if (root_element)
BuildContentForNode(root_element, ¶m);
// Sink the remainder data and finish serializing current frame.
delegate_->DidSerializeDataForFrame(current_frame_gurl, data_buffer_,
DomSerializerDelegate::CURRENT_FRAME_IS_FINISHED);
// Clear the buffer.
data_buffer_.clear();
}
}
// We have done call frames, so we send message to embedder to tell it that
// frames are finished serializing.
DCHECK(data_buffer_.empty());
delegate_->DidSerializeDataForFrame(GURL(), data_buffer_,
DomSerializerDelegate::ALL_FRAMES_ARE_FINISHED);
return did_serialization;
}
} // namespace webkit_glue