/*
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1999,2001 Lars Knoll (knoll@kde.org)
              (C) 2000,2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/

#include "config.h"
#include "HTMLParser.h"

#include "CharacterNames.h"
#include "CSSPropertyNames.h"
#include "CSSValueKeywords.h"
#include "Comment.h"
#include "DocumentFragment.h"
#include "Frame.h"
#include "HTMLBodyElement.h"
#include "HTMLDocument.h"
#include "HTMLDivElement.h"
#include "HTMLDListElement.h"
#include "HTMLElementFactory.h"
#include "HTMLFormElement.h"
#include "HTMLHeadElement.h"
#include "HTMLHRElement.h"
#include "HTMLHtmlElement.h"
#include "HTMLIsIndexElement.h"
#include "HTMLMapElement.h"
#include "HTMLNames.h"
#include "HTMLTableCellElement.h"
#include "HTMLTableRowElement.h"
#include "HTMLTableSectionElement.h"
#include "HTMLTokenizer.h"
#include "LocalizedStrings.h"
#include "Page.h"
#include "Settings.h"
#include "Text.h"

namespace WebCore {

using namespace HTMLNames;

// Depth limits used by the parser.
static const unsigned cMaxRedundantTagDepth = 20;
static const unsigned cResidualStyleMaxDepth = 200;

// One entry of a singly linked stack of open elements. Each entry records the
// tag name, a numeric level, the associated Node and whether this entry holds
// a reference on that Node which derefNode() must give back.
struct HTMLStackElem : Noncopyable {
    HTMLStackElem(const AtomicString& name, int stackLevel, Node* stackNode, bool holdsRef, HTMLStackElem* below)
        : tagName(name)
        , level(stackLevel)
        , strayTableContent(false)
        , node(stackNode)
        , didRefNode(holdsRef)
        , next(below)
    {
    }

    // Drops the reference on |node|, but only if this entry took one.
    void derefNode()
    {
        if (!didRefNode)
            return;
        node->deref();
    }

    AtomicString tagName;     // Tag name recorded for this entry.
    int level;                // Numeric level supplied at construction.
    bool strayTableContent;   // Always starts out false; set by users of the stack.
    Node* node;               // Node associated with this entry.
    bool didRefNode;          // True when derefNode() has a reference to release.
    HTMLStackElem* next;      // Next entry in the list (0 at the end).
};

/**
 * The parser parses tokenized input into the document, building up the
 * document tree. If the document is well-formed, parsing it is straightforward.
 *
 * Unfortunately, we have to handle many HTML documents that are not well-formed,
 * so the parser has to be tolerant about errors.
 *
 * We have to take care of at least the following error conditions:
 *
 * 1. The element being added is explicitly forbidden inside some outer tag.
 *    In this case we should close all tags up to the one, which forbids
 *    the element, and add it afterwards.
 *
 * 2. We are not allowed to add the element directly. It could be that
 *    the person writing the document forgot some tag in between (or that the
 *    tag in between is optional). This could be the case with the following
 *    tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?).
 *
 * 3. We want to add a block element inside an inline element. Close all
 *    inline elements up to the next higher block element.
 *
 * 4. If this doesn't help, close elements until we are allowed to add the
 *    element or ignore the tag.
* */ HTMLParser::HTMLParser(HTMLDocument* doc, bool reportErrors) : document(doc) , current(doc) , didRefCurrent(false) , blockStack(0) , head(0) , inBody(false) , haveContent(false) , haveFrameSet(false) , m_isParsingFragment(false) , m_reportErrors(reportErrors) , m_handlingResidualStyleAcrossBlocks(false) , inStrayTableContent(0) { } HTMLParser::HTMLParser(DocumentFragment* frag) : document(frag->document()) , current(frag) , didRefCurrent(true) , blockStack(0) , head(0) , inBody(true) , haveContent(false) , haveFrameSet(false) , m_isParsingFragment(true) , m_reportErrors(false) , m_handlingResidualStyleAcrossBlocks(false) , inStrayTableContent(0) { if (frag) frag->ref(); } HTMLParser::~HTMLParser() { freeBlock(); if (didRefCurrent) current->deref(); } void HTMLParser::reset() { ASSERT(!m_isParsingFragment); setCurrent(document); freeBlock(); inBody = false; haveFrameSet = false; haveContent = false; inStrayTableContent = 0; m_currentFormElement = 0; m_currentMapElement = 0; head = 0; m_isindexElement = 0; m_skipModeTag = nullAtom; } void HTMLParser::setCurrent(Node* newCurrent) { bool didRefNewCurrent = newCurrent && newCurrent != document; if (didRefNewCurrent) newCurrent->ref(); if (didRefCurrent) current->deref(); current = newCurrent; didRefCurrent = didRefNewCurrent; } PassRefPtr HTMLParser::parseToken(Token* t) { if (!m_skipModeTag.isNull()) { if (!t->beginTag && t->tagName == m_skipModeTag) // Found the end tag for the current skip mode, so we're done skipping. m_skipModeTag = nullAtom; else if (current->localName() == t->tagName) // Do not skip . // FIXME: What does that comment mean? How can it be right to parse a token without clearing m_skipModeTag? ; else return 0; } // Apparently some sites use
instead of
. Be compatible with IE and Firefox and treat this like
. if (t->isCloseTag(brTag) && document->inCompatMode()) { reportError(MalformedBRError); t->beginTag = true; } if (!t->beginTag) { processCloseTag(t); return 0; } // Ignore spaces, if we're not inside a paragraph or other inline code. // Do not alter the text if it is parse of a scriptTag. if (t->tagName == textAtom && t->text && current->localName() != scriptTag) { if (inBody && !skipMode() && current->localName() != styleTag && current->localName() != titleTag && !t->text->containsOnlyWhitespace()) haveContent = true; RefPtr n; String text = t->text.get(); unsigned charsLeft = text.length(); while (charsLeft) { // split large blocks of text to nodes of manageable size n = Text::createWithLengthLimit(document, text, charsLeft); if (!insertNode(n.get(), t->flat)) return 0; } return n; } RefPtr n = getNode(t); // just to be sure, and to catch currently unimplemented stuff if (!n) return 0; // set attributes if (n->isHTMLElement()) { HTMLElement* e = static_cast(n.get()); e->setAttributeMap(t->attrs.get()); // take care of optional close tags if (e->endTagRequirement() == TagStatusOptional) popBlock(t->tagName); // If the node does not have a forbidden end tag requirement, and if the broken XML self-closing // syntax was used, report an error. 
if (t->brokenXMLStyle && e->endTagRequirement() != TagStatusForbidden) { if (t->tagName == scriptTag) reportError(IncorrectXMLCloseScriptWarning); else reportError(IncorrectXMLSelfCloseError, &t->tagName); } } if (!insertNode(n.get(), t->flat)) { // we couldn't insert the node if (n->isElementNode()) { Element* e = static_cast(n.get()); e->setAttributeMap(0); } if (m_currentMapElement == n) m_currentMapElement = 0; if (m_currentFormElement == n) m_currentFormElement = 0; if (head == n) head = 0; return 0; } return n; } static bool isTableSection(Node* n) { return n->hasTagName(tbodyTag) || n->hasTagName(tfootTag) || n->hasTagName(theadTag); } static bool isTablePart(Node* n) { return n->hasTagName(trTag) || n->hasTagName(tdTag) || n->hasTagName(thTag) || isTableSection(n); } static bool isTableRelated(Node* n) { return n->hasTagName(tableTag) || isTablePart(n); } bool HTMLParser::insertNode(Node* n, bool flat) { RefPtr protectNode(n); const AtomicString& localName = n->localName(); int tagPriority = n->isHTMLElement() ? static_cast(n)->tagPriority() : 0; // is never allowed inside stray table content. Always pop out of the stray table content // and close up the first table, and then start the second table as a sibling. if (inStrayTableContent && localName == tableTag) popBlock(tableTag); // let's be stupid and just try to insert it. // this should work if the document is well-formed Node* newNode = current->addChild(n); if (!newNode) return handleError(n, flat, localName, tagPriority); // Try to handle the error. // don't push elements without end tags (e.g., ) on the stack bool parentAttached = current->attached(); if (tagPriority > 0 && !flat) { if (newNode == current) { // This case should only be hit when a demoted is placed inside a table. ASSERT(localName == formTag); reportError(FormInsideTablePartError, ¤t->localName()); } else { // The pushBlock function transfers ownership of current to the block stack // so we're guaranteed that didRefCurrent is false. 
The code below is an // optimized version of setCurrent that takes advantage of that fact and also // assumes that newNode is neither 0 nor a pointer to the document. pushBlock(localName, tagPriority); ASSERT(!didRefCurrent); newNode->ref(); current = newNode; didRefCurrent = true; } if (parentAttached && !n->attached() && !m_isParsingFragment) n->attach(); } else { if (parentAttached && !n->attached() && !m_isParsingFragment) n->attach(); n->finishParsingChildren(); } return true; } bool HTMLParser::handleError(Node* n, bool flat, const AtomicString& localName, int tagPriority) { // Error handling code. This is just ad hoc handling of specific parent/child combinations. HTMLElement* e; bool handled = false; // 1. Check out the element's tag name to decide how to deal with errors. if (n->isHTMLElement()) { HTMLElement* h = static_cast(n); if (h->hasLocalName(trTag) || h->hasLocalName(thTag) || h->hasLocalName(tdTag)) { if (inStrayTableContent && !isTableRelated(current)) { reportError(MisplacedTablePartError, &localName, ¤t->localName()); // pop out to the nearest enclosing table-related tag. while (blockStack && !isTableRelated(current)) popOneBlock(); return insertNode(n); } } else if (h->hasLocalName(headTag)) { if (!current->isDocumentNode() && !current->hasTagName(htmlTag)) { reportError(MisplacedHeadError); return false; } } else if (h->hasLocalName(metaTag) || h->hasLocalName(linkTag) || h->hasLocalName(baseTag)) { bool createdHead = false; if (!head) { createHead(); createdHead = true; } if (head) { if (!createdHead) reportError(MisplacedHeadContentError, &localName, ¤t->localName()); if (head->addChild(n)) { if (!n->attached() && !m_isParsingFragment) n->attach(); return true; } else return false; } } else if (h->hasLocalName(htmlTag)) { if (!current->isDocumentNode() ) { if (document->documentElement()->hasTagName(htmlTag)) { reportError(RedundantHTMLBodyError, &localName); // we have another element.... 
apply attributes to existing one // make sure we don't overwrite already existing attributes NamedAttrMap* map = static_cast(n)->attributes(true); Element* existingHTML = static_cast(document->documentElement()); NamedAttrMap* bmap = existingHTML->attributes(false); for (unsigned l = 0; map && l < map->length(); ++l) { Attribute* it = map->attributeItem(l); if (!bmap->getAttributeItem(it->name())) existingHTML->setAttribute(it->name(), it->value()); } } return false; } } else if (h->hasLocalName(titleTag) || h->hasLocalName(styleTag)) { bool createdHead = false; if (!head) { createHead(); createdHead = true; } if (head) { Node* newNode = head->addChild(n); if (!newNode) { setSkipMode(h->tagQName()); return false; } if (!createdHead) reportError(MisplacedHeadContentError, &localName, ¤t->localName()); pushBlock(localName, tagPriority); setCurrent(newNode); if (!n->attached() && !m_isParsingFragment) n->attach(); return true; } if (inBody) { setSkipMode(h->tagQName()); return false; } } else if (h->hasLocalName(bodyTag)) { if (inBody && document->body()) { // we have another element.... apply attributes to existing one // make sure we don't overwrite already existing attributes // some sites use ... 
reportError(RedundantHTMLBodyError, &localName); NamedAttrMap* map = static_cast(n)->attributes(true); Element* existingBody = document->body(); NamedAttrMap* bmap = existingBody->attributes(false); for (unsigned l = 0; map && l < map->length(); ++l) { Attribute* it = map->attributeItem(l); if (!bmap->getAttributeItem(it->name())) existingBody->setAttribute(it->name(), it->value()); } return false; } else if (!current->isDocumentNode()) return false; } else if (h->hasLocalName(areaTag)) { if (m_currentMapElement) { reportError(MisplacedAreaError, ¤t->localName()); m_currentMapElement->addChild(n); if (!n->attached() && !m_isParsingFragment) n->attach(); handled = true; return true; } return false; } else if (h->hasLocalName(colgroupTag) || h->hasLocalName(captionTag)) { if (isTableRelated(current)) { while (blockStack && isTablePart(current)) popOneBlock(); return insertNode(n); } } } else if (n->isCommentNode() && !head) return false; // 2. Next we examine our currently active element to do some further error handling. if (current->isHTMLElement()) { HTMLElement* h = static_cast(current); const AtomicString& currentTagName = current->localName(); if (h->hasLocalName(htmlTag)) { HTMLElement* elt = n->isHTMLElement() ? 
static_cast(n) : 0; if (elt && (elt->hasLocalName(scriptTag) || elt->hasLocalName(styleTag) || elt->hasLocalName(metaTag) || elt->hasLocalName(linkTag) || elt->hasLocalName(objectTag) || elt->hasLocalName(embedTag) || elt->hasLocalName(titleTag) || elt->hasLocalName(isindexTag) || elt->hasLocalName(baseTag))) { if (!head) { head = new HTMLHeadElement(document); e = head; insertNode(e); handled = true; } } else { if (n->isTextNode()) { Text* t = static_cast(n); if (t->containsOnlyWhitespace()) return false; } if (!haveFrameSet) { e = new HTMLBodyElement(document); startBody(); insertNode(e); handled = true; } else reportError(MisplacedFramesetContentError, &localName); } } else if (h->hasLocalName(headTag)) { if (n->hasTagName(htmlTag)) return false; else { // This means the body starts here... if (!haveFrameSet) { popBlock(currentTagName); e = new HTMLBodyElement(document); startBody(); insertNode(e); handled = true; } else reportError(MisplacedFramesetContentError, &localName); } } else if (h->hasLocalName(addressTag) || h->hasLocalName(dlTag) || h->hasLocalName(dtTag) || h->hasLocalName(fontTag) || h->hasLocalName(styleTag) || h->hasLocalName(titleTag)) { reportError(MisplacedContentRetryError, &localName, ¤tTagName); popBlock(currentTagName); handled = true; } else if (h->hasLocalName(captionTag)) { // Illegal content in a caption. Close the caption and try again. reportError(MisplacedCaptionContentError, &localName); popBlock(currentTagName); if (isTablePart(n)) return insertNode(n, flat); } else if (h->hasLocalName(tableTag) || h->hasLocalName(trTag) || isTableSection(h)) { if (n->hasTagName(tableTag)) { reportError(MisplacedTableError, ¤tTagName); if (m_isParsingFragment && !h->hasLocalName(tableTag)) // fragment may contain table parts without
ancestor, pop them one by one popBlock(h->localName()); popBlock(localName); // end the table handled = true; // ...and start a new one } else { ExceptionCode ec = 0; Node* node = current; Node* parent = node->parentNode(); // A script may have removed the current node's parent from the DOM // http://bugs.webkit.org/show_bug.cgi?id=7137 // FIXME: we should do real recovery here and re-parent with the correct node. if (!parent) return false; Node* grandparent = parent->parentNode(); if (n->isTextNode() || (h->hasLocalName(trTag) && isTableSection(parent) && grandparent && grandparent->hasTagName(tableTag)) || ((!n->hasTagName(tdTag) && !n->hasTagName(thTag) && !n->hasTagName(formTag) && !n->hasTagName(scriptTag)) && isTableSection(node) && parent->hasTagName(tableTag))) { node = (node->hasTagName(tableTag)) ? node : ((node->hasTagName(trTag)) ? grandparent : parent); // This can happen with fragments if (!node) return false; Node* parent = node->parentNode(); if (!parent) return false; parent->insertBefore(n, node, ec); if (!ec) { reportError(StrayTableContentError, &localName, ¤tTagName); if (n->isHTMLElement() && tagPriority > 0 && !flat && static_cast(n)->endTagRequirement() != TagStatusForbidden) { pushBlock(localName, tagPriority); setCurrent(n); inStrayTableContent++; blockStack->strayTableContent = true; } return true; } } if (!ec) { if (current->hasTagName(trTag)) { reportError(TablePartRequiredError, &localName, &tdTag.localName()); e = new HTMLTableCellElement(tdTag, document); } else if (current->hasTagName(tableTag)) { // Don't report an error in this case, since making a happens all the time when you have
, // and it isn't really a parse error per se. e = new HTMLTableSectionElement(tbodyTag, document); } else { reportError(TablePartRequiredError, &localName, &trTag.localName()); e = new HTMLTableRowElement(document); } insertNode(e); handled = true; } } } else if (h->hasLocalName(objectTag)) { reportError(MisplacedContentRetryError, &localName, ¤tTagName); popBlock(objectTag); handled = true; } else if (h->hasLocalName(pTag) || isHeaderTag(currentTagName)) { if (!isInline(n)) { popBlock(currentTagName); handled = true; } } else if (h->hasLocalName(optionTag) || h->hasLocalName(optgroupTag)) { if (localName == optgroupTag) { popBlock(currentTagName); handled = true; } else if (localName == selectTag) { // IE treats a nested select as . Let's do the same popBlock(localName); } } else if (h->hasLocalName(colgroupTag)) { popBlock(currentTagName); handled = true; } else if (!h->hasLocalName(bodyTag)) { if (isInline(current)) { popInlineBlocks(); handled = true; } } } else if (current->isDocumentNode()) { if (n->isTextNode()) { Text* t = static_cast(n); if (t->containsOnlyWhitespace()) return false; } if (!document->documentElement()) { e = new HTMLHtmlElement(document); insertNode(e); handled = true; } } // 3. If we couldn't handle the error, just return false and attempt to error-correct again. 
if (!handled) { reportError(IgnoredContentError, &localName, ¤t->localName()); return false; } return insertNode(n); } typedef bool (HTMLParser::*CreateErrorCheckFunc)(Token* t, RefPtr&); typedef HashMap FunctionMap; bool HTMLParser::textCreateErrorCheck(Token* t, RefPtr& result) { result = new Text(document, t->text.get()); return false; } bool HTMLParser::commentCreateErrorCheck(Token* t, RefPtr& result) { result = new Comment(document, t->text.get()); return false; } bool HTMLParser::headCreateErrorCheck(Token* t, RefPtr& result) { if (!head || current->localName() == htmlTag) { head = new HTMLHeadElement(document); result = head; } else reportError(MisplacedHeadError); return false; } bool HTMLParser::bodyCreateErrorCheck(Token* t, RefPtr& result) { // body no longer allowed if we have a frameset if (haveFrameSet) return false; popBlock(headTag); startBody(); return true; } bool HTMLParser::framesetCreateErrorCheck(Token* t, RefPtr& result) { popBlock(headTag); if (inBody && !haveFrameSet && !haveContent) { popBlock(bodyTag); // ### actually for IE document.body returns the now hidden "body" element // we can't implement that behaviour now because it could cause too many // regressions and the headaches are not worth the work as long as there is // no site actually relying on that detail (Dirk) if (document->body()) document->body()->setAttribute(styleAttr, "display:none"); inBody = false; } if ((haveContent || haveFrameSet) && current->localName() == htmlTag) return false; haveFrameSet = true; startBody(); return true; } bool HTMLParser::iframeCreateErrorCheck(Token* t, RefPtr& result) { // a bit of a special case, since the frame is inlined setSkipMode(iframeTag); return true; } bool HTMLParser::formCreateErrorCheck(Token* t, RefPtr& result) { // Only create a new form if we're not already inside one. // This is consistent with other browsers' behavior. 
if (!m_currentFormElement) { m_currentFormElement = new HTMLFormElement(document); result = m_currentFormElement; } return false; } bool HTMLParser::isindexCreateErrorCheck(Token* t, RefPtr& result) { RefPtr n = handleIsindex(t); if (!inBody) { m_isindexElement = n.release(); } else { t->flat = true; result = n.release(); } return false; } bool HTMLParser::selectCreateErrorCheck(Token* t, RefPtr& result) { return true; } bool HTMLParser::ddCreateErrorCheck(Token* t, RefPtr& result) { popBlock(dtTag); popBlock(ddTag); return true; } bool HTMLParser::dtCreateErrorCheck(Token* t, RefPtr& result) { popBlock(ddTag); popBlock(dtTag); return true; } bool HTMLParser::nestedCreateErrorCheck(Token* t, RefPtr& result) { popBlock(t->tagName); return true; } bool HTMLParser::nestedStyleCreateErrorCheck(Token* t, RefPtr& result) { return allowNestedRedundantTag(t->tagName); } bool HTMLParser::tableCellCreateErrorCheck(Token* t, RefPtr& result) { popBlock(tdTag); popBlock(thTag); return true; } bool HTMLParser::tableSectionCreateErrorCheck(Token* t, RefPtr& result) { popBlock(theadTag); popBlock(tbodyTag); popBlock(tfootTag); return true; } bool HTMLParser::noembedCreateErrorCheck(Token* t, RefPtr& result) { setSkipMode(noembedTag); return true; } bool HTMLParser::noframesCreateErrorCheck(Token* t, RefPtr& result) { setSkipMode(noframesTag); return true; } bool HTMLParser::noscriptCreateErrorCheck(Token* t, RefPtr& result) { if (!m_isParsingFragment) { Settings* settings = document->settings(); if (settings && settings->isJavaScriptEnabled()) setSkipMode(noscriptTag); } return true; } bool HTMLParser::mapCreateErrorCheck(Token* t, RefPtr& result) { m_currentMapElement = new HTMLMapElement(document); result = m_currentMapElement; return false; } PassRefPtr HTMLParser::getNode(Token* t) { // Init our error handling table. 
static FunctionMap gFunctionMap; if (gFunctionMap.isEmpty()) { gFunctionMap.set(aTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck); gFunctionMap.set(bTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); gFunctionMap.set(bigTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); gFunctionMap.set(bodyTag.localName().impl(), &HTMLParser::bodyCreateErrorCheck); gFunctionMap.set(buttonTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck); gFunctionMap.set(commentAtom.impl(), &HTMLParser::commentCreateErrorCheck); gFunctionMap.set(ddTag.localName().impl(), &HTMLParser::ddCreateErrorCheck); gFunctionMap.set(dtTag.localName().impl(), &HTMLParser::dtCreateErrorCheck); gFunctionMap.set(formTag.localName().impl(), &HTMLParser::formCreateErrorCheck); gFunctionMap.set(framesetTag.localName().impl(), &HTMLParser::framesetCreateErrorCheck); gFunctionMap.set(headTag.localName().impl(), &HTMLParser::headCreateErrorCheck); gFunctionMap.set(iTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); gFunctionMap.set(iframeTag.localName().impl(), &HTMLParser::iframeCreateErrorCheck); gFunctionMap.set(isindexTag.localName().impl(), &HTMLParser::isindexCreateErrorCheck); gFunctionMap.set(liTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck); gFunctionMap.set(mapTag.localName().impl(), &HTMLParser::mapCreateErrorCheck); gFunctionMap.set(nobrTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck); gFunctionMap.set(noembedTag.localName().impl(), &HTMLParser::noembedCreateErrorCheck); gFunctionMap.set(noframesTag.localName().impl(), &HTMLParser::noframesCreateErrorCheck); gFunctionMap.set(noscriptTag.localName().impl(), &HTMLParser::noscriptCreateErrorCheck); gFunctionMap.set(sTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); gFunctionMap.set(selectTag.localName().impl(), &HTMLParser::selectCreateErrorCheck); gFunctionMap.set(smallTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); 
gFunctionMap.set(strikeTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); gFunctionMap.set(tbodyTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck); gFunctionMap.set(tdTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck); gFunctionMap.set(textAtom.impl(), &HTMLParser::textCreateErrorCheck); gFunctionMap.set(tfootTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck); gFunctionMap.set(thTag.localName().impl(), &HTMLParser::tableCellCreateErrorCheck); gFunctionMap.set(theadTag.localName().impl(), &HTMLParser::tableSectionCreateErrorCheck); gFunctionMap.set(trTag.localName().impl(), &HTMLParser::nestedCreateErrorCheck); gFunctionMap.set(ttTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); gFunctionMap.set(uTag.localName().impl(), &HTMLParser::nestedStyleCreateErrorCheck); } bool proceed = true; RefPtr result; if (CreateErrorCheckFunc errorCheckFunc = gFunctionMap.get(t->tagName.impl())) proceed = (this->*errorCheckFunc)(t, result); if (proceed) result = HTMLElementFactory::createHTMLElement(t->tagName, document, m_currentFormElement.get()); return result.release(); } bool HTMLParser::allowNestedRedundantTag(const AtomicString& tagName) { // www.liceo.edu.mx is an example of a site that achieves a level of nesting of // about 1500 tags, all from a bunch of s. We will only allow at most 20 // nested tags of the same type before just ignoring them all together. unsigned i = 0; for (HTMLStackElem* curr = blockStack; i < cMaxRedundantTagDepth && curr && curr->tagName == tagName; curr = curr->next, i++) { } return i != cMaxRedundantTagDepth; } void HTMLParser::processCloseTag(Token* t) { // Support for really broken html. // we never close the body tag, since some stupid web pages close it before the actual end of the doc. // let's rely on the end() call to close things. 
if (t->tagName == htmlTag || t->tagName == bodyTag || t->tagName == commentAtom) return; bool checkForCloseTagErrors = true; if (t->tagName == formTag && m_currentFormElement) { m_currentFormElement = 0; checkForCloseTagErrors = false; } else if (t->tagName == mapTag) m_currentMapElement = 0; else if (t->tagName == pTag) checkForCloseTagErrors = false; HTMLStackElem* oldElem = blockStack; popBlock(t->tagName, checkForCloseTagErrors); if (oldElem == blockStack && t->tagName == pTag) { // We encountered a stray

. Amazingly Gecko, WinIE, and MacIE all treat // this as a valid break, i.e.,

. So go ahead and make the empty // paragraph. t->beginTag = true; parseToken(t); popBlock(t->tagName); reportError(StrayParagraphCloseError); } } bool HTMLParser::isHeaderTag(const AtomicString& tagName) { static HashSet headerTags; if (headerTags.isEmpty()) { headerTags.add(h1Tag.localName().impl()); headerTags.add(h2Tag.localName().impl()); headerTags.add(h3Tag.localName().impl()); headerTags.add(h4Tag.localName().impl()); headerTags.add(h5Tag.localName().impl()); headerTags.add(h6Tag.localName().impl()); } return headerTags.contains(tagName.impl()); } bool HTMLParser::isInline(Node* node) const { if (node->isTextNode()) return true; if (node->isHTMLElement()) { HTMLElement* e = static_cast(node); if (e->hasLocalName(aTag) || e->hasLocalName(fontTag) || e->hasLocalName(ttTag) || e->hasLocalName(uTag) || e->hasLocalName(bTag) || e->hasLocalName(iTag) || e->hasLocalName(sTag) || e->hasLocalName(strikeTag) || e->hasLocalName(bigTag) || e->hasLocalName(smallTag) || e->hasLocalName(emTag) || e->hasLocalName(strongTag) || e->hasLocalName(dfnTag) || e->hasLocalName(codeTag) || e->hasLocalName(sampTag) || e->hasLocalName(kbdTag) || e->hasLocalName(varTag) || e->hasLocalName(citeTag) || e->hasLocalName(abbrTag) || e->hasLocalName(acronymTag) || e->hasLocalName(subTag) || e->hasLocalName(supTag) || e->hasLocalName(spanTag) || e->hasLocalName(nobrTag) || e->hasLocalName(noframesTag) || e->hasLocalName(nolayerTag) || e->hasLocalName(noembedTag)) return true; if (e->hasLocalName(noscriptTag) && !m_isParsingFragment) { Settings* settings = document->settings(); if (settings && settings->isJavaScriptEnabled()) return true; } } return false; } bool HTMLParser::isResidualStyleTag(const AtomicString& tagName) { static HashSet residualStyleTags; if (residualStyleTags.isEmpty()) { residualStyleTags.add(aTag.localName().impl()); residualStyleTags.add(fontTag.localName().impl()); residualStyleTags.add(ttTag.localName().impl()); residualStyleTags.add(uTag.localName().impl()); 
residualStyleTags.add(bTag.localName().impl()); residualStyleTags.add(iTag.localName().impl()); residualStyleTags.add(sTag.localName().impl()); residualStyleTags.add(strikeTag.localName().impl()); residualStyleTags.add(bigTag.localName().impl()); residualStyleTags.add(smallTag.localName().impl()); residualStyleTags.add(emTag.localName().impl()); residualStyleTags.add(strongTag.localName().impl()); residualStyleTags.add(dfnTag.localName().impl()); residualStyleTags.add(codeTag.localName().impl()); residualStyleTags.add(sampTag.localName().impl()); residualStyleTags.add(kbdTag.localName().impl()); residualStyleTags.add(varTag.localName().impl()); residualStyleTags.add(nobrTag.localName().impl()); } return residualStyleTags.contains(tagName.impl()); } bool HTMLParser::isAffectedByResidualStyle(const AtomicString& tagName) { static HashSet unaffectedTags; if (unaffectedTags.isEmpty()) { unaffectedTags.add(bodyTag.localName().impl()); unaffectedTags.add(tableTag.localName().impl()); unaffectedTags.add(theadTag.localName().impl()); unaffectedTags.add(tbodyTag.localName().impl()); unaffectedTags.add(tfootTag.localName().impl()); unaffectedTags.add(trTag.localName().impl()); unaffectedTags.add(thTag.localName().impl()); unaffectedTags.add(tdTag.localName().impl()); unaffectedTags.add(captionTag.localName().impl()); unaffectedTags.add(colgroupTag.localName().impl()); unaffectedTags.add(colTag.localName().impl()); unaffectedTags.add(optionTag.localName().impl()); unaffectedTags.add(optgroupTag.localName().impl()); unaffectedTags.add(selectTag.localName().impl()); unaffectedTags.add(objectTag.localName().impl()); } return !unaffectedTags.contains(tagName.impl()); } void HTMLParser::handleResidualStyleCloseTagAcrossBlocks(HTMLStackElem* elem) { HTMLStackElem* maxElem = 0; bool finished = false; bool strayTableContent = elem->strayTableContent; m_handlingResidualStyleAcrossBlocks = true; while (!finished) { // Find the outermost element that crosses over to a higher level. 
If there exists another higher-level // element, we will do another pass, until we have corrected the innermost one. ExceptionCode ec = 0; HTMLStackElem* curr = blockStack; HTMLStackElem* prev = 0; HTMLStackElem* prevMaxElem = 0; maxElem = 0; finished = true; while (curr && curr != elem) { if (curr->level > elem->level) { if (!isAffectedByResidualStyle(curr->tagName)) return; if (maxElem) // We will need another pass. finished = false; maxElem = curr; prevMaxElem = prev; } prev = curr; curr = curr->next; } if (!curr || !maxElem) return; Node* residualElem = prev->node; Node* blockElem = prevMaxElem ? prevMaxElem->node : current; Node* parentElem = elem->node; // Check to see if the reparenting that is going to occur is allowed according to the DOM. // FIXME: We should either always allow it or perform an additional fixup instead of // just bailing here. // Example:

blah

isn't doing a fixup right now. if (!parentElem->childAllowed(blockElem)) return; if (maxElem->node->parentNode() != elem->node) { // Walk the stack and remove any elements that aren't residual style tags. These // are basically just being closed up. Example: // Moo

Goo

. // In the above example, the doesn't need to be reopened. It can just close. HTMLStackElem* currElem = maxElem->next; HTMLStackElem* prevElem = maxElem; while (currElem != elem) { HTMLStackElem* nextElem = currElem->next; if (!isResidualStyleTag(currElem->tagName)) { prevElem->next = nextElem; prevElem->derefNode(); prevElem->node = currElem->node; prevElem->didRefNode = currElem->didRefNode; delete currElem; } else prevElem = currElem; currElem = nextElem; } // We have to reopen residual tags in between maxElem and elem. An example of this case is: // Moo

Foo. // In this case, we need to transform the part before the

into: // Moo // so that the will remain open. This involves the modification of elements // in the block stack. // This will also affect how we ultimately reparent the block, since we want it to end up // under the reopened residual tags (e.g., the in the above example.) RefPtr prevNode = 0; currElem = maxElem; while (currElem->node != residualElem) { if (isResidualStyleTag(currElem->node->localName())) { // Create a clone of this element. // We call releaseRef to get a raw pointer since we plan to hand over ownership to currElem. Node* currNode = currElem->node->cloneNode(false).releaseRef(); reportError(ResidualStyleError, &currNode->localName()); // Change the stack element's node to point to the clone. // The stack element adopts the reference we obtained above by calling release(). currElem->derefNode(); currElem->node = currNode; currElem->didRefNode = true; // Attach the previous node as a child of this new node. if (prevNode) currNode->appendChild(prevNode, ec); else // The new parent for the block element is going to be the innermost clone. parentElem = currNode; // FIXME: We shifted parentElem to be a residual inline. We never checked to see if blockElem could be legally placed inside the inline though. prevNode = currNode; } currElem = currElem->next; } // Now append the chain of new residual style elements if one exists. if (prevNode) elem->node->appendChild(prevNode, ec); // FIXME: This append can result in weird stuff happening, like an inline chain being put into a table section. } // Check if the block is still in the tree. If it isn't, then we don't // want to remove it from its parent (that would crash) or insert it into // a new parent later. See http://bugs.webkit.org/show_bug.cgi?id=6778 bool isBlockStillInTree = blockElem->parentNode(); // We need to make a clone of |residualElem| and place it just inside |blockElem|. // All content of |blockElem| is reparented to be under this clone. 
We then // reparent |blockElem| using real DOM calls so that attachment/detachment will // be performed to fix up the rendering tree. // So for this example: ...

FooGoo

// The end result will be: ...

    // (i.e., <b>...</b><p><b>Foo</b>Goo</p>)

// // Step 1: Remove |blockElem| from its parent, doing a batch detach of all the kids. if (isBlockStillInTree) blockElem->parentNode()->removeChild(blockElem, ec); Node* newNodePtr = 0; ASSERT(finished || blockElem->firstChild()); if (blockElem->firstChild()) { // Step 2: Clone |residualElem|. RefPtr newNode = residualElem->cloneNode(false); // Shallow clone. We don't pick up the same kids. newNodePtr = newNode.get(); reportError(ResidualStyleError, &newNode->localName()); // Step 3: Place |blockElem|'s children under |newNode|. Remove all of the children of |blockElem| // before we've put |newElem| into the document. That way we'll only do one attachment of all // the new content (instead of a bunch of individual attachments). Node* currNode = blockElem->firstChild(); while (currNode) { Node* nextNode = currNode->nextSibling(); newNode->appendChild(currNode, ec); currNode = nextNode; } // Step 4: Place |newNode| under |blockElem|. |blockElem| is still out of the document, so no // attachment can occur yet. blockElem->appendChild(newNode.release(), ec); } // Step 5: Reparent |blockElem|. Now the full attachment of the fixed up tree takes place. if (isBlockStillInTree) parentElem->appendChild(blockElem, ec); // Step 6: Pull |elem| out of the stack, since it is no longer enclosing us. Also update // the node associated with the previous stack element so that when it gets popped, // it doesn't make the residual element the next current node. HTMLStackElem* currElem = maxElem; HTMLStackElem* prevElem = 0; while (currElem != elem) { prevElem = currElem; currElem = currElem->next; } prevElem->next = elem->next; prevElem->derefNode(); prevElem->node = elem->node; prevElem->didRefNode = elem->didRefNode; if (!finished) { // Repurpose |elem| to represent |newNode| and insert it at the appropriate position // in the stack. We do not do this for the innermost block, because in that case the new // node is effectively no longer open. 
elem->next = maxElem; elem->node = prevMaxElem->node; elem->didRefNode = prevMaxElem->didRefNode; elem->strayTableContent = false; prevMaxElem->next = elem; ASSERT(newNodePtr); prevMaxElem->node = newNodePtr; prevMaxElem->didRefNode = false; } else delete elem; } // FIXME: If we ever make a case like this work: //
// Then this check will be too simplistic. Right now the
chain will end up inside the , which is pretty crazy. if (strayTableContent) inStrayTableContent--; // Step 7: Reopen intermediate inlines, e.g.,

    // (i.e., <b><p><i>Foo</b>Goo</p>.)

. // In the above example, Goo should stay italic. // We cap the number of tags we're willing to reopen based off cResidualStyleMaxDepth. HTMLStackElem* curr = blockStack; HTMLStackElem* residualStyleStack = 0; unsigned stackDepth = 1; while (curr && curr != maxElem) { // We will actually schedule this tag for reopening // after we complete the close of this entire block. if (isResidualStyleTag(curr->tagName) && stackDepth++ < cResidualStyleMaxDepth) // We've overloaded the use of stack elements and are just reusing the // struct with a slightly different meaning to the variables. Instead of chaining // from innermost to outermost, we build up a list of all the tags we need to reopen // from the outermost to the innermost, i.e., residualStyleStack will end up pointing // to the outermost tag we need to reopen. // We also set curr->node to be the actual element that corresponds to the ID stored in // curr->id rather than the node that you should pop to when the element gets pulled off // the stack. moveOneBlockToStack(residualStyleStack); else popOneBlock(); curr = blockStack; } reopenResidualStyleTags(residualStyleStack, 0); // Stray table content can't be an issue here, since some element above will always become the root of new stray table content. m_handlingResidualStyleAcrossBlocks = false; } void HTMLParser::reopenResidualStyleTags(HTMLStackElem* elem, Node* malformedTableParent) { // Loop for each tag that needs to be reopened. while (elem) { // Create a shallow clone of the DOM node for this element. RefPtr newNode = elem->node->cloneNode(false); reportError(ResidualStyleError, &newNode->localName()); // Append the new node. In the malformed table case, we need to insert before the table, // which will be the last child. ExceptionCode ec = 0; if (malformedTableParent) malformedTableParent->insertBefore(newNode, malformedTableParent->lastChild(), ec); else current->appendChild(newNode, ec); // FIXME: Is it really OK to ignore the exceptions here? 
// Now push a new stack element for this node we just created. pushBlock(elem->tagName, elem->level); // Set our strayTableContent boolean if needed, so that the reopened tag also knows // that it is inside a malformed table. blockStack->strayTableContent = malformedTableParent != 0; if (blockStack->strayTableContent) inStrayTableContent++; // Clear our malformed table parent variable. malformedTableParent = 0; // Update |current| manually to point to the new node. setCurrent(newNode.get()); // Advance to the next tag that needs to be reopened. HTMLStackElem* next = elem->next; elem->derefNode(); delete elem; elem = next; } } void HTMLParser::pushBlock(const AtomicString& tagName, int level) { current->beginParsingChildren(); blockStack = new HTMLStackElem(tagName, level, current, didRefCurrent, blockStack); didRefCurrent = false; } void HTMLParser::popBlock(const AtomicString& tagName, bool reportErrors) { HTMLStackElem* elem = blockStack; int maxLevel = 0; while (elem && (elem->tagName != tagName)) { if (maxLevel < elem->level) maxLevel = elem->level; elem = elem->next; } if (!elem) { if (reportErrors) reportError(StrayCloseTagError, &tagName, 0, true); return; } if (maxLevel > elem->level) { // We didn't match because the tag is in a different scope, e.g., //

Foo. Try to correct the problem. if (!isResidualStyleTag(tagName)) return; return handleResidualStyleCloseTagAcrossBlocks(elem); } bool isAffectedByStyle = isAffectedByResidualStyle(elem->tagName); HTMLStackElem* residualStyleStack = 0; Node* malformedTableParent = 0; elem = blockStack; unsigned stackDepth = 1; while (elem) { if (elem->tagName == tagName) { int strayTable = inStrayTableContent; popOneBlock(); elem = 0; // This element was the root of some malformed content just inside an implicit or // explicit or . // If we end up needing to reopen residual style tags, the root of the reopened chain // must also know that it is the root of malformed content inside a /. if (strayTable && (inStrayTableContent < strayTable) && residualStyleStack) { Node* curr = current; while (curr && !curr->hasTagName(tableTag)) curr = curr->parentNode(); malformedTableParent = curr ? curr->parentNode() : 0; } } else { if (m_currentFormElement && elem->tagName == formTag) // A is being closed prematurely (and this is // malformed HTML). Set an attribute on the form to clear out its // bottom margin. m_currentFormElement->setMalformed(true); // Schedule this tag for reopening // after we complete the close of this entire block. if (isAffectedByStyle && isResidualStyleTag(elem->tagName) && stackDepth++ < cResidualStyleMaxDepth) // We've overloaded the use of stack elements and are just reusing the // struct with a slightly different meaning to the variables. Instead of chaining // from innermost to outermost, we build up a list of all the tags we need to reopen // from the outermost to the innermost, i.e., residualStyleStack will end up pointing // to the outermost tag we need to reopen. // We also set elem->node to be the actual element that corresponds to the ID stored in // elem->id rather than the node that you should pop to when the element gets pulled off // the stack. 
moveOneBlockToStack(residualStyleStack); else popOneBlock(); elem = blockStack; } } reopenResidualStyleTags(residualStyleStack, malformedTableParent); } inline HTMLStackElem* HTMLParser::popOneBlockCommon() { HTMLStackElem* elem = blockStack; // Form elements restore their state during the parsing process. // Also, a few elements (, ) need to know when all child elements (s) are available. if (current && elem->node != current) current->finishParsingChildren(); blockStack = elem->next; current = elem->node; didRefCurrent = elem->didRefNode; if (elem->strayTableContent) inStrayTableContent--; return elem; } void HTMLParser::popOneBlock() { // Store the current node before popOneBlockCommon overwrites it. Node* lastCurrent = current; bool didRefLastCurrent = didRefCurrent; delete popOneBlockCommon(); if (didRefLastCurrent) lastCurrent->deref(); } void HTMLParser::moveOneBlockToStack(HTMLStackElem*& head) { // We'll be using the stack element we're popping, but for the current node. // See the two callers for details. // Store the current node before popOneBlockCommon overwrites it. Node* lastCurrent = current; bool didRefLastCurrent = didRefCurrent; // Pop the block, but don't deref the current node as popOneBlock does because // we'll be using the pointer in the new stack element. HTMLStackElem* elem = popOneBlockCommon(); // Transfer the current node into the stack element. // No need to deref the old elem->node because popOneBlockCommon transferred // it into the current/didRefCurrent fields. 
elem->node = lastCurrent; elem->didRefNode = didRefLastCurrent; elem->next = head; head = elem; } void HTMLParser::popInlineBlocks() { while (blockStack && isInline(current)) popOneBlock(); } void HTMLParser::freeBlock() { while (blockStack) popOneBlock(); } void HTMLParser::createHead() { if (head || !document->documentElement()) return; head = new HTMLHeadElement(document); HTMLElement* body = document->body(); ExceptionCode ec = 0; document->documentElement()->insertBefore(head, body, ec); if (ec) head = 0; // If the body does not exist yet, then the should be pushed as the current block. if (head && !body) { pushBlock(head->localName(), head->tagPriority()); setCurrent(head); } } PassRefPtr HTMLParser::handleIsindex(Token* t) { RefPtr n = new HTMLDivElement(document); NamedMappedAttrMap* attrs = t->attrs.get(); RefPtr isIndex = new HTMLIsIndexElement(document, m_currentFormElement.get()); isIndex->setAttributeMap(attrs); isIndex->setAttribute(typeAttr, "khtml_isindex"); String text = searchableIndexIntroduction(); if (attrs) { if (Attribute* a = attrs->getAttributeItem(promptAttr)) text = a->value().domString() + " "; t->attrs = 0; } n->addChild(new HTMLHRElement(document)); n->addChild(new Text(document, text)); n->addChild(isIndex.release()); n->addChild(new HTMLHRElement(document)); return n.release(); } void HTMLParser::startBody() { if (inBody) return; inBody = true; if (m_isindexElement) { insertNode(m_isindexElement.get(), true /* don't descend into this node */); m_isindexElement = 0; } } void HTMLParser::finished() { // In the case of a completely empty document, here's the place to create the HTML element. if (current && current->isDocumentNode() && !document->documentElement()) insertNode(new HTMLHtmlElement(document)); // This ensures that "current" is not left pointing to a node when the document is destroyed. freeBlock(); setCurrent(0); // Warning, this may delete the tokenizer and parser, so don't try to do anything else after this. 
if (!m_isParsingFragment) document->finishedParsing(); } void HTMLParser::reportErrorToConsole(HTMLParserErrorCode errorCode, const AtomicString* tagName1, const AtomicString* tagName2, bool closeTags) { Frame* frame = document->frame(); if (!frame) return; Page* page = frame->page(); if (!page) return; HTMLTokenizer* htmlTokenizer = static_cast(document->tokenizer()); int lineNumber = htmlTokenizer->lineNumber() + 1; AtomicString tag1; AtomicString tag2; if (tagName1) { if (*tagName1 == "#text") tag1 = "Text"; else if (*tagName1 == "#comment") tag1 = ""; else tag1 = (closeTags ? ""; } if (tagName2) { if (*tagName2 == "#text") tag2 = "Text"; else if (*tagName2 == "#comment") tag2 = ""; else tag2 = (closeTags ? ""; } const char* errorMsg = htmlParserErrorMessageTemplate(errorCode); if (!errorMsg) return; String message; if (htmlTokenizer->processingContentWrittenByScript()) message += htmlParserDocumentWriteMessage(); message += errorMsg; message.replace("%tag1", tag1); message.replace("%tag2", tag2); page->chrome()->addMessageToConsole(HTMLMessageSource, isWarning(errorCode) ? WarningMessageLevel: ErrorMessageLevel, message, lineNumber, document->url()); } }