Add webkit to the repository.

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@18 0039d316-1c4b-4281-b951-d872f2087c98
author: initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> 2008-07-27 00:20:51 +0000
committer: initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> 2008-07-27 00:20:51 +0000
commit: f5b16fed647e941aa66933178da85db2860d639b (patch)
tree: f00e9856c04aad3b558a140955e7674add33f051 /webkit/pending/HTMLTokenizer.cpp
parent: 920c091ac3ee15079194c82ae8a7a18215f3f23c (diff)
download: chromium_src-f5b16fed647e941aa66933178da85db2860d639b.zip
chromium_src-f5b16fed647e941aa66933178da85db2860d639b.tar.gz
chromium_src-f5b16fed647e941aa66933178da85db2860d639b.tar.bz2
1 files changed, 1791 insertions, 0 deletions
diff --git a/webkit/pending/HTMLTokenizer.cpp b/webkit/pending/HTMLTokenizer.cpp
new file mode 100644
index 0000000..61d3a37
--- /dev/null
+++ b/webkit/pending/HTMLTokenizer.cpp
@@ -0,0 +1,1791 @@
+/*
+    Copyright (C) 1997 Martin Jones (mjones@kde.org)
+              (C) 1997 Torben Weis (weis@kde.org)
+              (C) 1998 Waldo Bastian (bastian@kde.org)
+              (C) 1999 Lars Knoll (knoll@kde.org)
+              (C) 1999 Antti Koivisto (koivisto@kde.org)
+              (C) 2001 Dirk Mueller (mueller@kde.org)
+    Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
+    Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com)
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB.  If not, write to
+    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+    Boston, MA 02110-1301, USA.
+*/
+
+#include "config.h"
+#include "HTMLTokenizer.h"
+
+#include "CSSHelper.h"
+#include "Cache.h"
+#include "CachedScript.h"
+#include "DocLoader.h"
+#include "DocumentFragment.h"
+#include "EventNames.h"
+#include "Frame.h"
+#include "FrameLoader.h"
+#include "FrameView.h"
+#include "HTMLElement.h"
+#include "HTMLNames.h"
+#include "HTMLParser.h"
+#include "HTMLScriptElement.h"
+#include "HTMLViewSourceDocument.h"
+#include "Settings.h"
+#include "SystemTime.h"
+#include "JSBridge.h"
+#include <wtf/ASCIICType.h>
+
+#include "HTMLEntityNames.c"
+
+// #define INSTRUMENT_LAYOUT_SCHEDULING 1
+
+#if MOBILE
+// The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.
+// This value is used to define how many characters the tokenizer will process before 
+// yeilding control.
+#define TOKENIZER_CHUNK_SIZE  256
+#else
+#define TOKENIZER_CHUNK_SIZE  4096
+#endif
+
+using namespace std;
+using namespace WTF;
+
+namespace WebCore {
+
+using namespace HTMLNames;
+using namespace EventNames;
+
+#if MOBILE
+// As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise
+// it will take way to long to load a page.
+const double tokenizerTimeDelay = 0.300;
+
+#else
+// FIXME: We would like this constant to be 200ms.
+// Yielding more aggressively results in increased responsiveness and better incremental rendering.
+// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
+const double tokenizerTimeDelay = 0.500;
+#endif
+
+static const char commentStart [] = "<!--";
+static const char scriptEnd [] = "</script";
+static const char xmpEnd [] = "</xmp";
+static const char styleEnd [] =  "</style";
+static const char textareaEnd [] = "</textarea";
+static const char titleEnd [] = "</title";
+
+// Full support for MS Windows extensions to Latin-1.
+// Technically these extensions should only be activated for pages
+// marked "windows-1252" or "cp1252", but
+// in the standard Microsoft way, these extensions infect hundreds of thousands
+// of web pages.  Note that people with non-latin-1 Microsoft extensions
+// are SOL.
+//
+// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
+//      http://www.bbsinc.com/iso8859.html
+//      http://www.obviously.com/
+//
+// There may be better equivalents
+
+// We only need this for entities. For non-entity text, we handle this in the text encoding.
+
+static const UChar windowsLatin1ExtensionArray[32] = {
+    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
+    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
+    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
+    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
+};
+
+static inline UChar fixUpChar(UChar c)
+{
+    if ((c & ~0x1F) != 0x0080)
+        return c;
+    return windowsLatin1ExtensionArray[c - 0x80];
+}
+
+static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
+{
+    for (unsigned i = 0; i != length; ++i) {
+        unsigned char c1 = s1[i];
+        unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
+        UChar c2 = s2[i];
+        if (c1 != c2 && uc1 != c2)
+            return false;
+    }
+    return true;
+}
+
+inline void Token::addAttribute(Document* doc, AtomicString& attrName, const AtomicString& v, bool viewSourceMode)
+{
+    if (!attrName.isEmpty()) {
+        ASSERT(!attrName.contains('/'));
+        RefPtr<MappedAttribute> a = new MappedAttribute(attrName, v);
+        if (!attrs)
+            attrs = new NamedMappedAttrMap(0);
+        attrs->insertAttribute(a.release(), viewSourceMode);
+    }
+    
+    attrName = emptyAtom;
+}
+
+// ----------------------------------------------------------------------------
+
+HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
+    : Tokenizer()
+    , buffer(0)
+    , scriptCode(0)
+    , scriptCodeSize(0)
+    , scriptCodeMaxSize(0)
+    , scriptCodeResync(0)
+    , m_executingScript(0)
+    , m_requestingScript(false)
+    , m_hasScriptsWaitingForStylesheets(false)
+    , m_timer(this, &HTMLTokenizer::timerFired)
+    , m_doc(doc)
+    , parser(new HTMLParser(doc, reportErrors))
+    , inWrite(false)
+    , m_fragment(false)
+{
+    begin();
+}
+
+HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
+    : Tokenizer(true)
+    , buffer(0)
+    , scriptCode(0)
+    , scriptCodeSize(0)
+    , scriptCodeMaxSize(0)
+    , scriptCodeResync(0)
+    , m_executingScript(0)
+    , m_requestingScript(false)
+    , m_hasScriptsWaitingForStylesheets(false)
+    , m_timer(this, &HTMLTokenizer::timerFired)
+    , m_doc(doc)
+    , parser(0)
+    , inWrite(false)
+    , m_fragment(false)
+{
+    begin();
+}
+
+HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)
+    : buffer(0)
+    , scriptCode(0)
+    , scriptCodeSize(0)
+    , scriptCodeMaxSize(0)
+    , scriptCodeResync(0)
+    , m_executingScript(0)
+    , m_requestingScript(false)
+    , m_hasScriptsWaitingForStylesheets(false)
+    , m_timer(this, &HTMLTokenizer::timerFired)
+    , m_doc(frag->document())
+    , inWrite(false)
+    , m_fragment(true)
+{
+    parser = new HTMLParser(frag);
+    begin();
+}
+
+void HTMLTokenizer::reset()
+{
+    ASSERT(m_executingScript == 0);
+
+    while (!pendingScripts.isEmpty()) {
+      CachedScript *cs = pendingScripts.dequeue();
+      ASSERT(cache()->disabled() || cs->accessCount() > 0);
+      cs->deref(this);
+    }
+    
+    fastFree(buffer);
+    buffer = dest = 0;
+    size = 0;
+
+    fastFree(scriptCode);
+    scriptCode = 0;
+    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+
+    m_timer.stop();
+    m_state.setAllowYield(false);
+    m_state.setForceSynchronous(false);
+
+    currToken.reset();
+}
+
+void HTMLTokenizer::begin()
+{
+    m_executingScript = 0;
+    m_requestingScript = false;
+    m_hasScriptsWaitingForStylesheets = false;
+    m_state.setLoadingExtScript(false);
+    reset();
+    size = 254;
+    buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
+    dest = buffer;
+    tquote = NoQuote;
+    searchCount = 0;
+    m_state.setEntityState(NoEntity);
+    scriptSrc = String();
+    pendingSrc.clear();
+    currentPrependingSrc = 0;
+    noMoreData = false;
+    brokenComments = false;
+    brokenServer = false;
+    m_lineNumber = 0;
+    scriptStartLineno = 0;
+    tagStartLineno = 0;
+    m_state.setForceSynchronous(false);
+}
+
+void HTMLTokenizer::setForceSynchronous(bool force)
+{
+    m_state.setForceSynchronous(force);
+}
+
+HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
+{
+    // This function adds the listing 'list' as
+    // preformatted text-tokens to the token-collection
+    while (!list.isEmpty()) {
+        if (state.skipLF()) {
+            state.setSkipLF(false);
+            if (*list == '\n') {
+                list.advance();
+                continue;
+            }
+        }
+
+        checkBuffer();
+
+        if (*list == '\n' || *list == '\r') {
+            if (state.discardLF())
+                // Ignore this LF
+                state.setDiscardLF(false); // We have discarded 1 LF
+            else
+                *dest++ = '\n';
+
+            /* Check for MS-DOS CRLF sequence */
+            if (*list == '\r')
+                state.setSkipLF(true);
+
+            list.advance();
+        } else {
+            state.setDiscardLF(false);
+            *dest++ = *list;
+            list.advance();
+        }
+    }
+
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString &src, State state)
+{
+    ASSERT(state.inTextArea() || state.inTitle() || !state.hasEntityState());
+    ASSERT(!state.hasTagState());
+    ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() == 1 );
+    if (state.inScript())
+        scriptStartLineno = m_lineNumber;
+
+    if (state.inComment()) 
+        state = parseComment(src, state);
+
+    while ( !src.isEmpty() ) {
+        checkScriptBuffer();
+        UChar ch = *src;
+
+        if (!scriptCodeResync && !brokenComments && !state.inTextArea() && !state.inXmp() && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-') {
+            state.setInComment(true);
+            state = parseComment(src, state);
+            continue;
+        }
+        if (scriptCodeResync && !tquote && ch == '>') {
+            src.advancePastNonNewline();
+            scriptCodeSize = scriptCodeResync-1;
+            scriptCodeResync = 0;
+            scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
+            if (state.inScript())
+                state = scriptHandler(state);
+            else {
+                state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
+                processToken();
+                if (state.inStyle()) { 
+                    currToken.tagName = styleTag.localName(); 
+                    currToken.beginTag = false; 
+                } else if (state.inTextArea()) { 
+                    currToken.tagName = textareaTag.localName(); 
+                    currToken.beginTag = false; 
+                } else if (state.inTitle()) { 
+                    currToken.tagName = titleTag.localName(); 
+                    currToken.beginTag = false; 
+                } else if (state.inXmp()) {
+                    currToken.tagName = xmpTag.localName(); 
+                    currToken.beginTag = false; 
+                }
+                processToken();
+                state.setInStyle(false);
+                state.setInScript(false);
+                state.setInTextArea(false);
+                state.setInTitle(false);
+                state.setInXmp(false);
+                tquote = NoQuote;
+                scriptCodeSize = scriptCodeResync = 0;
+            }
+            return state;
+        }
+        // possible end of tagname, lets check.
+        if (!scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
+             scriptCodeSize >= searchStopperLen &&
+             tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
+            scriptCodeResync = scriptCodeSize-searchStopperLen+1;
+            tquote = NoQuote;
+            continue;
+        }
+        if (scriptCodeResync && !state.escaped()) {
+            if (ch == '\"')
+                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
+            else if (ch == '\'')
+                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
+            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
+                tquote = NoQuote;
+        }
+        state.setEscaped(!state.escaped() && ch == '\\');
+        if (!scriptCodeResync && (state.inTextArea() || state.inTitle()) && !src.escaped() && ch == '&') {
+            UChar* scriptCodeDest = scriptCode+scriptCodeSize;
+            src.advancePastNonNewline();
+            state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
+            scriptCodeSize = scriptCodeDest - scriptCode;
+        } else {
+            scriptCode[scriptCodeSize++] = ch;
+            src.advance(m_lineNumber);
+        }
+    }
+
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
+{
+    // We are inside a <script>
+    bool doScriptExec = false;
+
+    // (Bugzilla 3837) Scripts following a frameset element should not execute or, 
+    // in the case of extern scripts, even load.
+    bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
+  
+    CachedScript* cs = 0;
+    // don't load external scripts for standalone documents (for now)
+    if (!inViewSourceMode()) {
+        if (!scriptSrc.isEmpty() && m_doc->frame()) {
+            // forget what we just got; load from src url instead
+            if (!parser->skipMode() && !followingFrameset) {
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+                if (!m_doc->ownerElement())
+                    printf("Requesting script at time %d\n", m_doc->elapsedTime());
+#endif
+                // The parser might have been stopped by for example a window.close call in an earlier script.
+                // If so, we don't want to load scripts.
+                if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(scriptSrc, scriptSrcCharset)))
+                    pendingScripts.enqueue(cs);
+                else
+                    scriptNode = 0;
+            } else
+                scriptNode = 0;
+            scriptSrc = String();
+        } else {
+#ifdef TOKEN_DEBUG
+            kdDebug( 6036 ) << "---START SCRIPT---" << endl;
+            kdDebug( 6036 ) << DeprecatedString(scriptCode, scriptCodeSize) << endl;
+            kdDebug( 6036 ) << "---END SCRIPT---" << endl;
+#endif
+            // Parse scriptCode containing <script> info
+#if USE(LOW_BANDWIDTH_DISPLAY)
+            if (m_doc->inLowBandwidthDisplay()) {
+                // ideal solution is only skipping internal JavaScript if there is external JavaScript.
+                // but internal JavaScript can use document.write() to create an external JavaScript,
+                // so we have to skip internal JavaScript all the time.
+                m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();
+                doScriptExec = false;
+            } else
+#endif
+            doScriptExec = static_cast<HTMLScriptElement*>(scriptNode.get())->shouldExecuteAsJavaScript();
+            scriptNode = 0;
+        }
+    }
+
+    state = processListing(SegmentedString(scriptCode, scriptCodeSize), state);
+    RefPtr<Node> node = processToken();
+    String scriptString = node ? node->textContent() : "";
+    currToken.tagName = scriptTag.localName();
+    currToken.beginTag = false;
+    processToken();
+
+    state.setInScript(false);
+    
+    // FIXME: The script should be syntax highlighted.
+    if (inViewSourceMode())
+        return state;
+
+    SegmentedString *savedPrependingSrc = currentPrependingSrc;
+    SegmentedString prependingSrc;
+    currentPrependingSrc = &prependingSrc;
+    scriptCodeSize = scriptCodeResync = 0;
+
+    if (!parser->skipMode() && !followingFrameset) {
+        if (cs) {
+            if (savedPrependingSrc)
+                savedPrependingSrc->append(src);
+            else
+                pendingSrc.prepend(src);
+            setSrc(SegmentedString());
+
+            // the ref() call below may call notifyFinished if the script is already in cache,
+            // and that mucks with the state directly, so we must write it back to the object.
+            m_state = state;
+            bool savedRequestingScript = m_requestingScript;
+            m_requestingScript = true;
+            cs->ref(this);
+            m_requestingScript = savedRequestingScript;
+            state = m_state;
+            // will be 0 if script was already loaded and ref() executed it
+            if (!pendingScripts.isEmpty())
+                state.setLoadingExtScript(true);
+        } else if (!m_fragment && doScriptExec) {
+            if (!m_executingScript)
+                pendingSrc.prepend(src);
+            else
+                prependingSrc = src;
+            setSrc(SegmentedString());
+            state = scriptExecution(scriptString, state, String(), scriptStartLineno);
+        }
+    }
+
+    if (!m_executingScript && !state.loadingExtScript()) {
+        src.append(pendingSrc);
+        pendingSrc.clear();
+    } else if (!prependingSrc.isEmpty()) {
+        // restore first so that the write appends in the right place
+        // (does not hurt to do it again below)
+        currentPrependingSrc = savedPrependingSrc;
+
+        // we need to do this slightly modified bit of one of the write() cases
+        // because we want to prepend to pendingSrc rather than appending
+        // if there's no previous prependingSrc
+        if (!pendingScripts.isEmpty()) {
+            if (currentPrependingSrc) {
+                currentPrependingSrc->append(prependingSrc);
+            } else {
+                pendingSrc.prepend(prependingSrc);
+            }
+        } else {
+            m_state = state;
+            write(prependingSrc, false);
+            state = m_state;
+        }
+    }
+
+    currentPrependingSrc = savedPrependingSrc;
+
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::scriptExecution(const String& str, State state, const String& scriptURL, int baseLine)
+{
+    if (m_fragment || !m_doc->frame())
+        return state;
+    m_executingScript++;
+    DeprecatedString url = scriptURL.isNull() ? m_doc->frame()->document()->url() : scriptURL.deprecatedString();
+
+    SegmentedString *savedPrependingSrc = currentPrependingSrc;
+    SegmentedString prependingSrc;
+    currentPrependingSrc = &prependingSrc;
+
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+    if (!m_doc->ownerElement())
+        printf("beginning script execution at %d\n", m_doc->elapsedTime());
+#endif
+
+    m_state = state;
+    m_doc->frame()->loader()->executeScript(url, baseLine, str);
+    state = m_state;
+
+    state.setAllowYield(true);
+
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+    if (!m_doc->ownerElement())
+        printf("ending script execution at %d\n", m_doc->elapsedTime());
+#endif
+    
+    m_executingScript--;
+
+    if (!m_executingScript && !state.loadingExtScript()) {
+        pendingSrc.prepend(prependingSrc);        
+        src.append(pendingSrc);
+        pendingSrc.clear();
+    } else if (!prependingSrc.isEmpty()) {
+        // restore first so that the write appends in the right place
+        // (does not hurt to do it again below)
+        currentPrependingSrc = savedPrependingSrc;
+
+        // we need to do this slightly modified bit of one of the write() cases
+        // because we want to prepend to pendingSrc rather than appending
+        // if there's no previous prependingSrc
+        if (!pendingScripts.isEmpty()) {
+            if (currentPrependingSrc)
+                currentPrependingSrc->append(prependingSrc);
+            else
+                pendingSrc.prepend(prependingSrc);
+        } else {
+            m_state = state;
+            write(prependingSrc, false);
+            state = m_state;
+        }
+    }
+
+    currentPrependingSrc = savedPrependingSrc;
+
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString &src, State state)
+{
+    // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
+    checkScriptBuffer(src.length());
+    while (!src.isEmpty()) {
+        UChar ch = *src;
+        scriptCode[scriptCodeSize++] = ch;
+        if (ch == '>') {
+            bool handleBrokenComments = brokenComments && !(state.inScript() || state.inStyle());
+            int endCharsCount = 1; // start off with one for the '>' character
+            if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-') {
+                endCharsCount = 3;
+            } else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' && 
+                scriptCode[scriptCodeSize-2] == '!') {
+                // Other browsers will accept --!> as a close comment, even though it's
+                // not technically valid.
+                endCharsCount = 4;
+            }
+            if (handleBrokenComments || endCharsCount > 1) {
+                src.advancePastNonNewline();
+                if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle())) {
+                    checkScriptBuffer();
+                    scriptCode[scriptCodeSize] = 0;
+                    scriptCode[scriptCodeSize + 1] = 0;
+                    currToken.tagName = commentAtom;
+                    currToken.beginTag = true;
+                    state = processListing(SegmentedString(scriptCode, scriptCodeSize - endCharsCount), state);
+                    processToken();
+                    currToken.tagName = commentAtom;
+                    currToken.beginTag = false;
+                    processToken();
+                    scriptCodeSize = 0;
+                }
+                state.setInComment(false);
+                return state; // Finished parsing comment
+            }
+        }
+        src.advance(m_lineNumber);
+    }
+
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
+{
+    checkScriptBuffer(src.length());
+    while (!src.isEmpty()) {
+        UChar ch = *src;
+        scriptCode[scriptCodeSize++] = ch;
+        if (ch == '>' && scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
+            src.advancePastNonNewline();
+            state.setInServer(false);
+            scriptCodeSize = 0;
+            return state; // Finished parsing server include
+        }
+        src.advance(m_lineNumber);
+    }
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString &src, State state)
+{
+    UChar oldchar = 0;
+    while (!src.isEmpty()) {
+        UChar chbegin = *src;
+        if (chbegin == '\'')
+            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
+        else if (chbegin == '\"')
+            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
+        // Look for '?>'
+        // Some crappy sites omit the "?" before it, so
+        // we look for an unquoted '>' instead. (IE compatible)
+        else if (chbegin == '>' && (!tquote || oldchar == '?')) {
+            // We got a '?>' sequence
+            state.setInProcessingInstruction(false);
+            src.advancePastNonNewline();
+            state.setDiscardLF(true);
+            return state; // Finished parsing comment!
+        }
+        src.advance(m_lineNumber);
+        oldchar = chbegin;
+    }
+    
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString &src, State state)
+{
+    while (!src.isEmpty()) {
+        UChar cc = *src;
+
+        if (state.skipLF()) {
+            state.setSkipLF(false);
+            if (cc == '\n') {
+                src.advancePastNewline(m_lineNumber);
+                continue;
+            }
+        }
+
+        // do we need to enlarge the buffer?
+        checkBuffer();
+
+        if (cc == '\r') {
+            state.setSkipLF(true);
+            *dest++ = '\n';
+        } else
+            *dest++ = cc;
+        src.advance(m_lineNumber);
+    }
+
+    return state;
+}
+
+
+HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString &src, UChar*& dest, State state, unsigned &cBufferPos, bool start, bool parsingTag)
+{
+    if (start)
+    {
+        cBufferPos = 0;
+        state.setEntityState(SearchEntity);
+        EntityUnicodeValue = 0;
+    }
+
+    while(!src.isEmpty())
+    {
+        UChar cc = *src;
+        switch(state.entityState()) {
+        case NoEntity:
+            ASSERT(state.entityState() != NoEntity);
+            return state;
+        
+        case SearchEntity:
+            if (cc == '#') {
+                cBuffer[cBufferPos++] = cc;
+                src.advancePastNonNewline();
+                state.setEntityState(NumericSearch);
+            } else
+                state.setEntityState(EntityName);
+            break;
+
+        case NumericSearch:
+            if (cc == 'x' || cc == 'X') {
+                cBuffer[cBufferPos++] = cc;
+                src.advancePastNonNewline();
+                state.setEntityState(Hexadecimal);
+            } else if (cc >= '0' && cc <= '9')
+                state.setEntityState(Decimal);
+            else
+                state.setEntityState(SearchSemicolon);
+            break;
+
+        case Hexadecimal: {
+            int ll = min(src.length(), 10 - cBufferPos);
+            while (ll--) {
+                cc = *src;
+                if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
+                    state.setEntityState(SearchSemicolon);
+                    break;
+                }
+                int digit;
+                if (cc < 'A')
+                    digit = cc - '0';
+                else
+                    digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
+                EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
+                cBuffer[cBufferPos++] = cc;
+                src.advancePastNonNewline();
+            }
+            if (cBufferPos == 10)  
+                state.setEntityState(SearchSemicolon);
+            break;
+        }
+        case Decimal:
+        {
+            int ll = min(src.length(), 9-cBufferPos);
+            while(ll--) {
+                cc = *src;
+
+                if (!(cc >= '0' && cc <= '9')) {
+                    state.setEntityState(SearchSemicolon);
+                    break;
+                }
+
+                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
+                cBuffer[cBufferPos++] = cc;
+                src.advancePastNonNewline();
+            }
+            if (cBufferPos == 9)  
+                state.setEntityState(SearchSemicolon);
+            break;
+        }
+        case EntityName:
+        {
+            int ll = min(src.length(), 9-cBufferPos);
+            while(ll--) {
+                cc = *src;
+
+                if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
+                    state.setEntityState(SearchSemicolon);
+                    break;
+                }
+
+                cBuffer[cBufferPos++] = cc;
+                src.advancePastNonNewline();
+            }
+            if (cBufferPos == 9) 
+                state.setEntityState(SearchSemicolon);
+            if (state.entityState() == SearchSemicolon) {
+                if(cBufferPos > 1) {
+                    // Since the maximum length of entity name is 9,
+                    // so a single char array which is allocated on
+                    // the stack, its length is 10, should be OK.
+                    // Also if we have an illegal character, we treat it
+                    // as illegal entity name.
+                    unsigned testedEntityNameLen = 0;
+                    char tmpEntityNameBuffer[10];
+
+                    ASSERT(cBufferPos < 10);
+                    for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
+                        if (cBuffer[testedEntityNameLen] > 0x7e)
+                            break;
+                        tmpEntityNameBuffer[testedEntityNameLen] = cBuffer[testedEntityNameLen];
+                    }
+
+                    const Entity *e;
+
+                    if (testedEntityNameLen == cBufferPos)
+                        e = findEntity(tmpEntityNameBuffer, cBufferPos);
+                    else
+                        e = 0;
+
+                    if(e)
+                        EntityUnicodeValue = e->code;
+
+                    // be IE compatible
+                    if(parsingTag && EntityUnicodeValue > 255 && *src != ';')
+                        EntityUnicodeValue = 0;
+                }
+            }
+            else
+                break;
+        }
+        case SearchSemicolon:
+            // Don't allow values that are more than 21 bits.
+            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
+                if (!inViewSourceMode()) {
+                    if (*src == ';')
+                        src.advancePastNonNewline();
+                    if (EntityUnicodeValue <= 0xFFFF) {
+                        checkBuffer();
+                        src.push(fixUpChar(EntityUnicodeValue));
+                    } else {
+                        // Convert to UTF-16, using surrogate code points.
+                        checkBuffer(2);
+                        src.push(U16_LEAD(EntityUnicodeValue));
+                        src.push(U16_TRAIL(EntityUnicodeValue));
+                    }
+                } else {
+                    // FIXME: We should eventually colorize entities by sending them as a special token.
+                    checkBuffer(11);
+                    *dest++ = '&';
+                    for (unsigned i = 0; i < cBufferPos; i++)
+                        dest[i] = cBuffer[i];
+                    dest += cBufferPos;
+                    if (*src == ';') {
+                        *dest++ = ';';
+                        src.advancePastNonNewline();
+                    }
+                }
+            } else {
+                checkBuffer(10);
+                // ignore the sequence, add it to the buffer as plaintext
+                *dest++ = '&';
+                for (unsigned i = 0; i < cBufferPos; i++)
+                    dest[i] = cBuffer[i];
+                dest += cBufferPos;
+            }
+
+            state.setEntityState(NoEntity);
+            return state;
+        }
+    }
+
+    return state;
+}
+
+HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString &src, State state)
+{
+    ASSERT(!state.hasEntityState());
+
+    unsigned cBufferPos = m_cBufferPos;
+
+    bool lastIsSlash = false;
+
+    while (!src.isEmpty()) {
+        checkBuffer();
+        switch(state.tagState()) {
+        case NoTag:
+        {
+            m_cBufferPos = cBufferPos;
+            return state;
+        }
+        case TagName:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("TagName");
+#endif
+            if (searchCount > 0)
+            {
+                if (*src == commentStart[searchCount])
+                {
+                    searchCount++;
+                    if (searchCount == 4)
+                    {
+#ifdef TOKEN_DEBUG
+                        kdDebug( 6036 ) << "Found comment" << endl;
+#endif
+                        // Found '<!--' sequence
+                        src.advancePastNonNewline();
+                        dest = buffer; // ignore the previous part of this tag
+                        state.setInComment(true);
+                        state.setTagState(NoTag);
+
+                        // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
+                        // <!--> as a valid comment, since both mozilla and IE on windows
+                        // can handle this case.  Only do this in quirks mode. -dwh
+                        if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
+                          state.setInComment(false);
+                          src.advancePastNonNewline();
+                          if (!src.isEmpty())
+                              cBuffer[cBufferPos++] = *src;
+                        }
+                        else
+                          state = parseComment(src, state);
+
+                        m_cBufferPos = cBufferPos;
+                        return state; // Finished parsing tag!
+                    }
+                    cBuffer[cBufferPos++] = *src;
+                    src.advancePastNonNewline();
+                    break;
+                }
+                else
+                    searchCount = 0; // Stop looking for '<!--' sequence
+            }
+
+            bool finish = false;
+            unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
+            while (ll--) {
+                UChar curchar = *src;
+                if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
+                    finish = true;
+                    break;
+                }
+                
+                // tolower() shows up on profiles. This is faster!
+                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
+                    cBuffer[cBufferPos++] = curchar + ('a' - 'A');
+                else
+                    cBuffer[cBufferPos++] = curchar;
+                src.advancePastNonNewline();
+            }
+
+            // Disadvantage: we add the possible rest of the tag
+            // as attribute names. ### judge if this causes problems
+            if(finish || CBUFLEN == cBufferPos) {
+                bool beginTag;
+                UChar* ptr = cBuffer;
+                unsigned int len = cBufferPos;
+                cBuffer[cBufferPos] = '\0';
+                if ((cBufferPos > 0) && (*ptr == '/')) {
+                    // End Tag
+                    beginTag = false;
+                    ptr++;
+                    len--;
+                }
+                else
+                    // Start Tag
+                    beginTag = true;
+
+                // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
+                if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
+                    ptr[--len] = '\0';
+
+                // Now that we've shaved off any invalid / that might have followed the name), make the tag.
+                // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
+                if (ptr[0] != '!' || inViewSourceMode()) {
+                    currToken.tagName = AtomicString(ptr);
+                    currToken.beginTag = beginTag;
+                }
+                dest = buffer;
+                state.setTagState(SearchAttribute);
+                cBufferPos = 0;
+            }
+            break;
+        }
+        case SearchAttribute:
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("SearchAttribute");
+#endif
+            while(!src.isEmpty()) {
+                UChar curchar = *src;
+                // In this mode just ignore any quotes we encounter and treat them like spaces.
+                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
+                    if (curchar == '<' || curchar == '>')
+                        state.setTagState(SearchEnd);
+                    else
+                        state.setTagState(AttributeName);
+
+                    cBufferPos = 0;
+                    break;
+                }
+                if (inViewSourceMode())
+                    currToken.addViewSourceChar(curchar);
+                src.advance(m_lineNumber);
+            }
+            break;
+        case AttributeName:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("AttributeName");
+#endif
+            int ll = min(src.length(), CBUFLEN-cBufferPos);
+            while(ll--) {
+                UChar curchar = *src;
+                // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the 
+                // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
+                if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
+                    cBuffer[cBufferPos] = '\0';
+                    attrName = AtomicString(cBuffer);
+                    dest = buffer;
+                    *dest++ = 0;
+                    state.setTagState(SearchEqual);
+                    if (inViewSourceMode())
+                        currToken.addViewSourceChar('a');
+                    break;
+                }
+                
+                // tolower() shows up on profiles. This is faster!
+                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
+                    cBuffer[cBufferPos++] = curchar + ('a' - 'A');
+                else
+                    cBuffer[cBufferPos++] = curchar;
+                    
+                src.advance(m_lineNumber);
+            }
+            if ( cBufferPos == CBUFLEN ) {
+                cBuffer[cBufferPos] = '\0';
+                attrName = AtomicString(cBuffer);
+                dest = buffer;
+                *dest++ = 0;
+                state.setTagState(SearchEqual);
+                if (inViewSourceMode())
+                    currToken.addViewSourceChar('a');
+            }
+            break;
+        }
+        case SearchEqual:
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("SearchEqual");
+#endif
+            while(!src.isEmpty()) {
+                UChar curchar = *src;
+
+                if (lastIsSlash && curchar == '>') {
+                    // This is a quirk (with a long sad history).  We have to do this
+                    // since widgets do <script src="foo.js"/> and expect the tag to close.
+                    if (currToken.tagName == scriptTag)
+                        currToken.flat = true;
+                    currToken.brokenXMLStyle = true;
+                }
+
+                // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
+                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
+                    if (curchar == '=') {
+#ifdef TOKEN_DEBUG
+                        kdDebug(6036) << "found equal" << endl;
+#endif
+                        state.setTagState(SearchValue);
+                        if (inViewSourceMode())
+                            currToken.addViewSourceChar(curchar);
+                        src.advancePastNonNewline();
+                    } else {
+                        currToken.addAttribute(m_doc, attrName, emptyAtom, inViewSourceMode());
+                        dest = buffer;
+                        state.setTagState(SearchAttribute);
+                        lastIsSlash = false;
+                    }
+                    break;
+                }
+                if (inViewSourceMode())
+                    currToken.addViewSourceChar(curchar);
+                    
+                lastIsSlash = curchar == '/';
+
+                src.advance(m_lineNumber);
+            }
+            break;
+        case SearchValue:
+            while (!src.isEmpty()) {
+                UChar curchar = *src;
+                if (!isASCIISpace(curchar)) {
+                    if (curchar == '\'' || curchar == '\"') {
+                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
+                        state.setTagState(QuotedValue);
+                        if (inViewSourceMode())
+                            currToken.addViewSourceChar(curchar);
+                        src.advancePastNonNewline();
+                    } else
+                        state.setTagState(Value);
+
+                    break;
+                }
+                if (inViewSourceMode())
+                    currToken.addViewSourceChar(curchar);
+                src.advance(m_lineNumber);
+            }
+            break;
+        case QuotedValue:
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("QuotedValue");
+#endif
+            while (!src.isEmpty()) {
+                checkBuffer();
+
+                UChar curchar = *src;
+                if (curchar <= '>' && !src.escaped()) {
+                    if (curchar == '>' && attrName.isEmpty()) {
+                        // Handle a case like <img '>.  Just go ahead and be willing
+                        // to close the whole tag.  Don't consume the character and
+                        // just go back into SearchEnd while ignoring the whole
+                        // value.
+                        // FIXME: Note that this is actually not a very good solution.
+                        // It doesn't handle the general case of
+                        // unmatched quotes among attributes that have names. -dwh
+                        while (dest > buffer + 1 && (dest[-1] == '\n' || dest[-1] == '\r'))
+                            dest--; // remove trailing newlines
+                        AtomicString v(buffer + 1, dest - buffer - 1);
+                        attrName = v; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
+                        currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
+                        if (inViewSourceMode())
+                            currToken.addViewSourceChar('x');
+                        state.setTagState(SearchAttribute);
+                        dest = buffer;
+                        tquote = NoQuote;
+                        break;
+                    }
+                    
+                    if (curchar == '&') {
+                        src.advancePastNonNewline();
+                        state = parseEntity(src, dest, state, cBufferPos, true, true);
+                        break;
+                    }
+
+                    if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
+                        // some <input type=hidden> rely on trailing spaces. argh
+                        while (dest > buffer + 1 && (dest[-1] == '\n' || dest[-1] == '\r'))
+                            dest--; // remove trailing newlines
+                        AtomicString v(buffer + 1, dest - buffer - 1);
+                        if (attrName.isEmpty()) {
+                            attrName = v; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
+                            if (inViewSourceMode())
+                                currToken.addViewSourceChar('x');
+                        } else if (inViewSourceMode())
+                            currToken.addViewSourceChar('v');
+                        currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
+                        dest = buffer;
+                        state.setTagState(SearchAttribute);
+                        tquote = NoQuote;
+                        if (inViewSourceMode())
+                            currToken.addViewSourceChar(curchar);
+                        src.advancePastNonNewline();
+                        break;
+                    }
+                }
+
+                *dest++ = curchar;
+                src.advance(m_lineNumber);
+            }
+            break;
+        case Value:
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("Value");
+#endif
+            while(!src.isEmpty()) {
+                checkBuffer();
+                UChar curchar = *src;
+                if (curchar <= '>' && !src.escaped()) {
+                    // parse Entities
+                    if (curchar == '&') {
+                        src.advancePastNonNewline();
+                        state = parseEntity(src, dest, state, cBufferPos, true, true);
+                        break;
+                    }
+                    // no quotes. Every space means end of value
+                    // '/' does not delimit in IE!
+                    if (isASCIISpace(curchar) || curchar == '>') {
+                        AtomicString v(buffer+1, dest-buffer-1);
+                        currToken.addAttribute(m_doc, attrName, v, inViewSourceMode());
+                        if (inViewSourceMode())
+                            currToken.addViewSourceChar('v');
+                        dest = buffer;
+                        state.setTagState(SearchAttribute);
+                        break;
+                    }
+                }
+
+                *dest++ = curchar;
+                src.advance(m_lineNumber);
+            }
+            break;
+        case SearchEnd:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+                qDebug("SearchEnd");
+#endif
+            while (!src.isEmpty()) {
+                UChar ch = *src;
+                if (ch == '>' || ch == '<')
+                    break;
+                if (ch == '/')
+                    currToken.flat = true;
+                if (inViewSourceMode())
+                    currToken.addViewSourceChar(ch);
+                src.advance(m_lineNumber);
+            }
+            if (src.isEmpty()) break;
+
+            searchCount = 0; // Stop looking for '<!--' sequence
+            state.setTagState(NoTag);
+            tquote = NoQuote;
+
+            if (*src != '<')
+                src.advance(m_lineNumber);
+
+            if (currToken.tagName == nullAtom) { //stop if tag is unknown
+                m_cBufferPos = cBufferPos;
+                return state;
+            }
+
+            AtomicString tagName = currToken.tagName;
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
+            kdDebug( 6036 ) << "appending Tag: " << tagName.deprecatedString() << endl;
+#endif
+
+            // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
+            // compatibility.
+            bool isSelfClosingScript = currToken.flat && currToken.beginTag && currToken.tagName == scriptTag;
+            bool beginTag = !currToken.flat && currToken.beginTag;
+            if (currToken.beginTag && currToken.tagName == scriptTag && !inViewSourceMode() && !parser->skipMode()) {
+                Attribute* a = 0;
+                scriptSrc = String();
+                scriptSrcCharset = String();
+                if (currToken.attrs && !m_fragment) {
+                    Settings* settings = m_doc->settings();
+                    if (m_doc->frame() && m_doc->frame()->scriptBridge()->isEnabled()) {
+                        if ((a = currToken.attrs->getAttributeItem(srcAttr)))
+                            scriptSrc = m_doc->completeURL(parseURL(a->value()));
+                    }
+                }
+            }
+
+            RefPtr<Node> n = processToken();
+            m_cBufferPos = cBufferPos;
+            if (n) {
+                if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
+                    if (beginTag)
+                        state.setDiscardLF(true); // Discard the first LF after we open a pre.
+                } else if (tagName == scriptTag && n) {
+                    ASSERT(!scriptNode);
+                    scriptNode = n;
+                    scriptSrcCharset = static_cast<HTMLScriptElement*>(n.get())->scriptCharset();
+                    if (beginTag) {
+                        searchStopper = scriptEnd;
+                        searchStopperLen = 8;
+                        state.setInScript(true);
+                        state = parseSpecial(src, state);
+                    } else if (isSelfClosingScript) { // Handle <script src="foo"/>
+                        state.setInScript(true);
+                        state = scriptHandler(state);
+                    }
+                } else if (tagName == styleTag) {
+                    if (beginTag) {
+                        searchStopper = styleEnd;
+                        searchStopperLen = 7;
+                        state.setInStyle(true);
+                        state = parseSpecial(src, state);
+                    }
+                } else if (tagName == textareaTag) {
+                    if (beginTag) {
+                        searchStopper = textareaEnd;
+                        searchStopperLen = 10;
+                        state.setInTextArea(true);
+                        state = parseSpecial(src, state);
+                    }
+                } else if (tagName == titleTag) {
+                    if (beginTag) {
+                        searchStopper = titleEnd;
+                        searchStopperLen = 7;
+                        State savedState = state;
+                        SegmentedString savedSrc = src;
+                        long savedLineno = m_lineNumber;
+                        state.setInTitle(true);
+                        state = parseSpecial(src, state);
+                        if (state.inTitle() && src.isEmpty()) {
+                            // We just ate the rest of the document as the title #text node!
+                            // Reset the state then retokenize without special title handling.
+                            // Let the parser clean up the missing </title> tag.
+                            // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
+                            // at the end of the document unless noMoreData is also true. We need
+                            // to detect this case elsewhere, and save the state somewhere other
+                            // than a local variable.
+                            state = savedState;
+                            src = savedSrc;
+                            m_lineNumber = savedLineno;
+                            scriptCodeSize = 0;
+                        }
+                    }
+                } else if (tagName == xmpTag) {
+                    if (beginTag) {
+                        searchStopper = xmpEnd;
+                        searchStopperLen = 5;
+                        state.setInXmp(true);
+                        state = parseSpecial(src, state);
+                    }
+                }
+            }
+            if (tagName == plaintextTag)
+                state.setInPlainText(beginTag);
+            return state; // Finished parsing tag!
+        }
+        } // end switch
+    }
+    m_cBufferPos = cBufferPos;
+    return state;
+}
+
+inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
+{
+    // We don't want to be checking elapsed time with every character, so we only check after we've
+    // processed a certain number of characters.
+    bool allowedYield = state.allowYield();
+    state.setAllowYield(false);
+    if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > TOKENIZER_CHUNK_SIZE || allowedYield)) {
+        processedCount = 0;
+        if (currentTime() - startTime > tokenizerTimeDelay) {
+            /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
+               load, but this hurts overall performance on slower machines.  For now turn this
+               off.
+            || (!m_doc->haveStylesheetsLoaded() && 
+                (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
+            // Schedule the timer to keep processing as soon as possible.
+            m_timer.startOneShot(0);
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+            if (currentTime() - startTime > tokenizerTimeDelay)
+                printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
+#endif
+            return false;
+        }
+    }
+    
+    processedCount++;
+    return true;
+}
+
+bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)
+{
+#ifdef TOKEN_DEBUG
+    kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
+#endif
+
+    if (!buffer)
+        return false;
+    
+    if (m_parserStopped)
+        return false;
+
+    SegmentedString source(str);
+    if (m_executingScript)
+        source.setExcludeLineNumbers();
+
+    if ((m_executingScript && appendData) || !pendingScripts.isEmpty()) {
+        // don't parse; we will do this later
+        if (currentPrependingSrc)
+            currentPrependingSrc->append(source);
+        else
+            pendingSrc.append(source);
+        return false;
+    }
+
+    if (!src.isEmpty())
+        src.append(source);
+    else
+        setSrc(source);
+
+    // Once a timer is set, it has control of when the tokenizer continues.
+    if (m_timer.isActive())
+        return false;
+
+    bool wasInWrite = inWrite;
+    inWrite = true;
+    
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+    if (!m_doc->ownerElement())
+        printf("Beginning write at time %d\n", m_doc->elapsedTime());
+#endif
+    
+    int processedCount = 0;
+    double startTime = currentTime();
+
+    Frame *frame = m_doc->frame();
+
+    State state = m_state;
+
+    while (!src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {
+        if (!continueProcessing(processedCount, startTime, state))
+            break;
+
+        // do we need to enlarge the buffer?
+        checkBuffer();
+
+        UChar cc = *src;
+
+        bool wasSkipLF = state.skipLF();
+        if (wasSkipLF)
+            state.setSkipLF(false);
+
+        if (wasSkipLF && (cc == '\n'))
+            src.advance();
+        else if (state.needsSpecialWriteHandling()) {
+            // it's important to keep needsSpecialWriteHandling with the flags this block tests
+            if (state.hasEntityState())
+                state = parseEntity(src, dest, state, m_cBufferPos, false, state.hasTagState());
+            else if (state.inPlainText())
+                state = parseText(src, state);
+            else if (state.inAnySpecial())
+                state = parseSpecial(src, state);
+            else if (state.inComment())
+                state = parseComment(src, state);
+            else if (state.inServer())
+                state = parseServer(src, state);
+            else if (state.inProcessingInstruction())
+                state = parseProcessingInstruction(src, state);
+            else if (state.hasTagState())
+                state = parseTag(src, state);
+            else if (state.startTag()) {
+                state.setStartTag(false);
+                
+                switch(cc) {
+                case '/':
+                    break;
+                case '!': {
+                    // <!-- comment -->
+                    searchCount = 1; // Look for '<!--' sequence to start comment
+                    
+                    break;
+                }
+                case '?': {
+                    // xml processing instruction
+                    state.setInProcessingInstruction(true);
+                    tquote = NoQuote;
+                    state = parseProcessingInstruction(src, state);
+                    continue;
+
+                    break;
+                }
+                case '%':
+                    if (!brokenServer) {
+                        // <% server stuff, handle as comment %>
+                        state.setInServer(true);
+                        tquote = NoQuote;
+                        state = parseServer(src, state);
+                        continue;
+                    }
+                    // else fall through
+                default: {
+                    if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
+                        // Start of a Start-Tag
+                    } else {
+                        // Invalid tag
+                        // Add as is
+                        *dest = '<';
+                        dest++;
+                        continue;
+                    }
+                }
+                }; // end case
+
+                processToken();
+
+                m_cBufferPos = 0;
+                state.setTagState(TagName);
+                state = parseTag(src, state);
+            }
+        } else if (cc == '&' && !src.escaped()) {
+            src.advancePastNonNewline();
+            state = parseEntity(src, dest, state, m_cBufferPos, true, state.hasTagState());
+        } else if (cc == '<' && !src.escaped()) {
+            tagStartLineno = m_lineNumber;
+            src.advancePastNonNewline();
+            state.setStartTag(true);
+        } else if (cc == '\n' || cc == '\r') {
+            if (state.discardLF())
+                // Ignore this LF
+                state.setDiscardLF(false); // We have discarded 1 LF
+            else {
+                // Process this LF
+                *dest++ = '\n';
+                if (cc == '\r' && !src.excludeLineNumbers())
+                    m_lineNumber++;
+            }
+
+            /* Check for MS-DOS CRLF sequence */
+            if (cc == '\r')
+                state.setSkipLF(true);
+            src.advance(m_lineNumber);
+        } else {
+            state.setDiscardLF(false);
+            *dest++ = cc;
+            src.advancePastNonNewline();
+        }
+    }
+    
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+    if (!m_doc->ownerElement())
+        printf("Ending write at time %d\n", m_doc->elapsedTime());
+#endif
+    
+    inWrite = wasInWrite;
+
+    m_state = state;
+
+    if (noMoreData && !inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {
+        end(); // this actually causes us to be deleted
+        return true;
+    }
+    return false;
+}
+
+void HTMLTokenizer::stopParsing()
+{
+    Tokenizer::stopParsing();
+    m_timer.stop();
+
+    // The part needs to know that the tokenizer has finished with its data,
+    // regardless of whether it happened naturally or due to manual intervention.
+    if (!m_fragment && m_doc->frame())
+        m_doc->frame()->loader()->tokenizerProcessedData();
+}
+
+bool HTMLTokenizer::processingData() const
+{
+    return m_timer.isActive() || inWrite;
+}
+
+void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
+{
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+    if (!m_doc->ownerElement())
+        printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
+#endif
+
+    if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
+        // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
+        // timer has higher priority than our timer.
+        m_timer.startOneShot(0);
+        return;
+    }
+
+    // Invoke write() as though more data came in. This might cause us to get deleted.
+    write(SegmentedString(), true);
+}
+
+void HTMLTokenizer::end()
+{
+    ASSERT(!m_timer.isActive());
+    m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
+
+    if (buffer) {
+        // parseTag is using the buffer for different matters
+        if (!m_state.hasTagState())
+            processToken();
+
+        fastFree(scriptCode);
+        scriptCode = 0;
+        scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+
+        fastFree(buffer);
+        buffer = 0;
+    }
+
+    if (!inViewSourceMode())
+        parser->finished();
+    else
+        m_doc->finishedParsing();
+}
+
+void HTMLTokenizer::finish()
+{
+    // do this as long as we don't find matching comment ends
+    while((m_state.inComment() || m_state.inServer()) && scriptCode && scriptCodeSize) {
+        // we've found an unmatched comment start
+        if (m_state.inComment())
+            brokenComments = true;
+        else
+            brokenServer = true;
+        checkScriptBuffer();
+        scriptCode[scriptCodeSize] = 0;
+        scriptCode[scriptCodeSize + 1] = 0;
+        int pos;
+        String food;
+        if (m_state.inScript() || m_state.inStyle())
+            food = String(scriptCode, scriptCodeSize);
+        else if (m_state.inServer()) {
+            food = "<";
+            food.append(String(scriptCode, scriptCodeSize));
+        } else {
+            pos = DeprecatedConstString(reinterpret_cast<DeprecatedChar*>(scriptCode), scriptCodeSize).string().find('>');
+            food = String(scriptCode + pos + 1, scriptCodeSize - pos - 1);
+        }
+        fastFree(scriptCode);
+        scriptCode = 0;
+        scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+        m_state.setInComment(false);
+        m_state.setInServer(false);
+        if (!food.isEmpty())
+            write(food, true);
+    }
+    // this indicates we will not receive any more data... but if we are waiting on
+    // an external script to load, we can't finish parsing until that is done
+    noMoreData = true;
+    if (!inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
+        end(); // this actually causes us to be deleted
+}
+
+PassRefPtr<Node> HTMLTokenizer::processToken()
+{
+    JSBridge* jsProxy = (!m_fragment && m_doc->frame()) ? m_doc->frame()->scriptBridge() : 0;
+    if (jsProxy && m_doc->frame()->scriptBridge()->isEnabled())
+        jsProxy->setEventHandlerLineno(tagStartLineno);
+    if (dest > buffer) {
+#ifdef TOKEN_DEBUG
+        if(currToken.tagName.length()) {
+            qDebug( "unexpected token: %s, str: *%s*", currToken.tagName.deprecatedString().latin1(),DeprecatedConstString( buffer,dest-buffer ).deprecatedString().latin1() );
+            ASSERT(0);
+        }
+
+#endif
+        currToken.text = StringImpl::createStrippingNullCharacters(buffer, dest - buffer);
+        if (currToken.tagName != commentAtom)
+            currToken.tagName = textAtom;
+    } else if (currToken.tagName == nullAtom) {
+        currToken.reset();
+        if (jsProxy)
+            jsProxy->setEventHandlerLineno(m_lineNumber);
+        return 0;
+    }
+
+    dest = buffer;
+
+#ifdef TOKEN_DEBUG
+    DeprecatedString name = currToken.tagName.deprecatedString();
+    DeprecatedString text;
+    if(currToken.text)
+        text = DeprecatedConstString(currToken.text->unicode(), currToken.text->length()).deprecatedString();
+
+    kdDebug( 6036 ) << "Token --> " << name << endl;
+    if (currToken.flat)
+        kdDebug( 6036 ) << "Token is FLAT!" << endl;
+    if(!text.isNull())
+        kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
+    unsigned l = currToken.attrs ? currToken.attrs->length() : 0;
+    if(l) {
+        kdDebug( 6036 ) << "Attributes: " << l << endl;
+        for (unsigned i = 0; i < l; ++i) {
+            Attribute* c = currToken.attrs->attributeItem(i);
+            kdDebug( 6036 ) << "    " << c->localName().deprecatedString()
+                            << "=\"" << c->value().deprecatedString() << "\"" << endl;
+        }
+    }
+    kdDebug( 6036 ) << endl;
+#endif
+
+    RefPtr<Node> n;
+    
+    if (!m_parserStopped) {
+        if (inViewSourceMode())
+            static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&currToken);
+        else
+            // pass the token over to the parser, the parser DOES NOT delete the token
+            n = parser->parseToken(&currToken);
+    }
+    currToken.reset();
+    if (jsProxy)
+        jsProxy->setEventHandlerLineno(0);
+
+    return n.release();
+}
+
+HTMLTokenizer::~HTMLTokenizer()
+{
+    ASSERT(!inWrite);
+    reset();
+    delete parser;
+}
+
+
+void HTMLTokenizer::enlargeBuffer(int len)
+{
+    int newSize = max(size * 2, size + len);
+    int oldOffset = dest - buffer;
+    buffer = static_cast<UChar*>(fastRealloc(buffer, newSize * sizeof(UChar)));
+    dest = buffer + oldOffset;
+    size = newSize;
+}
+
+void HTMLTokenizer::enlargeScriptBuffer(int len)
+{
+    int newSize = max(scriptCodeMaxSize * 2, scriptCodeMaxSize + len);
+    scriptCode = static_cast<UChar*>(fastRealloc(scriptCode, newSize * sizeof(UChar)));
+    scriptCodeMaxSize = newSize;
+}
+    
+void HTMLTokenizer::executeScriptsWaitingForStylesheets()
+{
+    ASSERT(m_doc->haveStylesheetsLoaded());
+
+    if (m_hasScriptsWaitingForStylesheets)
+        notifyFinished(0);
+}
+
+void HTMLTokenizer::notifyFinished(CachedResource*)
+{
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+    if (!m_doc->ownerElement())
+        printf("script loaded at %d\n", m_doc->elapsedTime());
+#endif
+
+    ASSERT(!pendingScripts.isEmpty());
+
+    // Make scripts loaded from file URLs wait for stylesheets to match Tiger behavior where
+    // file loads were serialized in lower level.
+    // FIXME: this should really be done for all script loads or the same effect should be achieved by other
+    // means, like javascript suspend/resume
+    m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded() && pendingScripts.head()->url().startsWith("file:", false);
+    if (m_hasScriptsWaitingForStylesheets)
+        return;
+
+    bool finished = false;
+    while (!finished && pendingScripts.head()->isLoaded()) {
+#ifdef TOKEN_DEBUG
+        kdDebug( 6036 ) << "Finished loading an external script" << endl;
+#endif
+        CachedScript* cs = pendingScripts.dequeue();
+        ASSERT(cache()->disabled() || cs->accessCount() > 0);
+
+        String scriptSource = cs->script();
+#ifdef TOKEN_DEBUG
+        kdDebug( 6036 ) << "External script is:" << endl << scriptSource.deprecatedString() << endl;
+#endif
+        setSrc(SegmentedString());
+
+        // make sure we forget about the script before we execute the new one
+        // infinite recursion might happen otherwise
+        String cachedScriptUrl(cs->url());
+        bool errorOccurred = cs->errorOccurred();
+        cs->deref(this);
+        RefPtr<Node> n = scriptNode.release();
+
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+        if (!m_doc->ownerElement())
+            printf("external script beginning execution at %d\n", m_doc->elapsedTime());
+#endif
+
+        if (errorOccurred)
+            EventTargetNodeCast(n.get())->dispatchHTMLEvent(errorEvent, true, false);
+        else {
+            if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
+                m_state = scriptExecution(scriptSource, m_state, cachedScriptUrl);
+            EventTargetNodeCast(n.get())->dispatchHTMLEvent(loadEvent, false, false);
+        }
+
+        // The state of pendingScripts.isEmpty() can change inside the scriptExecution()
+        // call above, so test afterwards.
+        finished = pendingScripts.isEmpty();
+        if (finished) {
+            m_state.setLoadingExtScript(false);
+#ifdef INSTRUMENT_LAYOUT_SCHEDULING
+            if (!m_doc->ownerElement())
+                printf("external script finished execution at %d\n", m_doc->elapsedTime());
+#endif
+        }
+
+        // 'm_requestingScript' is true when we are called synchronously from
+        // scriptHandler(). In that case scriptHandler() will take care
+        // of pendingSrc.
+        if (!m_requestingScript) {
+            SegmentedString rest = pendingSrc;
+            pendingSrc.clear();
+            write(rest, false);
+            // we might be deleted at this point, do not access any members.
+        }
+    }
+}
+
+bool HTMLTokenizer::isWaitingForScripts() const
+{
+    return m_state.loadingExtScript();
+}
+
+void HTMLTokenizer::setSrc(const SegmentedString &source)
+{
+    src = source;
+}
+
+void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)
+{
+    HTMLTokenizer tok(fragment);
+    tok.setForceSynchronous(true);
+    tok.write(source, true);
+    tok.finish();
+    ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
+}
+
+UChar decodeNamedEntity(const char* name)
+{
+    const Entity* e = findEntity(name, strlen(name));
+    return e ? e->code : 0;
+}
+
+}
author	initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98>	2008-07-27 00:20:51 +0000
committer	initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98>	2008-07-27 00:20:51 +0000
commit	f5b16fed647e941aa66933178da85db2860d639b (patch)
tree	f00e9856c04aad3b558a140955e7674add33f051 /webkit/pending/HTMLTokenizer.cpp
parent	920c091ac3ee15079194c82ae8a7a18215f3f23c (diff)
download	chromium_src-f5b16fed647e941aa66933178da85db2860d639b.zip chromium_src-f5b16fed647e941aa66933178da85db2860d639b.tar.gz chromium_src-f5b16fed647e941aa66933178da85db2860d639b.tar.bz2