Copy the KDE 3.5 branch to branches/trinity for new KDE 3.5 features.

BUG:215923 git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/kdelibs@1054174 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
author: toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2009-11-25 17:56:58 +0000
committer: toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> 2009-11-25 17:56:58 +0000
commit: ce4a32fe52ef09d8f5ff1dd22c001110902b60a2 (patch)
tree: 5ac38a06f3dde268dc7927dc155896926aaf7012 /khtml/html/htmltokenizer.cpp
download: tdelibs-ce4a32fe52ef09d8f5ff1dd22c001110902b60a2.tar.gz
tdelibs-ce4a32fe52ef09d8f5ff1dd22c001110902b60a2.zip
1 files changed, 1798 insertions, 0 deletions
diff --git a/khtml/html/htmltokenizer.cpp b/khtml/html/htmltokenizer.cpp
new file mode 100644
index 000000000..e0983582a
--- /dev/null
+++ b/khtml/html/htmltokenizer.cpp
@@ -0,0 +1,1798 @@
+/*
+    This file is part of the KDE libraries
+
+    Copyright (C) 1997 Martin Jones (mjones@kde.org)
+              (C) 1997 Torben Weis (weis@kde.org)
+              (C) 1998 Waldo Bastian (bastian@kde.org)
+              (C) 1999 Lars Knoll (knoll@kde.org)
+              (C) 1999 Antti Koivisto (koivisto@kde.org)
+              (C) 2001-2003 Dirk Mueller (mueller@kde.org)
+              (C) 2004 Apple Computer, Inc.
+              (C) 2006 Germain Garand (germain@ebooksfrance.org)
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Library General Public
+    License as published by the Free Software Foundation; either
+    version 2 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Library General Public License for more details.
+
+    You should have received a copy of the GNU Library General Public License
+    along with this library; see the file COPYING.LIB.  If not, write to
+    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+    Boston, MA 02110-1301, USA.
+*/
+//----------------------------------------------------------------------------
+//
+// KDE HTML Widget - Tokenizers
+
+//#define TOKEN_DEBUG 1
+//#define TOKEN_DEBUG 2
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "html/htmltokenizer.h"
+#include "html/html_documentimpl.h"
+#include "html/htmlparser.h"
+#include "html/dtd.h"
+
+#include "misc/loader.h"
+#include "misc/htmlhashes.h"
+
+#include "khtmlview.h"
+#include "khtml_part.h"
+#include "xml/dom_docimpl.h"
+#include "css/csshelper.h"
+#include "ecma/kjs_proxy.h"
+#include <kcharsets.h>
+#include <kglobal.h>
+#include <ctype.h>
+#include <assert.h>
+#include <qvariant.h>
+#include <kdebug.h>
+#include <stdlib.h>
+
+#include "kentities.c"
+
+using namespace khtml;
+
+static const QChar commentStart [] = { '<','!','-','-', QChar::null };
+
+static const char scriptEnd [] = "</script";
+static const char xmpEnd [] = "</xmp";
+static const char styleEnd [] =  "</style";
+static const char textareaEnd [] = "</textarea";
+static const char titleEnd [] = "</title";
+
+#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
+#define KHTML_REALLOC_QCHAR_VEC(P, N ) (QChar*) realloc(P, sizeof(QChar)*( N ))
+#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
+
+// Full support for MS Windows extensions to Latin-1.
+// Technically these extensions should only be activated for pages
+// marked "windows-1252" or "cp1252", but
+// in the standard Microsoft way, these extensions infect hundreds of thousands
+// of web pages.  Note that people with non-latin-1 Microsoft extensions
+// are SOL.
+//
+// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
+//      http://www.bbsinc.com/iso8859.html
+//      http://www.obviously.com/
+//
+// There may be better equivalents
+#if 0
+#define fixUpChar(x)
+#else
+#define fixUpChar(x) \
+            switch ((x).unicode()) \
+            { \
+            case 0x80: (x) = 0x20ac; break; \
+            case 0x82: (x) = 0x201a;    break; \
+            case 0x83: (x) = 0x0192; break; \
+            case 0x84: (x) = 0x201e;    break; \
+            case 0x85: (x) = 0x2026; break; \
+            case 0x86: (x) = 0x2020; break; \
+            case 0x87: (x) = 0x2021; break; \
+            case 0x88: (x) = 0x02C6; break; \
+            case 0x89: (x) = 0x2030; break; \
+            case 0x8A: (x) = 0x0160; break; \
+            case 0x8b: (x) = 0x2039;    break; \
+            case 0x8C: (x) = 0x0152; break; \
+            case 0x8E: (x) = 0x017D; break; \
+            case 0x91: (x) = 0x2018;   break; \
+            case 0x92: (x) = 0x2019;   break; \
+            case 0x93: (x) = 0x201C;    break; \
+            case 0x94: (x) = 0X201D;    break; \
+            case 0x95: (x) = 0x2022;    break; \
+            case 0x96: (x) = 0x2013;    break; \
+            case 0x97: (x) = 0x2014;    break; \
+            case 0x98: (x) = 0x02DC;    break; \
+            case 0x99: (x) = 0x2122; break; \
+            case 0x9A: (x) = 0x0161; break; \
+            case 0x9b: (x) = 0x203A;    break; \
+            case 0x9C: (x) = 0x0153; break; \
+            case 0x9E: (x) = 0x017E; break; \
+            case 0x9F: (x) = 0x0178; break; \
+            default: break; \
+            }
+#endif
+// ----------------------------------------------------------------------------
+
+HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, KHTMLView *_view)
+{
+    view = _view;
+    buffer = 0;
+    scriptCode = 0;
+    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+    charsets = KGlobal::charsets();
+    parser = new KHTMLParser(_view, _doc);
+    m_executingScript = 0;
+    m_autoCloseTimer = 0;
+    onHold = false;
+
+    reset();
+}
+
+HTMLTokenizer::HTMLTokenizer(DOM::DocumentImpl *_doc, DOM::DocumentFragmentImpl *i)
+{
+    view = 0;
+    buffer = 0;
+    scriptCode = 0;
+    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+    charsets = KGlobal::charsets();
+    parser = new KHTMLParser( i, _doc );
+    m_executingScript = 0;
+    m_autoCloseTimer = 0;
+    onHold = false;
+
+    reset();
+}
+
+void HTMLTokenizer::reset()
+{
+    assert(m_executingScript == 0);
+    Q_ASSERT(onHold == false);
+    m_abort = false;
+
+    while (!cachedScript.isEmpty())
+        cachedScript.dequeue()->deref(this);
+
+    if ( buffer )
+        KHTML_DELETE_QCHAR_VEC(buffer);
+    buffer = dest = 0;
+    size = 0;
+
+    if ( scriptCode )
+        KHTML_DELETE_QCHAR_VEC(scriptCode);
+    scriptCode = 0;
+    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+
+    if (m_autoCloseTimer) {
+        killTimer(m_autoCloseTimer);
+        m_autoCloseTimer = 0;
+    }
+
+    currToken.reset();
+}
+
+void HTMLTokenizer::begin()
+{
+    m_executingScript = 0;
+    onHold = false;
+    reset();
+    size = 254;
+    buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
+    dest = buffer;
+    tag = NoTag;
+    pending = NonePending;
+    discard = NoneDiscard;
+    pre = false;
+    prePos = 0;
+    plaintext = false;
+    xmp = false;
+    processingInstruction = false;
+    script = false;
+    escaped = false;
+    style = false;
+    skipLF = false;
+    select = false;
+    comment = false;
+    server = false;
+    textarea = false;
+    title = false;
+    startTag = false;
+    tquote = NoQuote;
+    searchCount = 0;
+    Entity = NoEntity;
+    noMoreData = false;
+    brokenComments = false;
+    brokenServer = false;
+    brokenScript = false;
+    lineno = 0;
+    scriptStartLineno = 0;
+    tagStartLineno = 0;
+}
+
+void HTMLTokenizer::processListing(TokenizerString list)
+{
+    bool old_pre = pre;
+
+    // This function adds the listing 'list' as
+    // preformatted text-tokens to the token-collection
+    // thereby converting TABs.
+    if(!style) pre = true;
+    prePos = 0;
+
+    while ( !list.isEmpty() )
+    {
+        checkBuffer(3*TAB_SIZE);
+
+        if (skipLF && ( *list != '\n' ))
+        {
+            skipLF = false;
+        }
+
+        if (skipLF)
+        {
+            skipLF = false;
+            ++list;
+        }
+        else if (( *list == '\n' ) || ( *list == '\r' ))
+        {
+            if (discard == LFDiscard)
+            {
+                // Ignore this LF
+                discard = NoneDiscard; // We have discarded 1 LF
+            }
+            else
+            {
+                // Process this LF
+                if (pending)
+                    addPending();
+
+                // we used to do it not at all and we want to have
+                // it fixed for textarea. So here we are
+                if ( textarea ) {
+                    prePos++;
+                    *dest++ = *list;
+                } else
+                    pending = LFPending;
+            }
+            /* Check for MS-DOS CRLF sequence */
+            if (*list == '\r')
+            {
+                skipLF = true;
+            }
+            ++list;
+        }
+        else if (( *list == ' ' ) || ( *list == '\t'))
+        {
+            if (pending)
+                addPending();
+            if (*list == ' ')
+                pending = SpacePending;
+            else
+                pending = TabPending;
+
+            ++list;
+        }
+        else
+        {
+            discard = NoneDiscard;
+            if (pending)
+                addPending();
+
+            prePos++;
+            *dest++ = *list;
+            ++list;
+        }
+
+    }
+
+    if ((pending == SpacePending) || (pending == TabPending))
+        addPending();
+    else
+        pending = NonePending;
+
+    prePos = 0;
+    pre = old_pre;
+}
+
+void HTMLTokenizer::parseSpecial(TokenizerString &src)
+{
+    assert( textarea || title || !Entity );
+    assert( !tag );
+    assert( xmp+textarea+title+style+script == 1 );
+    if (script)
+        scriptStartLineno = lineno+src.lineCount();
+
+    if ( comment ) parseComment( src );
+
+    while ( !src.isEmpty() ) {
+        checkScriptBuffer();
+        unsigned char ch = src->latin1();
+        if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
+            comment = true;
+            scriptCode[ scriptCodeSize++ ] = ch;
+            ++src;
+            parseComment( src );
+            continue;
+        }
+        if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
+            ++src;
+            scriptCodeSize = scriptCodeResync-1;
+            scriptCodeResync = 0;
+            scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
+            if ( script )
+                scriptHandler();
+            else {
+                processListing(TokenizerString(scriptCode, scriptCodeSize));
+                processToken();
+                if ( style )         { currToken.tid = ID_STYLE + ID_CLOSE_TAG; }
+                else if ( textarea ) { currToken.tid = ID_TEXTAREA + ID_CLOSE_TAG; }
+                else if ( title ) { currToken.tid = ID_TITLE + ID_CLOSE_TAG; }
+                else if ( xmp )  { currToken.tid = ID_XMP + ID_CLOSE_TAG; }
+                processToken();
+                script = style = textarea = title = xmp = false;
+                tquote = NoQuote;
+                scriptCodeSize = scriptCodeResync = 0;
+            }
+            return;
+        }
+        // possible end of tagname, lets check.
+        if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
+             scriptCodeSize >= searchStopperLen &&
+             !QConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) {
+            scriptCodeResync = scriptCodeSize-searchStopperLen+1;
+            tquote = NoQuote;
+            continue;
+        }
+        if ( scriptCodeResync && !escaped ) {
+            if(ch == '\"')
+                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
+            else if(ch == '\'')
+                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
+            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
+                tquote = NoQuote;
+        }
+        escaped = ( !escaped && ch == '\\' );
+        if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
+            QChar *scriptCodeDest = scriptCode+scriptCodeSize;
+            ++src;
+            parseEntity(src,scriptCodeDest,true);
+            scriptCodeSize = scriptCodeDest-scriptCode;
+        }
+        else {
+            scriptCode[ scriptCodeSize++ ] = *src;
+            ++src;
+        }
+    }
+}
+
+void HTMLTokenizer::scriptHandler()
+{
+    QString currentScriptSrc = scriptSrc;
+    scriptSrc = QString::null;
+
+    processListing(TokenizerString(scriptCode, scriptCodeSize));
+    QString exScript( buffer, dest-buffer );
+
+    processToken();
+    currToken.tid = ID_SCRIPT + ID_CLOSE_TAG;
+    processToken();
+
+    // Scripts following a frameset element should not be executed or even loaded in the case of extern scripts.
+    bool followingFrameset = (parser->doc()->body() && parser->doc()->body()->id() == ID_FRAMESET);
+    bool effectiveScript = !parser->skipMode() && !followingFrameset;
+    bool deferredScript = false;
+
+    if ( effectiveScript ) {
+        CachedScript* cs = 0;
+
+        // forget what we just got, load from src url instead
+        if ( !currentScriptSrc.isEmpty() && javascript &&
+             (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) )) {
+            cachedScript.enqueue(cs);
+        }
+
+        if (cs) {
+            pendingQueue.push(src);
+            uint scriptCount = cachedScript.count();
+            setSrc(TokenizerString());
+            scriptCodeSize = scriptCodeResync = 0;
+            cs->ref(this);
+            if (cachedScript.count() == scriptCount)
+                deferredScript = true;
+        }
+        else if (currentScriptSrc.isEmpty() && view && javascript ) {
+            pendingQueue.push(src);
+            setSrc(TokenizerString());
+            scriptCodeSize = scriptCodeResync = 0;
+            scriptExecution( exScript, QString::null, tagStartLineno /*scriptStartLineno*/ );
+        } else {
+            // script was filtered or disallowed
+            effectiveScript = false;
+        }
+    }
+
+    script = false;
+    scriptCodeSize = scriptCodeResync = 0;
+
+    if ( !effectiveScript )
+        return;
+
+    if ( !m_executingScript && cachedScript.isEmpty() ) {
+        src.append(pendingQueue.pop());
+    } else if ( cachedScript.isEmpty() ) {
+        write( pendingQueue.pop(), false );
+    } else if ( !deferredScript && pendingQueue.count() > 1) {
+        TokenizerString t = pendingQueue.pop();
+        pendingQueue.top().prepend( t );
+    }
+}
+
+void HTMLTokenizer::scriptExecution( const QString& str, const QString& scriptURL,
+                                     int baseLine)
+{
+    bool oldscript = script;
+    m_executingScript++;
+    script = false;
+    QString url;
+    if (scriptURL.isNull() && view)
+      url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL().url();
+    else
+      url = scriptURL;
+
+    if (view)
+	view->part()->executeScript(url,baseLine+1,Node(),str);
+    m_executingScript--;
+    script = oldscript;
+}
+
+void HTMLTokenizer::parseComment(TokenizerString &src)
+{
+    // SGML strict
+    bool strict = parser->doc()->inStrictMode() && parser->doc()->htmlMode() != DocumentImpl::XHtml && !script && !style;
+    int delimiterCount = 0;
+    bool canClose = false;
+
+    checkScriptBuffer(src.length());
+    while ( src.length() ) {
+        scriptCode[ scriptCodeSize++ ] = *src;
+
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+        qDebug("comment is now: *%s*", src.toString().left(16).latin1());
+#endif
+
+        if (strict)
+        {
+            if (src->unicode() == '-') {
+                delimiterCount++;
+                if (delimiterCount == 2) {
+                    delimiterCount = 0;
+                    canClose = !canClose;
+                }
+            }
+            else
+                delimiterCount = 0;
+        }
+
+        if ((!strict || canClose) && src->unicode() == '>')
+        {
+            bool handleBrokenComments =  brokenComments && !( script || style );
+            bool scriptEnd=false;
+            if (!strict)
+            {
+                if ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
+                     scriptCode[scriptCodeSize-2] == '-' )
+                    scriptEnd=true;
+            }
+
+            if (canClose || handleBrokenComments || scriptEnd ){
+                ++src;
+                if ( !( title || script || xmp || textarea || style) ) {
+#ifdef COMMENTS_IN_DOM
+                    checkScriptBuffer();
+                    scriptCode[ scriptCodeSize ] = 0;
+                    scriptCode[ scriptCodeSize + 1 ] = 0;
+                    currToken.tid = ID_COMMENT;
+                    processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
+                    processToken();
+                    currToken.tid = ID_COMMENT + ID_CLOSE_TAG;
+                    processToken();
+#endif
+                    scriptCodeSize = 0;
+                }
+                comment = false;
+                return; // Finished parsing comment
+            }
+        }
+        ++src;
+    }
+}
+
+void HTMLTokenizer::parseServer(TokenizerString &src)
+{
+    checkScriptBuffer(src.length());
+    while ( !src.isEmpty() ) {
+        scriptCode[ scriptCodeSize++ ] = *src;
+        if (src->unicode() == '>' &&
+            scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
+            ++src;
+            server = false;
+            scriptCodeSize = 0;
+            return; // Finished parsing server include
+        }
+        ++src;
+    }
+}
+
+void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
+{
+    char oldchar = 0;
+    while ( !src.isEmpty() )
+    {
+        unsigned char chbegin = src->latin1();
+        if(chbegin == '\'') {
+            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
+        }
+        else if(chbegin == '\"') {
+            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
+        }
+        // Look for '?>'
+        // some crappy sites omit the "?" before it, so
+        // we look for an unquoted '>' instead. (IE compatible)
+        else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
+        {
+            // We got a '?>' sequence
+            processingInstruction = false;
+            ++src;
+            discard=LFDiscard;
+            return; // Finished parsing comment!
+        }
+        ++src;
+        oldchar = chbegin;
+    }
+}
+
+void HTMLTokenizer::parseText(TokenizerString &src)
+{
+    while ( !src.isEmpty() )
+    {
+        // do we need to enlarge the buffer?
+        checkBuffer();
+
+        // ascii is okay because we only do ascii comparisons
+        unsigned char chbegin = src->latin1();
+
+        if (skipLF && ( chbegin != '\n' ))
+        {
+            skipLF = false;
+        }
+
+        if (skipLF)
+        {
+            skipLF = false;
+            ++src;
+        }
+        else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
+        {
+            if (chbegin == '\r')
+                skipLF = true;
+
+            *dest++ = '\n';
+            ++src;
+        }
+        else {
+            *dest++ = *src;
+            ++src;
+        }
+    }
+}
+
+
+void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
+{
+    if( start )
+    {
+        cBufferPos = 0;
+        entityLen = 0;
+        Entity = SearchEntity;
+    }
+
+    while( !src.isEmpty() )
+    {
+        ushort cc = src->unicode();
+        switch(Entity) {
+        case NoEntity:
+            return;
+
+            break;
+        case SearchEntity:
+            if(cc == '#') {
+                cBuffer[cBufferPos++] = cc;
+                ++src;
+                Entity = NumericSearch;
+            }
+            else
+                Entity = EntityName;
+
+            break;
+
+        case NumericSearch:
+            if(cc == 'x' || cc == 'X') {
+                cBuffer[cBufferPos++] = cc;
+                ++src;
+                Entity = Hexadecimal;
+            }
+            else if(cc >= '0' && cc <= '9')
+                Entity = Decimal;
+            else
+                Entity = SearchSemicolon;
+
+            break;
+
+        case Hexadecimal:
+        {
+            int uc = EntityChar.unicode();
+            int ll = kMin<uint>(src.length(), 8);
+            while(ll--) {
+                QChar csrc(src->lower());
+                cc = csrc.cell();
+
+                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
+                    break;
+                }
+                uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
+                cBuffer[cBufferPos++] = cc;
+                ++src;
+            }
+            EntityChar = QChar(uc);
+            Entity = SearchSemicolon;
+            break;
+        }
+        case Decimal:
+        {
+            int uc = EntityChar.unicode();
+            int ll = kMin(src.length(), 9-cBufferPos);
+            while(ll--) {
+                cc = src->cell();
+
+                if(src->row() || !(cc >= '0' && cc <= '9')) {
+                    Entity = SearchSemicolon;
+                    break;
+                }
+
+                uc = uc * 10 + (cc - '0');
+                cBuffer[cBufferPos++] = cc;
+                ++src;
+            }
+            EntityChar = QChar(uc);
+            if(cBufferPos == 9)  Entity = SearchSemicolon;
+            break;
+        }
+        case EntityName:
+        {
+            int ll = kMin(src.length(), 9-cBufferPos);
+            while(ll--) {
+                QChar csrc = *src;
+                cc = csrc.cell();
+
+                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
+                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
+                    Entity = SearchSemicolon;
+                    break;
+                }
+
+                cBuffer[cBufferPos++] = cc;
+                ++src;
+
+                // be IE compatible and interpret even unterminated entities
+                // outside tags. like "foo &nbspstuff bla".
+                if ( tag == NoTag ) {
+                    const entity* e = kde_findEntity(cBuffer, cBufferPos);
+                    if ( e && e->code < 256 ) {
+                        EntityChar = e->code;
+                        entityLen = cBufferPos;
+                    }
+                }
+            }
+            if(cBufferPos == 9) Entity = SearchSemicolon;
+            if(Entity == SearchSemicolon) {
+                if(cBufferPos > 1) {
+                    const entity *e = kde_findEntity(cBuffer, cBufferPos);
+                    // IE only accepts unterminated entities < 256,
+                    // Gecko accepts them all, but only outside tags
+                    if(e && ( tag == NoTag || e->code < 256 || *src == ';' )) {
+                        EntityChar = e->code;
+                        entityLen = cBufferPos;
+                    }
+                }
+            }
+            break;
+        }
+        case SearchSemicolon:
+#ifdef TOKEN_DEBUG
+            kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << endl;
+#endif
+            fixUpChar(EntityChar);
+
+            if (*src == ';')
+                    ++src;
+
+            if ( !EntityChar.isNull() ) {
+                checkBuffer();
+                if (entityLen > 0 && entityLen < cBufferPos) {
+                    int rem = cBufferPos - entityLen;
+                    src.prepend( TokenizerString(QString::fromAscii(cBuffer+entityLen, rem)) );
+                }
+                src.push( EntityChar );
+            } else {
+#ifdef TOKEN_DEBUG
+                kdDebug( 6036 ) << "unknown entity!" << endl;
+#endif
+                checkBuffer(11);
+                // ignore the sequence, add it to the buffer as plaintext
+                *dest++ = '&';
+                for(unsigned int i = 0; i < cBufferPos; i++)
+                    dest[i] = cBuffer[i];
+                dest += cBufferPos;
+                if (pre)
+                    prePos += cBufferPos+1;
+            }
+
+            Entity = NoEntity;
+            EntityChar = QChar::null;
+            return;
+        };
+    }
+}
+
+void HTMLTokenizer::parseTag(TokenizerString &src)
+{
+    assert(!Entity );
+    checkScriptBuffer( src.length() );
+
+    while ( !src.isEmpty() )
+    {
+        checkBuffer();
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+        uint l = 0;
+        while(l < src.length() && (src.toString()[l]).latin1() != '>')
+            l++;
+        qDebug("src is now: *%s*, tquote: %d",
+               src.toString().left(l).latin1(), tquote);
+#endif
+        switch(tag) {
+        case NoTag:
+            return;
+        case TagName:
+        {
+#if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
+            qDebug("TagName");
+#endif
+            if (searchCount > 0)
+            {
+                if (*src == commentStart[searchCount])
+                {
+                    searchCount++;
+                    if (searchCount == 4)
+                    {
+#ifdef TOKEN_DEBUG
+                        kdDebug( 6036 ) << "Found comment" << endl;
+#endif
+                        // Found '<!--' sequence
+                        ++src;
+                        dest = buffer; // ignore the previous part of this tag
+                        tag = NoTag;
+
+                        comment = true;
+                        parseComment(src);
+                        return; // Finished parsing tag!
+                    }
+                    // cuts of high part, is okay
+                    cBuffer[cBufferPos++] = src->cell();
+                    ++src;
+                    break;
+                }
+                else
+                    searchCount = 0; // Stop looking for '<!--' sequence
+            }
+
+            bool finish = false;
+            unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
+            while(ll--) {
+                ushort curchar = *src;
+                if(curchar <= ' ' || curchar == '>' ) {
+                    finish = true;
+                    break;
+                }
+                // this is a nasty performance trick. will work for the A-Z
+                // characters, but not for others. if it contains one,
+                // we fail anyway
+                char cc = curchar;
+                cBuffer[cBufferPos++] = cc | 0x20;
+                ++src;
+            }
+
+            // Disadvantage: we add the possible rest of the tag
+            // as attribute names. ### judge if this causes problems
+            if(finish || CBUFLEN == cBufferPos) {
+                bool beginTag;
+                char* ptr = cBuffer;
+                unsigned int len = cBufferPos;
+                cBuffer[cBufferPos] = '\0';
+                if ((cBufferPos > 0) && (*ptr == '/'))
+                {
+                    // End Tag
+                    beginTag = false;
+                    ptr++;
+                    len--;
+                }
+                else
+                    // Start Tag
+                    beginTag = true;
+                // Accept empty xml tags like <br/>
+                if(len > 1 && ptr[len-1] == '/' ) {
+                    ptr[--len] = '\0';
+                    // if its like <br/> and not like <input/ value=foo>, take it as flat
+                    if (*src == '>')
+                        currToken.flat = true;
+                }
+
+                uint tagID = khtml::getTagID(ptr, len);
+                if (!tagID) {
+#ifdef TOKEN_DEBUG
+                    QCString tmp(ptr, len+1);
+                    kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
+#endif
+                    dest = buffer;
+                }
+                else
+                {
+#ifdef TOKEN_DEBUG
+                    QCString tmp(ptr, len+1);
+                    kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
+#endif
+                    currToken.tid = beginTag ? tagID : tagID + ID_CLOSE_TAG;
+                    dest = buffer;
+                }
+                tag = SearchAttribute;
+                cBufferPos = 0;
+            }
+            break;
+        }
+        case SearchAttribute:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+                qDebug("SearchAttribute");
+#endif
+            bool atespace = false;
+            ushort curchar;
+            while(!src.isEmpty()) {
+                curchar = *src;
+                if(curchar > ' ') {
+                    if(curchar == '<' || curchar == '>')
+                        tag = SearchEnd;
+                    else if(atespace && (curchar == '\'' || curchar == '"'))
+                    {
+                        tag = SearchValue;
+                        *dest++ = 0;
+                        attrName = QString::null;
+                    }
+                    else
+                        tag = AttributeName;
+
+                    cBufferPos = 0;
+                    break;
+                }
+                atespace = true;
+                ++src;
+            }
+            break;
+        }
+        case AttributeName:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+                qDebug("AttributeName");
+#endif
+            ushort curchar;
+            int ll = kMin(src.length(), CBUFLEN-cBufferPos);
+
+            while(ll--) {
+                curchar = *src;
+                if(curchar <= '>') {
+                    if(curchar <= ' ' || curchar == '=' || curchar == '>') {
+                        unsigned int a;
+                        cBuffer[cBufferPos] = '\0';
+                        a = khtml::getAttrID(cBuffer, cBufferPos);
+                        if ( !a ) {
+                            // did we just get /> or e.g checked/>
+                            if (curchar == '>' && cBufferPos >=1 && cBuffer[cBufferPos-1] == '/') {
+                                currToken.flat = true;
+                                if (cBufferPos>1)
+                                    a = khtml::getAttrID(cBuffer, cBufferPos-1);
+                            }
+                            if (!a)
+                                attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
+                        }
+
+                        dest = buffer;
+                        *dest++ = a;
+#ifdef TOKEN_DEBUG
+                        if (!a || (cBufferPos && *cBuffer == '!'))
+                            kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
+                        else
+                            kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
+#endif
+
+                        tag = SearchEqual;
+                        break;
+                    }
+                }
+                cBuffer[cBufferPos++] =
+                     (  curchar >= 'A' && curchar <= 'Z' ) ? curchar | 0x20 : curchar;
+                ++src;
+            }
+            if ( cBufferPos == CBUFLEN ) {
+                cBuffer[cBufferPos] = '\0';
+                attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
+                dest = buffer;
+                *dest++ = 0;
+                tag = SearchEqual;
+            }
+            break;
+        }
+        case SearchEqual:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+                qDebug("SearchEqual");
+#endif
+            ushort curchar;
+            bool atespace = false;
+            while(!src.isEmpty()) {
+                curchar = src->unicode();
+                if(curchar > ' ') {
+                    if(curchar == '=') {
+#ifdef TOKEN_DEBUG
+                        kdDebug(6036) << "found equal" << endl;
+#endif
+                        tag = SearchValue;
+                        ++src;
+                    }
+                    else if(atespace && (curchar == '\'' || curchar == '"'))
+                    {
+                        tag = SearchValue;
+                        *dest++ = 0;
+                        attrName = QString::null;
+                    }
+                    else {
+                        DOMString v("");
+                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
+                        dest = buffer;
+                        tag = SearchAttribute;
+                    }
+                    break;
+                }
+                atespace = true;
+                ++src;
+            }
+            break;
+        }
+        case SearchValue:
+        {
+            ushort curchar;
+            while(!src.isEmpty()) {
+                curchar = src->unicode();
+                if(curchar > ' ') {
+                    if(( curchar == '\'' || curchar == '\"' )) {
+                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
+                        tag = QuotedValue;
+                        ++src;
+                    } else
+                        tag = Value;
+
+                    break;
+                }
+                ++src;
+            }
+            break;
+        }
+        case QuotedValue:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+                qDebug("QuotedValue");
+#endif
+            ushort curchar;
+            while(!src.isEmpty()) {
+                checkBuffer();
+
+                curchar = src->unicode();
+                if(curchar <= '\'' && !src.escaped()) {
+                    // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
+                    if ( curchar == '&' )
+                    {
+                        ++src;
+                        parseEntity(src, dest, true);
+                        break;
+                    }
+                    else if ( (tquote == SingleQuote && curchar == '\'') ||
+                              (tquote == DoubleQuote && curchar == '\"') )
+                    {
+                        // some <input type=hidden> rely on trailing spaces. argh
+                        while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
+                            dest--; // remove trailing newlines
+                        DOMString v(buffer+1, dest-buffer-1);
+                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
+
+                        dest = buffer;
+                        tag = SearchAttribute;
+                        tquote = NoQuote;
+                        ++src;
+                        break;
+                    }
+                }
+                *dest++ = *src;
+                ++src;
+            }
+            break;
+        }
+        case Value:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+            qDebug("Value");
+#endif
+            ushort curchar;
+            while(!src.isEmpty()) {
+                checkBuffer();
+                curchar = src->unicode();
+                if(curchar <= '>' && !src.escaped()) {
+                    // parse Entities
+                    if ( curchar == '&' )
+                    {
+                        ++src;
+                        parseEntity(src, dest, true);
+                        break;
+                    }
+                    // no quotes. Every space means end of value
+                    // '/' does not delimit in IE!
+                    if ( curchar <= ' ' || curchar == '>' )
+                    {
+                        DOMString v(buffer+1, dest-buffer-1);
+                        currToken.addAttribute(parser->docPtr(), buffer, attrName, v);
+                        dest = buffer;
+                        tag = SearchAttribute;
+                        break;
+                    }
+                }
+
+                *dest++ = *src;
+                ++src;
+            }
+            break;
+        }
+        case SearchEnd:
+        {
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
+                qDebug("SearchEnd");
+#endif
+            while(!src.isEmpty()) {
+                if(*src == '<' || *src == '>')
+                    break;
+
+                if (*src == '/')
+                    currToken.flat = true;
+
+                ++src;
+            }
+            if(src.isEmpty() && *src != '<' && *src != '>') break;
+
+            searchCount = 0; // Stop looking for '<!--' sequence
+            tag = NoTag;
+            tquote = NoQuote;
+            if ( *src == '>' )
+                ++src;
+
+            if ( !currToken.tid ) //stop if tag is unknown
+                return;
+
+            uint tagID = currToken.tid;
+#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
+            kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
+#endif
+            // If the tag requires an end tag it cannot be flat,
+            // unless we are using the HTML parser to parse XHTML
+            // The only exception is SCRIPT and priority 0 tokens.
+            if (tagID < ID_CLOSE_TAG && tagID != ID_SCRIPT &&
+                DOM::endTag[tagID] == DOM::REQUIRED &&
+                parser->doc()->htmlMode() != DocumentImpl::XHtml)
+                currToken.flat = false;
+
+            bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
+
+            if(tagID >= ID_CLOSE_TAG)
+                tagID -= ID_CLOSE_TAG;
+            else if ( !brokenScript && tagID == ID_SCRIPT ) {
+                DOMStringImpl* a = 0;
+                bool foundTypeAttribute = false;
+                scriptSrc = scriptSrcCharset = QString::null;
+                if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
+                     view &&  /* are we a regular tokenizer or just for innerHTML ? */
+                     parser->doc()->view()->part()->jScriptEnabled() /* jscript allowed at all? */
+                    ) {
+                    if ( ( a = currToken.attrs->getValue( ATTR_SRC ) ) )
+                        scriptSrc = parser->doc()->completeURL(khtml::parseURL( DOMString(a) ).string() );
+                    if ( ( a = currToken.attrs->getValue( ATTR_CHARSET ) ) )
+                        scriptSrcCharset = DOMString(a).string().stripWhiteSpace();
+                    if ( scriptSrcCharset.isEmpty() && view)
+                        scriptSrcCharset = parser->doc()->view()->part()->encoding();
+                    /* Check type before language, since language is deprecated */
+                    if ((a = currToken.attrs->getValue(ATTR_TYPE)) != 0 && !DOMString(a).string().isEmpty())
+                        foundTypeAttribute = true;
+                    else
+                        a = currToken.attrs->getValue(ATTR_LANGUAGE);
+                }
+                javascript = true;
+
+                if( foundTypeAttribute ) {
+                    /*
+                        Mozilla 1.5 doesn't accept the text/javascript1.x formats, but WinIE 6 does.
+                        Mozilla 1.5 doesn't accept text/jscript, text/ecmascript, and text/livescript, but WinIE 6 does.
+			Mozilla 1.5 accepts application/x-javascript, WinIE 6 doesn't.
+                        Mozilla 1.5 allows leading and trailing whitespace, but WinIE 6 doesn't.
+                        Mozilla 1.5 and WinIE 6 both accept the empty string, but neither accept a whitespace-only string.
+                        We want to accept all the values that either of these browsers accept, but not other values.
+                     */
+                    QString type = DOMString(a).string().stripWhiteSpace().lower();
+                    if( type.compare("text/javascript") != 0 &&
+                        type.compare("text/javascript1.0") != 0 &&
+                        type.compare("text/javascript1.1") != 0 &&
+                        type.compare("text/javascript1.2") != 0 &&
+                        type.compare("text/javascript1.3") != 0 &&
+                        type.compare("text/javascript1.4") != 0 &&
+                        type.compare("text/javascript1.5") != 0 &&
+                        type.compare("text/jscript") != 0 &&
+                        type.compare("text/ecmascript") != 0 &&
+                        type.compare("text/livescript") != 0 &&
+			type.compare("application/x-javascript") != 0 &&
+			type.compare("application/x-ecmascript") != 0 &&
+			type.compare("application/javascript") != 0 &&
+			type.compare("application/ecmascript") != 0 )
+                        javascript = false;
+                } else if( a ) {
+                    /*
+                     Mozilla 1.5 doesn't accept jscript or ecmascript, but WinIE 6 does.
+                     Mozilla 1.5 accepts javascript1.0, javascript1.4, and javascript1.5, but WinIE 6 accepts only 1.1 - 1.3.
+                     Neither Mozilla 1.5 nor WinIE 6 accept leading or trailing whitespace.
+                     We want to accept all the values that either of these browsers accept, but not other values.
+                     */
+                    QString lang = DOMString(a).string();
+                    lang = lang.lower();
+                    if( lang.compare("") != 0 &&
+                        lang.compare("javascript") != 0 &&
+                        lang.compare("javascript1.0") != 0 &&
+                        lang.compare("javascript1.1") != 0 &&
+                        lang.compare("javascript1.2") != 0 &&
+                        lang.compare("javascript1.3") != 0 &&
+                        lang.compare("javascript1.4") != 0 &&
+                        lang.compare("javascript1.5") != 0 &&
+                        lang.compare("ecmascript") != 0 &&
+                        lang.compare("livescript") != 0 &&
+                        lang.compare("jscript") )
+                        javascript = false;
+                }
+            }
+
+            processToken();
+
+            if ( parser->selectMode() && beginTag)
+                discard = AllDiscard;
+
+            switch( tagID ) {
+            case ID_PRE:
+                pre = beginTag;
+                if (beginTag)
+                    discard = LFDiscard;
+                prePos = 0;
+                break;
+            case ID_BR:
+                prePos = 0;
+                break;
+            case ID_SCRIPT:
+                if (beginTag) {
+                    searchStopper = scriptEnd;
+                    searchStopperLen = 8;
+                    script = true;
+                    parseSpecial(src);
+                }
+                else if (tagID < ID_CLOSE_TAG) // Handle <script src="foo"/>
+                    scriptHandler();
+                break;
+            case ID_STYLE:
+                if (beginTag) {
+                    searchStopper = styleEnd;
+                    searchStopperLen = 7;
+                    style = true;
+                    parseSpecial(src);
+                }
+                break;
+            case ID_TEXTAREA:
+                if(beginTag) {
+                    searchStopper = textareaEnd;
+                    searchStopperLen = 10;
+                    textarea = true;
+                    discard = NoneDiscard;
+                    parseSpecial(src);
+                }
+                break;
+            case ID_TITLE:
+                if (beginTag) {
+                    searchStopper = titleEnd;
+                    searchStopperLen = 7;
+                    title = true;
+                    parseSpecial(src);
+                }
+                break;
+            case ID_XMP:
+                if (beginTag) {
+                    searchStopper = xmpEnd;
+                    searchStopperLen = 5;
+                    xmp = true;
+                    parseSpecial(src);
+                }
+                break;
+            case ID_SELECT:
+                select = beginTag;
+                break;
+            case ID_PLAINTEXT:
+                plaintext = beginTag;
+                break;
+            }
+            return; // Finished parsing tag!
+        }
+        } // end switch
+    }
+    return;
+}
+
+void HTMLTokenizer::addPending()
+{
+    if ( select && !(comment || script))
+    {
+        *dest++ = ' ';
+    }
+    else if ( textarea )
+    {
+        switch(pending) {
+        case LFPending:  *dest++ = '\n'; prePos = 0; break;
+        case SpacePending: *dest++ = ' '; ++prePos; break;
+        case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
+        case NonePending:
+            assert(0);
+        }
+    }
+    else
+    {
+        int p;
+
+        switch (pending)
+        {
+        case SpacePending:
+            // Insert a breaking space
+            *dest++ = QChar(' ');
+            prePos++;
+            break;
+
+        case LFPending:
+            *dest = '\n';
+            dest++;
+            prePos = 0;
+            break;
+
+        case TabPending:
+            p = TAB_SIZE - ( prePos % TAB_SIZE );
+            for ( int x = 0; x < p; x++ )
+                *dest++ = QChar(' ');
+            prePos += p;
+            break;
+
+        case NonePending:
+            assert(0);
+            break;
+        }
+    }
+
+    pending = NonePending;
+}
+
+void HTMLTokenizer::write( const TokenizerString &str, bool appendData )
+{
+#ifdef TOKEN_DEBUG
+    kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str.toString() << "\"," << appendData << ")" << endl;
+#endif
+
+    if ( !buffer )
+        return;
+
+    if ( ( m_executingScript && appendData ) || cachedScript.count() ) {
+        // don't parse; we will do this later
+        if (pendingQueue.isEmpty())
+            pendingQueue.push(str);
+        else if (appendData)
+            pendingQueue.bottom().append(str);
+        else
+            pendingQueue.top().append(str);
+        return;
+    }
+
+    if ( onHold ) {
+        src.append(str);
+        return;
+    }
+
+    if (!src.isEmpty())
+        src.append(str);
+    else
+        setSrc(str);
+    m_abort = false;
+
+//     if (Entity)
+//         parseEntity(src, dest);
+
+    while ( !src.isEmpty() )
+    {
+        if ( m_abort )
+            return;
+        // do we need to enlarge the buffer?
+        checkBuffer();
+
+        ushort cc = src->unicode();
+
+        if (skipLF && (cc != '\n'))
+            skipLF = false;
+
+        if (skipLF) {
+            skipLF = false;
+            ++src;
+        }
+        else if ( Entity )
+            parseEntity( src, dest );
+        else if ( plaintext )
+            parseText( src );
+        else if (script)
+            parseSpecial(src);
+        else if (style)
+            parseSpecial(src);
+        else if (xmp)
+            parseSpecial(src);
+        else if (textarea)
+            parseSpecial(src);
+        else if (title)
+            parseSpecial(src);
+        else if (comment)
+            parseComment(src);
+        else if (server)
+            parseServer(src);
+        else if (processingInstruction)
+            parseProcessingInstruction(src);
+        else if (tag)
+            parseTag(src);
+        else if ( startTag )
+        {
+            startTag = false;
+            bool endTag = false;
+
+            switch(cc) {
+            case '/':
+                endTag = true;
+                break;
+            case '!':
+            {
+                // <!-- comment -->
+                searchCount = 1; // Look for '<!--' sequence to start comment
+
+                break;
+            }
+            case '?':
+            {
+                // xml processing instruction
+                processingInstruction = true;
+                tquote = NoQuote;
+                parseProcessingInstruction(src);
+                continue;
+
+                break;
+            }
+            case '%':
+                if (!brokenServer) {
+                    // <% server stuff, handle as comment %>
+                    server = true;
+                    tquote = NoQuote;
+                    parseServer(src);
+                    continue;
+                }
+                // else fall through
+            default:
+            {
+                if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
+                {
+                    // Start of a Start-Tag
+                }
+                else
+                {
+                    // Invalid tag
+                    // Add as is
+                    if (pending)
+                        addPending();
+                    *dest = '<';
+                    dest++;
+                    continue;
+                }
+            }
+            }; // end case
+
+            // According to SGML any LF immediately after a starttag, or
+            // immediately before an endtag should be ignored.
+            // ### Gecko and MSIE though only ignores LF immediately after
+            // starttags and only for PRE elements -- asj (28/06-2005)
+            if ( pending )
+                if (!select)
+                    addPending();
+                else
+                    pending = NonePending;
+
+            // Cancel unused discards
+            discard = NoneDiscard;
+            // if (!endTag) discard = LFDiscard;
+
+            processToken();
+
+            cBufferPos = 0;
+            tag = TagName;
+            parseTag(src);
+        }
+        else if ( cc == '&' && !src.escaped())
+        {
+            ++src;
+            if ( pending )
+                addPending();
+            discard = NoneDiscard;
+            parseEntity(src, dest, true);
+        }
+        else if ( cc == '<' && !src.escaped())
+        {
+            tagStartLineno = lineno+src.lineCount();
+            ++src;
+            discard = NoneDiscard;
+            startTag = true;
+        }
+        else if (( cc == '\n' ) || ( cc == '\r' ))
+        {
+            if (discard == SpaceDiscard)
+                discard = NoneDiscard;
+
+            if (discard == LFDiscard) {
+                // Ignore one LF
+                discard = NoneDiscard;
+            }
+            else if (discard == AllDiscard)
+            {
+                // Ignore
+            }
+            else
+            {
+                if (select && !script) {
+                    pending = LFPending;
+                } else {
+                    if (pending)
+                        addPending();
+                    pending = LFPending;
+                }
+            }
+
+            /* Check for MS-DOS CRLF sequence */
+            if (cc == '\r')
+            {
+                skipLF = true;
+            }
+            ++src;
+        }
+        else if (( cc == ' ' ) || ( cc == '\t' ))
+        {
+            if(discard == LFDiscard)
+                discard = NoneDiscard;
+
+            if(discard == SpaceDiscard) {
+                // Ignore one space
+                discard = NoneDiscard;
+            }
+            else if(discard == AllDiscard)
+            {
+                // Ignore
+            }
+            else {
+                if (select && !script) {
+                    if (!pending)
+                        pending = SpacePending;
+                } else {
+                    if (pending)
+                        addPending();
+                    if (cc == ' ')
+                        pending = SpacePending;
+                    else
+                        pending = TabPending;
+                }
+            }
+
+            ++src;
+        }
+        else
+        {
+            if (pending)
+                addPending();
+
+            discard = NoneDiscard;
+            if ( pre )
+            {
+                prePos++;
+            }
+            *dest = *src;
+            fixUpChar( *dest );
+            ++dest;
+            ++src;
+        }
+    }
+
+    if (noMoreData && cachedScript.isEmpty() && !m_executingScript)
+        end(); // this actually causes us to be deleted
+}
+
+void HTMLTokenizer::timerEvent( QTimerEvent *e )
+{
+    if ( e->timerId() == m_autoCloseTimer && cachedScript.isEmpty() ) {
+         finish();
+    }
+}
+
+void HTMLTokenizer::setAutoClose( bool b ) {
+    killTimer( m_autoCloseTimer );
+    m_autoCloseTimer = 0;
+    if ( b )
+        m_autoCloseTimer = startTimer(100);
+}
+
+void HTMLTokenizer::end()
+{
+    if ( buffer == 0 ) {
+        emit finishedParsing();
+        return;
+    }
+
+    // parseTag is using the buffer for different matters
+    if ( !tag )
+        processToken();
+
+    if(buffer)
+        KHTML_DELETE_QCHAR_VEC(buffer);
+
+    if(scriptCode)
+        KHTML_DELETE_QCHAR_VEC(scriptCode);
+
+    scriptCode = 0;
+    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+    buffer = 0;
+    emit finishedParsing();
+}
+
+void HTMLTokenizer::finish()
+{
+    if ( m_autoCloseTimer ) {
+        killTimer( m_autoCloseTimer );
+        m_autoCloseTimer = 0;
+    }
+    // do this as long as we don't find matching comment ends
+    while((title || script || comment || server) && scriptCode && scriptCodeSize)
+    {
+        // we've found an unmatched comment start
+        if (comment)
+            brokenComments = true;
+        else if (server)
+            brokenServer = true;
+        else if (script)
+            brokenScript = true;
+
+        checkScriptBuffer();
+        scriptCode[ scriptCodeSize ] = 0;
+        scriptCode[ scriptCodeSize + 1 ] = 0;
+        int pos;
+        QString food;
+        if (title || style || script)
+            food.setUnicode(scriptCode, scriptCodeSize);
+        else if (server) {
+            food = "<";
+            food += QString(scriptCode, scriptCodeSize);
+        }
+        else {
+            pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
+            food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
+        }
+        KHTML_DELETE_QCHAR_VEC(scriptCode);
+        scriptCode = 0;
+        scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
+        if (script)
+            scriptHandler();
+
+        comment = title = server = script = false;
+        if ( !food.isEmpty() )
+            write(food, true);
+    }
+    // this indicates we will not receive any more data... but if we are waiting on
+    // an external script to load, we can't finish parsing until that is done
+    noMoreData = true;
+    if (cachedScript.isEmpty() && !m_executingScript && !onHold)
+        end(); // this actually causes us to be deleted
+}
+
+void HTMLTokenizer::processToken()
+{
+    KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
+    if (jsProxy)
+        jsProxy->setEventHandlerLineno(tagStartLineno+1);
+    if ( dest > buffer )
+    {
+#if 0
+        if(currToken.tid) {
+            qDebug( "unexpected token id: %d, str: *%s*", currToken.tid,QConstString( buffer,dest-buffer ).string().latin1() );
+            assert(0);
+        }
+
+#endif
+        currToken.text = new DOMStringImpl( buffer, dest - buffer );
+        currToken.text->ref();
+        currToken.tid = ID_TEXT;
+    }
+    else if(!currToken.tid) {
+        currToken.reset();
+        if (jsProxy)
+            jsProxy->setEventHandlerLineno(lineno+src.lineCount()+1);
+        return;
+    }
+
+    dest = buffer;
+
+#ifdef TOKEN_DEBUG
+    QString name = QString( getTagName(currToken.tid) );
+    QString text;
+    if(currToken.text)
+        text = QConstString(currToken.text->s, currToken.text->l).string();
+
+    kdDebug( 6036 ) << "Token --> " << name << "   id = " << currToken.tid << endl;
+    if (currToken.flat)
+        kdDebug( 6036 ) << "Token is FLAT!" << endl;
+    if(!text.isNull())
+        kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
+    unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
+    if(l) {
+        kdDebug( 6036 ) << "Attributes: " << l << endl;
+        for (unsigned long i = 0; i < l; ++i) {
+            NodeImpl::Id tid = currToken.attrs->idAt(i);
+            DOMString value = currToken.attrs->valueAt(i);
+            kdDebug( 6036 ) << "    " << tid << " " << parser->doc()->getDocument()->getName(NodeImpl::AttributeId, tid).string()
+                            << "=\"" << value.string() << "\"" << endl;
+        }
+    }
+    kdDebug( 6036 ) << endl;
+#endif
+
+    // In some cases, parseToken() can cause javascript code to be executed
+    // (for example, when setting an attribute that causes an event handler
+    // to be created). So we need to protect against re-entrancy into the parser
+    m_executingScript++;
+
+    // pass the token over to the parser, the parser DOES NOT delete the token
+    parser->parseToken(&currToken);
+
+    m_executingScript--;
+
+    if ( currToken.flat && currToken.tid != ID_TEXT && !parser->noSpaces() )
+	discard = NoneDiscard;
+
+    currToken.reset();
+    if (jsProxy)
+        jsProxy->setEventHandlerLineno(1);
+}
+
+
+HTMLTokenizer::~HTMLTokenizer()
+{
+    reset();
+    delete parser;
+}
+
+
+void HTMLTokenizer::enlargeBuffer(int len)
+{
+    int newsize = kMax(size*2, size+len);
+    int oldoffs = (dest - buffer);
+
+    buffer = KHTML_REALLOC_QCHAR_VEC(buffer, newsize);
+    dest = buffer + oldoffs;
+    size = newsize;
+}
+
+void HTMLTokenizer::enlargeScriptBuffer(int len)
+{
+    int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
+    scriptCode = KHTML_REALLOC_QCHAR_VEC(scriptCode, newsize);
+    scriptCodeMaxSize = newsize;
+}
+
+void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
+{
+    assert(!cachedScript.isEmpty());
+    bool done = false;
+    while (!done && cachedScript.head()->isLoaded()) {
+
+        kdDebug( 6036 ) << "Finished loading an external script" << endl;
+
+        CachedScript* cs = cachedScript.dequeue();
+        DOMString scriptSource = cs->script();
+#ifdef TOKEN_DEBUG
+        kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
+#endif
+        setSrc(TokenizerString());
+
+        // make sure we forget about the script before we execute the new one
+        // infinite recursion might happen otherwise
+        QString cachedScriptUrl( cs->url().string() );
+        cs->deref(this);
+
+	scriptExecution( scriptSource.string(), cachedScriptUrl );
+
+        done = cachedScript.isEmpty();
+
+        // 'script' is true when we are called synchronously from
+        // scriptHandler(). In that case scriptHandler() will take care
+        // of 'scriptOutput'.
+        if ( !script ) {
+            while (pendingQueue.count() > 1) {
+               TokenizerString t = pendingQueue.pop();
+               pendingQueue.top().prepend( t );
+            }
+            if (done) {
+                write(pendingQueue.pop(), false);
+            }
+            // we might be deleted at this point, do not
+            // access any members.
+        }
+    }
+}
+
+bool HTMLTokenizer::isWaitingForScripts() const
+{
+    return cachedScript.count();
+}
+
+bool HTMLTokenizer::isExecutingScript() const
+{
+    return (m_executingScript > 0);
+}
+
+void HTMLTokenizer::setSrc(const TokenizerString& source)
+{
+    lineno += src.lineCount();
+    src = source;
+    src.resetLineCount();
+}
+
+void HTMLTokenizer::setOnHold(bool _onHold)
+{
+    if (onHold == _onHold) return;
+    onHold = _onHold;
+}
+
author	toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>	2009-11-25 17:56:58 +0000
committer	toma <toma@283d02a7-25f6-0310-bc7c-ecb5cbfe19da>	2009-11-25 17:56:58 +0000
commit	ce4a32fe52ef09d8f5ff1dd22c001110902b60a2 (patch)
tree	5ac38a06f3dde268dc7927dc155896926aaf7012 /khtml/html/htmltokenizer.cpp
download	tdelibs-ce4a32fe52ef09d8f5ff1dd22c001110902b60a2.tar.gz tdelibs-ce4a32fe52ef09d8f5ff1dd22c001110902b60a2.zip