/***************************************************************************
 *   Copyright (C) 2004 by Puto Moura                                      *
 *   mojo@localhost.localdomain                                            *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.             *
 ***************************************************************************/
#include "linkchecker.h"
#include "searchmanager.h"
#include "../utils/utils.h"
#include "../parser/htmlparser.h"

#include <tqstring.h>
#include <tqtimer.h>
#include <tqtextcodec.h> 
#include <tqcstring.h>

#include <tdeio/netaccess.h>
#include <tdeio/global.h>
#include <tdeio/job.h>
#include <tdeio/scheduler.h>
#include <tdeio/slave.h>
#include <kmimetype.h>
#include <kapplication.h>
#include <klocale.h>
#include <tdehtml_part.h>
#include <dom/html_misc.h>
#include <dom/dom_node.h>
#include <dom/dom_string.h>


int LinkChecker::count_ = 0;

LinkChecker::LinkChecker(LinkStatus* linkstatus, int time_out,
                         TQObject *parent, const char *name)
        : TQObject(parent, name), search_manager_(0), 
        linkstatus_(linkstatus), t_job_(0), time_out_(time_out), checker_(0), document_charset_(), 
        redirection_(false), header_checked_(false), finnished_(false), 
        parsing_(false), is_charset_checked_(false), has_defined_charset_(false)
{
    Q_ASSERT(linkstatus_);
    Q_ASSERT(!linkstatus_->checked());

    kdDebug(23100) << endl << ++count_ << ": " << "Checking " << linkstatus_->absoluteUrl().url() << endl;
}

LinkChecker::~LinkChecker()
{}

void LinkChecker::setSearchManager(SearchManager* search_manager)
{
    Q_ASSERT(search_manager);
    search_manager_ = search_manager;
}

void LinkChecker::check()
{
    Q_ASSERT(!finnished_);

    KURL url(linkStatus()->absoluteUrl());
    Q_ASSERT(url.isValid());

    if(url.hasRef()) {
        KMimeType::Ptr mimeType = KMimeType::findByURL(url);
        if(mimeType->is("text/html") || mimeType->is("application/xml")) {
            checkRef();
            return;
        }
    }
        
    t_job_ = TDEIO::get(url, false, false);

    t_job_->addMetaData("PropagateHttpHeader", "true"); // to have the http header

    if (linkstatus_->parent()) {
        t_job_->addMetaData("referrer", linkstatus_->parent()->absoluteUrl().prettyURL());
    }

    if(search_manager_->sendIdentification())
    {
        t_job_->addMetaData("SendUserAgent", "true");
        t_job_->addMetaData("UserAgent", search_manager_->userAgent());
    }
    else
        t_job_->addMetaData("SendUserAgent", "false");


    TQObject::connect(t_job_, TQT_SIGNAL(data(TDEIO::Job *, const TQByteArray &)),
                        this, TQT_SLOT(slotData(TDEIO::Job *, const TQByteArray &)));
    TQObject::connect(t_job_, TQT_SIGNAL(mimetype(TDEIO::Job *, const TQString &)),
                        this, TQT_SLOT(slotMimetype(TDEIO::Job *, const TQString &)));
    TQObject::connect(t_job_, TQT_SIGNAL(result(TDEIO::Job *)),
                        this, TQT_SLOT(slotResult(TDEIO::Job *)));
    TQObject::connect(t_job_, TQT_SIGNAL(redirection(TDEIO::Job *, const KURL &)),
                        this, TQT_SLOT(slotRedirection(TDEIO::Job *, const KURL &)));

    TQTimer::singleShot( time_out_ * 1000, this, TQT_SLOT(slotTimeOut()) );

    t_job_->setInteractive(false);    
}

void LinkChecker::slotTimeOut()
{
    if(!finnished_ && !parsing_)
    {
        kdDebug(23100) << "timeout: " << linkstatus_->absoluteUrl().url() << endl;
        if(t_job_ && t_job_->slave())
            kdDebug(23100) << " - " << t_job_->slave() << "/" <<  t_job_->slave()->slave_pid() << endl;
        else
            kdDebug(23100) << endl;

        
//         Q_ASSERT(t_job_); // can happen: e.g. bad result signal
        if(t_job_->error() != TDEIO::ERR_USER_CANCELED)
        {
            linkstatus_->setErrorOccurred(true);
            linkstatus_->setChecked(true);
            linkstatus_->setError(i18n("Timeout"));
            linkstatus_->setStatus(LinkStatus::TIMEOUT);

            killJob();
            finnish();
        }
    }
}

void LinkChecker::slotMimetype (TDEIO::Job* /*job*/, const TQString &type)
{
    if(finnished_)
        return;

//     kdDebug(23100) <<  "LinkChecker::slotMimetype:" << type << "-> " << linkstatus_->absoluteUrl().url() 
//             << " - " << t_job_->slave() << "/" <<  t_job_->slave()->slave_pid() << endl;

    Q_ASSERT(t_job_);

    LinkStatus* ls = 0;
/*    if(redirection_)
        ls = linkStatus()->redirection();
    else*/
        ls = linkstatus_;
    Q_ASSERT(ls);

    ls->setMimeType(type);
    KURL url = ls->absoluteUrl();

    // we doesn't do nothing if file is http or https because we need the header
    // which is only available in the data response
    if(!t_job_->error()) // if a error happened let result() handle that
    {
        if(ls->onlyCheckHeader())
        {
            //kdDebug(23100) <<  "only check header: " << ls->absoluteUrl().prettyURL() << endl;

            // file is OK (http can have an error page though job->error() is false)
            if(!url.protocol().startsWith("http"))
            {
                ls->setStatusText("OK");
                ls->setStatus(LinkStatus::SUCCESSFULL);
                
                killJob();                
                finnish();
            }
        }
        else // !ls->onlyCheckHeader()
        {
            //kdDebug(23100) <<  "NOT only check header: " << ls->absoluteUrl().prettyURL() << endl;

            // file is OK (http can have an error page though job->error() is false)
            if(!url.protocol().startsWith("http")) // if not, it have to go trough slotData to get the http header
            {
                // it's not an html page, so we don't want the file content
                if(type != "text/html"/* && type != "text/plain"*/)
                {
                    //kdDebug(23100) <<  "mimetype: " << type << endl;
                    ls->setStatusText("OK");
                    ls->setStatus(LinkStatus::SUCCESSFULL);
                    
                    killJob();                    
                    finnish();
                }
            }
        }
    }
}

void LinkChecker::slotData(TDEIO::Job* /*job*/, const TQByteArray& data)
{   
    if(finnished_)
        return;

    kdDebug(23100) <<  "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url() 
            << " - " << t_job_->slave() << "/" <<  t_job_->slave()->slave_pid()  << endl;
    
    Q_ASSERT(t_job_);
    
    LinkStatus* ls = 0;
/*    if(redirection_)
        ls = linkStatus()->redirection();
    else*/
        ls = linkstatus_;
    Q_ASSERT(ls);

    KURL url = ls->absoluteUrl();

    if(!t_job_->error())
    {
        if(ls->onlyCheckHeader())
        {
            Q_ASSERT(header_checked_ == false);
            // the job should have been killed in slotMimetype
            Q_ASSERT(url.protocol() == "http" || url.protocol() == "https");

            // get the header and quit
            if(url.protocol().startsWith("http"))
            {
                // get the header
                ls->setHttpHeader(getHttpHeader(t_job_));

                if(t_job_->isErrorPage())
                    ls->setIsErrorPage(true);

                if(header_checked_)
                {
                    killJob();
                    linkstatus_->setStatus(getHttpStatus());
                    linkstatus_->setChecked(true);
                    finnish();
                    return;
                }
            }
        }
        else
        {
            if(url.protocol().startsWith("http"))
            {
                if(!header_checked_)
                {
                    ls->setHttpHeader(getHttpHeader(t_job_));                    
                }
                if(ls->mimeType() != "text/html" && header_checked_)
                {
                    //kdDebug(23100) <<  "mimetype of " << ls->absoluteUrl().prettyURL() << ": " << ls->mimeType() << endl;
                    ls->setStatus(getHttpStatus());
                    killJob();
                    finnish(); // if finnish is called before kill what you get is a segfault, don't know why
                    return;
                }
                else if(t_job_->isErrorPage() && header_checked_)
                {
                    //kdDebug(23100) <<  "ERROR PAGE" << endl;
                    ls->setIsErrorPage(true);
                    ls->setStatus(getHttpStatus());
                    killJob();
                    finnish();
                    return;
                }
            }
            else
            {
                Q_ASSERT(ls->mimeType() == "text/html");
            }
            if(!is_charset_checked_) 
                findDocumentCharset(data);
            
            TQTextCodec* codec = 0;
            if(has_defined_charset_) 
                codec = TQTextCodec::codecForName(document_charset_);
            if(!codec)
                codec = TQTextCodec::codecForName("iso8859-1"); // default
            
            doc_html_ += codec->toUnicode(data);
        }
    }
}

void LinkChecker::findDocumentCharset(TQString const& doc)
{
    Q_ASSERT(!is_charset_checked_);
    
    is_charset_checked_ = true; // only check the first stream of data
                    
    if(header_checked_)
        document_charset_ = linkstatus_->httpHeader().charset();

    // try to look in the meta elements                    
    if(document_charset_.isNull() || document_charset_.isEmpty()) 
        document_charset_ = HtmlParser::findCharsetInMetaElement(doc);
    
    if(!document_charset_.isNull() && !document_charset_.isEmpty())
        has_defined_charset_ = true;
}

// only comes here if an error happened or in case of a clean html page
// if onlyCheckHeader is false
void LinkChecker::slotResult(TDEIO::Job* /*job*/)
{
    if(finnished_)
        return;

    kdDebug(23100) <<  "LinkChecker::slotResult -> " << linkstatus_->absoluteUrl().url()  << endl;

    Q_ASSERT(t_job_);
    if(!t_job_)
        return;
    
    if(redirection_) {
        if(!processRedirection(redirection_url_)) {
            t_job_ = 0;
            linkstatus_->setChecked(true);
            finnish();
            return;
        }
    }
            
    TDEIO::TransferJob* job = t_job_;
    t_job_ = 0;

    emit jobFinnished(this);

    if(job->error() == TDEIO::ERR_USER_CANCELED)
    {
        // FIXME This can happen! If the job is non interactive...
        kdWarning(23100) << endl << "Job killed quietly, yet signal result was emited..." << endl;
        kdDebug(23100) << linkstatus_->toString() << endl;
        finnish();
        return;
    }

    LinkStatus* ls = 0;
    if(redirection_)
        ls = linkStatus()->redirection();
    else
        ls = linkstatus_;
    Q_ASSERT(ls);

    if(!(!ls->onlyCheckHeader() ||
          job->error() ||
          !header_checked_))
        kdWarning(23100) << ls->toString() << endl;

    Q_ASSERT(!ls->onlyCheckHeader() || job->error() || !header_checked_);

    if(ls->isErrorPage())
        kdWarning(23100) << "\n\n" << ls->toString() << endl << endl;

    Q_ASSERT(!job->isErrorPage());

    if(job->error())
    {
        kdDebug(23100) << "Job error: " <<  job->errorString() << endl;
        kdDebug(23100) << "Job error code: " <<  job->error() << endl;

        if(job->error() == TDEIO::ERR_IS_DIRECTORY)
        {
            ls->setStatusText("OK");
            ls->setStatus(LinkStatus::SUCCESSFULL);
        }
        else
        {
            ls->setErrorOccurred(true);
            if(job->error() == TDEIO::ERR_SERVER_TIMEOUT)
                ls->setStatus(LinkStatus::TIMEOUT);
            else
                ls->setStatus(LinkStatus::BROKEN);

            if(job->errorString().isEmpty())
                kdWarning(23100) << "\n\nError string is empty, error = " << job->error() << "\n\n\n";
            if(job->error() != TDEIO::ERR_NO_CONTENT) 
                ls->setError(job->errorString());
            else
                ls->setError(i18n("No Content"));
        }
    }

    else
    {
        if(!ls->absoluteUrl().protocol().startsWith("http")) {
            ls->setStatusText("OK");
            ls->setStatus(LinkStatus::SUCCESSFULL);
           }
        else
        {
            if(!header_checked_)
            {
                kdDebug(23100) <<  "\n\nheader not received... checking again...\n\n\n";
                //check again
                check();
                return;
            }
            Q_ASSERT(header_checked_);
            
            ls->setStatus(getHttpStatus());
        }

        if(!doc_html_.isNull() && !doc_html_.isEmpty())
        {
            ls->setDocHtml(doc_html_);

            parsing_ = true;
            HtmlParser parser(doc_html_);

            if(parser.hasBaseUrl())
                ls->setBaseURI(KURL(parser.baseUrl().url()));
            if(parser.hasTitle())
                ls->setHtmlDocTitle(parser.title().attributeTITLE());
            
            ls->setChildrenNodes(parser.nodes());
            parsing_ = false;
        }
    }
    finnish();
}


void LinkChecker::slotRedirection (TDEIO::Job* /*job*/, const KURL &url)
{
    kdDebug(23100) <<  "LinkChecker::slotRedirection -> " << 
            linkstatus_->absoluteUrl().url()  << " -> " << url.url() << endl;
//             << " - " << t_job_->slave() << "/" <<  t_job_->slave()->slave_pid() << endl;
    
    redirection_ = true;
    redirection_url_ = url;
}

bool LinkChecker::processRedirection(KURL const& toUrl)
{
    if(finnished_)
        return true;

    kdDebug(23100) <<  "LinkChecker::processRedirection -> " << linkstatus_->absoluteUrl().url() << " -> " << toUrl.url() << endl;

    Q_ASSERT(t_job_);
    Q_ASSERT(linkstatus_->absoluteUrl().protocol().startsWith("http"));
    Q_ASSERT(redirection_);

    linkstatus_->setHttpHeader(getHttpHeader(t_job_, false));
    linkstatus_->setIsRedirection(true);
    linkstatus_->setStatusText("redirection");
    linkstatus_->setStatus(LinkStatus::HTTP_REDIRECTION);
    linkstatus_->setChecked(true);
    
    LinkStatus* ls_red = new LinkStatus(*linkstatus_);
    ls_red->setAbsoluteUrl(toUrl);
    ls_red->setRootUrl(linkstatus_->rootUrl());

    if(!linkstatus_->onlyCheckHeader())
        ls_red->setOnlyCheckHeader(false);

    linkstatus_->setRedirection(ls_red);
    ls_red->setParent(linkstatus_);
    ls_red->setOriginalUrl(toUrl.url());

    Q_ASSERT(search_manager_);

    if(search_manager_->localDomain(ls_red->absoluteUrl()))
        ls_red->setExternalDomainDepth(-1);
    else
    {
        if(search_manager_->localDomain(linkstatus_->absoluteUrl()))
            ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth() + 1);
        else
            ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth());
    }

    if(!toUrl.isValid() || search_manager_->existUrl(toUrl, linkstatus_->absoluteUrl()))
    {
        ls_red->setChecked(false);
        return false;
    }
    else
    {
        ls_red->setChecked(true);
        return true;
    }
}

void LinkChecker::finnish()
{
    Q_ASSERT(!t_job_);

    if(!finnished_)
    {
        kdDebug(23100) <<  "LinkChecker::finnish -> " << linkstatus_->absoluteUrl().url() << endl;

        finnished_ = true;

        if(redirection_)
            Q_ASSERT(linkstatus_->checked());
        else
            linkstatus_->setChecked(true);

        emit transactionFinished(linkstatus_, this);
    }
}

HttpResponseHeader LinkChecker::getHttpHeader(TDEIO::Job* /*job*/, bool remember_check)
{
    //kdDebug(23100) <<  "LinkChecker::getHttpHeader -> " << linkstatus_->absoluteUrl().url() << endl;
    
    Q_ASSERT(!finnished_);
    Q_ASSERT(t_job_);

    TQString header_string = t_job_->queryMetaData("HTTP-Headers");
    //    Q_ASSERT(!header_string.isNull() && !header_string.isEmpty());
//     kdDebug(23100) << "HTTP header: " << endl << header_string << endl;
//     kdDebug(23100) << "Keys: " << HttpResponseHeader(header_string).keys() << endl;
//     kdDebug(23100) << "Content-type: " << HttpResponseHeader(header_string).contentType() << endl;
//     kdDebug(23100) << "Content-type: " << HttpResponseHeader(header_string).value("content-type") << endl;

    if(header_string.isNull() || header_string.isEmpty())
    {
        header_checked_ = false;
        kdWarning(23100) <<  "header_string.isNull() || header_string.isEmpty(): "
        << linkstatus_->toString()  << endl;
    }
    else if(remember_check)
        header_checked_ = true;

    return HttpResponseHeader(header_string);
}

void LinkChecker::checkRef()
{
    KURL url(linkStatus()->absoluteUrl());
    Q_ASSERT(url.hasRef());
    
    TQString ref = url.ref();
    if(ref == "" || ref == "top") {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
        finnish();
        return;
    }

    TQString url_base;
    LinkStatus const* ls_parent = 0;
    int i_ref = -1;

    if(linkStatus()->originalUrl().startsWith("#"))
        ls_parent = linkStatus()->parent();

    else
    {
        i_ref = url.url().find("#");
        url_base = url.url().left(i_ref);
        //kdDebug(23100) << "url_base: " << url_base << endl;

        Q_ASSERT(search_manager_);

        ls_parent = search_manager_->linkStatus(url_base);
    }

    if(ls_parent)
        checkRef(ls_parent);
    else
    {
        url = KURL::fromPathOrURL(url.url().left(i_ref));
        checkRef(url);
    }
}

void LinkChecker::checkRef(KURL const& url)
{
    Q_ASSERT(search_manager_);
    
    TQString url_string = url.url();
    TDEHTMLPart* html_part = search_manager_->htmlPart(url_string);
    if(!html_part)
    {
        kdDebug() << "new TDEHTMLPart: " +  url_string << endl;

        html_part = new TDEHTMLPart();
        html_part->setOnlyLocalReferences(true);

        TQString tmpFile;
        if(TDEIO::NetAccess::download(url, tmpFile, 0))
        {
            TQString doc_html = FileManager::read(tmpFile);
            html_part->begin();
            html_part->write(doc_html);
            html_part->end();

            TDEIO::NetAccess::removeTempFile(tmpFile);
        } 
        else 
        {
            kdDebug(23100) <<  TDEIO::NetAccess::lastErrorString() << endl;
        }

        search_manager_->addHtmlPart(url_string, html_part);
    }

    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
    }
    else
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setError(i18n( "Link destination not found." ));
        linkstatus_->setStatus(LinkStatus::BROKEN);
    }

    finnish();
}

void LinkChecker::checkRef(LinkStatus const* linkstatus_parent)
{
    Q_ASSERT(search_manager_);

    TQString url_string = linkstatus_parent->absoluteUrl().url();
    TDEHTMLPart* html_part = search_manager_->htmlPart(url_string);
    if(!html_part)
    {
        kdDebug() << "new TDEHTMLPart: " +  url_string << endl;

        html_part = new TDEHTMLPart();
        html_part->setOnlyLocalReferences(true);

        html_part->begin();
        html_part->write(linkstatus_parent->docHtml());
        html_part->end();

        search_manager_->addHtmlPart(url_string, html_part);
    }

    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
    }
    else
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setError(i18n( "Link destination not found." ));
        linkstatus_->setStatus(LinkStatus::BROKEN);
    }

    finnish();
}

bool LinkChecker::hasAnchor(TDEHTMLPart* html_part, TQString const& anchor)
{
    DOM::HTMLDocument htmlDocument = html_part->htmlDocument();
    DOM::HTMLCollection anchors = htmlDocument.anchors();

    DOM::DOMString name_ref(anchor);
    Q_ASSERT(!name_ref.isNull());

    DOM::Node node = anchors.namedItem(name_ref);
    if(node.isNull())
    {
        node = htmlDocument.getElementById(name_ref);
    }

    if(!node.isNull())
        return true;
    else
        return false;
}

void LinkChecker::killJob()
{
    if(!t_job_)
        return;

    TDEIO::TransferJob* aux = t_job_;
    t_job_ = 0;
    aux->disconnect(this);
    aux->kill(true); // quietly   
}

LinkStatus::Status LinkChecker::getHttpStatus() const
{
    TQString status_code = TQString::number(linkstatus_->httpHeader().statusCode());
    
    if(status_code[0] == '2')
        return LinkStatus::SUCCESSFULL;
    else if(status_code[0] == '3')
        return LinkStatus::HTTP_REDIRECTION;
    else if(status_code[0] == '4')
        return LinkStatus::HTTP_CLIENT_ERROR;
    else if(status_code[0] == '5')
        return LinkStatus::HTTP_SERVER_ERROR;
    else
        return LinkStatus::UNDETERMINED;
}

#include "linkchecker.moc"