/***************************************************************************
 *   Copyright (C) 2004 by Puto Moura                                      *
 *   mojo@localhost.localdomain                                            *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the                         *
 *   Free Software Foundation, Inc.,                                       *
 *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.         *
 ***************************************************************************/

#include "linkchecker.h"
#include "searchmanager.h"
#include "../utils/utils.h"
#include "../parser/htmlparser.h"

#include <tqstring.h>
#include <tqtimer.h>
#include <tqtextcodec.h>
#include <tqcstring.h>

#include <tdeio/netaccess.h>
#include <tdeio/global.h>
#include <tdeio/job.h>
#include <tdeio/scheduler.h>
#include <tdeio/slave.h>
#include <kmimetype.h>
#include <kapplication.h>
#include <klocale.h>
#include <tdehtml_part.h>

#include <dom/html_misc.h>
#include <dom/dom_node.h>
#include <dom/dom_string.h>


// Running count of LinkChecker instances created so far; only used to
// number the "Checking ..." debug line emitted by the constructor.
int LinkChecker::count_ = 0;

// Creates a checker for a single link.
//
// linkstatus  link to be checked; must be non-null and not yet checked
// time_out    timeout in seconds (multiplied by 1000 when the timer is armed
//             in check())
// parent/name forwarded to TQObject
LinkChecker::LinkChecker(LinkStatus* linkstatus, int time_out,
                         TQObject *parent, const char *name)
        : TQObject(parent, name), search_manager_(0),
        linkstatus_(linkstatus), t_job_(0), time_out_(time_out), checker_(0), document_charset_(),
        redirection_(false), header_checked_(false), finnished_(false),
        parsing_(false), is_charset_checked_(false), has_defined_charset_(false)
{
    Q_ASSERT(linkstatus_);
    Q_ASSERT(!linkstatus_->checked());

    kdDebug(23100) << endl << ++count_ << ": " << "Checking " << linkstatus_->absoluteUrl().url() << endl;
}

LinkChecker::~LinkChecker()
{}

// Stores the search manager that check(), processRedirection() and the
// checkRef() overloads consult. Must be non-null.
void LinkChecker::setSearchManager(SearchManager* search_manager)
{
    Q_ASSERT(search_manager);
    search_manager_ = search_manager;
}

// Starts checking the link.
//
// URLs that carry a fragment and whose mime type (guessed from the URL) is
// HTML or XML are resolved through checkRef(). Everything else is fetched
// with a TDEIO transfer job whose signals are routed to the slot* methods,
// and a single-shot timer is armed to enforce the timeout.
void LinkChecker::check()
{
    Q_ASSERT(!finnished_);
    // Same precondition the other users of search_manager_ assert.
    Q_ASSERT(search_manager_);

    KURL url(linkStatus()->absoluteUrl());
    Q_ASSERT(url.isValid());

    if(url.hasRef())
    {
        KMimeType::Ptr mimeType = KMimeType::findByURL(url);
        if(mimeType->is("text/html") || mimeType->is("application/xml"))
        {
            checkRef();
            return;
        }
    }

    t_job_ = TDEIO::get(url, false, false);

    t_job_->addMetaData("PropagateHttpHeader", "true"); // to have the http header

    if(linkstatus_->parent())
    {
        t_job_->addMetaData("referrer", linkstatus_->parent()->absoluteUrl().prettyURL());
    }

    // Without a search manager there is no identification policy to consult,
    // so fall back to not sending the user agent instead of dereferencing null.
    if(search_manager_ && search_manager_->sendIdentification())
    {
        t_job_->addMetaData("SendUserAgent", "true");
        t_job_->addMetaData("UserAgent", search_manager_->userAgent());
    }
    else
    {
        t_job_->addMetaData("SendUserAgent", "false");
    }

    TQObject::connect(t_job_, TQT_SIGNAL(data(TDEIO::Job *, const TQByteArray &)),
                      this, TQT_SLOT(slotData(TDEIO::Job *, const TQByteArray &)));
    TQObject::connect(t_job_, TQT_SIGNAL(mimetype(TDEIO::Job *, const TQString &)),
                      this, TQT_SLOT(slotMimetype(TDEIO::Job *, const TQString &)));
    TQObject::connect(t_job_, TQT_SIGNAL(result(TDEIO::Job *)),
                      this, TQT_SLOT(slotResult(TDEIO::Job *)));
    TQObject::connect(t_job_, TQT_SIGNAL(redirection(TDEIO::Job *, const KURL &)),
                      this, TQT_SLOT(slotRedirection(TDEIO::Job *, const KURL &)));

    TQTimer::singleShot( time_out_ * 1000, this, TQT_SLOT(slotTimeOut()) );

    t_job_->setInteractive(false);
}

// Timer slot armed by check(). If the check has neither finished nor reached
// the parsing stage, the link is marked as timed out, the job (if any) is
// killed and the transaction is finished.
void LinkChecker::slotTimeOut()
{
    if(finnished_ || parsing_)
        return;

    kdDebug(23100) << "timeout: " << linkstatus_->absoluteUrl().url() << endl;
    if(t_job_ && t_job_->slave())
        kdDebug(23100) << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
    else
        kdDebug(23100) << endl;

    // t_job_ can be null here (e.g. after a bad result signal), so it must not
    // be dereferenced unconditionally. With no job left there is nothing that
    // could have been cancelled by the user, hence it is treated as a timeout.
    if(!t_job_ || t_job_->error() != TDEIO::ERR_USER_CANCELED)
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setChecked(true);
        linkstatus_->setError(i18n("Timeout"));
        linkstatus_->setStatus(LinkStatus::TIMEOUT);

        killJob(); // no-op when there is no job
        finnish();
    }
}

// Receives the mime type reported by the transfer job and stores it in the
// link status.
//
// For non-http(s) URLs the link is declared OK right away when either only
// the header has to be checked or the document is not HTML (its content is
// not needed). For http(s) nothing else is done here, because the HTTP header
// is only read in slotData().
void LinkChecker::slotMimetype (TDEIO::Job* /*job*/, const TQString &type)
{
    if(finnished_)
        return;

    Q_ASSERT(t_job_);
    if(!t_job_) // same guard as slotResult()
        return;

    // The redirection target is deliberately not used here:
    /*    if(redirection_)
            ls = linkStatus()->redirection();
        else*/
    LinkStatus* ls = linkstatus_;
    Q_ASSERT(ls);

    ls->setMimeType(type);
    KURL url = ls->absoluteUrl();

    // if an error happened let result() handle that
    if(t_job_->error())
        return;

    // http(s) has to go through slotData() to get the http header
    // (http can also deliver an error page even though job->error() is false)
    if(url.protocol().startsWith("http"))
        return;

    // Either only the header was requested, or it is not an html page and
    // therefore we don't want the file content: the file is OK.
    if(ls->onlyCheckHeader() || type != "text/html"/* && type != "text/plain"*/)
    {
        ls->setStatusText("OK");
        ls->setStatus(LinkStatus::SUCCESSFULL);

        killJob();
        finnish();
    }
}

// Receives a chunk of data from the transfer job.
//
// For http(s) URLs the HTTP header is read via getHttpHeader() and the check
// ends early when only the header is wanted, when the document is not HTML,
// or when the server delivered an error page. Otherwise the chunk is decoded
// and appended to doc_html_ so that slotResult() can parse the document.
void LinkChecker::slotData(TDEIO::Job* /*job*/, const TQByteArray& data)
{
    if(finnished_)
        return;

    Q_ASSERT(t_job_);
    if(!t_job_) // same guard as slotResult()
        return;

    // The slave may be null (slotTimeOut() checks for it as well), so only
    // print its details when it exists.
    if(t_job_->slave())
        kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url()
        << " - " << t_job_->slave() << "/" << t_job_->slave()->slave_pid() << endl;
    else
        kdDebug(23100) << "LinkChecker::slotData -> " << linkstatus_->absoluteUrl().url() << endl;

    // The redirection target is deliberately not used here:
    /*    if(redirection_)
            ls = linkStatus()->redirection();
        else*/
    LinkStatus* ls = linkstatus_;
    Q_ASSERT(ls);

    KURL url = ls->absoluteUrl();

    // if an error happened let result() handle that
    if(t_job_->error())
        return;

    if(ls->onlyCheckHeader())
    {
        Q_ASSERT(header_checked_ == false);
        // for any other protocol the job should have been killed in slotMimetype
        Q_ASSERT(url.protocol() == "http" || url.protocol() == "https");

        // get the header and quit
        if(url.protocol().startsWith("http"))
        {
            ls->setHttpHeader(getHttpHeader(t_job_));

            if(t_job_->isErrorPage())
                ls->setIsErrorPage(true);

            if(header_checked_)
            {
                killJob();
                linkstatus_->setStatus(getHttpStatus());
                linkstatus_->setChecked(true);
                finnish();
                return;
            }
        }
        return;
    }

    // !ls->onlyCheckHeader()
    if(url.protocol().startsWith("http"))
    {
        if(!header_checked_)
        {
            ls->setHttpHeader(getHttpHeader(t_job_));
        }

        if(ls->mimeType() != "text/html" && header_checked_)
        {
            ls->setStatus(getHttpStatus());
            killJob();
            finnish(); // if finnish is called before kill what you get is a segfault, don't know why
            return;
        }
        else if(t_job_->isErrorPage() && header_checked_)
        {
            ls->setIsErrorPage(true);
            ls->setStatus(getHttpStatus());
            killJob();
            finnish();
            return;
        }
    }
    else
    {
        Q_ASSERT(ls->mimeType() == "text/html");
    }

    if(!is_charset_checked_)
        findDocumentCharset(data);

    TQTextCodec* codec = 0;
    if(has_defined_charset_)
        codec = TQTextCodec::codecForName(document_charset_);
    if(!codec)
        codec = TQTextCodec::codecForName("iso8859-1"); // default

    // NOTE(review): every chunk is decoded on its own; a multi-byte character
    // split across two chunks would presumably be decoded wrongly - confirm.
    doc_html_ += codec->toUnicode(data);
}

// Determines the charset of the document. Runs only once, on the first chunk
// of data: the charset from the HTTP header is used when the header has been
// read, otherwise (or if it names none) the meta elements of `doc` are
// searched. has_defined_charset_ is set when a charset was found.
void LinkChecker::findDocumentCharset(TQString const& doc)
{
    Q_ASSERT(!is_charset_checked_);

    is_charset_checked_ = true; // only check the first stream of data

    if(header_checked_)
        document_charset_ = linkstatus_->httpHeader().charset();

    // try to look in the meta elements
    // (isEmpty() is also true for a null string)
    if(document_charset_.isEmpty())
        document_charset_ = HtmlParser::findCharsetInMetaElement(doc);

    if(!document_charset_.isEmpty())
        has_defined_charset_ = true;
}

// Result slot of the transfer job.
//
// only comes here if an error happened or in case of a clean html page
// if onlyCheckHeader is false
//
// Handles a pending redirection, translates job errors into a link status,
// and on success sets the status and parses the accumulated HTML to extract
// the base URL, the title and the child nodes.
void LinkChecker::slotResult(TDEIO::Job* /*job*/)
{
    if(finnished_)
        return;

    kdDebug(23100) << "LinkChecker::slotResult -> " << linkstatus_->absoluteUrl().url() << endl;

    Q_ASSERT(t_job_);
    if(!t_job_)
        return;

    if(redirection_)
    {
        if(!processRedirection(redirection_url_))
        {
            // NOTE(review): jobFinnished() is not emitted on this path,
            // unlike the paths below - confirm this is intended.
            t_job_ = 0;
            linkstatus_->setChecked(true);
            finnish();
            return;
        }
    }

    // The member is cleared before finnish() runs, which asserts !t_job_.
    TDEIO::TransferJob* job = t_job_;
    t_job_ = 0;

    emit jobFinnished(this);

    if(job->error() == TDEIO::ERR_USER_CANCELED)
    {
        // FIXME This can happen! If the job is non interactive...
        kdWarning(23100) << endl << "Job killed quietly, yet signal result was emited..." << endl;
        kdDebug(23100) << linkstatus_->toString() << endl;
        finnish();
        return;
    }

    LinkStatus* ls = 0;
    if(redirection_)
        ls = linkStatus()->redirection();
    else
        ls = linkstatus_;
    Q_ASSERT(ls);

    if(!(!ls->onlyCheckHeader() || job->error() || !header_checked_))
        kdWarning(23100) << ls->toString() << endl;
    Q_ASSERT(!ls->onlyCheckHeader() || job->error() || !header_checked_);

    if(ls->isErrorPage())
        kdWarning(23100) << "\n\n" << ls->toString() << endl << endl;

    Q_ASSERT(!job->isErrorPage());

    if(job->error())
    {
        kdDebug(23100) << "Job error: " << job->errorString() << endl;
        kdDebug(23100) << "Job error code: " << job->error() << endl;

        if(job->error() == TDEIO::ERR_IS_DIRECTORY)
        {
            // a directory is reported as an error by the job but the link is fine
            ls->setStatusText("OK");
            ls->setStatus(LinkStatus::SUCCESSFULL);
        }
        else
        {
            ls->setErrorOccurred(true);
            if(job->error() == TDEIO::ERR_SERVER_TIMEOUT)
                ls->setStatus(LinkStatus::TIMEOUT);
            else
                ls->setStatus(LinkStatus::BROKEN);

            if(job->errorString().isEmpty())
                kdWarning(23100) << "\n\nError string is empty, error = " << job->error() << "\n\n\n";

            if(job->error() != TDEIO::ERR_NO_CONTENT)
                ls->setError(job->errorString());
            else
                ls->setError(i18n("No Content"));
        }
    }
    else
    {
        if(!ls->absoluteUrl().protocol().startsWith("http"))
        {
            ls->setStatusText("OK");
            ls->setStatus(LinkStatus::SUCCESSFULL);
        }
        else
        {
            if(!header_checked_)
            {
                kdDebug(23100) << "\n\nheader not received... checking again...\n\n\n";
                // check again
                // NOTE(review): nothing visible here bounds the number of
                // re-checks - confirm this cannot loop.
                check();
                return;
            }
            Q_ASSERT(header_checked_);

            ls->setStatus(getHttpStatus());
        }

        if(!doc_html_.isEmpty()) // isEmpty() is also true for a null string
        {
            ls->setDocHtml(doc_html_);

            // parsing_ keeps slotTimeOut() from acting while the parser runs
            parsing_ = true;
            HtmlParser parser(doc_html_);

            if(parser.hasBaseUrl())
                ls->setBaseURI(KURL(parser.baseUrl().url()));
            if(parser.hasTitle())
                ls->setHtmlDocTitle(parser.title().attributeTITLE());

            ls->setChildrenNodes(parser.nodes());
            parsing_ = false;
        }
    }
    finnish();
}

// Records that the job was redirected to `url`; the redirection itself is
// processed later, in slotResult().
void LinkChecker::slotRedirection (TDEIO::Job* /*job*/, const KURL &url)
{
    kdDebug(23100) << "LinkChecker::slotRedirection -> "
    << linkstatus_->absoluteUrl().url() << " -> " << url.url() << endl;

    redirection_ = true;
    redirection_url_ = url;
}

// Marks the link as an HTTP redirection and attaches a new LinkStatus for the
// target `toUrl` to it.
//
// Returns false when the target URL is invalid or search_manager_->existUrl()
// reports it as already known (the target is then left unchecked); returns
// true otherwise, and also when the checker has already finished.
bool LinkChecker::processRedirection(KURL const& toUrl)
{
    if(finnished_)
        return true;

    kdDebug(23100) << "LinkChecker::processRedirection -> "
    << linkstatus_->absoluteUrl().url() << " -> " << toUrl.url() << endl;

    Q_ASSERT(t_job_);
    Q_ASSERT(linkstatus_->absoluteUrl().protocol().startsWith("http"));
    Q_ASSERT(redirection_);

    // remember_check == false: reading the header of the redirecting response
    // must not set header_checked_
    linkstatus_->setHttpHeader(getHttpHeader(t_job_, false));
    linkstatus_->setIsRedirection(true);
    linkstatus_->setStatusText("redirection");
    linkstatus_->setStatus(LinkStatus::HTTP_REDIRECTION);
    linkstatus_->setChecked(true);

    // NOTE(review): ownership of ls_red is presumably taken over by
    // linkstatus_ through setRedirection() - confirm.
    LinkStatus* ls_red = new LinkStatus(*linkstatus_);
    ls_red->setAbsoluteUrl(toUrl);
    ls_red->setRootUrl(linkstatus_->rootUrl());

    if(!linkstatus_->onlyCheckHeader())
        ls_red->setOnlyCheckHeader(false);

    linkstatus_->setRedirection(ls_red);
    ls_red->setParent(linkstatus_);
    ls_red->setOriginalUrl(toUrl.url());

    Q_ASSERT(search_manager_);

    if(search_manager_->localDomain(ls_red->absoluteUrl()))
    {
        ls_red->setExternalDomainDepth(-1);
    }
    else if(search_manager_->localDomain(linkstatus_->absoluteUrl()))
    {
        // leaving the local domain: one level deeper
        ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth() + 1);
    }
    else
    {
        ls_red->setExternalDomainDepth(linkstatus_->externalDomainDepth());
    }

    if(!toUrl.isValid() || search_manager_->existUrl(toUrl, linkstatus_->absoluteUrl()))
    {
        ls_red->setChecked(false);
        return false;
    }

    ls_red->setChecked(true);
    return true;
}

// Ends the transaction: marks the link as checked (a redirected link must
// already be checked) and emits transactionFinished() exactly once.
// The job must already be gone (killed or handed over) when this is called.
void LinkChecker::finnish()
{
    Q_ASSERT(!t_job_);

    if(finnished_)
        return;

    kdDebug(23100) << "LinkChecker::finnish -> " << linkstatus_->absoluteUrl().url() << endl;

    finnished_ = true;

    // Braces are required: depending on how Q_ASSERT expands, an unbraced
    // "if(...) Q_ASSERT(...); else ..." could pair the else with the wrong if.
    if(redirection_)
    {
        Q_ASSERT(linkstatus_->checked());
    }
    else
    {
        linkstatus_->setChecked(true);
    }

    emit transactionFinished(linkstatus_, this);
}

// Builds an HttpResponseHeader from the "HTTP-Headers" meta data of the
// current job.
//
// header_checked_ is reset to false when no header text is available, and set
// to true when one is available and remember_check is true.
HttpResponseHeader LinkChecker::getHttpHeader(TDEIO::Job* /*job*/, bool remember_check)
{
    Q_ASSERT(!finnished_);
    Q_ASSERT(t_job_);

    TQString header_string = t_job_->queryMetaData("HTTP-Headers");

    if(header_string.isEmpty()) // isEmpty() is also true for a null string
    {
        header_checked_ = false;
        kdWarning(23100) << "header_string.isNull() || header_string.isEmpty(): "
        << linkstatus_->toString() << endl;
    }
    else if(remember_check)
    {
        header_checked_ = true;
    }

    return HttpResponseHeader(header_string);
}

// Checks a link that points to an anchor inside a document.
//
// An empty or "top" fragment is always OK. Otherwise the anchor is looked up
// in the already known parent document when there is one, or in the document
// downloaded from the URL without its fragment.
void LinkChecker::checkRef()
{
    KURL url(linkStatus()->absoluteUrl());
    Q_ASSERT(url.hasRef());

    TQString ref = url.ref();
    if(ref == "" || ref == "top")
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
        finnish();
        return;
    }

    // Always strip the fragment up front, so that the download fallback below
    // never receives (and caches a document under) a URL that still carries
    // it, even when the link is a bare "#fragment" without a parent.
    TQString const full_url = url.url();
    int const i_ref = full_url.find("#");
    TQString const url_base = full_url.left(i_ref);

    LinkStatus const* ls_parent = 0;
    if(linkStatus()->originalUrl().startsWith("#"))
    {
        // reference into the document that contains the link
        ls_parent = linkStatus()->parent();
    }
    else
    {
        Q_ASSERT(search_manager_);
        ls_parent = search_manager_->linkStatus(url_base);
    }

    if(ls_parent)
    {
        checkRef(ls_parent);
    }
    else
    {
        url = KURL::fromPathOrURL(url_base);
        checkRef(url);
    }
}

// Checks the anchor of the link against the document at `url`. The document
// is downloaded and loaded into a TDEHTMLPart unless the search manager
// already holds a part for that URL; the part is then registered with the
// search manager.
void LinkChecker::checkRef(KURL const& url)
{
    Q_ASSERT(search_manager_);

    TQString url_string = url.url();
    TDEHTMLPart* html_part = search_manager_->htmlPart(url_string);
    if(!html_part)
    {
        kdDebug(23100) << "new TDEHTMLPart: " << url_string << endl;

        html_part = new TDEHTMLPart();
        html_part->setOnlyLocalReferences(true);

        TQString tmpFile;
        if(TDEIO::NetAccess::download(url, tmpFile, 0))
        {
            TQString doc_html = FileManager::read(tmpFile);
            html_part->begin();
            html_part->write(doc_html);
            html_part->end();

            TDEIO::NetAccess::removeTempFile(tmpFile);
        }
        else
        {
            // NOTE(review): the part without content is still registered
            // below, so a failed download is not retried through this path.
            kdDebug(23100) << TDEIO::NetAccess::lastErrorString() << endl;
        }

        search_manager_->addHtmlPart(url_string, html_part);
    }

    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
    }
    else
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setError(i18n( "Link destination not found." ));
        linkstatus_->setStatus(LinkStatus::BROKEN);
    }

    finnish();
}

// Checks the anchor of the link against the HTML already stored in
// `linkstatus_parent`. The document is loaded into a TDEHTMLPart unless the
// search manager already holds a part for the parent's URL.
void LinkChecker::checkRef(LinkStatus const* linkstatus_parent)
{
    Q_ASSERT(search_manager_);

    TQString url_string = linkstatus_parent->absoluteUrl().url();
    TDEHTMLPart* html_part = search_manager_->htmlPart(url_string);
    if(!html_part)
    {
        kdDebug(23100) << "new TDEHTMLPart: " << url_string << endl;

        html_part = new TDEHTMLPart();
        html_part->setOnlyLocalReferences(true);

        html_part->begin();
        html_part->write(linkstatus_parent->docHtml());
        html_part->end();

        search_manager_->addHtmlPart(url_string, html_part);
    }

    if(hasAnchor(html_part, linkStatus()->absoluteUrl().ref()))
    {
        linkstatus_->setStatusText("OK");
        linkstatus_->setStatus(LinkStatus::SUCCESSFULL);
    }
    else
    {
        linkstatus_->setErrorOccurred(true);
        linkstatus_->setError(i18n( "Link destination not found." ));
        linkstatus_->setStatus(LinkStatus::BROKEN);
    }

    finnish();
}

// Returns true when the document loaded in `html_part` contains `anchor`,
// either as a named item of the document's anchors collection or as the id
// of an element.
bool LinkChecker::hasAnchor(TDEHTMLPart* html_part, TQString const& anchor)
{
    DOM::HTMLDocument htmlDocument = html_part->htmlDocument();
    DOM::HTMLCollection anchors = htmlDocument.anchors();

    DOM::DOMString name_ref(anchor);
    Q_ASSERT(!name_ref.isNull());

    DOM::Node node = anchors.namedItem(name_ref);
    if(node.isNull())
    {
        // not a named anchor: fall back to an element id
        node = htmlDocument.getElementById(name_ref);
    }

    return !node.isNull();
}

// Kills the current transfer job, if any. The member is cleared and the job's
// signals are disconnected from this object before the kill, so none of the
// slots is invoked for this job afterwards.
void LinkChecker::killJob()
{
    if(!t_job_)
        return;

    TDEIO::TransferJob* aux = t_job_;
    t_job_ = 0;
    aux->disconnect(this);
    aux->kill(true); // quietly
}

// Maps the HTTP status code stored in the link's header to a
// LinkStatus::Status, based on the first digit of the code (2xx, 3xx, 4xx,
// 5xx); anything else is UNDETERMINED.
LinkStatus::Status LinkChecker::getHttpStatus() const
{
    TQString status_code = TQString::number(linkstatus_->httpHeader().statusCode());

    if(status_code[0] == '2')
        return LinkStatus::SUCCESSFULL;
    else if(status_code[0] == '3')
        return LinkStatus::HTTP_REDIRECTION;
    else if(status_code[0] == '4')
        return LinkStatus::HTTP_CLIENT_ERROR;
    else if(status_code[0] == '5')
        return LinkStatus::HTTP_SERVER_ERROR;
    else
        return LinkStatus::UNDETERMINED;
}

#include "linkchecker.moc"