/*************************************************************************** * Copyright (C) 2008 by Jacob Kanev , * * Thomas Fischer * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include #include "webqueryciteseerx.h" using BibTeX::Value; using BibTeX::Entry; using BibTeX::EntryField; namespace KBibTeX { //_______________________________________________________________________________________________________________ // Construct widget WebQueryCiteSeerXWidget::WebQueryCiteSeerXWidget( QWidget *parent, const char *name ) : WebQueryWidget( parent, name ) { init(); Settings *settings = Settings::self(); QString value = settings->getWebQueryDefault( "CiteSeerX" ); value = value == QString::null ? "" : value; lineEditQuery->setText( value ); slotTextChanged( value, true ); } //_______________________________________________________________________________________________________________ // Construct WebQueryCiteSeerX::WebQueryCiteSeerX( QWidget* parent ) : WebQuery( parent ), m_citeSeerXServer( "citeseerx.ist.psu.edu" ) { m_widget = new WebQueryCiteSeerXWidget( parent ); } //_______________________________________________________________________________________________________________ // Destroy WebQueryCiteSeerX::~WebQueryCiteSeerX() { delete m_widget; } //_______________________________________________________________________________________________________________ // GUI string QString WebQueryCiteSeerX::title() { return i18n( "CiteSeerX" ); } //_______________________________________________________________________________________________________________ // GUI info QString WebQueryCiteSeerX::disclaimer() { return i18n( "About CiteSeerX" ); } //_______________________________________________________________________________________________________________ // URL for disclaimer QString WebQueryCiteSeerX::disclaimerURL() { return "http://citeseerx.ist.psu.edu/about/site"; } //_______________________________________________________________________________________________________________ // return pointer to widget WebQueryWidget *WebQueryCiteSeerX::widget() { return m_widget; } //_______________________________________________________________________________________________________________ // user has pressed "Cancel" void WebQueryCiteSeerX::cancelQuery() { m_queryQueue.clear(); } //_______________________________________________________________________________________________________________ // main function -- collects all queries for one search void WebQueryCiteSeerX::query() { // store CiteSeerX as future default WebQuery::query(); Settings *settings = Settings::self(); settings->setWebQueryDefault( "CiteSeerX", m_widget->lineEditQuery->text() ); // read number of desired results from GUI m_queryQueue.clear(); m_desiredHits = m_widget->spinBoxMaxHits->value(); // one for each entry, and one for each page of 10 links setNumStages( m_desiredHits + ( m_desiredHits / 10 + 1 ) ); // prepare search term QString searchTerm = m_widget->lineEditQuery->text().stripWhiteSpace().replace( '$', "" ); QStringList queryWords = QStringList::split( QRegExp( "\\s+" ), searchTerm ); if ( searchTerm.isEmpty() || queryWords.size() == 0 ) { setEndSearch( WebQuery::statusInvalidQuery ); return; } // build query from search term QString query; for ( uint i = 0; i < queryWords.size(); ++i ) { if ( i ) query += " AND "; query += queryWords[i]; } query = query.replace( "%", "%25" ).replace( "+", "%2B" ).replace( " ", "%20" ).replace( "#", "%23" ).replace( "&", "%26" ).replace( "?", "%3F" ); // schedule jobs DataRequest dr; dr.url = KURL( QString( "http://citeseerx.ist.psu.edu/search?q=" ).append( query ).append( "&submit=Search&sort=rel" ) ); dr.parser = &WebQueryCiteSeerX::parseSummaryPage; m_queryQueue.push_back( dr ); // start job queue nextJob(); } //_______________________________________________________________________________________________________________ // process results from current job void WebQueryCiteSeerX::parseSummaryPage( const QString& data ) { // regexp. for finding paper entries (example: href="/viewdoc/summary;jsessionid=12345ABCD?doi=10.1.1.108.9937") QRegExp paperXpr( "href=\"(/viewdoc/summary[^?]*\\?doi=[^\"]+)\"" ); // count paper results and schedule single paper URLs for ( int p = paperXpr.search( data ); p >= 0; p = paperXpr.search( data, p + paperXpr.matchedLength() ) ) { if ( ++m_receivedHits > m_desiredHits ) break; DataRequest dr; dr.url = KURL( QString( "http://" ) + m_citeSeerXServer + paperXpr.cap( 1 ) ); dr.parser = &WebQueryCiteSeerX::parsePaperPage; m_queryQueue.push_back( dr ); } // if we haven't reached the desired number of hits, schedule the next summary page QRegExp nextSummaryXpr( "Next 10" ); if ( m_receivedHits < m_desiredHits ) if ( nextSummaryXpr.search( data ) >= 0 ) { DataRequest dr; dr.url = KURL( QString( "http://" ) + m_citeSeerXServer + nextSummaryXpr.cap( 1 ).replace( "&", "&" ) ); dr.parser = &WebQueryCiteSeerX::parseSummaryPage; m_queryQueue.push_back( dr ); } } //_______________________________________________________________________________________________________________ // process the result of one single paper link void WebQueryCiteSeerX::parsePaperPage( const QString& data ) { // find type and id: @XXX{ YYY QRegExp typeIdXpr( "@(.*)\\{(.*)," ); typeIdXpr.setMinimal( true ); typeIdXpr.search( data ); QString typeStr = typeIdXpr.cap( 1 ); QString id = typeIdXpr.cap( 2 ); // create entry Entry *entry = new BibTeX::Entry( typeIdXpr.cap( 1 ), typeIdXpr.cap( 2 ) ); // find abstract: <..>Abstract: <..> XXX parseForSingleExpression( "<[^<]+>Abstract:\\s*<[^<]+>([^<]+)", data, entry, BibTeX::EntryField::ftAbstract ); // find title: title = {XXX} parseForSingleExpression( "title = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftTitle ); // find author: author = {XXX} parseForSingleExpression( "author = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftAuthor ); // find year: year = {XXX} parseForSingleExpression( "year = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftYear ); // find journal: journal = {XXX} parseForSingleExpression( "journal = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftJournal ); // find pages: pages = {XXX} parseForSingleExpression( "pages = \\{([^}]+)\\}", data, entry, BibTeX::EntryField::ftPages ); // publish what we've found emit foundEntry( entry, false ); } //_______________________________________________________________________________________________________________ // find single bibtex field in html page and add to entry void WebQueryCiteSeerX::parseForSingleExpression( QString description, const QString &data, Entry *entry, BibTeX::EntryField::FieldType type ) { // search, and add to entry if found QRegExp xpr( description ); if ( xpr.search( data ) + 1 ) { EntryField *field = new EntryField( type ); field->setValue( new Value( xpr.cap( 1 ), false ) ); entry->addField( field ); } } //_______________________________________________________________________________________________________________ // read data from the job and start the current parser void WebQueryCiteSeerX::getData( KIO::Job *job ) { // advance GUI progress bar enterNextStage(); if ( job && !job->error() && !m_aborted ) { // read data QBuffer data; data.open( IO_WriteOnly ); data.writeBlock( dynamic_cast( job )->data() ); data.close(); data.open( IO_ReadOnly ); QTextStream ts( &data ); QString result = ts.read(); data.close(); // hand the read data over to the parser ( this->*m_currentParser )( result ); } // proceed nextJob(); } //_______________________________________________________________________________________________________________ // call the next job void WebQueryCiteSeerX::nextJob() { // no more requests: finished if ( !m_queryQueue.size() ) { setEndSearch( WebQuery::statusSuccess ); m_receivedHits = 0; } // else: take the next request from queue and start it else if ( !m_aborted ) { m_currentParser = m_queryQueue.front().parser; KIO::Job *job = KIO::storedGet( m_queryQueue.front().url, FALSE, FALSE ); m_queryQueue.pop_front(); connect( job, SIGNAL( result( KIO::Job * ) ), this, SLOT( getData( KIO::Job * ) ) ); } } } #include "webqueryciteseerx.moc"