diff options
Diffstat (limited to 'src/fileimporterbibtex.cpp')
-rw-r--r-- | src/fileimporterbibtex.cpp | 658 |
1 files changed, 658 insertions, 0 deletions
diff --git a/src/fileimporterbibtex.cpp b/src/fileimporterbibtex.cpp new file mode 100644 index 0000000..5312f0c --- /dev/null +++ b/src/fileimporterbibtex.cpp @@ -0,0 +1,658 @@ +/*************************************************************************** +* Copyright (C) 2004-2009 by Thomas Fischer * +* fischer@unix-ag.uni-kl.de * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * +***************************************************************************/ +#include <qiodevice.h> +#include <qregexp.h> +#include <qapplication.h> + +#include <file.h> +#include <comment.h> +#include <macro.h> +#include <preamble.h> +#include <entry.h> +#include <element.h> +#include <encoderlatex.h> +#include <value.h> + +#include "fileimporterbibtex.h" + +#define max(a,b) ((a)<(b)?(b):(a)) + +namespace BibTeX +{ + const QString extraAlphaNumChars = QString( "?'`-_:.+/$\\\"&" ); + const QRegExp htmlRegExp = QRegExp( "</?(a|pre)[^>]*>", false ); + + FileImporterBibTeX::FileImporterBibTeX( bool personFirstNameFirst, QString encoding ) : FileImporter(), m_personFirstNameFirst( personFirstNameFirst ), m_currentChar( ' ' ), m_ignoreComments( FALSE ), m_lineBufferSize( 4096 ), m_encoding( encoding ) + { + cancelFlag = FALSE; + m_lineBuffer = new char[m_lineBufferSize]; + m_textStream = NULL; + } + + + FileImporterBibTeX::~FileImporterBibTeX() + { + delete[] m_lineBuffer; + } + + File* FileImporterBibTeX::load( QIODevice *iodevice ) + { + m_mutex.lock(); + cancelFlag = FALSE; + + QString rawText; + const char *encodingFrom = m_encoding == "latex" ? "utf-8\0" : m_encoding.append( "\0" ).ascii(); + iconv_t iconvHandle = iconv_open( "utf-8", encodingFrom ); + char *convertedLine = new char[m_lineBufferSize * 4]; + int len; + bool encodingOk = true; + while ( encodingOk && iodevice->isReadable() && ( len = iodevice->readLine( m_lineBuffer, m_lineBufferSize ) ) > 0 ) + { + evaluateParameterComments( iconvHandle, m_lineBuffer ); + + char *raw = m_lineBuffer; + char *enc = convertedLine; + size_t encLen = m_lineBufferSize, rawLen = ( size_t )len; + size_t result = iconv( iconvHandle, &raw, &rawLen, &enc, &encLen ); + + qApp->processEvents(); + + if ( result != 0 ) + { + QString problematic = QString( m_lineBuffer ).mid( max( 0, m_lineBufferSize - encLen - 15 ), 30 ); + if ( problematic.isNull() || problematic.isEmpty() ) problematic = QString( m_lineBuffer ); + qDebug( "iconv resulted in error code %i for source encoding %s, maybe file is in different encoding? Problem is somewhere here: \"%s\"", result, encodingFrom, problematic.latin1() ); + encodingOk = false; + break; + } + if ( rawLen > 0 ) + { + qDebug( "iconv could not convert complete string, only %i out of %i chars", len - rawLen, len ); + encodingOk = false; + break; + } + enc[0] = '\0'; + + /** remove leading UTF-8 byte-order mark (BOM) */ + int offset = 0; + while (((( unsigned char )convertedLine[offset] ) == 0xef || (( unsigned char )convertedLine[offset] ) == 0xbb || (( unsigned char )convertedLine[offset] ) == 0xbf ) && offset < 4 ) + ++offset; + + QString line = QString::fromUtf8( convertedLine + offset ); + rawText.append( line ); + } + iconv_close( iconvHandle ); + delete[] convertedLine; + + if ( !encodingOk ) + { + qDebug( "Decoding failed, cannot load file. Please fix encoding manually." ); + m_mutex.unlock(); + return NULL; + } + + /** Cleaning up code comming from DBLP */ + rawText = rawText.replace( htmlRegExp, "" ); + rawText = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText ); + unescapeLaTeXChars( rawText ); + m_textStream = new QTextStream( rawText, IO_ReadOnly ); + m_textStream->setEncoding( QTextStream::UnicodeUTF8 ); + m_currentLineNumber = 0; + m_posIntCurrentLine = 0; + m_currentLine = ""; + + File *result = new File(); + QIODevice *streamDevice = m_textStream->device(); + while ( !cancelFlag && !m_textStream->atEnd() ) + { + emit progress( streamDevice->at(), streamDevice->size() ); + qApp->processEvents(); + Element * element = nextElement(); + if ( element != NULL ) + { + Comment *comment = dynamic_cast<Comment*>( element ); + if ( !m_ignoreComments || comment == NULL ) + result->appendElement( element ); + else + delete element; + } + qApp->processEvents(); + } + emit progress( streamDevice->size(), streamDevice->size() ); + + if ( cancelFlag ) + { + qDebug( "Loading file has been canceled" ); + delete result; + result = NULL; + } + + delete m_textStream; + + m_mutex.unlock(); + return result; + } + + bool FileImporterBibTeX::guessCanDecode( const QString & rawText ) + { + QString text = EncoderLaTeX::currentEncoderLaTeX() ->decode( rawText ); + return text.find( QRegExp( "@\\w+\\{.+\\}" ) ) >= 0; + } + + void FileImporterBibTeX::setIgnoreComments( bool ignoreComments ) + { + m_ignoreComments = ignoreComments; + } + + void FileImporterBibTeX::cancel() + { + cancelFlag = TRUE; + } + + Element *FileImporterBibTeX::nextElement() + { + Token token = nextToken(); + + if ( token == tAt ) + { + QString elementType = readSimpleString(); + if ( elementType.lower() == "comment" ) + return readCommentElement(); + else if ( elementType.lower() == "string" ) + return readMacroElement(); + else if ( elementType.lower() == "preamble" ) + return readPreambleElement(); + else if ( !elementType.isEmpty() ) + return readEntryElement( elementType ); + else + { + qDebug( "ElementType is empty" ); + return NULL; + } + } + else if ( token == tUnknown ) + { + qDebug( "Unknown token near line %i, treating as comment", m_currentLineNumber ); + return readPlainCommentElement(); + } + + if ( token != tEOF ) + qDebug( "Don't know how to parse next token near line %i: %s", m_currentLineNumber, tokenidToString( token ).latin1() ); + + return NULL; + } + + Comment *FileImporterBibTeX::readCommentElement() + { + while ( m_currentChar != '{' && m_currentChar != '(' && !m_textStream->atEnd() ) + m_currentChar = nextChar(); + + return new Comment( readBracketString( m_currentChar ), TRUE ); + } + + Comment *FileImporterBibTeX::readPlainCommentElement() + { + QString result = m_currentChar; + result += readLine(); + m_currentChar = nextChar(); + while ( !m_textStream->atEnd() && m_currentChar != '@' && !m_currentChar.isSpace() ) + { + result.append( '\n' ).append( m_currentChar ); + m_currentChar = nextChar(); + result.append( readLine() ); + m_currentChar = nextChar(); + } + return new Comment( result, FALSE ); + } + + Macro *FileImporterBibTeX::readMacroElement() + { + Token token = nextToken(); + while ( token != tBracketOpen ) + { + if ( token == tEOF ) + { + qDebug( "Error in parsing unknown macro (near line %i): Opening curly brace ({) expected", m_currentLineNumber ); + return NULL; + } + token = nextToken(); + } + + QString key = readSimpleString(); + if ( nextToken() != tAssign ) + { + qDebug( "Error in parsing macro '%s' (near line %i): Assign symbol (=) expected", key.latin1(), m_currentLineNumber ); + return NULL; + } + + Macro *macro = new Macro( key ); + do + { + bool isStringKey = FALSE; + QString text = readString( isStringKey ).replace( QRegExp( "\\s+" ), " " ); + if ( isStringKey ) + macro->value()->items.append( new MacroKey( text ) ); + else + macro->value()->items.append( new BibTeX::PlainText( text ) ); + + token = nextToken(); + } + while ( token == tDoublecross ); + + return macro; + } + + Preamble *FileImporterBibTeX::readPreambleElement() + { + Token token = nextToken(); + while ( token != tBracketOpen ) + { + if ( token == tEOF ) + { + qDebug( "Error in parsing unknown preamble (near line %i): Opening curly brace ({) expected", m_currentLineNumber ); + return NULL; + } + token = nextToken(); + } + + Preamble *preamble = new Preamble( ); + do + { + bool isStringKey = FALSE; + QString text = readString( isStringKey ).replace( QRegExp( "\\s+" ), " " ); + if ( isStringKey ) + preamble->value()->items.append( new MacroKey( text ) ); + else + preamble->value()->items.append( new BibTeX::PlainText( text ) ); + + token = nextToken(); + } + while ( token == tDoublecross ); + + return preamble; + } + + Entry *FileImporterBibTeX::readEntryElement( const QString& typeString ) + { + Token token = nextToken(); + while ( token != tBracketOpen ) + { + if ( token == tEOF ) + { + qDebug( "Error in parsing unknown entry (near line %i): Opening curly brace ({) expected", m_currentLineNumber ); + return NULL; + } + token = nextToken(); + } + + QString key = readSimpleString(); + Entry *entry = new Entry( typeString, key ); + + token = nextToken(); + do + { + if ( token == tBracketClose || token == tEOF ) + break; + else if ( token != tComma ) + { + qDebug( "Error in parsing entry '%s' (near line %i): Comma symbol (,) expected but got 0x%x (token %s)", key.latin1(), m_currentLineNumber, m_currentChar.unicode(), tokenidToString( token ).latin1() ); + delete entry; + return NULL; + } + + QString fieldTypeName = readSimpleString(); + token = nextToken(); + if ( fieldTypeName == QString::null || token == tBracketClose ) + { + // entry is buggy, but we still accept it + break; + } + else if ( token != tAssign ) + { + qDebug( "Error in parsing entry '%s' (near line %i): Assign symbol (=) expected after field name '%s'", key.latin1(), m_currentLineNumber, fieldTypeName.latin1() ); + delete entry; + return NULL; + } + + /** check for duplicate fields */ + if ( entry->getField( fieldTypeName ) != NULL ) + { + int i = 1; + QString appendix = QString::number( i ); + while ( entry->getField( fieldTypeName + appendix ) != NULL ) + { + ++i; + appendix = QString::number( i ); + } + fieldTypeName += appendix; + } + + EntryField *entryField = new EntryField( fieldTypeName ); + + token = readValue( entryField->value(), entryField->fieldType() ); + + entry->addField( entryField ); + } + while ( TRUE ); + + return entry; + } + + FileImporterBibTeX::Token FileImporterBibTeX::nextToken() + { + if ( m_textStream->atEnd() ) + return tEOF; + + Token curToken = tUnknown; + + while (( m_currentChar.isSpace() || m_currentChar == '\t' ) && !m_textStream->atEnd() ) + m_currentChar = nextChar(); + + switch ( m_currentChar.latin1() ) + { + case '@': + curToken = tAt; + break; + case '{': + case '(': + curToken = tBracketOpen; + break; + case '}': + case ')': + curToken = tBracketClose; + break; + case ',': + curToken = tComma; + break; + case '=': + curToken = tAssign; + break; + case '#': + curToken = tDoublecross; + break; + default: + if ( m_textStream->atEnd() ) + curToken = tEOF; + } + + if ( curToken != tUnknown && curToken != tEOF ) + m_currentChar = nextChar(); + + return curToken; + } + + QString FileImporterBibTeX::readString( bool &isStringKey ) + { + while ( m_currentChar.isSpace() ) + m_currentChar = nextChar(); + + isStringKey = FALSE; + switch ( m_currentChar.latin1() ) + { + case '{': + case '(': + return readBracketString( m_currentChar ); + case '"': + return readQuotedString(); + default: + isStringKey = TRUE; + return readSimpleString(); + } + } + + QString FileImporterBibTeX::readSimpleString( QChar until ) + { + QString result; + + while ( m_currentChar.isSpace() ) + m_currentChar = nextChar(); + + if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) ) + { + result.append( m_currentChar ); + m_currentChar = nextChar(); + } + + while ( !m_textStream->atEnd() ) + { + if ( until != '\0' ) + { + if ( m_currentChar != until ) + result.append( m_currentChar ); + else + break; + } + else + if ( m_currentChar.isLetterOrNumber() || extraAlphaNumChars.contains( m_currentChar ) ) + result.append( m_currentChar ); + else if ( m_currentChar == "," || m_currentChar == "(" || m_currentChar == ")" || m_currentChar == "{" || m_currentChar == "}" || m_currentChar == "=" || m_currentChar == "#" || m_currentChar == "@" || m_currentChar.isSpace() ) + break; + else + { + qDebug( "Unknown letter or number: 0x%x", m_currentChar.unicode() ); + // break; + } + m_currentChar = nextChar(); + } + return result; + } + + QString FileImporterBibTeX::readQuotedString() + { + QString result; + QChar lastChar = m_currentChar; + m_currentChar = nextChar(); + while ( !m_textStream->atEnd() ) + { + if ( m_currentChar != '"' || lastChar == '\\' ) + result.append( m_currentChar ); + else + break; + lastChar = m_currentChar; + m_currentChar = nextChar(); + } + + /** read character after closing " */ + m_currentChar = nextChar(); + + return result; + } + + QString FileImporterBibTeX::readLine() + { + QString result = m_currentLine.mid( m_posIntCurrentLine ); + m_posIntCurrentLine = m_currentLine.length() + 2; + return result; + } + + QString FileImporterBibTeX::readBracketString( const QChar openingBracket ) + { + QString result; + QChar closingBracket = '}'; + if ( openingBracket == '(' ) + closingBracket = ')'; + int counter = 1; + m_currentChar = nextChar(); + while ( !m_textStream->atEnd() ) + { + if ( m_currentChar == openingBracket ) + counter++; + else if ( m_currentChar == closingBracket ) + counter--; + + if ( counter == 0 ) + break; + else + result.append( m_currentChar ); + m_currentChar = nextChar(); + } + m_currentChar = nextChar(); + return result; + } + + FileImporterBibTeX::Token FileImporterBibTeX::readValue( Value *value, EntryField::FieldType fieldType ) + { + Token token = tUnknown; + + do + { + bool isStringKey = FALSE; + QString text = readString( isStringKey ).replace( QRegExp( "\\s+" ), " " ); + + switch ( fieldType ) + { + case EntryField::ftKeywords: + { + if ( isStringKey ) + qDebug( "WARNING: Cannot handle keywords that are macros" ); + else + value->items.append( new KeywordContainer( text ) ); + } + break; + case EntryField::ftAuthor: + case EntryField::ftEditor: + { + if ( isStringKey ) + qDebug( "WARNING: Cannot handle authors/editors that are macros" ); + else + { + QStringList persons; + splitPersons( text, persons ); + PersonContainer *container = new PersonContainer( m_personFirstNameFirst ); + for ( QStringList::ConstIterator pit = persons.constBegin(); pit != persons.constEnd(); ++pit ) + container->persons.append( new Person( *pit, m_personFirstNameFirst ) ); + value->items.append( container ); + } + } + break; + case EntryField::ftPages: + text.replace( QRegExp( "\\s*--?\\s*" ), QChar( 0x2013 ) ); + default: + { + if ( isStringKey ) + value->items.append( new MacroKey( text ) ); + else + value->items.append( new BibTeX::PlainText( text ) ); + } + } + + token = nextToken(); + } + while ( token == tDoublecross ); + + return token; + } + + void FileImporterBibTeX::unescapeLaTeXChars( QString &text ) + { + text.replace( "\\&", "&" ); + } + + void FileImporterBibTeX::splitPersons( const QString& text, QStringList &persons ) + { + QStringList wordList; + QString word; + int bracketCounter = 0; + + for ( unsigned int pos = 0;pos < text.length();++pos ) + { + if ( text[pos] == '{' ) + ++bracketCounter; + else if ( text[pos] == '}' ) + --bracketCounter; + + if ( text[pos] == ' ' || text[pos] == '\n' || text[pos] == '\r' ) + { + if ( word == "and" && bracketCounter == 0 ) + { + persons.append( wordList.join( " " ) ); + wordList.clear(); + } + else if ( !word.isEmpty() ) + wordList.append( word ); + + word = ""; + } + else + word.append( text[pos] ); + } + + wordList.append( word ); + persons.append( wordList.join( " " ) ); + } + + void FileImporterBibTeX::evaluateParameterComments( iconv_t &iconvHandle, const char *cline ) + { + /** simple preliminary checks before expensive conversion to QString */ + if ( cline[0] == '@' && cline[1] == 'c' ) + { + QString line = QString( cline ).lower(); + /** check if this file requests a special encoding */ + if ( line.startsWith( "@comment{x-kbibtex-encoding=" ) && line.endsWith( "}\n" ) ) + { + QString newEncoding = line.mid( 28, line.length() - 30 ); + qDebug( "x-kbibtex-encoding=<%s>", newEncoding.latin1() ); + if ( newEncoding == "latex" ) newEncoding = "utf-8"; + iconv_close( iconvHandle ); + iconvHandle = iconv_open( "utf-8", newEncoding.append( '\0' ).ascii() ); + } + } + } + + QChar FileImporterBibTeX::nextChar() + { + bool atEndOfLine = m_posIntCurrentLine >= m_currentLine.length(); + + while (( m_posIntCurrentLine >= m_currentLine.length() || m_currentLine.isEmpty() || m_currentLine.isNull() ) && !m_textStream->atEnd() ) + { + m_currentLine = m_textStream->readLine(); + m_posIntCurrentLine = 0; + ++m_currentLineNumber; + } + + if ( atEndOfLine ) + return QChar( ' ' ); + else if ( m_posIntCurrentLine < m_currentLine.length() ) + { + QChar result = m_currentLine[m_posIntCurrentLine]; + ++m_posIntCurrentLine; + return result; + } + + return QChar(); + } + + QString FileImporterBibTeX::tokenidToString( Token token ) + { + switch ( token ) + { + case tAt: return QString( "At" ); + case tBracketClose: return QString( "BracketClose" ); + case tBracketOpen: return QString( "BracketOpen" ); + case tAlphaNumText: return QString( "AlphaNumText" ); + case tAssign: return QString( "Assign" ); + case tComma: return QString( "Comma" ); + case tDoublecross: return QString( "Doublecross" ); + case tEOF: return QString( "EOF" ); + case tUnknown: return QString( "Unknown" ); + default: return QString( "<Unknown>" ); + } + } +} |