diff options
Diffstat (limited to 'qtinterface/tqtextcodec.cpp')
-rw-r--r-- | qtinterface/tqtextcodec.cpp | 492 |
1 files changed, 492 insertions, 0 deletions
diff --git a/qtinterface/tqtextcodec.cpp b/qtinterface/tqtextcodec.cpp index 7958168..6e047a5 100644 --- a/qtinterface/tqtextcodec.cpp +++ b/qtinterface/tqtextcodec.cpp @@ -21,3 +21,495 @@ Boston, MA 02110-1301, USA. #include <tqt.h> #include <tqtextcodec.h> + +#ifdef USE_QT4 + +// returns a string containing the letters and numbers from input, +// with a space separating run of a character class. e.g. "iso8859-1" +// becomes "iso 8859 1" +static QString lettersAndNumbers( const char * input ) +{ + QString result; + QChar c; + + while( input && *input ) { + c = *input; + if ( c.isLetter() || c.isNumber() ) + result += c.lower(); + if ( input[1] ) { + // add space at character class transition, except + // transition from upper-case to lower-case letter + QChar n( input[1] ); + if ( c.isLetter() && n.isLetter() ) { + if ( c == c.lower() && n == n.upper() ) + result += ' '; + } else if ( c.category() != n.category() ) { + result += ' '; + } + } + input++; + } + return result.simplifyWhiteSpace(); +} + +#define CHAINED 0xffff + +struct QMultiByteUnicodeTable { + // If multiByte, ignore unicode and index into multiByte + // with the next character. + QMultiByteUnicodeTable() : unicode(0xfffd), multiByte(0) { } + + ~QMultiByteUnicodeTable() + { + if ( multiByte ) + delete [] multiByte; + } + + ushort unicode; + QMultiByteUnicodeTable* multiByte; +}; + +static int getByte(char* &cursor) +{ + int byte = 0; + if ( *cursor ) { + if ( cursor[1] == 'x' ) + byte = strtol(cursor+2,&cursor,16); + else if ( cursor[1] == 'd' ) + byte = strtol(cursor+2,&cursor,10); + else + byte = strtol(cursor+2,&cursor,8); + } + return byte&0xff; +} + +class QTextCodecFromIOD; + +class QTextCodecFromIODDecoder : public QTextDecoder { + const QTextCodecFromIOD* codec; + QMultiByteUnicodeTable* mb; +public: + QTextCodecFromIODDecoder(const QTextCodecFromIOD* c); + //QString toUnicode(const char* chars, int len); + QString convertToUnicode(const char* chars, int len, int *state); +}; + +class QTextCodecFromIOD : public QTextCodec { + friend class QTextCodecFromIODDecoder; + + TQCString n; + + // If from_unicode_page[row()][cell()] is 0 and from_unicode_page_multiByte, + // use from_unicode_page_multiByte[row()][cell()] as string. + char** from_unicode_page; + char*** from_unicode_page_multiByte; + char unkn; + + // Only one of these is used + ushort* to_unicode; + QMultiByteUnicodeTable* to_unicode_multiByte; + int max_bytes_per_char; + TQStrList aliases; + + bool stateless() const { return !to_unicode_multiByte; } + +public: + QTextCodecFromIOD(QIODevice* iod) + { + from_unicode_page = 0; + to_unicode_multiByte = 0; + to_unicode = 0; + from_unicode_page_multiByte = 0; + max_bytes_per_char = 1; + + const int maxlen=100; + char line[maxlen]; + char esc='\\'; + char comm='%'; + bool incmap = FALSE; + while (iod->readLine(line,maxlen) > 0) { + if (0==qstrnicmp(line,"<code_set_name>",15)) + n = line+15; + else if (0==qstrnicmp(line,"<escape_char> ",14)) + esc = line[14]; + else if (0==qstrnicmp(line,"<comment_char> ",15)) + comm = line[15]; + else if (line[0]==comm && 0==qstrnicmp(line+1," alias ",7)) { + aliases.append(line+8); + } else if (0==qstrnicmp(line,"CHARMAP",7)) { + if (!from_unicode_page) { + from_unicode_page = new char*[256]; + for (int i=0; i<256; i++) + from_unicode_page[i]=0; + } + if (!to_unicode) { + to_unicode = new ushort[256]; + } + incmap = TRUE; + } else if (0==qstrnicmp(line,"END CHARMAP",11)) + break; + else if (incmap) { + char* cursor = line; + int byte=-1,unicode=-1; + ushort* mb_unicode=0; + const int maxmb=8; // more -> we'll need to improve datastructures + char mb[maxmb+1]; + int nmb=0; + + while (*cursor) { + if (cursor[0]=='<' && cursor[1]=='U' && + cursor[2]>='0' && cursor[2]<='9' && + cursor[3]>='0' && cursor[3]<='9') { + + unicode = strtol(cursor+2,&cursor,16); + + } else if (*cursor==esc) { + + byte = getByte(cursor); + + if ( *cursor == esc ) { + if ( !to_unicode_multiByte ) { + to_unicode_multiByte = + new QMultiByteUnicodeTable[256]; + for (int i=0; i<256; i++) { + to_unicode_multiByte[i].unicode = + to_unicode[i]; + to_unicode_multiByte[i].multiByte = 0; + } + delete [] to_unicode; + to_unicode = 0; + } + QMultiByteUnicodeTable* mbut = + to_unicode_multiByte+byte; + mb[nmb++] = byte; + while ( nmb < maxmb && *cursor == esc ) { + // Always at least once + + mbut->unicode = CHAINED; + byte = getByte(cursor); + mb[nmb++] = byte; + if (!mbut->multiByte) { + mbut->multiByte = + new QMultiByteUnicodeTable[256]; + } + mbut = mbut->multiByte+byte; + mb_unicode = & mbut->unicode; + } + + if ( nmb > max_bytes_per_char ) + max_bytes_per_char = nmb; + } + } else { + cursor++; + } + } + + if (unicode >= 0 && unicode <= 0xffff) + { + QChar ch((ushort)unicode); + if (!from_unicode_page[ch.row()]) { + from_unicode_page[ch.row()] = new char[256]; + for (int i=0; i<256; i++) + from_unicode_page[ch.row()][i]=0; + } + if ( mb_unicode ) { + from_unicode_page[ch.row()][ch.cell()] = 0; + if (!from_unicode_page_multiByte) { + from_unicode_page_multiByte = new char**[256]; + for (int i=0; i<256; i++) + from_unicode_page_multiByte[i]=0; + } + if (!from_unicode_page_multiByte[ch.row()]) { + from_unicode_page_multiByte[ch.row()] = new char*[256]; + for (int i=0; i<256; i++) + from_unicode_page_multiByte[ch.row()][i] = 0; + } + mb[nmb++] = 0; + from_unicode_page_multiByte[ch.row()][ch.cell()] + = qstrdup(mb); + *mb_unicode = unicode; + } else { + from_unicode_page[ch.row()][ch.cell()] = (char)byte; + if ( to_unicode ) + to_unicode[byte] = unicode; + else + to_unicode_multiByte[byte].unicode = unicode; + } + } else { + } + } + } + n = n.stripWhiteSpace(); + + unkn = '?'; // ##### Might be a bad choice. + } + + ~QTextCodecFromIOD() + { + if ( from_unicode_page ) { + for (int i=0; i<256; i++) + if (from_unicode_page[i]) + delete [] from_unicode_page[i]; + } + if ( from_unicode_page_multiByte ) { + for (int i=0; i<256; i++) + if (from_unicode_page_multiByte[i]) + for (int j=0; j<256; j++) + if (from_unicode_page_multiByte[i][j]) + delete [] from_unicode_page_multiByte[i][j]; + } + if ( to_unicode ) + delete [] to_unicode; + if ( to_unicode_multiByte ) + delete [] to_unicode_multiByte; + } + + bool ok() const + { + return !!from_unicode_page; + } + + QTextDecoder* makeDecoder() const + { + if ( stateless() ) + return QTextCodec::makeDecoder(); + else + return new QTextCodecFromIODDecoder(this); + } + + const char* qtio_name() const + { + return n; + } + + int mibEnum() const + { + return 0; // #### Unknown. + } + + int heuristicContentMatch(const char*, int) const + { + return 0; + } + + int heuristicNameMatch(const char* hint) const + { + int bestr = QTextCodec::heuristicNameMatch(hint); + TQStrListIterator it(aliases); + char* a; + while ((a=it.current())) { + ++it; + int r = simpleHeuristicNameMatch(a,hint); + if (r > bestr) + bestr = r; + } + return bestr; + } + + QString toUnicode(const char* chars, int len) const + { + const uchar* uchars = (const uchar*)chars; + QString result; + QMultiByteUnicodeTable* multiByte=to_unicode_multiByte; + if ( multiByte ) { + while (len--) { + QMultiByteUnicodeTable& mb = multiByte[*uchars]; + if ( mb.multiByte ) { + // Chained multi-byte + multiByte = mb.multiByte; + } else { + result += QChar(mb.unicode); + multiByte=to_unicode_multiByte; + } + uchars++; + } + } else { + while (len--) + result += QChar(to_unicode[*uchars++]); + } + return result; + } + + QString convertToUnicode(const char* chars, int len, ConverterState *state) const + { + return toUnicode(chars, len); + } + +#if !defined(Q_NO_USING_KEYWORD) + using QTextCodec::fromUnicode; +#endif + TQCString fromUnicode(const QString& uc, int& lenInOut) const + { + if (lenInOut > (int)uc.length()) + lenInOut = uc.length(); + int rlen = lenInOut*max_bytes_per_char; + TQCString rstr(rlen+1); + char* cursor = rstr.data(); + char* s=0; + int l = lenInOut; + int lout = 0; + for (int i=0; i<l; i++) { + QChar ch = uc[i]; + if ( ch == QChar() ) { + // special + *cursor++ = 0; + } else if ( from_unicode_page[ch.row()] && + from_unicode_page[ch.row()][ch.cell()] ) + { + *cursor++ = from_unicode_page[ch.row()][ch.cell()]; + lout++; + } else if ( from_unicode_page_multiByte && + from_unicode_page_multiByte[ch.row()] && + (s=from_unicode_page_multiByte[ch.row()][ch.cell()]) ) + { + while (*s) { + *cursor++ = *s++; + lout++; + } + } else { + *cursor++ = unkn; + lout++; + } + } + *cursor = 0; + lenInOut = lout; + return rstr; + } + + QByteArray convertFromUnicode(const QChar *charin, int len, ConverterState *state) const + { + return fromUnicode(charin, len); + } + + QByteArray name() const + { + return qtio_name(); + } +}; + +// QTextCodecFromIODDecoder::QTextCodecFromIODDecoder(const QTextCodecFromIOD* c) : +// codec(c) +// { +// mb = codec->to_unicode_multiByte; +// } + +QString QTextCodecFromIODDecoder::convertToUnicode(const char* chars, int len, int *state) +{ + const uchar* uchars = (const uchar*)chars; + QString result; + while (len--) { + QMultiByteUnicodeTable& t = mb[*uchars]; + if ( t.multiByte ) { + // Chained multi-byte + mb = t.multiByte; + } else { + if ( t.unicode ) + result += QChar(t.unicode); + mb=codec->to_unicode_multiByte; + } + uchars++; + } + return result; +} + +#ifndef QT_NO_CODECS +// Cannot use <pre> or \code +/*! + Reads a POSIX2 charmap definition from \a iod. + The parser recognizes the following lines: + +<font name="sans"> + <code_set_name> <i>name</i></br> + <escape_char> <i>character</i></br> + % alias <i>alias</i></br> + CHARMAP</br> + <<i>token</i>> /x<i>hexbyte</i> <U<i>unicode</i>> ...</br> + <<i>token</i>> /d<i>decbyte</i> <U<i>unicode</i>> ...</br> + <<i>token</i>> /<i>octbyte</i> <U<i>unicode</i>> ...</br> + <<i>token</i>> /<i>any</i>/<i>any</i>... <U<i>unicode</i>> ...</br> + END CHARMAP</br> +</font> + + The resulting QTextCodec is returned (and also added to the global + list of codecs). The name() of the result is taken from the + code_set_name. + + Note that a codec constructed in this way uses much more memory + and is slower than a hand-written QTextCodec subclass, since + tables in code are kept in memory shared by all Qt applications. + + \sa loadCharmapFile() +*/ +QTextCodec* QTextCodec::loadCharmap(QIODevice* iod) +{ + QTextCodecFromIOD* r = new QTextCodecFromIOD(iod); + if ( !r->ok() ) { + delete r; + r = 0; + } + return r; +} + +/*! + A convenience function for loadCharmap() that loads the charmap + definition from the file \a filename. +*/ +QTextCodec* QTextCodec::loadCharmapFile(QString filename) +{ + QFile f(filename); + if (f.open(IO_ReadOnly)) { + QTextCodecFromIOD* r = new QTextCodecFromIOD(&f); + if ( !r->ok() ) + delete r; + else + return r; + } + return 0; +} + +/*! + Returns a value indicating how likely it is that this decoder is + appropriate for decoding some format that has the given name. The + name is compared with the \a hint. + + A good match returns a positive number around the length of the + string. A bad match is negative. + + The default implementation calls simpleHeuristicNameMatch() with + the name of the codec. +*/ +int QTextCodec::heuristicNameMatch(const char* hint) const +{ + return simpleHeuristicNameMatch(name(),hint); +} + +/*! + A simple utility function for heuristicNameMatch(): it does some + very minor character-skipping so that almost-exact matches score + high. \a name is the text we're matching and \a hint is used for + the comparison. +*/ +int QTextCodec::simpleHeuristicNameMatch(const char* name, const char* hint) +{ + // if they're the same, return a perfect score. + if ( name && hint && *name && *hint && qstricmp( name, hint ) == 0 ) + return qstrlen( hint ); + + // if the letters and numbers are the same, we have an "almost" + // perfect match. + QString h( lettersAndNumbers( hint ) ); + QString n( lettersAndNumbers( name ) ); + if ( h == n ) + return qstrlen( hint )-1; + + if ( h.stripWhiteSpace() == n.stripWhiteSpace() ) + return qstrlen( hint )-2; + + // could do some more here, but I don't think it's worth it + + return 0; +} + +#endif //QT_NO_CODECS + +#endif // USE_QT4
\ No newline at end of file |