diff options
Diffstat (limited to 'tdehtml/misc/decoder.cpp')
-rw-r--r-- | tdehtml/misc/decoder.cpp | 790 |
1 files changed, 790 insertions, 0 deletions
diff --git a/tdehtml/misc/decoder.cpp b/tdehtml/misc/decoder.cpp new file mode 100644 index 000000000..6000aa9cf --- /dev/null +++ b/tdehtml/misc/decoder.cpp @@ -0,0 +1,790 @@ +/* + This file is part of the KDE libraries + + Copyright (C) 1999 Lars Knoll (knoll@kde.org) + Copyright (C) 2003 Dirk Mueller (mueller@kde.org) + Copyright (C) 2003 Apple Computer, Inc. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. +*/ +//---------------------------------------------------------------------------- +// +// KDE HTML Widget -- decoder for input stream + +#undef DECODE_DEBUG +//#define DECODE_DEBUG + +#include <assert.h> + +#include "decoder.h" +#include "guess_ja.h" + +using namespace tdehtml; + +#include "htmlhashes.h" + +#include <tqregexp.h> +#include <tqtextcodec.h> + +#include <kglobal.h> +#include <kcharsets.h> + +#include <ctype.h> +#include <kdebug.h> +#include <klocale.h> + + + +Decoder::Decoder() +{ + // latin1 + m_codec = TQTextCodec::codecForMib(4); + m_decoder = m_codec->makeDecoder(); + enc = 0; + m_type = DefaultEncoding; + body = false; + beginning = true; + visualRTL = false; + m_autoDetectLanguage = SemiautomaticDetection; + kc = NULL; +} + +Decoder::~Decoder() +{ + delete m_decoder; + if (kc) + delete kc; +} + +void Decoder::setEncoding(const char *_encoding, EncodingType type) +{ +#ifdef DECODE_DEBUG + kdDebug(6005) << "setEncoding " << _encoding << " " << type << endl; +#endif + enc = _encoding; +#ifdef DECODE_DEBUG + kdDebug(6005) << "old encoding is:" << m_codec->name() << endl; +#endif + enc = enc.lower(); +#ifdef DECODE_DEBUG + kdDebug(6005) << "requesting:" << enc << endl; +#endif + if(enc.isNull() || enc.isEmpty()) + return; + +#ifdef APPLE_CHANGES + TQTextCodec *codec = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) + ? TQTextCodec::codecForNameEightBitOnly(enc) + : TQTextCodec::codecForName(enc); + if (codec) { + enc = codec->name(); + visualRTL = codec->usesVisualOrdering(); + } +#else + if(enc == "visual") // hebrew visually ordered + enc = "iso8859-8"; + bool b; + TQTextCodec *codec = TDEGlobal::charsets()->codecForName(enc, b); + if (!b) + codec = 0; + + if (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) { + //Sometimes the codec specified is absurd, i.e. UTF-16 despite + //us decoding a meta tag as ASCII. In that case, ignore it. + if (codec && + (codec->mibEnum() == 1000)) //UTF16 or similar. + codec = 0; + } + + if (codec && codec->mibEnum() == 11) { + //We do NOT want to use Qt's TQHebrewCodec, since it tries to reorder itself. + codec = TQTextCodec::codecForName("iso8859-8-i"); + + // visually ordered unless one of the following + if( !(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" + || enc == "csiso88598i" || enc == "logical") ) + visualRTL = true; + } +#endif + + if( codec ) { // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs) + m_codec = codec; + m_type = type; + delete m_decoder; + m_decoder = m_codec->makeDecoder(); + } + +#ifdef DECODE_DEBUG + kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl; +#endif +} + +const char *Decoder::encoding() const +{ + return enc; +} + +// Other browsers allow comments in the head section, so we need to also. +// It's important not to look for tags inside the comments. +static void skipComment(const char *&ptr, const char *pEnd) +{ + const char *p = ptr; + // Allow <!-->; other browsers do. + if (*p == '>') { + p++; + } else { + while (p != pEnd) { + if (*p == '-') { + // This is the real end of comment, "-->". + if (p[1] == '-' && p[2] == '>') { + p += 3; + break; + } + // This is the incorrect end of comment that other browsers allow, "--!>". + if (p[1] == '-' && p[2] == '!' && p[3] == '>') { + p += 4; + break; + } + } + p++; + } + } + ptr = p; +} + +// Returns the position of the encoding string. +static int findXMLEncoding(const TQCString &str, int &encodingLength) +{ + int len = str.length(); + + int pos = str.find("encoding"); + if (pos == -1) + return -1; + pos += 8; + + // Skip spaces and stray control characters. + while (pos < len && str[pos] <= ' ') + ++pos; + + //Bail out if nothing after + if (pos >= len) + return -1; + + // Skip equals sign. + if (str[pos] != '=') + return -1; + ++pos; + + // Skip spaces and stray control characters. + while (pos < len && str[pos] <= ' ') + ++pos; + + //Bail out if nothing after + if (pos >= len) + return -1; + + // Skip quotation mark. + char quoteMark = str[pos]; + if (quoteMark != '"' && quoteMark != '\'') + return -1; + ++pos; + + // Find the trailing quotation mark. + int end = pos; + while (end < len && str[end] != quoteMark) + ++end; + + if (end >= len) + return -1; + + encodingLength = end - pos; + return pos; +} + +TQString Decoder::decode(const char *data, int len) +{ + // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. + int bufferLength = buffer.length(); + const int maximumBOMLength = 10; + if (beginning && bufferLength + len >= maximumBOMLength) { + // If the user has chosen utf16 we still need to auto-detect the endianness + if ((m_type != UserChosenEncoding) || (m_codec->mibEnum() == 1000)) { + // Extract the first three bytes. + // Handle the case where some of bytes are already in the buffer. + const uchar *udata = (const uchar *)data; + uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++; + uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++; + uchar c3 = bufferLength >= 3 ? (uchar)buffer[2] : *udata++; + + // Check for the BOM + const char *autoDetectedEncoding; + if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { + autoDetectedEncoding = "ISO-10646-UCS-2"; + } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { + autoDetectedEncoding = "UTF-8"; + } else if (c1 == 0x00 || c2 == 0x00) { + uchar c4 = bufferLength >= 4 ? (uchar)buffer[3] : *udata++; + uchar c5 = bufferLength >= 5 ? (uchar)buffer[4] : *udata++; + uchar c6 = bufferLength >= 6 ? (uchar)buffer[5] : *udata++; + uchar c7 = bufferLength >= 7 ? (uchar)buffer[6] : *udata++; + uchar c8 = bufferLength >= 8 ? (uchar)buffer[7] : *udata++; + uchar c9 = bufferLength >= 9 ? (uchar)buffer[8] : *udata++; + uchar c10 = bufferLength >= 10 ? (uchar)buffer[9] : *udata++; + int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); + int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); + if ((nul_count_even == 0 && nul_count_odd == 5) || + (nul_count_even == 5 && nul_count_odd == 0)) + autoDetectedEncoding = "ISO-10646-UCS-2"; + else + autoDetectedEncoding = 0; + } else { + autoDetectedEncoding = 0; + } + + // If we found a BOM, use the encoding it implies. + if (autoDetectedEncoding != 0) { + m_type = AutoDetectedEncoding; + m_codec = TQTextCodec::codecForName(autoDetectedEncoding); + assert(m_codec); + enc = m_codec->name(); + delete m_decoder; + m_decoder = m_codec->makeDecoder(); + if (m_codec->mibEnum() == 1000 && c2 == 0x00) + { + // utf16LE, we need to put the decoder in LE mode + char reverseUtf16[3] = {0xFF, 0xFE, 0x00}; + m_decoder->toUnicode(reverseUtf16, 2); + } + } + } + beginning = false; + } + + // this is not completely efficient, since the function might go + // through the html head several times... + + bool lookForMetaTag = m_type == DefaultEncoding && !body; + + if (lookForMetaTag) { +#ifdef DECODE_DEBUG + kdDebug(6005) << "looking for charset definition" << endl; +#endif + { // extra level of braces to keep indenting matching original for better diff'ing +#ifdef APPLE_CHANGES + buffer.append(data, len); +#else + if(m_codec->mibEnum() != 1000) { // utf16 + // replace '\0' by spaces, for buggy pages + char *d = const_cast<char *>(data); + int i = len - 1; + while(i >= 0) { + if(d[i] == 0) d[i] = ' '; + i--; + } + } + buffer += TQCString(data, len+1); +#endif + // we still don't have an encoding, and are in the head + // the following tags are allowed in <head>: + // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE + int invalid = 0; // invalid head tag count +#ifdef APPLE_CHANGES + const char *ptr = buffer.latin1(); + const char *pEnd = ptr + buffer.length(); +#else + const char *ptr = buffer.data(); + const char *pEnd = ptr + buffer.length(); +#endif + while(ptr != pEnd) + { + if(*ptr == '<') { + bool end = false; + ptr++; + + // Handle comments. + if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { + ptr += 3; + skipComment(ptr, pEnd); + continue; + } + + // Handle XML header, which can have encoding in it. + if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') { + const char *end = ptr; + while (*end != '>' && *end != '\0') end++; + if (*end == '\0') + break; + TQCString str(ptr, end - ptr + 1); //+1 as it must include the \0 terminator + int len; + int pos = findXMLEncoding(str, len); + if (pos != -1) { + setEncoding(str.mid(pos, len), EncodingFromXMLHeader); + if (m_type == EncodingFromXMLHeader) + goto found; + } + } + + if(*ptr == '/') ptr++, end=true; + char tmp[20]; + int len = 0; + while ( + ((*ptr >= 'a') && (*ptr <= 'z') || + (*ptr >= 'A') && (*ptr <= 'Z') || + (*ptr >= '0') && (*ptr <= '9')) + && len < 19 ) + { + tmp[len] = tolower( *ptr ); + ptr++; + len++; + } + tmp[len] = 0; + int id = tdehtml::getTagID(tmp, len); + if(end) id += ID_CLOSE_TAG; + + switch( id ) { + case ID_META: + { + // found a meta tag... + //ptr += 5; + const char * end = ptr; + while(*end != '>' && *end != '\0') end++; + if ( *end == '\0' ) break; + TQCString str( ptr, (end-ptr)+1); + str = str.lower(); + int pos = 0; + //if( (pos = str.find("http-equiv", pos)) == -1) break; + //if( (pos = str.find("content-type", pos)) == -1) break; + while( pos < ( int ) str.length() ) { + if( (pos = str.find("charset", pos)) == -1) break; + pos += 7; + // skip whitespace.. + while( pos < (int)str.length() && str[pos] <= ' ' ) pos++; + if ( pos == ( int )str.length()) break; + if ( str[pos++] != '=' ) continue; + while ( pos < ( int )str.length() && + ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'') + pos++; + + // end ? + if ( pos == ( int )str.length() ) break; + uint endpos = pos; + while( endpos < str.length() && + (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' + && str[endpos] != ';' && str[endpos] != '>') ) + endpos++; + enc = str.mid(pos, endpos-pos); +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl; +#endif + setEncoding(enc, EncodingFromMetaTag); + if( m_type == EncodingFromMetaTag ) goto found; + + if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break; + + pos = endpos + 1; + } + } + case ID_SCRIPT: + case (ID_SCRIPT+ID_CLOSE_TAG): + case ID_NOSCRIPT: + case (ID_NOSCRIPT+ID_CLOSE_TAG): + case ID_STYLE: + case (ID_STYLE+ID_CLOSE_TAG): + case ID_LINK: + case (ID_LINK+ID_CLOSE_TAG): + case ID_OBJECT: + case (ID_OBJECT+ID_CLOSE_TAG): + case ID_TITLE: + case (ID_TITLE+ID_CLOSE_TAG): + case ID_BASE: + case (ID_BASE+ID_CLOSE_TAG): + case ID_HTML: + case ID_HEAD: + case 0: + case (0 + ID_CLOSE_TAG ): + break; + case ID_BODY: + case (ID_HEAD+ID_CLOSE_TAG): + body = true; +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl; +#endif + goto found; + default: + // Invalid tag in head. Let's be a little tolerant + invalid++; + if (invalid > 2) { + body = true; +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl; +#endif + goto found; + } + } + } + else + ptr++; + } + if (invalid > 0) { + body = true; + goto found; + } + return TQString::null; + } + } + + found: + if (m_type == DefaultEncoding) + { +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl; +#endif + + switch ( m_autoDetectLanguage) { + case Decoder::Arabic: + enc = automaticDetectionForArabic( (const unsigned char*) data, len ); + break; + case Decoder::Baltic: + enc = automaticDetectionForBaltic( (const unsigned char*) data, len ); + break; + case Decoder::CentralEuropean: + enc = automaticDetectionForCentralEuropean( (const unsigned char*) data, len ); + break; + case Decoder::Russian: + case Decoder::Ukrainian: + enc = automaticDetectionForCyrillic( (const unsigned char*) data, len, m_autoDetectLanguage ); + break; + case Decoder::Greek: + enc = automaticDetectionForGreek( (const unsigned char*) data, len ); + break; + case Decoder::Hebrew: + enc = automaticDetectionForHebrew( (const unsigned char*) data, len ); + break; + case Decoder::Japanese: + enc = automaticDetectionForJapanese( (const unsigned char*) data, len ); + break; + case Decoder::Turkish: + enc = automaticDetectionForTurkish( (const unsigned char*) data, len ); + break; + case Decoder::WesternEuropean: + enc = automaticDetectionForWesternEuropean( (const unsigned char*) data, len ); + break; + case Decoder::SemiautomaticDetection: + case Decoder::Chinese: + case Decoder::Korean: + case Decoder::Thai: + case Decoder::Unicode: + // huh. somethings broken in this code ### FIXME + enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. + break; + } + +#ifdef DECODE_DEBUG + kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc.data() << endl; +#endif + if ( !enc.isEmpty() ) + setEncoding( enc.data(), AutoDetectedEncoding); + } + + + // if we still haven't found an encoding latin1 will be used... + // this is according to HTML4.0 specs + if (!m_codec) + { + if(enc.isEmpty()) enc = "iso8859-1"; + m_codec = TQTextCodec::codecForName(enc); + // be sure not to crash + if(!m_codec) { + m_codec = TQTextCodec::codecForMib(4); + enc = "iso8859-1"; + } + delete m_decoder; + m_decoder = m_codec->makeDecoder(); + } + TQString out; + + if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") { + out = m_decoder->toUnicode(buffer, buffer.length()); + buffer = ""; + } else { + if(m_codec->mibEnum() != 1000) // utf16 + { + // ### hack for a bug in TQTextCodec. It cut's the input stream + // in case there are \0 in it. ZDNET has them inside... :-( + char *d = const_cast<char *>(data); + int i = len - 1; + while(i >= 0) { + if(*(d+i) == 0) *(d+i) = ' '; + i--; + } + } + out = m_decoder->toUnicode(data, len); + } + + return out; +} + +TQString Decoder::flush() const +{ + return m_decoder->toUnicode(buffer, buffer.length()); +} + +TQCString Decoder::automaticDetectionForArabic( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3 + || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA ) + || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0 + || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) { + return "cp1256"; + } + } + + return "iso-8859-6"; +} + +TQCString Decoder::automaticDetectionForBaltic( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) ) + return "cp1257"; + + if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 ) + return "iso-8859-13"; + } + + return "iso-8859-13"; +} + +TQCString Decoder::automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ) +{ + TQCString charset = TQCString(); + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) { + if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 ) + return "ibm852"; + + if ( i + 1 > size ) + return "cp1250"; + else { // maybe ibm852 ? + charset = "cp1250"; + continue; + } + } + if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) { + if ( i + 1 > size ) + return "iso-8859-2"; + else { // maybe ibm852 ? + if ( charset.isNull() ) + charset = "iso-8859-2"; + continue; + } + } + } + + if ( charset.isNull() ) + charset = "iso-8859-3"; + + return charset.data(); +} + +TQCString Decoder::automaticDetectionForCyrillic( const unsigned char* ptr, int size, AutoDetectLanguage _language ) +{ + int koi_st=0; + int cp1251_st=0; + +// int koi_na=0; +// int cp1251_na=0; + + int koi_o_capital=0; + int koi_o=0; + int cp1251_o_capital=0; + int cp1251_o=0; + + int koi_a_capital=0; + int koi_a=0; + int cp1251_a_capital=0; + int cp1251_a=0; + + int koi_i_capital=0; + int koi_i=0; + int cp1251_i_capital=0; + int cp1251_i=0; + + int cp1251_small_range=0; + int koi_small_range=0; + int ibm866_small_range=0; + + int i; + for (i=1; (i<size) && (cp1251_small_range+koi_small_range<1000) ;++i) + { + if (ptr[i]>0xdf) + { + ++cp1251_small_range; + + if (ptr[i]==0xee)//small o + ++cp1251_o; + else if (ptr[i]==0xe0)//small a + ++cp1251_a; + else if (ptr[i]==0xe8)//small i + ++cp1251_i; + else if (ptr[i]==0xf2 && ptr[i-1]==0xf1)//small st + ++cp1251_st; + + else if (ptr[i]==0xef) + ++koi_o_capital; + else if (ptr[i]==0xe1) + ++koi_a_capital; + else if (ptr[i]==0xe9) + ++koi_i_capital; + + } + else if (ptr[i]>0xbf) + { + ++koi_small_range; + + if (ptr[i]==0xcf)//small o + ++koi_o; + else if (ptr[i]==0xc1)//small a + ++koi_a; + else if (ptr[i]==0xc9)//small i + ++koi_i; + else if (ptr[i]==0xd4 && ptr[i-1]==0xd3)//small st + ++koi_st; + + else if (ptr[i]==0xce) + ++cp1251_o_capital; + else if (ptr[i]==0xc0) + ++cp1251_a_capital; + else if (ptr[i]==0xc8) + ++cp1251_i_capital; + } + else if (ptr[i]>0x9f && ptr[i]<0xaf) //first 16 letterz is 60% + ++ibm866_small_range; + + } + + if (ibm866_small_range>cp1251_small_range+koi_small_range) + return "ibm866"; //hehe this is a rare case :) + + TQCString koi_string = "koi8-u"; + TQCString cp1251_string = "cp1251"; + + if (cp1251_st==0 && koi_st>1) + return koi_string; + if (koi_st==0 && cp1251_st>1) + return cp1251_string; + + if (cp1251_st>0 && koi_st>0) + { + if (cp1251_st/koi_st>2) + return cp1251_string; + else if (koi_st/cp1251_st>2) + return koi_string; + } + + if (cp1251_a>koi_a && cp1251_o>koi_o && cp1251_i>koi_i) + return cp1251_string; + if (koi_a>cp1251_a && koi_o>cp1251_o && koi_i>cp1251_i) + return koi_string; + + if (cp1251_a_capital>koi_a_capital && cp1251_o_capital>koi_o_capital && cp1251_i_capital>koi_i_capital) + return cp1251_string; + if (koi_a_capital>cp1251_a_capital && koi_o_capital>cp1251_o_capital && koi_i_capital>cp1251_i_capital) + return koi_string; + + //fallback... + if (cp1251_small_range>koi_small_range) + return cp1251_string; + else + return koi_string; + +} + +TQCString Decoder::automaticDetectionForGreek( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B + || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4 + || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) { + return "cp1253"; + } + } + + return "iso-8859-7"; +} + +TQCString Decoder::automaticDetectionForHebrew( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B + || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 ) + || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) { + return "cp1255"; + } + + if ( ptr[ i ] == 0xDF ) + return "iso-8859-8-i"; + } + + return "iso-8859-8-i"; +} + +TQCString Decoder::automaticDetectionForJapanese( const unsigned char* ptr, int size ) +{ + if (!kc) + kc = new JapaneseCode(); + + switch ( kc->guess_jp( (const char*)ptr, size ) ) { + case JapaneseCode::JIS: + return "jis7"; + case JapaneseCode::EUC: + return "eucjp"; + case JapaneseCode::SJIS: + return "sjis"; + case JapaneseCode::UTF8: + return "utf8"; + default: + break; + } + + return ""; +} + +TQCString Decoder::automaticDetectionForTurkish( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) { + return "cp1254"; + } + } + + return "iso-8859-9"; +} + +TQCString Decoder::automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ) +{ + for ( int i = 0; i < size; ++i ) { + if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) + return "cp1252"; + } + + return "iso-8859-1"; //"iso-8859-15"; Which better at default ? +} + + +// ----------------------------------------------------------------------------- +#undef DECODE_DEBUG |