diff options
Diffstat (limited to 'tdecore/tequivchars.cpp')
-rwxr-xr-x | tdecore/tequivchars.cpp | 241 |
1 files changed, 241 insertions, 0 deletions
diff --git a/tdecore/tequivchars.cpp b/tdecore/tequivchars.cpp new file mode 100755 index 000000000..d259946b2 --- /dev/null +++ b/tdecore/tequivchars.cpp @@ -0,0 +1,241 @@ +#undef REGEX_IS_PCRE2 +#define OPTIMIZE_ASCII_LOOKUP + +#ifdef REGEXP_IS_PCRE2 +#pragma message "############ Assuming regular expressions are PCRE2 ############" +#endif + +#ifdef OPTIMIZE_ASCII_LOOKUP +#pragma message "############ ASCII characters will be processed separately ############" +#endif + +#include "tequivchars.h" + +//typedef wchar_t CHAR16; +//typedef unsigned short CHAR16; +typedef TQChar CHAR16; + +class TEquivChars_Private +{ +public: + + struct defaultCollation { + CHAR16 character; + CHAR16 collatesTo; + }; + + const defaultCollation EquivalentsTable // terminating ';' is provided in include file + #include "tequivchars-mapping.h" + uint EquivTableROWS = sizeof(EquivalentsTable)/sizeof(EquivalentsTable[0]); +}; + +TEquivChars::TEquivChars() +{ + p = new TEquivChars_Private; +} + +TEquivChars::~TEquivChars() +{ + delete p; +} + +TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex ) +{ + int inStrLen = inputString.length(); + TQString outString = TQString::fromLatin1( "" ); + outString.reserve( inStrLen ); + const TQChar *char16 = inputString.unicode(); + + bool backSlashed = false; // \_ + bool startedCharClass = false; // Previous character was starting '[' of character class + bool inCharacterClass = false; // [___] + bool inPosixBracketExpr = false; // [:___:] +#ifdef REGEXP_IS_PCRE2 + bool quoteLiteral = false; // \Q___\E + bool inBraceExpr = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g' + bool inDirective = false; // (*___) + bool inGroupName = false; // (?<___> +#endif // REGEXP_IS_PCRE2 + CHAR16 currChar = 0; + CHAR16 prevChar = 0; + CHAR16 nextChar = 0; + + for ( int i = 0 ; i < inStrLen ; outString[i] = CHAR16(currChar), i++ ) { + + prevChar = currChar; + currChar = char16[i].unicode(); + + if ( isRegex ) { + + /* + Look for regex characters and character sequences + that should never be converted to an equivalent. + */ + + if ( i < ( inStrLen - 1 ) ) + nextChar = char16[i+1].unicode(); + else + nextChar = 0; + + if ( currChar == '\\' ) { + backSlashed = true; + continue; + } + + // Don't convert backSlashed characters + if ( backSlashed ) { +#ifdef REGEXP_IS_PCRE2 + switch (currChar) { + case 'Q' : quoteLiteral = true; break; // Entering literal \Q___\E + case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E + case 'N' : // Entering Unicode codepoint specification \N{U+___} ? + case 'P' : // Entering (negated) Unicode property specification \p{} ? + case 'p' : // Entering Unicode property specification \p{} ? + case 'g' : // Entering a named backreference \g{___} ? + if ( nextChar == '{' ) inBraceExpr = true; + break; + } +#endif // REGEXP_IS_PCRE2 + backSlashed = false; + continue; + } + +#ifdef REGEXP_IS_PCRE2 + if ( quoteLiteral ) + continue; + + if ( inBraceExpr ) { + // Is it time to leave brace expression {___} ? + if ( nextChar == '}' ) inBraceExpr = true; + continue; + } +#endif // REGEXP_IS_PCRE2 + + if ( startedCharClass ) { + switch (currChar) { + case '^' : // Negated character class, proceed to next character + continue; // Bypass converting this special character + case ']' : // Treat as part of character class, not as a closure + case ':' : // Treat as part of character class, not as start of bracket expression + startedCharClass = false; + continue; // Bypass converting these special characters + } + startedCharClass = false; + } // startedCharClass + + if ( inCharacterClass ) { + + if ( inPosixBracketExpr ) { + // Is it time to leave POSIX bracket expression [:___:] ? + if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false; + continue; + } // inPosixBracketExpr + + else { // ! inPosixBracketExpr + + if ( prevChar == '[' && currChar == ':' ) { + // Enter POSIX bracket expression [:___:] + inPosixBracketExpr = true; + continue; + } + + if ( currChar == ']' ) { + // Leaving character class [___] + inCharacterClass = false; + continue; + } + + } // ! inPosixBracketExpr + + } // inCharacterClass + + else { // ! inCharacterClass + + switch (currChar) { + + case '[' : + // Entering a character class [___] + startedCharClass = true; + inCharacterClass = true; + continue; + break; +#ifdef REGEXP_IS_PCRE2 + case '*' : + if ( prevChar != '(' ) continue; + // Entering a PCRE2 directive (*___) + inDirective = true; + continue; + break; + + case '?' : + if ( prevChar != '(' ) continue; + if ( nextChar != '<' ) continue; + // Entering PCRE2 group name (?<___>) + inGroupName = true; + continue; + break; +#endif // REGEXP_IS_PCRE2 + } +#ifdef REGEXP_IS_PCRE2 + if ( inDirective ) { + // Is it time to leave PCRE2 directive (*___) ? + if (currChar == ')' ) inDirective = false; + continue; + } + + if ( inGroupName ) { + // Is it time to leave PCRE2 group name (?<___>) ? + if (currChar == '>' ) inGroupName = false; + continue; + } +#endif // REGEXP_IS_PCRE2 + } // ! inCharacterClass + + /* + If we have reached here, this regex character is a + candidate for potential conversion to an equivalent. + */ + + } // isRegex + + //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '"; + +#ifdef OPTIMIZE_ASCII_LOOKUP + // We can process ASCII quickly without using lookup table + unsigned short codepoint = currChar.unicode(); + if ( codepoint < 128 ) { + if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII + currChar = TQChar(codepoint + 32 ); // to corresponding lower case + // All other ASCII characters are equivalent to themselves + //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl; + continue; + } +#endif + + // Use a simple binary search to look up an equivalent character + int low = 0; + int high = p->EquivTableROWS - 1; + while (low <= high) { + int mid = low + (high - low) / 2; + if ( currChar == p->EquivalentsTable[mid].character ) { + // Found equivalent character, use it instead + currChar = p->EquivalentsTable[mid].collatesTo; + break; + } + if ( p->EquivalentsTable[mid].character < currChar ) + low = mid + 1; + else + high = mid - 1; + } + //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl; + + /* FIXME: Possible ideas for optimizing table lookup speed + (1) Detect & handle ASCII (<128) characters separately. *DONE* + (2) Split table into multiple lookup tables and search each + in order of descending likelihood of character match. + */ + + } + + return outString; +} |