1 files changed, 241 insertions, 0 deletions
diff --git a/tdecore/tequivchars.cpp b/tdecore/tequivchars.cpp
new file mode 100755
index 000000000..d259946b2
--- /dev/null
+++ b/tdecore/tequivchars.cpp
@@ -0,0 +1,241 @@
+#undef REGEXP_IS_PCRE2        // PCRE2-specific regex handling below is compiled out
+#define OPTIMIZE_ASCII_LOOKUP // ASCII (<128) is converted without using the lookup table
+
+#ifdef REGEXP_IS_PCRE2
+#pragma message "############ Assuming regular expressions are PCRE2 ############"
+#endif
+
+#ifdef OPTIMIZE_ASCII_LOOKUP
+#pragma message "############ ASCII characters will be processed separately ############"
+#endif
+
+#include "tequivchars.h"
+
+//typedef wchar_t CHAR16;
+//typedef unsigned short CHAR16;
+typedef TQChar CHAR16; // character type used for the table entries and the scan loop in replaceChars()
+
+class TEquivChars_Private // private data of TEquivChars: the character equivalence table
+{
+public:
+
+  struct defaultCollation { // one row of the equivalence table
+    CHAR16 character;  // character that is looked up
+    CHAR16 collatesTo; // character it is replaced with
+  };
+
+  const defaultCollation EquivalentsTable // terminating ';' is provided in include file; rows must be sorted by 'character' (binary-searched in replaceChars)
+  #include "tequivchars-mapping.h"
+  uint EquivTableROWS = sizeof(EquivalentsTable)/sizeof(EquivalentsTable[0]); // number of rows in EquivalentsTable
+};
+
+TEquivChars::TEquivChars()
+{
+  this->p = new TEquivChars_Private; // private table holder, released in the destructor
+}
+
+TEquivChars::~TEquivChars()
+{
+  delete this->p; // free the private data allocated by the constructor
+}
+
+/*
+   Returns a copy of inputString with each character replaced by its equivalent
+   from the lookup table; characters without a table entry are kept as they are.
+   With OPTIMIZE_ASCII_LOOKUP, ASCII skips the table (A-Z become a-z, rest unchanged).
+   If isRegex is true, backslash escapes, character class delimiters and POSIX
+   bracket expressions are copied through without conversion.
+*/
+TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
+{
+  int inStrLen = inputString.length();
+  TQString outString = TQString::fromLatin1( "" );
+  outString.reserve( inStrLen );
+  const TQChar *char16 = inputString.unicode();
+
+  bool backSlashed        = false; // \_
+  bool startedCharClass   = false; // Previous character was starting '[' of character class
+  bool inCharacterClass   = false; // [___]
+  bool inPosixBracketExpr = false; // [:___:]
+#ifdef REGEXP_IS_PCRE2
+  bool quoteLiteral       = false; // \Q___\E
+  bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'p' 'P' 'N' 'g'
+  bool inDirective        = false; // (*___)
+  bool inGroupName        = false; // (?<___>
+#endif // REGEXP_IS_PCRE2
+  CHAR16 currChar  = 0;
+  CHAR16 prevChar  = 0;
+  CHAR16 nextChar  = 0;
+
+  // The store into outString lives in the loop's increment expression so that
+  // every 'continue' below still copies currChar (converted or not) to the output.
+  for ( int i = 0 ; i < inStrLen ; outString[i] = CHAR16(currChar), i++  ) {
+    prevChar = currChar;
+    currChar = char16[i].unicode();
+
+    if ( isRegex ) {
+      /*
+         Look for regex characters and character sequences
+         that should never be converted to an equivalent.
+      */
+
+      if ( i < ( inStrLen - 1 ) )
+        nextChar = char16[i+1].unicode();
+      else
+        nextChar = 0;
+
+      if ( currChar == '\\' && !backSlashed ) { // second '\' of an escaped backslash is not a new escape
+        backSlashed = true;
+        continue;
+      }
+
+      // Don't convert backSlashed characters
+      if ( backSlashed ) {
+#ifdef REGEXP_IS_PCRE2
+        switch (currChar) {
+          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
+          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
+          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
+          case 'P' : // Entering (negated) Unicode property specification \p{} ?
+          case 'p' : // Entering Unicode property specification \p{} ?
+          case 'g' : // Entering a named backreference \g{___} ?
+            if ( nextChar == '{' ) inBraceExpr = true;
+            break;
+        }
+#endif // REGEXP_IS_PCRE2
+        backSlashed = false;
+        continue;
+      }
+
+#ifdef REGEXP_IS_PCRE2
+      if ( quoteLiteral )
+        continue;
+
+      if ( inBraceExpr ) {
+        // Is it time to leave brace expression {___} ?
+        if ( nextChar == '}' ) inBraceExpr = false;
+        continue;
+      }
+#endif // REGEXP_IS_PCRE2
+
+      if ( startedCharClass ) {
+        switch (currChar) {
+          case '^' : // Negated character class, proceed to next character
+            continue; // Bypass converting this special character
+          case ']' : // Treat as part of character class, not as a closure
+          case ':' : // Treat as part of character class, not as start of bracket expression
+            startedCharClass = false;
+            continue;  // Bypass converting these special characters
+        }
+        startedCharClass = false;
+      } // startedCharClass
+
+      if ( inCharacterClass ) {
+
+        if ( inPosixBracketExpr ) {
+          // Is it time to leave POSIX bracket expression [:___:] ?
+          if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;
+          continue;
+        } // inPosixBracketExpr
+
+        else { // ! inPosixBracketExpr
+
+          if ( prevChar == '[' && currChar == ':' ) {
+            // Enter POSIX bracket expression [:___:]
+            inPosixBracketExpr = true;
+            continue;
+          }
+
+          if ( currChar == ']' ) {
+            // Leaving character class [___]
+            inCharacterClass = false;
+            continue;
+          }
+
+        } // ! inPosixBracketExpr
+
+      } // inCharacterClass
+
+      else { // ! inCharacterClass
+
+        switch (currChar) {
+
+          case '[' :
+            // Entering a character class [___]
+            startedCharClass = true;
+            inCharacterClass = true;
+            continue;
+#ifdef REGEXP_IS_PCRE2
+          case '*' :
+            if ( prevChar != '(' ) continue;
+            // Entering a PCRE2 directive (*___)
+            inDirective = true;
+            continue;
+
+          case '?' :
+            if ( prevChar != '(' ) continue;
+            if ( nextChar != '<' ) continue;
+            // Entering PCRE2 group name (?<___>)
+            inGroupName = true;
+            continue;
+#endif // REGEXP_IS_PCRE2
+        }
+#ifdef REGEXP_IS_PCRE2
+        if ( inDirective ) {
+          // Is it time to leave PCRE2 directive (*___) ?
+          if (currChar == ')' ) inDirective = false;
+          continue;
+        }
+
+        if ( inGroupName ) {
+          // Is it time to leave PCRE2 group name (?<___>) ?
+          if (currChar == '>' ) inGroupName = false;
+          continue;
+        }
+#endif // REGEXP_IS_PCRE2
+      } // ! inCharacterClass
+
+      /*
+         If we have reached here, this regex character is a
+         candidate for potential conversion to an equivalent.
+      */
+
+    } // isRegex
+
+    //-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";
+
+#ifdef OPTIMIZE_ASCII_LOOKUP
+    // We can process ASCII quickly without using lookup table
+    unsigned short codepoint = currChar.unicode();
+    if ( codepoint < 128 ) {
+      if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII
+        currChar = TQChar(codepoint + 32 ); // to corresponding lower case
+      // All other ASCII characters are equivalent to themselves
+      //-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
+      continue;
+    }
+#endif
+
+    // Use a simple binary search to look up an equivalent character
+    int low  =  0;
+    int high =  p->EquivTableROWS - 1;
+    while (low <= high) {
+      int mid = low + (high - low) / 2;
+      if ( currChar == p->EquivalentsTable[mid].character ) {
+        // Found equivalent character, use it instead
+        currChar = p->EquivalentsTable[mid].collatesTo;
+        break;
+      }
+      if ( p->EquivalentsTable[mid].character < currChar )
+        low = mid + 1;
+      else
+        high = mid - 1;
+    }
+    //-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;
+
+    // FIXME: lookup could be sped up further by splitting the table into several
+    //        tables searched in order of descending likelihood of a match.
+  }
+
+  return outString;
+}