kjs: use libpcre2 instead of libpcre

Signed-off-by: Michele Calgaro <michele.calgaro@yahoo.it>
author: Michele Calgaro <michele.calgaro@yahoo.it> 2024-08-17 22:26:29 +0900
committer: Michele Calgaro <michele.calgaro@yahoo.it> 2024-08-18 20:14:02 +0900
commit: 7740e825a683a9cc84f8422c94109c5fcc4beb8e (patch)
tree: 0f4cb4d307e3536232cea590e607d14f9edb5e76 /kjs/regexp.cpp
parent: b59d51c67903335d27ada24d51be77137f664cb3 (diff)
download: tdelibs-7740e825a683a9cc84f8422c94109c5fcc4beb8e.tar.gz
tdelibs-7740e825a683a9cc84f8422c94109c5fcc4beb8e.zip
1 files changed, 105 insertions, 124 deletions
diff --git a/kjs/regexp.cpp b/kjs/regexp.cpp
index 0c2675588..a693fdc1a 100644
--- a/kjs/regexp.cpp
+++ b/kjs/regexp.cpp
@@ -30,21 +30,17 @@
 
 using namespace KJS;
 
-#ifdef PCRE_CONFIG_UTF8
 RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
-#endif
 
 RegExp::RegExp(const UString &p, int f)
   : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
 {
   // Determine whether libpcre has unicode support if need be..
-#ifdef PCRE_CONFIG_UTF8
   if (utf8Support == Unknown) {
-    int supported;
-    pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
-    utf8Support = supported ? Supported : Unsupported;
+    uint32_t supported;
+    pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported);
+    utf8Support = (supported & 0x0001) ? Supported : Unsupported;
   }
-#endif
 
   nrSubPatterns = 0; // determined in match() with POSIX regex.
 
@@ -63,33 +59,33 @@ RegExp::RegExp(const UString &p, int f)
         escape = false;
         // we only care about \u
         if (c == 'u') {
-	  // standard unicode escape sequence looks like \uxxxx but
-	  // other browsers also accept less then 4 hex digits
-	  unsigned short u = 0;
-	  int j = 0;
-	  for (j = 0; j < 4; ++j) {
-	    if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
-	      u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
-	      ++i;
-	    } else {
-	      // sequence incomplete. restore index.
-	      // TODO: cleaner way to propagate warning
-	      fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
-	      i -= j;
-	      break;
-	    }
-	  }
-	  if (j < 4) {
-	    // sequence was incomplete. treat \u as u which IE always
-	    // and FF sometimes does.
-	    intern.append(UString('u'));
-	  } else {
+    // standard unicode escape sequence looks like \uxxxx but
+    // other browsers also accept fewer than 4 hex digits
+    unsigned short u = 0;
+    int j = 0;
+    for (j = 0; j < 4; ++j) {
+      if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
+        u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
+        ++i;
+      } else {
+        // sequence incomplete. restore index.
+        // TODO: cleaner way to propagate warning
+        fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
+        i -= j;
+        break;
+      }
+    }
+    if (j < 4) {
+      // sequence was incomplete. treat \u as u which IE always
+      // and FF sometimes does.
+      intern.append(UString('u'));
+    } else {
             c = UChar(u);
             switch (u) {
             case 0:
-	      // Make sure to encode 0, to avoid terminating the string
-	      intern += UString(nil);
-	      break;
+        // Make sure to encode 0, to avoid terminating the string
+        intern += UString(nil);
+        break;
             case '^':
             case '$':
             case '\\':
@@ -101,13 +97,13 @@ RegExp::RegExp(const UString &p, int f)
             case '{': case '}':
             case '[': case ']':
             case '|':
-	      // escape pattern characters have to remain escaped
-	      intern.append(UString('\\'));
-	      // intentional fallthrough
+        // escape pattern characters have to remain escaped
+        intern.append(UString('\\'));
+        // intentional fallthrough
             default:
-	      intern += UString(&c, 1);
-	      break;
-	    }
+        intern += UString(&c, 1);
+        break;
+      }
           }
           continue;
         }
@@ -126,45 +122,46 @@ RegExp::RegExp(const UString &p, int f)
     intern = p;
   }
 
-#ifdef HAVE_PCREPOSIX
-  int pcreflags = 0;
-  const char *perrormsg;
-  int errorOffset;
+#ifdef HAVE_PCRE2POSIX
+  uint32_t pcre2flags = 0;
+  int errorCode;
+  PCRE2_SIZE errorOffset;
 
   if (flgs & IgnoreCase)
-    pcreflags |= PCRE_CASELESS;
+    pcre2flags |= PCRE2_CASELESS;
 
   if (flgs & Multiline)
-    pcreflags |= PCRE_MULTILINE;
+    pcre2flags |= PCRE2_MULTILINE;
 
-#ifdef PCRE_CONFIG_UTF8
   if (utf8Support == Supported)
-    pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
-#endif
+    pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
 
   // Fill our buffer with an encoded version, whether utf-8, or, 
   // if PCRE is incapable, truncated.
   prepareMatch(intern);
 
-  pcregex = pcre_compile(buffer, pcreflags,
-			 &perrormsg, &errorOffset, NULL);
+  pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
+       &errorCode, &errorOffset, NULL);
   doneMatch(); // Cleanup buffers
   if (!pcregex) {
 #ifndef NDEBUG
-    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
+    PCRE2_UCHAR errorMsg[256];
+    pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg));
+    fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);
 #endif
     valid = false;
     return;
   }
 
-#ifdef PCRE_INFO_CAPTURECOUNT
   // Get number of subpatterns that will be returned
-  int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
+  int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
   if (rc != 0)
-#endif
+  {
     nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
+  }
 
-#else /* HAVE_PCREPOSIX */
+  match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
+#else
 
   int regflags = 0;
 #ifdef REG_EXTENDED
@@ -195,9 +192,15 @@ RegExp::RegExp(const UString &p, int f)
 RegExp::~RegExp()
 {
   doneMatch(); // Be 100% sure buffers are freed
-#ifdef HAVE_PCREPOSIX
+#ifdef HAVE_PCRE2POSIX
+  if (match_data)
+  {
+    pcre2_match_data_free(match_data);
+  }
   if (pcregex)
-    pcre_free(pcregex);
+  {
+    pcre2_code_free(pcregex);
+  }
 #else
   /* TODO: is this really okay after an error ? */
   regfree(&preg);
@@ -208,7 +211,7 @@ void RegExp::prepareUtf8(const UString& s)
 {
   // Allocate a buffer big enough to hold all the characters plus \0
   const int length = s.size();
-  buffer = new char[length * 3 + 1];
+  buffer = new buftype_t[length * 3 + 1];
 
   // Also create buffer for positions. We need one extra character in there,
   // even past the \0 since the non-empty handling may jump one past the end
@@ -217,7 +220,7 @@ void RegExp::prepareUtf8(const UString& s)
   // Convert to runs of 8-bit characters, and generate indeces
   // Note that we do NOT combine surrogate pairs here, as 
   // regexps operate on them as separate characters
-  char *p      = buffer;
+  buftype_t *p = buffer;
   int  *posOut = originalPos;
   const UChar *d = s.data();
   for (int i = 0; i != length; ++i) {
@@ -225,16 +228,16 @@ void RegExp::prepareUtf8(const UString& s)
 
     int sequenceLen;
     if (c < 0x80) {
-      *p++ = (char)c;
+      *p++ = (buftype_t)c;
       sequenceLen = 1;
     } else if (c < 0x800) {
-      *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
-      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+      *p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
+      *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
       sequenceLen = 2;
     } else {
-      *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
-      *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
-      *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
+      *p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
+      *p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
+      *p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
       sequenceLen = 3;
     }
 
@@ -262,7 +265,7 @@ void RegExp::prepareASCII (const UString& s)
   // when we don't have utf 8 available -- use 
   // truncated version, and pray for the best 
   CString truncated = s.cstring();
-  buffer = new char[truncated.size() + 1];
+  buffer = new buftype_t[truncated.size() + 1];
   memcpy(buffer, truncated.c_str(), truncated.size());
   buffer[truncated.size()] = '\0'; // For _compile use
   bufferSize = truncated.size();
@@ -272,11 +275,9 @@ void RegExp::prepareMatch(const UString &s)
 {
   delete[] originalPos; // Just to be sure..
   delete[] buffer;
-#ifdef PCRE_CONFIG_UTF8
   if (utf8Support == Supported)
     prepareUtf8(s);
   else
-#endif
     prepareASCII(s);
 
 #ifndef NDEBUG
@@ -308,17 +309,16 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
   if (i > s.size() || s.isNull())
     return UString::null;
 
-#ifdef HAVE_PCREPOSIX
-  int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
-  if (ovector) *ovector = new int[ovecsize];
-  if (!pcregex)
+#ifdef HAVE_PCRE2POSIX
+  if (!pcregex || !match_data)
+    return UString::null;
+  if (!ovector)
     return UString::null;
 
   int startPos;
   int nextPos;
-
-#ifdef PCRE_CONFIG_UTF8
-  if (utf8Support == Supported) {
+  if (utf8Support == Supported)
+  {
     startPos = i;
     while (originalPos[startPos] < i)
       ++startPos;
@@ -328,53 +328,59 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
       while (originalPos[nextPos] < (i + 1))
         ++nextPos;
     }
-  } else
-#endif
+  }
+  else
   {
     startPos = i;
     nextPos  = i + (i < s.size() ? 1 : 0);
   }
 
-  int baseFlags =
-#ifdef PCRE_CONFIG_UTF8
-    utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
-#endif
-    0;
-  int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
-                             m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
-                             ovector ? *ovector : 0L, ovecsize);
-  if (numMatches < 0)
+  uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
+  if (m_notEmpty)
+  {
+    baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
+  }
+  int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
+  if (numMatches <= 0)
   {
     // Failed to match.
-    if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
+    if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
     {
       // We set m_notEmpty ourselves, to look for a non-empty match
-      // (see man pcretest or pcretest.c for details).
       // So we don't stop here, we want to try again at i+1.
 #ifdef KJS_VERBOSE
       fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
 #endif
       m_notEmpty = 0;
-      numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
-                             ovector ? *ovector : 0L, ovecsize);
-      if (numMatches < 0)
+      baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
+      numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
+      if (numMatches <= 0)
         return UString::null;
     }
-    else // done
+    else
       return UString::null;
   }
 
-  // Got a match, proceed with it.
-  // But fix up the ovector if need be..
-  if (ovector && originalPos) {
-    for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) {
-      if ((*ovector)[c] != -1)
-        (*ovector)[c] = originalPos[(*ovector)[c]];
+  PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
+  if (!pcre2_ovector)
+    return UString::null;
+
+  uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
+  *ovector = new int[pcre2_ovecCount * 2];
+  if (originalPos)
+  {
+    for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
+    {
+      (*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
+    }
+  }
+  else
+  {
+    for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
+    {
+      (*ovector)[c] = pcre2_ovector[c];
     }
   }
-
-  if (!ovector)
-    return UString::null; // don't rely on the return value if you pass ovector==0
 #else
   const uint maxMatch = 10;
   regmatch_t rmatch[maxMatch];
@@ -419,28 +425,3 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
   }
   return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
 }
-
-#if 0 // unused
-bool RegExp::test(const UString &s, int)
-{
-#ifdef HAVE_PCREPOSIX
-  int ovector[300];
-  CString buffer(s.cstring());
-
-  if (s.isNull() ||
-      pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
-		0, ovector, 300) == PCRE_ERROR_NOMATCH)
-    return false;
-  else
-    return true;
-
-#else
-
-  char *str = strdup(s.ascii());
-  int r = regexec(&preg, str, 0, 0, 0);
-  free(str);
-
-  return r == 0;
-#endif
-}
-#endif
author	Michele Calgaro <michele.calgaro@yahoo.it>	2024-08-17 22:26:29 +0900
committer	Michele Calgaro <michele.calgaro@yahoo.it>	2024-08-18 20:14:02 +0900
commit	7740e825a683a9cc84f8422c94109c5fcc4beb8e (patch)
tree	0f4cb4d307e3536232cea590e607d14f9edb5e76 /kjs/regexp.cpp
parent	b59d51c67903335d27ada24d51be77137f664cb3 (diff)
download	tdelibs-7740e825a683a9cc84f8422c94109c5fcc4beb8e.tar.gz tdelibs-7740e825a683a9cc84f8422c94109c5fcc4beb8e.zip