diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htlib/StringMatch.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htlib/StringMatch.cc | 601 |
1 files changed, 601 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htlib/StringMatch.cc b/debian/htdig/htdig-3.2.0b6/htlib/StringMatch.cc new file mode 100644 index 00000000..b1512cc3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htlib/StringMatch.cc @@ -0,0 +1,601 @@ +// +// StringMatch.cc +// +// StringMatch: This class provides an interface to a fairly specialized string +// lookup facility. It is intended to be used as a replace for any +// regualr expression matching when the pattern string is in the form: +// +// <string1>|<string2>|<string3>|... +// +// Just like regular expression routines, the pattern needs to be +// compiled before it can be used. This is done using the Pattern() +// member function. Once the pattern has been compiled, the member +// function Find() can be used to search for the pattern in a string. +// If a string has been found, the "which" and "length" parameters +// will be set to the string index and string length respectively. +// (The string index is counted starting from 0) The return value of +// Find() is the position at which the string was found or -1 if no +// strings could be found. If a case insensitive match needs to be +// performed, call the IgnoreCase() member function before calling +// Pattern(). This function will setup a character translation table +// which will convert all uppercase characters to lowercase. If some +// other translation is required, the TranslationTable() member +// function can be called to provide a custom table. This table needs +// to be 256 characters. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: StringMatch.cc,v 1.18 2004/05/28 13:15:21 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "StringMatch.h" + +#include <string.h> +#include <ctype.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +// +// Entries in the state table can either be normal or final. +// Final states have an match index encoded in them. This number +// is shifted left by INDEX_SHIFT bits. +// +#define MATCH_INDEX_MASK 0xffff0000 +#define STATE_MASK 0x0000ffff +#define INDEX_SHIFT 16 + +//***************************************************************************** +// StringMatch::StringMatch() +// +StringMatch::StringMatch() +{ + // + // Clear out the state table pointers + // + for (int i = 0; i < 256; i++) + table[i] = 0; + local_alloc = 0; + trans = 0; +} + + +//***************************************************************************** +// StringMatch::~StringMatch() +// +StringMatch::~StringMatch() +{ + for (int i = 0; i < 256; i++) + delete [] table[i]; + if (local_alloc) + delete [] trans; +} + + +//***************************************************************************** +// void StringMatch::Pattern(char *pattern) +// Compile the given pattern into a state transition table +// +void +StringMatch::Pattern(char *pattern, char sep) +{ + if (!pattern || !*pattern) + { + // + // No pattern to compile... + // + return; + } + + // + // Allocate enough space in the state table to hold the worst case + // patterns... + // + int n = strlen(pattern); + + // ...but since the state table does not need an extra state + // for each string in the pattern, we can subtract the number + // of separators. Wins for small but numerous strings in + // the pattern. + char *tmpstr; + for (tmpstr = pattern; + (tmpstr = strchr(tmpstr, sep)) != NULL; + tmpstr++) // Pass the separator. + n--; + + int i; + + for (i = 0; i < 256; i++) + { + table[i] = new int[n]; + memset((unsigned char *) table[i], 0, n * sizeof(int)); + } + for (i = 0; i < n; i++) + table[0][i] = i; // "no-op" states for null char, to be ignored + + // + // Set up a standard case translation table if needed. + // + if (!trans) + { + trans = new unsigned char[256]; + for (i = 0; i < 256; i++) + { + trans[i] = (unsigned char)i; + } + local_alloc = 1; + } + + // + // Go though each of the patterns and build entries in the table. + // + int state = 0; + int totalStates = 0; + unsigned char previous = 0; + int previousState = 0; + int previousValue = 0; + int index = 1; + unsigned char chr; + + while ((unsigned char)*pattern) + { +#if 0 + if (totalStates > n) + { + cerr << "Fatal! Miscalculation of number of states" + << endl; + exit (2); + } +#endif + + chr = trans[(unsigned char)*pattern]; + if (chr == 0) + { + pattern++; + continue; + } + if (chr == sep) + { + // + // Next pattern + // + table[previous][previousState] = + previousValue | (index << INDEX_SHIFT); + index++; + state = 0; + // totalStates--; + } + else + { + previousValue = table[chr][state]; + previousState = state; + if (previousValue) + { + if (previousValue & MATCH_INDEX_MASK) + { + if (previousValue & STATE_MASK) + { + state = previousValue & STATE_MASK; + } + else + { + table[chr][state] |= ++totalStates; + state = totalStates; + } + } + else + { + state = previousValue & STATE_MASK; + } + } + else + { + table[chr][state] = ++totalStates; + state = totalStates; + } + } + previous = chr; + pattern++; + } + table[previous][previousState] = + previousValue | (index << INDEX_SHIFT); +} + + +//***************************************************************************** +// int StringMatch::FindFirst(const char *string, int &which, int &length) +// Attempt to find the first occurance of the previous compiled patterns. +// +int StringMatch::FindFirst(const char *string, int &which, int &length) +{ + which = -1; + length = -1; + + if (!table[0]) + return 0; + + int state = 0, new_state = 0; + int pos = 0; + int start_pos = 0; + + while ((unsigned char)string[pos]) + { + new_state = table[trans[(unsigned char)string[pos] & 0xff]][state]; + if (new_state) + { + if (state == 0) + { + // + // Keep track of where we started comparing so that we can + // come back to this point later if we didn't match anything + // + start_pos = pos; + } + } + else + { + // + // We came back to 0 state. This means we didn't match anything. + // + if (state) + { + // But we may already have a match, and are just being greedy. + if (which != -1) + return start_pos; + + pos = start_pos + 1; + } + else + pos++; + state = 0; + continue; + } + state = new_state; + if (state & MATCH_INDEX_MASK) + { + // + // Matched one of the patterns. + // Determine which and return. + // + which = ((unsigned int) (state & MATCH_INDEX_MASK) + >> INDEX_SHIFT) - 1; + length = pos - start_pos + 1; + state &= STATE_MASK; + + // Continue to find the longest, if there is one. + if (state == 0) + return start_pos; + } + pos++; + } + + // Maybe we were too greedy. + if (which != -1) + return start_pos; + + return -1; +} + + +//***************************************************************************** +// int StringMatch::Compare(const char *string, int &which, int &length) +// +int StringMatch::Compare(const char *string, int &which, int &length) +{ + which = -1; + length = -1; + + if (!table[0]) + return 0; + + int state = 0, new_state = 0; + int pos = 0; + int start_pos = 0; + + // + // Skip to at least the start of a word. + // + while ((unsigned char)string[pos]) + { + new_state = table[trans[string[pos]]][state]; + if (new_state) + { + if (state == 0) + { + start_pos = pos; + } + } + else + { + // We may already have a match, and are just being greedy. + if (which != -1) + return 1; + + return 0; + } + state = new_state; + if (state & MATCH_INDEX_MASK) + { + // + // Matched one of the patterns. + // + which = ((unsigned int) (state & MATCH_INDEX_MASK) + >> INDEX_SHIFT) - 1; + length = pos - start_pos + 1; + + // Continue to find the longest, if there is one. + state &= STATE_MASK; + if (state == 0) + return 1; + } + pos++; + } + + // Maybe we were too greedy. + if (which != -1) + return 1; + + return 0; +} + + +//***************************************************************************** +// int StringMatch::FindFirstWord(char *string) +// +int StringMatch::FindFirstWord(const char *string) +{ + int dummy; + return FindFirstWord(string, dummy, dummy); +} + + +//***************************************************************************** +// int StringMatch::CompareWord(const char *string) +// +int StringMatch::CompareWord(const char *string) +{ + int dummy; + return CompareWord(string, dummy, dummy); +} + + +//***************************************************************************** +// int StringMatch::FindFirstWord(char *string, int &which, int &length) +// Attempt to find the first occurance of the previous compiled patterns. +// +int StringMatch::FindFirstWord(const char *string, int &which, int &length) +{ + which = -1; + length = -1; + + int state = 0, new_state = 0; + int pos = 0; + int start_pos = 0; + int is_word = 1; + + // + // Skip to at least the start of a word. + // + while ((unsigned char)string[pos]) + { + new_state = table[trans[(unsigned char)string[pos]]][state]; + if (new_state) + { + if (state == 0) + { + start_pos = pos; + } + } + else + { + // + // We came back to 0 state. This means we didn't match anything. + // + if (state) + { + pos = start_pos + 1; + } + else + pos++; + state = 0; + continue; + } + state = new_state; + + if (state & MATCH_INDEX_MASK) + { + // + // Matched one of the patterns. + // + is_word = 1; + if (start_pos != 0) + { + if (HtIsStrictWordChar((unsigned char)string[start_pos - 1])) + is_word = 0; + } + if (HtIsStrictWordChar((unsigned char)string[pos + 1])) + is_word = 0; + if (is_word) + { + // + // Determine which and return. + // + which = ((unsigned int) (state & MATCH_INDEX_MASK) + >> INDEX_SHIFT) - 1; + length = pos - start_pos + 1; + return start_pos; + } + else + { + // + // Not at the end of word. Continue searching. + // + if (state & STATE_MASK) + { + state &= STATE_MASK; + } + else + { + pos = start_pos + 1; + state = 0; + } + } + } + pos++; + } + return -1; +} + + +//***************************************************************************** +// int StringMatch::CompareWord(const char *string, int &which, int &length) +// +int StringMatch::CompareWord(const char *string, int &which, int &length) +{ + which = -1; + length = -1; + + if (!table[0]) + return 0; + + int state = 0; + int position = 0; + + // + // Skip to at least the start of a word. + // + while ((unsigned char)string[position]) + { + state = table[trans[(unsigned char)string[position]]][state]; + if (state == 0) + { + return 0; + } + + if (state & MATCH_INDEX_MASK) + { + // + // Matched one of the patterns. See if it is a word. + // + int isWord = 1; + + if ((unsigned char)string[position + 1]) + { + if (HtIsStrictWordChar((unsigned char)string[position + 1])) + isWord = 0; + } + + if (isWord) + { + which = ((unsigned int) (state & MATCH_INDEX_MASK) + >> INDEX_SHIFT) - 1; + length = position + 1; + return 1; + } + else + { + // + // Not at the end of a word. Continue searching. + // + if ((state & STATE_MASK) != 0) + { + state &= STATE_MASK; + } + else + { + return 0; + } + } + } + position++; + } + return 0; +} + + +//***************************************************************************** +// void StringMatch::TranslationTable(char *table) +// +void StringMatch::TranslationTable(char *table) +{ + if (local_alloc) + delete [] trans; + trans = (unsigned char *) table; + local_alloc = 0; +} + + +//***************************************************************************** +// void StringMatch::IgnoreCase() +// Set up the case translation table to convert uppercase to lowercase +// +void StringMatch::IgnoreCase() +{ + if (!local_alloc || !trans) + { + trans = new unsigned char[256]; + for (int i = 0; i < 256; i++) + trans[i] = (unsigned char)i; + local_alloc = 1; + } + for (int i = 0; i < 256; i++) + if (isupper((unsigned char)i)) + trans[i] = tolower((unsigned char)i); +} + + +//***************************************************************************** +// void StringMatch::IgnorePunct(char *punct) +// Set up the character translation table to ignore punctuation +// +void StringMatch::IgnorePunct(char *punct) +{ + if (!local_alloc || !trans) + { + trans = new unsigned char[256]; + for (int i = 0; i < 256; i++) + trans[i] = (unsigned char)i; + local_alloc = 1; + } + if (punct) + for (int i = 0; punct[i]; i++) + trans[(unsigned char)punct[i]] = 0; + else + for (int i = 0; i < 256; i++) + if (HtIsWordChar(i) && !HtIsStrictWordChar(i)) + trans[i] = 0; +} + + +//***************************************************************************** +// int StringMatch::FindFirst(const char *source) +// +int StringMatch::FindFirst(const char *source) +{ + int dummy; + return FindFirst(source, dummy, dummy); +} + + +//***************************************************************************** +// int StringMatch::Compare(const char *source) +// +int StringMatch::Compare(const char *source) +{ + int dummy; + return Compare(source, dummy, dummy); +} |