1 files changed, 441 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc
new file mode 100644
index 00000000..81dec74b
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc
@@ -0,0 +1,441 @@
+//
+// EndingsDB.cc
+//
+// EndingsDB: Implementation of the private endings database
+//           
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: EndingsDB.cc,v 1.17 2004/05/28 13:15:20 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <fcntl.h>
+
+#include "Endings.h"
+#include "htfuzzy.h"
+#include "SuffixEntry.h"
+#include "Dictionary.h"
+#include "List.h"
+#include "HtConfiguration.h"
+
+#include "filecopy.h"
+
+// This is an attempt to get around compatibility problems 
+// with the included regex
+#ifdef _MSC_VER /* _WIN32 */
+#include "regex_win32.h"
+#else
+# ifdef USE_RX
+#  include <rxposix.h>
+# else // Use regex
+#  ifdef HAVE_BROKEN_REGEX
+#   include <regex.h>
+#  else // include regex code and header
+#   include "gregex.h"
+#  endif
+# endif
+#endif //_MSC_VER /* _WIN32 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+//*****************************************************************************
+// Build the endings databases: read the affix rules, expand the dictionary
+// into two temporary databases, then move those to their configured names.
+// Reads the endings_affix_file, endings_dictionary, endings_root2word_db
+// and endings_word2root_db attributes of `config'.  Returns NOTOK if the
+// rules or dictionary cannot be processed or a move fails, OK otherwise.
+//
+int
+Endings::createDB(const HtConfiguration &config)
+{
+    Dictionary  rules;
+    String      tmpdir = getenv("TMPDIR");
+
+    // Build in $TMPDIR when it is set and non-empty, otherwise in /tmp.
+    if (!tmpdir.length())
+        tmpdir = "/tmp";
+    String      word2root = tmpdir;
+    String      root2word = tmpdir;
+    word2root << "/word2root.db";
+    root2word << "/root2word.db";
+
+    if (debug)
+        cout << "htfuzzy/endings: Reading rules\n";
+
+    if (readRules(rules, config["endings_affix_file"]) == NOTOK)
+        return NOTOK;
+
+    if (debug)
+        cout << "htfuzzy/endings: Creating databases\n";
+
+    if (createRoot(rules, word2root, root2word,
+                   config["endings_dictionary"]) == NOTOK)
+        return NOTOK;
+
+    //
+    // Since we used files in TMPDIR for our temporary databases, we need
+    // to now move them to the correct location as defined in the config
+    // database.
+    //
+    // The configured names are copied into named Strings first: should
+    // config[] hand back a temporary, a pointer taken from it with get()
+    // would dangle as soon as that statement ended.
+    //
+    String      root2wordDest = config["endings_root2word_db"];
+    String      word2rootDest = config["endings_word2root_db"];
+
+#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
+
+    //Uses file_copy function - works on Unix/Linux & WinNT
+    char *source = root2word.get();
+    char *dest = (char *)root2wordDest.get();
+
+    //Attempt rename, if fail attempt copy & delete.
+    int ret = rename(source, dest);
+    if (ret < 0)
+    {
+        ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
+        if (ret == TRUE)
+            unlink(source);
+        else
+            return NOTOK;
+    }
+
+    source = word2root.get();
+    dest = (char *)word2rootDest.get();
+
+    //Attempt rename, if fail attempt copy & delete.
+    ret = rename(source, dest);
+    if (ret < 0)
+    {
+        ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
+        if (ret == TRUE)
+            unlink(source);
+        else
+            return NOTOK;
+    }
+
+#else //This code uses a system call - Phase this out
+
+    struct stat stat_buf;
+    String mv("mv");    // assume it's in the PATH if predefined setting fails
+    if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode))
+        mv = MV;
+    // "&&" plus the status check make a failed move visible to the caller
+    // instead of reporting OK no matter what mv did.
+    if (system(form("%s %s %s && %s %s %s",
+        mv.get(), root2word.get(), root2wordDest.get(),
+        mv.get(), word2root.get(), word2rootDest.get())) != 0)
+        return NOTOK;
+
+#endif
+
+    return OK;
+}
+
+
+//*****************************************************************************
+// Load the suffix rules of an affix file.  Every rule found in a "suffixes"
+// section is wrapped in a SuffixEntry and appended to the List kept in
+// `rules' under the one-character flag named by the latest "flag" line.
+// Returns NOTOK if the file cannot be opened, OK otherwise.
+//
+int
+Endings::readRules(Dictionary &rules, const String& rulesFile)
+{
+    FILE        *affixFile = fopen(rulesFile, "r");
+    if (affixFile == NULL)
+        return NOTOK;
+
+    bool        suffixSection = false;
+    char        flagKey[2] = " ";
+    char        buffer[1024];
+    String      pending;
+
+    while (fgets(buffer, sizeof(buffer), affixFile))
+    {
+        // Lines that start with a newline or '#' are ignored outright.
+        if (buffer[0] == '\n' || buffer[0] == '#')
+            continue;
+
+        // Section headers switch rule collection on or off.
+        if (mystrncasecmp(buffer, "suffixes", 8) == 0)
+        {
+            suffixSection = true;
+            continue;
+        }
+        if (mystrncasecmp(buffer, "prefixes", 8) == 0)
+        {
+            suffixSection = false;
+            continue;
+        }
+        if (!suffixSection)
+            continue;
+        if (mystrncasecmp(buffer, "flag ", 5) == 0)
+        {
+            // The flag letter follows any run of '*', blanks and tabs.
+            const char *cursor = buffer + 5;
+            while (*cursor == '*' || *cursor == ' ' || *cursor == '\t')
+                ++cursor;
+            flagKey[0] = *cursor;
+            continue;
+        }
+
+        // Text piles up in `pending' until it holds a '>' past its first
+        // character; only then is it turned into a rule and cleared.
+        pending << buffer;
+        pending.chop("\r\n");
+        if (pending.indexOf('>') <= 0)
+            continue;
+
+        SuffixEntry *entry = new SuffixEntry(pending);
+        List        *flagRules;
+        if (rules.Exists(flagKey))
+            flagRules = (List *) rules[flagKey];
+        else
+            rules.Add(flagKey, flagRules = new List);
+        flagRules->Add(entry);
+        pending = 0;
+    }
+
+    fclose(affixFile);
+    return OK;
+}
+
+
+//*****************************************************************************
+// Read the dictionary file and fill the two temporary databases: root2word
+// gets each root with its expanded forms, word2root gets each expanded form
+// with the space separated roots it came from.  Dictionary lines without a
+// '/' are skipped.  Returns NOTOK when the dictionary or either database
+// cannot be opened, OK otherwise.
+//
+int
+Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, const String& dictFile)
+{
+    FILE        *fl = fopen(dictFile, "r");
+    if (fl == NULL)
+        return NOTOK;
+
+    Database    *w2r = Database::getDatabaseInstance(DB_BTREE);
+    Database    *r2w = Database::getDatabaseInstance(DB_BTREE);
+    // If either database is unusable every Put() below would be lost while
+    // the caller was still told OK, so release everything and report it.
+    bool        w2rOpen = w2r != NULL && w2r->OpenReadWrite(word2root, 0664) != NOTOK;
+    bool        r2wOpen = r2w != NULL && r2w->OpenReadWrite(root2word, 0664) != NOTOK;
+    if (!w2rOpen || !r2wOpen)
+    {
+        if (w2rOpen) w2r->Close();
+        if (r2wOpen) r2w->Close();
+        delete w2r;
+        delete r2w;
+        fclose(fl);
+        return NOTOK;
+    }
+
+    char        input[1024];
+    char        *p;
+    String      words, word, data;
+    List        wordList;
+    int         count = 0;
+
+    while (fgets(input, sizeof(input), fl))
+    {
+        if ((count % 100) == 0 && debug == 1)
+        {
+            cout << "htfuzzy/endings: words: " << count << '\n';
+            cout.flush();
+        }
+        count++;
+        p = strchr(input, '/');
+        if (p == NULL)
+            continue;           // Only words that have legal endings are used
+        *p++ = '\0';            // input is now the word, p its ending flags
+        mungeWord(input, word);
+        expandWord(words, wordList, rules, word, p);
+        if (debug > 1)
+            cout << "htfuzzy/endings: " << word << " --> " << words << endl;
+        // Store the root mapped to the list of expanded words.
+        r2w->Put(word, words);
+        // For each of the expanded words, build a map to its root,
+        // appending to the existing record if there is one.
+        for (int i = 0; i < wordList.Count(); i++)
+        {
+            data = "";
+            if (w2r->Get(*(String *)wordList[i], data) == OK)
+                data << ' ';
+            data << word;
+            w2r->Put(*(String *)wordList[i], data);
+        }
+    }
+
+    if (debug == 1)
+        cout << endl;
+    fclose(fl);
+    w2r->Close();
+    r2w->Close();
+    delete w2r;
+    delete r2w;
+    return OK;
+}
+
+
+//*****************************************************************************
+// Convert a word from the dictionary format into something we can actually
+// use.  This means that the word will be converted to lowercase and that
+// any accents will be combined into single characters.
+// A vowel followed by '"' and the pair "sS" are each replaced by the one
+// character code written out in the cases below.
+//
+void
+Endings::mungeWord(char *input, String &word)
+{
+    char        *p;             // always input + 1, the lookahead character
+
+    word = 0;
+    while (*input)
+    {
+        p = input + 1;          // at worst this is the terminating NUL
+        switch (*p)
+        {
+            case '"':   // The previous character needs to get an umlaut
+                switch (*input)
+                {
+                    case 'a':
+                    case 'A':
+                        word << char(228);
+                        input += 2;
+                        continue;
+                    case 'e':
+                    case 'E':
+                        word << char(235);
+                        input += 2;
+                        continue;
+                    case 'i':
+                    case 'I':
+                        word << char(239);
+                        input += 2;
+                        continue;
+                    case 'o':
+                    case 'O':
+                        word << char(246);
+                        input += 2;
+                        continue;
+                    case 'u':
+                    case 'U':
+                        word << char(252);
+                        input += 2;
+                        continue;
+                    default:
+                        // Not a vowel: keep the character instead of
+                        // losing it; the '"' is copied on the next pass.
+                        word << *input;
+                        break;
+                }
+                break;
+
+            case 'S':   // See if the previous character needs to be an sz
+                if (*input == 's')
+                {
+                    word << char(223);
+                    input += 2;
+                    continue;
+                }
+                else
+                    word << *input;
+                break;
+
+            default:
+                word << *input;
+                break;
+        }
+        input++;
+    }
+    word.lowercase();
+}
+
+
+//*****************************************************************************
+// Expand `word' with every suffix rule selected by the flag characters in
+// `suffixes', which is read until a character not greater than ' '.
+// `words' and `wordList' are reset first; every form produced is added to
+// `wordList' as a new String and appended to `words' followed by a space.
+// Rules containing an apostrophe or an uncompilable expression are skipped.
+//
+void
+Endings::expandWord(String &words, List &wordList,
+                    Dictionary &rules, char *word, char *suffixes)
+{
+    char        suffix[2] = " ";
+    String      root;
+    SuffixEntry *entry;
+    List        *suffixRules;
+    char        *p;
+    String      rule;
+
+    words = 0;
+    wordList.Destroy();
+    while (*suffixes > ' ')
+    {
+        suffix[0] = *suffixes++;
+        if (!rules.Exists(suffix))
+            continue;
+        suffixRules = (List *) rules[suffix];
+        for (int i = 0; i < suffixRules->Count(); i++)
+        {
+            entry = (SuffixEntry *) (*suffixRules)[i];
+            root = word;
+            regex_t     reg;
+            rule = entry->rule;
+            if (strchr((char*)rule, '\''))
+                continue;
+            if (debug > 2)
+                cout << "Applying regex '" << entry->expression << "' to " << word << endl;
+            // A failed regcomp() leaves `reg' undefined, so it must not be
+            // handed to regexec() or regfree(); skip the rule instead.
+            if (regcomp(&reg, (char*)entry->expression, REG_ICASE | REG_NOSUB | REG_EXTENDED) != 0)
+                continue;
+            if (regexec(&reg, word, 0, NULL, 0) == 0)
+            {
+                if (rule[0] == '-')
+                {
+                    // "-strip,add": chop the stripped text, append the rest.
+                    p = strchr((char*)rule, ',');
+                    if (p)
+                    {
+                        *p++ = '\0';
+                        root.chop((int)strlen(rule.get()) - 1);
+                        root << p;
+                    }
+                }
+                else
+                    root << rule;
+                root.lowercase();
+                if (debug > 2)
+                    cout << word << " with " << rule << " --> '" << root << "'\n";
+                wordList.Add(new String(root));
+                words << root << ' ';
+            }
+            regfree(&reg);
+        }
+    }
+    words.chop(1);
+}