summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc441
1 files changed, 441 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc
new file mode 100644
index 00000000..81dec74b
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc
@@ -0,0 +1,441 @@
+//
+// EndingsDB.cc
+//
+// EndingsDB: Implementation of the private endings database
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: EndingsDB.cc,v 1.17 2004/05/28 13:15:20 lha Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#include <fcntl.h>
+
+#include "Endings.h"
+#include "htfuzzy.h"
+#include "SuffixEntry.h"
+#include "Dictionary.h"
+#include "List.h"
+#include "HtConfiguration.h"
+
+#include "filecopy.h"
+
+// This is an attempt to get around compatibility problems
+// with the included regex
+#ifdef _MSC_VER /* _WIN32 */
+#include "regex_win32.h"
+#else
+# ifdef USE_RX
+# include <rxposix.h>
+# else // Use regex
+# ifdef HAVE_BROKEN_REGEX
+# include <regex.h>
+# else // include regex code and header
+# include "gregex.h"
+# endif
+# endif
+#endif //_MSC_VER /* _WIN32 */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+//*****************************************************************************
+//
+// Endings::createDB
+//   Build the two endings databases in a temporary directory and then
+//   move them to the locations named by the configuration.
+//
+//   The temporary directory is $TMPDIR when that is set and non-empty,
+//   otherwise /tmp.  Rules are read from config["endings_affix_file"],
+//   the word list from config["endings_dictionary"], and the finished
+//   databases are moved to config["endings_root2word_db"] and
+//   config["endings_word2root_db"].
+//
+//   Returns OK on success.  Returns NOTOK if the rules cannot be read,
+//   if the databases cannot be built, or if a database cannot be moved
+//   to its final location.
+//
+int
+Endings::createDB(const HtConfiguration &config)
+{
+    Dictionary rules;
+    String tmpdir = getenv("TMPDIR");
+    String word2root, root2word;
+
+#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
+    int ret = -1;
+    char * source = NULL;
+    char * dest = NULL;
+#endif
+
+    if (tmpdir.length())
+    {
+        word2root = tmpdir;
+        root2word = tmpdir;
+    }
+    else
+    {
+        word2root = "/tmp";
+        root2word = "/tmp";
+    }
+
+    word2root << "/word2root.db";
+    root2word << "/root2word.db";
+
+    if (debug)
+        cout << "htfuzzy/endings: Reading rules\n";
+
+    if (readRules(rules, config["endings_affix_file"]) == NOTOK)
+        return NOTOK;
+
+    if (debug)
+        cout << "htfuzzy/endings: Creating databases\n";
+
+    if (createRoot(rules, word2root, root2word,
+                   config["endings_dictionary"]) == NOTOK)
+        return NOTOK;
+
+    //
+    // Since we used files in TMPDIR for our temporary databases, we need
+    // to now move them to the correct location as defined in the config
+    // database.
+    //
+
+#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32
+
+    //Uses file_copy function - works on Unix/Linux & WinNT
+    source = root2word.get();
+    dest = (char *)config["endings_root2word_db"].get();
+
+    //Attempt rename, if fail attempt copy & delete.
+    ret = rename(source, dest);
+    if (ret < 0)
+    {
+        ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
+        if (ret == TRUE)
+            unlink(source);
+        else
+            return NOTOK;
+    }
+
+    source = word2root.get();
+    dest = (char *)config["endings_word2root_db"].get();
+
+    //Attempt rename, if fail attempt copy & delete.
+    ret = rename(source, dest);
+    if (ret < 0)
+    {
+        ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON);
+        if (ret == TRUE)
+            unlink(source);
+        else
+            return NOTOK;
+    }
+
+#else //This code uses a system call - Phase this out
+
+    struct stat stat_buf;
+    String mv("mv"); // assume it's in the PATH if predefined setting fails
+    if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode))
+        mv = MV;
+
+    //
+    // The two moves are chained with && so that the shell's exit status
+    // reflects both of them; with ';' a failure of the first move would
+    // be hidden by the second.  A non-zero status is reported as NOTOK,
+    // matching the rename/copy branch above.
+    //
+    if (system(form("%s %s %s && %s %s %s",
+                    mv.get(), root2word.get(), config["endings_root2word_db"].get(),
+                    mv.get(), word2root.get(), config["endings_word2root_db"].get())) != 0)
+        return NOTOK;
+
+#endif
+
+    return OK;
+
+}
+
+
+//*****************************************************************************
+// Endings::readRules
+//   Load the suffix rules from the affix file `rulesFile` into `rules`.
+//
+//   Lines starting with '#' or consisting of a bare newline are skipped.
+//   A line starting with "suffixes" turns rule collection on and one
+//   starting with "prefixes" turns it off (both compared
+//   case-insensitively); nothing is collected while it is off.
+//   A "flag " line sets the one-character key under which later rules
+//   are filed.  Any other line is appended to a pending buffer; once that
+//   buffer holds a '>' beyond its first character it is turned into a
+//   SuffixEntry, added to the List stored under the current key, and the
+//   buffer is reset.
+//
+//   Returns NOTOK if the file cannot be opened, OK otherwise.
+//
+int
+Endings::readRules(Dictionary &rules, const String& rulesFile)
+{
+    FILE *rulesFp = fopen(rulesFile, "r");
+    if (!rulesFp)
+        return NOTOK;
+
+    char buffer[1024];
+    char flagKey[2] = " ";
+    int collecting = 0;
+    String pending;
+
+    while (fgets(buffer, sizeof(buffer), rulesFp))
+    {
+        // Comments and empty lines.
+        if (buffer[0] == '#' || buffer[0] == '\n')
+            continue;
+
+        // Section headers switch collection on or off.
+        if (mystrncasecmp(buffer, "suffixes", 8) == 0)
+        {
+            collecting = 1;
+            continue;
+        }
+        if (mystrncasecmp(buffer, "prefixes", 8) == 0)
+        {
+            collecting = 0;
+            continue;
+        }
+        if (!collecting)
+            continue;
+
+        // A flag line names the key for the rules that follow it.
+        if (mystrncasecmp(buffer, "flag ", 5) == 0)
+        {
+            const char *cursor = buffer + 5;
+            for (; *cursor == ' ' || *cursor == '\t' || *cursor == '*'; cursor++)
+                ;
+            flagKey[0] = *cursor;
+            continue;
+        }
+
+        // Anything else is rule text; wait until it contains a '>'.
+        pending << buffer;
+        pending.chop("\r\n");
+        if (pending.indexOf('>') <= 0)
+            continue;
+
+        SuffixEntry *parsed = new SuffixEntry(pending);
+        List *bucket;
+        if (!rules.Exists(flagKey))
+        {
+            bucket = new List;
+            rules.Add(flagKey, bucket);
+        }
+        else
+        {
+            bucket = (List *) rules[flagKey];
+        }
+        bucket->Add(parsed);
+        pending = 0;
+    }
+
+    fclose(rulesFp);
+    return OK;
+}
+
+
+//*****************************************************************************
+// Endings::createRoot
+//   Read the dictionary file `dictFile` and fill two B-tree databases:
+//   `root2word` maps each root word to the space-separated list of its
+//   expansions, and `word2root` maps each expanded word to the
+//   space-separated list of roots that produce it.
+//
+//   Only dictionary lines containing a '/' are used: the text before the
+//   slash is normalised by mungeWord() and the text after it is handed
+//   to expandWord() as the suffix flags.
+//
+//   Returns NOTOK if the dictionary cannot be opened or if either
+//   database reports NOTOK when opened; OK otherwise.
+//
+int
+Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, const String& dictFile)
+{
+    FILE *fl = fopen(dictFile, "r");
+    if (fl == NULL)
+        return NOTOK;
+
+    Database *w2r = Database::getDatabaseInstance(DB_BTREE);
+    Database *r2w = Database::getDatabaseInstance(DB_BTREE);
+
+    //
+    // Do not carry on with a database that failed to open: every Put
+    // below would be lost and the caller would be told all went well.
+    //
+    if (w2r->OpenReadWrite(word2root, 0664) == NOTOK)
+    {
+        fclose(fl);
+        delete w2r;
+        delete r2w;
+        return NOTOK;
+    }
+    if (r2w->OpenReadWrite(root2word, 0664) == NOTOK)
+    {
+        fclose(fl);
+        w2r->Close();
+        delete w2r;
+        delete r2w;
+        return NOTOK;
+    }
+
+    char input[1024];
+    char *p;
+    String words;
+    String word;
+    List wordList;
+    int count = 0;
+    String data;
+
+    while (fgets(input, sizeof(input), fl))
+    {
+        // Progress report every 100 input lines at debug level 1.
+        if ((count % 100) == 0 && debug == 1)
+        {
+            cout << "htfuzzy/endings: words: " << count << '\n';
+            cout.flush();
+        }
+        count++;
+
+        p = strchr(input, '/');
+        if (p == NULL)
+            continue; // Only words that have legal endings are used
+
+        // Split the line in place: input is the word, p the suffix flags.
+        *p++ = '\0';
+
+        mungeWord(input, word);
+        expandWord(words, wordList, rules, word, p);
+
+        if (debug > 1)
+            cout << "htfuzzy/endings: " << word << " --> " << words << endl;
+
+        //
+        // Store the root mapped to the list of expanded words.
+        //
+        r2w->Put(word, words);
+
+        //
+        // For each of the expanded words, build a map to its root.
+        //
+        for (int i = 0; i < wordList.Count(); i++)
+        {
+            //
+            // Append to existing record if there is one.
+            //
+            data = "";
+            if (w2r->Get(*(String *)wordList[i], data) == OK)
+                data << ' ';
+            data << word;
+            w2r->Put(*(String *)wordList[i], data);
+        }
+    }
+
+    if (debug == 1)
+        cout << endl;
+
+    fclose(fl);
+    w2r->Close();
+    r2w->Close();
+    delete w2r;
+    delete r2w;
+
+    return OK;
+}
+
+
+//*****************************************************************************
+// Convert a word from the dictionary format into something we can actually
+// use.  Two-character sequences are merged into a single character and the
+// result is lowercased:
+//
+//   a" e" i" o" u" (either case)  -->  the vowel with an umlaut
+//   sS                            -->  sz
+//
+// The single characters emitted are char(228), char(235), char(239),
+// char(246), char(252) and char(223), which are the ISO-8859-1 codes for
+// those letters.  Every other character is copied through unchanged,
+// including a character that is followed by '"' but cannot take an umlaut.
+//
+// `input` is the NUL-terminated dictionary word; `word` receives the result
+// and any previous contents are discarded.
+//
+void
+Endings::mungeWord(char *input, String &word)
+{
+    word = 0;
+    while (*input)
+    {
+        // *input is not NUL here, so input[1] is at worst the terminator.
+        char next = input[1];
+        int merged = 0;     // code of the combined character, 0 if none
+
+        if (next == '"')    // The current character may need an umlaut
+        {
+            switch (*input)
+            {
+                case 'a':
+                case 'A':
+                    merged = 228;
+                    break;
+                case 'e':
+                case 'E':
+                    merged = 235;
+                    break;
+                case 'i':
+                case 'I':
+                    merged = 239;
+                    break;
+                case 'o':
+                case 'O':
+                    merged = 246;
+                    break;
+                case 'u':
+                case 'U':
+                    merged = 252;
+                    break;
+            }
+        }
+        else if (next == 'S' && *input == 's')  // "sS" becomes an sz
+        {
+            merged = 223;
+        }
+
+        if (merged)
+        {
+            // Both characters of the pair are consumed.
+            word << char(merged);
+            input += 2;
+        }
+        else
+        {
+            // No combination applies: keep the character as it is.  This
+            // also covers a non-vowel followed by '"', which must not be
+            // dropped from the word.
+            word << *input;
+            input++;
+        }
+    }
+    word.lowercase();
+}
+
+
+//*****************************************************************************
+// Endings::expandWord
+//   Apply the suffix rules selected by `suffixes` to `word` and collect
+//   every resulting form.
+//
+//   `suffixes` is read one character at a time until a character that is
+//   not greater than ' ' is reached; each character is looked up in
+//   `rules` as a one-character key and unknown keys are ignored.  Rules
+//   whose text contains an apostrophe are skipped.  For every remaining
+//   rule whose regular expression matches `word`:
+//     - a rule of the form "-xxx,yyy" removes strlen("xxx") characters
+//       from the end of the word and appends "yyy";
+//     - a rule that does not start with '-' is appended to the word.
+//
+//   On return `wordList` holds one newly allocated String per expansion
+//   and `words` holds the same expansions separated by single spaces.
+//   Both are cleared first.
+//
+void
+Endings::expandWord(String &words, List &wordList,
+                    Dictionary &rules, char *word, char *suffixes)
+{
+    char suffix[2] = " ";
+    String root;
+    SuffixEntry *entry;
+    List *suffixRules;
+    char *p;
+    String rule;
+
+    words = 0;
+    wordList.Destroy();
+
+    while (*suffixes > ' ')
+    {
+        suffix[0] = *suffixes++;
+        if (!rules.Exists(suffix))
+            continue;
+
+        suffixRules = (List *) rules[suffix];
+        for (int i = 0; i < suffixRules->Count(); i++)
+        {
+            entry = (SuffixEntry *) (*suffixRules)[i];
+            root = word;
+            regex_t reg;
+            rule = entry->rule;
+            if (strchr((char*)rule, '\''))
+                continue;
+            if (debug > 2)
+                cout << "Applying regex '" << entry->expression << "' to " << word << endl;
+
+            //
+            // A pattern that fails to compile leaves reg undefined, so it
+            // must not be handed to regexec() or regfree().  Skip the rule.
+            //
+            if (regcomp(&reg, (char*)entry->expression, REG_ICASE | REG_NOSUB | REG_EXTENDED) != 0)
+            {
+                if (debug > 2)
+                    cout << "Invalid regex '" << entry->expression << "', rule skipped\n";
+                continue;
+            }
+
+            if (regexec(&reg, word, 0, NULL, 0) == 0)
+            {
+                //
+                // Matched
+                //
+                if (rule[0] == '-')
+                {
+                    //
+                    // We need to remove something...
+                    //
+                    p = strchr((char*)rule, ',');
+                    if (p)
+                    {
+                        // Cut the rule at the comma: what is left is
+                        // "-xxx", and p points at the replacement text.
+                        *p++ = '\0';
+                        root.chop((int)strlen(rule.get()) - 1);
+                        root << p;
+                    }
+                }
+                else
+                {
+                    root << rule;
+                }
+                root.lowercase();
+                if (debug > 2)
+                    cout << word << " with " << rule << " --> '" << root << "'\n";
+                wordList.Add(new String(root));
+                words << root << ' ';
+            }
+            regfree(&reg);
+        }
+    }
+    // Drop the space that follows the last expansion.
+    words.chop(1);
+}