diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc new file mode 100644 index 00000000..81dec74b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htfuzzy/EndingsDB.cc @@ -0,0 +1,441 @@ +// +// EndingsDB.cc +// +// EndingsDB: Implementation of the private endings database +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: EndingsDB.cc,v 1.17 2004/05/28 13:15:20 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> + +#include "Endings.h" +#include "htfuzzy.h" +#include "SuffixEntry.h" +#include "Dictionary.h" +#include "List.h" +#include "HtConfiguration.h" + +#include "filecopy.h" + +// This is an attempt to get around compatibility problems +// with the included regex +#ifdef _MSC_VER /* _WIN32 */ +#include "regex_win32.h" +#else +# ifdef USE_RX +# include <rxposix.h> +# else // Use regex +# ifdef HAVE_BROKEN_REGEX +# include <regex.h> +# else // include regex code and header +# include "gregex.h" +# endif +# endif +#endif //_MSC_VER /* _WIN32 */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/stat.h> + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +//***************************************************************************** +// +int +Endings::createDB(const HtConfiguration &config) +{ + Dictionary rules; + String tmpdir = getenv("TMPDIR"); + String word2root, root2word; + +#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 + int ret = -1; + char * source = NULL; + char * dest = NULL; +#endif + + if (tmpdir.length()) + { + word2root = tmpdir; + root2word = tmpdir; + } + else + { + word2root = "/tmp"; + root2word = "/tmp"; + } + + word2root << "/word2root.db"; + root2word << "/root2word.db"; + + if (debug) + cout << "htfuzzy/endings: Reading rules\n"; + + if (readRules(rules, config["endings_affix_file"]) == NOTOK) + return NOTOK; + + if (debug) + cout << "htfuzzy/endings: Creating databases\n"; + + if (createRoot(rules, word2root, root2word, + config["endings_dictionary"]) == NOTOK) + return NOTOK; + + // + // Since we used files in TMPDIR for our temporary databases, we need + // to now move them to the correct location as defined in the config + // database. + // + +#if defined(LIBHTDIG) || defined(LIBHTDIGPHP) || defined(_MSC_VER) //WIN32 + + //Uses file_copy function - works on Unix/Linux & WinNT + source = root2word.get(); + dest = (char *)config["endings_root2word_db"].get(); + + //Attempt rename, if fail attempt copy & delete. + ret = rename(source, dest); + if (ret < 0) + { + ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON); + if (ret == TRUE) + unlink(source); + else + return NOTOK; + } + + source = word2root.get(); + dest = (char *)config["endings_word2root_db"].get(); + + //Attempt rename, if fail attempt copy & delete. + ret = rename(source, dest); + if (ret < 0) + { + ret = file_copy(source, dest, FILECOPY_OVERWRITE_ON); + if (ret == TRUE) + unlink(source); + else + return NOTOK; + } + +#else //This code uses a system call - Phase this out + + struct stat stat_buf; + String mv("mv"); // assume it's in the PATH if predefined setting fails + if ((stat(MV, &stat_buf) != -1) && S_ISREG(stat_buf.st_mode)) + mv = MV; + system(form("%s %s %s;%s %s %s", + mv.get(), root2word.get(), config["endings_root2word_db"].get(), + mv.get(), word2root.get(), config["endings_word2root_db"].get())); + +#endif + + return OK; + +} + + +//***************************************************************************** +int +Endings::readRules(Dictionary &rules, const String& rulesFile) +{ + FILE *fl = fopen(rulesFile, "r"); + + if (fl == NULL) + return NOTOK; + + int inSuffixes = 0; + char currentSuffix[2] = " "; + char *p; + char input[1024]; + String line; + + while (fgets(input, sizeof(input), fl)) + { + if (input[0] == '\n' || input[0] == '#') + continue; + + if (mystrncasecmp(input, "suffixes", 8) == 0) + { + inSuffixes = 1; + continue; + } + else if (mystrncasecmp(input, "prefixes", 8) == 0) + { + inSuffixes = 0; + continue; + } + if (!inSuffixes) + continue; + + if (mystrncasecmp(input, "flag ", 5) == 0) + { + p = input + 5; + while (*p == '*' || *p == ' ' || *p == '\t') + p++; + currentSuffix[0] = *p; + } + else + { + line << input; + line.chop("\r\n"); + if (line.indexOf('>') > 0) + { + List *list; + SuffixEntry *se = new SuffixEntry(line); + + if (rules.Exists(currentSuffix)) + { + list = (List *) rules[currentSuffix]; + } + else + { + list = new List; + rules.Add(currentSuffix, list); + } + list->Add(se); + line = 0; + } + } + } + + fclose(fl); + return OK; +} + + +//***************************************************************************** +int +Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, const String& dictFile) +{ + FILE *fl = fopen(dictFile, "r"); + if (fl == NULL) + return NOTOK; + + Database *w2r = Database::getDatabaseInstance(DB_BTREE); + Database *r2w = Database::getDatabaseInstance(DB_BTREE); + + w2r->OpenReadWrite(word2root, 0664); + r2w->OpenReadWrite(root2word, 0664); + + char input[1024]; + char *p; + String words; + String word; + List wordList; + int count = 0; + String data; + + while (fgets(input, sizeof(input), fl)) + { + if ((count % 100) == 0 && debug == 1) + { + cout << "htfuzzy/endings: words: " << count << '\n'; + cout.flush(); + } + count++; + + p = strchr(input, '/'); + if (p == NULL) + continue; // Only words that have legal endings are used + + *p++ = '\0'; + + mungeWord(input, word); + expandWord(words, wordList, rules, word, p); + + if (debug > 1) + cout << "htfuzzy/endings: " << word << " --> " << words << endl; + + // + // Store the root mapped to the list of expanded words. + // + r2w->Put(word, words); + + // + // For each of the expanded words, build a map to its root. + // + for (int i = 0; i < wordList.Count(); i++) + { + // + // Append to existing record if there is one. + // + data = ""; + if (w2r->Get(*(String *)wordList[i], data) == OK) + data << ' '; + data << word; + w2r->Put(*(String *)wordList[i], data); + } + } + + if (debug == 1) + cout << endl; + + fclose(fl); + w2r->Close(); + r2w->Close(); + delete w2r; + delete r2w; + + return OK; +} + + +//***************************************************************************** +// Convert a word from the dictionary format into something we can actually +// use. This means that the word will be converted to lowercase and that +// any accents will be combined into single characters. +// +void +Endings::mungeWord(char *input, String &word) +{ + char *p = input + 1; + + word = 0; + while (*input) + { + p = input + 1; + switch (*p) + { + case '"': // The previous character needs to get an umlaut + switch (*input) + { + case 'a': + case 'A': + word << char(228); + input += 2; + continue; + break; + case 'e': + case 'E': + word << char(235); + input += 2; + continue; + break; + case 'i': + case 'I': + word << char(239); + input += 2; + continue; + break; + case 'o': + case 'O': + word << char(246); + input += 2; + continue; + break; + case 'u': + case 'U': + word << char(252); + input += 2; + continue; + break; + } + break; + + case 'S': // See if the previous character needs to be an sz + if (*input == 's') + { + word << char(223); + input += 2; + continue; + } + else + { + word << *input; + } + break; + + default: + word << *input; + break; + } + input++; + } + word.lowercase(); +} + + +//***************************************************************************** +void +Endings::expandWord(String &words, List &wordList, + Dictionary &rules, char *word, char *suffixes) +{ + char suffix[2] = " "; + String root; + SuffixEntry *entry; + List *suffixRules; + char *p; + String rule; + + words = 0; + wordList.Destroy(); + + while (*suffixes > ' ') + { + suffix[0] = *suffixes++; + if (!rules.Exists(suffix)) + continue; + + suffixRules = (List *) rules[suffix]; + for (int i = 0; i < suffixRules->Count(); i++) + { + entry = (SuffixEntry *) (*suffixRules)[i]; + root = word; + regex_t reg; + rule = entry->rule; + if (strchr((char*)rule, '\'')) + continue; + if (debug > 2) + cout << "Applying regex '" << entry->expression << "' to " << word << endl; + regcomp(®, (char*)entry->expression, REG_ICASE | REG_NOSUB | REG_EXTENDED); + if (regexec(®, word, 0, NULL, 0) == 0) + { + // + // Matched + // + if (rule[0] == '-') + { + // + // We need to remove something... + // + p = strchr((char*)rule, ','); + if (p) + { + *p++ = '\0'; + root.chop((int)strlen(rule.get()) - 1); + root << p; + } + } + else + { + root << rule; + } + root.lowercase(); + if (debug > 2) + cout << word << " with " << rule << " --> '" << root << "'\n"; + wordList.Add(new String(root)); + words << root << ' '; + } + regfree(®); + } + } + words.chop(1); +} |