summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc407
1 files changed, 407 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc
new file mode 100644
index 00000000..988a8b61
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htmerge.cc
@@ -0,0 +1,407 @@
+//----------------------------------------------------------------
+//
+// libhtdig_htmerge.cc
+//
+// 1/25/2002 created from htmerge.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+// libhtdig_htmerge.cc
+//
+// htmerge: Merges two databases and/or updates databases to remove
+// old documents and ensures the databases are consistent.
+// Calls db.cc, docs.cc, and/or words.cc as necessary
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1999-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htmerge.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+//----------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+extern "C" {
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+#include "WordContext.h"
+#include "good_strtok.h"
+#include "defaults.h"
+#include "DocumentDB.h"
+#include "HtURLCodec.h"
+#include "HtWordList.h"
+#include "HtWordReference.h"
+#include "htString.h"
+
+#ifdef HAVE_STD
+#include <fstream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <fstream.h>
+#endif /* HAVE_STD */
+
+#include <stdio.h>
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+// If we have this, we probably want it.
+//#ifdef HAVE_GETOPT_H
+//#include <getopt.h>
+//#endif
+
+
+
+
+
+// Global variables for this file (all file-local except the extern configFile)
+
+// This hash is meant to keep track of all the document IDs which have to be
+// discarded.
+// It is declared here but is not referenced anywhere else in this file;
+// mergeDB() uses its own local dictionaries to prune words from the word db.
+static Dictionary discard_list;
+
+// Configuration of the database being merged in; read from merge_configFile
+static HtConfiguration merge_config;
+static HtConfiguration *config = NULL; // main configuration, set from HtConfiguration::config()
+
+static int verbose = 0; // copied from htmerge_parms->debug; non-zero enables the log file
+//static int stats = 0;
+static int alt_work_area = 0; // non-zero: ".work" is appended to the main database file names
+
+//static String configFile = DEFAULT_CONFIG_FILE;
+extern String configFile; // main configuration file path; defined outside this file
+
+static String merge_configFile = 0; // path of the second configuration; empty length means no merge
+
+
+// Component procedures: mergeDB() merges the databases named by merge_config into those named by config
+static int mergeDB ();
+
+//*****************************************************************************
+// int htmerge_index_merge(htmerge_parameters_struct *htmerge_parms)
+//
+//   Library entry point replacing the htmerge command line tool.
+//
+//   Reads these fields of htmerge_parms:
+//     configFile        main configuration file (must be readable)
+//     merge_configFile  configuration of the database to merge in; when it
+//                       is empty no merge is performed
+//     debug             verbosity; non-zero also opens/closes logFile
+//     logFile           log file name, only used when debug is non-zero
+//     alt_work_area     non-zero appends ".work" to word_db, doc_db,
+//                       doc_index and doc_excerpt of the main configuration
+//
+//   Returns TRUE on success, otherwise one of the HTMERGE_ERROR_* codes.
+//   A failure reported by mergeDB() is returned to the caller (after the
+//   usual cleanup) instead of being silently dropped.
+//
+int htmerge_index_merge(htmerge_parameters_struct *htmerge_parms)
+{
+    int ret = -1;
+    int merge_ret = -1;
+    int result = TRUE;
+
+    //load htmerge 'command-line parameters'
+    configFile = htmerge_parms->configFile;
+    merge_configFile = htmerge_parms->merge_configFile;
+    verbose = htmerge_parms->debug;
+    if (verbose != 0)
+    {
+        ret = logOpen(htmerge_parms->logFile);
+
+        if (ret == FALSE)
+        {
+            reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
+                               htmerge_parms->logFile, errno, strerror(errno)) );
+            return(HTMERGE_ERROR_LOGFILE_OPEN);
+        }
+    }
+
+    alt_work_area = htmerge_parms->alt_work_area;
+
+    config = HtConfiguration::config ();
+    config->Defaults (&defaults[0]);
+
+    if (access ((char *) configFile, R_OK) < 0)
+    {
+        reportError (form ("[HTMERGE] Unable to find configuration file '%s'",
+                           configFile.get ()));
+        return(HTMERGE_ERROR_CONFIG_READ);
+    }
+
+    config->Read (configFile);
+
+    //
+    // Check url_part_aliases and common_url_parts for
+    // errors.
+    String url_part_errors = HtURLCodec::instance ()->ErrMsg ();
+
+    if (url_part_errors.length () != 0)
+    {
+        reportError (form("[HTMERGE] Invalid url_part_aliases or common_url_parts: %s",
+                          url_part_errors.get ()));
+        return(HTMERGE_ERROR_URL_PART);
+    }
+
+    if (merge_configFile.length ())
+    {
+        merge_config.Defaults (&defaults[0]);
+        if (access ((char *) merge_configFile, R_OK) < 0)
+        {
+            reportError (form ("[HTMERGE] Unable to find configuration file '%s'",
+                               merge_configFile.get ()));
+            return(HTMERGE_ERROR_CONFIG_READ);
+        }
+        merge_config.Read (merge_configFile);
+    }
+
+    if (alt_work_area != 0)
+    {
+        // Redirect every database file of the main configuration to its
+        // ".work" sibling; entries that are not configured are left alone.
+        String configValue;
+
+        configValue = config->Find ("word_db");
+        if (configValue.length () != 0)
+        {
+            configValue << ".work";
+            config->Add ("word_db", configValue);
+        }
+
+        configValue = config->Find ("doc_db");
+        if (configValue.length () != 0)
+        {
+            configValue << ".work";
+            config->Add ("doc_db", configValue);
+        }
+
+        configValue = config->Find ("doc_index");
+        if (configValue.length () != 0)
+        {
+            configValue << ".work";
+            config->Add ("doc_index", configValue);
+        }
+
+        configValue = config->Find ("doc_excerpt");
+        if (configValue.length () != 0)
+        {
+            configValue << ".work";
+            config->Add ("doc_excerpt", configValue);
+        }
+    }
+
+    WordContext::Initialize(*config);
+
+    if (merge_configFile.length())
+    {
+        // Merge the databases specified in merge_configFile into the current
+        // databases. Do this first then update the other databases as usual
+        // Note: We don't have to specify anything, it's all in the config vars
+
+        merge_ret = mergeDB();
+
+        // Remember a merge failure so it reaches the caller once the
+        // cleanup below has run.
+        if (merge_ret != TRUE)
+            result = merge_ret;
+    }
+
+    //call destructors here
+    // NOTE(review): these are explicit destructor calls on the configuration
+    // singleton and on a file-scope static object.  merge_config will be
+    // destroyed again at program exit and both objects are reused if this
+    // function is called a second time -- confirm HtConfiguration tolerates
+    // this before relying on repeated calls.
+    config->~HtConfiguration();
+    merge_config.~HtConfiguration();
+
+    if (verbose != 0)
+    {
+        ret = logClose();
+
+        if (ret == FALSE)
+        {
+            reportError (form("[HTMERGE]: Error closing file [%s]. Error:[%d], %s\n",
+                              htmerge_parms->logFile, errno, strerror(errno)) );
+            // A merge error is the more useful code; only report the close
+            // failure when the merge itself went fine.
+            if (result == TRUE)
+                result = HTMERGE_ERROR_LOGFILE_CLOSE;
+        }
+    }
+
+    return(result);
+}
+
+//*****************************************************************************
+// int mergeDB()
+//
+//   Merges the document and word databases named in merge_config into the
+//   databases named in the main configuration.
+//
+//   Documents are matched by URL.  When a URL exists in both databases the
+//   copy with the later DocTime() is kept (the existing copy wins a tie).
+//   Every document taken from the merge database gets its DocID shifted by
+//   the destination's NextDocID() so IDs stay unique.  The word databases
+//   are then brought in line: words of ignored merge documents are skipped,
+//   all other merge words are written with the shifted DocID, and words of
+//   replaced destination documents are deleted.
+//
+//   Returns TRUE on success, otherwise one of the HTMERGE_ERROR_* codes.
+//
+static int mergeDB ()
+{
+    HtConfiguration *config = HtConfiguration::config ();
+    DocumentDB merge_db, db;
+    List *urls;
+    Dictionary merge_dup_ids, db_dup_ids;       // Lists of DocIds to ignore
+    int docIDOffset;
+
+    const String doc_index = config->Find ("doc_index");
+    if (access (doc_index, R_OK) < 0)
+    {
+        reportError (form
+                     ("[HTMERGE] Unable to open document index '%s'",
+                      (const char *) doc_index));
+        return(HTMERGE_ERROR_DOCINDEX_READ);
+    }
+    const String doc_excerpt = config->Find ("doc_excerpt");
+    if (access (doc_excerpt, R_OK) < 0)
+    {
+        reportError (form
+                     ("[HTMERGE] Unable to open document excerpts '%s'",
+                      (const char *) doc_excerpt));
+        return(HTMERGE_ERROR_EXCERPTDB_READ);
+    }
+    const String doc_db = config->Find ("doc_db");
+    if (db.Open (doc_db, doc_index, doc_excerpt) < 0)
+    {
+        reportError (form ("[HTMERGE] Unable to open/create document database '%s'",
+                           (const char *) doc_db));
+        return(HTMERGE_ERROR_DOCDB_READ);
+    }
+
+
+    const String merge_doc_index = merge_config["doc_index"];
+    if (access (merge_doc_index, R_OK) < 0)
+    {
+        reportError (form
+                     ("[HTMERGE] Unable to open document index '%s'",
+                      (const char *) merge_doc_index));
+        return(HTMERGE_ERROR_DOCINDEX_READ);
+    }
+    const String merge_doc_excerpt = merge_config["doc_excerpt"];
+    if (access (merge_doc_excerpt, R_OK) < 0)
+    {
+        reportError (form
+                     ("[HTMERGE] Unable to open document excerpts '%s'",
+                      (const char *) merge_doc_excerpt));
+        return(HTMERGE_ERROR_EXCERPTDB_READ);
+    }
+    const String merge_doc_db = merge_config["doc_db"];
+    if (merge_db.Open (merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0)
+    {
+        reportError (form ("[HTMERGE] Unable to open document database '%s'",
+                           (const char *) merge_doc_db));
+        return(HTMERGE_ERROR_DOCDB_READ);
+    }
+
+    // Start the merging by going through all the URLs that are in
+    // the database to be merged
+
+    urls = merge_db.URLs ();
+    // This ensures that every document added from merge_db has a unique ID
+    // in the new database
+    docIDOffset = db.NextDocID ();
+
+    urls->Start_Get ();
+    String *url;
+    while ((url = (String *) urls->Get_Next ()))
+    {
+        DocumentRef *ref = merge_db[url->get ()];
+        DocumentRef *old_ref = db[url->get ()];
+        if (!ref)
+        {
+            // Nothing to merge for this URL; release the destination
+            // reference so it is not leaked by skipping the loop tail.
+            delete old_ref;
+            continue;
+        }
+
+        if (old_ref)
+        {
+            // Oh well, we knew this would happen. Let's get the duplicate
+            // And we'll only use the most recent date.
+
+            if (old_ref->DocTime () >= ref->DocTime ())
+            {
+                // Cool, the ref we're merging is too old, just ignore it
+                char str[20];
+                sprintf (str, "%d", ref->DocID ());
+                merge_dup_ids.Add (str, 0);
+
+                if (verbose > 1)
+                {
+                    // url is a String *, so hand the C string to %s
+                    logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring & merging copy\n", url->get()));
+                }
+            }
+            else
+            {
+                // The ref we're merging is newer, delete the old one and add
+                // NOTE(review): unlike the new-URL branch below, the excerpt
+                // of ref is not read before Add() here -- confirm intended.
+                char str[20];
+                sprintf (str, "%d", old_ref->DocID ());
+                db_dup_ids.Add (str, 0);
+                db.Delete (old_ref->DocID ());
+                ref->DocID (ref->DocID () + docIDOffset);
+                db.Add (*ref);
+                if (verbose > 1)
+                {
+                    logEntry(form("[HTMERGE] Duplicate, URL: {%s} ignoring destination copy\n",url->get()));
+                }
+            }
+        }
+        else
+        {
+            // It's a new URL, just add it, making sure to load the excerpt
+            merge_db.ReadExcerpt (*ref);
+            ref->DocID (ref->DocID () + docIDOffset);
+            db.Add (*ref);
+            if (verbose > 1)
+            {
+                logEntry(form("[HTMERGE] Merged URL: {%s} \n",url->get()));
+            }
+        }
+        delete ref;
+        delete old_ref;
+    }
+    delete urls;
+
+    // As reported by Roman Dimov, we must update db.NextDocID()
+    // because of all the added records...
+    db.IncNextDocID (merge_db.NextDocID ());
+    merge_db.Close ();
+    db.Close ();
+
+    // OK, after merging the doc DBs, we do the same for the words
+    HtWordList mergeWordDB (*config), wordDB (*config);
+    List *words;
+    String docIDKey;
+
+    if (wordDB.Open (config->Find ("word_db"), O_RDWR) < 0)
+    {
+        reportError (form ("[HTMERGE] Unable to open/create word database '%s'",
+                           (const char *) config->Find ("word_db")));
+        return(HTMERGE_ERROR_WORDDB_READ);
+    }
+
+    if (mergeWordDB.Open (merge_config["word_db"], O_RDONLY) < 0)
+    {
+        reportError (form ("[HTMERGE] Unable to open word database '%s'",
+                           (const char *) merge_config["word_db"]));
+        return(HTMERGE_ERROR_WORDDB_READ);
+    }
+
+    // Go through all the word references of the database to be merged:
+    // skip those of ignored documents, shift the DocID of the rest and
+    // store them in the destination word database
+
+    words = mergeWordDB.WordRefs ();
+
+    words->Start_Get ();
+    HtWordReference *word;
+    while ((word = (HtWordReference *) words->Get_Next ()))
+    {
+        // NOTE(review): the dictionaries were filled with decimal strings
+        // (sprintf "%d"); confirm that assigning the integer DocID to a
+        // String produces the same decimal text.
+        docIDKey = word->DocID ();
+        if (merge_dup_ids.Exists (docIDKey))
+            continue;
+
+        word->DocID (word->DocID () + docIDOffset);
+        wordDB.Override (*word);
+    }
+    delete words;
+
+    // Drop the words of destination documents that were replaced by a
+    // newer copy from the merge database
+    words = wordDB.WordRefs ();
+    words->Start_Get ();
+    while ((word = (HtWordReference *) words->Get_Next ()))
+    {
+        docIDKey = word->DocID ();
+        if (db_dup_ids.Exists (docIDKey))
+            wordDB.Delete (*word);
+    }
+    delete words;
+
+    // Cleanup--just close the two word databases
+    mergeWordDB.Close ();
+    wordDB.Close ();
+
+    return(TRUE);
+
+}
+